Commits

Anonymous committed 5f322db

+ utf8_of_utf16 enumeratee

Comments (0)

Files changed (3)

 ;
 
 
+type uchar = int;
+
+
 module UTF8(IO : It_Types.MonadIO)
  :
   sig
-    type uchar = private int;
     value utf8_of_char : (Iteratees.Make(IO)).enumeratee char uchar 'a;
   end
  =
   struct
 
-    type uchar = int;
-
     open Iteratees;
     module I = Make(IO);
     open I;
     exception Bad_utf8 of string
     ;
 
-
 (*  without actual conversion:
     value sc_ulen sc =
       let len = SC.length sc in
 ;
 
 
+
+value break_feed : ('a -> bool) -> enumeratee 'a 'a 'b = fun pred it ->
+  let rec break_feed it =
+    match it with
+    [ IE_done _ | IE_cont (Some _) _ -> return it
+    | IE_cont None k -> ie_cont & step k
+    ]
+  and step k s =
+    match s with
+    [ EOF _ -> ie_doneM (ie_cont k) s
+    | Chunk c ->
+        let (to_feed, to_leave) = S.break pred c in
+        if S.is_empty to_feed
+        then
+          ie_doneM it s
+        else
+          k (Chunk to_feed) >>% fun (it, sl) ->
+          match it with
+          [ IE_done _ | IE_cont (Some _) _ ->
+              ie_doneMsl it (Sl.append sl & Sl.one & Chunk to_leave)
+          | IE_cont None _k ->
+              (* here: s is empty *)
+              if S.is_empty to_leave  (* break not found *)
+              then
+                IO.return (break_feed it, Sl.empty)
+              else
+                ie_doneM it (Chunk to_leave)
+          ]
+    ]
+  in
+    break_feed it
+;
+
+
+(* +
+   [junk] = [drop 1]
+*)
+
+value junk = IE_cont None (fun s -> drop_step 1 s)
+;
+
+
 value array_ensure_size ~default array_ref size =
   let realloc () =
     let r = Array.make size default in
 module SC = Subarray_cat
 ;
 
+type uchar = int;
+type utf16item = int;
+
+
 module UTF8
  :
   sig
-    type uchar = private int;
     value utf8_of_char : enumeratee char uchar 'a;
+    value utf8_of_utf16 : enumeratee utf16item uchar 'a;
   end
  =
   struct
-    type uchar = int;
 
     exception Bad_utf8 of string
     ;
 
+    exception Bad_utf16 of string
+    ;
+
     value ensure_size = array_ensure_size ~default:(-1)
     ;
 
         utf8_of_char ~acc:(`Acc S.empty) uit
     ;
 
+    (* todo: in a more chunk-way *)
+
+    value utf8_of_utf16 : enumeratee utf16item uchar 'a =
+    fun it ->
+      let is_surr c = (c >= 0xD800 && c <= 0xDFFF) in
+      let rec utf8_of_utf16 it =
+        break_feed is_surr it >>= fun it ->
+        get_stream_eof >>= fun opt_opt_err ->
+        match opt_opt_err with
+        [ Some None -> return it
+        | Some (Some err) -> throw_err err
+        | None ->
+            match it with
+            [ IE_done _ | IE_cont (Some _) _ ->
+                return it
+            | IE_cont None k ->
+                peek >>= fun hi_surr_opt ->
+                match hi_surr_opt with
+                [ None -> return it
+                | Some hi_surr ->
+                    if hi_surr < 0xD800 || hi_surr > 0xDBFF
+                    then
+                      throw_err (Bad_utf16 "high surrogate out of range")
+                    else
+                      junk >>= fun () ->
+                      peek >>= fun lo_surr_opt ->
+                      match lo_surr_opt with
+                      [ None -> throw_err
+                          (Bad_utf16 "eof after high surrogate")
+                      | Some lo_surr ->
+                          junk >>= fun () ->
+                          if lo_surr < 0xDC00 || lo_surr > 0xDFFF
+                          then
+                            throw_err (Bad_utf16 "low surrogate out of range")
+                          else
+                            let uchar = (hi_surr - 0xD800) * 0x400
+                                        + lo_surr - 0xDC00 + 0x10000
+                            in
+                              let it =
+                                liftI (k (chunk_of uchar) >>% fun (it, _sl) ->
+                                       IO.return it)
+                              in
+                                utf8_of_utf16 it
+                      ]
+                  ]
+            ]
+
+
+
+        ]
+      in
+        utf8_of_utf16 it
+    ;
+
   end;  (* `UTF8' functor *)
 
 
 ;
 
 
-(* +
-   [junk] = [drop 1]
-*)
-
-value junk = IE_cont None (fun s -> drop_step 1 s)
-;
-
-
 exception SInt_overflow;
 exception SInt_not_a_number of string;
 
 =
 struct
 
+value dbg fmt = Printf.ksprintf (Printf.printf "%s\n%!") fmt
+;
+
 value runIO = IO.runIO
 ;
 
 ;
 
 value (runA : IO.m (iteratee 'el 'a) -> res 'a) i =
-  runIO (i >>% run)
+      let r = runIO (i >>% run) in (flush stdout; r)
 ;
 
 value run_print f i =
 
 module U = I.UTF8;
 
-value (dump_utf8_chars : iteratee U.uchar unit) =
- let pr s = mprintf "dump_utf8_chars: %s\n" s in
+value (dump_utf8_chars : iteratee uchar unit) =
 (*
  let pr s = IO.catch
    (fun () -> mprintf "dump_utf8_chars: %s\n" s)
  where rec inner s =
   match s with
   [ EOF opt_err ->
-      match opt_err with
-      [ None -> pr "natural end"
-      | Some e -> pr & sprintf "unnatural end: \"%s\"" & Printexc.to_string e
+    let () = match opt_err with
+      [ None -> dbg "natural end"
+      | Some e -> dbg "unnatural end: \"%s\"" & Printexc.to_string e
       ]
-      >>% fun () ->
+      in
       match opt_err with
       [ None -> ie_doneM () s
       | Some e -> IO.return & (throw_err e, Sl.one s)
       ]
   | Chunk c ->
-      pr
-       (sprintf "chunk of %i chars: [%s]"
+      let () = dbg "chunk of %i chars: [%s]"
         (S.length c)
         (String.concat "" &
-         List.map (fun c -> sprintf "&#x%X;" (c : U.uchar :> int)) &
+         List.map (fun c -> sprintf "&#x%X;" (c : uchar :> int)) &
          S.to_list c)
-       )
-      >>% fun () ->
+      in
       ie_contM inner
   ]
 ;
 )
 ;
 
+value test_utf16_enumeratee () =
+(
+  dbg "test_utf16_enumeratee"
+;
+  assert ((
+    runA & enum_pure_nchunk [0x41; 0x42; 0x0436; 0x43] 1
+           (joinI & U.utf8_of_utf16 dump_utf8_chars)
+    ) = `Ok ()
+  )
+;
+  let res =
+      runA & (enum_pure_nchunk [0x0436; 0xD834; 0xDF06; 0x0437] 1
+              >>> enum_err Myexc)
+             (joinI & U.utf8_of_utf16 dump_utf8_chars)
+  in
+(*
+  match res with
+  [ `Ok () -> assert False
+  | `Error e -> P.printf "exn: %s\n%!" (Printexc.to_string e)
+  ]
+*)
+    assert (res = `Error (Iteratees_err_msg Myexc))
+;
+  ()
+)
+;
+
+
 
 
 value limit_chars = expl "12345678abcdefgh"
   ; tests_driver_full ()
 
   ; test_utf8_enumeratee ()
+  ; test_utf16_enumeratee ()
 
   ; test_limits ()
 
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.