1. james woodyatt
  2. oni

Commits

james woodyatt  committed af3c6c3

Finish up the UTF-16BE and UTF-16LE.

  • Participants
  • Parent commits 2a0536a
  • Branches sideline

Comments (0)

Files changed (2)

File cf/cf_ucs.ml

View file
  • Ignore whitespace
     val scan: (char, t) Cf_llscan.t
 end
 
-module UTF8: Transform = struct
+module UTF_8: Transform = struct
     let format =
         let rec loop pp x k pre =
             if k > 0 then begin
         start
 end
 
-module type UCS2_transform = sig
+module type UCS_2_endian = sig
     val format2: Format.formatter -> int -> unit
     val scan2: (char, int) Cf_llscan.t
 end
 
-module UCS2_be: UCS2_transform = struct
+module UCS_2_big_endian: UCS_2_endian = struct
     let format2 pp n =
         let c1 = char_of_int (n land 0xFF) in
         let n = n lsr 8 in
         Cf_llscan.ret ((c0 lsr 8) lor c1)
 end
 
-module UCS2_le: UCS2_transform = struct
+module UCS_2_little_endian: UCS_2_endian = struct
     let format2 pp n =
         let c1 = char_of_int (n land 0xFF) in
         let n = n lsr 8 in
         Cf_llscan.ret ((c1 lsr 8) lor c0)
 end
 
-module UTF16x_transform(UCS2: UCS2_transform) = struct
+module UTF_16_functor(UCS2: UCS_2_endian) = struct
     let format pp n =
         if n = n land 0xffff then
             UCS2.format2 pp n
         end
         else
             UCS2.format2 pp 0xfffd
+    
+    let scan =
+        let ( >>= ) = Cf_llscan.bind in
+        UCS2.scan2 >>= fun c ->
+        assert (c >= 0 && c < 0xffff);
+        if c < 0xd800 || c >= 0xe000 then
+            Cf_llscan.ret c
+        else begin
+            if c lsr 10 <> 0xd8 then
+                Cf_llscan.nil
+            else begin
+                let c0 = c land 0x3ff in
+                UCS2.scan2 >>= fun c ->
+                if c lsr 10 <> 0xdc then
+                    Cf_llscan.nil
+                else begin
+                    let c1 = c land 0x3ff in
+                    Cf_llscan.ret ((c0 lsl 10) lor c1)
+                end
+            end
+        end
 end
 
+module UTF_16BE = UTF_16_functor(UCS_2_big_endian)
+module UTF_16LE = UTF_16_functor(UCS_2_little_endian)
+
 (*--- $File$ ---*)

File cf/cf_ucs.mli

View file
  • Ignore whitespace
     val scan: (char, t) Cf_llscan.t
 end
 
-module UTF8: Transform
+module UTF_8: Transform
+module UTF_16BE: Transform
+module UTF_16LE: Transform
 
 (*--- $File$ ---*)