Commits

james woodyatt committed 8e2b867

Add Cf_llscan.any and polish Cf_ucs module a bit before it will be
moving into the new Ucs library as the Ucs_code module before the UTF
transforms are split out into a new Ucs_transform module.

Comments (0)

Files changed (4)

     | Cf_seq.Z -> Some ((), s)
     | Cf_seq.P _ -> None
 
+let any s =
+    match Lazy.force s with
+    | Cf_seq.P (hd, tl) -> Some (hd, tl)
+    | Cf_seq.Z -> None
+
 let sat f s =
     match Lazy.force s with
     | Cf_seq.P (hd, tl) when f hd -> Some (hd, tl)
 val ret: 'r -> ('s, 'r) t
 val bind: ('s, 'a) t -> ('a -> ('s, 'b) t) -> ('s, 'b) t
 val fin: ('s, unit) t
+val any: ('s, 's) t
 val sat: ('s -> bool) -> ('s, 's) t
 val tok: ('s -> 'r option) -> ('s, 'r) t
 val opt: ('s, 'r) t -> ('s, 'r option) t
   OF THE POSSIBILITY OF SUCH DAMAGE. 
  *---------------------------------------------------------------------------*)
 
+let ( >>= ) = Cf_llscan.bind
+
 type t = int
 
 let compare a b = b - a
 external chr: int -> t = "%identity"
 
 module type Transform = sig
-    val format: Format.formatter -> t -> unit
+    val emit: Format.formatter -> t -> unit
     val scan: (char, t) Cf_llscan.t
 end
 
 module UTF_8: Transform = struct
-    let format =
+    let emit =
         let rec loop pp x k pre =
             if k > 0 then begin
                 let c = char_of_int ((x land 0x3f) lor 0x80) in
 end
 
 module type UCS_2_endian = sig
-    val format2: Format.formatter -> int -> unit
+    val emit2: Format.formatter -> int -> unit
     val scan2: (char, int) Cf_llscan.t
 end
 
 module UCS_2_big_endian: UCS_2_endian = struct
-    let format2 pp n =
+    let emit2 pp n =
         let c1 = char_of_int (n land 0xFF) in
         let n = n lsr 8 in
         let c0 = char_of_int (n land 0xFF) in
         Format.pp_print_char pp c1
     
     let scan2 =
-        let ( >>= ) = Cf_llscan.bind in
         let lift c = Some (int_of_char c) in
         Cf_llscan.tok lift >>= fun c0 ->
         Cf_llscan.tok lift >>= fun c1 ->
 end
 
 module UCS_2_little_endian: UCS_2_endian = struct
-    let format2 pp n =
+    let emit2 pp n =
         let c1 = char_of_int (n land 0xFF) in
         let n = n lsr 8 in
         let c0 = char_of_int (n land 0xFF) in
         Format.pp_print_char pp c0
     
     let scan2 =
-        let ( >>= ) = Cf_llscan.bind in
         let lift c = Some (int_of_char c) in
         Cf_llscan.tok lift >>= fun c0 ->
         Cf_llscan.tok lift >>= fun c1 ->
 end
 
 module UTF_16_functor(UCS2: UCS_2_endian) = struct
-    let format pp n =
+    let emit pp n =
         if n = n land 0xffff then
-            UCS2.format2 pp n
+            UCS2.emit2 pp n
         else if n > 0 && n < 0x110000 then begin
             let n = n - 0x10000 in
-            UCS2.format2 pp (0xd800 lor ((n lsr 10) land 0x3ff));
-            UCS2.format2 pp (0xdc00 lor (n land 0x3ff))
+            UCS2.emit2 pp (0xd800 lor ((n lsr 10) land 0x3ff));
+            UCS2.emit2 pp (0xdc00 lor (n land 0x3ff))
         end
         else
-            UCS2.format2 pp 0xfffd
+            UCS2.emit2 pp 0xfffd
     
     let scan =
-        let ( >>= ) = Cf_llscan.bind in
         UCS2.scan2 >>= fun c ->
         assert (c >= 0 && c < 0xffff);
         if c < 0xd800 || c >= 0xe000 then
 module UTF_16BE = UTF_16_functor(UCS_2_big_endian)
 module UTF_16LE = UTF_16_functor(UCS_2_little_endian)
 
+type endian = [ `BE | `LE ]
+
+let emit_bom pp = function
+    | `BE ->
+        Format.pp_print_char pp '\xff';
+        Format.pp_print_char pp '\xfe'
+    | `LE ->
+        Format.pp_print_char pp '\xfe';
+        Format.pp_print_char pp '\xff'
+
+let scan_bom =
+    Cf_llscan.any >>= fun c0 ->
+    Cf_llscan.any >>= fun c1 ->
+    match c0, c1 with
+    | '\xff', '\xfe' -> Cf_llscan.ret `BE
+    | '\xfe', '\xff' -> Cf_llscan.ret `LE
+    | _, _ -> Cf_llscan.nil
+
 (*--- $File$ ---*)
 val chr: int -> t
 
 module type Transform = sig
-    val format: Format.formatter -> t -> unit
+    val emit: Format.formatter -> t -> unit
     val scan: (char, t) Cf_llscan.t
 end
 
 module UTF_16BE: Transform
 module UTF_16LE: Transform
 
+type endian = [ `BE | `LE ]
+
+val emit_bom: Format.formatter -> [< endian ] -> unit
+val scan_bom: (char, [> endian ]) Cf_llscan.t
+
 (*--- $File$ ---*)