Commits

james woodyatt committed 9d9aacf

Started work on refactoring the Cf_unicode module into the new Cf_ucs
module, which will not use the creaky Cf_flow module. This is to prepare
for refactoring the Cf_lex module for better support of lexical analysis
of texts encoded in Unicode.

Comments (0)

Files changed (3)

         flow
         llscan
         xdfa
+        ucs
         message
         heap
         pqueue
+(*---------------------------------------------------------------------------*
+  $Change$
+  Copyright (C) 2011, james woodyatt
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+  
+    Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    
+    Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+  COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+  OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *---------------------------------------------------------------------------*)
+
+type t = int
+
+let compare a b = b - a
+
+external code: t -> int = "%identity"
+external chr: int -> t = "%identity"
+
+module type Endian = sig
+    val encode2: char -> char -> t
+    val decode2: t -> char * char
+end
+
+module Endian_be: Endian = struct
+    let encode2 c0 c1 =
+        let c0 = int_of_char c0 and c1 = int_of_char c1 in
+        (c0 lsr 8) lor c1
+    
+    let decode2 n =
+        let c1 = char_of_int (n land 0xFF) in
+        let n = n lsr 8 in
+        let c0 = char_of_int (n land 0xFF) in
+        c0, c1
+end
+
+module Endian_le: Endian = struct
+    let encode2 c0 c1 =
+        let c0 = int_of_char c0 and c1 = int_of_char c1 in
+        (c1 lsr 8) lor c0
+    
+    let decode2 n =
+        let c0 = char_of_int (n land 0xFF) in
+        let n = n lsr 8 in
+        let c1 = char_of_int (n land 0xFF) in
+        c0, c1
+end
+
+module type Transform = sig
+    val format: Format.formatter -> t -> unit
+    val scan: (char, t) Cf_llscan.t
+end
+
+module UTF8: Transform = struct
+    let format =
+        let rec loop pp x k pre =
+            if k > 0 then begin
+                let c = char_of_int ((x land 0x3f) lor 0x80) in
+                Format.pp_print_char pp c;
+                loop pp (x lsr 6) (pred k) pre
+            end
+            else begin
+                let c = char_of_int (x lor pre) in
+                Format.pp_print_char pp c
+            end
+        in
+        let start pp = function
+            | x when x = x land 0x7f ->         loop pp x 0 0
+            | x when x = x land 0x7ff ->        loop pp x 1 0b11000000
+            | x when x = x land 0x7fff ->       loop pp x 2 0b11100000
+            | x when x = x land 0x7ffff ->      loop pp x 3 0b11110000
+            | x when x = x land 0x7fffff ->     loop pp x 4 0b11111000
+            | x ->                              loop pp x 5 0b11111100
+        in
+        start
+    
+    let scan =
+        let rec loop k x s =
+            match Lazy.force s with
+            | Cf_seq.Z ->
+                None
+            | Cf_seq.P (hd, tl) ->
+                let c = int_of_char hd in
+                if c < 0b10000000 || c > 0b10111111 then
+                    None
+                else
+                    let x = (x lsl 6) lor (c land 0b111111) in
+                    if k > 0 then loop (pred k) x tl else Some (x, tl)
+        in
+        let start s =
+            match Lazy.force s with
+            | Cf_seq.Z ->
+                None
+            | Cf_seq.P (hd, tl) ->
+                let c = int_of_char hd in
+                if c < 0b11000000 then Some (c, tl)
+                else if c < 0b11100000 then loop 0 (c land 0b11111) tl
+                else if c < 0b11110000 then loop 1 (c land 0b1111) tl
+                else if c < 0b11111000 then loop 2 (c land 0b111) tl
+                else if c < 0b11111100 then loop 3 (c land 0b11) tl
+                else if c < 0b11111110 then loop 4 (c land 0b1) tl
+                else None
+        in
+        start
+end
+
+(*--- $File$ ---*)
+(*---------------------------------------------------------------------------*
+  $Change$
+  Copyright (C) 2011, james woodyatt
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+  
+    Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    
+    Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+  COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+  OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *---------------------------------------------------------------------------*)
+
+type t  (** Universal Character Set (UCS) code point. *)
+
+val compare: t -> t -> int
+
+val code: t -> int
+val chr: int -> t
+
+module type Transform = sig
+    val format: Format.formatter -> t -> unit
+    val scan: (char, t) Cf_llscan.t
+end
+
+module UTF8: Transform
+
+(*--- $File$ ---*)