Commits

james woodyatt committed 1eb6e5c

More progress toward refactored modules to support lexical analysis
and LL(x) parsing of both ISO-8891 and Unicode texts.

  • Participants
  • Parent commits 3ccc8d5
  • Branches sideline

Comments (0)

Files changed (13)

 The distribution builds with OMake and includes the following subpackages:
 
   oni.cf    - Core foundation [included by default]
+  oni.ucs   - Unicode character set
   oni.nx    - Extended socket interface
   oni.iom   - I/O event multiplexing
   oni.xml   - XML stream read/write utility

File cf/OMakefile

         dyn
         deque
         flow
-        llscan
-        xdfa
-        message
         heap
         pqueue
         map
         set
         sbheap
         rbtree
+        llscan
+        xdfa
+        regx
+        clex
+        fmt_llscan
+        message
         gadget
         state_gadget
         machine

File cf/cf_clex.ml

+(*---------------------------------------------------------------------------*
+  $Change$
+  Copyright (c) 2005-2010, James H. Woodyatt
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+  
+    Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    
+    Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+  COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+  OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *---------------------------------------------------------------------------*)
+
+module DFA = Cf_regx.DFA
+
+type x = DFA.x
+type 'a r = 'a DFA.r
+type 'a t = 'a DFA.t
+
+let nil = DFA.nil
+let create r = DFA.create r
+
+module type Expr_Op_T = sig
+    val ( $| ): x -> x -> x
+    val ( $& ): x -> x -> x
+    
+    val ( !* ): x -> x
+    val ( !+ ): x -> x
+    val ( !? ): x -> x
+    val ( !: ): char -> x
+    val ( !^ ): (char -> bool) -> x
+    val ( !~ ): char Cf_seq.t -> x
+    val ( !$ ): string -> x
+end
+
+external identity: 'a -> 'a = "%identity"
+
+module Op = struct
+    include DFA.Op
+    
+    let ( !~ ) = Cf_regx.expr_of_seq
+    let ( !$ ) = Cf_regx.expr_of_string
+    let ( $> ) e f = e $> (fun z -> f (Cf_seq.to_string z))
+    let ( ?~ ) e = DFA.create (e $> identity)
+    let ( ?$ ) s = DFA.create (Cf_regx.expr_of_string s $> identity)
+end
+
+(*--- $File$ ---*)

File cf/cf_clex.mli

+(*---------------------------------------------------------------------------*
+  $Change$
+  Copyright (c) 2005-2010, James H. Woodyatt
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+  
+    Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    
+    Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+  COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+  OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *---------------------------------------------------------------------------*)
+
+(** Lexical analysis with functional composition of regular grammars. *)
+
+(** {6 Overview}
+    
+    This module implements functional parsers on octet character sequences
+    using regular expressions and functional composition of lazy deterministic
+    finite automata.
+*)
+
+(** {6 Core Interface} *)
+
+(** The type of regular expressions. *)
+type x = Cf_regx.DFA.x
+
+(** The type of lexical analysis rules. *)
+type 'a r = 'a Cf_regx.DFA.r
+
+(** Character stream parser. *)
+type 'a t = (char, 'a) Cf_llscan.t
+
+(** Epsilon, i.e. a subexpression that matches an empty input sequence. *)
+val nil: x
+
+(** Use [create r] to compose a lexical analyzer from the rule [r]. *)
+val create: 'a r -> 'a t
+
+(** Open this module to bring the operator functions for simple parsers into
+    the current scope.
+*)
+module Op: sig
+    
+    (** Alternating composition.  Use [a $| b] to compose an expression that
+        matches either expression [a] or expression [b].
+    *)
+    val ( $| ): x -> x -> x
+    
+    (** Serial composition.  Use [a $& b] to compose an expression that matches
+        expression [a] followed by expression [b].
+    *)
+    val ( $& ): x -> x -> x
+    
+    (** Star composition.  Use [!*a] to compose an expression that matches zero
+        or any number of instances of [a].
+    *)
+    val ( !* ): x -> x
+    
+    (** Plus composition.  Use [!+a] to compose an expression that matches one
+        or more instances of [a].
+    *)
+    val ( !+ ): x -> x
+    
+    (** Optional composition.  Use [!?a] to compose an expression that matches
+        zero or one instance of [a].
+    *)
+    val ( !? ): x -> x
+    
+    (** Character literal.  Use [!:c] to compose an expression that matches the
+        character [c].
+    *)
+    val ( !: ): char -> x
+    
+    (** Character set.  Use [!^f] to compose an expression that matches any
+        character for which the satisfier function [f] returns [true].
+    *)
+    val ( !^ ): (char -> bool) -> x
+    
+    (** Regular expression sequence.  Use [!~z] to parse the sequence [z]
+        according to the grammar defined in {!Cf_regx} module and compose
+        an expression that matches input accordingly.  Raises {!Cf_regx.Error}
+        if the sequence is not a regular expression.
+    *)
+    val ( !~ ): char Cf_seq.t -> x
+    
+    (** Regular expression string.  Use [!~s] to parse the string [s] according
+        to the grammar defined in {!Cf_regx} module and compose an expression
+        that matches input accordingly.  Raises {!Cf_regx.Error} if the string
+        is not a regular expression.
+    *)
+    val ( !$ ): string -> x
+    
+    (** Literal token rule.  Use [e $= obj] to compose a rule that outputs the
+        literal object [obj] when the expression [e] is recognized.
+    *)
+    val ( $= ): x -> 'a -> 'a r
+    
+    (** String token rule.  Use [e $> f] to compose a rule that applies the
+        string recognized by the expression [e] to the tokenizer function [f]
+        to produce its result.
+    *)
+    val ( $> ): x -> (string -> 'a) -> 'a r
+    
+    (** Advanced token rule.  Use [e $@ f] to compose a rule that applies the
+        length of the character sequence recognized by the expression [e] to
+        the advanced tokenizer function [f] to obtain a parser that produces
+        the output of the rule and makes any other manipulations necessary to
+        continue parsing the input stream.  If the parser returned by [f] does
+        not recognize the input, then no output is produced and no other rules
+        are matched.
+    *)
+    val ( $@ ): x -> (int -> 'a t) -> 'a r
+    
+    (** Rule aggregation.  Use this operator to combine a list of rules into a
+        single rule.
+    *)
+    val ( !@ ): 'a r list -> 'a r
+    
+    (** String parser.  Use [?~x] to create a simple parser that recognizes any
+        string that matches the expression [x].  {b Note:} Care should be taken
+        when composing parsers with this operator to keep the lazy DFA from
+        being recreated in every pass.
+    *)
+    val ( ?~ ): x -> string t
+    
+    (** String parser.  Use [?$s] to create a simple parser that recognizes any
+        string that matches the regular expression specified in the string [s]
+        according to the grammar in the {!Cf_regx} module.  Raises
+        {!Cf_regx.Error} if the string is not a regular expression.  {b Note:}
+        Care should be taken when composing parsers with this operator to keep
+        from parsing the argument string in every pass.
+    *)
+    val ( ?$ ): string -> string t
+end
+
+(*--- $File$ ---*)

File cf/cf_fmt_llscan.ml

+(*---------------------------------------------------------------------------*
+  $Change$
+  Copyright (c) 2004-2010, James H. Woodyatt
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+  
+    Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    
+    Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+  COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+  OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *---------------------------------------------------------------------------*)
+
+exception No_match
+
+class virtual ['i] scanner z =
+    object(self)
+        val mutable next_: 'i Cf_seq.t = z
+        
+        method private virtual get: char
+        
+        method init = Scanf.Scanning.from_function (fun () -> self#get)
+        method fini = next_
+    end
+
+let cscanf cf ef fmt rf z =
+    let s = cf z in
+    let ef0 _ x = ef s x in
+    try
+        let v = Scanf.kscanf s#init ef0 fmt rf in
+        let z = s#fini in
+        Some (v, z)
+    with
+    | No_match ->
+        None
+
+class lex_scanner z =
+    object
+        inherit [char] scanner z
+                
+        method private get =
+            match Lazy.force next_ with
+            | Cf_seq.Z ->
+                raise End_of_file
+            | Cf_seq.P (hd, tl) ->
+                next_ <- tl;
+                hd
+    end
+
+let scanf fmt rf z =
+    let ef _ = raise No_match in
+    cscanf (new lex_scanner) ef fmt rf z
+
+(*--- $File$ ---*)

File cf/cf_fmt_llscan.mli

+(*---------------------------------------------------------------------------*
+  $Change$
+  Copyright (c) 2004-2010, James H. Woodyatt
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+  
+    Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    
+    Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+  COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+  OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *---------------------------------------------------------------------------*)
+
+(** Lexical analysis with functional composition using [Scanf] scanners. *)
+
+(** {6 Overview}
+
+    This module implements and extension to the {!Cf_llscan} module for mixing
+    calls to the standard library [Scanf] functions with functional parsers.
+*)
+
+(** {6 Classes and Types} *)
+
+(** An exception provided so that the [cscanf] function (below) can be signaled
+    to transform its answer with the effect that the parser stack is unwound
+    until an alternative production can be matched.
+*)
+exception No_match
+
+(** A virtual base class used in the [cscanf] function (below) for constructing
+    a scanning buffer from an input sequence.
+*)
+class virtual ['i] scanner:
+    'i Cf_seq.t ->      (** The input sequence *)
+    object
+        val mutable next_: 'i Cf_seq.t  (** The next unmatched input symbol *)
+        
+        (** Get the next character for the scanning buffer *)
+        method private virtual get: char
+        
+        (** Initialize the scanning buffer *)
+        method init: Scanf.Scanning.scanbuf
+        
+        (** Finalize the scanning buffer and return the next unmatched input
+            symbol.
+        *)
+        method fini: 'i Cf_seq.t
+    end
+
+(** {6 Functions} *)
+
+(** This is the primitive function in the module.  Use [cscanf cf ef fmt rf] to
+    construct a parser that applies [cf] to the input sequence to acquire a
+    scanner object [s], invokes the [s#init] method to obtain a scanning buffer
+    with which to apply [Scanf.kscanf], using the exception function [ef], the
+    scanning format [fmt] and the return continuation [rf].  If the exception
+    function raises [No_match] then the resulting parser unwinds to the next
+    production alternative, otherwise the parser answers with the result of the
+    return continuation.
+*)
+val cscanf:
+    ('i Cf_seq.t -> ('i #scanner as 's)) -> ('s -> exn -> 'o) ->
+    ('f, Scanf.Scanning.scanbuf, 'u, 'f -> 'o, 'f -> 'o, 'o) format6 -> 'f ->
+    ('i, 'o) Cf_llscan.t
+
+(** Use [scanf fmt rf] to construct a lexical parser that scans the input text
+    according to the scanning format [fmt] and produces the value returned by
+    the return continuation.  If the scanner raises an exception, then the
+    parser unwinds to the next production alternative.
+*)
+val scanf:
+    ('f, Scanf.Scanning.scanbuf, 'u, 'f -> 'o, 'f -> 'o, 'o) format6 -> 'f ->
+    (char, 'o) Cf_llscan.t
+
+(*--- $File$ ---*)

File cf/cf_regx.ml

+(*---------------------------------------------------------------------------*
+  $Change$
+  Copyright (c) 2005-2010, James H. Woodyatt
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+  
+    Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    
+    Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+  COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+  OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *---------------------------------------------------------------------------*)
+
+module Symbol = struct
+    type t = char and 'a map = 'a array
+    let map f = Array.init 256 (fun n -> f (char_of_int n))
+    let get m c = Array.unsafe_get m (int_of_char c)
+end
+
+module DFA = Cf_xdfa.Create(Symbol)
+
+exception Error of string
+        
+module P = Cf_llscan
+open P.Op
+open DFA.Op
+
+let p_digit_ =
+    P.sat (function '0'..'9' -> true | _ -> false) >>= fun c ->
+    P.ret (Char.code c - 48)
+
+let p_hexdigit_ =
+    let base_ c =
+        if c >= 'a' then 87 else if c >= 'A' then 55 else 48
+    in
+    P.sat begin
+        function ('0'..'9' | 'a'..'f' | 'A'..'F') -> true | _ -> false
+    end >>= fun c ->
+    P.ret (Char.code c - base_ c)
+
+let p_backtick_ = ?.'`'
+
+let p_any_ = P.sat (fun _ -> true)
+
+let esc_ch_list_ =
+    let hexcode_ _ =
+        p_hexdigit_ >>= fun a ->
+        p_hexdigit_ >>= fun b ->
+        P.ret (Char.chr (a * 16 + b))
+    in
+    let deccode_ chA =
+        let a = Char.code chA - 48 in
+        p_digit_ >>= fun b ->
+        p_digit_ >>= fun c ->
+        let code = a * 100 + b * 10 + c in
+        if code > 255 then P.nil else P.ret (Char.chr code)
+    in
+    let control_ _ =
+        P.sat begin function
+            | '@'..'_' | 'a'..'z' -> true
+            | _ -> false
+        end >>= fun c ->
+        let n = Char.code c in
+        let n = if n >= 97 then n - 96 else n - 64 in
+        P.ret (Char.chr n)
+    in
+    let newline_ _ = P.ret '\x0A' in
+    let tab_ _ = P.ret '\x09' in
+    let return_ _ = P.ret '\x0D' in
+    [
+        'n', newline_;
+        't', tab_;
+        'r', return_;
+        'x', hexcode_;
+        'c', control_;
+        '0', deccode_;
+        '1', deccode_;
+        '2', deccode_;
+        '`', P.ret;
+    ]
+
+let ch_class_ =
+    let l_bracket = ?.'[' in
+    let r_bracket = ?.']' in
+    let hyphen = ?.'-' in
+    let eq (c1 : char) (c2 : char) = (c1 = c2) in
+    let raw_ch = P.sat (function '-' | ']' -> false | _ -> true) in
+    let esc_ch =
+        let mapF (ch, f) = ?.ch >>= f in
+        let aux = P.alt (List.map mapF ((']', P.ret) :: esc_ch_list_)) in
+        p_backtick_ >>= fun _ ->
+        aux
+    in
+    let single_ch = P.alt [ esc_ch; raw_ch ] in
+    let range: (char, char -> bool) P.t =
+        single_ch >>= fun a ->
+        hyphen >>= fun _ ->
+        single_ch >>= fun b ->
+        P.ret (fun ch -> ch >= a && ch <= b)
+    in
+    let eqLift p = p >>= fun ch -> P.ret (fun c -> c = ch) in
+    let esc_set =
+        let alpha = function 'A'..'Z' | 'a'..'z' -> true | _ -> false in
+        let digit = function '0'..'9' -> true | _ -> false in
+        let alnum ch = alpha ch || digit ch in
+        let specifier = P.alt [
+            (?.'a' >>= fun _ -> P.ret alpha);
+            (?.'d' >>= fun _ -> P.ret digit);
+            (?.'i' >>= fun _ -> P.ret alnum);
+        ] in
+        p_backtick_ >>= fun _ ->
+        specifier
+    in
+    let single = P.alt (List.map eqLift [ esc_ch; raw_ch ]) in
+    let hyphen_ch = P.tok (function '-' -> Some (eq '-') | _ -> None) in
+    let atom0 = P.alt [ hyphen_ch; esc_set; range; single ] in
+    let atomN = P.alt [ esc_set; range; single ] in
+    let atomlist =
+        let existF ch f = f ch in
+        atom0 >>= fun hd ->
+        ?*atomN >>= fun tl ->
+        P.ret (fun ch -> List.exists (existF ch) (hd :: tl))
+    in
+    let negate =
+        ?/(?.'^') >>= function None -> P.ret false | _ -> P.ret true
+    in
+    l_bracket >>= fun _ ->
+    negate >>= fun _ ->
+    atomlist >>= fun f ->
+    r_bracket >>= fun _ ->
+    P.ret !^f
+
+let esc_expr_list_ =
+    let meta_ ch = P.ret !:ch in
+    let alpha_ _ =
+        P.ret !^(function 'A'..'Z' | 'a'..'z' -> true | _ -> false)
+    in
+    let alnum_ _ =
+        P.ret begin
+            !^begin function
+                | '0'..'9' | 'A'..'Z' | 'a'..'z' -> true
+                | _ -> false
+            end
+        end
+    in
+    let digit_ _ = P.ret !^(function '0'..'9' -> true | _ -> false) in
+    let sat_white_ x = function '\009'..'\013' | '\032' -> x | _ -> not x in
+    let white_ _ = P.ret !^(sat_white_ true) in
+    let nonwhite_ _ = P.ret !^(sat_white_ false) in
+    [
+        'a', alpha_;
+        'i', alnum_;
+        'd', digit_;
+        's', white_;
+        'w', nonwhite_;
+        '.', meta_;
+        '?', meta_;
+        '*', meta_;
+        '+', meta_;
+        '(', meta_;
+        ')', meta_;
+        '|', meta_;
+        '[', meta_;
+        ']', meta_;
+        '^', meta_;
+        '$', meta_;
+    ]
+
+let esc_expr_ =        
+    let p_escape_ fLst =
+        p_backtick_ >>= fun _ ->
+        P.alt (List.map (fun (ch, f) -> ?.ch >>= f) fLst)
+    in
+    let esc_chx_list_ =
+        List.rev_map begin fun (c, f) ->
+            c, fun x -> f x >>= fun y -> P.ret !:y
+        end esc_ch_list_
+    in
+    p_escape_ (List.rev_append esc_chx_list_ esc_expr_list_)
+
+let expr_llscan =
+    let symbol =
+        let f = function
+            | '\x00'..'\x1f' | '?' | '*' | '+' | '(' | ')' | '|'
+            | '\x7f'..'\xff' -> false
+            | _ -> true
+        in
+        P.sat f >>= fun c ->
+        P.ret !:c
+    in
+    let dot = ?.'.' >>= fun _ -> P.ret !^(fun c -> c <> '\n') in
+    let star x = ?.'*' >>= fun _ -> P.ret !*x in
+    let plus x = ?.'+' >>= fun _ -> P.ret !+x in
+    let question x = ?.'?' >>= fun _ -> P.ret !?x in
+    let postfix x = P.alt [ star x; plus x; question x; P.ret x ] in
+    let rec expr _ =
+        term () >>= fun x ->
+        ?* (?.'|' >>= fun _ -> term ()) >>= fun y ->
+        P.ret (List.fold_left (fun x y -> x $| y) x y)
+    and term () =
+        ?+(factor () >>= postfix) >>= fun (hd, tl) ->
+        P.ret (List.fold_left (fun x y -> x $& y) hd tl)
+    and factor () = P.alt [ group (); ch_class_; esc_expr_; dot; symbol ]
+    and group () =
+        ?.'(' >>= fun _ ->
+        expr DFA.nil >>= fun x ->
+        ?.')' >>= fun _ ->
+        P.ret x
+    in
+    expr DFA.nil
+
+let expr_of_seq z =
+    match expr_llscan z with
+    | Some (v, _) -> v
+    | _ -> raise (Error (Cf_seq.to_string z))
+
+let expr_of_string s = expr_of_seq (Cf_seq.of_string s)
+
+let quote =
+    let esc_ =
+        [ '`'; '.'; '?'; '*'; '+'; '('; ')'; '|'; '['; ']'; '^'; '$' ]
+    in
+    let rec loop c =
+        let w = Lazy.lazy_from_val (Cf_flow.Q loop) in
+        match c with
+        | _ when List.exists (fun c' -> c == c') esc_ ->
+            let w = Lazy.lazy_from_val (Cf_flow.P (c, w)) in
+            Cf_flow.P ('`', w)
+        | _ ->
+            Cf_flow.P (c, w)
+    in
+    Lazy.lazy_from_val (Cf_flow.Q loop)
+
+let unquote =
+    let rec loop c =
+        let w = Lazy.lazy_from_val (Cf_flow.Q loop) in
+        match c with
+        | '`' ->
+            Cf_flow.Q begin fun c ->
+                Cf_flow.P (c, Lazy.lazy_from_val (Cf_flow.Q loop))
+            end
+        | _ ->
+            Cf_flow.P (c, w)
+    in
+    Lazy.lazy_from_val (Cf_flow.Q loop)
+
+type t = int DFA.t
+
+let of_expression x = DFA.create (x $@ (fun n z -> Some (n, Cf_seq.shift n z)))
+let of_seq z = of_expression (expr_of_seq z)
+let of_string s = of_expression (expr_of_string s)
+
+let test r s =
+    let z = Cf_seq.of_string s in
+    match r z with
+    | Some (n, _) when n = String.length s -> true
+    | _ -> false
+
+let search =
+    let rec loop r pos z =
+        match r z with
+        | Some (n, _) ->
+            pos, n
+        | None ->
+            match p_any_ z with
+            | Some (_, z) ->
+                loop r (succ pos) z
+            | None ->
+                raise Not_found
+    in
+    fun r -> loop r 0
+
+let rec separate r z =
+    lazy begin
+        try
+            let pos, len = search r z in
+            let s = Cf_seq.limit pos z in
+            let z = Cf_seq.shift (pos + len) z in
+            Cf_seq.P (s, separate r z)
+        with
+        | Not_found ->
+            Cf_seq.P (z, Cf_seq.nil)
+    end
+
+let split =
+    let rec loop r s pos acc z =
+        match
+            try Some (search r z) with Not_found -> None
+        with
+        | Some (pos', len') ->
+            let x = String.sub s pos pos' in
+            let pos = pos + pos' + len' in
+            let z = Cf_seq.shift (pos' + len') z in
+            loop r s pos (x :: acc) z
+        | None ->
+            List.rev_append acc [ Cf_seq.to_string z ]
+    in
+    fun r s ->
+        loop r s 0 [] (Cf_seq.of_string s)
+
+let llscan r z =
+    match r z with
+    | Some (n, tl) -> Some (Cf_seq.to_string (Cf_seq.limit n z), tl)
+    | None -> None
+
+(*--- $File$ ---*)

File cf/cf_regx.mli

+(*---------------------------------------------------------------------------*
+  $Change$
+  Copyright (c) 2005-2010, James H. Woodyatt
+  All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+  
+    Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+    
+    Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution
+  
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+  COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+  OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *---------------------------------------------------------------------------*)
+
+(** Regular expression parsing, search and matching. *)
+
+(** {6 Overview}
+
+    This module implements regular expression parsing, search and matching in
+    pure Objective Caml.  The grammar for regular expressions is a little
+    unconventional.  Instead of using a backslash as the escape character, the
+    backtick character is used instead.  This makes it easy to write regular
+    expressions in string literals.
+    
+    Use any of the following constructions in regular expressions:
+    - [`n     ] Matches LF ("newline") character.
+    - [`t     ] Matches TAB character.
+    - [`r     ] Matches RETURN character.
+    - [`a     ] Matches an alphabetical character.
+    - [`d     ] Matches a decimal digit character.
+    - [`i     ] Matches an alphanumerical character.
+    - [`s     ] Matches a TAB, LF, VT, FF, CR or SPACE (whitespace) character.
+    - [`w     ] Matches a character other than a whitespace character.
+    - [`xNN   ] Matches the character with hexadecimal code [NN].
+    - [`DDD   ] Matches the character with decimal code [DDD], where DDD is a
+        three digit number between [000] and [255].
+    - [`c_    ] Matches the control character corresponding to the subsequent
+        printable character, e.g. [`cA] is CONTROL-A, and [`c\[] is ESCAPE.
+    - [.      ] Matches any character except newline.
+    - [*      ] (postfix) Matches the preceding expression, zero, one or
+        several times in sequence.
+    - [+      ] (postfix) Matches the preceding expression, one or several
+        times in sequence.
+    - [?      ] (postfix) Matches the preceding expression once or not at all.
+    - [[..]   ] Character set.  Ranges are denoted with ['-'], as in [[a-z]].
+        An initial ['^'], as in [[^0-9]], complements the set.  Special
+        characters in the character set syntax may be included in the set by
+        escaping them with a backtick, e.g. [[`^```\]]] is a set containing
+        three characters: the carat, the backtick and the right bracket
+        characters.
+    - [(..|..)] Alternatives.  Matches one of the expressions between the
+        parentheses, which are separated by vertical bar characters.
+    - [`_     ] Escaped special character.  The special characters are ['`'],
+        ['.'], ['*'], ['+'], ['?'], ['('], ['|'], [')'], ['\['].
+*)
+
+(** {6 Modules} *)
+
+(** The deterministic finite automata on octet character symbols. *)
+module DFA: Cf_xdfa.T with type S.t = char
+
+(** {6 Exceptions} *)
+
+(** An error parsing the specified string as a regular expression. *)
+exception Error of string
+
+(** {6 Types} *)
+
+(** An abstract type representing a regular expression. *)
+type t
+
+(** {6 Functions} *)
+
+(** A LL(x) parser combinator on character streams that recognizes a regular
+    expression and produces a DFA expression for it.
+*)
+val expr_llscan: (char, DFA.x) Cf_llscan.t
+
+(** Use [expr_of_seq z] to evaluate the character sequence [z] as a regular
+    expression and produce the corresponding DFA expression.  Raises [Error] if
+    the sequence is not a valid expression.
+*)
+val expr_of_seq: char Cf_seq.t -> DFA.x
+
+(** Use [expr_of_string s] to evaluate the string [s] as a regular expression
+    and produce the corresponding DFA expression.  Raises [Error] if the string
+    is not a valid expression.
+*)
+val expr_of_string: string -> DFA.x
+
+(** A character flow that quotes all the special characters in the input so
+    that the output may be used in a regular expression to match the input
+    exactly.
+*)
+val quote: (char, char) Cf_flow.t
+
+(** A character flow that unquotes all the quoted special characters in the
+    input so that the output may by used in a regular expression to match the
+    specified pattern.
+*)
+val unquote: (char, char) Cf_flow.t
+
+(** Use [of_expression x] to produce a regular expression from the DFA
+    expression [x].
+*)
+val of_expression: DFA.x -> t
+
+(** Use [of_seq z] to ingest the whole character sequence [z], parse it and
+    produce a regular expression.  Raises [Error s] if the sequence is
+    not a valid regular expression, with [s] containing the string composed of
+    the characters in the sequence.
+*)
+val of_seq: char Cf_seq.t -> t
+
+(** Use [of_string s] to produce a regular expression from the string [s].
+    Raises [Error s] if the string is not a valid regular expression.
+*)
+val of_string: string -> t
+
+(** Use [test r s] to test whether the string [s] matches the regular
+    expression [r].
+*)
+val test: t -> string -> bool
+
+(** Use [search r z] to search the character sequence [z] for a pattern that
+    matches the regular expression [r].  Returns [(pos, len)], where [pos] is
+    the number of characters into the sequence where the matching sequence
+    begins, and [len] is the number matching characters.
+*)
+val search: t -> char Cf_seq.t -> int * int
+
+(** Use [separate r z] to map the character sequence [z] into the sequence of
+    sequences found between matches for the regular expression [r].
+*)
+val separate: t -> char Cf_seq.t -> char Cf_seq.t Cf_seq.t
+
+(** Use [split r s] to produce a list of strings by searching [s] left to right
+    for blocks of characters between patterns that match the regular expression
+    [r].
+*)
+val split: t -> string -> string list
+
+(** Use [llscan r] to produce a LL(x) parser that matches the input stream to
+    the regular expression [r] and returns the corresponding string value.
+*)
+val llscan: t -> (char, string) Cf_llscan.t
+
+(*--- $File$ ---*)

File cf/cf_xdfa.ml

 module N_set = Cf_rbtree.Set(Cf_ordered.Int_order)
 module N_map = Cf_rbtree.Map(Cf_ordered.Int_order)
 
-let identity x = x
+external identity: 'a -> 'a = "%identity"
 
 module type Symbol = sig
     type t and 'a map

File cf/t/t_cf.ml

 
 module T6 = struct
     open Printf
-    open Cf_lex.Op
-    open Cf_parser.Op
+    open Cf_clex.Op
+    open Cf_llscan.Op
         
     module L1 = struct        
         let test1 lexer =
     
     module L2 = struct
         let p =
-            let q0 = Cf_scan_parser.scanf "%3u" (fun y -> y) in
-            let q1 = Cf_scan_parser.scanf "%c" (fun y -> y) in
-            let q2 = Cf_scan_parser.scanf "%3u" (fun y -> y) in
+            let q0 = Cf_fmt_llscan.scanf "%3u" (fun y -> y) in
+            let q1 = Cf_fmt_llscan.scanf "%c" (fun y -> y) in
+            let q2 = Cf_fmt_llscan.scanf "%3u" (fun y -> y) in
             q0 >>= fun v0 ->
             q1 >>= fun v1 ->
             q2 >>= fun v2 ->
-            ~:(v0, v1, v2)
+            Cf_llscan.ret (v0, v1, v2)
                 
         let y s = p (Cf_seq.of_string s)
         
         ]
         
         let loop (a, b) =
-            let s = Cf_flow.commute_string Cf_regex.unquote b in
+            let s = Cf_flow.commute_string Cf_regx.unquote b in
             if s <> a then
                 failwith (Printf.sprintf "unquoting \"%s\" <> \"%s\"" s a);
-            let s = Cf_flow.commute_string Cf_regex.quote a in
+            let s = Cf_flow.commute_string Cf_regx.quote a in
             if s <> b then
                 failwith (Printf.sprintf "quoting \"%s\" <> \"%s\"" s b);
             ()
     
     module L4 = struct
         let lex =
-            Cf_lex.create !@[
+            Cf_clex.create !@[
                 !$"`s+" $= None;
                 !$"`w+" $> (fun x -> Some x);
             ]
         
         let p =
             loop [] >>= fun arr ->
-            Cf_parser.fin >>= fun () ->
-            ~:arr
+            Cf_llscan.fin >>= fun () ->
+            Cf_llscan.ret arr
         
         let input_ = "bim bam boom"
         
         ]
         
         let yes_ s x v =
-            if not (Cf_regex.test x v) then
+            if not (Cf_regx.test x v) then
                 jout#fail "Expected '%s' to match expression '%s'." v s
         
         let no_ s x v =
-            if Cf_regex.test x v then
+            if Cf_regx.test x v then
                 jout#fail "Expected '%s' to match expression '%s'." v s
         
         let test1 (s, yes, no) =
-            let x = Cf_regex.of_string s in
+            let x = Cf_regx.of_string s in
             List.iter (yes_ s x) yes;
             List.iter (no_ s x) no
         
     
     let test () =
         let lexer =
-            Cf_lex.create begin
+            Cf_clex.create begin
                 (!*(!:'a' $| !:'b')) $& !$"abb" $> (fun x -> x)
             end
         in
 end
 
 module T7 = struct
-    open Cf_parser.Op
+    open Cf_llscan.Op
     
     let digit =
         let zero = int_of_char '0' in
-        let sat = Cf_parser.sat (function '0'..'9' -> true | _ -> false) in
-        sat >>= fun ch -> ~:((int_of_char ch) - zero)
+        let sat = Cf_llscan.sat (function '0'..'9' -> true | _ -> false) in
+        sat >>= fun ch -> Cf_llscan.ret ((int_of_char ch) - zero)
         
     let unumber =
         let rec to_int x = function
             | [] -> x
             | hd :: tl -> to_int (hd + (x * 10)) tl
         in
-        ?+digit >>= fun (d0, dn) -> ~:(to_int 0 (d0 :: dn))
+        ?+digit >>= fun (d0, dn) -> Cf_llscan.ret (to_int 0 (d0 :: dn))
     
     let plus = ?. '+'
     let minus = ?. '-'
     let div = ?. '/'
     let lparen = ?. '('
     
-    let rparen = Cf_parser.alt [
+    let rparen = Cf_llscan.alt [
         ?. ')';
-        Cf_parser.err ~f:(fun _ -> failwith "Expected ')'") ();
+        Cf_llscan.err ~f:(fun _ -> failwith "Expected ')'") ();
     ]
     
-    let number = Cf_parser.alt [
+    let number = Cf_llscan.alt [
         unumber;
         
         begin
             plus >>= fun _ ->
             unumber >>= fun n ->
-            ~:n
+            Cf_llscan.ret n
         end;
         
         begin
             minus >>= fun _ ->
             unumber >>= fun n ->
-            ~:(-n)
+            Cf_llscan.ret (-n)
         end;
     ]
     
     type fval_t = Mul of int | Div of int
     
-    let rec expr s = Cf_parser.alt [
+    let rec expr s = Cf_llscan.alt [
         begin
             term >>= fun hd ->
             ?*expr_c >>= fun tl ->
-            ~:(List.fold_left (+) hd tl)
+            Cf_llscan.ret (List.fold_left (+) hd tl)
         end;
         
         term;
     ] s
     
-    and expr_c s = Cf_parser.alt [
+    and expr_c s = Cf_llscan.alt [
         begin
             plus >>= fun _ ->
             term >>= fun n ->
-            ~:n
+            Cf_llscan.ret n
         end;
         
         begin
             minus >>= fun _ ->
             term >>= fun n ->
-            ~:(-n)
+            Cf_llscan.ret (-n)
         end;
     ] s
     
     and term =
         let f a = function Mul x -> a * x | Div x -> a / x in
-        fun s -> Cf_parser.alt [
+        fun s -> Cf_llscan.alt [
             begin
                 factor >>= fun hd ->
                 ?*term_c >>= fun tl ->
-                ~:(List.fold_left f hd tl)
+                Cf_llscan.ret (List.fold_left f hd tl)
             end;
             
             term;
         ] s
     
-    and term_c s = Cf_parser.alt [
+    and term_c s = Cf_llscan.alt [
         begin
             mult >>= fun _ ->
             factor >>= fun n ->
-            ~:(Mul n)
+            Cf_llscan.ret (Mul n)
         end;
         
         begin
             div >>= fun _ ->
             factor >>= fun n ->
             if n = 0 then failwith "Div 0";
-            ~:(Div n)
+            Cf_llscan.ret (Div n)
         end;
     ] s
     
-    and factor s = Cf_parser.alt [
+    and factor s = Cf_llscan.alt [
         begin
             lparen >>= fun _ ->
             expr >>= fun e ->
             rparen >>= fun _ ->
-            ~:e
+            Cf_llscan.ret e
         end;
         
         number
     let run s =
         begin
             expr >>= fun e ->
-            Cf_parser.fin >>= fun () ->
-            ~:e
+            Cf_llscan.fin >>= fun () ->
+            Cf_llscan.ret e
         end s
     
     let calc s =
   for related issues.
 
 + Refactored the functional parser modules.  The Cf_llscan module now has the
-  core LL(x) parsing monad.
-
+  core LL(x) parsing monad.  The old X submodules in Cf_parser, Cf_dfa and
+  cognates has been removed to be closer to where they make sense: in the
+  modules intended to support lexical analysis of ISO-8859 texts.
+  
 ==== Version 1.0 ===
 
 This release marks the repackaging of the separately distributed libraries in

File etc/PROBLEMS

 
 + (Cf_rbtree): Document the complexity of the [incr_next] and [decr_next]
     functions.  The binary set functions could be improved for performance by
-    using recursive [join] and [split] functions.  Need set disjoint.
+    using recursive [join] and [split] functions.
 
 + (Cf_gadget): Write some documentation about polymorphic variants as channel
     type parameters.

File ucs/OMakefile

         LIBREFS[]=
             $(OCaml_library_referral.new $(M.cf), cf, mixed)
             $(OCaml_library_referral.new $(M.ucs), ucs, pure)
-        PACKAGES= oni.cf oUnit
+        PACKAGES= oUnit
         COMPONENTS= $(M.cf)
     
     ### Construct the composition unit for the ucs library