(* oni / cf / cf_dfa.mli *)
(*---------------------------------------------------------------------------*
  $Change$
  Copyright (C) 2011, james woodyatt
  All rights reserved.
  
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
  
    Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
    
    Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in
    the documentation and/or other materials provided with the
    distribution
  
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
  OF THE POSSIBILITY OF SUCH DAMAGE. 
 *---------------------------------------------------------------------------*)

(** Functional composition of lazy deterministic finite automata. *)

(** {6 Overview}

    This module implements operators for functional composition of lazy
    deterministic finite automata (DFA).  A lazy DFA is more efficient at
    recognizing regular grammars than a non-deterministic finite automaton,
    and the lazy evaluation amortizes the cost of compiling the state table,
    so that its overall cost compares well to that of an NFA.
    
    The interface defined here is used as the underlying algorithm for the
    {!Cf_clex} module.  It uses a functor that operates on a module defining
    the type of a symbol, the type of parser input tokens that contain such
    symbols, and a map of symbols to some polymorphic type.  The result of the
    functor is a module that contains operator functions for composing
    expressions and rules for automata that operate on streams of the input
    symbol type.
    
    {b Note}: a DFA can be remarkably inefficient compared to an NFA for
    certain classes of unusual grammars and unusual input.
*)

(** {6 Module Types} *)

(** The type of the input module for the [Create(S: Symbol)] functor defined
    below.  It describes the alphabet of input symbols and an efficient map
    keyed by those symbols.
*)
module type Symbol = sig
    (** The type of input symbols recognized by the automaton. *)
    type t
        
    (** The type of maps from symbols to values of a polymorphic type. *)
    type 'a map
        
    (** The engine uses [map f] to construct a map from symbols to state
        transitions, by associating each symbol [s] with the value [f s].
    *)
    val map: (t -> 'a) -> 'a map
    
    (** The engine uses [get m s] to get the state transition from map [m] for
        the symbol [s].
    *)
    val get: 'a map -> t -> 'a
end

(** The output of the [Create(S: Symbol)] functor, which is a module that
    can be used to compose deterministic finite automata which operate on
    symbols of the type specified.
*)
module type T = sig
    
    (** The module used as the input to the [Create(S: Symbol)] functor. *)
    module S: Symbol
    
    (** The type of an expression in the regular grammar of an automaton.
        Expressions are composed with the operators in {!Op} below.
    *)
    type x
    
    (** The expression that matches the empty symbol sequence. *)
    val nil: x
    
    (** The type of a rule for recognizing a sequence of symbols according to
        the regular grammar of an automaton and producing an output token.
    *)
    type 'a r
    
    (** A parser that works on the symbols used in the automaton. *)
    type 'a t = (S.t, 'a) Cf_parser.t
    
    (** Open this module to bring the composition operators into the current
        scope.
    *)
    module Op: sig
        
        (** Use [a $| b] to compose an expression that matches either [a] or
            [b] in the symbol stream (alternation).
        *)
        val ( $| ): x -> x -> x
        
        (** Use [a $& b] to compose an expression that matches [a] followed by
            [b] in the symbol stream (concatenation).
        *)
        val ( $& ): x -> x -> x
        
        (** Use [!*a] to compose an expression that matches zero or more
            occurrences of [a] in the symbol stream (Kleene star).
        *)
        val ( !* ): x -> x
        
        (** Use [!+a] to compose an expression that matches one or more
            occurrences of [a] in the symbol stream.
        *)
        val ( !+ ): x -> x
        
        (** Use [!?a] to compose an expression that matches zero or one
            occurrence of [a] in the symbol stream.
        *)
        val ( !? ): x -> x
        
        (** Use [!:sym] to compose an expression that matches the symbol [sym]
            in the symbol stream.
        *)
        val ( !: ): S.t -> x
        
        (** Use [!^f] to compose an expression that matches any symbol in the
            symbol stream for which applying the function [f] returns [true].
        *)
        val ( !^ ): (S.t -> bool) -> x
        
        (** Use [!~z] to compose an expression that matches the sequence of
            symbols [z] in the symbol stream.
        *)
        val ( !~ ): S.t Cf_seq.t -> x
        
        (** Use [e $= x] to compose a rule that produces [x] when the symbols
            in the symbol stream match the expression [e].
        *)
        val ( $= ): x -> 'a -> 'a r
        
        (** Use [e $> f] to compose a rule that applies the tokenizer function
            [f] to the sequence of input symbols in the stream recognized by
            the expression [e] to produce an output token.
        *)
        val ( $> ): x -> (S.t Cf_seq.t -> 'a) -> 'a r
        
        (** Use [e $@ f] to compose a rule that applies the scanning function
            [f] to the input stream when it is recognized by the expression
            [e].  The scanning function is passed the length of the recognized
            sequence of symbols and returns a parser that produces the output
            of the rule and makes any advanced manipulations of the input
            stream necessary to continue parsing for the next token.
            If the parser returned from the scanning function does not
            recognize the input stream, then the rule is not matched and the
            next best matching rule is selected.
        *)
        val ( $@ ): x -> (int -> 'a t) -> 'a r
        
        (** Use this operator to combine a list of rules into a single rule. *)
        val ( !@ ): 'a r list -> 'a r
    end
    
    (** Use [create r] to construct a parser that recognizes the longest
        sequence that matches the rule [r].
    *)
    val create: 'a r -> 'a t
end

(** The functor that creates a DFA module specialized to the symbol and map
    types provided by [S].
*)
module Create(S: Symbol): T with module S = S

(*--- $File$ ---*)