Source

oni / xml / xml_parser.ml

Full commit
(*---------------------------------------------------------------------------*
  $Change$
  Copyright (c) 2003-2010, James H. Woodyatt
  All rights reserved.
  
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
  
    Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
    
    Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in
    the documentation and/or other materials provided with the
    distribution
  
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
  OF THE POSSIBILITY OF SUCH DAMAGE. 
 *---------------------------------------------------------------------------*)

type 'a t = (Xml_event.t, 'a) Cf_llscan.t

open Cf_llscan.Op
open Cf_lex.Op

let character_data =
    Cf_llscan.tok begin function
        | Xml_event.T_character_data data, _ -> Some data
        | _ -> None
    end

let element_start =
    Cf_llscan.tok begin function
        | Xml_event.T_element_start (tag, attrs), _ -> Some (tag, attrs)
        | _ -> None
    end

let element_end =
    Cf_llscan.tok begin function
        | Xml_event.T_element_end tag, _ -> Some tag
        | _ -> None
    end

let processing_instruction =
    Cf_llscan.tok begin function
        | Xml_event.T_processing_instruction pi, _ -> Some pi
        | _ -> None
    end

let cdata_start =
    let sat =
        Cf_llscan.sat begin function
            | Xml_event.T_cdata_start, _ -> true
            | _ -> false
        end
    in
    sat >>= fun _ -> Cf_llscan.ret ()

let cdata_end =
    let sat =
        Cf_llscan.sat begin function
            | Xml_event.T_cdata_end, _ -> true
            | _ -> false
        end
    in
    sat >>= fun _ -> Cf_llscan.ret ()

let comment =
    Cf_llscan.tok begin function
        | Xml_event.T_comment text, _ -> Some text
        | _ -> None
    end

let default_text =
    Cf_llscan.tok begin function
        | Xml_event.T_default_text text, _ -> Some text
        | _ -> None
    end

let xml_decl =
    Cf_llscan.tok begin function
        | Xml_event.T_xml_decl decl, _ -> Some decl
        | _ -> None
    end

let end_of_document =
    let sat =
        Cf_llscan.sat begin function
            | Xml_event.T_end_of_document, _ -> true
            | _ -> false
        end
    in
    sat >>= fun _ -> Cf_llscan.ret ()

let accumulated_character_data =
    let rec inner_loop q s =
        match character_data s with
        | None ->
            if Cf_deque.empty q then None else Some (q, s)
        | Some (data, s) ->
            inner_loop (Cf_deque.A.push (data, 0, String.length data) q) s
    in
    let cdata_section () =
        cdata_start >>= fun _ ->
        inner_loop Cf_deque.nil >>= fun q ->
        cdata_end >>= fun _ ->
        Cf_llscan.ret q
    in
    let outer_pass =
        Cf_llscan.alt [
            inner_loop Cf_deque.nil;
            cdata_section ();
        ]
    in
    let rec outer_loop q s =
        match outer_pass s with
        | None ->
            Some (q, s)
        | Some (q', s) ->
            outer_loop (Cf_deque.catenate q q') s
    in
    outer_loop Cf_deque.nil >>= fun q ->
    let msg = Cf_seq.reverse (Cf_deque.B.to_seq q) in
    Cf_llscan.ret msg

let optional_whitespace =
    let rec get seq =
        match character_data seq with
        | Some (s, seq) when loop s (String.length s) 0 -> get seq
        | Some (_, _) -> Some ((), seq)
        | _ ->
            match default_text seq with
            | Some (s, seq) when loop s (String.length s) 0 -> get seq
            | _ -> Some ((), seq)
    and loop s n i =
        if i >= n then
            true
        else
            match String.unsafe_get s i with
            | ' ' | '\t' | '\r' | '\n' -> loop s n (succ i)
            | _ -> false
    in
    fun seq -> get seq

exception Invalid of Xml_event.t

let invalid seq =
    match Lazy.force seq with
    | Cf_seq.Z -> None
    | Cf_seq.P (event, _) -> raise (Invalid event)

type space_handling = S_default | S_preserve

(*
val validated_element:
    tag:string -> attr:(Xml_event.position -> 'a -> string * string -> 'a) ->
    content:('a -> 'a t) -> ?space:space_handling -> 'a -> 'a t
*)

let validated_element ~tag ~attr ~content ?(space = S_default) =
    let rec attributes ~pos ~acc = function
        | [] -> acc
        | hd :: tl -> attributes ~pos ~acc:(attr pos acc hd) tl
    and e_start acc seq =
        match Lazy.force seq with
        | Cf_seq.P ((Xml_event.T_element_start (tag', attlist), pos), tl) ->
            if tag = tag' then
                Some (attributes ~pos ~acc attlist, tl)
            else
                None
        | _ ->
            None
    and e_end seq =
        match Lazy.force seq with
        | Cf_seq.P ((Xml_event.T_element_end tag', _ as event), tl) ->
            if tag = tag' then Some ((), tl) else raise (Invalid event)
        | _ ->
            None
    in
    let content acc =
        match space with
        | S_preserve ->
            content acc
        | S_default ->
            optional_whitespace >>= fun () ->
            content acc >>= fun x ->
            optional_whitespace >>= fun () ->
            Cf_llscan.ret x
    in
    fun acc ->
        e_start acc >>= fun acc ->
        content acc >>= fun acc ->
        e_end >>= fun () ->
        Cf_llscan.ret acc

let split_attribute_value =
    let is_white x = function
        | '\009' | '\010' | '\013' | '\032' -> x
        | _ -> not x
    in
    let x_white = !+(!^(is_white true)) in
    let x_token = !+(!^(is_white false)) in
    let lexer = Cf_lex.create !@[
        x_white $= None;
        x_token $> (fun x -> Some x);
    ] in
    fun s ->
        let rec loop acc sx =
            match lexer sx with
            | None -> Some (List.rev acc, sx)
            | Some (None, sx) -> loop acc sx
            | Some (Some tok, sx) -> loop (tok :: acc) sx
        in
        loop [] s

let standalone_document content =
    xml_decl >>= fun decl ->
    match decl with
    | {
        Xml_event.xml_version = Some "1.0";
        Xml_event.xml_standalone = Xml_expat.SA_yes;
      } ->
        begin
            optional_whitespace >>= fun () ->
            content >>= fun x ->
            optional_whitespace >>= fun () ->
            end_of_document >>= fun () ->
            Cf_llscan.ret x
        end
    | _ ->
        Cf_llscan.nil

(*--- $File$ ---*)