Source

orakuda / pa / pa_regexp.ml

The default branch has multiple heads

Full commit
open Camlp4                                             (* -*- camlp4of -*- *)

module Id : Sig.Id = struct
  let name = "pa_regexp"
  let version = "1.0"
end

module Make (Syntax : Sig.Camlp4Syntax) = struct
  open Sig
  include Syntax

  open Ast
  open Printf

  open Lexrex;;

  module Pa_format = Pa_format.Make(Syntax) 

  (* methods for group access *) 
  let group_methods _loc num_of_groups named_groups =
    let num_methods n =
      let num n =
	CrMth(_loc, sprintf "_%d" n, OvNil, PrNil, 
	     <:expr<self#_unsafe_group $ExInt(_loc, string_of_int n)$>>,
	     TyNil _loc)
      in
      let num_opt n =
	CrMth(_loc, sprintf "_%dopt" n, OvNil, PrNil, 
	     <:expr<self#_unsafe_group_opt $ExInt(_loc, string_of_int n)$>>,
	     TyNil _loc)
      in
      let rec f n =
	if n < 0 then []
	else num n :: num_opt n :: f (n-1)
      in
      f n
    in
    let named_methods named_groups =
      let named name pos =
	let mname =
	  match name.[0] with
	  | 'a'..'z' -> name
	  | _ -> "_" ^ name
	in
	CrMth(_loc, mname, OvNil, PrNil,
             (* <:expr<self#_named_group $ExStr(_loc, name)$>>, *)
	     <:expr<self#_unsafe_group $ExInt(_loc, string_of_int pos)$>>,
             TyNil _loc)
      in
      List.map (fun (name,pos) -> named name pos) named_groups
    in
    let rec f = function
      | [] -> CrNil _loc
      | m::ms -> 
	  <:class_str_item<
	    $m$ $f ms$
	  >>
    in
    f (num_methods num_of_groups @ named_methods named_groups)
  ;;

  (* syntax encoding for named_groups *)
  let rec named_groups _loc = function
    | [] -> <:expr<[]>>
    | (n,pos)::gs ->
        <:expr<
          ($ExStr(_loc, n)$, $ExInt(_loc, string_of_int pos)$)
          :: $named_groups _loc gs$
        >>
  ;;

  let in_implem = ref false
  let set_in_implem () = in_implem := true

  let build_group_binder _loc typ = 
    let group_methods = 
      group_methods _loc typ.num_of_groups typ.named_groups 
    in
    let named_groups = named_groups _loc typ.named_groups in
    <:expr<
      fun ~left ~right ~last groups ->
        object (self)
         $group_methods$
         inherit Orakuda.Regexp.Internal_use_only.group $named_groups$ groups ~left ~right ~last
       end
    >>

  module Top = Pa_top.Register(Syntax)

  let build_rex _loc s typ flags =
    let group_binder = build_group_binder _loc typ in
    let str = ExStr (_loc, s) in
    let rex = 
      <:expr<
	(* This enforces linking with Orakuda library *)
        Orakuda.Regexp.Internal_use_only.create $str$ ~flags:$flags$ $group_binder$
      >>
    in
    Top.may_put_in_top rex
  ;;

  (* CR jfuruse; BUG: This is wrong.
     It cannot parse correctly \\/
  *)
  let rec find_non_escaped_slash from s =
    let pos = String.rindex_from s from '/' in
    if pos = 0 || s.[pos-1] <> '\\' then pos
    else find_non_escaped_slash (pos - 1) s
  ;;

  let split_by_non_escaped_slash s =
    let from = String.length s - 1 in
    let rec split st from =
      if from < 0 then "" :: st
      else
        try
          let pos = find_non_escaped_slash from s in
          split (String.sub s (pos + 1) (from - pos) :: st) (pos-1)
        with
        | Not_found -> 
            String.sub s 0 (from+1) :: st
    in
    split [] from
  ;;

  let parse_rex_quotation _loc q =
    let loc = Loc.join (Loc.move `start q.q_shift _loc) in
    let rex, tokens, flags = Lexrex.from_string q.q_contents in
    let parse_flag = function
      | 'i' -> <:expr<`CASELESS>>
      | 'm' -> <:expr<`MULTLINE>>
      | 's' -> <:expr<`DOTALL>>
      | 'x' -> <:expr<`EXTENDED>>
      | 'U' -> <:expr<`LAZY>>
      | '8' -> <:expr<`UTF8>>
      | c -> raise (Stream.Error 
		       (Printf.sprintf "unknown pcre match flag %C" c))
    in
    let flags = 
      let len = String.length flags in
      let rec iter acc pos =
	if pos = len then acc
	else iter (parse_flag flags.[pos] :: acc) (pos + 1)
      in
      List.fold_left (fun acc sw ->
	<:expr<$sw$ :: $acc$>>) <:expr<[]>> (iter [] 0)
    in
    let typ = Lexrex.type_regexp tokens in
    build_rex loc rex typ flags
  ;;

  let parse_rex_replace_quotation _loc q =
    let loc = Loc.join (Loc.move `start q.q_shift _loc) in
    let rex, tokens, replace_ = Lexrex.replace_from_string q.q_contents in
    let replace, _, flags = Pa_format.parse ['/'] _loc None replace_ in

    let replace_global = ref false in
    let parse_flag = function
      | 'i' -> Some <:expr<`CASELESS>>
      | 'm' -> Some <:expr<`MULTLINE>>
      | 's' -> Some <:expr<`DOTALL>>
      | 'x' -> Some <:expr<`EXTENDED>>
      | 'U' -> Some <:expr<`LAZY>>
      | '8' -> Some <:expr<`UTF8>>
      | 'g' -> replace_global := true; None
      | c -> raise (Stream.Error 
		       (Printf.sprintf "unknown pcre replace flag %C" c))
    in
    let flags =
      let len = String.length flags in
      let rec iter acc pos =
	if pos = len then acc
	else 
	  let acc = 
	    match parse_flag flags.[pos] with
	    | None -> acc
	    | Some f -> f :: acc
	  in
	  iter acc (pos + 1)
      in
      List.fold_left (fun acc sw ->
	<:expr<$sw$ :: $acc$>>) <:expr<[]>> (iter [] 0)
    in
    let typ = Lexrex.type_regexp tokens in
    let rex = build_rex loc rex typ flags in
    (* CR jfuruse: we can unify these to substitute_substrings(_first) *)
    match replace with
    | `Const replace ->
        if !replace_global then
          <:expr<
	    Orakuda.Regexp.replace $rex$ ~templ:$replace$ 
          >>
        else
          <:expr<
	    Orakuda.Regexp.replace_first $rex$ ~templ:$replace$ 
          >>
    | `Fun ([], f) -> 
        let e = f <:expr<Printf.sprintf>> in
        if !replace_global then 
          <:expr<
            Orakuda.Regexp.substitute_substrings 
              (fun __rex_group -> $e$)
              $rex$
          >>
        else
          <:expr<
            Orakuda.Regexp.substitute_substrings_first
              (fun __rex_group -> $e$)
              $rex$
          >>
    | `Fun (_abss, _) -> 
        raise (Stream.Error 
		  (Printf.sprintf "non closed template %S" replace_))
  ;;

(* manual call of quotation expander, since <:rex<>> has special
   parsing rules

  let _ =
    Syntax.Quotation.add "m"
      Syntax.Quotation.DynAst.expr_tag
      ...;

   Syntax.Quotation.add "m"
      Syntax.Quotation.DynAst.str_item_tag
      (fun _loc _loc_var_opt s ->
	let e = ... s in
	<:str_item< $exp:e$ >>)
    ;;
*)

  EXTEND Gram
    GLOBAL: expr str_item
  ;

  (* manual call of quotation expander, since <:rex<>> has special
     parsing rules
  *)

  expr: BEFORE "simple"
    [
      [ `QUOTATION ({ q_name = "m"; _ } as q) ->
	  parse_rex_quotation _loc q
      | `QUOTATION ({ q_name = "m"; _ } as q); 
	(rex_bind, (cases, default)) = rex_bind ->
	  let rex = parse_rex_quotation _loc q in
	  let cases = <:expr<(Orakuda.Regexp.build_case $rex$ $rex_bind$::$cases$)>> in
	  <:expr<
	    Orakuda.Regexp.build_cases ?default:$default$ $cases$
	  >>
      | `QUOTATION ({ q_name = "s"; _ } as q) ->
	    parse_rex_replace_quotation _loc q
      ]
    ];

  str_item: 
    [
      [ `QUOTATION ({ q_name = "m"; _ } as q) ->
	  <:str_item<$exp:parse_rex_quotation _loc q$>>
      | `QUOTATION ({ q_name = "m"; _ } as q); 
	(rex_bind, (cases, default)) = rex_bind ->
	  let rex = parse_rex_quotation _loc q in
	  let cases = <:expr<(Orakuda.Regexp.build_case $rex$ $rex_bind$::$cases$)>> in
	  <:str_item<
	    Orakuda.Regexp.build_cases ?default:$default$ $cases$
	  >>
      | `QUOTATION ({ q_name = "s"; _ } as q) ->
	  <:str_item<
	    $exp:parse_rex_replace_quotation _loc q$
	  >>
      ]
    ];

  rex_bind:
    [
      [ "as"; id = patt; "->"; e = expr; rex_cases = rex_cases ->
	  (<:expr<fun $id$ -> $e$>>, rex_cases)
      | "->"; e = expr; rex_cases = rex_cases ->
	  (<:expr<fun _ -> $e$>>, rex_cases)
      ]
    ];

  rex_cases:
    [
      [ -> 
	  <:expr<[]>>, <:expr<None>>
      | "|"; "_"; "->"; e = expr -> 
	  <:expr<[]>>, <:expr<Some (fun () -> $e$)>>
      | "|"; `QUOTATION ({ q_name = "m"; _ } as q); 
	  (rex_bind, (cases, default)) = rex_bind ->
	    let rex = parse_rex_quotation _loc q in
	    <:expr<(Orakuda.Regexp.build_case $rex$ $rex_bind$ :: $cases$)>>, 
	    default
      ]
    ];

  END

end


let module M = Register.OCamlSyntaxExtension(Id)(Make) in ()