ocaml-lib / text_index.ml

module Intset = Intset.Intmap

module type T =
  sig
    type oid = int
    type 'a index
    type oids_by_pos

    val create : int -> 'a index
    val add : oid -> 'a list -> 'a index -> unit
    val remove : oid -> 'a list -> 'a index -> unit

    val find : 'a list -> 'a index -> Intset.t

    val memory_size : f:('a -> int) -> 'a index -> int
  end
      
module Intmap = Intmap.M

module M =
  struct
    type oid = int
    type oids_by_pos = Intset.t Intmap.t
	  (* for each position, gives the set of matching objects *)
    type 'a index = ('a, oids_by_pos) Hashtbl.t

    let create n = Hashtbl.create n

    let add_pos oid pos x index =
      let obp =
	try Hashtbl.find index x
	with Not_found -> Intmap.empty in
      let oids =
	try Intmap.get pos obp
	with Not_found -> Intset.empty in
      let obp =
	Intmap.set pos (Intset.add oid oids) obp in
      Hashtbl.replace index x obp

    let add oid l index =
      let _ =
	List.fold_left
	  (fun pos x ->
	    add_pos oid pos x index;
	    pos+1)
	  0 l in
      ()

    let add_string oid s index =
      for pos = 0 to String.length s - 1 do
	  add_pos oid pos s.[pos] index
      done

    let remove oid l index =
      let _ =
	List.fold_left
	  (fun pos x ->
	    (try
	      let obp = Hashtbl.find index x in
	      let oids = Intmap.get pos obp in
	      let oids = Intset.remove oid oids in
	      let obp =
		if Intset.is_empty oids
		then Intmap.remove pos obp
		else Intmap.set pos oids obp in
	      if Intmap.is_empty obp
	      then Hashtbl.remove index x
	      else Hashtbl.replace index x obp;
	    with Not_found -> ());
	    pos+1)
	  0 l in
      ()

    let shift_left offset obp =
      if offset = 0
      then obp
      else
	Intmap.fold
	  (fun res pos oids ->
	    let pos' = pos-offset in
	    if pos' >= 0
	    then Intmap.set pos' oids res
	    else res)
	  Intmap.empty obp

    let find_elt offset x index = (* raise Not_found *)
      assert (offset >= 0);
      let obp = Hashtbl.find index x in
      let obp = shift_left offset obp in
      obp

    let find l index =
      match l with
      | [] -> invalid_arg "Text_index.find: empty list"
      | x::xs ->
	  try
	    let obp = find_elt 0 x index in
	    let _, obp =
	      List.fold_left
		(fun (pos,obp) x ->
		  let obp =
		    Intmap.map_inter
		      (fun pos oids1 oids2 ->
			let inter = Intset.inter oids1 oids2 in
			if Intset.is_empty inter
			then None
			else Some inter)
		      obp (find_elt pos x index) in
		  pos+1, obp)
		(1,obp) xs in
	    let oids =
	      Intmap.fold
		(fun res pos oids -> Intset.union res oids)
		Intset.empty obp in
	    oids
	  with Not_found -> Intset.empty
	      
    let memory_size ~f index =
      Hashtbl.fold
	(fun x obp res ->
	  res
	    + 1 (* hash array cell *) + 4 (* hashtable bucket *) + f x (* key *)
	    + Intmap.memory_size ~f:Intset.memory_size obp (* val *))
	index 0
      
  end

module Test =
  struct
    let idx : char M.index = M.create 101
    let str : string Intmap.t ref = ref Intmap.empty

    let print_obp obp =
      Intmap.iter
	(fun pos oids ->
	  print_string "    "; print_int pos; print_string ":";
	  Intset.iter (fun oid -> print_string " "; print_int oid) oids)
	obp;
      print_newline ()

    let print_index () =
      Hashtbl.iter
	(fun c obp ->
	  print_char c; print_newline ();
	  print_obp obp; print_newline ())
	idx

    let add_file filename =
      let ch_in = open_in filename in
      let oid = ref 0 in
      (try while true do
	if !oid mod 1000 = 0 then begin print_int !oid; print_newline () end;
	let line = input_line ch_in in
	M.add_string !oid line idx;
	str := Intmap.set !oid line !str;
	incr oid
      done with _ -> ());
      print_int !oid; print_endline " read lines";
      close_in ch_in
	
    let list_of_string s =
      let l = ref [] in
      for i = String.length s - 1 downto 0 do
	l := s.[i] :: !l
      done;
      !l

    let find_string s =
      let oids = M.find (list_of_string s) idx in
      Intset.iter (fun oid -> print_int oid; print_string ": "; print_endline (Intmap.get oid !str)) oids;
      print_int (Intset.cardinal oids); print_endline " matching lines"

    let search_string s =
      let re = Str.regexp_string s in
      Intmap.iter
	(fun oid line ->
	  try 
	    let _ = Str.search_forward re line 0 in
	    print_int oid; print_string ": "; print_endline line
	  with _ -> ())
	!str
  end
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.