ocaml-lib / suffix_tree.ml

(* type of nodes in suffix trees *)
type node = {
    seqid : int; (* sequence index in which the positions start and final are defined *)
    mutable start : int;     (* start and final position of the word labelling the node *)
    final : int ref;
    mutable link : node; (* suffix link *)
    v : node_value
  }
and node_value =
  | Children of (char,node) Hashtbl.t (* for non-leaves: children nodes *)
      (* for the key '\000', all values are relevant (use Hashtbl.find_all) *)
  | Index of int (* for leaves: position of recognized suffix *)

(* type of suffix trees *)
type t = string array * node

(* the initial root node *)
let empty : unit -> node =
  fun () ->
    let rec root = {seqid= -1; start=0; final=ref (-1); link=root; v=Children (Hashtbl.create 2)} in
    root


(* --------------------------------------------------------------------------------
   Operations on substrings of sequences
   -------------------------------------------------------------------------------- *)

type subseq = string * int * int  (* (seq, pos, len) *)

let subseq_empty = ("",0,0)  (* non-significant subseq *)

let subseq_is_empty (s,pos,len) = len = 0

let subseq_get (s,pos,len) i = s.[pos+i]

let subseq_length (s,pos,len) = len

let subseq_sub (s,pos,len) pos' len' = (s,pos+pos',len')

let subseq_extend (s,pos,len) = (s,pos,len+1)

(* -------------------------------------------------------------------------------
   Operations on implicit nodes (explicit, implicit, child : node * subseq * node)
   the snd node [child] is significant only when [implicit] is not the empty string,
   and is the child that recognizes [implicit] starting from [explicit]. [implicit] is
   defined by a sequence, a start and a length.
   ------------------------------------------------------------------------------- *)

let eq_char c1 c2 =
  c1<>'\000' & c1=c2  (* ensures that 2 terminal symbols '\000' are pairwise different (for GST only, not necessary for ST) *)

(* returns the child node that recognizes [implicit] from the node [explicit] *)
let get_child seqar (explicit,implicit) =
  if subseq_is_empty implicit
  then explicit
  else
    let c = subseq_get implicit 0 in
    if c = '\000'
    then raise Not_found
    else
      match explicit.v with
      | Children h -> Hashtbl.find h c
      | Index _ -> raise Not_found
    (* List.find (fun child -> eq_char seqar.(child.seqid).[child.start] c) explicit.children *)

(* ensures that implicit does not span over another node below [explicit] *)
let rec canonical seqar (explicit,implicit,child) =
  if subseq_is_empty implicit
  then (explicit,implicit,child)
  else
    let l = !(child.final) - child.start + 1 in
    let a = subseq_length implicit in
    if a < l
    then (explicit,implicit,child)
    else
      let implicit' = subseq_sub implicit l (a-l) in
      canonical seqar (child, implicit', get_child seqar (child,implicit'))

(* test whether an implicit node is the root node *)
let is_root root (explicit,implicit,_) =
  explicit == root & subseq_is_empty implicit

(* test whether the extension of an implicit node by [seqar.(k).[i]] is still recognized in the GST,
   and if yes, returns the implicit node extended by 1 position, otherwise returns [None]. *)
let has_child seqar (explicit,implicit,child) (k,i) =
  let a = subseq_length implicit in
  if a <> 0 then
    if eq_char seqar.(child.seqid).[child.start+a] seqar.(k).[i]
    then Some (explicit, subseq_extend implicit, child)
    else None
  else
    try
      let implicit' = (seqar.(k),i,1) in
      Some (explicit, implicit', get_child seqar (explicit,implicit'))
    with Not_found -> None

(* --------------------------------
   creation of new nodes and leaves
   -------------------------------- *)

let add_leaf (seqar,root) node seqid start final_ref index =
  match node.v with
  | Children h ->
      Hashtbl.add h
	seqar.(seqid).[start]
	{seqid=seqid; start=start; final=final_ref; link=root; v=(Index index)}
  | Index _ -> raise (Invalid_argument "Suffix_tree.add_leaf: 2nd argument must not be a leaf")

(* make explicit an implicit node by inserting a new node between [explicit] and [child] *)
let insert_node (seqar,root) (explicit,implicit,child) =
  let a = subseq_length implicit in
  if a = 0
  then explicit
  else
    match explicit.v with
    | Children h ->
	let c_child_old = seqar.(child.seqid).[child.start] in
	let c_child_new = seqar.(child.seqid).[child.start+a] in
	let n' = {
	  seqid = child.seqid;
	  start = child.start;
	  final = ref (child.start+a-1);
	  link = root;
	  v = Children (let h' = Hashtbl.create (Hashtbl.length h) in Hashtbl.add h' c_child_new child; h')
	} in
	child.start <- child.start+a;
	Hashtbl.replace h c_child_old n';
	n'
    | Index _ -> raise (Invalid_argument "Suffix_tree.insert_node: first part of 2nd argument must not be a leaf")

(* add a suffix link from [pred_opt] (if defined) to [explicit] *)
let add_link root pred_opt explicit =
  (*if explicit != root then*) (* create a new suffix link *)
    match pred_opt with
    | Some n -> (*if n.link = None then*) n.link <- explicit
    | None -> ()

(* ------------ 
   suffix links
   ------------ *)

(* get the node refered by the suffix link at [n] *)
(*
let suffix_link (root : node) (n : node) : node =
  match n.link with
  | None -> root  (* by default, the suffix link points to the root node *)
  | Some n' -> n'
*)

(* extend suffix_link for implicit nodes *)
let link (seqar,root) = function  (* TODO *)
  | (explicit,implicit,_) when subseq_is_empty implicit ->
      let explicit' = explicit.link (*suffix_link root explicit*) in
      (explicit', subseq_empty, explicit')
  | (explicit,implicit,_) ->
      if explicit == root
      then
	let implicit' = subseq_sub implicit 1 (subseq_length implicit - 1) in
	canonical seqar (root, implicit', get_child seqar (root,implicit'))
      else
	let explicit' = explicit.link (*suffix_link root explicit*) in
	canonical seqar (explicit', implicit, get_child seqar (explicit',implicit))

(* --------------------------------------------------------------
   GST update for the new character c at position i in sequence k
   -------------------------------------------------------------- *)

(* state for 'update' *)
type res = {
    terminal : int ref;
    mutable startj : int;
    mutable startnode : node * subseq * node
  }

let rec update (seqar,root) (k,i) res pred_opt =
  (* c = seqar.(k).[i] *)
  match has_child seqar res.startnode (k,i) with
  | Some extended_startnode -> (* startnode can be extended by [c] *)
      let explicit, implicit, _ = res.startnode in
      assert (pred_opt = None or subseq_is_empty implicit);
        (* if a link has been followed after node creation, then we are on an explicit node *)
      add_link root pred_opt explicit;
      res.startnode <- canonical seqar extended_startnode
  | None -> (* startnode cannot be extended by [c] ... *)
      let n' = insert_node (seqar,root) res.startnode in (* ... so we insert a new node ... *)
      add_link root pred_opt n';  (* ... a suffix link from the last created node (if defined) ... *)
      if seqar.(k).[res.startj] <> '\000' then
	add_leaf (seqar,root) n' k i res.terminal res.startj;  (* ... and a new leaf for the suffix at position [res.startj] *)
      res.startj <- res.startj + 1; (* prepare for the next suffix *)
      if not (is_root root res.startnode)
      then begin (* while [res.startnode] is not the root, and cannot be extended by [c] ... *)
	res.startnode <- link (seqar,root) res.startnode; (* ... follow the suffix link to find the next suffix ... *)
	update (seqar,root) (k,i) res (Some n') end  (* ... and loop on [update] *)

(* -------------------------------
   implementing the .mli interface
   ------------------------------- *)

let make : string list -> t =
  fun l_seq ->
    let l = List.length l_seq in
    let seqar = Array.make l "" in
    let root = empty () in
    let st = (seqar, root) in
    ignore (List.fold_left
      (fun k seq -> (* for every sequence/string [seq], numbered [k] ... *)
	seqar.(k) <- seq ^ String.make 1 '\000'; (* add a terminal symbol ... *)
	let res = {terminal=ref (-1); startj=0; startnode=(root,subseq_empty,root)} in (* initialize for [update] ... *)
	for i = 0 to String.length seqar.(k) - 1 do (* for every position [i] in the sequence ... *)
	  incr res.terminal; (* increment the leaves final position ... *)
	  update st (k,i) res None (* call [update] for updating the suffix tree with the character at position [i] *)
	done;
	k+1)
      0 l_seq);
    st

let string (seqar,root : t) (k : int) =
  let seq = seqar.(k) in
  String.sub seq 0 (String.length seq - 1) (* removing the terminal symbol *)

let string_list (seqar,root : t) =
  List.map (fun seq -> String.sub seq 0 (String.length seq - 1)) (Array.to_list seqar)

let root (seq,root : t) = root

let word (seqar,root) node =
  if node == root
  then ""
  else String.sub seqar.(node.seqid) node.start (!(node.final) - node.start + (match node.v with Children _ -> 1 | Index _ -> 0))

let children (gst : t) node =
  match node.v with
  | Children h ->
      Hashtbl.fold (fun c n l -> if c='\000' then Hashtbl.find_all h c @ l else n::l) h []
  | Index _ -> []

let index (seq,root) node : int * int =
  match node.v with
  | Children _ -> raise (Invalid_argument "Suffix_tree.index: 2nd argument must be a leaf")
  | Index i -> (node.seqid, i)

let linked_node (seqar,root : t) (n : node) : node =
  n.link (*suffix_link root n*)

let rec implicit_node (seqar,node : t) (word : string) =
  let (explicit, (s,i,len), child) = implicit_node_aux (seqar,node) (word,0,String.length word) in
  (explicit, String.sub s i len, child)
and implicit_node_aux (seqar,node) implicit =
  let w = subseq_length implicit in
  let child = get_child seqar (node,implicit) in
  let l = !(child.final) - child.start + 1 in
  let a = ref 1 in
  while !a < l & !a < w & eq_char seqar.(child.seqid).[child.start + !a] (subseq_get implicit !a) do
    incr a
  done; (* [!a] is the first mismatch position, or the length of [child] label *)
  if !a < w then
    if !a < l
    then raise Not_found
    else implicit_node_aux (seqar,child) (subseq_sub implicit !a (w - !a))
  else (node,implicit,child) 

(*
let rec synthesized (seqar,root : t) (f : 'a list -> node -> 'a) =
  synthesized_node (seqar,root) f root
and synthesized_node st f node =
  f (List.map (synthesized_node st f) (children st node)) node
*)

(* general fold *)
let rec fold : t -> ('h -> node -> bool) -> ('h -> node -> 'h) -> ('s list -> 'h -> node -> 's) -> 'h -> 's =
  fun gst f h s init ->
    fold_node gst f h s init (root gst)
and fold_node gst f h s h_node node =
  s
    (List.map
       (fun child -> fold_node gst f h s (h h_node child) child)
       (List.filter (f h_node) (children gst node)))
    h_node
    node

(* synthesized attributes only *)
let fold_s_node gst s node = fold_node gst (fun _ _ -> true) (fun _ _ -> ()) (fun l _ n -> s l n) () node
let fold_s gst s = fold_s_node gst s (root gst)

(* filtering and synthesizing, no inheritance *)
let fold_fs gst f s = fold gst (fun _ n -> f n) (fun _ _ -> ()) (fun l _ n -> s l n) ()


type tree = Node of string * tree list | Leaf of string * (int * int)

let readable gst =
  fold_s gst
    (fun l n ->
      let w = word gst n in
      if l=[]
      then Leaf (w, index gst n)
      else Node (w, l))

(* applications of suffix trees *)

let exact_matches : t -> string -> (int * int) list =
  fun gst word ->
    try
      let explicit, implicit, child = implicit_node gst word in
    fold_s_node gst
	(fun l n -> if l=[] then [index gst n] else List.concat l)
	child
    with Not_found -> []
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.