1. Sébastien Ferré
  2. ocaml-lib

Commits

Sébastien Ferré  committed 0c4d665

Initial revision

  • Participants
  • Parent commits bbf4380
  • Branches master

Comments (0)

Files changed (1)

File suffix_tree.ml

View file
  • Ignore whitespace
+let taille_alphabet = 4
+
+
+(* type of nodes in suffix trees *)
+type node = {
+    seqid : int; (* sequence index in which the positions start and final are defined *)
+    mutable start : int;     (* start and final position of the word labelling the node *)
+    final : int ref;
+    mutable link : node option; (* suffix link *)
+    v : node_value
+  }
+and node_value =
+  | Children of (char,node) Hashtbl.t (* for non-leaves: children nodes *)
+      (* for the key '\000', all values are relevant (use Hashtbl.find_all) *)
+  | Index of int (* for leaves: position of recognized suffix *)
+
+
+(* type of suffix trees *)
+type t = string array * node
+
+(* the initial root node *)
+let empty : unit -> node =
+  fun () -> {seqid= -1; start=0; final=ref (-1); link=None; v=Children (Hashtbl.create taille_alphabet)}
+
+
+(* --------------------------------------------------------------------------------
+   Operations on substrings of sequences
+   -------------------------------------------------------------------------------- *)
+
+type subseq = string * int * int  (* (seq, pos, len) *)
+
+let subseq_empty = ("",0,0)  (* non-significant subseq *)
+
+let subseq_is_empty (s,pos,len) = len = 0
+
+let subseq_get (s,pos,len) i = s.[pos+i]
+
+let subseq_length (s,pos,len) = len
+
+let subseq_sub (s,pos,len) pos' len' = (s,pos+pos',len')
+
+let subseq_extend (s,pos,len) = (s,pos,len+1)
+
+(* -------------------------------------------------------------------------------
+   Operations on implicit nodes (explicit, implicit, child : node * subseq * node)
+   the snd node [child] is significant only when [implicit] is not the empty string,
+   and is the child that recognizes [implicit] starting from [explicit]. [implicit] is
+   defined by a sequence, a start and a length.
+   ------------------------------------------------------------------------------- *)
+
+let eq_char c1 c2 =
+  c1<>'\000' & c1=c2  (* ensures that 2 terminal symbols '\000' are pairwise different (for GST only, not necessary for ST) *)
+
+(* returns the child node that recognizes [implicit] from the node [explicit] *)
+let get_child seqar (explicit,implicit) =
+  if subseq_is_empty implicit
+  then explicit
+  else
+    let c = subseq_get implicit 0 in
+    if c = '\000'
+    then raise Not_found
+    else
+      match explicit.v with
+      | Children h -> Hashtbl.find h c
+      | Index _ -> raise Not_found
+    (* List.find (fun child -> eq_char seqar.(child.seqid).[child.start] c) explicit.children *)
+
+(* ensures that implicit does not span over another node below [explicit] *)
+let rec canonical seqar (explicit,implicit,child) =
+  if subseq_is_empty implicit
+  then (explicit,implicit,child)
+  else
+    let l = !(child.final) - child.start + 1 in
+    let a = subseq_length implicit in
+    if a < l
+    then (explicit,implicit,child)
+    else
+      let implicit' = subseq_sub implicit l (a-l) in
+      canonical seqar (child, implicit', get_child seqar (child,implicit'))
+
+(* test whether an implicit node is the root node *)
+let is_root root (explicit,implicit,_) =
+  explicit == root & subseq_is_empty implicit
+
+(* test whether the extension of an implicit node by [seqar.(k).[i]] is still recognized in the GST,
+   and if yes, returns the implicit node extended by 1 position, otherwise returns [None]. *)
+let has_child seqar (explicit,implicit,child) (k,i) =
+  let a = subseq_length implicit in
+  if a <> 0 then
+    if eq_char seqar.(child.seqid).[child.start+a] seqar.(k).[i]
+    then Some (explicit, subseq_extend implicit, child)
+    else None
+  else
+    try
+      let implicit' = (seqar.(k),i,1) in
+      Some (explicit, implicit', get_child seqar (explicit,implicit'))
+    with Not_found -> None
+
+(* --------------------------------
+   creation of new nodes and leaves
+   -------------------------------- *)
+
+let add_leaf seqar node seqid start final_ref index =
+  match node.v with
+  | Children h ->
+      Hashtbl.add h
+	seqar.(seqid).[start]
+	{seqid=seqid; start=start; final=final_ref; link=None; v=(Index index)}
+  | Index _ -> raise (Invalid_argument "Suffix_tree.add_leaf: 2nd argument must not be a leaf")
+
+(* make explicit an implicit node by inserting a new node between [explicit] and [child] *)
+let insert_node seqar (explicit,implicit,child) =
+  let a = subseq_length implicit in
+  if a = 0
+  then explicit
+  else
+    match explicit.v with
+    | Children h ->
+	let c_child_old = seqar.(child.seqid).[child.start] in
+	let c_child_new = seqar.(child.seqid).[child.start+a] in
+	let n' = {
+	  seqid = child.seqid;
+	  start = child.start;
+	  final = ref (child.start+a-1);
+	  link = None;
+	  v = Children (let h' = Hashtbl.create (Hashtbl.length h) in Hashtbl.add h' c_child_new child; h')
+	} in
+	child.start <- child.start+a;
+	Hashtbl.replace h c_child_old n';
+	n'
+    | Index _ -> raise (Invalid_argument "Suffix_tree.insert_node: first part of 2nd argument must not be a leaf")
+
+(* add a suffix link from [pred_opt] (if defined) to [explicit] *)
+let add_link root pred_opt explicit =
+  if explicit != root then (* create a new suffix link *)
+    match pred_opt with
+    | Some n -> if n.link = None then n.link <- Some explicit
+    | None -> ()
+
+(* ------------ 
+   suffix links
+   ------------ *)
+
+(* get the node refered by the suffix link at [n] *)
+let suffix_link (root : node) (n : node) : node =
+  match n.link with
+  | None -> root  (* by default, the suffix link points to the root node *)
+  | Some n' -> n'
+
+(* extend suffix_link for implicit nodes *)
+let link (seqar,root) = function  (* TODO *)
+  | (explicit,implicit,_) when subseq_is_empty implicit ->
+      let explicit' = suffix_link root explicit in
+      (explicit', subseq_empty, explicit')
+  | (explicit,implicit,_) ->
+      if explicit == root
+      then
+	let implicit' = subseq_sub implicit 1 (subseq_length implicit - 1) in
+	canonical seqar (root, implicit', get_child seqar (root,implicit'))
+      else
+	let explicit' = suffix_link root explicit in
+	canonical seqar (explicit', implicit, get_child seqar (explicit',implicit))
+
+(* --------------------------------------------------------------
+   GST update for the new character c at position i in sequence k
+   -------------------------------------------------------------- *)
+
+(* state for 'update' *)
+type res = {
+    terminal : int ref;
+    mutable startj : int;
+    mutable startnode : node * subseq * node
+  }
+
+let rec update (seqar,root) (k,i) res pred_opt =
+  (* c = seqar.(k).[i] *)
+  match has_child seqar res.startnode (k,i) with
+  | Some extended_startnode -> (* startnode can be extended by [c] *)
+      let explicit, implicit, _ = res.startnode in
+      assert (pred_opt = None or subseq_is_empty implicit);
+        (* if a link has been followed after node creation, then we are on an explicit node *)
+      add_link root pred_opt explicit;
+      res.startnode <- canonical seqar extended_startnode
+  | None -> (* startnode cannot be extended by [c] ... *)
+      let n' = insert_node seqar res.startnode in (* ... so we insert a new node ... *)
+      add_link root pred_opt n';  (* ... a suffix link from the last created node (if defined) ... *)
+      if seqar.(k).[res.startj] <> '\000' then
+	add_leaf seqar n' k i res.terminal res.startj;  (* ... and a new leaf for the suffix at position [res.startj] *)
+      res.startj <- res.startj + 1; (* prepare for the next suffix *)
+      if not (is_root root res.startnode)
+      then begin (* while [res.startnode] is not the root, and cannot be extended by [c] ... *)
+	res.startnode <- link (seqar,root) res.startnode; (* ... follow the suffix link to find the next suffix ... *)
+	update (seqar,root) (k,i) res (Some n') end  (* ... and loop on [update] *)
+
+(* -------------------------------
+   implementing the .mli interface
+   ------------------------------- *)
+
+let make : string list -> t =
+  fun l_seq ->
+    let l = List.length l_seq in
+    let seqar = Array.make l "" in
+    let root = empty () in
+    let st = (seqar, root) in
+    ignore (List.fold_left
+      (fun k seq -> (* for every sequence/string [seq], numbered [k] ... *)
+	seqar.(k) <- seq ^ String.make 1 '\000'; (* add a terminal symbol ... *)
+	let res = {terminal=ref (-1); startj=0; startnode=(root,subseq_empty,root)} in (* initialize for [update] ... *)
+	for i = 0 to String.length seqar.(k) - 1 do (* for every position [i] in the sequence ... *)
+	  incr res.terminal; (* increment the leaves final position ... *)
+	  update st (k,i) res None (* call [update] for updating the suffix tree with the character at position [i] *)
+	done;
+	k+1)
+      0 l_seq);
+    st
+
+let string (seqar,root : t) (k : int) =
+  let seq = seqar.(k) in
+  String.sub seq 0 (String.length seq - 1) (* removing the terminal symbol *)
+
+let string_list (seqar,root : t) =
+  List.map (fun seq -> String.sub seq 0 (String.length seq - 1)) (Array.to_list seqar)
+
+let root (seq,root : t) = root
+
+let word (seqar,root) node =
+  if node == root
+  then ""
+  else String.sub seqar.(node.seqid) node.start (!(node.final) - node.start + (match node.v with Children _ -> 1 | Index _ -> 0))
+
+let children (gst : t) node =
+  match node.v with
+  | Children h ->
+      Hashtbl.fold (fun c n l -> if c='\000' then Hashtbl.find_all h c @ l else n::l) h []
+  | Index _ -> []
+
+let index (seq,root) node : int * int =
+  match node.v with
+  | Children _ -> raise (Invalid_argument "Suffix_tree.index: 2nd argument must be a leaf")
+  | Index i -> (node.seqid, i)
+
+let linked_node (seqar,root : t) (n : node) : node =
+  suffix_link root n
+
+let rec implicit_node (seqar,node : t) (word : string) =
+  let (explicit, (s,i,len), child) = implicit_node_aux (seqar,node) (word,0,String.length word) in
+  (explicit, String.sub s i len, child)
+and implicit_node_aux (seqar,node) implicit =
+  let w = subseq_length implicit in
+  let child = get_child seqar (node,implicit) in
+  let l = !(child.final) - child.start + 1 in
+  let a = ref 1 in
+  while !a < l & !a < w & eq_char seqar.(child.seqid).[child.start + !a] (subseq_get implicit !a) do
+    incr a
+  done; (* [!a] is the first mismatch position, or the length of [child] label *)
+  if !a < w then
+    if !a < l
+    then raise Not_found
+    else implicit_node_aux (seqar,child) (subseq_sub implicit !a (w - !a))
+  else (node,implicit,child) 
+
+(*
+let rec synthesized (seqar,root : t) (f : 'a list -> node -> 'a) =
+  synthesized_node (seqar,root) f root
+and synthesized_node st f node =
+  f (List.map (synthesized_node st f) (children st node)) node
+*)
+
+(* general fold *)
+let rec fold : t -> ('h -> node -> bool) -> ('h -> node -> 'h) -> ('s list -> 'h -> node -> 's) -> 'h -> 's =
+  fun gst f h s init ->
+    fold_node gst f h s init (root gst)
+and fold_node gst f h s h_node node =
+  s
+    (List.map
+       (fun child -> fold_node gst f h s (h h_node child) child)
+       (List.filter (f h_node) (children gst node)))
+    h_node
+    node
+
+(* synthesized attributes only *)
+let fold_s_node gst s node = fold_node gst (fun _ _ -> true) (fun _ _ -> ()) (fun l _ n -> s l n) () node
+let fold_s gst s = fold_s_node gst s (root gst)
+
+(* filtering and synthesizing, no inheritance *)
+let fold_fs gst f s = fold gst (fun _ n -> f n) (fun _ _ -> ()) (fun l _ n -> s l n) ()
+
+
+type tree = Node of string * tree list | Leaf of string * (int * int)
+
+let readable gst =
+  fold_s gst
+    (fun l n ->
+      let w = word gst n in
+      if l=[]
+      then Leaf (w, index gst n)
+      else Node (w, l))