1. Sébastien Ferré
  2. ocaml-lib

Commits

Sébastien Ferré  committed 907ad3e

Many changes.

  • Participants
  • Parent commits ade1687
  • Branches master

Comments (0)

Files changed (1)

File suffix_tree.ml

View file
  • Ignore whitespace
+(**
+   Suffix trees with incremental addition and removal of strings.
+   plus incremental maintenance of maximal factors.
+*)
+
 
 module type T =
   sig
     type strid = int
+	  (** Type of string ids. Functions using such ids are unspecified if the id is not valid. *)
+    type t
+	  (** Type of suffix trees. This is not a pure functional data-structure. *)
 
-    type t (** Type of suffix trees *)
+(** {1 Suffix trees as string sets. } *)
 
     val create : unit -> t
+	(** [create ()] returns a fresh and empty suffix tree. *)
+    val size : t -> int
+	(** [size st] returns the number of strings registered in the suffix tree [st]. *)
     val add : t -> string -> strid
+	(** [add st s] adds the string [s], and all its suffixes in the suffix tree [st], unless [s] has already been added.
+	   It also returns the string id as an handle on this string. *)
     val remove : t -> strid -> unit
+	(** [remove st id] removes the string identified by [id], and all its suffixes, from the suffix tree [st]. *)
     val get : t -> strid -> string
+	(** [get st id] returns the string associated to [id]. *)
     val find : t -> string -> strid
+	(** [find st s] returns the id associated to the string [s], if the strings exists in the suffix tree [st].
+	   Otherwise raise Not_found. *)
     val fold : (strid -> string -> 'a -> 'a) -> t -> 'a -> 'a
+	(** [fold f st e] is a classic folding on all strings in the suffix tree [st]. *)
+
+(** {1 Low-level interface on suffix trees. } *)
 
     type node
+	  (** Type of the nodes of suffix trees.
+	     Nodes are either leaves or internal nodes. *)
 
     val root : t -> node
+	(** [root st] returns the root node of the suffix tree [st]. *)
     val is_leaf : t -> node -> bool
+	(** [is_leaf st n] returns whether the node [n] is a leaf. *)
     val label : t -> node -> string
-    val children : t -> node -> node list
-    val suffix : t -> node -> strid * int (* raise Not_found *)
-    val link : t -> node -> node
+	(** [label st n] returns the string labelling the node [n]. *)
+    val path : t -> node -> string
+	(** [path st n] returns the full path from the root to the node [n]. *)
+    val ext : t -> node -> strid LSet.t
+	(** [ext st n] returns an ordered list of string ids that match the path of the node [n]. *)
+    val children : t -> node -> node LSet.t
+	(** [children st n] returns the list of children nodes of [n]. *)
+    val parent : t -> node -> node option
+	(** [parent st n] returns the parent node of [n], unless [n] is the root node. *)
+    val succ : t -> node -> node option
+	(** [succ st n] returns the successor node through the suffix link of [n], unless there is no suffix link. *)
+    val preds : t -> node -> node LSet.t
+	(** [preds st n] returns the list of all nodes having [n] as successor node. *)
+    val suffix : t -> node -> strid * int
+	(** [suffix st n] returns the suffix represented by the leaf node [n] as a couple [(string id, position in the string)].
+	   Raise Not_found if [n] is not a leaf. *)
+    val find_node : t -> string -> node
+	(** [find_node st s] returns the node whose path is equal to the string [s], if it exists.
+	   Raise Not_found otherwise. *)
     val fold_tree : t -> ('h -> node -> bool) -> ('h -> node -> 'h) -> ('s list -> 'h -> node -> 's) -> 'h -> 's
+	(** [fold_tree st filter herit synth h0] returns the result of an attribute evaluation on the suffix tree [st].
+	   - [filter] is used to filter which children of a node should be explored given the heritance value of the parent node,
+	   - [herit] defines the heritance value of a node, given the heritance value of its parent,
+	   - [synth] defines the synthesized value of a node given its heritance value, and the list of synthesized values of its filtered children,
+	   - [h0] is the heritance value given to the root.
+	 *)
+
+(** {1 Exploring the suffix tree through the substring relation. } *)
+		
+    val path_restrictions : t -> node -> node list
+	(** [path_restrictions st n] returns the list of nodes whose path is a direct restriction of the path of [n]. *)
+    val path_extensions : t -> node -> node list
+	(** [path_extensions st n] returns the list of nodes whose path is a direct extension of the path of [n]. *)
+    val is_maximal : t -> node -> bool
+	(** [is_maximal st n] returns whether a node is maximal.
+	   A node is maximal is each of its extensions has a strictly smaller extent, or the node represents a full string. *)
+    val max_restrictions : t -> node -> node list
+	(** [max_restrictions st n] returns the list of maximal nodes whose path is a restriction of the path of [n]. *)
+    val max_extensions : t -> node option -> node list
+	(** [max_extensions st n_opt] returns the list of maximal nodes whose path is a extension of the path of [n], when given.
+	   If a start node is not given, then the maximal nodes with shortest path are returned. *)
+    val string_extensions : t -> node option -> strid list
+	(** [string_extensions st n_opt] completes the result of [max_extensions st n_opt] with full strings through their ids. *)
+    val string_restrictions : t -> strid -> node list
+	(** [string_restrictions st strid] returns the list of maximal nodes having [strid] as a string extension. *)
+
+(** {1 Searching in a suffix tree} *)
 
-    type factor
-
-    val find_factor : t -> string -> factor (* raise Not_found *)
+    type factor = node * string * node
+	  (** [(parent,s,child)] locates a factor string on the edge from node [parent] to node [child], where [s] is a prefix of the label of [child].
+	     If [s] is the empty string, then [parent] and [child] are a same node.
+	     The path of a factor is the concatenation of [path st parent] and [s]. *)
+
+    val find_factor : t -> string -> factor
+	(** [find_factor st s] returns the factor locating [s] in the suffix tree [st].
+	   This means the path of the result factor is equal to [s].
+	   Raise [Not_found] if the string [s] does not appear in any string of [st]. *)
     val suffixes : t -> factor -> (strid * int) list
+	(** [suffixes st f] returns the list of all suffixes [(strid,pos)] that have the path of [f] as a prefix: this path occurs in string [strid] at position [pos]. *)
     val strings : t -> factor -> strid LSet.t
+	(** [strings st f] returns the ids of all string containing the path of [f]. *)
+
+(** {1 Simpler representation of a suffix tree (for debugging purpose at top-level)} *)
 
-    type tree = Node of string * tree list | Leaf of string * (strid * int)
+    type tree = Node of string * bool * int list * tree list | Leaf of string * (strid * int)
 
     val tree : t -> tree
 
   end
 
 
+(* --------------------------------------------------------------------------------
+   Operations on substrings of sequences
+   -------------------------------------------------------------------------------- *)
+
+    module Subseq =
+      struct
+	type t = string * int * int  (* (seq, pos, len) *)
+	  
+	let empty = ("",0,0)  (* non-significant subseq *)
+    
+	let is_empty (s,pos,len) = len = 0
+
+	let get (s,pos,len) i = s.[pos+i]
+
+	let length (s,pos,len) = len
+
+	let sub (s,pos,len) pos' len' = (s,pos+pos',len')
+
+	let extend (s,pos,len) = (s,pos,len+1)
+      end
+
+
+module Ext =
+  struct
+    type t = int * Cis.t
+
+    let cardinal (k,_) = k
+
+    let empty = (0,Cis.empty)
+
+    let mem i (k,cis) = Cis.mem i cis
+
+    let singleton i = (1,Cis.singleton i)
+
+    let add i (k,cis as ext) =
+      if Cis.mem i cis
+      then ext
+      else (k+1,Cis.add i cis)
+
+    let remove i (k,cis as ext) =
+      if Cis.mem i cis
+      then (k-1,Cis.remove i cis)
+      else ext
+
+    let union (_,cis1) (_,cis2) =
+      let cis = Cis.union cis1 cis2 in
+      (Cis.cardinal cis, cis)
+
+    let diff (_,cis1) (_,cis2) =
+      let cis = Cis.diff cis1 cis2 in
+      (Cis.cardinal cis, cis)
+
+    let elements (_,cis) = Cis.elements cis
+  end
+
+
 module Make : T =
   struct
     type strid = int
 	mutable seqid : strid; (* sequence index in which the positions start and final are defined *)
 	mutable start : int;     (* start and final position of the word labelling the node *)
 	mutable final : int ref;
-	mutable link : node; (* suffix link *)
+	mutable parent : node; (* prefix link, the root for the root itself *)
 	v : node_value
       }
     and node_value =
-      | Children of (char,node) Hashtbl.t (* for non-leaves: children nodes *)
-      | Index of int (* for leaves: position of recognized suffix *)
+      | I of node_internal (* for non-leaves, internal nodes *)
+      | L of int (* for leaves: position of recognized suffix *)
+    and node_internal = {
+	children : (char,node) Hashtbl.t;
+	mutable link : node; (* suffix link *)
+	mutable backlinks : node LSet.t;
+	mutable ext : Ext.t; (* set of strids under this node *)
+	mutable locals : Ext.t; (* subset of ext, strids only in leaves of the node (only on maximal nodes) *)
+	mutable maximal : bool; (* whether this node has a maximal path given its ext (a concept intent) *)
+      }
 
     type factor = node * string * node
 
       try Hashtbl.find st.ht strid
       with Not_found -> failwith ("Invalid string id: " ^ string_of_int strid)
 
+    let ext0 node =
+      match node.v with
+      | I x -> x.ext
+      | L _ -> Ext.singleton node.seqid
 
+    let locals0 node =
+      match node.v with
+      | I x -> x.locals
+      | L _ -> Ext.empty
 
-(* --------------------------------------------------------------------------------
-   Operations on substrings of sequences
-   -------------------------------------------------------------------------------- *)
-
-    module Subseq =
-      struct
-	type t = string * int * int  (* (seq, pos, len) *)
-	  
-	let empty = ("",0,0)  (* non-significant subseq *)
-    
-	let is_empty (s,pos,len) = len = 0
-
-	let get (s,pos,len) i = s.[pos+i]
-
-	let length (s,pos,len) = len
 
-	let sub (s,pos,len) pos' len' = (s,pos+pos',len')
-
-	let extend (s,pos,len) = (s,pos,len+1)
-      end
 
 (* -------------------------------------------------------------------------------
    Operations on implicit nodes (explicit, implicit, child : node * subseq * node)
 	then raise Not_found
 	else
 	  match explicit.v with
-	  | Children h -> Hashtbl.find h c
-	  | Index _ -> raise Not_found
+	  | I x -> Hashtbl.find x.children c
+	  | L _ -> raise Not_found
 
 (* ensures that implicit does not span over another node below [explicit] *)
     let rec canonical (explicit,implicit,child) =
 	else None
       else
 	match explicit.v with
-	| Children h -> Some (Hashtbl.find_all h '\000')
-	| Index _ -> None
+	| I x -> Some (Hashtbl.find_all x.children '\000')
+	| L _ -> None
 
 (* --------------------------------
    creation of new nodes and leaves
 
     let add_leaf st node seqid start final_ref index =
       match node.v with
-      | Children h ->
-	  Hashtbl.add h
-	    (get0 st seqid).[start]
-	    {seqid=seqid; start=start; final=final_ref; link=st.root; v=(Index index)}
-      | Index _ -> raise (Invalid_argument "Suffix_tree.add_leaf: 2nd argument must not be a leaf")
+      | I x ->
+	  let child = {seqid=seqid; start=start; final=final_ref; parent=node; v=(L index)} in
+	  Hashtbl.add x.children (get0 st seqid).[start] child
+      | L _ -> raise (Invalid_argument "Suffix_tree.add_leaf: 2nd argument must not be a leaf")
 
 (* make explicit an implicit node by inserting a new node between [explicit] and [child] *)
     let insert_node st (explicit,implicit,child) =
       then explicit
       else
 	match explicit.v with
-	| Children h ->
+	| I x ->
 	    let c_child_old = (get0 st child.seqid).[child.start] in
 	    let c_child_new = (get0 st child.seqid).[child.start+a] in
+	    let h' = Hashtbl.create (Hashtbl.length x.children) in Hashtbl.add h' c_child_new child;
 	    let n' = {
 	      seqid = child.seqid;
 	      start = child.start;
 	      final = ref (child.start+a-1);
-	      link = st.root;
-	      v = Children (let h' = Hashtbl.create (Hashtbl.length h) in Hashtbl.add h' c_child_new child; h')
+	      parent = explicit;
+	      v = I
+		{ children = h';
+		  link = st.root;
+		  backlinks = LSet.empty ();
+		  ext = ext0 child;
+		  locals = (match child.v with L _ -> Ext.singleton child.seqid | _ -> Ext.empty);
+		  maximal = false
+		};
 	    } in
 	    child.start <- child.start+a;
-	    Hashtbl.replace h c_child_old n';
+	    child.parent <- n';
+	    Hashtbl.replace x.children c_child_old n';
 	    n'
-	| Index _ -> raise (Invalid_argument "Suffix_tree.insert_node: first part of 2nd argument must not be a leaf")
+	| L _ -> raise (Invalid_argument "Suffix_tree.insert_node: first part of 2nd argument must not be a leaf")
+
+(* add some strid in the extent of all ancestor of a node (except the root) *)
+    let rec add_strid st new_maximal strid node =
+      if node != st.root then
+	match node.v with
+	| I x ->
+	    if not (Ext.mem strid x.ext) then begin
+	      x.ext <- Ext.add strid x.ext;
+	      add_strid st new_maximal strid node.parent end
+	    else
+	      LSet.remove node new_maximal
+	| _ -> assert false
+      else LSet.remove node new_maximal
 
 (* add a suffix link from [pred_opt] (if defined) to [explicit] *)
-    let add_link pred_opt explicit =
+    let add_link new_maximal pred_opt explicit =
       match pred_opt with
-      | Some n -> n.link <- explicit
-      | None -> ()
+      | Some n ->
+	  ( match n.v, explicit.v with
+	  | I x0, I x ->
+	      x0.link <- explicit;
+	      x.backlinks <- LSet.add n x.backlinks;
+	      LSet.remove explicit new_maximal
+	  | _ -> assert false)
+      | None -> new_maximal
 
 (* ------------ 
    suffix links
 (* extends suffix_link for implicit nodes *)
     let get_link st = function  (* TODO *)
       | (explicit,implicit,_) when Subseq.is_empty implicit ->
-	  let explicit' = explicit.link (*suffix_link root explicit*) in
+	  let explicit' = match explicit.v with I x -> x.link | _ -> assert false in  (*suffix_link root explicit*)
 	  (explicit', Subseq.empty, explicit')
       | (explicit,implicit,_) ->
 	  if explicit == st.root
 	    let implicit' = Subseq.sub implicit 1 (Subseq.length implicit - 1) in
 	    canonical (st.root, implicit', get_child (st.root,implicit'))
 	  else
-	    let explicit' = explicit.link (*suffix_link root explicit*) in
+	    let explicit' = match explicit.v with I x -> x.link | _ -> assert false in  (*suffix_link root explicit*)
 	    canonical (explicit', implicit, get_child (explicit',implicit))
 
 (* --------------------------------------------------------------
     type res = {
 	terminal : int ref;
 	mutable startj : int;
-	mutable startnode : node * Subseq.t * node
+	mutable startnode : node * Subseq.t * node;
+	mutable new_maximal : node LSet.t;
       }
 
-    let rec update st (k,i) res pred_opt =
-  (* c = seqar.(k).[i] *)
-      match has_child st res.startnode (k,i) with
+    let rec update st (strid,i) res pred_opt =
+      (* c = seqar.(strid).[i] *)
+      match has_child st res.startnode (strid,i) with
       | Some extended_startnode -> (* startnode can be extended by [c] *)
 	  let explicit, implicit, _ = res.startnode in
 	  assert (pred_opt = None or Subseq.is_empty implicit);
           (* if a link has been followed after node creation, then we are on an explicit node *)
-	  add_link pred_opt explicit;
+	  res.new_maximal <- add_link res.new_maximal pred_opt explicit;
 	  res.startnode <- canonical extended_startnode
       | None -> (* startnode cannot be extended by [c] ... *)
 	  let n' = insert_node st res.startnode in (* ... so we insert a new node ... *)
-	  add_link pred_opt n';  (* ... a suffix link from the last created node (if defined) ... *)
-	  if (get0 st k).[res.startj] <> '\000' then
-	    add_leaf st n' k i res.terminal res.startj;  (* ... and a new leaf for the suffix at position [res.startj] *)
+	  if (get0 st strid).[res.startj] <> '\000' then begin
+	    add_leaf st n' strid i res.terminal res.startj;  (* ... and a new leaf for the suffix at position [res.startj] *)
+	    res.new_maximal <- LSet.add n' res.new_maximal;
+	    res.new_maximal <- add_strid st res.new_maximal strid n';  (* updating the extent of ancestor nodes *)
+	    res.new_maximal <- add_link res.new_maximal pred_opt n';  (* ... a suffix link from the last created node (if defined) ... *)
+	  end;
 	  res.startj <- res.startj + 1; (* prepare for the next suffix *)
 	  if not (is_root st res.startnode)
 	  then begin (* while [res.startnode] is not the root, and cannot be extended by [c] ... *)
 	    res.startnode <- get_link st res.startnode; (* ... follow the suffix link to find the next suffix ... *)
-	    update st (k,i) res (Some n') end  (* ... and loop on [update] *)
+	    update st (strid,i) res (Some n') end  (* ... and loop on [update] *)
 
 (* -------------------------------
    implementing the signature T
    ------------------------------- *)
 
     let create () =
-      let rec root = {seqid= -1; start=0; final=ref (-1); link=root; v=Children (Hashtbl.create 2)} in
+      let rec root = {
+	seqid = -1;
+	start = 0;
+	final = ref (-1);
+	parent = root;
+	v=I
+	  { children = Hashtbl.create 2;
+	    link = root;
+	    backlinks = LSet.empty ();
+	    ext = Ext.empty;
+	    locals = Ext.empty;
+	    maximal = false
+	  }} in
       { cpt = 0;
 	ht = Hashtbl.create 100;
 	root = root
       }
 
+    let size st = st.cpt
 
+    let is_root st node = node == st.root
 
     let root st = st.root
 
     let is_leaf st node =
       match node.v with
-      | Index _ -> true
+      | L _ -> true
       | _ -> false
 
     let label st node =
       if node == st.root
       then ""
-      else String.sub (get0 st node.seqid) node.start (!(node.final) - node.start + (match node.v with Children _ -> 1 | Index _ -> 0))
+      else String.sub (get0 st node.seqid) node.start (!(node.final) - node.start + (match node.v with I _ -> 1 | L _ -> 0))
+
+    let ext st node =
+      match node.v with
+      | I x -> LSet.of_list (Ext.elements x.ext)
+      | L _ -> LSet.singleton node.seqid
+
+    let is_maximal st node =
+      match node.v with
+      | I x -> x.maximal
+      | L pos -> pos=0
 
     let children st node =
       match node.v with
-      | Children h ->
-	  Hashtbl.fold (fun c n l -> n::l) h []
-      | Index _ -> []
+      | I x ->
+	  Hashtbl.fold (fun c n l -> LSet.add n l) x.children (LSet.empty ())
+      | L _ -> []
+
+    let parent st node =
+      if is_root st node
+      then None
+      else Some node.parent
+
+    let succ st node =
+      if is_root st node
+      then None
+      else 
+	match node.v with
+	| I x -> Some x.link
+	| L _ -> None
+
+    let preds st node =
+      match node.v with
+      | I x -> x.backlinks
+      | L _ -> LSet.empty ()
 
     let suffix st node =
       match node.v with
-      | Children _ -> raise Not_found
-      | Index i -> (node.seqid, i)
+      | I _ -> raise Not_found
+      | L i -> (node.seqid, i)
 
-    let link st node =
-      node.link
 
 (* general fold *)
     let rec fold_tree : t -> ('h -> node -> bool) -> ('h -> node -> 'h) -> ('s list -> 'h -> node -> 's) -> 'h -> 's =
     let fold_fs_node gst f s node = fold_node gst (fun _ n -> f n) (fun _ _ -> ()) (fun l _ n -> s l n) () node
     let fold_fs_tree gst f s = fold_fs_node gst f s (root gst)
 
+    let rec path st node =
+      match parent st node with
+      | None -> ""
+	   | Some parent -> path st parent ^ label st node
+
+    let path_restrictions st node =
+      let lp = match parent st node with None -> LSet.empty () | Some p -> LSet.singleton p in (* the prefix restriction, if it exists *)
+      let ls = match succ st node with None -> LSet.empty () | Some s -> LSet.singleton s in (* the suffix restriction, if it exists *)
+      LSet.union lp ls
+
+    let path_extensions st node =
+      let lr = List.filter (fun n -> not (is_leaf st n)) (children st node) in (* right extensions *)
+      let lf = preds st node in (* left extensions *)
+      LSet.union lr lf
+
+    let rec max_restrictions st node =
+      max_restrictions_aux st (LSet.empty ()) (path_restrictions st node)
+    and max_restrictions_aux st acc = function
+      | [] -> acc
+      | n::ns ->
+	  if is_maximal st n
+	  then max_restrictions_aux st (LSet.add n acc) ns
+	  else max_restrictions_aux st acc (path_restrictions st n @ ns)
+
+    let rec max_extensions st node_opt =
+      let ns =
+	match node_opt with
+	| None -> [st.root]
+	| Some n -> path_extensions st n in
+      max_extensions_aux st (LSet.empty ()) ns
+    and max_extensions_aux st acc = function
+      | [] -> acc
+      | n::ns ->
+	  if is_maximal st n
+	  then max_extensions_aux st (LSet.add n acc) ns
+	  else max_extensions_aux st acc (path_extensions st n @ ns)
+
+    let string_restrictions st strid =
+      fold_fs_tree st  (* looking for the nodes having strid as a local *)
+	(fun n -> Ext.mem strid (ext0 n))
+	(fun l n ->
+	  List.fold_left
+	    LSet.union
+	    (if is_maximal st n && Ext.mem strid (locals0 n) then LSet.singleton n else LSet.empty ())
+	    l)
+
+    let string_extensions st node_opt =
+      Ext.elements (locals0 (match node_opt with None -> st.root | Some n -> n))
+
+
 
     let rec find_factor st str =
       let (explicit, (s,i,len), child) = find_factor_aux st st.root (str,0,String.length str) in
       (explicit, String.sub s i len, child)
     and find_factor_aux st node implicit =
       let w = Subseq.length implicit in
-      let child = get_child (node,implicit) in
-      let l = !(child.final) - child.start + 1 in
-      let a = ref 1 in
-      while !a < l & !a < w & eq_char (get0 st child.seqid).[child.start + !a] (Subseq.get implicit !a) do
-	incr a
-      done; (* [!a] is the first mismatch position, or the length of [child] label *)
+      if w = 0
+      then (node,implicit,node)
+      else
+	let child = get_child (node,implicit) in
+	let l = !(child.final) - child.start + 1 in
+	let a = ref 1 in
+	while !a < l & !a < w & eq_char (get0 st child.seqid).[child.start + !a] (Subseq.get implicit !a) do
+	  incr a
+	done; (* [!a] is the first mismatch position, or the length of [child] label *)
+	if ! a = l
+	then find_factor_aux st child (Subseq.sub implicit !a (w - !a))
+	else 
+	  if !a = w
+	  then (node,implicit,child)
+	  else raise Not_found
+(*      
       if !a < w then
 	if !a < l
 	then raise Not_found
 	else find_factor_aux st child (Subseq.sub implicit !a (w - !a))
       else (node,implicit,child) 
+*)
 
 
     let suffixes st (_,_,child) =
 	(fun l n -> if l=[] then [suffix st n] else List.concat l)
 	child
 
-    let strings st (_,_,child) =
+    let strings st (_,_,child) = ext st child
+(*
       fold_s_node st
 	(fun l n -> if l=[] then LSet.singleton (fst (suffix st n)) else LSet.union_r l)
 	child
-
+*)
 
     let get st strid =
       let str0 = get0 st strid in
 	    leafs
       | None -> raise Not_found
 
+    let find_node st str =
+      let (explicit,implicit,_) = find_factor st str in
+      if implicit = ""
+      then explicit
+      else raise Not_found
+
 (* add a string and returns its strid. If the string already exists, the GST is not modified,
    and the existing id is returned *)
     let add st str =
 	let strid = st.cpt <- st.cpt+1; st.cpt in
 	let str0 = str ^ String.make 1 '\000' in (* add a terminal symbol *)
 	Hashtbl.add st.ht strid str0;
-	let res = {terminal=ref (-1); startj=0; startnode=(st.root,Subseq.empty,st.root)} in (* initialize for [update] ... *)
+	( match st.root.v with I x -> x.ext <- Ext.add strid x.ext | _ -> assert false); (* update the extent of the root *)
+	let res = {terminal=ref (-1); startj=0; startnode=(st.root,Subseq.empty,st.root); new_maximal=LSet.singleton st.root} in (* initialize for [update] ... *)
 	for pos = 0 to String.length str0 - 1 do (* for every position [i] in the sequence ... *)
 	  incr res.terminal; (* increment the leaves final position ... *)
 	  update st (strid,pos) res None (* call [update] for updating the suffix tree with the character at position [pos] *)
 	done;
+	List.iter
+	  (fun n -> 
+	    match n.v with
+	    | I x ->
+		x.locals <- Ext.add strid x.locals;
+		x.maximal <- true
+	    | _ -> assert false)
+	  res.new_maximal; (* recording new maximal nodes *)
 	strid
 
     let rec remove st strid =
       Hashtbl.remove st.ht strid
     and remove_node on_root st strid node =
       match node.v with
-      | Index _ ->
+      | L _ ->
 	  if node.seqid = strid
 	  then None
-	  else Some node
-      | Children h ->
-	  let new_children =
-	    Hashtbl.fold
-	      (fun c n res -> match remove_node false st strid n with None -> res | Some n' -> (c,n')::res)
-	      h [] in
-	  match new_children with
-	  | [] ->
-	      if on_root
-	      then begin
-		Hashtbl.clear h;
-		Some node end
-	      else None
-	  | [(c,child)] when not on_root ->
-	      child.start <- child.start - (!(node.final) - node.start + 1);
-	      child.link <- node.link;
-	      Some child
-	  | (_,n0)::_ as l ->
-	      Hashtbl.clear h;
-	      List.iter (fun (c,n) -> Hashtbl.add h c n) l;
-	      if node.seqid = strid
-	      then begin (* strid string does not exists any more *)
-		let a = !(node.final) - node.start + 1 in
-		node.seqid <- n0.seqid;
-		node.start <- n0.start - a;
-		node.final <- ref (n0.start - 1) end;
-	      Some node
+	  else Some (node, Ext.empty)
+      | I x ->
+	  if not (Ext.mem strid x.ext)
+	  then Some (node, Ext.empty)
+	  else
+	    let new_children =
+	      Hashtbl.fold
+		(fun c n res -> match remove_node false st strid n with None -> res | Some (n',moved_locals) -> (c,n',moved_locals)::res)
+		x.children [] in
+	    match new_children with
+	    | [] ->
+		if on_root
+		then begin
+		  Hashtbl.clear x.children;
+		  x.ext <- Ext.remove strid x.ext;
+		  x.locals <- Ext.remove strid x.locals;
+		  x.maximal <- false;
+		  Some (node, Ext.empty) end
+		else begin
+		  remove_link st node;
+		  None end
+	    | [(c,child,_)] when not on_root ->
+		child.start <- child.start - (!(node.final) - node.start + 1);
+		child.parent <- node.parent;
+		remove_link st node;
+		Some (child, x.locals)
+	    | (_,n0,_)::_ as l ->
+		Hashtbl.clear x.children;
+		List.iter
+		  (fun (c,n,moved_locals) ->
+		    Hashtbl.add x.children c n;
+		    n.parent <- node;
+		    x.locals <- Ext.union x.locals moved_locals
+		  ) l;
+		x.ext <- Ext.remove strid x.ext;
+		x.locals <- Ext.remove strid x.locals;
+		let card = Ext.cardinal x.ext in
+		x.maximal <- x.maximal && (remove_maxcard l x.backlinks < card);
+		if node.seqid = strid
+		then begin (* strid string does not exists any more *)
+		  let a = !(node.final) - node.start + 1 in
+		  node.seqid <- n0.seqid;
+		  node.start <- n0.start - a;
+		  node.final <- ref (n0.start - 1) end;
+		Some (node, Ext.empty)
+    and remove_link st node =
+      match succ st node with
+      | None -> ()
+      | Some n ->
+	  match n.v with
+	  | I x -> x.backlinks <- LSet.remove node x.backlinks
+	  | _ -> assert false
+    and remove_maxcard children backlinks =
+      let maxcard_children =
+	List.fold_left (fun res (_,n,_) -> max res (Ext.cardinal (ext0 n))) 0 children in
+      let maxcard_backlinks =
+	List.fold_left (fun res n -> max res (Ext.cardinal (ext0 n))) 0 backlinks in
+      max maxcard_children maxcard_backlinks
 
 
     let fold f st init =
 
 (* readable version of a GST *)
 
-    type tree = Node of string * tree list | Leaf of string * (strid * int)
+    type tree = Node of string * bool * int list * tree list | Leaf of string * (strid * int)
 	
     let tree st =
       fold_s_tree st
 	(fun l n ->
 	  if n == st.root
-	  then Node ("",l)
+	  then Node ("", is_maximal st n, ext st n, l)
 	  else
 	    let w = label st n in
 	    if l=[]
 	    then Leaf (w, suffix st n)
-	    else Node (w, l))
+	    else Node (w, is_maximal st n, ext st n, l))
 
   end
+