Commits

Sébastien Ferré  committed 274bf60

Initial revision

  • Participants
  • Parent commits fc9b48e

Comments (0)

Files changed (1)

File stringset.ml

+(**
+   Suffix trees with incremental addition and removal of strings.
+   plus incremental maintenance of maximal factors.
+*)
+
+(* for test *)
+(*
+#load "unix.cma";;
+#load "str.cma";;
+#load "nums.cma";;
+#load "dbm.cma";;
+#load "common.cmo";;
+#load "cis.cmo";;
+#load "lSet.cmo";;
+#load "bintree.cmo";;
+#load "intset.cmo";;
+#load "index.cmo";;
+#load "persindex.cmo";;
+#load "genid.cmo";;
+*)
+
+open Persindex
+
+type strid = int (* string id *)
+module StrSet = Intset.Cis (* sets of string ids *)
+
+type nodeid = int (* node id *)
+module NodeSet = Intset.LSet (* sets of node ids *)
+
+type factor = nodeid * string * nodeid
+      (** [(parent,s,child)] locates a factor string on the edge from node [parent] to node [child], where [s] is a prefix of the label of [child].
+	 If [s] is the empty string, then [parent] and [child] are a same node.
+	 The path of a factor is the concatenation of [path st parent] and [s]. *)
+
+type tree = Node of string * int * int list * tree list | Leaf of string * (strid * int)
+
+(* Operations on substrings of sequences *)
+module Subseq =
+  struct
+    type t = string * int * int  (* (seq, pos, len) *)
+	  
+    let empty = ("",0,0)  (* non-significant subseq *)
+	
+    let is_empty (s,pos,len) = len = 0
+	
+    let get (s,pos,len) i = s.[pos+i]
+	
+    let length (s,pos,len) = len
+	
+    let sub (s,pos,len) pos' len' = (s,pos+pos',len')
+	
+    let extend (s,pos,len) = (s,pos,len+1)
+  end
+
+(* Operations on nodes *)
+module Node =
+  struct
+    type internal = {
+	children : (char,nodeid) Hashtbl.t;
+	mutable link : nodeid; (* suffix link *)
+	mutable backlinks : NodeSet.t;
+	mutable ext : StrSet.t; (* set of strids under this node *)
+	mutable locals : StrSet.t; (* subset of ext, strids only in leaves of the node (only on maximal nodes) *)
+	mutable maximal_right : bool; (* whether this node is maximal on its right given its ext (a concept intent) *)
+	mutable maximal : strid; (* by which strid (>0) this node became a maximal node (a concept intent) *)
+	mutable visible : int * int; (* which part of this node is visible: left and right offset. *)
+      }
+      
+    type specific  =
+      | I of internal (* for non-leaves, internal nodes *)
+      | L of int (* for leaves: position of recognized suffix *)
+
+(** Type of the nodes of suffix trees.
+   Nodes are either leaves or internal nodes. *)
+    type node = {
+	mutable seqid : strid; (* sequence index in which the positions start and final are defined *)
+	mutable start : int;     (* start and final position of the word labelling the node *)
+	mutable final : int ref;
+	mutable parent : nodeid; (* prefix link, the root for the root itself *)
+	v : specific
+      }
+
+    let ext node =
+      match node.v with
+      | I x -> x.ext
+      | L _ -> StrSet.singleton node.seqid
+	
+    let maximal node =
+      match node.v with
+      | I x -> x.maximal
+      | L _ -> assert false
+
+    let locals node =
+      match node.v with
+      | I x -> x.locals
+      | L _ -> StrSet.empty
+	    
+    let length node =
+      !(node.final) - node.start + (match node.v with I _ -> 1 | L _ -> 0)
+
+    let label str node =
+      String.sub (str # get node.seqid) node.start (length node)
+
+    let get_visible node =
+      match node.v with
+      | I x -> x.visible
+      | _ -> raise (Invalid_argument "Stringset.get_visible: applied to a leaf")
+
+  end
+
+open Node
+
+(* state for 'update' *)
+type res = {
+    terminal : int ref;
+    mutable startj : int;
+    mutable startnode : (nodeid * node) * Subseq.t * (nodeid * node);
+  }
+
+let eq_char c1 c2 =
+  c1<>'\000' & c1=c2  (* ensures that 2 terminal symbols '\000' are pairwise different (for GST only, not necessary for ST) *)
+
+
+(** Class of suffix trees. This is not a purely functional data-structure. *)
+
+class st ?(get_visible : string -> int * int = fun _ -> 0, 0) (name : string) (db : database) =
+  object (self)
+    val strgen = new Genid.genid (name ^ ".strgen") db
+    val stridx : (strid,string) index = new varray_vector_opt ~vals:Serialize.string 13 chunk (fun _ -> raise Not_found) db
+    val nodegen = new Genid.genid (name ^ ".nodegen") db
+    val nodeidx : (nodeid,node) index = new varray_vector_opt ~vals:Serialize.marshal 13 chunk (fun _ -> raise Not_found) db
+    val root : (unit,nodeid) index = new var ~vals:Serialize.int (-1) (fun () -> raise Not_found) db
+
+    initializer
+      stridx # locate (name ^ "_strid") name "str";
+      nodeidx # locate (name ^ "_nodeid") name "node";
+      root # locate name name "root";
+      try
+	ignore (root # get ())
+      with Not_found -> (* creation of the suffix tree, and not re-opening from db *)
+	let root_id = nodegen # alloc in
+	let root_node = {
+	  seqid = -1;
+	  start = 0;
+	  final = ref (-1);
+	  parent = root_id;
+	  v=I
+	    { children = Hashtbl.create 2;
+	      link = root_id;
+	      backlinks = NodeSet.empty;
+	      ext = StrSet.empty;
+	      locals = StrSet.empty;
+	      maximal = 0;
+	      maximal_right = false;
+	      visible = (0,0);
+	    }} in
+	root # set () root_id;
+	nodeidx # set root_id root_node
+
+    method name = name
+
+    method sync =
+      strgen # sync;
+      stridx # sync;
+      nodegen # sync;
+      nodeidx # sync;
+      root # sync
+
+    method unload (p : int) =
+      stridx # unload p;
+      nodeidx # unload p
+
+
+(** {1 Suffix trees as string sets. } *)
+
+(* returns the child node that recognizes [implicit] from the node [explicit] *)
+    method private get_child (n,node) implicit =
+      if Subseq.is_empty implicit
+      then (n,node)
+      else
+	let c = Subseq.get implicit 0 in
+	if c = '\000'
+	then raise Not_found
+	else
+	  match node.v with
+	  | I x -> let ch = Hashtbl.find x.children c in (ch, nodeidx # get ch)
+	  | L _ -> raise Not_found
+
+    method add (str : string) : strid =
+	(** [st # add s] adds the string [s], and all its suffixes in the suffix tree [st], unless [s] has already been added.
+	   It also returns the string id as an handle on this string. *)
+    (* ensures that implicit does not span over another node below [explicit] *)
+      let rec canonical ((e,explicit),implicit,(c,child) as res) =
+	if Subseq.is_empty implicit
+	then res
+	else
+	  let l = !(child.final) - child.start + 1 in
+	  let a = Subseq.length implicit in
+	  if a < l
+	  then res
+	  else
+	    let implicit' = Subseq.sub implicit l (a-l) in
+	    canonical ((c, child), implicit', self # get_child (c,child) implicit')
+      in
+    (* test whether an implicit node is the root node *)
+      let is_root ((_,explicit), implicit, _) =
+	explicit.seqid = (-1) && Subseq.is_empty implicit
+      in
+    (* test whether the extension of an implicit node by [seqar.(k).[i]] is still recognized in the GST,
+       and if yes, returns the implicit node extended by 1 position, otherwise returns [None]. *)
+      let has_child ((e,explicit),implicit,(c,child)) (k,i) =
+	let a = Subseq.length implicit in
+	if a <> 0 then
+	  if eq_char (stridx # get child.seqid).[child.start+a] (stridx # get k).[i]
+	  then Some ((e,explicit), Subseq.extend implicit, (c,child))
+	  else None
+	else
+	  try
+	    let implicit' = (stridx # get k,i,1) in
+	    Some ((e,explicit), implicit', self # get_child (e,explicit) implicit')
+	  with Not_found -> None
+      in
+    (* creation of new nodes and leaves *)
+      let add_leaf (n,node) seqid start final_ref index =
+	match node.v with
+	| I x ->
+	    let c = nodegen # alloc in
+	    let child = {seqid=seqid; start=start; final=final_ref; parent=n; v=(L index)} in
+	    nodeidx # set c child;
+	    Hashtbl.add x.children (stridx # get seqid).[start] c;
+	    nodeidx # set n node (* for update *)
+	| L _ -> raise (Invalid_argument "Stringset.add_leaf: 2nd argument must not be a leaf")
+      in      
+    (* make explicit an implicit node by inserting a new node between [explicit] and [child] *)
+      let insert_node strid ((e,explicit),implicit,(c,child)) =
+	match explicit.v with
+	| I x ->
+	    let a = Subseq.length implicit in
+	    if a = 0
+	    then begin
+	      if x.maximal = 0 then begin (* otherwise explicit has been made maximal by another strid *)
+		x.maximal <- strid;
+		x.maximal_right <- true;
+		x.locals <- StrSet.add strid x.locals end;
+	      nodeidx # set e explicit;
+	      e, explicit end
+	    else begin
+	      let c_child_old = (stridx # get child.seqid).[child.start] in
+	      let c_child_new = (stridx # get child.seqid).[child.start+a] in
+	      let h' = Hashtbl.create (Hashtbl.length x.children) in Hashtbl.add h' c_child_new c;
+	      let n' = nodegen # alloc in
+	      let node' = {
+		seqid = child.seqid;
+		start = child.start;
+		final = ref (child.start+a-1);
+		parent = e;
+		v = I
+		  { children = h';
+		    link = root # get ();
+		    backlinks = NodeSet.empty;
+		    ext = Node.ext child;
+		    locals = StrSet.singleton strid;
+		    maximal = strid;
+		    maximal_right = true;
+		    visible = (0,0);
+		  };
+	      } in
+	      nodeidx # set n' node';
+	      child.start <- child.start+a;
+	      child.parent <- n';
+	      nodeidx # set c child;
+	      Hashtbl.replace x.children c_child_old n';
+	      nodeidx # set e explicit;
+	      self # set_visible n' (get_visible (self # path n'));
+	      nodeidx # set n' node';
+	      n', node' end
+	| L _ -> raise (Invalid_argument "Stringset.insert_node: first part of 2nd argument must not be a leaf")
+      in
+    (* add some strid in the extent of all ancestor of a node (except the root) *)
+      let rec add_strid strid (n,node) =
+	match node.v with
+	| I x ->
+	    if node.seqid <> (-1) && not (StrSet.mem strid x.ext) then begin
+	      x.ext <- StrSet.add strid x.ext;
+	      nodeidx # set n node;
+	      add_strid strid (node.parent, nodeidx # get node.parent) end
+	    else
+	      if x.maximal = strid then begin (* this node has been made maximal when adding strid *)
+		x.maximal <- 0;
+		x.maximal_right <- false;
+		x.locals <- StrSet.remove strid x.locals;
+		nodeidx # set n node end
+	| _ -> assert false
+      in
+    (* add a suffix link from [pred_opt] (if defined) to [explicit] *)
+      let add_link strid pred_opt (e,explicit) =
+	match pred_opt with
+	| Some (n,node) ->
+	    ( match node.v, explicit.v with
+	    | I x0, I x ->
+		x0.link <- e;
+		x.backlinks <- NodeSet.add n x.backlinks;
+		if x.maximal = strid then begin
+		  x.maximal <- 0; (* maximal_right is left unchanged *)
+		  x.locals <- StrSet.remove strid x.locals end;
+		nodeidx # set n node;
+		nodeidx # set e explicit
+	    | _ -> assert false)
+	| None -> ()
+      in
+    (* extends suffix_link for implicit nodes *)
+      let get_link ((e,explicit),implicit,_) =
+	if Subseq.is_empty implicit
+	then
+	  let e_explicit' = match explicit.v with I x -> x.link, nodeidx # get x.link | _ -> assert false in  (*suffix_link root explicit*)
+	  (e_explicit', Subseq.empty, e_explicit')
+	else
+	  if explicit.seqid = (-1)
+	  then
+	    let implicit' = Subseq.sub implicit 1 (Subseq.length implicit - 1) in
+	    let root_id = root # get () in
+	    let root_node = nodeidx # get root_id in
+	    canonical ((root_id, root_node), implicit', self # get_child (root_id,root_node) implicit')
+	  else
+	    let e', explicit' = match explicit.v with I x -> x.link, nodeidx # get x.link | _ -> assert false in  (*suffix_link root explicit*)
+	    canonical ((e',explicit'), implicit, self # get_child (e',explicit') implicit)
+      in
+    (* GST update for the new character c at position i in sequence k *)
+      let rec update (strid,i) res pred_opt =
+	(* c = (stridx # get strid).[i] *)
+	match has_child res.startnode (strid,i) with
+	| Some extended_startnode -> (* startnode can be extended by [c] *)
+	    let e_explicit, implicit, _ = res.startnode in
+	    assert (pred_opt = None or Subseq.is_empty implicit);
+            (* if a link has been followed after node creation, then we are on an explicit node *)
+	    add_link strid pred_opt e_explicit;
+	    res.startnode <- canonical extended_startnode
+	| None -> (* startnode cannot be extended by [c] ... *)
+	    let n', node' as n_node' = insert_node strid res.startnode in (* ... so we insert a new node ... *)
+	    if (stridx # get strid).[res.startj] <> '\000' then begin
+	      add_leaf n_node' strid i res.terminal res.startj;  (* ... and a new leaf for the suffix at position [res.startj] *)
+	      add_strid strid n_node';  (* updating the extent of ancestor nodes *)
+	      add_link strid pred_opt n_node';  (* ... a suffix link from the last created node (if defined) ... *)
+	    end;
+	    res.startj <- res.startj + 1; (* prepare for the next suffix *)
+	    if not (is_root res.startnode)
+	    then begin (* while [res.startnode] is not the root, and cannot be extended by [c] ... *)
+	      res.startnode <- get_link res.startnode; (* ... follow the suffix link to find the next suffix ... *)
+	      update (strid,i) res (Some n_node') end (* ... and loop on [update] *)
+      in
+    (* add a string and returns its strid. If the string already exists, the GST is not modified,
+       and the existing id is returned *)
+      try
+	self # find str
+      with Not_found ->
+	let strid = strgen # alloc in
+	let str0 = str ^ String.make 1 '\000' in (* add a terminal symbol *)
+	stridx # set strid str0;
+	let root_id = root # get () in
+	let root_node = nodeidx # get root_id in
+	( match root_node.v with
+	| I x ->
+	    x.ext <- StrSet.add strid x.ext;
+	    x.maximal <- strid;
+	    x.maximal_right <- true;
+	    x.locals <- StrSet.add strid x.locals;
+	    nodeidx # set root_id root_node
+	| _ -> assert false ); (* update the extent of the root *)
+	let res = {terminal=ref (-1); startj=0; startnode=((root_id,root_node), Subseq.empty, (root_id,root_node))} in (* initialize for [update] ... *)
+	for pos = 0 to String.length str0 - 1 do (* for every position [i] in the sequence ... *)
+	  incr res.terminal; (* increment the leaves final position ... *)
+	  update (strid,pos) res None (* call [update] for updating the suffix tree with the character at position [pos] *)
+	done;
+	strid
+
+    method remove (strid : strid) : unit =
+	(** [st # remove id] removes the string identified by [id], and all its suffixes, from the suffix tree [st]. *)
+      let remove_maxcard children backlinks =
+	let maxcard_children =
+	  List.fold_left (fun res (_,n,node,_) -> max res (StrSet.cardinal (Node.ext node))) 0 children in
+	let maxcard_backlinks =
+	  NodeSet.fold (fun res n -> max res (StrSet.cardinal (self # ext n))) 0 backlinks in
+	maxcard_children, maxcard_backlinks
+      in
+      let remove_link (n,node) =
+	if node.seqid = (-1)
+	then ()
+	else
+	  match node.v with
+	  | L _ -> ()
+	  | I x ->
+	      let succ = nodeidx # get x.link in
+	      match succ.v with
+	      | I x -> x.backlinks <- NodeSet.remove n x.backlinks
+	      | _ -> assert false
+      in
+      let rec remove_node on_root strid n0 =
+	let node = nodeidx # get n0 in
+	match node.v with
+	| L _ ->
+	    if node.seqid = strid
+	    then None
+	    else Some (n0, node, StrSet.empty)
+	| I x ->
+	    if not (StrSet.mem strid x.ext)
+	    then Some (n0, node, StrSet.empty)
+	    else
+	      let new_children =
+		Hashtbl.fold
+		(fun c n res -> match remove_node false strid n with None -> res | Some (n',node',moved_locals) -> (c,n',node',moved_locals)::res)
+		  x.children [] in
+	      match new_children with
+	      | [] ->
+		  if on_root
+		  then begin
+		    Hashtbl.clear x.children;
+		    x.ext <- StrSet.remove strid x.ext;
+		    x.locals <- StrSet.remove strid x.locals;
+		    x.maximal <- 0;
+		    x.maximal_right <- false; (* ? *)
+		    nodeidx # set n0 node;
+		    Some (n0, node, StrSet.empty) end
+		  else begin
+		    remove_link (n0,node);
+		    None end
+	      | [(c,ch,child,_)] when not on_root ->
+		  child.start <- child.start - (!(node.final) - node.start + 1);
+		  child.parent <- node.parent;
+		  nodeidx # set ch child;
+		  remove_link (n0,node);
+		  Some (ch, child, x.locals)
+	      | (_,n1,node1,_)::_ as l ->
+		  Hashtbl.clear x.children;
+		  List.iter
+		    (fun (c,n',node',moved_locals) ->
+		      Hashtbl.add x.children c n';
+		      node'.parent <- n0;
+		      nodeidx # set n' node';
+		      x.locals <- StrSet.union x.locals moved_locals
+		    ) l;
+		  x.ext <- StrSet.remove strid x.ext;
+		  x.locals <- StrSet.remove strid x.locals;
+		  let card = StrSet.cardinal x.ext in
+		  let maxcard_children, maxcard_backlinks = remove_maxcard l x.backlinks in
+		  if max maxcard_children maxcard_backlinks = card then x.maximal <- 0;
+		  if maxcard_children = card then x.maximal_right <- false;
+		  if node.seqid = strid
+		  then begin (* strid string does not exists any more *)
+		    let a = !(node.final) - node.start + 1 in
+		    node.seqid <- node1.seqid;
+		    node.start <- node1.start - a;
+		    node.final <- ref (node1.start - 1) end;
+		  nodeidx # set n0 node;
+		  Some (n0, node, StrSet.empty)
+      in
+      ignore (remove_node true strid (root # get ()));
+      stridx # reset strid
+
+    method get (strid : strid) : string =
+	(** [st # get id] returns the string associated to [id]. *)
+      let str0 = stridx # get strid in
+      String.sub str0 0 (String.length str0 - 1)
+
+    method find (str : string) : strid =
+	(** [st # find s] returns the id associated to the string [s], if the strings exists in the suffix tree [st].
+	   Otherwise raise Not_found. *)
+    (* test whether the extension of an implicit node by '\000' exists in the GST,
+       and if so, returns the corresponding leaves, otherwise returns None. *)
+      let has_end (e,implicit,c) =
+	let a = String.length implicit in
+	if a <> 0 then
+	  let child = nodeidx # get c in
+	  if (stridx # get child.seqid).[child.start+a] = '\000'
+	  then Some [c]
+	  else None
+	else
+	  let explicit = nodeidx # get e in
+	  match explicit.v with
+	  | I x -> Some (Hashtbl.find_all x.children '\000')
+	  | L _ -> None
+      in
+      let factor = self # find_factor str in
+      match has_end factor with
+      | Some leafs ->
+	  Common.mapfind
+	    (fun leaf ->
+	      let (strid,pos) = self # suffix leaf in
+	      if pos = 0 then Some strid else None) (* there should be only one *)
+	    leafs
+      | None -> raise Not_found
+
+(** {1 Low-level interface on suffix trees. } *)
+
+    method root : nodeid =
+         (** [st # root] returns the root node of the suffix tree [st]. *)
+      root # get ()
+
+    method is_leaf (n : nodeid) : bool =
+	(** [st # is_leaf n] returns whether the node [n] is a leaf. *)
+      let node = nodeidx # get n in
+      match node.v with
+      | L _ -> true
+      | _ -> false
+
+    method label (n : nodeid) : string =
+	(** [st # label n] returns the string labelling the node [n]. *)
+      if n = root # get ()
+      then ""
+      else
+	let node = nodeidx # get n in
+	Node.label stridx node
+
+    method length (n : nodeid) : int =
+	(** [st # length n] returns the length of the string labelling the node [n]. *)
+      let node = nodeidx # get n in
+      Node.length node
+
+    method path (n : nodeid) : string =
+	(** [st # path n] returns the full path from the root to the node [n]. *)
+      let rec aux n =
+	if n = root # get ()
+	then ""
+	else
+	  let node = nodeidx # get n in
+	  aux node.parent ^ Node.label stridx node in
+      aux n
+
+    method height (n : nodeid) : int =
+	(** [st # height n] returns the height of node [n], i.e. the length of the path from root to [n]. *)
+      let rec aux n =
+	if n = root # get ()
+	then 0
+	else
+	  let node = nodeidx # get n in
+	  aux node.parent + Node.length node in
+      aux n
+
+    method ext (n : nodeid) : StrSet.t =
+	(** [st # ext n] returns a set of string ids that match the path of the node [n]. *)
+      let node = nodeidx # get n in
+      Node.ext node
+
+    method children (n : nodeid) : NodeSet.t =
+	(** [st # children n] returns the set of children nodes of [n]. *)
+      let node = nodeidx # get n in
+      match node.v with
+      | I x -> Hashtbl.fold (fun c -> NodeSet.add) x.children NodeSet.empty
+      | L _ -> NodeSet.empty
+
+    method parent (n : nodeid) : nodeid option =
+	(** [st # parent n] returns the parent node of [n], unless [n] is the root node. *)
+      if n = root # get ()
+      then None
+      else Some (nodeidx # get n).parent
+
+    method succ (n : nodeid) : nodeid option =
+	(** [st # succ n] returns the successor node through the suffix link of [n], unless there is no suffix link. *)
+      if n = root # get ()
+      then None
+      else 
+	let node = nodeidx # get n in
+	match node.v with
+	| I x -> Some x.link
+	| L _ -> None
+
+    method preds (n : nodeid) : NodeSet.t =
+	(** [st # preds n] returns the list of all nodes having [n] as successor node. *)
+      let node = nodeidx # get n in
+      match node.v with
+      | I x -> x.backlinks
+      | L _ -> NodeSet.empty
+
+    method suffix (n : nodeid) : strid * int =
+	(** [st # suffix n] returns the suffix represented by the leaf node [n] as a couple [(string id, position in the string)].
+	   Raise Not_found if [n] is not a leaf. *)
+      let node = nodeidx # get n in
+      match node.v with
+      | I _ -> raise Not_found
+      | L i -> (node.seqid, i)
+
+    method find_node (str : string) : nodeid =
+	(** [st # find_node s] returns the node whose path is equal to the string [s], if it exists.
+	   Raise Not_found otherwise. *)
+      let (explicit,implicit,_) = self # find_factor str in
+      if implicit = ""
+      then explicit
+      else raise Not_found
+
+    method fold : 'h 's . ?start:nodeid -> ('h -> nodeid -> bool) -> ('h -> nodeid -> 'h) -> ('s list -> 'h -> nodeid -> 's) -> 'h -> 's =
+	(** [st # fold start filter herit synth h0] returns the result of an attribute evaluation on the suffix tree [st].
+	   - [start] is the starting node for the evaluation, only the subtree is explored,
+	   - [filter] is used to filter which children of a node should be explored given the heritance value of the parent node,
+	   - [herit] defines the heritance value of a node, given the heritance value of its parent,
+	   - [synth] defines the synthesized value of a node given its heritance value, and the list of synthesized values of its filtered children,
+	   - [h0] is the heritance value given to the root.
+	 *)
+      fun ?(start = root # get ()) f h s init ->
+	let rec aux f h s h_node node =
+	  s
+	    (List.map
+	       (fun child -> aux f h s (h h_node child) child)
+	       (List.filter (f h_node) (NodeSet.elements (self # children node))))
+	    h_node
+	    node in
+	aux f h s init start
+
+  (* synthesized attributes only *)
+    method fold_s : 's . ?start:nodeid -> ('s list -> nodeid -> 's) -> 's =
+      fun ?start s ->
+	self # fold ?start (fun _ _ -> true) (fun _ _ -> ()) (fun l _ n -> s l n) ()
+	
+  (* filtering and synthesizing, no inheritance *)
+    method fold_fs : 's . ?start:nodeid -> (nodeid -> bool) -> ('s list -> nodeid -> 's) -> 's =
+      fun ?start f s ->
+	self # fold ?start (fun _ n -> f n) (fun _ _ -> ()) (fun l _ n -> s l n) ()
+
+
+(** {1 Exploring the suffix tree through the substring relation. } *)
+		
+    method path_restrictions (x : nodeid) : nodeid list =
+	(** [st # path_restrictions n] returns the list of nodes whose path is a direct restriction of the path of [n]. *)
+      let lp = match self # parent x with None -> NodeSet.empty | Some p -> NodeSet.singleton p in (* the prefix restriction, if it exists *)
+      let ls = match self # succ x with None -> NodeSet.empty | Some s -> NodeSet.singleton s in (* the suffix restriction, if it exists *)
+      NodeSet.elements (NodeSet.union lp ls)
+
+    method path_extensions (x : nodeid) : nodeid list =
+	(** [st # path_extensions n] returns the list of nodes whose path is a direct extension of the path of [n]. *)
+      let lr = NodeSet.filter (fun n -> not (self # is_leaf n)) (self # children x) in (* right extensions *)
+      let lf = self # preds x in (* left extensions *)
+      NodeSet.elements (NodeSet.union lr lf)
+
+    method is_maximal (n : nodeid) : bool =
+	(** [st # is_maximal n] returns whether a node is maximal.
+	   A node is maximal is each of its extensions has a strictly smaller extent, or the node represents a full string. *)
+      let node = nodeidx # get n in
+      match node.v with
+      | I x -> x.maximal > 0
+      | L pos -> pos=0
+
+    method set_visible (n : nodeid) (lr : int * int) : unit =
+	(** [st # set_visible n (left_pos, right_pos)] sets which part of a node path should be visible when maximal. *)
+      let node = nodeidx # get n in
+      match node.v with
+      | I x -> x.visible <- lr
+      | L _ -> ()
+
+    method max_restrictions (x : nodeid) : nodeid list =
+	(** [st # max_restrictions n] returns the list of maximal nodes whose path is a restriction of the path of [n]. *)
+      let rec aux acc = function
+	| [] -> acc
+	| n::ns ->
+	    if self # is_maximal n
+	    then aux (LSet.add (self # height n, n) acc) ns
+	    else aux acc (self # path_restrictions n @ ns)
+      in
+      let res1 = aux (LSet.empty ()) (self # path_restrictions x) in
+      let _, res2 =
+	Common.fold_while
+	  (fun (res1, res2) ->
+	    match res1 with
+	    | [] -> None
+	    | (_,n)::hns -> Some (LSet.diff hns (aux (LSet.empty ()) (self # path_restrictions n)), n::res2))
+	  (res1, []) in
+      res2
+
+    method max_extensions (n0_opt : nodeid option) : nodeid list * strid list =
+	(** [st # max_extensions n_opt] returns the list of maximal nodes and leaves whose path is an extension of the path of [n], when given.
+	   If a start node is not given, then the maximal nodes with shortest path are returned. *)
+      let rec max_extensions_remove_left on_start n0 acc =
+	(* node is right-maximal right extension of start (possibly start itself) *)
+	let node = nodeidx # get n0 in
+	match node.v with
+	| I x ->
+	    let acc0 =
+	      if not on_start && x.maximal > 0
+	      then NodeSet.remove n0 acc
+	      else acc in
+	    NodeSet.fold
+	      (fun acc n -> max_extensions_remove_left false n acc)
+	      acc0
+	      x.backlinks
+	| _ -> assert false
+      in
+      let rec max_extensions_remove_right start_is_root on_start n0 acc =
+	(* node is right extension of start (possibly start itself (on_start = true)) *)
+	let node = nodeidx # get n0 in
+	match node.v with
+	| I x ->
+	    let acc0 =
+	      if not on_start && x.maximal > 0
+	      then NodeSet.remove n0 acc
+	      else acc in
+	    let acc1 =
+	      if not start_is_root && x.maximal_right
+	      then max_extensions_remove_left on_start n0 acc0
+	      else acc0 in
+	    Hashtbl.fold
+	      (fun _ n acc -> max_extensions_remove_right start_is_root false n acc)
+	      x.children
+	      acc1
+	| _ -> acc
+      in
+      let rec max_extensions_left on_start (shift_left, shift_right) n0 acc =
+	(* node is right-maximal right extension of start (possibly start itself) *)
+	let node = nodeidx # get n0 in
+	match node.v with
+	| I x ->
+	    let sl, sr = x.visible in
+	    if not on_start && x.maximal > 0 && ((*sl <= shift_left && sr <= shift_right && *) sl + sr < shift_left + shift_right) then
+	      NodeSet.add n0 acc
+	    else
+	      NodeSet.fold
+		(fun acc n -> max_extensions_left false (shift_left + 1, shift_right) n acc)
+		acc
+		x.backlinks
+	| _ -> assert false
+      in
+      let rec max_extensions_right start_is_root on_start (shift_left, shift_right) n0 acc =
+	(* node is right extension of start (possibly start itself (on_start = true)) *)
+	let node = nodeidx # get n0 in
+	match node.v with
+	| I x ->
+	    let sl, sr = x.visible in
+	    if not on_start && x.maximal > 0 && ((* sl <= shift_left && sr <= shift_right && *) sl + sr < shift_left + shift_right) then
+	      NodeSet.add n0 acc
+	    else
+	      let acc1 =
+		if not start_is_root && x.maximal_right
+		then max_extensions_left on_start (shift_left, shift_right) n0 acc
+		else acc in
+	      Hashtbl.fold
+		(fun _ n acc ->
+		  let node' = nodeidx # get n in
+		  max_extensions_right start_is_root false (shift_left, shift_right + !(node'.final) - node'.start + 1) n acc)
+		x.children
+		acc1
+	| _ -> acc
+      in
+      let on_start, start_is_root, start =
+	match n0_opt with
+	| None -> false, true, root # get ()
+	| Some n -> true, n = root # get (), n in
+      let start_node = nodeidx # get start in
+      let incrs0 = max_extensions_right start_is_root on_start (Node.get_visible start_node) start NodeSet.empty in
+      let incrs1 =
+	NodeSet.fold
+	  (fun incrs1 n ->
+	    max_extensions_remove_right false true n incrs1)
+	  incrs0 incrs0 in
+      let incrs = NodeSet.elements incrs1 in
+      let ext = Node.ext start_node in
+      let locals =
+	StrSet.elements (List.fold_left (fun res n -> StrSet.diff res (self # ext n)) ext incrs) in
+      incrs, locals
+
+    method string_restrictions (strid : strid) : nodeid list =
+	(** [st # string_restrictions strid] returns the list of maximal nodes having [strid] as a string extension. *)
+      NodeSet.elements
+	(self # fold_fs  (* looking for the nodes having strid as a local *)
+	   (fun n -> StrSet.mem strid (self # ext n))
+	   (fun l n ->
+	     List.fold_left
+	       NodeSet.union
+	       (if self # is_maximal n && StrSet.mem strid (Node.locals (nodeidx # get n)) then NodeSet.singleton n else NodeSet.empty)
+	       l))
+
+
+(** {1 Searching in a suffix tree} *)
+
+    method find_factor (str : string) : factor =
+	(** [st # find_factor s] returns the factor locating [s] in the suffix tree [st].
+	   This means the path of the result factor is equal to [s].
+	   Raise [Not_found] if the string [s] does not appear in any string of [st]. *)
+      let rec find_factor_aux (n,node) implicit =
+	let w = Subseq.length implicit in
+	if w = 0
+	then (n,implicit,n)
+	else
+	  let c, child = self # get_child (n,node) implicit in
+	  let l = !(child.final) - child.start + 1 in
+	  let a = ref 1 in
+	  while !a < l & !a < w & eq_char (stridx # get child.seqid).[child.start + !a] (Subseq.get implicit !a) do
+	    incr a
+	  done; (* [!a] is the first mismatch position, or the length of [child] label *)
+	  if !a = l
+	  then find_factor_aux (c,child) (Subseq.sub implicit !a (w - !a))
+	  else 
+	    if !a = w
+	    then (n,implicit,c)
+	    else raise Not_found
+      in
+      let root_id = root # get () in
+      let root_node = nodeidx # get root_id in
+      let (explicit, (s,i,len), child) = find_factor_aux (root_id, root_node) (str,0,String.length str) in
+      (explicit, String.sub s i len, child)
+
+    method suffixes (_, _, child : factor) : (strid * int) list =
+	(** [st # suffixes f] returns the list of all suffixes [(strid,pos)] that have the path of [f] as a prefix: this path occurs in string [strid] at position [pos]. *)
+      self # fold_s
+	~start:child
+	(fun l n -> if l=[] then [self # suffix n] else List.concat l)
+
+    method strings (_, _, child : factor) : StrSet.t =
+	(** [st # strings f] returns the set of the ids of all strings containing the path of [f]. *)
+      self # ext child
+(*
+      fold_s
+        ~start:child
+	(fun l n -> if l=[] then StrSet.singleton (fst (self # suffix n)) else StrSet.union_r l)
+*)
+
+
+(** {1 Simpler representation of a suffix tree (for debugging purpose at top-level)} *)
+
+    method tree : tree =
+      (* readable version of a GST *)
+      let maximal n =
+	let node = nodeidx # get n in
+	Node.maximal node
+      in
+      let root_id = root # get () in
+      self # fold_s
+	(fun l n ->
+	  if n = root_id
+	  then
+	    Node ("", maximal n, StrSet.elements (self # ext n), l)
+	  else
+	    let w = self # label n in
+	    if l=[]
+	    then Leaf (w, self # suffix n)
+	    else Node (w, maximal n, StrSet.elements (self # ext n), l))
+
+  end
+
+(* for test *)
+(*
+let st = new st ~get_visible:(fun _ -> (0,0)) "root" (new database);;
+let _ =
+  ignore (st # add "formal concept analysis");
+  ignore (st # add "logical concept analysis");
+  ignore (st # add "conceptual graphs");;
+*)