1. Sébastien Ferré
  2. ocaml-lib

Source

ocaml-lib / suffix_tree.ml

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
(**
   Suffix trees with incremental addition and removal of strings.
   plus incremental maintenance of maximal factors.
*)


module type T =
  sig
    type strid = int
	  (** Type of string ids. Functions using such ids are unspecified if the id is not valid. *)
    type t
	  (** Type of suffix trees. This is not a pure functional data-structure. *)

(** {1 Suffix trees as string sets. } *)

    val create : unit -> t
	(** [create ()] returns a fresh and empty suffix tree. *)
    val size : t -> int
	(** [size st] returns the number of strings registered in the suffix tree [st]. *)
    val add : t -> string -> strid
	(** [add st s] adds the string [s], and all its suffixes in the suffix tree [st], unless [s] has already been added.
	   It also returns the string id as an handle on this string. *)
    val remove : t -> strid -> unit
	(** [remove st id] removes the string identified by [id], and all its suffixes, from the suffix tree [st]. *)
    val get : t -> strid -> string
	(** [get st id] returns the string associated to [id]. *)
    val find : t -> string -> strid
	(** [find st s] returns the id associated to the string [s], if the strings exists in the suffix tree [st].
	   Otherwise raise Not_found. *)
    val fold : (strid -> string -> 'a -> 'a) -> t -> 'a -> 'a
	(** [fold f st e] is a classic folding on all strings in the suffix tree [st]. *)

(** {1 Low-level interface on suffix trees. } *)

    type node
	  (** Type of the nodes of suffix trees.
	     Nodes are either leaves or internal nodes. *)

    val root : t -> node
	(** [root st] returns the root node of the suffix tree [st]. *)
    val is_leaf : t -> node -> bool
	(** [is_leaf st n] returns whether the node [n] is a leaf. *)
    val label : t -> node -> string
	(** [label st n] returns the string labelling the node [n]. *)
    val path : t -> node -> string
	(** [path st n] returns the full path from the root to the node [n]. *)
    val ext : t -> node -> strid LSet.t
	(** [ext st n] returns an ordered list of string ids that match the path of the node [n]. *)
    val children : t -> node -> node LSet.t
	(** [children st n] returns the list of children nodes of [n]. *)
    val parent : t -> node -> node option
	(** [parent st n] returns the parent node of [n], unless [n] is the root node. *)
    val succ : t -> node -> node option
	(** [succ st n] returns the successor node through the suffix link of [n], unless there is no suffix link. *)
    val preds : t -> node -> node LSet.t
	(** [preds st n] returns the list of all nodes having [n] as successor node. *)
    val suffix : t -> node -> strid * int
	(** [suffix st n] returns the suffix represented by the leaf node [n] as a couple [(string id, position in the string)].
	   Raise Not_found if [n] is not a leaf. *)
    val find_node : t -> string -> node
	(** [find_node st s] returns the node whose path is equal to the string [s], if it exists.
	   Raise Not_found otherwise. *)
    val fold_tree : t -> ('h -> node -> bool) -> ('h -> node -> 'h) -> ('s list -> 'h -> node -> 's) -> 'h -> 's
	(** [fold_tree st filter herit synth h0] returns the result of an attribute evaluation on the suffix tree [st].
	   - [filter] is used to filter which children of a node should be explored given the heritance value of the parent node,
	   - [herit] defines the heritance value of a node, given the heritance value of its parent,
	   - [synth] defines the synthesized value of a node given its heritance value, and the list of synthesized values of its filtered children,
	   - [h0] is the heritance value given to the root.
	 *)

(** {1 Exploring the suffix tree through the substring relation. } *)
		
    val path_restrictions : t -> node -> node list
	(** [path_restrictions st n] returns the list of nodes whose path is a direct restriction of the path of [n]. *)
    val path_extensions : t -> node -> node list
	(** [path_extensions st n] returns the list of nodes whose path is a direct extension of the path of [n]. *)
    val is_maximal : t -> node -> bool
	(** [is_maximal st n] returns whether a node is maximal.
	   A node is maximal is each of its extensions has a strictly smaller extent, or the node represents a full string. *)
    val max_restrictions : t -> node -> node list
	(** [max_restrictions st n] returns the list of maximal nodes whose path is a restriction of the path of [n]. *)
    val max_extensions : t -> node option -> node list
	(** [max_extensions st n_opt] returns the list of maximal nodes whose path is a extension of the path of [n], when given.
	   If a start node is not given, then the maximal nodes with shortest path are returned. *)
    val string_extensions : t -> node option -> strid list
	(** [string_extensions st n_opt] completes the result of [max_extensions st n_opt] with full strings through their ids. *)
    val string_restrictions : t -> strid -> node list
	(** [string_restrictions st strid] returns the list of maximal nodes having [strid] as a string extension. *)

(** {1 Searching in a suffix tree} *)

    type factor = node * string * node
	  (** [(parent,s,child)] locates a factor string on the edge from node [parent] to node [child], where [s] is a prefix of the label of [child].
	     If [s] is the empty string, then [parent] and [child] are a same node.
	     The path of a factor is the concatenation of [path st parent] and [s]. *)

    val find_factor : t -> string -> factor
	(** [find_factor st s] returns the factor locating [s] in the suffix tree [st].
	   This means the path of the result factor is equal to [s].
	   Raise [Not_found] if the string [s] does not appear in any string of [st]. *)
    val suffixes : t -> factor -> (strid * int) list
	(** [suffixes st f] returns the list of all suffixes [(strid,pos)] that have the path of [f] as a prefix: this path occurs in string [strid] at position [pos]. *)
    val strings : t -> factor -> strid LSet.t
	(** [strings st f] returns the ids of all string containing the path of [f]. *)

(** {1 Simpler representation of a suffix tree (for debugging purpose at top-level)} *)

    type tree = Node of string * bool * int list * tree list | Leaf of string * (strid * int)

    val tree : t -> tree

  end


(* --------------------------------------------------------------------------------
   Operations on substrings of sequences
   -------------------------------------------------------------------------------- *)

    module Subseq =
      struct
	type t = string * int * int  (* (seq, pos, len) *)
	  
	let empty = ("",0,0)  (* non-significant subseq *)
    
	let is_empty (s,pos,len) = len = 0

	let get (s,pos,len) i = s.[pos+i]

	let length (s,pos,len) = len

	let sub (s,pos,len) pos' len' = (s,pos+pos',len')

	let extend (s,pos,len) = (s,pos,len+1)
      end


module Ext =
  struct
    type t = int * Cis.t

    let cardinal (k,_) = k

    let empty = (0,Cis.empty)

    let mem i (k,cis) = Cis.mem i cis

    let singleton i = (1,Cis.singleton i)

    let add i (k,cis as ext) =
      if Cis.mem i cis
      then ext
      else (k+1,Cis.add i cis)

    let remove i (k,cis as ext) =
      if Cis.mem i cis
      then (k-1,Cis.remove i cis)
      else ext

    let union (_,cis1) (_,cis2) =
      let cis = Cis.union cis1 cis2 in
      (Cis.cardinal cis, cis)

    let diff (_,cis1) (_,cis2) =
      let cis = Cis.diff cis1 cis2 in
      (Cis.cardinal cis, cis)

    let elements (_,cis) = Cis.elements cis
  end


module Make : T =
  struct
    type strid = int

(* type of nodes in suffix trees *)
    type node = {
	mutable seqid : strid; (* sequence index in which the positions start and final are defined *)
	mutable start : int;     (* start and final position of the word labelling the node *)
	mutable final : int ref;
	mutable parent : node; (* prefix link, the root for the root itself *)
	v : node_value
      }
    and node_value =
      | I of node_internal (* for non-leaves, internal nodes *)
      | L of int (* for leaves: position of recognized suffix *)
    and node_internal = {
	children : (char,node) Hashtbl.t;
	mutable link : node; (* suffix link *)
	mutable backlinks : node LSet.t;
	mutable ext : Ext.t; (* set of strids under this node *)
	mutable locals : Ext.t; (* subset of ext, strids only in leaves of the node (only on maximal nodes) *)
	mutable maximal : bool; (* whether this node has a maximal path given its ext (a concept intent) *)
      }

    type factor = node * string * node

(* type of suffix trees *)
    type t = {
	mutable cpt : strid;
	ht : (strid,string) Hashtbl.t;
	mutable root : node;
      }

    let get0 st strid =
      try Hashtbl.find st.ht strid
      with Not_found -> failwith ("Invalid string id: " ^ string_of_int strid)

    let ext0 node =
      match node.v with
      | I x -> x.ext
      | L _ -> Ext.singleton node.seqid

    let locals0 node =
      match node.v with
      | I x -> x.locals
      | L _ -> Ext.empty



(* -------------------------------------------------------------------------------
   Operations on implicit nodes (explicit, implicit, child : node * subseq * node)
   the snd node [child] is significant only when [implicit] is not the empty string,
   and is the child that recognizes [implicit] starting from [explicit]. [implicit] is
   defined by a sequence, a start and a length.
   ------------------------------------------------------------------------------- *)

    let eq_char c1 c2 =
      c1<>'\000' & c1=c2  (* ensures that 2 terminal symbols '\000' are pairwise different (for GST only, not necessary for ST) *)

(* returns the child node that recognizes [implicit] from the node [explicit] *)
    let get_child (explicit,implicit) =
      if Subseq.is_empty implicit
      then explicit
      else
	let c = Subseq.get implicit 0 in
	if c = '\000'
	then raise Not_found
	else
	  match explicit.v with
	  | I x -> Hashtbl.find x.children c
	  | L _ -> raise Not_found

(* ensures that implicit does not span over another node below [explicit] *)
    let rec canonical (explicit,implicit,child) =
      if Subseq.is_empty implicit
      then (explicit,implicit,child)
      else
	let l = !(child.final) - child.start + 1 in
	let a = Subseq.length implicit in
	if a < l
	then (explicit,implicit,child)
	else
	  let implicit' = Subseq.sub implicit l (a-l) in
	  canonical (child, implicit', get_child (child,implicit'))

(* test whether an implicit node is the root node *)
    let is_root st (explicit,implicit,_) =
      explicit == st.root & Subseq.is_empty implicit

(* test whether the extension of an implicit node by [seqar.(k).[i]] is still recognized in the GST,
   and if yes, returns the implicit node extended by 1 position, otherwise returns [None]. *)
    let has_child st (explicit,implicit,child) (k,i) =
      let a = Subseq.length implicit in
      if a <> 0 then
	if eq_char (get0 st child.seqid).[child.start+a] (get0 st k).[i]
	then Some (explicit, Subseq.extend implicit, child)
	else None
      else
	try
	  let implicit' = (get0 st k,i,1) in
	  Some (explicit, implicit', get_child (explicit,implicit'))
	with Not_found -> None

(* test whether the extension of an implicit node by '\000' exists in the GST,
   and if so, returns the corresponding leaves, otherwise returns None. *)
    let has_end st (explicit,implicit,child) =
      let a = String.length implicit in
      if a <> 0 then
	if (get0 st child.seqid).[child.start+a] = '\000'
	then Some [child]
	else None
      else
	match explicit.v with
	| I x -> Some (Hashtbl.find_all x.children '\000')
	| L _ -> None

(* --------------------------------
   creation of new nodes and leaves
   -------------------------------- *)

    let add_leaf st node seqid start final_ref index =
      match node.v with
      | I x ->
	  let child = {seqid=seqid; start=start; final=final_ref; parent=node; v=(L index)} in
	  Hashtbl.add x.children (get0 st seqid).[start] child
      | L _ -> raise (Invalid_argument "Suffix_tree.add_leaf: 2nd argument must not be a leaf")

(* make explicit an implicit node by inserting a new node between [explicit] and [child] *)
    let insert_node st (explicit,implicit,child) =
      let a = Subseq.length implicit in
      if a = 0
      then explicit
      else
	match explicit.v with
	| I x ->
	    let c_child_old = (get0 st child.seqid).[child.start] in
	    let c_child_new = (get0 st child.seqid).[child.start+a] in
	    let h' = Hashtbl.create (Hashtbl.length x.children) in Hashtbl.add h' c_child_new child;
	    let n' = {
	      seqid = child.seqid;
	      start = child.start;
	      final = ref (child.start+a-1);
	      parent = explicit;
	      v = I
		{ children = h';
		  link = st.root;
		  backlinks = LSet.empty ();
		  ext = ext0 child;
		  locals = (match child.v with L _ -> Ext.singleton child.seqid | _ -> Ext.empty);
		  maximal = false
		};
	    } in
	    child.start <- child.start+a;
	    child.parent <- n';
	    Hashtbl.replace x.children c_child_old n';
	    n'
	| L _ -> raise (Invalid_argument "Suffix_tree.insert_node: first part of 2nd argument must not be a leaf")

(* add some strid in the extent of all ancestor of a node (except the root) *)
    let rec add_strid st new_maximal strid node =
      if node != st.root then
	match node.v with
	| I x ->
	    if not (Ext.mem strid x.ext) then begin
	      x.ext <- Ext.add strid x.ext;
	      add_strid st new_maximal strid node.parent end
	    else
	      LSet.remove node new_maximal
	| _ -> assert false
      else LSet.remove node new_maximal

(* add a suffix link from [pred_opt] (if defined) to [explicit] *)
    let add_link new_maximal pred_opt explicit =
      match pred_opt with
      | Some n ->
	  ( match n.v, explicit.v with
	  | I x0, I x ->
	      x0.link <- explicit;
	      x.backlinks <- LSet.add n x.backlinks;
	      LSet.remove explicit new_maximal
	  | _ -> assert false)
      | None -> new_maximal

(* ------------ 
   suffix links
   ------------ *)

(* extends suffix_link for implicit nodes *)
    let get_link st = function  (* TODO *)
      | (explicit,implicit,_) when Subseq.is_empty implicit ->
	  let explicit' = match explicit.v with I x -> x.link | _ -> assert false in  (*suffix_link root explicit*)
	  (explicit', Subseq.empty, explicit')
      | (explicit,implicit,_) ->
	  if explicit == st.root
	  then
	    let implicit' = Subseq.sub implicit 1 (Subseq.length implicit - 1) in
	    canonical (st.root, implicit', get_child (st.root,implicit'))
	  else
	    let explicit' = match explicit.v with I x -> x.link | _ -> assert false in  (*suffix_link root explicit*)
	    canonical (explicit', implicit, get_child (explicit',implicit))

(* --------------------------------------------------------------
   GST update for the new character c at position i in sequence k
   -------------------------------------------------------------- *)

(* state for 'update' *)
    type res = {
	terminal : int ref;
	mutable startj : int;
	mutable startnode : node * Subseq.t * node;
	mutable new_maximal : node LSet.t;
      }

    let rec update st (strid,i) res pred_opt =
      (* c = seqar.(strid).[i] *)
      match has_child st res.startnode (strid,i) with
      | Some extended_startnode -> (* startnode can be extended by [c] *)
	  let explicit, implicit, _ = res.startnode in
	  assert (pred_opt = None or Subseq.is_empty implicit);
          (* if a link has been followed after node creation, then we are on an explicit node *)
	  res.new_maximal <- add_link res.new_maximal pred_opt explicit;
	  res.startnode <- canonical extended_startnode
      | None -> (* startnode cannot be extended by [c] ... *)
	  let n' = insert_node st res.startnode in (* ... so we insert a new node ... *)
	  if (get0 st strid).[res.startj] <> '\000' then begin
	    add_leaf st n' strid i res.terminal res.startj;  (* ... and a new leaf for the suffix at position [res.startj] *)
	    res.new_maximal <- LSet.add n' res.new_maximal;
	    res.new_maximal <- add_strid st res.new_maximal strid n';  (* updating the extent of ancestor nodes *)
	    res.new_maximal <- add_link res.new_maximal pred_opt n';  (* ... a suffix link from the last created node (if defined) ... *)
	  end;
	  res.startj <- res.startj + 1; (* prepare for the next suffix *)
	  if not (is_root st res.startnode)
	  then begin (* while [res.startnode] is not the root, and cannot be extended by [c] ... *)
	    res.startnode <- get_link st res.startnode; (* ... follow the suffix link to find the next suffix ... *)
	    update st (strid,i) res (Some n') end  (* ... and loop on [update] *)

(* -------------------------------
   implementing the signature T
   ------------------------------- *)

    let create () =
      let rec root = {
	seqid = -1;
	start = 0;
	final = ref (-1);
	parent = root;
	v=I
	  { children = Hashtbl.create 2;
	    link = root;
	    backlinks = LSet.empty ();
	    ext = Ext.empty;
	    locals = Ext.empty;
	    maximal = false
	  }} in
      { cpt = 0;
	ht = Hashtbl.create 100;
	root = root
      }

    let size st = st.cpt

    let is_root st node = node == st.root

    let root st = st.root

    let is_leaf st node =
      match node.v with
      | L _ -> true
      | _ -> false

    let label st node =
      if node == st.root
      then ""
      else String.sub (get0 st node.seqid) node.start (!(node.final) - node.start + (match node.v with I _ -> 1 | L _ -> 0))

    let ext st node =
      match node.v with
      | I x -> LSet.of_list (Ext.elements x.ext)
      | L _ -> LSet.singleton node.seqid

    let is_maximal st node =
      match node.v with
      | I x -> x.maximal
      | L pos -> pos=0

    let children st node =
      match node.v with
      | I x ->
	  Hashtbl.fold (fun c n l -> LSet.add n l) x.children (LSet.empty ())
      | L _ -> []

    let parent st node =
      if is_root st node
      then None
      else Some node.parent

    let succ st node =
      if is_root st node
      then None
      else 
	match node.v with
	| I x -> Some x.link
	| L _ -> None

    let preds st node =
      match node.v with
      | I x -> x.backlinks
      | L _ -> LSet.empty ()

    let suffix st node =
      match node.v with
      | I _ -> raise Not_found
      | L i -> (node.seqid, i)


(* general fold *)
    let rec fold_tree : t -> ('h -> node -> bool) -> ('h -> node -> 'h) -> ('s list -> 'h -> node -> 's) -> 'h -> 's =
      fun gst f h s init ->
	fold_node gst f h s init (root gst)
    and fold_node gst f h s h_node node =
      s
	(List.map
	   (fun child -> fold_node gst f h s (h h_node child) child)
	   (List.filter (f h_node) (children gst node)))
	h_node
	node
	
(* synthesized attributes only *)
    let fold_s_node gst s node = fold_node gst (fun _ _ -> true) (fun _ _ -> ()) (fun l _ n -> s l n) () node
    let fold_s_tree gst s = fold_s_node gst s (root gst)
	
(* filtering and synthesizing, no inheritance *)
    let fold_fs_node gst f s node = fold_node gst (fun _ n -> f n) (fun _ _ -> ()) (fun l _ n -> s l n) () node
    let fold_fs_tree gst f s = fold_fs_node gst f s (root gst)

    let rec path st node =
      match parent st node with
      | None -> ""
	   | Some parent -> path st parent ^ label st node

    let path_restrictions st node =
      let lp = match parent st node with None -> LSet.empty () | Some p -> LSet.singleton p in (* the prefix restriction, if it exists *)
      let ls = match succ st node with None -> LSet.empty () | Some s -> LSet.singleton s in (* the suffix restriction, if it exists *)
      LSet.union lp ls

    let path_extensions st node =
      let lr = List.filter (fun n -> not (is_leaf st n)) (children st node) in (* right extensions *)
      let lf = preds st node in (* left extensions *)
      LSet.union lr lf

    let rec max_restrictions st node =
      max_restrictions_aux st (LSet.empty ()) (path_restrictions st node)
    and max_restrictions_aux st acc = function
      | [] -> acc
      | n::ns ->
	  if is_maximal st n
	  then max_restrictions_aux st (LSet.add n acc) ns
	  else max_restrictions_aux st acc (path_restrictions st n @ ns)

    let rec max_extensions st node_opt =
      let ns =
	match node_opt with
	| None -> [st.root]
	| Some n -> path_extensions st n in
      max_extensions_aux st (LSet.empty ()) ns
    and max_extensions_aux st acc = function
      | [] -> acc
      | n::ns ->
	  if is_maximal st n
	  then max_extensions_aux st (LSet.add n acc) ns
	  else max_extensions_aux st acc (path_extensions st n @ ns)

    let string_restrictions st strid =
      fold_fs_tree st  (* looking for the nodes having strid as a local *)
	(fun n -> Ext.mem strid (ext0 n))
	(fun l n ->
	  List.fold_left
	    LSet.union
	    (if is_maximal st n && Ext.mem strid (locals0 n) then LSet.singleton n else LSet.empty ())
	    l)

    let string_extensions st node_opt =
      Ext.elements (locals0 (match node_opt with None -> st.root | Some n -> n))



    let rec find_factor st str =
      let (explicit, (s,i,len), child) = find_factor_aux st st.root (str,0,String.length str) in
      (explicit, String.sub s i len, child)
    and find_factor_aux st node implicit =
      let w = Subseq.length implicit in
      if w = 0
      then (node,implicit,node)
      else
	let child = get_child (node,implicit) in
	let l = !(child.final) - child.start + 1 in
	let a = ref 1 in
	while !a < l & !a < w & eq_char (get0 st child.seqid).[child.start + !a] (Subseq.get implicit !a) do
	  incr a
	done; (* [!a] is the first mismatch position, or the length of [child] label *)
	if ! a = l
	then find_factor_aux st child (Subseq.sub implicit !a (w - !a))
	else 
	  if !a = w
	  then (node,implicit,child)
	  else raise Not_found
(*      
      if !a < w then
	if !a < l
	then raise Not_found
	else find_factor_aux st child (Subseq.sub implicit !a (w - !a))
      else (node,implicit,child) 
*)


    let suffixes st (_,_,child) =
      fold_s_node st
	(fun l n -> if l=[] then [suffix st n] else List.concat l)
	child

    let strings st (_,_,child) = ext st child
(*
      fold_s_node st
	(fun l n -> if l=[] then LSet.singleton (fst (suffix st n)) else LSet.union_r l)
	child
*)

    let get st strid =
      let str0 = get0 st strid in
      String.sub str0 0 (String.length str0 - 1)

    let find st str =
      let factor = find_factor st str in
      match has_end st factor with
      | Some leafs ->
	  Common.mapfind
	    (fun leaf ->
	      let (strid,pos) = suffix st leaf in
	      if pos = 0 then Some strid else None) (* there should be only one *)
	    leafs
      | None -> raise Not_found

    let find_node st str =
      let (explicit,implicit,_) = find_factor st str in
      if implicit = ""
      then explicit
      else raise Not_found

(* add a string and returns its strid. If the string already exists, the GST is not modified,
   and the existing id is returned *)
    let add st str =
      try
	find st str
      with Not_found ->
	let strid = st.cpt <- st.cpt+1; st.cpt in
	let str0 = str ^ String.make 1 '\000' in (* add a terminal symbol *)
	Hashtbl.add st.ht strid str0;
	( match st.root.v with I x -> x.ext <- Ext.add strid x.ext | _ -> assert false); (* update the extent of the root *)
	let res = {terminal=ref (-1); startj=0; startnode=(st.root,Subseq.empty,st.root); new_maximal=LSet.singleton st.root} in (* initialize for [update] ... *)
	for pos = 0 to String.length str0 - 1 do (* for every position [i] in the sequence ... *)
	  incr res.terminal; (* increment the leaves final position ... *)
	  update st (strid,pos) res None (* call [update] for updating the suffix tree with the character at position [pos] *)
	done;
	List.iter
	  (fun n -> 
	    match n.v with
	    | I x ->
		x.locals <- Ext.add strid x.locals;
		x.maximal <- true
	    | _ -> assert false)
	  res.new_maximal; (* recording new maximal nodes *)
	strid

    let rec remove st strid =
      ignore (remove_node true st strid st.root);
      Hashtbl.remove st.ht strid
    and remove_node on_root st strid node =
      match node.v with
      | L _ ->
	  if node.seqid = strid
	  then None
	  else Some (node, Ext.empty)
      | I x ->
	  if not (Ext.mem strid x.ext)
	  then Some (node, Ext.empty)
	  else
	    let new_children =
	      Hashtbl.fold
		(fun c n res -> match remove_node false st strid n with None -> res | Some (n',moved_locals) -> (c,n',moved_locals)::res)
		x.children [] in
	    match new_children with
	    | [] ->
		if on_root
		then begin
		  Hashtbl.clear x.children;
		  x.ext <- Ext.remove strid x.ext;
		  x.locals <- Ext.remove strid x.locals;
		  x.maximal <- false;
		  Some (node, Ext.empty) end
		else begin
		  remove_link st node;
		  None end
	    | [(c,child,_)] when not on_root ->
		child.start <- child.start - (!(node.final) - node.start + 1);
		child.parent <- node.parent;
		remove_link st node;
		Some (child, x.locals)
	    | (_,n0,_)::_ as l ->
		Hashtbl.clear x.children;
		List.iter
		  (fun (c,n,moved_locals) ->
		    Hashtbl.add x.children c n;
		    n.parent <- node;
		    x.locals <- Ext.union x.locals moved_locals
		  ) l;
		x.ext <- Ext.remove strid x.ext;
		x.locals <- Ext.remove strid x.locals;
		let card = Ext.cardinal x.ext in
		x.maximal <- x.maximal && (remove_maxcard l x.backlinks < card);
		if node.seqid = strid
		then begin (* strid string does not exists any more *)
		  let a = !(node.final) - node.start + 1 in
		  node.seqid <- n0.seqid;
		  node.start <- n0.start - a;
		  node.final <- ref (n0.start - 1) end;
		Some (node, Ext.empty)
    and remove_link st node =
      match succ st node with
      | None -> ()
      | Some n ->
	  match n.v with
	  | I x -> x.backlinks <- LSet.remove node x.backlinks
	  | _ -> assert false
    and remove_maxcard children backlinks =
      let maxcard_children =
	List.fold_left (fun res (_,n,_) -> max res (Ext.cardinal (ext0 n))) 0 children in
      let maxcard_backlinks =
	List.fold_left (fun res n -> max res (Ext.cardinal (ext0 n))) 0 backlinks in
      max maxcard_children maxcard_backlinks


    let fold f st init =
      Hashtbl.fold f st.ht init


(* readable version of a GST *)

    type tree = Node of string * bool * int list * tree list | Leaf of string * (strid * int)
	
    let tree st =
      fold_s_tree st
	(fun l n ->
	  if n == st.root
	  then Node ("", is_maximal st n, ext st n, l)
	  else
	    let w = label st n in
	    if l=[]
	    then Leaf (w, suffix st n)
	    else Node (w, is_maximal st n, ext st n, l))

  end