Source

ocaml-lib / suffix_tree.ml

Full commit
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
(**
   Suffix trees.

   Suffix trees with incremental addition and removal of strings
   plus incremental maintenance of maximal factors.


   Author: S-Aébastien Ferré <ferre@irisa.fr>.-b

   License: LGPL
*)

(* for test *)
(*
#load "cis.cmo";;
#load "lSet.cmo";;
*)

(* copied from module Common *)

let rec fold_while : ('a -> 'a option) -> 'a -> 'a =
  fun f e ->
    match f e with
    | None -> e
    | Some e' -> fold_while f e'

let rec mapfind : ('a -> 'b option) -> 'a list -> 'b =
  fun f -> function
  | [] -> raise Not_found
  | x::l -> match f x with
      | None -> mapfind f l
      | Some y -> y

(* end of copy *)

module type PARAM =
  sig
    val get_visible : string -> int * int
	(** [get_visible s] returns the sizes of the prefix and suffix of [s]
	   that can be removed from [s] without damage to its meaning. *)
  end

module type T =
  sig
    type strid = int
	  (** Type of string ids. Functions using such ids are unspecified if the id is not valid. *)
    type t
	  (** Type of suffix trees. This is not a pure functional data-structure. *)

(** {1 Suffix trees as string sets. } *)

    val create : unit -> t
	(** [create ()] returns a fresh and empty suffix tree. *)
    val size : t -> int
	(** [size st] returns the number of strings registered in the suffix tree [st]. *)
    val add : t -> string -> strid
	(** [add st s] adds the string [s], and all its suffixes in the suffix tree [st], unless [s] has already been added.
	   It also returns the string id as an handle on this string. *)
    val remove : t -> strid -> unit
	(** [remove st id] removes the string identified by [id], and all its suffixes, from the suffix tree [st]. *)
    val get : t -> strid -> string
	(** [get st id] returns the string associated to [id]. *)
    val find : t -> string -> strid
	(** [find st s] returns the id associated to the string [s], if the strings exists in the suffix tree [st].
	   Otherwise raise Not_found. *)
    val fold : (strid -> string -> 'a -> 'a) -> t -> 'a -> 'a
	(** [fold f st e] is a classic folding on all strings in the suffix tree [st]. *)

(** {1 Low-level interface on suffix trees. } *)

    type node
	  (** Type of the nodes of suffix trees.
	     Nodes are either leaves or internal nodes. *)

    val root : t -> node
	(** [root st] returns the root node of the suffix tree [st]. *)
    val is_leaf : t -> node -> bool
	(** [is_leaf st n] returns whether the node [n] is a leaf. *)
    val label : t -> node -> string
	(** [label st n] returns the string labelling the node [n]. *)
    val length : t -> node -> int
	(** [length st n] returns the length of the string labelling the node [n]. *)
    val path : t -> node -> string
	(** [path st n] returns the full path from the root to the node [n]. *)
    val height : t -> node -> int
	(** [height st n] returns the height of node [n], i.e. the length of the path from root to [n]. *)
    val ext : t -> node -> strid LSet.t
	(** [ext st n] returns an ordered list of string ids that match the path of the node [n]. *)
    val children : t -> node -> node LSet.t
	(** [children st n] returns the list of children nodes of [n]. *)
    val parent : t -> node -> node option
	(** [parent st n] returns the parent node of [n], unless [n] is the root node. *)
    val succ : t -> node -> node option
	(** [succ st n] returns the successor node through the suffix link of [n], unless there is no suffix link. *)
    val preds : t -> node -> node LSet.t
	(** [preds st n] returns the list of all nodes having [n] as successor node. *)
    val suffix : t -> node -> strid * int
	(** [suffix st n] returns the suffix represented by the leaf node [n] as a couple [(string id, position in the string)].
	   Raise Not_found if [n] is not a leaf. *)
    val find_node : t -> string -> node
	(** [find_node st s] returns the node whose path is equal to the string [s], if it exists.
	   Raise Not_found otherwise. *)
    val fold_tree : t -> ('h -> node -> bool) -> ('h -> node -> 'h) -> ('s list -> 'h -> node -> 's) -> 'h -> 's
	(** [fold_tree st filter herit synth h0] returns the result of an attribute evaluation on the suffix tree [st].
	   - [filter] is used to filter which children of a node should be explored given the heritance value of the parent node,
	   - [herit] defines the heritance value of a node, given the heritance value of its parent,
	   - [synth] defines the synthesized value of a node given its heritance value, and the list of synthesized values of its filtered children,
	   - [h0] is the heritance value given to the root.
	 *)

(** {1 Exploring the suffix tree through the substring relation. } *)
		
    val path_restrictions : t -> node -> node list
	(** [path_restrictions st n] returns the list of nodes whose path is a direct restriction of the path of [n]. *)
    val path_extensions : t -> node -> node list
	(** [path_extensions st n] returns the list of nodes whose path is a direct extension of the path of [n]. *)
    val is_maximal : t -> node -> bool
	(** [is_maximal st n] returns whether a node is maximal.
	   A node is maximal is each of its extensions has a strictly smaller extent, or the node represents a full string. *)
    val set_visible : t -> node -> int * int -> unit
	(** [set_visible st node (left_pos, right_pos)] sets which part of a node path should be visible when maximal. *)
    val max_restrictions : t -> node -> node list
	(** [max_restrictions st n] returns the list of maximal nodes whose path is a restriction of the path of [n]. *)
    val max_extensions : t -> node option -> node list * strid list
	(** [max_extensions st n_opt] returns the list of maximal nodes and leaves whose path is an extension of the path of [n], when given.
	   If a start node is not given, then the maximal nodes with shortest path are returned. *)
(*
    val string_extensions : t -> node option -> strid list
	(** [string_extensions st n_opt] completes the result of [max_extensions st n_opt] with full strings through their ids. *)
*)
    val string_restrictions : t -> strid -> node list
	(** [string_restrictions st strid] returns the list of maximal nodes having [strid] as a string extension. *)

(** {1 Searching in a suffix tree} *)

    type factor = node * string * node
	  (** [(parent,s,child)] locates a factor string on the edge from node [parent] to node [child], where [s] is a prefix of the label of [child].
	     If [s] is the empty string, then [parent] and [child] are a same node.
	     The path of a factor is the concatenation of [path st parent] and [s]. *)

    val find_factor : t -> string -> factor
	(** [find_factor st s] returns the factor locating [s] in the suffix tree [st].
	   This means the path of the result factor is equal to [s].
	   Raise [Not_found] if the string [s] does not appear in any string of [st]. *)
    val suffixes : t -> factor -> (strid * int) list
	(** [suffixes st f] returns the list of all suffixes [(strid,pos)] that have the path of [f] as a prefix: this path occurs in string [strid] at position [pos]. *)
    val strings : t -> factor -> strid LSet.t
	(** [strings st f] returns the ids of all string containing the path of [f]. *)

(** {1 Simpler representation of a suffix tree (for debugging purpose at top-level)} *)

    type tree = Node of string * int * int list * tree list | Leaf of string * (strid * int)

    val tree : t -> tree

  end


(* --------------------------------------------------------------------------------
   Operations on substrings of sequences
   -------------------------------------------------------------------------------- *)

    module Subseq =
      struct
	type t = string * int * int  (* (seq, pos, len) *)
	  
	let empty = ("",0,0)  (* non-significant subseq *)
    
	let is_empty (s,pos,len) = len = 0

	let get (s,pos,len) i = s.[pos+i]

	let length (s,pos,len) = len

	let sub (s,pos,len) pos' len' = (s,pos+pos',len')

	let extend (s,pos,len) = (s,pos,len+1)
      end


module Ext =
  struct
    type t = int * Cis.t

    let cardinal (k,_) = k

    let empty = (0,Cis.empty)

    let mem i (k,cis) = Cis.mem i cis

    let singleton i = (1,Cis.singleton i)

    let add i (k,cis as ext) =
      if Cis.mem i cis
      then ext
      else (k+1,Cis.add i cis)

    let remove i (k,cis as ext) =
      if Cis.mem i cis
      then (k-1,Cis.remove i cis)
      else ext

    let union (_,cis1) (_,cis2) =
      let cis = Cis.union cis1 cis2 in
      (Cis.cardinal cis, cis)

    let diff (_,cis1) (_,cis2) =
      let cis = Cis.diff cis1 cis2 in
      (Cis.cardinal cis, cis)

    let elements (_,cis) = Cis.elements cis
  end


module Make (Param : PARAM) : T =
  struct
    type strid = int

(* type of nodes in suffix trees *)
    type node = {
	mutable seqid : strid; (* sequence index in which the positions start and final are defined *)
	mutable start : int;     (* start and final position of the word labelling the node *)
	mutable final : int ref;
	mutable parent : node; (* prefix link, the root for the root itself *)
	v : node_value
      }
    and node_value =
      | I of node_internal (* for non-leaves, internal nodes *)
      | L of int (* for leaves: position of recognized suffix *)
    and node_internal = {
	children : (char,node) Hashtbl.t;
	mutable link : node; (* suffix link *)
	mutable backlinks : node LSet.t;
	mutable ext : Ext.t; (* set of strids under this node *)
	mutable locals : Ext.t; (* subset of ext, strids only in leaves of the node (only on maximal nodes) *)
	mutable maximal_right : bool; (* whether this node is maximal on its right given its ext (a concept intent) *)
	mutable maximal : int; (* by which strid (>0) this node became a maximal node (a concept intent) *)
	mutable visible : int * int; (* which part of this node is visible: left and right offset. *)
      }

    type factor = node * string * node

(* type of suffix trees *)
    type t = {
	mutable cpt : strid;
	ht : (strid,string) Hashtbl.t;
	mutable root : node;
      }

    let get0 st strid =
      try Hashtbl.find st.ht strid
      with Not_found -> failwith ("Invalid string id: " ^ string_of_int strid)

    let ext0 node =
      match node.v with
      | I x -> x.ext
      | L _ -> Ext.singleton node.seqid

    let maximal0 node =
      match node.v with
      | I x -> x.maximal
      | L _ -> assert false

    let locals0 node =
      match node.v with
      | I x -> x.locals
      | L _ -> Ext.empty

    let size st = st.cpt

    let is_root st node = node == st.root

    let root st = st.root

    let is_leaf st node =
      match node.v with
      | L _ -> true
      | _ -> false

    let length st node =
      !(node.final) - node.start + (match node.v with I _ -> 1 | L _ -> 0)

    let label st node =
      if node == st.root
      then ""
      else String.sub (get0 st node.seqid) node.start (length st node)

    let ext st node =
      match node.v with
      | I x -> LSet.of_list (Ext.elements x.ext)
      | L _ -> LSet.singleton node.seqid

    let card st node =
      match node.v with
      | I x -> Ext.cardinal x.ext
      | L _ -> 1

    let is_maximal st node =
      match node.v with
      | I x -> x.maximal > 0
      | L pos -> pos=0

    let children st node =
      match node.v with
      | I x ->
	  Hashtbl.fold (fun c n l -> LSet.add n l) x.children (LSet.empty ())
      | L _ -> []

    let parent st node =
      if is_root st node
      then None
      else Some node.parent

    let succ st node =
      if is_root st node
      then None
      else 
	match node.v with
	| I x -> Some x.link
	| L _ -> None

    let preds st node =
      match node.v with
      | I x -> x.backlinks
      | L _ -> LSet.empty ()

    let suffix st node =
      match node.v with
      | I _ -> raise Not_found
      | L i -> (node.seqid, i)

    let rec path st node =
      match parent st node with
      | None -> ""
      | Some parent -> path st parent ^ label st node

    let rec height st node =
      match parent st node with
      | None -> 0
      | Some parent -> height st parent + length st node

(*
    let set_maximal st node b =
      match node.v with
      | I x -> x.maximal <- b
      | L _ -> ()

    let set_maximal_right st node b =
      match node.v with
      | I x -> x.maximal_right <- b
      | L _ -> ()
*)

    let set_visible st node lr =
      match node.v with
      | I x -> x.visible <- lr
      | L _ -> ()

    let get_visible st node =
      match node.v with
      | I x -> x.visible
      | _ -> raise (Invalid_argument "Suffix_tree.get_visible: applied to a leaf")

    let rec reduce st node (sl, sr) =
      match node.v with
      | I x ->
	  let nl = reduce_left st sl node in
	  let nr, residue = reduce_right st sr nl in
	  nr, residue
      | _ -> raise (Invalid_argument "Suffix_tree.reduce_to_visible: applied to a leaf")
    and reduce_left st sl node =
      if sl <= 0
      then node
      else match node.v with I x -> assert (height st node = (height st x.link) + 1); reduce_left st (sl - 1) x.link | _ -> assert false
    and reduce_right st sr node =
      if node == st.root
      then node, sr
      else begin
	let node_len = !(node.final) - node.start + 1 in
	if sr < node_len
	then node, sr
	else
	  reduce_right st (sr - node_len) node.parent end


(* -------------------------------------------------------------------------------
   Operations on implicit nodes (explicit, implicit, child : node * subseq * node)
   the snd node [child] is significant only when [implicit] is not the empty string,
   and is the child that recognizes [implicit] starting from [explicit]. [implicit] is
   defined by a sequence, a start and a length.
   ------------------------------------------------------------------------------- *)

    let eq_char c1 c2 =
      c1<>'\000' & c1=c2  (* ensures that 2 terminal symbols '\000' are pairwise different (for GST only, not necessary for ST) *)

(* returns the child node that recognizes [implicit] from the node [explicit] *)
    let get_child (explicit,implicit) =
      if Subseq.is_empty implicit
      then explicit
      else
	let c = Subseq.get implicit 0 in
	if c = '\000'
	then raise Not_found
	else
	  match explicit.v with
	  | I x -> Hashtbl.find x.children c
	  | L _ -> raise Not_found

(* ensures that implicit does not span over another node below [explicit] *)
    let rec canonical (explicit,implicit,child) =
      if Subseq.is_empty implicit
      then (explicit,implicit,child)
      else
	let l = !(child.final) - child.start + 1 in
	let a = Subseq.length implicit in
	if a < l
	then (explicit,implicit,child)
	else
	  let implicit' = Subseq.sub implicit l (a-l) in
	  canonical (child, implicit', get_child (child,implicit'))

(* test whether an implicit node is the root node *)
    let is_root st (explicit,implicit,_) =
      explicit == st.root & Subseq.is_empty implicit

(* test whether the extension of an implicit node by [seqar.(k).[i]] is still recognized in the GST,
   and if yes, returns the implicit node extended by 1 position, otherwise returns [None]. *)
    let has_child st (explicit,implicit,child) (k,i) =
      let a = Subseq.length implicit in
      if a <> 0 then
	if eq_char (get0 st child.seqid).[child.start+a] (get0 st k).[i]
	then Some (explicit, Subseq.extend implicit, child)
	else None
      else
	try
	  let implicit' = (get0 st k,i,1) in
	  Some (explicit, implicit', get_child (explicit,implicit'))
	with Not_found -> None

(* test whether the extension of an implicit node by '\000' exists in the GST,
   and if so, returns the corresponding leaves, otherwise returns None. *)
    let has_end st (explicit,implicit,child) =
      let a = String.length implicit in
      if a <> 0 then
	if (get0 st child.seqid).[child.start+a] = '\000'
	then Some [child]
	else None
      else
	match explicit.v with
	| I x -> Some (Hashtbl.find_all x.children '\000')
	| L _ -> None

(* --------------------------------
   creation of new nodes and leaves
   -------------------------------- *)

    let add_leaf st node seqid start final_ref index =
      match node.v with
      | I x ->
	  let child = {seqid=seqid; start=start; final=final_ref; parent=node; v=(L index)} in
	  Hashtbl.add x.children (get0 st seqid).[start] child
      | L _ -> raise (Invalid_argument "Suffix_tree.add_leaf: 2nd argument must not be a leaf")

(* make explicit an implicit node by inserting a new node between [explicit] and [child] *)
    let insert_node st strid (explicit,implicit,child) =
	match explicit.v with
	| I x ->
	    let a = Subseq.length implicit in
	    if a = 0
	    then begin
	      if x.maximal = 0 then begin (* otherwise explicit has been made maximal by another strid *)
		x.maximal <- strid;
		x.maximal_right <- true;
		x.locals <- Ext.add strid x.locals end;
	      explicit end
	    else begin
	      let c_child_old = (get0 st child.seqid).[child.start] in
	      let c_child_new = (get0 st child.seqid).[child.start+a] in
	      let h' = Hashtbl.create (Hashtbl.length x.children) in Hashtbl.add h' c_child_new child;
	      let n' = {
		seqid = child.seqid;
		start = child.start;
		final = ref (child.start+a-1);
		parent = explicit;
		v = I
		  { children = h';
		    link = st.root;
		    backlinks = LSet.empty ();
		    ext = ext0 child;
		    locals = Ext.singleton strid;
		    maximal = strid;
		    maximal_right = true;
		    visible = (0,0);
		  };
	      } in
	      child.start <- child.start+a;
	      child.parent <- n';
	      Hashtbl.replace x.children c_child_old n';
	      set_visible st n' (Param.get_visible (path st n'));
	      n' end
	| L _ -> raise (Invalid_argument "Suffix_tree.insert_node: first part of 2nd argument must not be a leaf")

(* add some strid in the extent of all ancestor of a node (except the root) *)
    let rec add_strid st strid node =
      match node.v with
      | I x ->
	  if node != st.root && not (Ext.mem strid x.ext) then begin
	    x.ext <- Ext.add strid x.ext;
	    add_strid st strid node.parent end
	  else
	    if x.maximal = strid then begin (* this node has been made maximal when adding strid *)
	      x.maximal <- 0;
	      x.maximal_right <- false;
	      x.locals <- Ext.remove strid x.locals end
(*	      LSet.remove node new_maximal *)
      | _ -> assert false

(* add a suffix link from [pred_opt] (if defined) to [explicit] *)
    let add_link strid pred_opt explicit =
      match pred_opt with
      | Some n ->
	  ( match n.v, explicit.v with
	  | I x0, I x ->
	      x0.link <- explicit;
	      x.backlinks <- LSet.add n x.backlinks;
	      if x.maximal = strid then begin
		x.maximal <- 0; (* maximal_right is left unchanged *)
		x.locals <- Ext.remove strid x.locals end
(*	      LSet.remove explicit new_maximal *)
	  | _ -> assert false)
      | None -> ()

(* ------------ 
   suffix links
   ------------ *)

(* extends suffix_link for implicit nodes *)
    let get_link st = function  (* TODO *)
      | (explicit,implicit,_) when Subseq.is_empty implicit ->
	  let explicit' = match explicit.v with I x -> x.link | _ -> assert false in  (*suffix_link root explicit*)
	  (explicit', Subseq.empty, explicit')
      | (explicit,implicit,_) ->
	  if explicit == st.root
	  then
	    let implicit' = Subseq.sub implicit 1 (Subseq.length implicit - 1) in
	    canonical (st.root, implicit', get_child (st.root,implicit'))
	  else
	    let explicit' = match explicit.v with I x -> x.link | _ -> assert false in  (*suffix_link root explicit*)
	    canonical (explicit', implicit, get_child (explicit',implicit))

(* --------------------------------------------------------------
   GST update for the new character c at position i in sequence k
   -------------------------------------------------------------- *)

(* state for 'update' *)
    type res = {
	terminal : int ref;
	mutable startj : int;
	mutable startnode : node * Subseq.t * node;
(*	mutable new_maximal : node LSet.t; *)
      }

    let rec update st (strid,i) res pred_opt =
      (* c = seqar.(strid).[i] *)
      match has_child st res.startnode (strid,i) with
      | Some extended_startnode -> (* startnode can be extended by [c] *)
	  let explicit, implicit, _ = res.startnode in
	  assert (pred_opt = None or Subseq.is_empty implicit);
          (* if a link has been followed after node creation, then we are on an explicit node *)
	  add_link strid pred_opt explicit;
	  res.startnode <- canonical extended_startnode
      | None -> (* startnode cannot be extended by [c] ... *)
	  let n' = insert_node st strid res.startnode in (* ... so we insert a new node ... *)
	  if (get0 st strid).[res.startj] <> '\000' then begin
	    add_leaf st n' strid i res.terminal res.startj;  (* ... and a new leaf for the suffix at position [res.startj] *)
	    add_strid st strid n';  (* updating the extent of ancestor nodes *)
	    add_link strid pred_opt n';  (* ... a suffix link from the last created node (if defined) ... *)
	  end;
	  res.startj <- res.startj + 1; (* prepare for the next suffix *)
	  if not (is_root st res.startnode)
	  then begin (* while [res.startnode] is not the root, and cannot be extended by [c] ... *)
	    res.startnode <- get_link st res.startnode; (* ... follow the suffix link to find the next suffix ... *)
	    update st (strid,i) res (Some n') end  (* ... and loop on [update] *)

(* -------------------------------
   implementing the signature T
   ------------------------------- *)

    let create () =
      let rec root = {
	seqid = -1;
	start = 0;
	final = ref (-1);
	parent = root;
	v=I
	  { children = Hashtbl.create 2;
	    link = root;
	    backlinks = LSet.empty ();
	    ext = Ext.empty;
	    locals = Ext.empty;
	    maximal = 0;
	    maximal_right = false;
	    visible = (0,0);
	  }} in
      let st =
	{ cpt = 0;
	  ht = Hashtbl.create 100;
	  root = root
	} in
      set_visible st root (0,0);
      st


(* general fold *)
    let rec fold_tree : t -> ('h -> node -> bool) -> ('h -> node -> 'h) -> ('s list -> 'h -> node -> 's) -> 'h -> 's =
      fun gst f h s init ->
	fold_node gst f h s init (root gst)
    and fold_node gst f h s h_node node =
      s
	(List.map
	   (fun child -> fold_node gst f h s (h h_node child) child)
	   (List.filter (f h_node) (children gst node)))
	h_node
	node
	
(* synthesized attributes only *)
    let fold_s_node gst s node = fold_node gst (fun _ _ -> true) (fun _ _ -> ()) (fun l _ n -> s l n) () node
    let fold_s_tree gst s = fold_s_node gst s (root gst)
	
(* filtering and synthesizing, no inheritance *)
    let fold_fs_node gst f s node = fold_node gst (fun _ n -> f n) (fun _ _ -> ()) (fun l _ n -> s l n) () node
    let fold_fs_tree gst f s = fold_fs_node gst f s (root gst)

    let path_restrictions st node =
      let lp = match parent st node with None -> LSet.empty () | Some p -> LSet.singleton p in (* the prefix restriction, if it exists *)
      let ls = match succ st node with None -> LSet.empty () | Some s -> LSet.singleton s in (* the suffix restriction, if it exists *)
      LSet.union lp ls

    let path_extensions st node =
      let lr = List.filter (fun n -> not (is_leaf st n)) (children st node) in (* right extensions *)
      let lf = preds st node in (* left extensions *)
      LSet.union lr lf

    let rec max_restrictions st node =
      let res1 = max_restrictions_aux st (LSet.empty ()) (path_restrictions st node) in
      let _, res2 =
	fold_while
	  (fun (res1, res2) ->
	    match res1 with
	    | [] -> None
	    | (_,n)::hns -> Some (LSet.diff hns (max_restrictions_aux st (LSet.empty ()) (path_restrictions st n)), n::res2))
	  (res1, []) in
      res2
    and max_restrictions_aux st acc = function
      | [] -> acc
      | n::ns ->
	  if is_maximal st n
	  then max_restrictions_aux st (LSet.add (height st n, n) acc) ns
	  else max_restrictions_aux st acc (path_restrictions st n @ ns)

    module SetShiftNode = Set.Make (struct type t = (* int * int * *) node let compare = Pervasives.compare end)

    let rec max_extensions st node_opt =
      let on_start, start_is_root, start =
	match node_opt with
	| None -> false, true, st.root
	| Some n -> true, n == st.root, n in
      let incrs0 = max_extensions_right st start_is_root on_start (get_visible st start) start SetShiftNode.empty (* LSet.empty () *) in
      let incrs1 =
	SetShiftNode.fold
	  (fun ((* sl, sr, *) n) incrs1 ->
	    max_extensions_remove_right st false true (* sl, sr *) n incrs1)
	  incrs0 incrs0 in
      let incrs = SetShiftNode.fold (fun ((* _, _, *) n) res -> n::res) incrs1 [] in
      let ext = ext0 start in
      let locals =
	Ext.elements (List.fold_left (fun res n -> Ext.diff res (ext0 n)) ext incrs) in
      incrs, locals
    and max_extensions_right st start_is_root on_start (shift_left, shift_right) node acc = (* node is right extension of start (possibly start itself (on_start = true)) *)
      match node.v with
      | I x ->
	  let sl, sr = x.visible in
	  if not on_start && x.maximal > 0 && ((* sl <= shift_left && sr <= shift_right && *) sl + sr < shift_left + shift_right) then
	    SetShiftNode.add ((* shift_left, shift_right, *) node) acc (* LSet.add node acc *)
	  else
	    let acc1 =
	      if not start_is_root && x.maximal_right
	      then max_extensions_left st on_start (shift_left, shift_right) node acc
	      else acc in
	    Hashtbl.fold
	      (fun _ n acc -> max_extensions_right st start_is_root false (shift_left, shift_right + !(n.final) - n.start + 1) n acc)
	      x.children
	      acc1
      | _ -> acc
    and max_extensions_left st on_start (shift_left, shift_right) node acc = (* node is right-maximal right extension of start (possibly start itself) *)
      match node.v with
      | I x ->
	  let sl, sr = x.visible in
	  if not on_start && x.maximal > 0 && ((*sl <= shift_left && sr <= shift_right && *) sl + sr < shift_left + shift_right) then
	    SetShiftNode.add ((* shift_left, shift_right, *) node) acc (* LSet.add node acc *)
	  else
	    List.fold_left
	      (fun acc n -> max_extensions_left st false (shift_left + 1, shift_right) n acc)
	      acc
	      x.backlinks
      | _ -> assert false
    and max_extensions_remove_right st start_is_root on_start (*shift_left, shift_right*) node acc = (* node is right extension of start (possibly start itself (on_start = true)) *)
      match node.v with
      | I x ->
	  let acc0 =
	    if not on_start && x.maximal > 0
	    then SetShiftNode.remove ((* shift_left, shift_right, *) node) acc
	    else acc in
	  let acc1 =
	    if not start_is_root && x.maximal_right
	    then max_extensions_remove_left st on_start (*shift_left, shift_right*) node acc0
	    else acc0 in
	  Hashtbl.fold
	    (fun _ n acc -> max_extensions_remove_right st start_is_root false (*shift_left, shift_right + !(n.final) - n.start + 1*) n acc)
	    x.children
	    acc1
      | _ -> acc
    and max_extensions_remove_left st on_start (*shift_left, shift_right*) node acc = (* node is right-maximal right extension of start (possibly start itself) *)
      match node.v with
      | I x ->
	  let acc0 =
	    if not on_start && x.maximal > 0
	    then SetShiftNode.remove ((* shift_left, shift_right, *) node) acc
	    else acc in
	  List.fold_left
	    (fun acc n -> max_extensions_remove_left st false (*shift_left + 1, shift_right*) n acc)
	    acc0
	    x.backlinks
      | _ -> assert false


    let string_restrictions st strid =
      fold_fs_tree st  (* looking for the nodes having strid as a local *)
	(fun n -> Ext.mem strid (ext0 n))
	(fun l n ->
	  List.fold_left
	    LSet.union
	    (if is_maximal st n && Ext.mem strid (locals0 n) then LSet.singleton n else LSet.empty ())
	    l)

(*
    let string_extensions st node_opt =
      Ext.elements (locals0 (match node_opt with None -> st.root | Some n -> n))
*)


    let rec find_factor st str =
      let (explicit, (s,i,len), child) = find_factor_aux st st.root (str,0,String.length str) in
      (explicit, String.sub s i len, child)
    and find_factor_aux st node implicit =
      let w = Subseq.length implicit in
      if w = 0
      then (node,implicit,node)
      else
	let child = get_child (node,implicit) in
	let l = !(child.final) - child.start + 1 in
	let a = ref 1 in
	while !a < l & !a < w & eq_char (get0 st child.seqid).[child.start + !a] (Subseq.get implicit !a) do
	  incr a
	done; (* [!a] is the first mismatch position, or the length of [child] label *)
	if ! a = l
	then find_factor_aux st child (Subseq.sub implicit !a (w - !a))
	else 
	  if !a = w
	  then (node,implicit,child)
	  else raise Not_found
(*      
      if !a < w then
	if !a < l
	then raise Not_found
	else find_factor_aux st child (Subseq.sub implicit !a (w - !a))
      else (node,implicit,child) 
*)


    let suffixes st (_,_,child) =
      fold_s_node st
	(fun l n -> if l=[] then [suffix st n] else List.concat l)
	child

    let strings st (_,_,child) = ext st child
(*
      fold_s_node st
	(fun l n -> if l=[] then LSet.singleton (fst (suffix st n)) else LSet.union_r l)
	child
*)

    let get st strid =
      let str0 = get0 st strid in
      String.sub str0 0 (String.length str0 - 1)

    let find st str =
      let factor = find_factor st str in
      match has_end st factor with
      | Some leafs ->
	  mapfind
	    (fun leaf ->
	      let (strid,pos) = suffix st leaf in
	      if pos = 0 then Some strid else None) (* there should be only one *)
	    leafs
      | None -> raise Not_found

    let find_node st str =
      let (explicit,implicit,_) = find_factor st str in
      if implicit = ""
      then explicit
      else raise Not_found

(* add a string and returns its strid. If the string already exists, the GST is not modified,
   and the existing id is returned *)
    let add st str =
      try
	find st str
      with Not_found ->
	let strid = st.cpt <- st.cpt+1; st.cpt in
	let str0 = str ^ String.make 1 '\000' in (* add a terminal symbol *)
	Hashtbl.add st.ht strid str0;
	( match st.root.v with
	| I x ->
	    x.ext <- Ext.add strid x.ext;
	    x.maximal <- strid;
	    x.maximal_right <- true;
	    x.locals <- Ext.add strid x.locals
	| _ -> assert false ); (* update the extent of the root *)
	let res = {terminal=ref (-1); startj=0; startnode=(st.root,Subseq.empty,st.root); (* new_maximal=LSet.singleton st.root *)} in (* initialize for [update] ... *)
	for pos = 0 to String.length str0 - 1 do (* for every position [i] in the sequence ... *)
	  incr res.terminal; (* increment the leaves final position ... *)
	  update st (strid,pos) res None (* call [update] for updating the suffix tree with the character at position [pos] *)
	done;
(*
	List.iter
	  (fun n -> 
	    match n.v with
	    | I x ->
		x.locals <- Ext.add strid x.locals;
		x.maximal <- true
	    | _ -> assert false)
	  res.new_maximal; (* recording new maximal nodes *)
*)
	strid

    let rec remove st strid =
      ignore (remove_node true st strid st.root);
      Hashtbl.remove st.ht strid
    and remove_node on_root st strid node =
      match node.v with
      | L _ ->
	  if node.seqid = strid
	  then None
	  else Some (node, Ext.empty)
      | I x ->
	  if not (Ext.mem strid x.ext)
	  then Some (node, Ext.empty)
	  else
	    let new_children =
	      Hashtbl.fold
		(fun c n res -> match remove_node false st strid n with None -> res | Some (n',moved_locals) -> (c,n',moved_locals)::res)
		x.children [] in
	    match new_children with
	    | [] ->
		if on_root
		then begin
		  Hashtbl.clear x.children;
		  x.ext <- Ext.remove strid x.ext;
		  x.locals <- Ext.remove strid x.locals;
		  x.maximal <- 0;
		  x.maximal_right <- false; (* ? *)
		  Some (node, Ext.empty) end
		else begin
		  remove_link st node;
		  None end
	    | [(c,child,_)] when not on_root ->
		child.start <- child.start - (!(node.final) - node.start + 1);
		child.parent <- node.parent;
		remove_link st node;
		Some (child, x.locals)
	    | (_,n0,_)::_ as l ->
		Hashtbl.clear x.children;
		List.iter
		  (fun (c,n,moved_locals) ->
		    Hashtbl.add x.children c n;
		    n.parent <- node;
		    x.locals <- Ext.union x.locals moved_locals
		  ) l;
		x.ext <- Ext.remove strid x.ext;
		x.locals <- Ext.remove strid x.locals;
		let card = Ext.cardinal x.ext in
		let maxcard_children, maxcard_backlinks = remove_maxcard l x.backlinks in
		if max maxcard_children maxcard_backlinks = card then x.maximal <- 0;
		if maxcard_children = card then x.maximal_right <- false;
		if node.seqid = strid
		then begin (* strid string does not exists any more *)
		  let a = !(node.final) - node.start + 1 in
		  node.seqid <- n0.seqid;
		  node.start <- n0.start - a;
		  node.final <- ref (n0.start - 1) end;
		Some (node, Ext.empty)
    and remove_link st node =
      match succ st node with
      | None -> ()
      | Some n ->
	  match n.v with
	  | I x -> x.backlinks <- LSet.remove node x.backlinks
	  | _ -> assert false
    and remove_maxcard children backlinks =
      let maxcard_children =
	List.fold_left (fun res (_,n,_) -> max res (Ext.cardinal (ext0 n))) 0 children in
      let maxcard_backlinks =
	List.fold_left (fun res n -> max res (Ext.cardinal (ext0 n))) 0 backlinks in
      maxcard_children, maxcard_backlinks


    let fold f st init =
      Hashtbl.fold f st.ht init


(* readable version of a GST *)

    type tree = Node of string * int * int list * tree list | Leaf of string * (strid * int)
	
    let tree st =
      fold_s_tree st
	(fun l n ->
	  if n == st.root
	  then Node ("", maximal0 n, ext st n, l)
	  else
	    let w = label st n in
	    if l=[]
	    then Leaf (w, suffix st n)
	    else Node (w, maximal0 n, ext st n, l))

  end

(* for test *)
(*
module M = Make (struct let get_visible _ = (0,0) end)
open M

let st = create ();;
let _ =
  ignore (add st "formal concept analysis");
  ignore (add st "logical concept analysis");
  ignore (add st "conceptual graphs");
  tree st;;
*)