Source

ocaml-lib / stringset.ml

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
(**
   Suffix trees with incremental addition and removal of strings.
   plus incremental maintenance of maximal factors.
*)

(* for test *)
(*
#load "unix.cma";;
#load "str.cma";;
#load "nums.cma";;
#load "dbm.cma";;
#load "common.cmo";;
#load "cis.cmo";;
#load "lSet.cmo";;
#load "bintree.cmo";;
#load "intset.cmo";;
#load "index.cmo";;
#load "persindex.cmo";;
#load "genid.cmo";;
*)

open Persindex

type strid = int (* string id *)
module StrSet = Intset.Cis (* sets of string ids *)

type nodeid = int (* node id *)
module NodeSet = Intset.LSet (* sets of node ids *)

type factor = nodeid * string * nodeid
      (** [(parent,s,child)] locates a factor string on the edge from node [parent] to node [child], where [s] is a prefix of the label of [child].
	 If [s] is the empty string, then [parent] and [child] are a same node.
	 The path of a factor is the concatenation of [path st parent] and [s]. *)

type tree = Node of string * int * int list * tree list | Leaf of string * (strid * int)

(* Operations on substrings of sequences *)
module Subseq =
  struct
    type t = string * int * int  (* (seq, pos, len) *)
	  
    let empty = ("",0,0)  (* non-significant subseq *)
	
    let is_empty (s,pos,len) = len = 0
	
    let get (s,pos,len) i = s.[pos+i]
	
    let length (s,pos,len) = len
	
    let sub (s,pos,len) pos' len' = (s,pos+pos',len')
	
    let extend (s,pos,len) = (s,pos,len+1)
  end

(* Operations on nodes *)
module Node =
  struct
    type internal = {
	children : (char,nodeid) Hashtbl.t;
	mutable link : nodeid; (* suffix link *)
	mutable backlinks : NodeSet.t;
	mutable ext : StrSet.t; (* set of strids under this node *)
	mutable locals : StrSet.t; (* subset of ext, strids only in leaves of the node (only on maximal nodes) *)
	mutable maximal_right : bool; (* whether this node is maximal on its right given its ext (a concept intent) *)
	mutable maximal : strid; (* by which strid (>0) this node became a maximal node (a concept intent) *)
	mutable visible : int * int; (* which part of this node is visible: left and right offset. *)
      }
      
    type specific  =
      | I of internal (* for non-leaves, internal nodes *)
      | L of int (* for leaves: position of recognized suffix *)

(** Type of the nodes of suffix trees.
   Nodes are either leaves or internal nodes. *)
    type node = {
	mutable seqid : strid; (* sequence index in which the positions start and final are defined *)
	mutable start : int;     (* start and final position of the word labelling the node *)
	mutable final : int ref;
	mutable parent : nodeid; (* prefix link, the root for the root itself *)
	v : specific
      }

    let ext node =
      match node.v with
      | I x -> x.ext
      | L _ -> StrSet.singleton node.seqid
	
    let maximal node =
      match node.v with
      | I x -> x.maximal
      | L _ -> assert false

    let locals node =
      match node.v with
      | I x -> x.locals
      | L _ -> StrSet.empty
	    
    let length node =
      !(node.final) - node.start + (match node.v with I _ -> 1 | L _ -> 0)

    let label str node =
      String.sub (str # get node.seqid) node.start (length node)

    let get_visible node =
      match node.v with
      | I x -> x.visible
      | _ -> raise (Invalid_argument "Stringset.get_visible: applied to a leaf")

  end

open Node

(* state for 'update' *)
type res = {
    terminal : int ref;
    mutable startj : int;
    mutable startnode : (nodeid * node) * Subseq.t * (nodeid * node);
  }

let eq_char c1 c2 =
  c1<>'\000' & c1=c2  (* ensures that 2 terminal symbols '\000' are pairwise different (for GST only, not necessary for ST) *)


(** Class of suffix trees. This is not a purely functional data-structure. *)

class st ?(get_visible : string -> int * int = fun _ -> 0, 0) (name : string) (db : database) =
  object (self)
    val strgen = new Genid.genid (name ^ ".strgen") db
    val stridx : (strid,string) index = new varray_vector_opt ~vals:Serialize.string 13 chunk (fun _ -> raise Not_found) db
    val nodegen = new Genid.genid (name ^ ".nodegen") db
    val nodeidx : (nodeid,node) index = new varray_vector_opt ~vals:Serialize.marshal 13 chunk (fun _ -> raise Not_found) db
    val root : (unit,nodeid) index = new var ~vals:Serialize.int (-1) (fun () -> raise Not_found) db

    initializer
      stridx # locate (name ^ "_strid") name "str";
      nodeidx # locate (name ^ "_nodeid") name "node";
      root # locate name name "root";
      try
	ignore (root # get ())
      with Not_found -> (* creation of the suffix tree, and not re-opening from db *)
	let root_id = nodegen # alloc in
	let root_node = {
	  seqid = -1;
	  start = 0;
	  final = ref (-1);
	  parent = root_id;
	  v=I
	    { children = Hashtbl.create 2;
	      link = root_id;
	      backlinks = NodeSet.empty;
	      ext = StrSet.empty;
	      locals = StrSet.empty;
	      maximal = 0;
	      maximal_right = false;
	      visible = (0,0);
	    }} in
	root # set () root_id;
	nodeidx # set root_id root_node

    method name = name

    method sync =
      strgen # sync;
      stridx # sync;
      nodegen # sync;
      nodeidx # sync;
      root # sync

    method unload (p : int) =
      stridx # unload p;
      nodeidx # unload p


(** {1 Suffix trees as string sets. } *)

(* returns the child node that recognizes [implicit] from the node [explicit] *)
    method private get_child (n,node) implicit =
      if Subseq.is_empty implicit
      then (n,node)
      else
	let c = Subseq.get implicit 0 in
	if c = '\000'
	then raise Not_found
	else
	  match node.v with
	  | I x -> let ch = Hashtbl.find x.children c in (ch, nodeidx # get ch)
	  | L _ -> raise Not_found

    method add (str : string) : strid =
	(** [st # add s] adds the string [s], and all its suffixes in the suffix tree [st], unless [s] has already been added.
	   It also returns the string id as an handle on this string. *)
    (* ensures that implicit does not span over another node below [explicit] *)
      let rec canonical ((e,explicit),implicit,(c,child) as res) =
	if Subseq.is_empty implicit
	then res
	else
	  let l = !(child.final) - child.start + 1 in
	  let a = Subseq.length implicit in
	  if a < l
	  then res
	  else
	    let implicit' = Subseq.sub implicit l (a-l) in
	    canonical ((c, child), implicit', self # get_child (c,child) implicit')
      in
    (* test whether an implicit node is the root node *)
      let is_root ((_,explicit), implicit, _) =
	explicit.seqid = (-1) && Subseq.is_empty implicit
      in
    (* test whether the extension of an implicit node by [seqar.(k).[i]] is still recognized in the GST,
       and if yes, returns the implicit node extended by 1 position, otherwise returns [None]. *)
      let has_child ((e,explicit),implicit,(c,child)) (k,i) =
	let a = Subseq.length implicit in
	if a <> 0 then
	  if eq_char (stridx # get child.seqid).[child.start+a] (stridx # get k).[i]
	  then Some ((e,explicit), Subseq.extend implicit, (c,child))
	  else None
	else
	  try
	    let implicit' = (stridx # get k,i,1) in
	    Some ((e,explicit), implicit', self # get_child (e,explicit) implicit')
	  with Not_found -> None
      in
    (* creation of new nodes and leaves *)
      let add_leaf (n,node) seqid start final_ref index =
	match node.v with
	| I x ->
	    let c = nodegen # alloc in
	    let child = {seqid=seqid; start=start; final=final_ref; parent=n; v=(L index)} in
	    nodeidx # set c child;
	    Hashtbl.add x.children (stridx # get seqid).[start] c;
	    nodeidx # set n node (* for update *)
	| L _ -> raise (Invalid_argument "Stringset.add_leaf: 2nd argument must not be a leaf")
      in      
    (* make explicit an implicit node by inserting a new node between [explicit] and [child] *)
      let insert_node strid ((e,explicit),implicit,(c,child)) =
	match explicit.v with
	| I x ->
	    let a = Subseq.length implicit in
	    if a = 0
	    then begin
	      if x.maximal = 0 then begin (* otherwise explicit has been made maximal by another strid *)
		x.maximal <- strid;
		x.maximal_right <- true;
		x.locals <- StrSet.add strid x.locals end;
	      nodeidx # set e explicit;
	      e, explicit end
	    else begin
	      let c_child_old = (stridx # get child.seqid).[child.start] in
	      let c_child_new = (stridx # get child.seqid).[child.start+a] in
	      let h' = Hashtbl.create (Hashtbl.length x.children) in Hashtbl.add h' c_child_new c;
	      let n' = nodegen # alloc in
	      let node' = {
		seqid = child.seqid;
		start = child.start;
		final = ref (child.start+a-1);
		parent = e;
		v = I
		  { children = h';
		    link = root # get ();
		    backlinks = NodeSet.empty;
		    ext = Node.ext child;
		    locals = StrSet.singleton strid;
		    maximal = strid;
		    maximal_right = true;
		    visible = (0,0);
		  };
	      } in
	      nodeidx # set n' node';
	      child.start <- child.start+a;
	      child.parent <- n';
	      nodeidx # set c child;
	      Hashtbl.replace x.children c_child_old n';
	      nodeidx # set e explicit;
	      self # set_visible n' (get_visible (self # path n'));
	      nodeidx # set n' node';
	      n', node' end
	| L _ -> raise (Invalid_argument "Stringset.insert_node: first part of 2nd argument must not be a leaf")
      in
    (* add some strid in the extent of all ancestor of a node (except the root) *)
      let rec add_strid strid (n,node) =
	match node.v with
	| I x ->
	    if node.seqid <> (-1) && not (StrSet.mem strid x.ext) then begin
	      x.ext <- StrSet.add strid x.ext;
	      nodeidx # set n node;
	      add_strid strid (node.parent, nodeidx # get node.parent) end
	    else
	      if x.maximal = strid then begin (* this node has been made maximal when adding strid *)
		x.maximal <- 0;
		x.maximal_right <- false;
		x.locals <- StrSet.remove strid x.locals;
		nodeidx # set n node end
	| _ -> assert false
      in
    (* add a suffix link from [pred_opt] (if defined) to [explicit] *)
      let add_link strid pred_opt (e,explicit) =
	match pred_opt with
	| Some (n,node) ->
	    ( match node.v, explicit.v with
	    | I x0, I x ->
		x0.link <- e;
		x.backlinks <- NodeSet.add n x.backlinks;
		if x.maximal = strid then begin
		  x.maximal <- 0; (* maximal_right is left unchanged *)
		  x.locals <- StrSet.remove strid x.locals end;
		nodeidx # set n node;
		nodeidx # set e explicit
	    | _ -> assert false)
	| None -> ()
      in
    (* extends suffix_link for implicit nodes *)
      let get_link ((e,explicit),implicit,_) =
	if Subseq.is_empty implicit
	then
	  let e_explicit' = match explicit.v with I x -> x.link, nodeidx # get x.link | _ -> assert false in  (*suffix_link root explicit*)
	  (e_explicit', Subseq.empty, e_explicit')
	else
	  if explicit.seqid = (-1)
	  then
	    let implicit' = Subseq.sub implicit 1 (Subseq.length implicit - 1) in
	    let root_id = root # get () in
	    let root_node = nodeidx # get root_id in
	    canonical ((root_id, root_node), implicit', self # get_child (root_id,root_node) implicit')
	  else
	    let e', explicit' = match explicit.v with I x -> x.link, nodeidx # get x.link | _ -> assert false in  (*suffix_link root explicit*)
	    canonical ((e',explicit'), implicit, self # get_child (e',explicit') implicit)
      in
    (* GST update for the new character c at position i in sequence k *)
      let rec update (strid,i) res pred_opt =
	(* c = (stridx # get strid).[i] *)
	match has_child res.startnode (strid,i) with
	| Some extended_startnode -> (* startnode can be extended by [c] *)
	    let e_explicit, implicit, _ = res.startnode in
	    assert (pred_opt = None or Subseq.is_empty implicit);
            (* if a link has been followed after node creation, then we are on an explicit node *)
	    add_link strid pred_opt e_explicit;
	    res.startnode <- canonical extended_startnode
	| None -> (* startnode cannot be extended by [c] ... *)
	    let n', node' as n_node' = insert_node strid res.startnode in (* ... so we insert a new node ... *)
	    if (stridx # get strid).[res.startj] <> '\000' then begin
	      add_leaf n_node' strid i res.terminal res.startj;  (* ... and a new leaf for the suffix at position [res.startj] *)
	      add_strid strid n_node';  (* updating the extent of ancestor nodes *)
	      add_link strid pred_opt n_node';  (* ... a suffix link from the last created node (if defined) ... *)
	    end;
	    res.startj <- res.startj + 1; (* prepare for the next suffix *)
	    if not (is_root res.startnode)
	    then begin (* while [res.startnode] is not the root, and cannot be extended by [c] ... *)
	      res.startnode <- get_link res.startnode; (* ... follow the suffix link to find the next suffix ... *)
	      update (strid,i) res (Some n_node') end (* ... and loop on [update] *)
      in
    (* add a string and returns its strid. If the string already exists, the GST is not modified,
       and the existing id is returned *)
      try
	self # find str
      with Not_found ->
	let strid = strgen # alloc in
	let str0 = str ^ String.make 1 '\000' in (* add a terminal symbol *)
	stridx # set strid str0;
	let root_id = root # get () in
	let root_node = nodeidx # get root_id in
	( match root_node.v with
	| I x ->
	    x.ext <- StrSet.add strid x.ext;
	    x.maximal <- strid;
	    x.maximal_right <- true;
	    x.locals <- StrSet.add strid x.locals;
	    nodeidx # set root_id root_node
	| _ -> assert false ); (* update the extent of the root *)
	let res = {terminal=ref (-1); startj=0; startnode=((root_id,root_node), Subseq.empty, (root_id,root_node))} in (* initialize for [update] ... *)
	for pos = 0 to String.length str0 - 1 do (* for every position [i] in the sequence ... *)
	  incr res.terminal; (* increment the leaves final position ... *)
	  update (strid,pos) res None (* call [update] for updating the suffix tree with the character at position [pos] *)
	done;
	strid

    method remove (strid : strid) : unit =
	(** [st # remove id] removes the string identified by [id], and all its suffixes, from the suffix tree [st]. *)
      let remove_maxcard children backlinks =
	let maxcard_children =
	  List.fold_left (fun res (_,n,node,_) -> max res (StrSet.cardinal (Node.ext node))) 0 children in
	let maxcard_backlinks =
	  NodeSet.fold (fun res n -> max res (StrSet.cardinal (self # ext n))) 0 backlinks in
	maxcard_children, maxcard_backlinks
      in
      let remove_link (n,node) =
	if node.seqid = (-1)
	then ()
	else
	  match node.v with
	  | L _ -> ()
	  | I x ->
	      let succ = nodeidx # get x.link in
	      match succ.v with
	      | I x -> x.backlinks <- NodeSet.remove n x.backlinks
	      | _ -> assert false
      in
      let rec remove_node on_root strid n0 =
	let node = nodeidx # get n0 in
	match node.v with
	| L _ ->
	    if node.seqid = strid
	    then None
	    else Some (n0, node, StrSet.empty)
	| I x ->
	    if not (StrSet.mem strid x.ext)
	    then Some (n0, node, StrSet.empty)
	    else
	      let new_children =
		Hashtbl.fold
		(fun c n res -> match remove_node false strid n with None -> res | Some (n',node',moved_locals) -> (c,n',node',moved_locals)::res)
		  x.children [] in
	      match new_children with
	      | [] ->
		  if on_root
		  then begin
		    Hashtbl.clear x.children;
		    x.ext <- StrSet.remove strid x.ext;
		    x.locals <- StrSet.remove strid x.locals;
		    x.maximal <- 0;
		    x.maximal_right <- false; (* ? *)
		    nodeidx # set n0 node;
		    Some (n0, node, StrSet.empty) end
		  else begin
		    remove_link (n0,node);
		    None end
	      | [(c,ch,child,_)] when not on_root ->
		  child.start <- child.start - (!(node.final) - node.start + 1);
		  child.parent <- node.parent;
		  nodeidx # set ch child;
		  remove_link (n0,node);
		  Some (ch, child, x.locals)
	      | (_,n1,node1,_)::_ as l ->
		  Hashtbl.clear x.children;
		  List.iter
		    (fun (c,n',node',moved_locals) ->
		      Hashtbl.add x.children c n';
		      node'.parent <- n0;
		      nodeidx # set n' node';
		      x.locals <- StrSet.union x.locals moved_locals
		    ) l;
		  x.ext <- StrSet.remove strid x.ext;
		  x.locals <- StrSet.remove strid x.locals;
		  let card = StrSet.cardinal x.ext in
		  let maxcard_children, maxcard_backlinks = remove_maxcard l x.backlinks in
		  if max maxcard_children maxcard_backlinks = card then x.maximal <- 0;
		  if maxcard_children = card then x.maximal_right <- false;
		  if node.seqid = strid
		  then begin (* strid string does not exists any more *)
		    let a = !(node.final) - node.start + 1 in
		    node.seqid <- node1.seqid;
		    node.start <- node1.start - a;
		    node.final <- ref (node1.start - 1) end;
		  nodeidx # set n0 node;
		  Some (n0, node, StrSet.empty)
      in
      ignore (remove_node true strid (root # get ()));
      stridx # reset strid

    method get (strid : strid) : string =
	(** [st # get id] returns the string associated to [id]. *)
      let str0 = stridx # get strid in
      String.sub str0 0 (String.length str0 - 1)

    method find (str : string) : strid =
	(** [st # find s] returns the id associated to the string [s], if the strings exists in the suffix tree [st].
	   Otherwise raise Not_found. *)
    (* test whether the extension of an implicit node by '\000' exists in the GST,
       and if so, returns the corresponding leaves, otherwise returns None. *)
      let has_end (e,implicit,c) =
	let a = String.length implicit in
	if a <> 0 then
	  let child = nodeidx # get c in
	  if (stridx # get child.seqid).[child.start+a] = '\000'
	  then Some [c]
	  else None
	else
	  let explicit = nodeidx # get e in
	  match explicit.v with
	  | I x -> Some (Hashtbl.find_all x.children '\000')
	  | L _ -> None
      in
      let factor = self # find_factor str in
      match has_end factor with
      | Some leafs ->
	  Common.mapfind
	    (fun leaf ->
	      let (strid,pos) = self # suffix leaf in
	      if pos = 0 then Some strid else None) (* there should be only one *)
	    leafs
      | None -> raise Not_found

(** {1 Low-level interface on suffix trees. } *)

    method root : nodeid =
         (** [st # root] returns the root node of the suffix tree [st]. *)
      root # get ()

    method is_leaf (n : nodeid) : bool =
	(** [st # is_leaf n] returns whether the node [n] is a leaf. *)
      let node = nodeidx # get n in
      match node.v with
      | L _ -> true
      | _ -> false

    method label (n : nodeid) : string =
	(** [st # label n] returns the string labelling the node [n]. *)
      if n = root # get ()
      then ""
      else
	let node = nodeidx # get n in
	Node.label stridx node

    method length (n : nodeid) : int =
	(** [st # length n] returns the length of the string labelling the node [n]. *)
      let node = nodeidx # get n in
      Node.length node

    method path (n : nodeid) : string =
	(** [st # path n] returns the full path from the root to the node [n]. *)
      let rec aux n =
	if n = root # get ()
	then ""
	else
	  let node = nodeidx # get n in
	  aux node.parent ^ Node.label stridx node in
      aux n

    method height (n : nodeid) : int =
	(** [st # height n] returns the height of node [n], i.e. the length of the path from root to [n]. *)
      let rec aux n =
	if n = root # get ()
	then 0
	else
	  let node = nodeidx # get n in
	  aux node.parent + Node.length node in
      aux n

    method ext (n : nodeid) : StrSet.t =
	(** [st # ext n] returns a set of string ids that match the path of the node [n]. *)
      let node = nodeidx # get n in
      Node.ext node

    method children (n : nodeid) : NodeSet.t =
	(** [st # children n] returns the set of children nodes of [n]. *)
      let node = nodeidx # get n in
      match node.v with
      | I x -> Hashtbl.fold (fun c -> NodeSet.add) x.children NodeSet.empty
      | L _ -> NodeSet.empty

    method parent (n : nodeid) : nodeid option =
	(** [st # parent n] returns the parent node of [n], unless [n] is the root node. *)
      if n = root # get ()
      then None
      else Some (nodeidx # get n).parent

    method succ (n : nodeid) : nodeid option =
	(** [st # succ n] returns the successor node through the suffix link of [n], unless there is no suffix link. *)
      if n = root # get ()
      then None
      else 
	let node = nodeidx # get n in
	match node.v with
	| I x -> Some x.link
	| L _ -> None

    method preds (n : nodeid) : NodeSet.t =
	(** [st # preds n] returns the list of all nodes having [n] as successor node. *)
      let node = nodeidx # get n in
      match node.v with
      | I x -> x.backlinks
      | L _ -> NodeSet.empty

    method suffix (n : nodeid) : strid * int =
	(** [st # suffix n] returns the suffix represented by the leaf node [n] as a couple [(string id, position in the string)].
	   Raise Not_found if [n] is not a leaf. *)
      let node = nodeidx # get n in
      match node.v with
      | I _ -> raise Not_found
      | L i -> (node.seqid, i)

    method find_node (str : string) : nodeid =
	(** [st # find_node s] returns the node whose path is equal to the string [s], if it exists.
	   Raise Not_found otherwise. *)
      let (explicit,implicit,_) = self # find_factor str in
      if implicit = ""
      then explicit
      else raise Not_found

    method fold : 'h 's . ?start:nodeid -> ('h -> nodeid -> bool) -> ('h -> nodeid -> 'h) -> ('s list -> 'h -> nodeid -> 's) -> 'h -> 's =
	(** [st # fold start filter herit synth h0] returns the result of an attribute evaluation on the suffix tree [st].
	   - [start] is the starting node for the evaluation, only the subtree is explored,
	   - [filter] is used to filter which children of a node should be explored given the heritance value of the parent node,
	   - [herit] defines the heritance value of a node, given the heritance value of its parent,
	   - [synth] defines the synthesized value of a node given its heritance value, and the list of synthesized values of its filtered children,
	   - [h0] is the heritance value given to the root.
	 *)
      fun ?(start = root # get ()) f h s init ->
	let rec aux f h s h_node node =
	  s
	    (List.map
	       (fun child -> aux f h s (h h_node child) child)
	       (List.filter (f h_node) (NodeSet.elements (self # children node))))
	    h_node
	    node in
	aux f h s init start

  (* synthesized attributes only *)
    method fold_s : 's . ?start:nodeid -> ('s list -> nodeid -> 's) -> 's =
      fun ?start s ->
	self # fold ?start (fun _ _ -> true) (fun _ _ -> ()) (fun l _ n -> s l n) ()
	
  (* filtering and synthesizing, no inheritance *)
    method fold_fs : 's . ?start:nodeid -> (nodeid -> bool) -> ('s list -> nodeid -> 's) -> 's =
      fun ?start f s ->
	self # fold ?start (fun _ n -> f n) (fun _ _ -> ()) (fun l _ n -> s l n) ()


(** {1 Exploring the suffix tree through the substring relation. } *)
		
    method path_restrictions (x : nodeid) : nodeid list =
	(** [st # path_restrictions n] returns the list of nodes whose path is a direct restriction of the path of [n]. *)
      let lp = match self # parent x with None -> NodeSet.empty | Some p -> NodeSet.singleton p in (* the prefix restriction, if it exists *)
      let ls = match self # succ x with None -> NodeSet.empty | Some s -> NodeSet.singleton s in (* the suffix restriction, if it exists *)
      NodeSet.elements (NodeSet.union lp ls)

    method path_extensions (x : nodeid) : nodeid list =
	(** [st # path_extensions n] returns the list of nodes whose path is a direct extension of the path of [n]. *)
      let lr = NodeSet.filter (fun n -> not (self # is_leaf n)) (self # children x) in (* right extensions *)
      let lf = self # preds x in (* left extensions *)
      NodeSet.elements (NodeSet.union lr lf)

    method is_maximal (n : nodeid) : bool =
	(** [st # is_maximal n] returns whether a node is maximal.
	   A node is maximal is each of its extensions has a strictly smaller extent, or the node represents a full string. *)
      let node = nodeidx # get n in
      match node.v with
      | I x -> x.maximal > 0
      | L pos -> pos=0

    method set_visible (n : nodeid) (lr : int * int) : unit =
	(** [st # set_visible n (left_pos, right_pos)] sets which part of a node path should be visible when maximal. *)
      let node = nodeidx # get n in
      match node.v with
      | I x -> x.visible <- lr
      | L _ -> ()

    method max_restrictions (x : nodeid) : nodeid list =
	(** [st # max_restrictions n] returns the list of maximal nodes whose path is a restriction of the path of [n]. *)
      let rec aux acc = function
	| [] -> acc
	| n::ns ->
	    if self # is_maximal n
	    then aux (LSet.add (self # height n, n) acc) ns
	    else aux acc (self # path_restrictions n @ ns)
      in
      let res1 = aux (LSet.empty ()) (self # path_restrictions x) in
      let _, res2 =
	Common.fold_while
	  (fun (res1, res2) ->
	    match res1 with
	    | [] -> None
	    | (_,n)::hns -> Some (LSet.diff hns (aux (LSet.empty ()) (self # path_restrictions n)), n::res2))
	  (res1, []) in
      res2

    method max_extensions (n0_opt : nodeid option) : nodeid list * strid list =
	(** [st # max_extensions n_opt] returns the list of maximal nodes and leaves whose path is an extension of the path of [n], when given.
	   If a start node is not given, then the maximal nodes with shortest path are returned. *)
      let rec max_extensions_remove_left on_start n0 acc =
	(* node is right-maximal right extension of start (possibly start itself) *)
	let node = nodeidx # get n0 in
	match node.v with
	| I x ->
	    let acc0 =
	      if not on_start && x.maximal > 0
	      then NodeSet.remove n0 acc
	      else acc in
	    NodeSet.fold
	      (fun acc n -> max_extensions_remove_left false n acc)
	      acc0
	      x.backlinks
	| _ -> assert false
      in
      let rec max_extensions_remove_right start_is_root on_start n0 acc =
	(* node is right extension of start (possibly start itself (on_start = true)) *)
	let node = nodeidx # get n0 in
	match node.v with
	| I x ->
	    let acc0 =
	      if not on_start && x.maximal > 0
	      then NodeSet.remove n0 acc
	      else acc in
	    let acc1 =
	      if not start_is_root && x.maximal_right
	      then max_extensions_remove_left on_start n0 acc0
	      else acc0 in
	    Hashtbl.fold
	      (fun _ n acc -> max_extensions_remove_right start_is_root false n acc)
	      x.children
	      acc1
	| _ -> acc
      in
      let rec max_extensions_left on_start (shift_left, shift_right) n0 acc =
	(* node is right-maximal right extension of start (possibly start itself) *)
	let node = nodeidx # get n0 in
	match node.v with
	| I x ->
	    let sl, sr = x.visible in
	    if not on_start && x.maximal > 0 && ((*sl <= shift_left && sr <= shift_right && *) sl + sr < shift_left + shift_right) then
	      NodeSet.add n0 acc
	    else
	      NodeSet.fold
		(fun acc n -> max_extensions_left false (shift_left + 1, shift_right) n acc)
		acc
		x.backlinks
	| _ -> assert false
      in
      let rec max_extensions_right start_is_root on_start (shift_left, shift_right) n0 acc =
	(* node is right extension of start (possibly start itself (on_start = true)) *)
	let node = nodeidx # get n0 in
	match node.v with
	| I x ->
	    let sl, sr = x.visible in
	    if not on_start && x.maximal > 0 && ((* sl <= shift_left && sr <= shift_right && *) sl + sr < shift_left + shift_right) then
	      NodeSet.add n0 acc
	    else
	      let acc1 =
		if not start_is_root && x.maximal_right
		then max_extensions_left on_start (shift_left, shift_right) n0 acc
		else acc in
	      Hashtbl.fold
		(fun _ n acc ->
		  let node' = nodeidx # get n in
		  max_extensions_right start_is_root false (shift_left, shift_right + !(node'.final) - node'.start + 1) n acc)
		x.children
		acc1
	| _ -> acc
      in
      let on_start, start_is_root, start =
	match n0_opt with
	| None -> false, true, root # get ()
	| Some n -> true, n = root # get (), n in
      let start_node = nodeidx # get start in
      let incrs0 = max_extensions_right start_is_root on_start (Node.get_visible start_node) start NodeSet.empty in
      let incrs1 =
	NodeSet.fold
	  (fun incrs1 n ->
	    max_extensions_remove_right false true n incrs1)
	  incrs0 incrs0 in
      let incrs = NodeSet.elements incrs1 in
      let ext = Node.ext start_node in
      let locals =
	StrSet.elements (List.fold_left (fun res n -> StrSet.diff res (self # ext n)) ext incrs) in
      incrs, locals

    method string_restrictions (strid : strid) : nodeid list =
	(** [st # string_restrictions strid] returns the list of maximal nodes having [strid] as a string extension. *)
      NodeSet.elements
	(self # fold_fs  (* looking for the nodes having strid as a local *)
	   (fun n -> StrSet.mem strid (self # ext n))
	   (fun l n ->
	     List.fold_left
	       NodeSet.union
	       (if self # is_maximal n && StrSet.mem strid (Node.locals (nodeidx # get n)) then NodeSet.singleton n else NodeSet.empty)
	       l))


(** {1 Searching in a suffix tree} *)

    method find_factor (str : string) : factor =
	(** [st # find_factor s] returns the factor locating [s] in the suffix tree [st].
	   This means the path of the result factor is equal to [s].
	   Raise [Not_found] if the string [s] does not appear in any string of [st]. *)
      let rec find_factor_aux (n,node) implicit =
	let w = Subseq.length implicit in
	if w = 0
	then (n,implicit,n)
	else
	  let c, child = self # get_child (n,node) implicit in
	  let l = !(child.final) - child.start + 1 in
	  let a = ref 1 in
	  while !a < l & !a < w & eq_char (stridx # get child.seqid).[child.start + !a] (Subseq.get implicit !a) do
	    incr a
	  done; (* [!a] is the first mismatch position, or the length of [child] label *)
	  if !a = l
	  then find_factor_aux (c,child) (Subseq.sub implicit !a (w - !a))
	  else 
	    if !a = w
	    then (n,implicit,c)
	    else raise Not_found
      in
      let root_id = root # get () in
      let root_node = nodeidx # get root_id in
      let (explicit, (s,i,len), child) = find_factor_aux (root_id, root_node) (str,0,String.length str) in
      (explicit, String.sub s i len, child)

    method suffixes (_, _, child : factor) : (strid * int) list =
	(** [st # suffixes f] returns the list of all suffixes [(strid,pos)] that have the path of [f] as a prefix: this path occurs in string [strid] at position [pos]. *)
      self # fold_s
	~start:child
	(fun l n -> if l=[] then [self # suffix n] else List.concat l)

    method strings (_, _, child : factor) : StrSet.t =
	(** [st # strings f] returns the set of the ids of all strings containing the path of [f]. *)
      self # ext child
(*
      fold_s
        ~start:child
	(fun l n -> if l=[] then StrSet.singleton (fst (self # suffix n)) else StrSet.union_r l)
*)


(** {1 Simpler representation of a suffix tree (for debugging purpose at top-level)} *)

    method tree : tree =
      (* readable version of a GST *)
      let maximal n =
	let node = nodeidx # get n in
	Node.maximal node
      in
      let root_id = root # get () in
      self # fold_s
	(fun l n ->
	  if n = root_id
	  then
	    Node ("", maximal n, StrSet.elements (self # ext n), l)
	  else
	    let w = self # label n in
	    if l=[]
	    then Leaf (w, self # suffix n)
	    else Node (w, maximal n, StrSet.elements (self # ext n), l))

  end

(* for test *)
(*
let st = new st ~get_visible:(fun _ -> (0,0)) "root" (new database);;
let _ =
  ignore (st # add "formal concept analysis");
  ignore (st # add "logical concept analysis");
  ignore (st # add "conceptual graphs");;
*)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.