Commits

camlspotter committed 9c198be

String.Levenshtein.distance is rewritten using an imperative algorithsm.

Comments (0)

Files changed (6)

-2.4.0 (not yet)
+2.4.1
+---------------
+
+- String.Levenshtein.distance is rewritten using an imperative algorithsm.
+
+2.4.0
 ---------------
 
 - Added Option.iter
 OASISFormat: 0.2
 Name:        spotlib
-Version:     2.4.0
+Version:     2.4.1
 Synopsis:    Useful functions for OCaml programming used by @camlspotter
 Authors:     Jun FURUSE
 License:     LGPL-2.0 with OCaml linking exception
 name="spotlib"
-version="2.4.0"
+version="2.4.1"
 description="Spotter's garbages"
 requires="unix,str,pa_ounit"
 archive(byte)="spotlib.cmo"
 open Base
-open String
+
+open String (* BEWARE! compare is now for string *)
 
 let index_opt s c = try Some (index s c) with Not_found -> None
 
 
 module Set = Xset.Make(struct type t = string let compare (x:string) y = compare x y end)
 
-module Pervasives = struct
-  let chop_eols = chop_eols
-end
-
 let split_at len str = String.sub str 0 len, String.sub str len (String.length str - len)
 
 TEST "Xstring.split_at" = 
     
 let foldi_left f acc s = scani_left f acc s
 
+let to_array s = Array.init (length s) & fun i -> unsafe_get s i
+
+let to_code_array s = Array.init (length s) & fun i -> Char.code & unsafe_get s i
+
 module Levenshtein = struct
-  let (@) = String.unsafe_get
+  (* http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance *)
 
-  (* It is not tail recursive, but so far I am happy *)    
-  let dist_non_tco s1 s2 =
-    let lev lev_fix (i, j) = match i, j with
-      | -1, d | d, -1 -> max d 0
-      | _ ->
-          min (lev_fix (i-1, j) + 1)
-          & min (lev_fix (i, j-1) + 1)
-                (lev_fix (i-1, j-1) + if s1@i = s2@j then 0 else 1)
+  (* Minimum of three integers *)
+  let minimum (x:int) y z =
+    let m' (a:int) b = if a < b then a else b in
+    m' (m' x y) z
+  
+  (* Matrix initialization. 
+  
+     n x m array:
+  
+      ------- m -------
+     | 0123456789.....(m-1)
+     n 1000000000.....0
+     | 2000000000.....0
+     | ....
+     | (n-1)..........0
+  *)
+  let init_matrix n m =
+    let init_col = Array.init m in
+    Array.init n & function
+      | 0 -> init_col (function j -> j)
+      | i -> init_col (function 0 -> i | _ -> 0)
+  
+  (* Computes the Levenshtein distance between two unistring. *)
+  let distance_arrays x y =
+    let get = Array.unsafe_get in
+    match Array.length x, Array.length y with
+      | 0, n -> n
+      | m, 0 -> m
+      | m, n ->
+         let matrix = init_matrix (m + 1) (n + 1) in
+         for i = 1 to m do
+           let s = get matrix i and t = get matrix (i - 1) in
+           for j = 1 to n do
+             let cost = abs (Pervasives.compare (get x (i - 1)) (get y (j - 1))) in
+             Array.unsafe_set s j (minimum (get t j + 1) (get s (j - 1) + 1) (get t (j - 1) + cost))
+           done
+         done;
+         get (get matrix m) n
+  
+  let distance x y = distance_arrays (to_code_array x) (to_code_array y)
+end
+
+module L_slow = struct
+  (* Minimum of three integers. This function is deliberately
+   * not polymorphic because (1) we only need to compare integers 
+   * and (2) the OCaml compilers do not perform type specialization 
+   * for user-defined functions. *)
+  let minimum (x:int) y z =
+    let m' (a:int) b = if a < b then a else b in
+      m' (m' x y) z
+   
+  (* Matrix initialization. *)
+  let init_matrix n m =
+    let init_col = Array.init m in
+    Array.init n (function
+      | 0 -> init_col (function j -> j)
+      | i -> init_col (function 0 -> i | _ -> 0)
+    )
+   
+  (* Computes the Levenshtein distance between two unistring.
+   * If you want to run it faster, add the -unsafe option when
+   * compiling or use Array.unsafe_* functions (but be carefull 
+   * with these well-named unsafe features). *)
+  let distance_utf8 x y =
+    match Array.length x, Array.length y with
+      | 0, n -> n
+      | m, 0 -> m
+      | m, n ->
+         let matrix = init_matrix (m + 1) (n + 1) in
+           for i = 1 to m do
+             let s = matrix.(i) and t = matrix.(i - 1) in
+               for j = 1 to n do
+                 let cost = abs (Pervasives.compare x.(i - 1) y.(j - 1)) in
+                   s.(j) <- minimum (t.(j) + 1) (s.(j - 1) + 1) (t.(j - 1) + cost)
+               done
+           done;
+           matrix.(m).(n)
+   
+  (* This function takes two strings, convert them to unistring (int array)
+   * and then call distance_utf8, so we can compare utf8 strings. Please
+   * note that you need Glib (see LablGTK). *)
+  let distance x y =
+    distance_utf8 (to_code_array x) (to_code_array y)
+
+  TEST_UNIT "leven3" = 
+    let test n s1 s2 =
+      let d = distance s1 s2 in
+      if d <> n then Exn.failwithf "distance %S %S = %d <> %d!" s1 s2 d n
     in
-    memoize_rec lev (String.length s1 - 1, String.length s2 - 1)
+    test 3 "xaaax" "xx"
 end
 
+let random len = 
+  let s = create len in
+  for i = 0 to len - 1 do
+    unsafe_set s i (Char.chr (Random.int 256))
+  done;
+  s
+
+let random_hum len =
+  let s = create len in
+  let range = Char.code '~' - Char.code ' ' + 1 in
+  let shift = Char.code ' ' in
+  for i = 0 to len - 1 do
+    unsafe_set s i (Char.chr (Random.int range + shift))
+  done;
+  s
+
+TEST_UNIT "Xstring.Levenstein" =
+  for _i = 0 to 10000 do
+    let l1 = Random.int 10 in
+    let l2 = Random.int 10 in
+    let s1 = random_hum l1 in
+    let s2 = random_hum l2 in
+    let d1 = Levenshtein.distance s1 s2 in
+    let d2 = L_slow.distance s1 s2 in
+    if d1 <> d2 then begin
+      Format.eprintf "%d %d\n%S\n%S@." d1 d2 s1 s2;
+      Exn.failwithf "%d %d\n%S\n%S" d1 d2 s1 s2
+    end
+  done      
+
 let sub' s pos len =
   let orig_len = length s in
   let len = max (min (pos + len) orig_len - pos) 0 in
     | c when c = from -> unsafe_set s' p to_
     | _ -> ()) s';
   s'
+
+module Pervasives = struct
+  let chop_eols = chop_eols
+end
   -> 'a -> string -> 'a
 
 module Levenshtein : sig
-  val dist_non_tco : string -> string -> int
-  (** Levenshtein edit distance. 
-     * Memoized.
-     * Non tail recursive. *)
+  val distance : string -> string -> int
+  (** http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance *)
 end
 
 module Pervasives : sig
 (** [replace_chars c1 c2 s] returns a copy of [s] with replacing
     all the char occurrences of [c1] by [c2]. *)
 
-    
+val to_array : string -> char array
+
+val to_code_array : string -> int array
+
+val random : int (** length *) -> string
+
+val random_hum : int (** length *) -> string
+(** human readable *)
+
 (* setup.ml generated for the first time by OASIS v0.3.0 *)
 
 (* OASIS_START *)
-(* DO NOT EDIT (digest: ab1582f164ce5b57a913ebb4cad13af3) *)
+(* DO NOT EDIT (digest: d37cc029efe4b34662cae9fe752787c5) *)
 (*
    Regenerated by OASIS v0.4.1
    Visit http://oasis.forge.ocamlcore.org for more information and
           alpha_features = [];
           beta_features = [];
           name = "spotlib";
-          version = "2.4.0";
+          version = "2.4.1";
           license =
             OASISLicense.DEP5License
               (OASISLicense.DEP5Unit
        };
      oasis_fn = Some "_oasis";
      oasis_version = "0.4.1";
-     oasis_digest = Some "0\t\225\140x\237\250\255\006\004\212r\132\243+\192";
+     oasis_digest =
+       Some "\207\026\213\182\006\164B\138z\165\190\220\218\186\188N";
      oasis_exec = None;
      oasis_setup_args = [];
      setup_update = false
 
 let setup () = BaseSetup.setup setup_t;;
 
-# 6487 "setup.ml"
+# 6488 "setup.ml"
 (* OASIS_STOP *)
 let () = setup ();;