Commits

camlspotter committed d22ae24

Added Levenshtein for Levenshtein distance for arrays

  • Participants
  • Parent commits 9c198be
  • Branches 2.4.0

Comments (0)

Files changed (6)

 2.4.1
 ---------------
 
-- String.Levenshtein.distance is rewritten using an imperative algorithsm.
+- Added Levenshtein for Levenshtein distance for arrays
+- String.Levenshtein.distance is rewritten using an imperative algorithm.
 
 2.4.0
 ---------------

File lib/OMakefile

    xlist
    xarray
    xhashtbl
+   levenshtein
    xstring
    xlazy
    xformat

File lib/levenshtein.ml

+(**
+
+   Levenshtein distance algorithm for general array.
+   
+   Author: jun.furuse@gmail.com
+   License: public domain
+
+*)
+
+(** Minimum of three integers *)
+let min3 (x:int) y z =
+  let m' (a:int) b = if a < b then a else b in
+  m' (m' x y) z
+
+(* Matrix initialization. 
+
+    ------- 2 ----------
+   | 0123456789...    m
+   | 1
+   1 2          0
+   | .              
+   | n
+*)
+let init_matrix n m =
+  let init_col = Array.init (m+1) in
+  Array.init (n+1) (function
+    | 0 -> init_col (function j -> j)
+    | i -> init_col (function 0 -> i | _ -> 0))
+
+module type S = sig
+  type t
+  val distance : ?upper_bound: int -> t -> t -> int
+  (** Calculate Levenshtein distance of 2 t's *)
+end
+
+module Make(A : sig 
+  type t
+  type elem
+  val compare : elem -> elem -> int
+  val get : t -> int -> elem
+  val size : t -> int
+end) = struct
+
+  type t = A.t
+
+  (* It is simply slow but nearest to the math *)
+  let slow_but_simple xs ys =
+    let rec d i j =
+      match i, j with
+      | 0, _ -> j
+      | _, 0 -> i
+      | _ -> 
+          let i' = i - 1 in
+          let j' = j - 1 in
+          min3 
+            (d i' j + 1)
+            (d i j' + 1)
+            (d i' j' + abs (A.compare (A.get xs i') (A.get ys j')))
+    in
+    d (A.size xs) (A.size ys)
+
+  (* slow_but_simple + memoization *)      
+  let memoized xs ys =
+    let cache = Array.init (A.size xs+1) (fun _ -> Array.create (A.size ys+1) (-1)) in
+    let rec d i j =
+      match i, j with
+      | 0, _ -> j
+      | _, 0 -> i
+      | _ -> 
+          let cache_i = Array.unsafe_get cache i in
+          match Array.unsafe_get cache_i j with
+          | -1 ->
+              let res = 
+                let i' = i - 1 in
+                let j' = j - 1 in
+                min3 
+                  (d i' j + 1)
+                  (d i j' + 1)
+                  (d i' j' + abs (A.compare (A.get xs i') (A.get ys j')))
+              in
+              Array.unsafe_set cache_i j res;
+              res
+          | res -> res
+    in
+    d (A.size xs) (A.size ys)
+
+  (* slow_but_simple + memoization + upperbound 
+
+     There is a property: d(i-1)(j-1) <= d(i)(j)
+     so if d(i-1)(j-1) >= upper_bound then we can immediately say
+     d(i)(j) >= upper_bound, and skip the calculation of d(i-1)(j) and d(i)(j-1)
+  *)
+  let distance ?(upper_bound=max_int) xs ys =
+    let size_xs = A.size xs 
+    and size_ys = A.size ys in
+    (* cache: d i j is stored at cache.(i-1).(j-1) *)
+    let cache = Array.init size_xs (fun _ -> Array.create size_ys (-1)) in
+    let rec d i j =
+      match i, j with
+      | 0, _ -> j
+      | _, 0 -> i
+      | _ -> 
+          let i' = i - 1 in
+          let cache_i = Array.unsafe_get cache i' in
+          let j' = j - 1 in
+          match Array.unsafe_get cache_i j' with
+          | -1 ->
+              let res = 
+                let upleft = d i' j' in
+                if upleft >= upper_bound then upper_bound
+                else 
+                  let cost = abs (A.compare (A.get xs i') (A.get ys j')) in
+                  let upleft' = upleft + cost in
+                  if upleft' >= upper_bound then upper_bound
+                  else
+                    (* This is not tail recursive *)
+                    min3 (d i' j + 1)
+                         (d i j' + 1)
+                         upleft'
+              in
+              Array.unsafe_set cache_i j' res;
+              res
+          | res -> res
+    in
+    min (d size_xs size_ys) upper_bound
+
+  (** http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance *)
+  let wikibook xs ys =
+    let get = Array.unsafe_get in
+    match A.size xs, A.size ys with
+    | 0, n -> n
+    | m, 0 -> m
+    | m, n ->
+        let matrix = init_matrix m n in
+        (* d(0)(j) = j
+           d(i)(0) = i
+           d(i)(j) = min of d(i-1)(j  ) + 1
+                            d(i)  (j-1) + 1
+                            d(i-1)(j-1) + if x.[i-1] = y.[j-1] then 0 else 1
+        *)
+        for i = 1 to m do
+          let s = get matrix i and t = get matrix (i - 1) in
+          for j = 1 to n do
+            let cost = abs (A.compare (A.get xs (i - 1)) (A.get ys (j - 1))) in
+            Array.unsafe_set s j (min3 (get t j + 1) (get s (j - 1) + 1) (get t (j - 1) + cost))
+          done
+        done;
+        get (get matrix m) n
+end
+
+module String = struct
+
+  include Make(struct
+    type t = string
+    type elem = char
+    let compare (c1 : char) c2 = compare c1 c2
+    let get = String.unsafe_get
+    let size = String.length
+  end)
+
+  TEST "Levenshtein.String slow_but_simple" =
+    slow_but_simple "xx" "xaaax" = 3
+
+  TEST "Levenshtein.String.distance" =
+    distance "xx" "xaaax" = 3
+
+  let random_char = 
+    let offset = Char.code 'A' in
+    let length = Char.code 'Z' - offset + 1 in
+    fun () -> Char.chr (Random.int length + offset)
+
+  let random len =
+    let s = String.create len in
+    for i = 0 to len - 1 do
+      String.unsafe_set s i (random_char ())
+    done;
+    s
+
+  let test ?(upper_bound=max_int) loop len dist dist' =
+    for i = 0 to loop do
+      if i mod (loop / 10) = 0 then Printf.eprintf "%d\n%!" i;
+      let l1 = Random.int len in
+      let l2 = Random.int len in
+      let s1 = random l1 in
+      let s2 = random l2 in
+      let d = dist s1 s2 in
+      let d' = dist' s1 s2 in
+      if d < upper_bound && d' < upper_bound && d <> d' then begin
+        Printf.eprintf "%s %s : %d %d\n" s1 s2 d d';
+        assert false
+      end
+    done
+
+  TEST_UNIT "wikibook correctness" =  
+    test 1000 10 slow_but_simple wikibook
+
+  TEST_UNIT "memoized and wikibook" =  
+    test 10000 20 memoized wikibook
+
+  TEST_UNIT "distance and wikibook" =  
+    test ~upper_bound:20 100000 30 (distance ~upper_bound:20) wikibook
+
+  TEST_UNIT "distance and wikibook performance check (it takes long time)" =  
+    let sample_size = 100000 in
+    for _i = 1 to 10 do
+      let samples = Array.init sample_size (fun _ -> random 30, random 30) in
+      let time name f v =
+        let () = Gc.full_major () in
+        let start = Unix.gettimeofday () in
+        f v;
+        let end_ = Unix.gettimeofday () in
+        Printf.eprintf "%s : %f\n%!" name (end_ -. start)
+      in
+      let bench d =
+        Array.iter (fun (s1,s2) -> ignore (d s1 s2)) samples
+      in
+      time "wikibook" bench wikibook;
+      time "distance ~upper_bound:20" bench (distance ~upper_bound:20)
+    done
+        
+end

File lib/levenshtein.mli

+(**
+
+   Levenshtein distance algorithm for general array.
+
+   Author: jun.furuse@gmail.com
+   License: public domain
+
+*)
+
+module type S = sig
+  type t
+  val distance : ?upper_bound: int -> t -> t -> int
+  (** Calculate Levenshtein distance of 2 t's.
+      
+      If we are only interested in the distance if it is smaller than 
+      a threshold, specifying [upper_bound] greatly improves the performance
+      of [distance]. In that case, the distances over [upper_bound] is 
+      culled to [upper_bound].
+  *)
+end
+
+module Make(A : sig
+  type t
+  (** Type of arrays *)
+
+  type elem 
+  (** Type of the elements of arrays *)
+
+  val compare : elem -> elem -> int
+  val get : t -> int -> elem
+  val size : t -> int
+
+end) : S with type t = A.t
+
+module String : S with type t = string
+

File lib/xstring.ml

 
 let to_code_array s = Array.init (length s) & fun i -> Char.code & unsafe_get s i
 
-module Levenshtein = struct
-  (* http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance *)
-
-  (* Minimum of three integers *)
-  let minimum (x:int) y z =
-    let m' (a:int) b = if a < b then a else b in
-    m' (m' x y) z
-  
-  (* Matrix initialization. 
-  
-     n x m array:
-  
-      ------- m -------
-     | 0123456789.....(m-1)
-     n 1000000000.....0
-     | 2000000000.....0
-     | ....
-     | (n-1)..........0
-  *)
-  let init_matrix n m =
-    let init_col = Array.init m in
-    Array.init n & function
-      | 0 -> init_col (function j -> j)
-      | i -> init_col (function 0 -> i | _ -> 0)
-  
-  (* Computes the Levenshtein distance between two unistring. *)
-  let distance_arrays x y =
-    let get = Array.unsafe_get in
-    match Array.length x, Array.length y with
-      | 0, n -> n
-      | m, 0 -> m
-      | m, n ->
-         let matrix = init_matrix (m + 1) (n + 1) in
-         for i = 1 to m do
-           let s = get matrix i and t = get matrix (i - 1) in
-           for j = 1 to n do
-             let cost = abs (Pervasives.compare (get x (i - 1)) (get y (j - 1))) in
-             Array.unsafe_set s j (minimum (get t j + 1) (get s (j - 1) + 1) (get t (j - 1) + cost))
-           done
-         done;
-         get (get matrix m) n
-  
-  let distance x y = distance_arrays (to_code_array x) (to_code_array y)
-end
-
-module L_slow = struct
-  (* Minimum of three integers. This function is deliberately
-   * not polymorphic because (1) we only need to compare integers 
-   * and (2) the OCaml compilers do not perform type specialization 
-   * for user-defined functions. *)
-  let minimum (x:int) y z =
-    let m' (a:int) b = if a < b then a else b in
-      m' (m' x y) z
-   
-  (* Matrix initialization. *)
-  let init_matrix n m =
-    let init_col = Array.init m in
-    Array.init n (function
-      | 0 -> init_col (function j -> j)
-      | i -> init_col (function 0 -> i | _ -> 0)
-    )
-   
-  (* Computes the Levenshtein distance between two unistring.
-   * If you want to run it faster, add the -unsafe option when
-   * compiling or use Array.unsafe_* functions (but be carefull 
-   * with these well-named unsafe features). *)
-  let distance_utf8 x y =
-    match Array.length x, Array.length y with
-      | 0, n -> n
-      | m, 0 -> m
-      | m, n ->
-         let matrix = init_matrix (m + 1) (n + 1) in
-           for i = 1 to m do
-             let s = matrix.(i) and t = matrix.(i - 1) in
-               for j = 1 to n do
-                 let cost = abs (Pervasives.compare x.(i - 1) y.(j - 1)) in
-                   s.(j) <- minimum (t.(j) + 1) (s.(j - 1) + 1) (t.(j - 1) + cost)
-               done
-           done;
-           matrix.(m).(n)
-   
-  (* This function takes two strings, convert them to unistring (int array)
-   * and then call distance_utf8, so we can compare utf8 strings. Please
-   * note that you need Glib (see LablGTK). *)
-  let distance x y =
-    distance_utf8 (to_code_array x) (to_code_array y)
-
-  TEST_UNIT "leven3" = 
-    let test n s1 s2 =
-      let d = distance s1 s2 in
-      if d <> n then Exn.failwithf "distance %S %S = %d <> %d!" s1 s2 d n
-    in
-    test 3 "xaaax" "xx"
-end
+module Levenshtein = Levenshtein.String
 
 let random len = 
   let s = create len in
   done;
   s
 
-TEST_UNIT "Xstring.Levenstein" =
-  for _i = 0 to 10000 do
-    let l1 = Random.int 10 in
-    let l2 = Random.int 10 in
-    let s1 = random_hum l1 in
-    let s2 = random_hum l2 in
-    let d1 = Levenshtein.distance s1 s2 in
-    let d2 = L_slow.distance s1 s2 in
-    if d1 <> d2 then begin
-      Format.eprintf "%d %d\n%S\n%S@." d1 d2 s1 s2;
-      Exn.failwithf "%d %d\n%S\n%S" d1 d2 s1 s2
-    end
-  done      
-
 let sub' s pos len =
   let orig_len = length s in
   let len = max (min (pos + len) orig_len - pos) 0 in

File lib/xstring.mli

   -> 'a -> string -> 'a
 
 module Levenshtein : sig
-  val distance : string -> string -> int
-  (** http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance *)
+  val distance : ?upper_bound: int -> string -> string -> int
+  (** See Levenshtein.S *)
 end
 
 module Pervasives : sig