Source

spotlib / lib / xstring.ml

Full commit
open Base

open String (* BEWARE! compare is now for string *)

let index_opt s c = try Some (index s c) with Not_found -> None

let rec index_rec s lim i c =
  if i >= lim then None else
    if String.unsafe_get s i = c then Some i else index_rec s lim (i +
  1) c
;;
  
let index_from_to s from to_ c =
  let l = String.length s in
  if from < 0 || from > to_ || to_ >= l then 
    invalid_arg "Xstring.index_from_to" 
  else
    index_rec s (to_+1) from c
;;

let chop_eols s =
  let len = length s in
  if len > 1 then
    match s.[len-1] with
    | '\n' -> 
        if len > 2 && s.[len-2] = '\r' then sub s 0 (len-2)
        else sub s 0 (len-1)
    | '\r' -> sub s 0 (len-1)
    | _ -> s
  else s

TEST "chop_eols" =
      chop_eols "a" = "a"
      && chop_eols "a\n" = "a"
      && chop_eols "a\r\n" = "a"
      && chop_eols "a\r" = "a"

let sub_from_to s from to_ = 
  if from > to_ then invalid_arg "sub_from_to";
  String.sub s from (to_ - from + 1)

let is_space_or_tab = function ' ' | '\t' -> true | _ -> false
let is_newline_or_return = function '\n' | '\r' -> true | _ -> false

let get_opt s pos = try Some (get s pos) with _ -> None

let lines s =
  let rec aux st start_pos pos = 
    match get_opt s pos with
    | None (* eos *) ->
        List.rev (
          if start_pos = pos then st
          else (sub s start_pos (pos - start_pos), "") :: st
        )
    | Some '\n' ->
        aux ((sub s start_pos (pos - start_pos), "\n") :: st) (pos+1) (pos+1)
    | Some '\r' ->
        begin match get_opt s (pos+1) with
        | Some '\n' ->
            aux ((sub s start_pos (pos - start_pos), "\r\n") :: st) (pos+2) (pos+2)
        | _ ->
            aux ((sub s start_pos (pos - start_pos), "\r") :: st) (pos+1) (pos+1)
        end
    | _ -> aux st start_pos (pos+1)
  in
  aux [] 0 0

TEST "lines" = 
      let ss = lines "hello\nworld\r\ngood\rday" in
      let res = ss = ["hello", "\n"; "world", "\r\n"; "good", "\r"; "day", ""] in
      if not res then List.iter (fun (x,y) -> Printf.eprintf "%S,%S\n" x y) ss;
      res

TEST "lines" = 
      lines "\na\nb\rc\r\nd\n\re\n\nf\ng" = [ ""  , "\n";
                                              "a" , "\n";
                                              "b" , "\r";
                                              "c" , "\r\n";
                                              "d" , "\n";
                                              ""  , "\r";
                                              "e" , "\n";
                                              ""  , "\n";
                                              "f" , "\n";
                                              "g" , "" ] 


(* split a string according to char_sep predicate *)
let split char_sep str =
  let len = String.length str in
  if len = 0 then [] else
    let rec skip_sep cur =
      if cur >= len then cur
      else if char_sep str.[cur] then skip_sep (succ cur)
      else cur  in
    let rec split beg cur =
      if cur >= len then 
	if beg = cur then []
	else [String.sub str beg (len - beg)]
      else if char_sep str.[cur] 
	   then 
	     let nextw = skip_sep cur in
	      (String.sub str beg (cur - beg))
		::(split nextw nextw)
	   else split beg (succ cur) in
    let wstart = skip_sep 0 in
    split wstart wstart

TEST "split" = 
        split (function ' ' -> true | _ -> false) " hello  world " = ["hello"; "world"]

let make1 = String.make 1

module Set = Xset.Make(struct type t = string let compare (x:string) y = compare x y end)

let split_at len str = String.sub str 0 len, String.sub str len (String.length str - len)

TEST "Xstring.split_at" = 
    split_at 3 "hello world" = ("hel", "lo world")
;;

let take len str = String.sub str 0 len
let prefix = take
let drop len str = String.sub str len (String.length str - len)
let drop_postfix len str = String.sub str 0 (String.length str - len)

TEST "Xstring.drop_postfix" = 
    drop_postfix 6 "hello world" = "hello"
;;

let postfix len str = 
  let l = String.length str in
  String.sub str (l-len) len

TEST "Xstring.drop_postfix" = 
    postfix 5 "hello world" = "world"
;;

let is_prefix' ?(from=0) sub str =
  let sublen = String.length sub in
  try 
    if String.sub str from sublen = sub then Some (drop (from + sublen) str)
    else None
  with _ -> None

TEST "Xstring.is_prefix'" = 
    is_prefix' "hello" "hello world" = Some " world"
;;

let is_prefix ?(from=0) sub str =
  let sublen = String.length sub in
  try 
    String.sub str from sublen = sub
  with _ -> false

TEST "Xstring.is_prefix" = 
    is_prefix "hello" "hello world"
;;

let is_substring ?from:(pos=0) ~needle:sub str =
  let str_len = String.length str in
  let sub_len = String.length sub in
  if pos + sub_len > str_len then false
  else 
    let rec iter pos = 
      if pos + sub_len > str_len then false
      else if is_prefix ~from:pos sub str then true
      else iter (pos+1)
    in
    iter pos

TEST_UNIT "Xstring.is_substring" = 
    assert (is_substring ~needle:"hello" "hello world")
    assert (is_substring ~needle:"hello" "bye world" = false)
    assert (is_substring ~needle:"shindanmaker.com" "http://shindanmaker.com/341161")
;;    

let is_postfix sub str =
  let sublen = String.length sub in
  try postfix sublen str = sub with _ -> false
  
TEST "Xstring.is_postfix" = 
    is_postfix "world" "hello world"
;;

let is_postfix' sub str =
  let sublen = String.length sub in
  try
    if postfix sublen str = sub then Some (drop_postfix sublen str)
    else None
  with _ -> None

TEST "Xstring.is_postfix" = 
    is_postfix' "world" "hello world" = Some "hello "
;;

let index_string_from str pos sub =
  let sub_len = String.length sub in
  if sub_len = 0 then pos 
  else 
    let limit = String.length str - sub_len in
    let rec iter i = 
      if i > limit then raise Not_found
      else if is_substring str ~from:i ~needle:sub then i
      else iter (i+1)
    in
    iter pos

let scani_left f acc ?from ?to_ s = 
  let from = Option.default from (fun () -> 0) in
  let to_ = Option.default to_ (fun () -> String.length s - 1) in
  let rec fold acc pos = 
    if pos >= to_ then acc
    else 
      match f pos acc & String.unsafe_get s pos with
      | `Continue acc -> fold acc & pos + 1
      | `Stop acc -> acc
  in
  fold acc from
    
let foldi_left f acc s = scani_left f acc s

let to_array s = Array.init (length s) & fun i -> unsafe_get s i

let to_code_array s = Array.init (length s) & fun i -> Char.code & unsafe_get s i

module Levenshtein = struct
  (* http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance *)

  (* Minimum of three integers *)
  let minimum (x:int) y z =
    let m' (a:int) b = if a < b then a else b in
    m' (m' x y) z
  
  (* Matrix initialization. 
  
     n x m array:
  
      ------- m -------
     | 0123456789.....(m-1)
     n 1000000000.....0
     | 2000000000.....0
     | ....
     | (n-1)..........0
  *)
  let init_matrix n m =
    let init_col = Array.init m in
    Array.init n & function
      | 0 -> init_col (function j -> j)
      | i -> init_col (function 0 -> i | _ -> 0)
  
  (* Computes the Levenshtein distance between two unistring. *)
  let distance_arrays x y =
    let get = Array.unsafe_get in
    match Array.length x, Array.length y with
      | 0, n -> n
      | m, 0 -> m
      | m, n ->
         let matrix = init_matrix (m + 1) (n + 1) in
         for i = 1 to m do
           let s = get matrix i and t = get matrix (i - 1) in
           for j = 1 to n do
             let cost = abs (Pervasives.compare (get x (i - 1)) (get y (j - 1))) in
             Array.unsafe_set s j (minimum (get t j + 1) (get s (j - 1) + 1) (get t (j - 1) + cost))
           done
         done;
         get (get matrix m) n
  
  let distance x y = distance_arrays (to_code_array x) (to_code_array y)
end

module L_slow = struct
  (* Minimum of three integers. This function is deliberately
   * not polymorphic because (1) we only need to compare integers 
   * and (2) the OCaml compilers do not perform type specialization 
   * for user-defined functions. *)
  let minimum (x:int) y z =
    let m' (a:int) b = if a < b then a else b in
      m' (m' x y) z
   
  (* Matrix initialization. *)
  let init_matrix n m =
    let init_col = Array.init m in
    Array.init n (function
      | 0 -> init_col (function j -> j)
      | i -> init_col (function 0 -> i | _ -> 0)
    )
   
  (* Computes the Levenshtein distance between two unistring.
   * If you want to run it faster, add the -unsafe option when
   * compiling or use Array.unsafe_* functions (but be carefull 
   * with these well-named unsafe features). *)
  let distance_utf8 x y =
    match Array.length x, Array.length y with
      | 0, n -> n
      | m, 0 -> m
      | m, n ->
         let matrix = init_matrix (m + 1) (n + 1) in
           for i = 1 to m do
             let s = matrix.(i) and t = matrix.(i - 1) in
               for j = 1 to n do
                 let cost = abs (Pervasives.compare x.(i - 1) y.(j - 1)) in
                   s.(j) <- minimum (t.(j) + 1) (s.(j - 1) + 1) (t.(j - 1) + cost)
               done
           done;
           matrix.(m).(n)
   
  (* This function takes two strings, convert them to unistring (int array)
   * and then call distance_utf8, so we can compare utf8 strings. Please
   * note that you need Glib (see LablGTK). *)
  let distance x y =
    distance_utf8 (to_code_array x) (to_code_array y)

  TEST_UNIT "leven3" = 
    let test n s1 s2 =
      let d = distance s1 s2 in
      if d <> n then Exn.failwithf "distance %S %S = %d <> %d!" s1 s2 d n
    in
    test 3 "xaaax" "xx"
end

let random len = 
  let s = create len in
  for i = 0 to len - 1 do
    unsafe_set s i (Char.chr (Random.int 256))
  done;
  s

let random_hum len =
  let s = create len in
  let range = Char.code '~' - Char.code ' ' + 1 in
  let shift = Char.code ' ' in
  for i = 0 to len - 1 do
    unsafe_set s i (Char.chr (Random.int range + shift))
  done;
  s

TEST_UNIT "Xstring.Levenstein" =
  for _i = 0 to 10000 do
    let l1 = Random.int 10 in
    let l2 = Random.int 10 in
    let s1 = random_hum l1 in
    let s2 = random_hum l2 in
    let d1 = Levenshtein.distance s1 s2 in
    let d2 = L_slow.distance s1 s2 in
    if d1 <> d2 then begin
      Format.eprintf "%d %d\n%S\n%S@." d1 d2 s1 s2;
      Exn.failwithf "%d %d\n%S\n%S" d1 d2 s1 s2
    end
  done      

let sub' s pos len =
  let orig_len = length s in
  let len = max (min (pos + len) orig_len - pos) 0 in
  sub s pos len

TEST_UNIT "Xstring.sub'" = 
  assert (sub' "hello" 0 4 = "hell");
  assert (sub' "hello" 0 5 = "hello");
  assert (sub' "hello" 0 6 = "hello");
  assert (sub' "hello" 0 7 = "hello");
  assert (sub' "hello" 3 2 = "lo");
  assert (sub' "hello" 3 3 = "lo");
  assert (sub' "hello" 3 4 = "lo");
  assert (sub' "hello" 5 5 = "")

let find s pos f =
  let len = length s in
  let rec scan pos =
    if pos >= len then None
    else if f (unsafe_get s pos) then Some pos else scan (pos + 1)
  in
  scan pos

let replace_chars from to_ s =
  let s' = copy s in
  iteri (fun p -> function
    | c when c = from -> unsafe_set s' p to_
    | _ -> ()) s';
  s'

module Pervasives = struct
  let chop_eols = chop_eols
end