Commits

Anonymous committed 0a8985f

Strings.Utf8: + length, + sub

Comments (0)

Files changed (2)

src/cd_Strings.ml

                but doesn't escape unicode characters. *)
             value escaped : t -> t;
 
+            value length : t -> int;
+            value sub : t -> int -> int -> t;
+
           end
          =
           struct
             ;
 
 
+            (* [length s] counts the characters (not the bytes) of [s].
+               The string is walked from byte offset 0, stepping by
+               [U.char_length s pos] each time, until the byte length of [s]
+               is reached exactly.
+               Raises [U.Bad_utf8] when a step lands past the end of [s],
+               i.e. the last character is cut short.
+             *)
+            value length s =
+              let total_bytes = String.length s in
+              let rec count chars_seen pos =
+                if pos = total_bytes
+                then chars_seen
+                else
+                  let next_pos = pos + U.char_length s pos in
+                  if next_pos <= total_bytes
+                  then count (chars_seen + 1) next_pos
+                  else raise (U.Bad_utf8 "String.length: incomplete character")
+              in
+              count 0 0
+            ;
+
+
+            (* [advance ~place s i n] skips [n] characters of [s], starting
+               at byte offset [i], and returns the byte offset reached,
+               i.e. the offset of the [n]th character after [i].
+               [place] is the caller's name; it prefixes every error message.
+               [i] and [n] are expected to be non-negative and in range;
+               negative values are treated as a programming error
+               ([assert False]).
+
+               Raises [Invalid_argument] when [i] is already at or past the
+               end of [s] while [n > 0], [Failure] when [s] ends before [n]
+               characters have been skipped, and [U.Bad_utf8] when a
+               character runs past the end of [s].
+             *)
+            value advance ~place s i n =
+              if n < 0 || i < 0
+              then assert False
+              else
+              let len_bytes = String.length s in
+              if i >= len_bytes && n > 0
+              then invalid_arg (place ^ ": string end")
+              else
+              loop i n
+              where rec loop i n =
+                if n = 0
+                then i
+                else
+                  if i = len_bytes
+                  (* [failwith] does no printf-style formatting, so the
+                     message is built by concatenation like the others. *)
+                  then failwith (place ^ ": string is too short")
+                  else
+                    (* presumably [U.char_length s i] is the byte length of
+                       the character starting at offset [i] -- confirm in [U] *)
+                    let i = i + U.char_length s i in
+                    if i > len_bytes
+                    then raise (U.Bad_utf8 (place ^ ": incomplete character"))
+                    else loop i (n - 1)
+            ;
+
+
+            (* [sub s ofs len] returns the [len] characters of [s] that start
+               at character index [ofs].  Both arguments count characters;
+               they are converted to byte offsets with [advance] before the
+               byte-level [String.sub] is applied.
+               When the requested range covers all of [s], [s] itself is
+               returned instead of a copy.
+               Raises [Invalid_argument "String.sub"] when [ofs] or [len] is
+               negative; any exception raised by [advance] propagates.
+             *)
+            value sub s ofs len =
+              if ofs >= 0 && len >= 0
+              then
+                let first_byte = advance ~place:"String.sub" s 0 ofs in
+                let past_byte = advance ~place:"String.sub" s first_byte len in
+                if first_byte <> 0 || past_byte <> length_bytes s
+                then String.sub s first_byte (past_byte - first_byte)
+                else s
+              else invalid_arg "String.sub"
+            ;
+
+
           end  (* Utf8.String *)
         ;
 
 ;
 
 
+(* Table-driven test for [Strings.Utf8.String.length] and
+   [Strings.Utf8.String.sub].  Each case is
+   (input, expected length, sub offset, sub length, expected substring),
+   with offsets and lengths counted in characters. *)
+value utf8sublen () =
+  let open Strings.Utf8 in
+  let check_case (input, want_len, ofs, len, want_sub) =
+    (* both results are computed before either assertion runs *)
+    let have_len = String.length input in
+    let have_sub = String.sub input ofs len in
+    ( assert_equal want_len have_len
+    ; assert_equal want_sub have_sub
+    )
+  in
+  let cases =
+    [ ("мама мыла пилораму", 18, 3, 9, "а мыла пи")
+    ]
+  in
+  List.iter check_case cases
+;
+
+
 (* UTF-8 test cases: each entry pairs a test name with its test function
    through [>::] (presumably the OUnit labelling operator -- confirm
    against the file's opens). *)
 value utf8 =
   [ "broken_utf8" >:: utf8fixing
+  ; "utf8_sub_len" >:: utf8sublen
   ]
 ;