ocaml-lib / unicode.ml

(** A Unicode code point, represented as a plain OCaml [int]. *)
type codepoint = int

(** [codepoint_of_string hex] reads [hex] as a hexadecimal number written
    without any prefix (e.g. ["1F600"]) and returns its integer value.

    The text is handed to [int_of_string] after prepending ["0x"].

    @raise Failure if [int_of_string] rejects the prefixed text. *)
let codepoint_of_string hex = "0x" ^ hex |> int_of_string

(** A [string] whose bytes are meant to be read as UTF-8 encoded text. *)
type utf8 = string

(** Source text of a regular expression matching one UTF-8 encoded character,
    wrapped in a single capture group.

    The backslash-escaped grouping and alternation operators are the syntax
    used by the [Str] library; presumably that is the intended consumer
    (TODO confirm against callers).

    The pattern only checks the byte layout: a lead byte in the range for a
    1, 2, 3 or 4 byte sequence, followed by the matching number of
    continuation bytes (0x80..0xBF). It does not exclude overlong forms,
    surrogates, or lead bytes 0xF5..0xF7. *)
let regexp_utf8_char =
  let continuation = "[\x80-\xBF]" in
  let alternatives =
    [ "[\x00-\x7F]";
      "[\xC0-\xDF]" ^ continuation;
      "[\xE0-\xEF]" ^ continuation ^ continuation;
      "[\xF0-\xF7]" ^ continuation ^ continuation ^ continuation ]
  in
  "\\(" ^ String.concat "\\|" alternatives ^ "\\)"


(** [utf8_of_codepoint code] encodes [code] as a UTF-8 byte sequence of one
    to four bytes.

    The sequence length is chosen from the magnitude of [code] alone: values
    in the surrogate range (U+D800..U+DFFF) are not rejected and come out as
    a three-byte sequence.

    @raise Invalid_argument if [code] is negative.
    @raise Failure if [code] is above U+10FFFF. *)
let utf8_of_codepoint (code : codepoint) : utf8 =
  (* Without this guard a negative value takes the one-byte branch below and
     fails inside [Char.chr] with the uninformative message "Char.chr". The
     exception type is kept the same; only the message is made explicit. *)
  if code < 0 then
    invalid_arg "Unicode.utf8_of_codepoint: negative codepoint";
  let buf = Buffer.create 4 in
  let emit byte = Buffer.add_char buf (Char.chr byte) in
  (* Continuation byte (10xxxxxx) carrying the six bits of [code] that start
     at bit position [shift]. *)
  let emit_continuation shift =
    emit (0x80 lor ((code lsr shift) land 0x3F))
  in
  if code < 0x80 then
    (* 0xxxxxxx *)
    emit code
  else if code < 0x800 then begin
    (* 110xxxxx 10xxxxxx *)
    emit (0xC0 lor (code lsr 6));
    emit_continuation 0
  end
  else if code < 0x10000 then begin
    (* 1110xxxx 10xxxxxx 10xxxxxx *)
    emit (0xE0 lor (code lsr 12));
    emit_continuation 6;
    emit_continuation 0
  end
  else if code < 0x110000 then begin
    (* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx *)
    emit (0xF0 lor (code lsr 18));
    emit_continuation 12;
    emit_continuation 6;
    emit_continuation 0
  end
  else failwith "Unicode.utf8_of_codepoint: undefined for codepoints above U+10FFFF";
  Buffer.contents buf

(** [codepoint_of_utf8_char s] decodes the single UTF-8 encoded character
    held in [s] and returns its code point.

    [s] is assumed to be a valid encoding of exactly one character: only its
    length (1 to 4 bytes) selects the decoding, and the marker bits of each
    byte are masked off without being checked.

    @raise Failure if the length of [s] is not between 1 and 4. *)
let codepoint_of_utf8_char (s : utf8) : codepoint =
  (* Payload of byte [i]: the bits kept by [mask], moved up by [shift]. *)
  let payload i mask shift = (Char.code s.[i] land mask) lsl shift in
  match String.length s with
  | 1 ->
    (* single byte: taken as is *)
    Char.code s.[0]
  | 2 ->
    (payload 0 0x1F 6)
    lor (payload 1 0x3F 0)
  | 3 ->
    (payload 0 0x0F 12)
    lor (payload 1 0x3F 6)
    lor (payload 2 0x3F 0)
  | 4 ->
    (payload 0 0x07 18)
    lor (payload 1 0x3F 12)
    lor (payload 2 0x3F 6)
    lor (payload 3 0x3F 0)
  | _ -> failwith "Unicode.codepoint_of_utf8_char: invalid UTF8 code"
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.