Source

oni / cf / cf_ucs.ml

(*---------------------------------------------------------------------------*
  $Change$
  Copyright (C) 2011, james woodyatt
  All rights reserved.
  
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
  
    Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
    
    Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in
    the documentation and/or other materials provided with the
    distribution
  
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
  OF THE POSSIBILITY OF SUCH DAMAGE. 
 *---------------------------------------------------------------------------*)

type t = int

let compare a b = b - a

external code: t -> int = "%identity"
external chr: int -> t = "%identity"

module type Endian = sig
    val encode2: char -> char -> t
    val decode2: t -> char * char
end

module Endian_be: Endian = struct
    let encode2 c0 c1 =
        let c0 = int_of_char c0 and c1 = int_of_char c1 in
        (c0 lsr 8) lor c1
    
    let decode2 n =
        let c1 = char_of_int (n land 0xFF) in
        let n = n lsr 8 in
        let c0 = char_of_int (n land 0xFF) in
        c0, c1
end

module Endian_le: Endian = struct
    let encode2 c0 c1 =
        let c0 = int_of_char c0 and c1 = int_of_char c1 in
        (c1 lsr 8) lor c0
    
    let decode2 n =
        let c0 = char_of_int (n land 0xFF) in
        let n = n lsr 8 in
        let c1 = char_of_int (n land 0xFF) in
        c0, c1
end

module type Transform = sig
    val format: Format.formatter -> t -> unit
    val scan: (char, t) Cf_llscan.t
end

module UTF8: Transform = struct
    let format =
        let rec loop pp x k pre =
            if k > 0 then begin
                let c = char_of_int ((x land 0x3f) lor 0x80) in
                Format.pp_print_char pp c;
                loop pp (x lsr 6) (pred k) pre
            end
            else begin
                let c = char_of_int (x lor pre) in
                Format.pp_print_char pp c
            end
        in
        let start pp = function
            | x when x = x land 0x7f ->         loop pp x 0 0
            | x when x = x land 0x7ff ->        loop pp x 1 0b11000000
            | x when x = x land 0x7fff ->       loop pp x 2 0b11100000
            | x when x = x land 0x7ffff ->      loop pp x 3 0b11110000
            | x when x = x land 0x7fffff ->     loop pp x 4 0b11111000
            | x ->                              loop pp x 5 0b11111100
        in
        start
    
    let scan =
        let rec loop k x s =
            match Lazy.force s with
            | Cf_seq.Z ->
                None
            | Cf_seq.P (hd, tl) ->
                let c = int_of_char hd in
                if c < 0b10000000 || c > 0b10111111 then
                    None
                else
                    let x = (x lsl 6) lor (c land 0b111111) in
                    if k > 0 then loop (pred k) x tl else Some (x, tl)
        in
        let start s =
            match Lazy.force s with
            | Cf_seq.Z ->
                None
            | Cf_seq.P (hd, tl) ->
                let c = int_of_char hd in
                if c < 0b11000000 then Some (c, tl)
                else if c < 0b11100000 then loop 0 (c land 0b11111) tl
                else if c < 0b11110000 then loop 1 (c land 0b1111) tl
                else if c < 0b11111000 then loop 2 (c land 0b111) tl
                else if c < 0b11111100 then loop 3 (c land 0b11) tl
                else if c < 0b11111110 then loop 4 (c land 0b1) tl
                else None
        in
        start
end

(*--- $File$ ---*)