Source

oni / ucs / ucs_code.ml

Full commit
(*---------------------------------------------------------------------------*
  $Change$
  Copyright (C) 2011, james woodyatt
  All rights reserved.
  
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
  
    Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
    
    Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in
    the documentation and/or other materials provided with the
    distribution
  
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
  OF THE POSSIBILITY OF SUCH DAMAGE. 
 *---------------------------------------------------------------------------*)

let ( >>= ) = Cf_llscan.bind

type t = int

let compare a b = b - a

external code: t -> int = "%identity"
external chr: int -> t = "%identity"

module type Transform = sig
    val emit: Format.formatter -> t -> unit
    val scan: (char, t) Cf_llscan.t
end

module UTF_8: Transform = struct
    let emit =
        let rec loop pp x k pre =
            if k > 0 then begin
                let c = char_of_int ((x land 0x3f) lor 0x80) in
                Format.pp_print_char pp c;
                loop pp (x lsr 6) (pred k) pre
            end
            else begin
                let c = char_of_int (x lor pre) in
                Format.pp_print_char pp c
            end
        in
        let start pp = function
            | x when x = x land 0x7f ->         loop pp x 0 0
            | x when x = x land 0x7ff ->        loop pp x 1 0b11000000
            | x when x = x land 0x7fff ->       loop pp x 2 0b11100000
            | x when x = x land 0x7ffff ->      loop pp x 3 0b11110000
            | x when x = x land 0x7fffff ->     loop pp x 4 0b11111000
            | x ->                              loop pp x 5 0b11111100
        in
        start
    
    let scan =
        let rec loop k x s =
            match Lazy.force s with
            | Cf_seq.Z ->
                None
            | Cf_seq.P (hd, tl) ->
                let c = int_of_char hd in
                if c < 0b10000000 || c > 0b10111111 then
                    None
                else
                    let x = (x lsl 6) lor (c land 0b111111) in
                    if k > 0 then loop (pred k) x tl else Some (x, tl)
        in
        let start s =
            match Lazy.force s with
            | Cf_seq.Z ->
                None
            | Cf_seq.P (hd, tl) ->
                let c = int_of_char hd in
                if c < 0b11000000 then Some (c, tl)
                else if c < 0b11100000 then loop 0 (c land 0b11111) tl
                else if c < 0b11110000 then loop 1 (c land 0b1111) tl
                else if c < 0b11111000 then loop 2 (c land 0b111) tl
                else if c < 0b11111100 then loop 3 (c land 0b11) tl
                else if c < 0b11111110 then loop 4 (c land 0b1) tl
                else None
        in
        start
end

module type UCS_2_endian = sig
    val emit2: Format.formatter -> int -> unit
    val scan2: (char, int) Cf_llscan.t
end

module UCS_2_big_endian: UCS_2_endian = struct
    let emit2 pp n =
        let c1 = char_of_int (n land 0xFF) in
        let n = n lsr 8 in
        let c0 = char_of_int (n land 0xFF) in
        Format.pp_print_char pp c0;
        Format.pp_print_char pp c1
    
    let scan2 =
        let lift c = Some (int_of_char c) in
        Cf_llscan.tok lift >>= fun c0 ->
        Cf_llscan.tok lift >>= fun c1 ->
        Cf_llscan.ret ((c0 lsr 8) lor c1)
end

module UCS_2_little_endian: UCS_2_endian = struct
    let emit2 pp n =
        let c1 = char_of_int (n land 0xFF) in
        let n = n lsr 8 in
        let c0 = char_of_int (n land 0xFF) in
        Format.pp_print_char pp c1;
        Format.pp_print_char pp c0
    
    let scan2 =
        let lift c = Some (int_of_char c) in
        Cf_llscan.tok lift >>= fun c0 ->
        Cf_llscan.tok lift >>= fun c1 ->
        Cf_llscan.ret ((c1 lsr 8) lor c0)
end

module UTF_16_functor(UCS2: UCS_2_endian) = struct
    let emit pp n =
        if n = n land 0xffff then
            UCS2.emit2 pp n
        else if n > 0 && n < 0x110000 then begin
            let n = n - 0x10000 in
            UCS2.emit2 pp (0xd800 lor ((n lsr 10) land 0x3ff));
            UCS2.emit2 pp (0xdc00 lor (n land 0x3ff))
        end
        else
            UCS2.emit2 pp 0xfffd
    
    let scan =
        UCS2.scan2 >>= fun c ->
        assert (c >= 0 && c < 0xffff);
        if c < 0xd800 || c >= 0xe000 then
            Cf_llscan.ret c
        else begin
            if c lsr 10 <> 0xd8 then
                Cf_llscan.nil
            else begin
                let c0 = c land 0x3ff in
                UCS2.scan2 >>= fun c ->
                if c lsr 10 <> 0xdc then
                    Cf_llscan.nil
                else begin
                    let c1 = c land 0x3ff in
                    Cf_llscan.ret ((c0 lsl 10) lor c1)
                end
            end
        end
end

module UTF_16BE = UTF_16_functor(UCS_2_big_endian)
module UTF_16LE = UTF_16_functor(UCS_2_little_endian)

type endian = [ `BE | `LE ]

let emit_bom pp = function
    | `BE ->
        Format.pp_print_char pp '\xff';
        Format.pp_print_char pp '\xfe'
    | `LE ->
        Format.pp_print_char pp '\xfe';
        Format.pp_print_char pp '\xff'

let scan_bom =
    Cf_llscan.any >>= fun c0 ->
    Cf_llscan.any >>= fun c1 ->
    match c0, c1 with
    | '\xff', '\xfe' -> Cf_llscan.ret `BE
    | '\xfe', '\xff' -> Cf_llscan.ret `LE
    | _, _ -> Cf_llscan.nil

(*--- $File$ ---*)