Markus Mottl avatar Markus Mottl committed 3c1558a

Added syntax for nested block comments, fixed several minor bugs, and cleaned up code

Comments (0)

Files changed (4)

base/sexplib/lib/lexer.mll

     | 't' -> '\t'
     | 'b' -> '\b'
     | 'r' -> '\r'
-    | c   -> c
+    | c -> c
 
-  let double_nl = "\013\010"
+  let dos_newline = "\013\010"
 
   let dec_code c1 c2 c3 =
     100 * (Char.code c1 - 48) + 10 * (Char.code c2 - 48) + (Char.code c3 - 48)
       else d2 - 48 in
     val1 * 16 + val2
 
-  let found_newline lexbuf diff =
-    let curr_p = lexbuf.lex_curr_p in
+  let found_newline ({ lex_curr_p; _ } as lexbuf) diff =
     lexbuf.lex_curr_p <-
       {
-        curr_p with
-        pos_lnum = curr_p.pos_lnum + 1;
-        pos_bol = max 1 (curr_p.pos_cnum - diff);
+        lex_curr_p with
+        pos_lnum = lex_curr_p.pos_lnum + 1;
+        pos_bol = lex_curr_p.pos_cnum - diff;
       }
 
-  let get_lexeme_len lexbuf = lexbuf.lex_curr_pos - lexbuf.lex_start_pos
+  let lexeme_len lexbuf = lexeme_end lexbuf - lexeme_start lexbuf
+
+  let main_failure lexbuf msg =
+    let { pos_lnum; pos_bol; pos_cnum; _ } = lexeme_start_p lexbuf in
+    let msg =
+      sprintf
+        "Sexplib.Lexer.main: %s at line %d char %d"
+        msg pos_lnum (pos_cnum - pos_bol)
+    in
+    failwith msg
 }
 
-let newline = ('\010' | '\013' | "\013\010")
-let space = [' ' '\009' '\012']
-let whitespace = [' ' '\010' '\013' '\009' '\012']
-let backslash_escapes = ['\\' '\'' '"' 'n' 't' 'b' 'r' ' ']
+let lf_cr = ['\010' '\013']
+let dos_newline = "\013\010"
+let newline = lf_cr | dos_newline
+let blank = [' ' '\009' '\012']
+let unquoted = [^ ';' '(' ')' '"'] # blank # lf_cr
+let digit = ['0'-'9']
+let hexdigit = digit | ['a'-'f' 'A'-'F']
+
+let unquoted_start =
+  unquoted # ['#' '|'] | '#' unquoted # ['|'] | '|' unquoted # ['#']
 
 rule main buf = parse
-  | newline { found_newline lexbuf 1; main buf lexbuf }
-  | space+ | ';' [^ '\n' '\r']* { main buf lexbuf }
+  | newline { found_newline lexbuf 0; main buf lexbuf }
+  | blank+ | ';' (_ # lf_cr)* { main buf lexbuf }
   | '(' { LPAREN }
   | ')' { RPAREN }
-  | "#;" { SEXP_COMMENT }
   | '"'
       {
-        scan_string buf lexbuf;
+        scan_string buf (lexeme_start_p lexbuf) lexbuf;
         let str = Buffer.contents buf in
         Buffer.clear buf;
         STRING str
       }
-  | ([^ ';' '(' ')' '"'] # whitespace)+ as str { STRING str }
+  | "#;" { SEXP_COMMENT }
+  | "#|"
+      {
+        scan_block_comment buf [lexeme_start_p lexbuf] lexbuf;
+        main buf lexbuf
+      }
+  | "|#" { main_failure lexbuf "illegal end of comment" }
+  | unquoted_start unquoted* ("#|" | "|#") unquoted*
+      { main_failure lexbuf "comment tokens in unquoted atom" }
+  | "#" | unquoted_start unquoted* as str { STRING str }
   | eof { EOF }
 
-and scan_string buf = parse
+and scan_string buf start = parse
   | '"' { () }
-  | '\\' ['\010' '\013'] [' ' '\009']*
+  | '\\' lf_cr [' ' '\t']*
       {
-        let len = get_lexeme_len lexbuf in
-        found_newline lexbuf (len - 2);
-        scan_string buf lexbuf
+        found_newline lexbuf (lexeme_len lexbuf - 2);
+        scan_string buf start lexbuf
       }
-  | '\\' "\013\010" [' ' '\009']*
+  | '\\' dos_newline [' ' '\t']*
       {
-        let len = get_lexeme_len lexbuf in
-        found_newline lexbuf (len - 3);
-        scan_string buf lexbuf
+        found_newline lexbuf (lexeme_len lexbuf - 3);
+        scan_string buf start lexbuf
       }
-  | '\\' (backslash_escapes as c)
+  | '\\' (['\\' '\'' '"' 'n' 't' 'b' 'r' ' '] as c)
       {
         Buffer.add_char buf (char_for_backslash c);
-        scan_string buf lexbuf
+        scan_string buf start lexbuf
       }
-  | '\\' (['0'-'9'] as c1) (['0'-'9'] as c2) (['0'-'9']  as c3)
+  | '\\' (digit as c1) (digit as c2) (digit as c3)
       {
         let v = dec_code c1 c2 c3 in
         if v > 255 then (
-          let pos = lexbuf.lex_curr_p in
+          let { pos_lnum; pos_bol; pos_cnum; _ } = lexeme_end_p lexbuf in
           let msg =
             sprintf
               "Sexplib.Lexer.scan_string: \
                illegal escape at line %d char %d: `\\%c%c%c'"
-              pos.pos_lnum (pos.pos_cnum - pos.pos_bol - 3)
+              pos_lnum (pos_cnum - pos_bol - 3)
               c1 c2 c3 in
           failwith msg);
         Buffer.add_char buf (Char.chr v);
-        scan_string buf lexbuf
+        scan_string buf start lexbuf
       }
-  | '\\' 'x' (['0'-'9' 'a'-'f' 'A'-'F'] as c1) (['0'-'9' 'a'-'f' 'A'-'F'] as c2)
+  | '\\' 'x' (hexdigit as c1) (hexdigit as c2)
       {
         let v = hex_code c1 c2 in
-        if v > 255 then (
-          let pos = lexbuf.lex_curr_p in
-          let msg =
-            sprintf
-              "Sexplib.Lexer.scan_string: \
-               illegal escape at line %d char %d: `\\x%c%c'"
-              pos.pos_lnum (pos.pos_cnum - pos.pos_bol - 3)
-              c1 c2 in
-          failwith msg);
         Buffer.add_char buf (Char.chr v);
-        scan_string buf lexbuf
+        scan_string buf start lexbuf
       }
   | '\\' (_ as c)
       {
         Buffer.add_char buf '\\';
         Buffer.add_char buf c;
-        scan_string buf lexbuf
+        scan_string buf start lexbuf
       }
-  | ['\010' '\013'] as c
+  | lf_cr as c
       {
-        found_newline lexbuf 1;
+        found_newline lexbuf 0;
         Buffer.add_char buf c;
-        scan_string buf lexbuf
+        scan_string buf start lexbuf
       }
-  | "\013\010"
+  | dos_newline
       {
-        found_newline lexbuf 2;
-        Buffer.add_string buf double_nl;
-        scan_string buf lexbuf
+        found_newline lexbuf 0;
+        Buffer.add_string buf dos_newline;
+        scan_string buf start lexbuf
       }
-  | [^ '\\' '"']+
+  | ([^ '\\' '"'] # lf_cr)+
       {
-        let ofs = lexbuf.lex_start_pos in
-        let len = lexbuf.lex_curr_pos - ofs in
+        let ofs = lexeme_start lexbuf in
+        let len = lexeme_end lexbuf - ofs in
         Buffer.add_substring buf lexbuf.lex_buffer ofs len;
-        scan_string buf lexbuf
+        scan_string buf start lexbuf
       }
-  | eof { failwith "Sexplib.Lexer.scan_string: unterminated string" }
+  | eof
+      {
+        let msg =
+          sprintf
+            "Sexplib.Lexer.scan_string: unterminated string at line %d char %d"
+            start.pos_lnum (start.pos_cnum - start.pos_bol)
+        in
+        failwith msg
+      }
+
+and scan_block_comment buf locs = parse
+  | ('#'* | '|'*) newline
+      {
+        found_newline lexbuf 0;
+        scan_block_comment buf locs lexbuf;
+      }
+  | (('#'* | '|'*) [^ '"' '#' '|'] # lf_cr)+
+      { scan_block_comment buf locs lexbuf }
+  | ('#'* | '|'*) '"'
+      {
+        let cur = lexeme_end_p lexbuf in
+        let start = { cur with pos_cnum = cur.pos_cnum - 1 } in
+        scan_string buf start lexbuf;
+        Buffer.clear buf;
+        scan_block_comment buf locs lexbuf
+      }
+  | '#'+ '|'
+    {
+      let cur = lexeme_end_p lexbuf in
+      let start = { cur with pos_cnum = cur.pos_cnum - 2 } in
+      scan_block_comment buf (start :: locs) lexbuf
+    }
+  | '|'+ '#'
+      {
+        match locs with
+        | [_] -> ()
+        | _ :: t -> scan_block_comment buf t lexbuf
+        | [] -> assert false  (* impossible *)
+      }
+  | eof
+      {
+        match locs with
+        | [] -> assert false
+        | { pos_lnum; pos_bol; pos_cnum; _ } :: _ ->
+            let msg =
+              sprintf "Sexplib.Lexer.scan_block_comment: \
+                unterminated block comment at line %d char %d"
+                pos_lnum (pos_cnum - pos_bol)
+            in
+            failwith msg
+      }
 
 {
   let main ?buf =

base/sexplib/lib/pre_sexp.ml

 
 (* Escaping of strings used as atoms in S-expressions *)
 
-let is_special_char c =
-  c <= ' ' || c = '"' || c = '(' || c = ')' || c = ';' || c = '\\'
-
 let must_escape str =
   let len = String.length str in
   len = 0 ||
-    let rec loop ix = is_special_char str.[ix] || ix > 0 && loop (ix - 1) in
+    let rec loop ix =
+      match str.[ix] with
+      | '"' | '(' | ')' | ';' | '\\' -> true
+      | '|' -> ix > 0 && let next = ix - 1 in str.[next] = '#' || loop next
+      | '#' -> ix > 0 && let next = ix - 1 in str.[next] = '|' || loop next
+      | c -> c <= ' ' || ix > 0 && loop (ix - 1)
+    in
     loop (len - 1)
 
 let maybe_esc_str str =
   let with_buf_pos t buf_pos = { t with buf_pos }
 end
 
+module Cont_state = struct
+  type t =
+    | Parsing_whitespace
+    | Parsing_atom
+    | Parsing_list
+    | Parsing_sexp_comment
+    | Parsing_block_comment
+
+  let to_string = function
+    | Parsing_whitespace -> "Parsing_whitespace"
+    | Parsing_atom -> "Parsing_atom"
+    | Parsing_list -> "Parsing_list"
+    | Parsing_sexp_comment -> "Parsing_sexp_comment"
+    | Parsing_block_comment -> "Parsing_block_comment"
+end
+
 type ('a, 't) parse_result =
   | Done of 't * Parse_pos.t
-  | Cont of bool * ('a, 't) parse_fun
+  | Cont of Cont_state.t * ('a, 't) parse_fun
 
 and ('a, 't) parse_fun = pos : int -> len : int -> 'a -> ('a, 't) parse_result
 
   parse_pos
 
 let raise_parse_error parse_state location buf_pos err_msg =
-  begin
-    match parse_state with
-    | `Sexp { parse_pos; _ } | `Annot { parse_pos; _ } ->
-        set_parse_pos parse_pos buf_pos;
-        parse_pos.Parse_pos.text_char <- parse_pos.Parse_pos.text_char + 1;
-  end;
-  let parse_error = { location; err_msg; parse_state } in
-  raise (Parse_error parse_error)
+  match parse_state with
+  | `Sexp { parse_pos; _ } | `Annot { parse_pos; _ } ->
+      set_parse_pos parse_pos buf_pos;
+      let parse_error = { location; err_msg; parse_state } in
+      raise (Parse_error parse_error)
 
 let raise_unexpected_char parse_state location buf_pos c =
   let err_msg = sprintf "unexpected character: '%c'" c in
   raise_parse_error parse_state location buf_pos err_msg
 
-(* The code below is derived from lexer.mll in the OCaml distribution,
-   which also uses ASCII codes instead of escape sequences to denote
-   special characters. *)
+let mk_cont_parser cont_parse = (); fun _state str ~max_pos ~pos ->
+  let len = max_pos - pos + 1 in
+  cont_parse ~pos ~len str
 
 (* Macro for generating parsers *)
 #define MK_PARSER( \
     if pos_len > str_len then invalid_arg (loc ^ ": pos + len > str_len"); \
     pos_len - 1 \
   \
-  let mk_cont_ws name cont state ~ws_only = \
+  let mk_cont_state name cont state ~cont_state = \
     let parse_fun = \
       let used_ref = ref false in \
       fun ~pos ~len str -> \
           cont state str ~max_pos ~pos \
         end \
     in \
-    Cont (ws_only, parse_fun) \
+    Cont (cont_state, parse_fun) \
+  \
   let mk_cont name cont state = \
-    mk_cont_ws name cont state \
-      ~ws_only:(GET_PSTACK = [] && Buffer.length state.pbuf = 0) \
+    let cont_state = \
+      match GET_PSTACK = [], Buffer.length state.pbuf = 0 with \
+      | true, true -> Cont_state.Parsing_whitespace \
+      | false, true -> Cont_state.Parsing_list \
+      | _, false -> Cont_state.Parsing_atom \
+    in \
+    mk_cont_state name cont state ~cont_state \
   \
   let rec PARSE state str ~max_pos ~pos = \
     if pos > max_pos then mk_cont "parse" PARSE state \
       | c -> \
           REGISTER_POS \
           let parse = \
-            if c = '#' then maybe_parse_sexp_comment else parse_atom \
+            match c with \
+            | '#' -> maybe_parse_comment \
+            | '|' -> maybe_parse_close_comment \
+            | _ -> parse_atom \
           in \
           add_bump_pos state str ~max_pos ~pos c parse \
   \
       | '\013' -> bump_line_cont state str ~max_pos ~pos parse_nl \
       | _ -> bump_pos_cont state str ~max_pos ~pos parse_comment \
   \
-  and maybe_parse_sexp_comment state str ~max_pos ~pos = \
+  and maybe_parse_comment state str ~max_pos ~pos = \
     if pos > max_pos then \
-      mk_cont "maybe_parse_sexp_comment" maybe_parse_sexp_comment state \
+      mk_cont "maybe_parse_comment" maybe_parse_comment state \
     else \
       match GET_CHAR with \
       | ';' -> bump_pos_cont state str ~max_pos ~pos parse_sexp_comment \
+      | '|' -> bump_pos_cont state str ~max_pos ~pos parse_block_comment \
       | _ -> parse_atom state str ~max_pos ~pos \
   \
+  and maybe_parse_close_comment state str ~max_pos ~pos = \
+    if pos > max_pos then \
+      mk_cont "maybe_parse_close_comment" maybe_parse_close_comment state \
+    else \
+      if GET_CHAR <> '#' then parse_atom state str ~max_pos ~pos \
+      else \
+        let err_msg = "end of block comment without start" in \
+        raise_parse_error (MK_PARSE_STATE state) \
+          "maybe_parse_close_comment" pos err_msg \
+  \
   and parse_sexp_comment state str ~max_pos ~pos = \
     let pbuf_str = "" in \
     ignore (MK_ATOM); \
     let rec loop parse state str ~max_pos ~pos = \
       Buffer.clear state.pbuf; \
       match parse state str ~max_pos ~pos with \
-      | Done (_sexp, parse_pos) -> \
+      | Done (_sexp, { Parse_pos.buf_pos = pos; _ }) -> \
           Buffer.clear state.pbuf; \
           let pstack = old_pstack in \
           SET_PSTACK; \
-          PARSE state str ~max_pos ~pos:parse_pos.Parse_pos.buf_pos \
+          PARSE state str ~max_pos ~pos \
       | Cont (_, cont_parse) -> \
-          let parse _state str ~max_pos ~pos = \
-            let len = max_pos - pos + 1 in \
-            cont_parse ~pos ~len str \
-          in \
-          mk_cont_ws "parse_sexp_comment" (loop parse) state ~ws_only:false \
+          let parse = mk_cont_parser cont_parse in \
+          mk_cont_state "parse_sexp_comment" (loop parse) state \
+            ~cont_state:Cont_state.Parsing_sexp_comment \
     in \
     loop PARSE state str ~max_pos ~pos \
+  \
+  and parse_block_comment ({ pbuf; _ } as state) str ~max_pos ~pos = \
+    let pbuf_str = "" in \
+    ignore (MK_ATOM); \
+    Buffer.clear pbuf; \
+    let rec loop depth state str ~max_pos ~pos = \
+      let rec parse_block_depth state str ~max_pos ~pos = \
+        if pos > max_pos then \
+          mk_cont "parse_block_depth" parse_block_depth state \
+        else \
+          match GET_CHAR with \
+          | '\010' -> bump_line_cont state str ~max_pos ~pos parse_block_depth \
+          | '\013' -> \
+              bump_line_cont state str ~max_pos ~pos parse_block_depth_nl \
+          | '"' -> \
+              let rec parse_block_quote parse state str ~max_pos ~pos = \
+                match parse state str ~max_pos ~pos with \
+                | Done (_sexp, { Parse_pos.buf_pos = pos; _ }) -> \
+                    Buffer.clear pbuf; \
+                    parse_block_depth state str ~max_pos ~pos \
+                | Cont (_, cont_parse) -> \
+                    let parse = mk_cont_parser cont_parse in \
+                    mk_cont_state "parse_block_quote" \
+                      (parse_block_quote parse) state \
+                      ~cont_state:Cont_state.Parsing_block_comment \
+              in \
+              bump_pos_cont state str ~max_pos ~pos \
+                (parse_block_quote parse_quoted) \
+          | '#' -> bump_pos_cont state str ~max_pos ~pos parse_open_block \
+          | '|' -> bump_pos_cont state str ~max_pos ~pos parse_close_block \
+          | _ -> bump_pos_cont state str ~max_pos ~pos parse_block_depth \
+      and parse_block_depth_nl state str ~max_pos ~pos = \
+        if pos > max_pos then \
+          mk_cont "parse_block_depth_nl" parse_block_depth_nl state \
+        else \
+          let pos = if GET_CHAR = '\010' then pos + 1 else pos in \
+          parse_block_depth state str ~max_pos ~pos \
+      and parse_open_block state str ~max_pos ~pos = \
+        if pos > max_pos then \
+          mk_cont "parse_open_block" parse_open_block state \
+        else \
+          if GET_CHAR = '|' then \
+            bump_pos_cont state str ~max_pos ~pos (loop (depth + 1)) \
+          else parse_block_depth state str ~max_pos ~pos \
+      and parse_close_block state str ~max_pos ~pos = \
+        if pos > max_pos then \
+          mk_cont "parse_close_block" parse_close_block state \
+        else \
+          if GET_CHAR = '#' then \
+            let parse = if depth = 1 then PARSE else loop (depth - 1) in \
+            bump_pos_cont state str ~max_pos ~pos parse \
+          else parse_block_depth state str ~max_pos ~pos \
+      in \
+      parse_block_depth state str ~max_pos ~pos \
+    in \
+    loop 1 state str ~max_pos ~pos \
+  \
   and parse_atom state str ~max_pos ~pos = \
     if pos > max_pos then mk_cont "parse_atom" parse_atom state \
     else \
       match GET_CHAR with \
       | ' ' | '\009' | '\012' -> \
           bump_found_atom bump_text_pos state str ~max_pos ~pos PARSE \
+      | '#' as c -> \
+          add_bump_pos state str ~max_pos ~pos c maybe_parse_bad_atom_hash \
+      | '|' as c -> \
+          add_bump_pos state str ~max_pos ~pos c maybe_parse_bad_atom_pipe \
       | '(' -> \
           let pbuf = state.pbuf in \
           let pbuf_str = Buffer.contents pbuf in \
             bump_text_pos state str ~max_pos ~pos reg_parse_quoted \
       | c -> add_bump_pos state str ~max_pos ~pos c parse_atom \
   \
+  and maybe_parse_bad_atom_pipe state str ~max_pos ~pos = \
+    if pos > max_pos then \
+      mk_cont "maybe_parse_bad_atom_pipe" maybe_parse_bad_atom_pipe state \
+    else \
+      match GET_CHAR with \
+      | '#' -> \
+          let err_msg = "illegal end of block comment in unquoted atom" in \
+          raise_parse_error (MK_PARSE_STATE state) "maybe_parse_bad_atom_pipe" \
+            pos err_msg \
+      | _ -> parse_atom state str ~max_pos ~pos \
+  \
+  and maybe_parse_bad_atom_hash state str ~max_pos ~pos = \
+    if pos > max_pos then \
+      mk_cont "maybe_parse_bad_atom_hash" maybe_parse_bad_atom_hash state \
+    else \
+      match GET_CHAR with \
+      | '|' -> \
+          let err_msg = "illegal start of block comment in unquoted atom" in \
+          raise_parse_error (MK_PARSE_STATE state) "maybe_parse_bad_atom_hash" \
+            pos err_msg \
+      | _ -> parse_atom state str ~max_pos ~pos \
+  \
   and reg_parse_quoted state str ~max_pos ~pos = \
     REGISTER_POS \
     parse_quoted state str ~max_pos ~pos \
+  \
   and parse_quoted state str ~max_pos ~pos = \
     if pos > max_pos then mk_cont "parse_quoted" parse_quoted state \
     else \
 let gen_input_rev_sexps my_parse ?parse_pos ?(buf = String.create 8192) ic =
   let rev_sexps_ref = ref [] in
   let buf_len = String.length buf in
-  let rec loop this_parse ~pos ~len ~is_incomplete =
+  let rec loop this_parse ~pos ~len ~cont_state =
     if len > 0 then
       match this_parse ~pos ~len buf with
       | Done (sexp, ({ Parse_pos.buf_pos; _ } as parse_pos)) ->
           rev_sexps_ref := sexp :: !rev_sexps_ref;
           let n_parsed = buf_pos - pos in
           let this_parse = mk_this_parse ~parse_pos my_parse in
+          let cont_state = Cont_state.Parsing_whitespace in
           if n_parsed = len then
             let new_len = input ic buf 0 buf_len in
-            loop this_parse ~pos:0 ~len:new_len ~is_incomplete:false
-          else
-            loop this_parse
-              ~pos:buf_pos ~len:(len - n_parsed) ~is_incomplete:false
-      | Cont (ws_only, this_parse) ->
-          loop this_parse
-            ~pos:0 ~len:(input ic buf 0 buf_len) ~is_incomplete:(not ws_only)
-    else if is_incomplete then
-      failwith
-        "Sexplib.Sexp.input_rev_sexps: reached EOF with incomplete S-expression"
-    else !rev_sexps_ref
+            loop this_parse ~pos:0 ~len:new_len ~cont_state
+          else loop this_parse ~pos:buf_pos ~len:(len - n_parsed) ~cont_state
+      | Cont (cont_state, this_parse) ->
+          loop this_parse ~pos:0 ~len:(input ic buf 0 buf_len) ~cont_state
+    else
+      if cont_state = Cont_state.Parsing_whitespace then !rev_sexps_ref
+      else
+        failwith (
+          "Sexplib.Sexp.input_rev_sexps: reached EOF while in state "
+          ^ Cont_state.to_string cont_state)
   in
   let len = input ic buf 0 buf_len in
   let this_parse = mk_this_parse ?parse_pos my_parse in
-  loop this_parse ~pos:0 ~len ~is_incomplete:false
+  loop this_parse ~pos:0 ~len ~cont_state:Cont_state.Parsing_whitespace
 
 let input_rev_sexps ?parse_pos ?buf ic =
   gen_input_rev_sexps parse ?parse_pos ?buf ic
       in
       failwith msg
   | Done (sexp, _) -> sexp
-  | Cont (ws_only, this_parse) ->
-      if ws_only then failwith (sprintf "Sexplib.Sexp.%s: whitespace only" loc);
+  | Cont (_, this_parse) ->
       (* When parsing atoms, the incremental parser cannot tell whether
-         it is at the end until it hits whitespace.  We therefore feed
-         it one space to determine whether it is finished. *)
+         it is at the end until it hits whitespace.  We therefore feed it
+         one space to determine whether it is finished. *)
       match this_parse ~pos:0 ~len:1 ws_buf with
       | Done (sexp, _) -> sexp
-      | Cont _ ->
+      | Cont (cont_state, _) ->
+          let cont_state_str = Cont_state.to_string cont_state in
           failwith (
-            sprintf "Sexplib.Sexp.%s: got incomplete S-expression: %s"
-              loc (get_sub str 0 (get_len str)))
+            sprintf
+              "Sexplib.Sexp.%s: incomplete S-expression while in state %s: %s"
+              loc cont_state_str (get_sub str 0 (get_len str)))
 
 let of_string str =
   of_string_bigstring "of_string" parse " " String.length String.sub str
 let gen_load_sexp my_parse ?(strict = true) ?(buf = String.create 8192) file =
   let buf_len = String.length buf in
   let ic = open_in file in
-  let rec loop this_parse =
+  let rec loop this_parse ~cont_state =
     let len = input ic buf 0 buf_len in
     if len = 0 then
-      failwith (sprintf "Sexplib.Sexp.gen_load_sexp: end of file: %s" file)
+      failwith (
+        sprintf "Sexplib.Sexp.gen_load_sexp: EOF in %s while in state %s"
+          file (Cont_state.to_string cont_state))
     else
       match this_parse ~pos:0 ~len buf with
-      | Done (sexp, ({ Parse_pos.buf_pos; _ } as parse_pos))
-        when strict ->
+      | Done (sexp, ({ Parse_pos.buf_pos; _ } as parse_pos)) when strict ->
           let rec strict_loop this_parse ~pos ~len =
             match this_parse ~pos ~len buf with
-            | Done _ | Cont (false, _) ->
+            | Done _ ->
                 failwith (
                   sprintf
-                    "Sexplib.Sexp.gen_load_sexp: more than one S-expression: %s"
-                      file)
-            | Cont (true, this_parse) ->
+                    "Sexplib.Sexp.gen_load_sexp: \
+                     more than one S-expression in file %s"
+                    file)
+            | Cont (cont_state, this_parse) ->
                 let len = input ic buf 0 buf_len in
-                if len = 0 then sexp
-                else strict_loop this_parse ~pos:0 ~len
+                if len > 0 then strict_loop this_parse ~pos:0 ~len
+                else if cont_state = Cont_state.Parsing_whitespace then sexp
+                else
+                  failwith (
+                    sprintf
+                      "Sexplib.Sexp.gen_load_sexp: \
+                      additional incomplete data in state %s loading file %s"
+                      (Cont_state.to_string cont_state) file)
           in
           let this_parse = mk_this_parse ~parse_pos my_parse in
           strict_loop this_parse ~pos:buf_pos ~len:(len - buf_pos)
       | Done (sexp, _) -> sexp
-      | Cont (_, this_parse) -> loop this_parse
+      | Cont (cont_state, this_parse) -> loop this_parse ~cont_state
   in
   try
-    let sexp = loop (mk_this_parse my_parse) in
+    let sexp =
+      loop (mk_this_parse my_parse) ~cont_state:Cont_state.Parsing_whitespace
+    in
     close_in ic;
     sexp
   with exc -> close_in_noerr ic; raise exc

base/sexplib/lib/sexp_intf.ml

         [buf_pos] is set to [pos]. *)
   end
 
+  module Cont_state : sig
+    (** State of parser continuations *)
+    type t = Pre_sexp.Cont_state.t =
+      | Parsing_whitespace
+      | Parsing_atom
+      | Parsing_list
+      | Parsing_sexp_comment
+      | Parsing_block_comment
+
+    val to_string : t -> string
+    (** [to_string cont_state] converts state of parser continuation
+        [cont_state] to a string. *)
+  end
+
   (** Type of result from calling {!Sexp.parse}. *)
   type ('a, 't) parse_result = ('a, 't) Pre_sexp.parse_result =
     | Done of 't * Parse_pos.t  (** [Done (t, parse_pos)] finished parsing
                                     an S-expression.  Current parse position
                                     is [parse_pos]. *)
-    | Cont of bool * ('a, 't) parse_fun
-      (** [Cont (ws_only, parse_fun)] met the end of input before completely
+    | Cont of Cont_state.t * ('a, 't) parse_fun
+      (** [Cont (cont_state, parse_fun)] met the end of input before completely
           parsing an S-expression.  The user has to call [parse_fun] to
-          continue parsing the S-expression in another buffer.  If [ws_only]
-          is true, only whitespace has been parsed so far (or comments!).
+          continue parsing the S-expression in another buffer.  [cont_state]
+          is the current parsing state of the continuation.
           NOTE: the continuation may only be called once and will raise
           [Failure] otherwise! *)
 

base/sexplib/lib_test/test.sexp

 
   "This string contains decimal \255, hex \xff codes, \
    and other \\ \n escapes."
+
+  A# # ## #x|
 )
+
+; Line comment
+
+#; (
+  S-expression comment
+)
+
+#| #| Nested |# block comment "|#" |#
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.