Commits

Florian Zumbiehl  committed c2ad9e8

Fix parsing of UTF-16 surrogate pairs, implement strict UTF-8
validation, make string unparser escape control characters.

  • Participants
  • Parent commits 7f99ad3

Comments (0)

Files changed (2)

   (bind (as-string
          (preceded-by (is #\u)
                       (repeated (in char-set:hex-digit) 4)))
-        handle-unicode))
+        (lambda (lead-hex)
+          (let ((lead (string->number lead-hex 16)))
+            (if (<= #xD800 lead #xDBFF)
+              (bind (as-string
+                      (preceded-by (char-seq "\\u")
+                                   (repeated (in char-set:hex-digit) 4)))
+                    (lambda (trail-hex)
+                      (let ((trail (string->number trail-hex 16)))
+                        (if (<= #xDC00 trail #xDFFF)
+                          (handle-unicode
+                            (+ #x010000
+                               (bitwise-ior
+                                 (arithmetic-shift (- lead #xD800) 10)
+                                 (- trail #xDC00))))
+                          fail))))
+              (if (<= #xDC00 lead #xDFFF)
+                fail
+                (handle-unicode lead)))))))
+
+(define (ucs-range->char-set/inclusive lower upper)
+  (ucs-range->char-set lower (add1 upper)))
+
+(define utf8-tail
+  (in (ucs-range->char-set/inclusive #x80 #xBF)))
+
+(define utf8-1
+  (in (ucs-range->char-set/inclusive #x00 #x7F)))
+
+(define utf8-2
+  (sequence
+    (in (ucs-range->char-set/inclusive #xC2 #xDF))
+    utf8-tail))
+
+(define utf8-3
+  (any-of
+    (sequence
+      (is #\xE0)
+      (in (ucs-range->char-set/inclusive #xA0 #xBF))
+      utf8-tail)
+    (sequence
+      (in (ucs-range->char-set/inclusive #xE1 #xEC))
+      (repeated utf8-tail 2))
+    (sequence
+      (is #\xED)
+      (in (ucs-range->char-set/inclusive #x80 #x9F))
+      utf8-tail)
+    (sequence
+      (in (ucs-range->char-set/inclusive  #xEE #xEF))
+      (repeated utf8-tail 2))))
+
+(define utf8-4
+  (any-of
+    (sequence
+      (is #\xF0)
+      (in (ucs-range->char-set/inclusive #x90 #xBF))
+      (repeated utf8-tail 2))
+    (sequence
+      (in (ucs-range->char-set/inclusive #xF1 #xF3))
+      (repeated utf8-tail 3))
+    (sequence
+      (is #\xF4)
+      (in (ucs-range->char-set/inclusive #x80 #x8F))
+      (repeated utf8-tail 2))))
+
+(define utf8-char
+  (any-of
+    utf8-1
+    utf8-2
+    utf8-3
+    utf8-4))
 
 ;; char    <- #\\ ( escape / unicode ) / (![[:cntrl:]"\\] .)
 
 (define char
   (any-of (preceded-by (is #\\) (any-of escape unicode fail))
-          (none-of* (in char-set:json-char) item)))
+          (none-of* (in char-set:json-char) utf8-char)))
 
 ;; raw-string <- #\" char* #\"
 ;;            -> handle-string/raw
      (except comparse)
      (only srfi-1 cons* find remove)
      (only data-structures compose constantly identity alist-ref conc string-translate*)
-     (only srfi-13 string-pad)
+     (only srfi-13 string-pad string-index substring/shared string-concatenate-reverse/shared)
      (only ports with-output-to-port with-output-to-string))
 
 (define (vector-for-each proc vec)
   (result ((parser-ref 'string) string)))
 
 (define (handle-unicode u)
-  (result (##sys#char->utf8-string (integer->char (string->number u 16)))))
+  (result (##sys#char->utf8-string (integer->char u))))
 
 (define (handle-escape e)
   (result
                  (else input))
            memoize: memoize?)))
 
+(define (escape-control-chars s)
+  (let loop ((i 0)
+             (r '()))
+    (let ((j (string-index s (ucs-range->char-set #x0 #x20) i)))
+      (if j
+        (loop (add1 j)
+              (cons* (string-pad (number->string (char->integer (string-ref s j)) 16) 4 #\0)
+                     "\\u"
+                     (substring/shared s i j)
+                     r))
+        (string-concatenate-reverse/shared r (substring/shared s i))))))
+
 (define (unparse-string s)
   (display #\")
-  (display (string-translate* s json-escapes))
+  (display (escape-control-chars (string-translate* s json-escapes)))
   (display #\"))
 
 (define (for-each/delimiter proc list delimiter)