Commits

Yaroslav Stavnichiy committed 4c3e564

Added unicode sequence processing.

Comments (0)

Files changed (19)

 
 ##Features
 
-- Parse JSON from null-terminated string
+- Parses JSON from null-terminated string
 - Easy to use tree traversal API
-- Unescape string values (except Unicode)
-- Comments // line and /\* block \*/ skipped
+- Allows // line and /\* block \*/ comments (except before colon :)
+- Operates on single-byte or multi-byte characters (like UTF-8), not wide characters
+- Unescapes string values (including Unicode codepoints & surrogates)
+- Can use custom Unicode encoder, UTF-8 encoder built in
+- Can use custom memory allocator
+- Can use custom macro to print errors
 - Test suite included
 
-##Limitations
+## Limitations
 
-- No Unicode support (\uXXXX escape sequences remain untouched)
-- Not validating parser; might accept invalid JSON (eg., extra or missing commas, comments, octal or hex numeric values, etc.)
+- Non-validating parser; might accept invalid JSON (e.g., extra or missing commas, comments, octal or hex numeric values, etc.)
 
-##API
+## API
 
 Parsed JSON tree consists of nodes. Each node has type:
 
     } nx_json;
 
 
-Parse function:
+#### Parsing
 
-      const nx_json* nx_json_parse(char* text);
+    const nx_json* nx_json_parse(char* text, nx_json_unicode_encoder encoder);
 
-Parse null-terminated string `text` into `nx_json` tree structure. The string is **modified in place**.
+Parses null-terminated string `text` into `nx_json` tree structure. The string is **modified in place**.
 
 Parsing ends right after retrieving first valid JSON value. Remainder of the text is not analysed.
 
-Returns `NULL` on syntax error. Error details are printed out using re-definable macro `NX_JSON_REPORT_ERROR(msg,ptr)`.
+Returns `NULL` on syntax error. Error details are printed out using user-redefinable macro `NX_JSON_REPORT_ERROR(msg, ptr)`.
 
-Inside parse function `nx_json` nodes get allocated using re-definable macro `NX_JSON_CALLOC()` and freed by `NX_JSON_FREE(json)`.
+Inside the parse function, `nx_json` nodes are allocated using the user-redefinable macro `NX_JSON_CALLOC()` and freed by `NX_JSON_FREE(json)`.
 
 All `text_value` pointers refer to the content of original `text` string, which is modified in place to unescape and null-terminate JSON string literals.
 
+`encoder` is a function defined as follows:
 
-      void nx_json_free(const nx_json* js);
+    int unicode_to_my_encoding(unsigned int codepoint, char* p, char** endp) { ... }
+
+The encoder takes a Unicode codepoint and writes the corresponding encoded value into the buffer pointed to by `p`. It should store a pointer to the end of the encoded value in `*endp`. The function should return 1 on success and 0 on error. The number of bytes written must not exceed 6.
+
+NXJSON includes sample encoder `nx_json_unicode_to_utf8`, which converts all `\uXXXX` escapes into UTF-8 sequences.
+
+If the `encoder` parameter is `NULL`, all Unicode escape sequences (`\uXXXX`) are ignored (remain untouched).
+
+
+    const nx_json* nx_json_parse_utf8(char* text);
+
+This is a shortcut for `nx_json_parse(text, nx_json_unicode_to_utf8)`, where `nx_json_unicode_to_utf8` is the Unicode-to-UTF-8 encoder provided by NXJSON.
+
+
+    void nx_json_free(const nx_json* js);
 
 Free resources (`nx_json` nodes) allocated by `nx_json_parse()`.
 
 
-      const nx_json* nx_json_get(const nx_json* json, const char* key);
+#### Traversal
+
+    const nx_json* nx_json_get(const nx_json* json, const char* key);
 
 Get object's property by key.
 
 If there is no such property returns *dummy* node of type `NX_JSON_NULL`. Never returns literal `NULL`.
 
 
-      const nx_json* nx_json_item(const nx_json* json, int idx);
+    const nx_json* nx_json_item(const nx_json* json, int idx);
 
 Get array's item by its index.
 
 If there is no such item/property returns *dummy* node of type `NX_JSON_NULL`. Never returns literal `NULL`.
 
 
-##Usage Example
+## Usage Example
 
 JSON code:
 
 
 C API:
 
-    const nx_json* json=nx_json_parse(code);
+    const nx_json* json=nx_json_parse(code, 0);
     if (json) {
       printf("some-int=%ld\n", nx_json_get(json, "some-int")->int_value);
       printf("some-dbl=%lf\n", nx_json_get(json, "some-dbl")->dbl_value);
-      printf("some-bool=%ld\n", nx_json_get(json, "some-bool")->int_value);
+      printf("some-bool=%s\n", nx_json_get(json, "some-bool")->int_value? "true":"false");
       printf("some-null=%s\n", nx_json_get(json, "some-null")->text_value);
       printf("hello=%s\n", nx_json_get(json, "hello")->text_value);
       printf("other=%s\n", nx_json_get(json, "other")->text_value);
   NX_JSON_FREE(js);
 }
 
-static char* unescape_string(char* s, char** end) {
+static int unicode_to_utf8(unsigned int codepoint, char* p, char** endp) {
+  // code from http://stackoverflow.com/a/4609989/697313
+  if (codepoint<0x80) *p++=codepoint;
+  else if (codepoint<0x800) *p++=192+codepoint/64, *p++=128+codepoint%64;
+  else if (codepoint-0xd800u<0x800) return 0; // surrogate must have been treated earlier
+  else if (codepoint<0x10000) *p++=224+codepoint/4096, *p++=128+codepoint/64%64, *p++=128+codepoint%64;
+  else if (codepoint<0x110000) *p++=240+codepoint/262144, *p++=128+codepoint/4096%64, *p++=128+codepoint/64%64, *p++=128+codepoint%64;
+  else return 0; // error
+  *endp=p;
+  return 1;
+}
+
+nx_json_unicode_encoder nx_json_unicode_to_utf8=unicode_to_utf8;
+
+static inline int hex_val(char c) {
+  if (c>='0' && c<='9') return c-'0';
+  if (c>='a' && c<='f') return c-'a'+10;
+  if (c>='A' && c<='F') return c-'A'+10;
+  return -1;
+}
+
+static char* unescape_string(char* s, char** end, nx_json_unicode_encoder encoder) {
   char* p=s;
   char* d=s;
   char c;
         case '\\':
         case '/':
         case '"':
-          c=*p++;
+          *d++=*p++;
           break;
         case 'b':
-          c='\b'; p++;
+          *d++='\b'; p++;
           break;
         case 'f':
-          c='\f'; p++;
+          *d++='\f'; p++;
           break;
         case 'n':
-          c='\n'; p++;
+          *d++='\n'; p++;
           break;
         case 'r':
-          c='\r'; p++;
+          *d++='\r'; p++;
           break;
         case 't':
-          c='\t'; p++;
+          *d++='\t'; p++;
           break;
-        case 'u': // unicode unescape not implemented
+        case 'u': // unicode
+          if (!encoder) {
+            // leave untouched
+            *d++=c;
+            break;
+          }
+          char* ps=p-1;
+          int h1, h2, h3, h4;
+          if ((h1=hex_val(p[1]))<0 || (h2=hex_val(p[2]))<0 || (h3=hex_val(p[3]))<0 || (h4=hex_val(p[4]))<0) {
+            NX_JSON_REPORT_ERROR("invalid unicode escape", p-1);
+            return 0;
+          }
+          unsigned int codepoint=h1<<12|h2<<8|h3<<4|h4;
+          if ((codepoint & 0xfc00)==0xd800) { // high surrogate; need one more unicode to succeed
+            p+=6;
+            if (p[-1]!='\\' || *p!='u' || (h1=hex_val(p[1]))<0 || (h2=hex_val(p[2]))<0 || (h3=hex_val(p[3]))<0 || (h4=hex_val(p[4]))<0) {
+              NX_JSON_REPORT_ERROR("invalid unicode surrogate", ps);
+              return 0;
+            }
+            unsigned int codepoint2=h1<<12|h2<<8|h3<<4|h4;
+            if ((codepoint2 & 0xfc00)!=0xdc00) {
+              NX_JSON_REPORT_ERROR("invalid unicode surrogate", ps);
+              return 0;
+            }
+            codepoint=0x10000+((codepoint-0xd800)<<10)+(codepoint2-0xdc00);
+          }
+          if (!encoder(codepoint, d, &d)) {
+            NX_JSON_REPORT_ERROR("invalid codepoint", ps);
+            return 0;
+          }
+          p+=5;
+          break;
+        default:
+          // leave untouched
+          *d++=c;
           break;
       }
-      *d++=c;
     }
     else {
       *d++=c;
   return p+1;
 }
 
-static char* parse_key(const char** key, char* p) {
+static char* parse_key(const char** key, char* p, nx_json_unicode_encoder encoder) {
   // on '}' return with *p=='}'
   char c;
   while ((c=*p++)) {
     if (c=='"') {
-      *key=unescape_string(p, &p);
+      *key=unescape_string(p, &p, encoder);
       if (!*key) return 0; // propagate error
       while (*p && IS_WHITESPACE(*p)) p++;
       if (*p==':') return p+1;
   return 0; // error
 }
 
-static char* parse_value(nx_json* parent, const char* key, char* p) {
+static char* parse_value(nx_json* parent, const char* key, char* p, nx_json_unicode_encoder encoder) {
   nx_json* js;
   while (1) {
     switch (*p) {
         p++;
         while (1) {
           const char* new_key;
-          p=parse_key(&new_key, p);
+          p=parse_key(&new_key, p, encoder);
           if (!p) return 0; // error
           if (*p=='}') return p+1; // end of object
-          p=parse_value(js, new_key, p);
+          p=parse_value(js, new_key, p, encoder);
           if (!p) return 0; // error
         }
       case '[':
         js=create_json(NX_JSON_ARRAY, key, parent);
         p++;
         while (1) {
-          p=parse_value(js, 0, p);
+          p=parse_value(js, 0, p, encoder);
           if (!p) return 0; // error
           if (*p==']') return p+1; // end of array
         }
       case '"':
         p++;
         js=create_json(NX_JSON_STRING, key, parent);
-        js->text_value=unescape_string(p, &p);
+        js->text_value=unescape_string(p, &p, encoder);
         if (!js->text_value) return 0; // propagate error
         return p;
       case '-': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
   }
 }
 
-const nx_json* nx_json_parse(char* text) {
+const nx_json* nx_json_parse_utf8(char* text) {
+  return nx_json_parse(text, unicode_to_utf8);
+}
+
+const nx_json* nx_json_parse(char* text, nx_json_unicode_encoder encoder) {
   nx_json js={0};
-  if (!parse_value(&js, 0, text)) {
+  if (!parse_value(&js, 0, text, encoder)) {
     if (js.child) nx_json_free(js.child);
     return 0;
   }
   return &dummy; // never return null
 }
 
-#ifdef NXJSON_DEMO
-
-int main() {
-  char* code=strdup(" {\"some-int\":195, \"array\" :[ 0, 5.1, -7, \"\\\\\" ,, /*11*/ , \"last\\nitem\"],"
-    "\"some-bool\":true, \"some-dbl\":-1e-4, \"some-null\": null, \"hello\" : \"world\\\"\\!\", /*\"other\" : \"/OTHER/\"*/,\n"
-    "\"obj\":{\"KEY\":\"VAL\"}\n"
-    "}");
-  const nx_json* json=nx_json_parse(code);
-  if (json) {
-    printf("some-int=%ld\n", nx_json_get(json, "some-int")->int_value);
-    printf("some-dbl=%lf\n", nx_json_get(json, "some-dbl")->dbl_value);
-    printf("some-bool=%ld\n", nx_json_get(json, "some-bool")->int_value);
-    printf("some-null=%s\n", nx_json_get(json, "some-null")->text_value);
-    printf("hello=%s\n", nx_json_get(json, "hello")->text_value);
-    printf("KEY=%s\n", nx_json_get(nx_json_get(json, "obj"), "KEY")->text_value);
-    printf("other=%s\n", nx_json_get(json, "other")->text_value);
-    const nx_json* arr=nx_json_get(json, "array");
-    int i;
-    for (i=0; i<arr->length; i++) {
-      const nx_json* item=nx_json_item(arr, i);
-      printf("arr[%d]=(%d) %ld %lf %s\n", i, (int)item->type, item->int_value, item->dbl_value, item->text_value);
-    }
-    nx_json_free(json);
-  }
-  free(code);
-  return 0;
-}
-
-#endif
-
 
 #ifdef  __cplusplus
 }
   struct nx_json* last_child;
 } nx_json;
 
-const nx_json* nx_json_parse(char* text);
+typedef int (*nx_json_unicode_encoder)(unsigned int codepoint, char* p, char** endp);
+
+extern nx_json_unicode_encoder nx_json_unicode_to_utf8;
+
+const nx_json* nx_json_parse(char* text, nx_json_unicode_encoder encoder);
+const nx_json* nx_json_parse_utf8(char* text);
 void nx_json_free(const nx_json* js);
 const nx_json* nx_json_get(const nx_json* json, const char* key); // get object's property by key
 const nx_json* nx_json_item(const nx_json* json, int idx); // get array element by index
 
 static int run_test(int test_number, char* input, const char* expected_output) {
   int input_length=strlen(input);
-  const nx_json* json=nx_json_parse(input);
+  const nx_json* json=nx_json_parse_utf8(input);
   if (!json) {
     if (!expected_output) {
       printf("[%03d] PASSED\n", test_number);

tests/001.expected

   str5:"\?text\?"
   str	6\:"text
 text	text"
-  str7:"text\u1234text\u5678"
+  str7:"textሴtext噸"
   obj:{
     KEY:"VAL"
     obj:{

tests/023.expected

-"\u004d\u0430\u4e8c\ud800\udf02"
+"𐌂Mа二𐌂"
-"\u004d\u0430\u4e8c\ud800\udf02"
+/* This string contains Unicode surrogate 𐌂 both in UTF-8 and escaped */ "𐌂\u004d\u0430\u4e8c\ud800\udf02"

tests/040.expected

 [
   "
- foo / bar 
+ foo / bar 
 ]

tests/041.expected

 [
   "
- foo / bar 
+ foo / bar 
   ""and this string has an escape at the beginning"
   "and this string has no escapes"
 ]

tests/047.expected

+[
+  "Да"
+  "Му"
+  "Еба"
+  "Майката"
+]
+["\u0414\u0430",
+ "\u041c\u0443",
+ "\u0415\u0431\u0430",
+ "\u041c\u0430\u0439\u043a\u0430\u0442\u0430"]

tests/048.expected

+"foobar"
+"\u0066\u006f\u006f\u0062\u0061\u0072"
+/* invalid unicode surrogate */ "\ud800"

tests/050.expected

+"Проверка"
+"\u041F\u0440\u043E\u0432\u0435\u0440\u043a\u0430"
+"\u04FG"
+"\u0
+"\u04"