Commits

Anonymous committed c3ad883

Issue #7451: Improve decoding performance of JSON objects, and reduce
the memory consumption of said decoded objects when they use the same
strings as keys.

Comments (0)

Files changed (5)

Lib/json/decoder.py

 
 
 def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
-        _w=WHITESPACE.match, _ws=WHITESPACE_STR):
+               memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
     s, end = s_and_end
     pairs = []
     pairs_append = pairs.append
+    # Backwards compatibility
+    if memo is None:
+        memo = {}
+    memo_get = memo.setdefault
     # Use a slice to prevent IndexError from being raised, the following
     # check will raise a more specific ValueError if the string is empty
     nextchar = s[end:end + 1]
     end += 1
     while True:
         key, end = scanstring(s, end, strict)
+        key = memo_get(key, key)
         # To skip some function call overhead we optimize the fast paths where
         # the JSON key separator is ": " or just ":".
         if s[end:end + 1] != ':':
         pairs = object_hook(pairs)
     return pairs, end
 
-def JSONArray(s_and_end, scan_once, context, _w=WHITESPACE.match):
+def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
     s, end = s_and_end
     values = []
     nextchar = s[end:end + 1]
         self.parse_object = JSONObject
         self.parse_array = JSONArray
         self.parse_string = scanstring
+        self.memo = {}
         self.scan_once = make_scanner(self)
 
 

Lib/json/scanner.py

     parse_int = context.parse_int
     parse_constant = context.parse_constant
     object_hook = context.object_hook
+    object_pairs_hook = context.object_pairs_hook
+    memo = context.memo
 
     def _scan_once(string, idx):
         try:
             return parse_string(string, idx + 1, strict)
         elif nextchar == '{':
             return parse_object((string, idx + 1), strict,
-                _scan_once, object_hook, object_pairs_hook)
+                _scan_once, object_hook, object_pairs_hook, memo)
         elif nextchar == '[':
             return parse_array((string, idx + 1), _scan_once)
         elif nextchar == 'n' and string[idx:idx + 4] == 'null':
         else:
             raise StopIteration
 
+    def scan_once(string, idx):
+        try:
+            return _scan_once(string, idx)
+        finally:
+            memo.clear()
+
     return _scan_once
 
 make_scanner = c_make_scanner or py_make_scanner

Lib/json/tests/test_decode.py

 import decimal
 from unittest import TestCase
 from io import StringIO
+from contextlib import contextmanager
 
 import json
+import json.decoder
+import json.scanner
 from collections import OrderedDict
 
+
+@contextmanager
+def use_python_scanner():
+    py_scanner = json.scanner.py_make_scanner
+    old_scanner = json.decoder.make_scanner
+    json.decoder.make_scanner = py_scanner
+    try:
+        yield
+    finally:
+        json.decoder.make_scanner = old_scanner
+
+
 class TestDecode(TestCase):
     def test_decimal(self):
         rval = json.loads('1.1', parse_float=decimal.Decimal)
         # exercise the uncommon cases. The array cases are already covered.
         rval = json.loads('{   "key"    :    "value"    ,  "k":"v"    }')
         self.assertEquals(rval, {"key":"value", "k":"v"})
+
+    def check_keys_reuse(self, source, loads):
+        rval = loads(source)
+        (a, b), (c, d) = sorted(rval[0]), sorted(rval[1])
+        self.assertIs(a, c)
+        self.assertIs(b, d)
+
+    def test_keys_reuse(self):
+        s = '[{"a_key": 1, "b_\xe9": 2}, {"a_key": 3, "b_\xe9": 4}]'
+        self.check_keys_reuse(s, json.loads)
+        # Disabled: the pure Python version of json simply doesn't work
+        with use_python_scanner():
+            self.check_keys_reuse(s, json.decoder.JSONDecoder().decode)
 Library
 -------
 
+- Issue #7451: Improve decoding performance of JSON objects, and reduce
+  the memory consumption of said decoded objects when they use the same
+  strings as keys.
+
 - Issue #1100562: Fix deep-copying of objects derived from the list and
   dict types.  Patch by Michele Orrù and Björn Lindqvist.
 
     PyObject *parse_float;
     PyObject *parse_int;
     PyObject *parse_constant;
+    PyObject *memo;
 } PyScannerObject;
 
 static PyMemberDef scanner_members[] = {
     return tpl;
 }
 
+#define APPEND_OLD_CHUNK \
+    if (chunk != NULL) { \
+        if (chunks == NULL) { \
+            chunks = PyList_New(0); \
+            if (chunks == NULL) { \
+                goto bail; \
+            } \
+        } \
+        if (PyList_Append(chunks, chunk)) { \
+            Py_DECREF(chunk); \
+            goto bail; \
+        } \
+        Py_CLEAR(chunk); \
+    }
+
 static PyObject *
 scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr)
 {
 
     Return value is a new PyUnicode
     */
-    PyObject *rval;
+    PyObject *rval = NULL;
     Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
     Py_ssize_t begin = end - 1;
     Py_ssize_t next = begin;
     const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
-    PyObject *chunks = PyList_New(0);
-    if (chunks == NULL) {
-        goto bail;
-    }
+    PyObject *chunks = NULL;
+    PyObject *chunk = NULL;
+
     if (end < 0 || len <= end) {
         PyErr_SetString(PyExc_ValueError, "end is out of bounds");
         goto bail;
     while (1) {
         /* Find the end of the string or the next escape */
         Py_UNICODE c = 0;
-        PyObject *chunk = NULL;
         for (next = end; next < len; next++) {
             c = buf[next];
             if (c == '"' || c == '\\') {
         }
         /* Pick up this chunk if it's not zero length */
         if (next != end) {
+            APPEND_OLD_CHUNK
             chunk = PyUnicode_FromUnicode(&buf[end], next - end);
             if (chunk == NULL) {
                 goto bail;
             }
-            if (PyList_Append(chunks, chunk)) {
-                Py_DECREF(chunk);
-                goto bail;
-            }
-            Py_DECREF(chunk);
         }
         next++;
         if (c == '"') {
             }
 #endif
         }
+        APPEND_OLD_CHUNK
         chunk = PyUnicode_FromUnicode(&c, 1);
         if (chunk == NULL) {
             goto bail;
         }
-        if (PyList_Append(chunks, chunk)) {
-            Py_DECREF(chunk);
+    }
+
+    if (chunks == NULL) {
+        if (chunk != NULL)
+            rval = chunk;
+        else
+            rval = PyUnicode_FromStringAndSize("", 0);
+    }
+    else {
+        APPEND_OLD_CHUNK
+        rval = join_list_unicode(chunks);
+        if (rval == NULL) {
             goto bail;
         }
-        Py_DECREF(chunk);
+        Py_CLEAR(chunks);
     }
 
-    rval = join_list_unicode(chunks);
-    if (rval == NULL) {
-        goto bail;
-    }
-    Py_DECREF(chunks);
     *next_end_ptr = end;
     return rval;
 bail:
     *next_end_ptr = -1;
     Py_XDECREF(chunks);
+    Py_XDECREF(chunk);
     return NULL;
 }
 
     Py_CLEAR(s->parse_float);
     Py_CLEAR(s->parse_int);
     Py_CLEAR(s->parse_constant);
+    Py_CLEAR(s->memo);
     return 0;
 }
 
     Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
     Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
     PyObject *val = NULL;
-    PyObject *rval = PyList_New(0);
+    PyObject *rval = NULL;
     PyObject *key = NULL;
     int strict = PyObject_IsTrue(s->strict);
+    int has_pairs_hook = (s->object_pairs_hook != Py_None);
     Py_ssize_t next_idx;
+
+    if (has_pairs_hook)
+        rval = PyList_New(0);
+    else
+        rval = PyDict_New();
     if (rval == NULL)
         return NULL;
 
     /* only loop if the object is non-empty */
     if (idx <= end_idx && str[idx] != '}') {
         while (idx <= end_idx) {
+            PyObject *memokey;
+
             /* read key */
             if (str[idx] != '"') {
                 raise_errmsg("Expecting property name", pystr, idx);
             key = scanstring_unicode(pystr, idx + 1, strict, &next_idx);
             if (key == NULL)
                 goto bail;
+            memokey = PyDict_GetItem(s->memo, key);
+            if (memokey != NULL) {
+                Py_INCREF(memokey);
+                Py_DECREF(key);
+                key = memokey;
+            }
+            else {
+                if (PyDict_SetItem(s->memo, key, key) < 0)
+                    goto bail;
+            }
             idx = next_idx;
 
             /* skip whitespace between key and : delimiter, read :, skip whitespace */
             if (val == NULL)
                 goto bail;
 
-            {
-                PyObject *tuple = PyTuple_Pack(2, key, val);
-                if (tuple == NULL)
+            if (has_pairs_hook) {
+                PyObject *item = PyTuple_Pack(2, key, val);
+                if (item == NULL)
                     goto bail;
-                if (PyList_Append(rval, tuple) == -1) {
-                    Py_DECREF(tuple);
+                Py_CLEAR(key);
+                Py_CLEAR(val);
+                if (PyList_Append(rval, item) == -1) {
+                    Py_DECREF(item);
                     goto bail;
                 }
-                Py_DECREF(tuple);
+                Py_DECREF(item);
             }
-
-            Py_CLEAR(key);
-            Py_CLEAR(val);
+            else {
+                if (PyDict_SetItem(rval, key, val) < 0)
+                    goto bail;
+                Py_CLEAR(key);
+                Py_CLEAR(val);
+            }
             idx = next_idx;
 
             /* skip whitespace before } or , */
 
     *next_idx_ptr = idx + 1;
 
-    if (s->object_pairs_hook != Py_None) {
+    if (has_pairs_hook) {
         val = PyObject_CallFunctionObjArgs(s->object_pairs_hook, rval, NULL);
-        if (val == NULL)
-            goto bail;
         Py_DECREF(rval);
         return val;
     }
 
-    val = PyDict_New();
-    if (val == NULL)
-        goto bail;
-    if (PyDict_MergeFromSeq2(val, rval, 1) == -1)
-        goto bail;
-    Py_DECREF(rval);
-    rval = val;
-
     /* if object_hook is not None: rval = object_hook(rval) */
     if (s->object_hook != Py_None) {
         val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL);
-        if (val == NULL)
-            goto bail;
         Py_DECREF(rval);
-        rval = val;
-        val = NULL;
+        return val;
     }
     return rval;
 bail:
     Py_XDECREF(key);
     Py_XDECREF(val);
-    Py_DECREF(rval);
+    Py_XDECREF(rval);
     return NULL;
 }
 
                  Py_TYPE(pystr)->tp_name);
         return NULL;
     }
+    PyDict_Clear(s->memo);
+    if (rval == NULL)
+        return NULL;
     return _build_rval_index_tuple(rval, next_idx);
 }
 
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx))
         return -1;
 
+    if (s->memo == NULL) {
+        s->memo = PyDict_New();
+        if (s->memo == NULL)
+            goto bail;
+    }
+
     /* All of these will fail "gracefully" so we don't need to verify them */
     s->strict = PyObject_GetAttrString(ctx, "strict");
     if (s->strict == NULL)