Commits

Antonio Cuni  committed eaf022d

rename _fastjson to _pypyjson

  • Participants
  • Parent commits c64aefd
  • Branches fastjson

Comments (0)

Files changed (8)

File lib-python/2.7/json/__init__.py

 
 try:
     # PyPy speedup, the interface is different than CPython's _json
-    import _fastjson
+    import _pypyjson
 except ImportError:
-    _fastjson = None
+    _pypyjson = None
 
 from .decoder import JSONDecoder
 from .encoder import JSONEncoder
     if (cls is None and encoding is None and object_hook is None and
             parse_int is None and parse_float is None and
             parse_constant is None and object_pairs_hook is None and not kw):
-        if _fastjson and not isinstance(s, unicode):
-            return _fastjson.loads(s)
+        if _pypyjson and not isinstance(s, unicode):
+            return _pypyjson.loads(s)
         else:
             return _default_decoder.decode(s)
     if cls is None:

File pypy/config/pypyoption.py

      "thread", "itertools", "pyexpat", "_ssl", "cpyext", "array",
      "binascii", "_multiprocessing", '_warnings',
      "_collections", "_multibytecodec", "micronumpy", "_ffi",
-     "_continuation", "_cffi_backend", "_csv", "cppyy", "_fastjson"]
+     "_continuation", "_cffi_backend", "_csv", "cppyy", "_pypyjson"]
 ))
 
 translation_modules = default_modules.copy()

File pypy/module/_fastjson/__init__.py

-from pypy.interpreter.mixedmodule import MixedModule
-
-class Module(MixedModule):
-    """fast json implementation"""
-
-    appleveldefs = {}
-
-    interpleveldefs = {
-        'loads' : 'interp_decoder.loads',
-        }

File pypy/module/_fastjson/interp_decoder.py

-import sys
-import math
-from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.objectmodel import specialize
-from rpython.rlib import rfloat
-from rpython.rtyper.lltypesystem import lltype, rffi
-from pypy.interpreter.error import OperationError, operationerrfmt
-from pypy.interpreter.gateway import unwrap_spec
-from pypy.interpreter import unicodehelper
-from rpython.rtyper.annlowlevel import llstr, hlunicode
-
-OVF_DIGITS = len(str(sys.maxint))
-
-def is_whitespace(ch):
-    return ch == ' ' or ch == '\t' or ch == '\r' or ch == '\n'
-
-# precomputing negative powers of 10 is MUCH faster than using e.g. math.pow
-# at runtime
-NEG_POW_10 = [10.0**-i for i in range(16)]
-def neg_pow_10(x, exp):
-    if exp >= len(NEG_POW_10):
-        return 0.0
-    return x * NEG_POW_10[exp]
-
-def strslice2unicode_latin1(s, start, end):
-    """
-    Convert s[start:end] to unicode. s is supposed to be an RPython string
-    encoded in latin-1, which means that the numeric value of each char is the
-    same as the corresponding unicode code point.
-
-    Internally it's implemented at the level of low-level helpers, to avoid
-    the extra copy we would need if we take the actual slice first.
-    
-    No bound checking is done, use carefully.
-    """
-    from rpython.rtyper.annlowlevel import llstr, hlunicode
-    from rpython.rtyper.lltypesystem.rstr import malloc, UNICODE
-    from rpython.rtyper.lltypesystem.lltype import cast_primitive, UniChar
-    length = end-start
-    ll_s = llstr(s)
-    ll_res = malloc(UNICODE, length)
-    ll_res.hash = 0
-    for i in range(length):
-        ch = ll_s.chars[start+i]
-        ll_res.chars[i] = cast_primitive(UniChar, ch)
-    return hlunicode(ll_res)
-
-TYPE_UNKNOWN = 0
-TYPE_STRING = 1
-class JSONDecoder(object):
-    def __init__(self, space, s):
-        self.space = space
-        self.s = s
-        # we put our string in a raw buffer so:
-        # 1) we automatically get the '\0' sentinel at the end of the string,
-        #    which means that we never have to check for the "end of string"
-        # 2) we can pass the buffer directly to strtod
-        self.ll_chars = rffi.str2charp(s)
-        self.end_ptr = lltype.malloc(rffi.CCHARPP.TO, 1, flavor='raw')
-        self.pos = 0
-        self.last_type = TYPE_UNKNOWN
-
-    def close(self):
-        rffi.free_charp(self.ll_chars)
-        lltype.free(self.end_ptr, flavor='raw')
-
-    def getslice(self, start, end):
-        assert start >= 0
-        assert end >= 0
-        return self.s[start:end]
-
-    def skip_whitespace(self, i):
-        while True:
-            ch = self.ll_chars[i]
-            if is_whitespace(ch):
-                i+=1
-            else:
-                break
-        return i
-
-    @specialize.arg(1)
-    def _raise(self, msg, *args):
-        raise operationerrfmt(self.space.w_ValueError, msg, *args)
-
-    def decode_any(self, i):
-        i = self.skip_whitespace(i)
-        ch = self.ll_chars[i]
-        if ch == '"':
-            return self.decode_string(i+1)
-        elif ch == '[':
-            return self.decode_array(i+1)
-        elif ch == '{':
-            return self.decode_object(i+1)
-        elif ch == 'n':
-            return self.decode_null(i+1)
-        elif ch == 't':
-            return self.decode_true(i+1)
-        elif ch == 'f':
-            return self.decode_false(i+1)
-        elif ch == 'I':
-            return self.decode_infinity(i+1)
-        elif ch == 'N':
-            return self.decode_nan(i+1)
-        elif ch == '-':
-            if self.ll_chars[i+1] == 'I':
-                return self.decode_infinity(i+2, sign=-1)
-            return self.decode_numeric(i)
-        elif ch.isdigit():
-            return self.decode_numeric(i)
-        else:
-            self._raise("No JSON object could be decoded: unexpected '%s' at char %d",
-                        ch, self.pos)
-
-    def decode_null(self, i):
-        if (self.ll_chars[i]   == 'u' and
-            self.ll_chars[i+1] == 'l' and
-            self.ll_chars[i+2] == 'l'):
-            self.pos = i+3
-            return self.space.w_None
-        self._raise("Error when decoding null at char %d", i)
-
-    def decode_true(self, i):
-        if (self.ll_chars[i]   == 'r' and
-            self.ll_chars[i+1] == 'u' and
-            self.ll_chars[i+2] == 'e'):
-            self.pos = i+3
-            return self.space.w_True
-        self._raise("Error when decoding true at char %d", i)
-
-    def decode_false(self, i):
-        if (self.ll_chars[i]   == 'a' and
-            self.ll_chars[i+1] == 'l' and
-            self.ll_chars[i+2] == 's' and
-            self.ll_chars[i+3] == 'e'):
-            self.pos = i+4
-            return self.space.w_False
-        self._raise("Error when decoding false at char %d", i)
-
-    def decode_infinity(self, i, sign=1):
-        if (self.ll_chars[i]   == 'n' and
-            self.ll_chars[i+1] == 'f' and
-            self.ll_chars[i+2] == 'i' and
-            self.ll_chars[i+3] == 'n' and
-            self.ll_chars[i+4] == 'i' and
-            self.ll_chars[i+5] == 't' and
-            self.ll_chars[i+6] == 'y'):
-            self.pos = i+7
-            return self.space.wrap(rfloat.INFINITY * sign)
-        self._raise("Error when decoding Infinity at char %d", i)
-
-    def decode_nan(self, i):
-        if (self.ll_chars[i]   == 'a' and
-            self.ll_chars[i+1] == 'N'):
-            self.pos = i+2
-            return self.space.wrap(rfloat.NAN)
-        self._raise("Error when decoding NaN at char %d", i)
-
-    def decode_numeric(self, i):
-        start = i
-        i, ovf_maybe, intval = self.parse_integer(i)
-        #
-        # check for the optional fractional part
-        ch = self.ll_chars[i]
-        if ch == '.':
-            if not self.ll_chars[i+1].isdigit():
-                self._raise("Expected digit at char %d", i+1)
-            return self.decode_float(start)
-        elif ch == 'e' or ch == 'E':
-            return self.decode_float(start)
-        elif ovf_maybe:
-            return self.decode_int_slow(start)
-
-        self.pos = i
-        return self.space.wrap(intval)
-
-    def decode_float(self, i):
-        from rpython.rlib import rdtoa
-        start = rffi.ptradd(self.ll_chars, i)
-        floatval = rdtoa.dg_strtod(start, self.end_ptr)
-        diff = rffi.cast(rffi.LONG, self.end_ptr[0]) - rffi.cast(rffi.LONG, start)
-        self.pos = i + diff
-        return self.space.wrap(floatval)
-
-    def decode_int_slow(self, i):
-        start = i
-        if self.ll_chars[i] == '-':
-            i += 1
-        while self.ll_chars[i].isdigit():
-            i += 1
-        s = self.getslice(start, i)
-        self.pos = i
-        return self.space.call_function(self.space.w_int, self.space.wrap(s))
-
-    def parse_integer(self, i):
-        "Parse a decimal number with an optional minus sign"
-        sign = 1
-        # parse the sign
-        if self.ll_chars[i] == '-':
-            sign = -1
-            i += 1
-        elif self.ll_chars[i] == '+':
-            i += 1
-        #
-        if self.ll_chars[i] == '0':
-            i += 1
-            return i, False, 0
-
-        intval = 0
-        start = i
-        while True:
-            ch = self.ll_chars[i]
-            if ch.isdigit():
-                intval = intval*10 + ord(ch)-ord('0')
-                i += 1
-            else:
-                break
-        count = i - start
-        if count == 0:
-            self._raise("Expected digit at char %d", i)
-        # if the number has at least OVF_DIGITS digits, it might have
-        # overflowed
-        ovf_maybe = (count >= OVF_DIGITS)
-        return i, ovf_maybe, sign * intval
-    parse_integer._always_inline_ = True
-
-    def decode_array(self, i):
-        w_list = self.space.newlist([])
-        start = i
-        count = 0
-        i = self.skip_whitespace(start)
-        if self.ll_chars[i] == ']':
-            self.pos = i+1
-            return w_list
-        #
-        while True:
-            w_item = self.decode_any(i)
-            i = self.pos
-            self.space.call_method(w_list, 'append', w_item)
-            i = self.skip_whitespace(i)
-            ch = self.ll_chars[i]
-            i += 1
-            if ch == ']':
-                self.pos = i
-                return w_list
-            elif ch == ',':
-                pass
-            elif ch == '\0':
-                self._raise("Unterminated array starting at char %d", start)
-            else:
-                self._raise("Unexpected '%s' when decoding array (char %d)",
-                            ch, self.pos)
-
-    def decode_object(self, i):
-        start = i
-        w_dict = self.space.newdict()
-        #
-        i = self.skip_whitespace(i)
-        if self.ll_chars[i] == '}':
-            self.pos = i+1
-            return w_dict
-        #
-        while True:
-            # parse a key: value
-            self.last_type = TYPE_UNKNOWN
-            w_name = self.decode_any(i)
-            if self.last_type != TYPE_STRING:
-                self._raise("Key name must be string for object starting at char %d", start)
-            i = self.skip_whitespace(self.pos)
-            ch = self.ll_chars[i]
-            if ch != ':':
-                self._raise("No ':' found at char %d", i)
-            i += 1
-            i = self.skip_whitespace(i)
-            #
-            w_value = self.decode_any(i)
-            self.space.setitem(w_dict, w_name, w_value)
-            i = self.skip_whitespace(self.pos)
-            ch = self.ll_chars[i]
-            i += 1
-            if ch == '}':
-                self.pos = i
-                return w_dict
-            elif ch == ',':
-                pass
-            elif ch == '\0':
-                self._raise("Unterminated object starting at char %d", start)
-            else:
-                self._raise("Unexpected '%s' when decoding object (char %d)",
-                            ch, self.pos)
-
-
-    def decode_string(self, i):
-        start = i
-        bits = 0
-        while True:
-            # this loop is a fast path for strings which do not contain escape
-            # characters
-            ch = self.ll_chars[i]
-            i += 1
-            bits |= ord(ch)
-            if ch == '"':
-                if bits & 0x80:
-                    # the 8th bit is set, it's a utf8 string
-                    content_utf8 = self.getslice(start, i-1)
-                    content_unicode = unicodehelper.decode_utf8(self.space, content_utf8)
-                else:
-                    # ascii only, fast path (ascii is a strict subset of
-                    # latin1, and we already checked that all the chars are <
-                    # 128)
-                    content_unicode = strslice2unicode_latin1(self.s, start, i-1)
-                self.last_type = TYPE_STRING
-                self.pos = i
-                return self.space.wrap(content_unicode)
-            elif ch == '\\':
-                content_so_far = self.getslice(start, i-1)
-                self.pos = i-1
-                return self.decode_string_escaped(start, content_so_far)
-            elif ch == '\0':
-                self._raise("Unterminated string starting at char %d", start)
-
-
-    def decode_string_escaped(self, start, content_so_far):
-        builder = StringBuilder(len(content_so_far)*2) # just an estimate
-        builder.append(content_so_far)
-        i = self.pos
-        while True:
-            ch = self.ll_chars[i]
-            i += 1
-            if ch == '"':
-                content_utf8 = builder.build()
-                content_unicode = unicodehelper.decode_utf8(self.space, content_utf8)
-                self.last_type = TYPE_STRING
-                self.pos = i
-                return self.space.wrap(content_unicode)
-            elif ch == '\\':
-                i = self.decode_escape_sequence(i, builder)
-            elif ch == '\0':
-                self._raise("Unterminated string starting at char %d", start)
-            else:
-                builder.append_multiple_char(ch, 1) # we should implement append_char
-
-    def decode_escape_sequence(self, i, builder):
-        ch = self.ll_chars[i]
-        i += 1
-        put = builder.append_multiple_char
-        if ch == '\\':  put('\\', 1)
-        elif ch == '"': put('"' , 1)
-        elif ch == '/': put('/' , 1)
-        elif ch == 'b': put('\b', 1)
-        elif ch == 'f': put('\f', 1)
-        elif ch == 'n': put('\n', 1)
-        elif ch == 'r': put('\r', 1)
-        elif ch == 't': put('\t', 1)
-        elif ch == 'u':
-            return self.decode_escape_sequence_unicode(i, builder)
-        else:
-            self._raise("Invalid \\escape: %s (char %d)", ch, self.pos-1)
-        return i
-
-    def decode_escape_sequence_unicode(self, i, builder):
-        # at this point we are just after the 'u' of the \u1234 sequence.
-        start = i
-        i += 4
-        hexdigits = self.getslice(start, i)
-        try:
-            val = int(hexdigits, 16)
-            if val & 0xfc00 == 0xd800:
-                # surrogate pair
-                val = self.decode_surrogate_pair(i, val)
-                i += 6
-        except ValueError:
-            self._raise("Invalid \uXXXX escape (char %d)", i-1)
-            return # help the annotator to know that we'll never go beyond
-                   # this point
-        #
-        uchr = unichr(val)
-        utf8_ch = unicodehelper.encode_utf8(self.space, uchr)
-        builder.append(utf8_ch)
-        return i
-
-    def decode_surrogate_pair(self, i, highsurr):
-        if self.ll_chars[i] != '\\' or self.ll_chars[i+1] != 'u':
-            self._raise("Unpaired high surrogate at char %d", i)
-        i += 2
-        hexdigits = self.getslice(i, i+4)
-        lowsurr = int(hexdigits, 16) # the possible ValueError is caught by the caller
-        return 0x10000 + (((highsurr - 0xd800) << 10) | (lowsurr - 0xdc00))
-
-def loads(space, w_s):
-    if space.isinstance_w(w_s, space.w_unicode):
-        raise OperationError(space.w_TypeError,
-                             space.wrap("Expected utf8-encoded str, got unicode"))
-    s = space.str_w(w_s)
-    decoder = JSONDecoder(space, s)
-    try:
-        w_res = decoder.decode_any(0)
-        i = decoder.skip_whitespace(decoder.pos)
-        if i < len(s):
-            start = i
-            end = len(s) - 1
-            raise operationerrfmt(space.w_ValueError, "Extra data: char %d - %d", start, end)
-        return w_res
-    finally:
-        decoder.close()

File pypy/module/_fastjson/test/test__fastjson.py

-# -*- encoding: utf-8 -*-
-import py
-from pypy.module._fastjson.interp_decoder import JSONDecoder
-
-def test_skip_whitespace():
-    s = '   hello   '
-    dec = JSONDecoder('fake space', s)
-    assert dec.pos == 0
-    assert dec.skip_whitespace(0) == 3
-    assert dec.skip_whitespace(3) == 3
-    assert dec.skip_whitespace(8) == len(s)
-    dec.close()
-
-    
-
-class AppTest(object):
-    spaceconfig = {"objspace.usemodules._fastjson": True}
-
-    def test_raise_on_unicode(self):
-        import _fastjson
-        raises(TypeError, _fastjson.loads, u"42")
-
-
-    def test_decode_constants(self):
-        import _fastjson
-        assert _fastjson.loads('null') is None
-        raises(ValueError, _fastjson.loads, 'nul')
-        raises(ValueError, _fastjson.loads, 'nu')
-        raises(ValueError, _fastjson.loads, 'n')
-        raises(ValueError, _fastjson.loads, 'nuXX')
-        #
-        assert _fastjson.loads('true') is True
-        raises(ValueError, _fastjson.loads, 'tru')
-        raises(ValueError, _fastjson.loads, 'tr')
-        raises(ValueError, _fastjson.loads, 't')
-        raises(ValueError, _fastjson.loads, 'trXX')
-        #
-        assert _fastjson.loads('false') is False
-        raises(ValueError, _fastjson.loads, 'fals')
-        raises(ValueError, _fastjson.loads, 'fal')
-        raises(ValueError, _fastjson.loads, 'fa')
-        raises(ValueError, _fastjson.loads, 'f')
-        raises(ValueError, _fastjson.loads, 'falXX')
-        
-
-    def test_decode_string(self):
-        import _fastjson
-        res = _fastjson.loads('"hello"')
-        assert res == u'hello'
-        assert type(res) is unicode
-
-    def test_decode_string_utf8(self):
-        import _fastjson
-        s = u'àèìòù'
-        res = _fastjson.loads('"%s"' % s.encode('utf-8'))
-        assert res == s
-
-    def test_skip_whitespace(self):
-        import _fastjson
-        s = '   "hello"   '
-        assert _fastjson.loads(s) == u'hello'
-        s = '   "hello"   extra'
-        raises(ValueError, "_fastjson.loads(s)")
-
-    def test_unterminated_string(self):
-        import _fastjson
-        s = '"hello' # missing the trailing "
-        raises(ValueError, "_fastjson.loads(s)")
-
-    def test_escape_sequence(self):
-        import _fastjson
-        assert _fastjson.loads(r'"\\"') == u'\\'
-        assert _fastjson.loads(r'"\""') == u'"'
-        assert _fastjson.loads(r'"\/"') == u'/'       
-        assert _fastjson.loads(r'"\b"') == u'\b'
-        assert _fastjson.loads(r'"\f"') == u'\f'
-        assert _fastjson.loads(r'"\n"') == u'\n'
-        assert _fastjson.loads(r'"\r"') == u'\r'
-        assert _fastjson.loads(r'"\t"') == u'\t'
-
-    def test_escape_sequence_in_the_middle(self):
-        import _fastjson
-        s = r'"hello\nworld"'
-        assert _fastjson.loads(s) == "hello\nworld"
-
-    def test_unterminated_string_after_escape_sequence(self):
-        import _fastjson
-        s = r'"hello\nworld' # missing the trailing "
-        raises(ValueError, "_fastjson.loads(s)")
-        
-    def test_escape_sequence_unicode(self):
-        import _fastjson
-        s = r'"\u1234"'
-        assert _fastjson.loads(s) == u'\u1234'
-
-    def test_invalid_utf_8(self):
-        import _fastjson
-        s = '"\xe0"' # this is an invalid UTF8 sequence inside a string
-        raises(UnicodeDecodeError, "_fastjson.loads(s)")
-
-    def test_decode_numeric(self):
-        import sys
-        import _fastjson
-        def check(s, val):
-            res = _fastjson.loads(s)
-            assert type(res) is type(val)
-            assert res == val
-        #
-        check('42', 42)
-        check('-42', -42)
-        check('42.123', 42.123)
-        check('42E0', 42.0)
-        check('42E3', 42000.0)
-        check('42E-1', 4.2)
-        check('42E+1', 420.0)
-        check('42.123E3', 42123.0)
-        check('0', 0)
-        check('-0', 0)
-        check('0.123', 0.123)
-        check('0E3', 0.0)
-        check('5E0001', 50.0)
-        check(str(1 << 32), 1 << 32)
-        check(str(1 << 64), 1 << 64)
-        #
-        x = str(sys.maxint+1) + '.123'
-        check(x, float(x))
-        x = str(sys.maxint+1) + 'E1'
-        check(x, float(x))
-        x = str(sys.maxint+1) + 'E-1'
-        check(x, float(x))
-        #
-        check('1E400', float('inf'))
-        ## # these are non-standard but supported by CPython json
-        check('Infinity', float('inf'))
-        check('-Infinity', float('-inf'))
-
-    def test_nan(self):
-        import math
-        import _fastjson
-        res = _fastjson.loads('NaN')
-        assert math.isnan(res)
-
-    def test_decode_numeric_invalid(self):
-        import _fastjson
-        def error(s):
-            raises(ValueError, _fastjson.loads, s)
-        #
-        error('  42   abc')
-        error('.123')
-        error('+123')
-        error('12.')
-        error('12.-3')
-        error('12E')
-        error('12E-')
-        error('0123') # numbers can't start with 0
-
-    def test_decode_object(self):
-        import _fastjson
-        assert _fastjson.loads('{}') == {}
-        assert _fastjson.loads('{  }') == {}
-        #
-        s = '{"hello": "world", "aaa": "bbb"}'
-        assert _fastjson.loads(s) == {'hello': 'world',
-                                      'aaa': 'bbb'}
-        raises(ValueError, _fastjson.loads, '{"key"')
-        raises(ValueError, _fastjson.loads, '{"key": 42')
-
-    def test_decode_object_nonstring_key(self):
-        import _fastjson
-        raises(ValueError, "_fastjson.loads('{42: 43}')")
-        
-    def test_decode_array(self):
-        import _fastjson
-        assert _fastjson.loads('[]') == []
-        assert _fastjson.loads('[  ]') == []
-        assert _fastjson.loads('[1]') == [1]
-        assert _fastjson.loads('[1, 2]') == [1, 2]
-        raises(ValueError, "_fastjson.loads('[1: 2]')")
-        raises(ValueError, "_fastjson.loads('[1, 2')")
-        raises(ValueError, """_fastjson.loads('["extra comma",]')""")
-        
-    def test_unicode_surrogate_pair(self):
-        import _fastjson
-        expected = u'z\U0001d120x'
-        res = _fastjson.loads('"z\\ud834\\udd20x"')
-        assert res == expected
-
-

File pypy/module/_pypyjson/__init__.py

+from pypy.interpreter.mixedmodule import MixedModule
+
+class Module(MixedModule):
+    """fast json implementation"""
+
+    appleveldefs = {}
+
+    interpleveldefs = {
+        'loads' : 'interp_decoder.loads',
+        }

File pypy/module/_pypyjson/interp_decoder.py

+import sys
+import math
+from rpython.rlib.rstring import StringBuilder
+from rpython.rlib.objectmodel import specialize
+from rpython.rlib import rfloat
+from rpython.rtyper.lltypesystem import lltype, rffi
+from pypy.interpreter.error import OperationError, operationerrfmt
+from pypy.interpreter.gateway import unwrap_spec
+from pypy.interpreter import unicodehelper
+from rpython.rtyper.annlowlevel import llstr, hlunicode
+
+OVF_DIGITS = len(str(sys.maxint))
+
+def is_whitespace(ch):
+    return ch == ' ' or ch == '\t' or ch == '\r' or ch == '\n'
+
+# precomputing negative powers of 10 is MUCH faster than using e.g. math.pow
+# at runtime
+NEG_POW_10 = [10.0**-i for i in range(16)]
+def neg_pow_10(x, exp):
+    if exp >= len(NEG_POW_10):
+        return 0.0
+    return x * NEG_POW_10[exp]
+
+def strslice2unicode_latin1(s, start, end):
+    """
+    Convert s[start:end] to unicode. s is supposed to be an RPython string
+    encoded in latin-1, which means that the numeric value of each char is the
+    same as the corresponding unicode code point.
+
+    Internally it's implemented at the level of low-level helpers, to avoid
+    the extra copy we would need if we take the actual slice first.
+    
+    No bound checking is done, use carefully.
+    """
+    from rpython.rtyper.annlowlevel import llstr, hlunicode
+    from rpython.rtyper.lltypesystem.rstr import malloc, UNICODE
+    from rpython.rtyper.lltypesystem.lltype import cast_primitive, UniChar
+    length = end-start
+    ll_s = llstr(s)
+    ll_res = malloc(UNICODE, length)
+    ll_res.hash = 0
+    for i in range(length):
+        ch = ll_s.chars[start+i]
+        ll_res.chars[i] = cast_primitive(UniChar, ch)
+    return hlunicode(ll_res)
+
+TYPE_UNKNOWN = 0
+TYPE_STRING = 1
+class JSONDecoder(object):
+    def __init__(self, space, s):
+        self.space = space
+        self.s = s
+        # we put our string in a raw buffer so:
+        # 1) we automatically get the '\0' sentinel at the end of the string,
+        #    which means that we never have to check for the "end of string"
+        # 2) we can pass the buffer directly to strtod
+        self.ll_chars = rffi.str2charp(s)
+        self.end_ptr = lltype.malloc(rffi.CCHARPP.TO, 1, flavor='raw')
+        self.pos = 0
+        self.last_type = TYPE_UNKNOWN
+
+    def close(self):
+        rffi.free_charp(self.ll_chars)
+        lltype.free(self.end_ptr, flavor='raw')
+
+    def getslice(self, start, end):
+        assert start >= 0
+        assert end >= 0
+        return self.s[start:end]
+
+    def skip_whitespace(self, i):
+        while True:
+            ch = self.ll_chars[i]
+            if is_whitespace(ch):
+                i+=1
+            else:
+                break
+        return i
+
+    @specialize.arg(1)
+    def _raise(self, msg, *args):
+        raise operationerrfmt(self.space.w_ValueError, msg, *args)
+
+    def decode_any(self, i):
+        i = self.skip_whitespace(i)
+        ch = self.ll_chars[i]
+        if ch == '"':
+            return self.decode_string(i+1)
+        elif ch == '[':
+            return self.decode_array(i+1)
+        elif ch == '{':
+            return self.decode_object(i+1)
+        elif ch == 'n':
+            return self.decode_null(i+1)
+        elif ch == 't':
+            return self.decode_true(i+1)
+        elif ch == 'f':
+            return self.decode_false(i+1)
+        elif ch == 'I':
+            return self.decode_infinity(i+1)
+        elif ch == 'N':
+            return self.decode_nan(i+1)
+        elif ch == '-':
+            if self.ll_chars[i+1] == 'I':
+                return self.decode_infinity(i+2, sign=-1)
+            return self.decode_numeric(i)
+        elif ch.isdigit():
+            return self.decode_numeric(i)
+        else:
+            self._raise("No JSON object could be decoded: unexpected '%s' at char %d",
+                        ch, self.pos)
+
+    def decode_null(self, i):
+        if (self.ll_chars[i]   == 'u' and
+            self.ll_chars[i+1] == 'l' and
+            self.ll_chars[i+2] == 'l'):
+            self.pos = i+3
+            return self.space.w_None
+        self._raise("Error when decoding null at char %d", i)
+
+    def decode_true(self, i):
+        if (self.ll_chars[i]   == 'r' and
+            self.ll_chars[i+1] == 'u' and
+            self.ll_chars[i+2] == 'e'):
+            self.pos = i+3
+            return self.space.w_True
+        self._raise("Error when decoding true at char %d", i)
+
+    def decode_false(self, i):
+        if (self.ll_chars[i]   == 'a' and
+            self.ll_chars[i+1] == 'l' and
+            self.ll_chars[i+2] == 's' and
+            self.ll_chars[i+3] == 'e'):
+            self.pos = i+4
+            return self.space.w_False
+        self._raise("Error when decoding false at char %d", i)
+
+    def decode_infinity(self, i, sign=1):
+        if (self.ll_chars[i]   == 'n' and
+            self.ll_chars[i+1] == 'f' and
+            self.ll_chars[i+2] == 'i' and
+            self.ll_chars[i+3] == 'n' and
+            self.ll_chars[i+4] == 'i' and
+            self.ll_chars[i+5] == 't' and
+            self.ll_chars[i+6] == 'y'):
+            self.pos = i+7
+            return self.space.wrap(rfloat.INFINITY * sign)
+        self._raise("Error when decoding Infinity at char %d", i)
+
+    def decode_nan(self, i):
+        if (self.ll_chars[i]   == 'a' and
+            self.ll_chars[i+1] == 'N'):
+            self.pos = i+2
+            return self.space.wrap(rfloat.NAN)
+        self._raise("Error when decoding NaN at char %d", i)
+
+    def decode_numeric(self, i):
+        start = i
+        i, ovf_maybe, intval = self.parse_integer(i)
+        #
+        # check for the optional fractional part
+        ch = self.ll_chars[i]
+        if ch == '.':
+            if not self.ll_chars[i+1].isdigit():
+                self._raise("Expected digit at char %d", i+1)
+            return self.decode_float(start)
+        elif ch == 'e' or ch == 'E':
+            return self.decode_float(start)
+        elif ovf_maybe:
+            return self.decode_int_slow(start)
+
+        self.pos = i
+        return self.space.wrap(intval)
+
+    def decode_float(self, i):
+        from rpython.rlib import rdtoa
+        start = rffi.ptradd(self.ll_chars, i)
+        floatval = rdtoa.dg_strtod(start, self.end_ptr)
+        diff = rffi.cast(rffi.LONG, self.end_ptr[0]) - rffi.cast(rffi.LONG, start)
+        self.pos = i + diff
+        return self.space.wrap(floatval)
+
+    def decode_int_slow(self, i):
+        start = i
+        if self.ll_chars[i] == '-':
+            i += 1
+        while self.ll_chars[i].isdigit():
+            i += 1
+        s = self.getslice(start, i)
+        self.pos = i
+        return self.space.call_function(self.space.w_int, self.space.wrap(s))
+
+    def parse_integer(self, i):
+        "Parse a decimal number with an optional minus sign"
+        sign = 1
+        # parse the sign
+        if self.ll_chars[i] == '-':
+            sign = -1
+            i += 1
+        elif self.ll_chars[i] == '+':
+            i += 1
+        #
+        if self.ll_chars[i] == '0':
+            i += 1
+            return i, False, 0
+
+        intval = 0
+        start = i
+        while True:
+            ch = self.ll_chars[i]
+            if ch.isdigit():
+                intval = intval*10 + ord(ch)-ord('0')
+                i += 1
+            else:
+                break
+        count = i - start
+        if count == 0:
+            self._raise("Expected digit at char %d", i)
+        # if the number has at least OVF_DIGITS digits, it might have
+        # overflowed
+        ovf_maybe = (count >= OVF_DIGITS)
+        return i, ovf_maybe, sign * intval
+    parse_integer._always_inline_ = True
+
+    def decode_array(self, i):
+        w_list = self.space.newlist([])
+        start = i
+        count = 0
+        i = self.skip_whitespace(start)
+        if self.ll_chars[i] == ']':
+            self.pos = i+1
+            return w_list
+        #
+        while True:
+            w_item = self.decode_any(i)
+            i = self.pos
+            self.space.call_method(w_list, 'append', w_item)
+            i = self.skip_whitespace(i)
+            ch = self.ll_chars[i]
+            i += 1
+            if ch == ']':
+                self.pos = i
+                return w_list
+            elif ch == ',':
+                pass
+            elif ch == '\0':
+                self._raise("Unterminated array starting at char %d", start)
+            else:
+                self._raise("Unexpected '%s' when decoding array (char %d)",
+                            ch, self.pos)
+
+    def decode_object(self, i):
+        start = i
+        w_dict = self.space.newdict()
+        #
+        i = self.skip_whitespace(i)
+        if self.ll_chars[i] == '}':
+            self.pos = i+1
+            return w_dict
+        #
+        while True:
+            # parse a key: value
+            self.last_type = TYPE_UNKNOWN
+            w_name = self.decode_any(i)
+            if self.last_type != TYPE_STRING:
+                self._raise("Key name must be string for object starting at char %d", start)
+            i = self.skip_whitespace(self.pos)
+            ch = self.ll_chars[i]
+            if ch != ':':
+                self._raise("No ':' found at char %d", i)
+            i += 1
+            i = self.skip_whitespace(i)
+            #
+            w_value = self.decode_any(i)
+            self.space.setitem(w_dict, w_name, w_value)
+            i = self.skip_whitespace(self.pos)
+            ch = self.ll_chars[i]
+            i += 1
+            if ch == '}':
+                self.pos = i
+                return w_dict
+            elif ch == ',':
+                pass
+            elif ch == '\0':
+                self._raise("Unterminated object starting at char %d", start)
+            else:
+                self._raise("Unexpected '%s' when decoding object (char %d)",
+                            ch, self.pos)
+
+
+    def decode_string(self, i):
+        start = i
+        bits = 0
+        while True:
+            # this loop is a fast path for strings which do not contain escape
+            # characters
+            ch = self.ll_chars[i]
+            i += 1
+            bits |= ord(ch)
+            if ch == '"':
+                if bits & 0x80:
+                    # the 8th bit is set, it's an utf8 strnig
+                    content_utf8 = self.getslice(start, i-1)
+                    content_unicode = unicodehelper.decode_utf8(self.space, content_utf8)
+                else:
+                    # ascii only, fast path (ascii is a strict subset of
+                    # latin1, and we already checked that all the chars are <
+                    # 128)
+                    content_unicode = strslice2unicode_latin1(self.s, start, i-1)
+                self.last_type = TYPE_STRING
+                self.pos = i
+                return self.space.wrap(content_unicode)
+            elif ch == '\\':
+                content_so_far = self.getslice(start, i-1)
+                self.pos = i-1
+                return self.decode_string_escaped(start, content_so_far)
+            elif ch == '\0':
+                self._raise("Unterminated string starting at char %d", start)
+
+
+    def decode_string_escaped(self, start, content_so_far):
+        builder = StringBuilder(len(content_so_far)*2) # just an estimate
+        builder.append(content_so_far)
+        i = self.pos
+        while True:
+            ch = self.ll_chars[i]
+            i += 1
+            if ch == '"':
+                content_utf8 = builder.build()
+                content_unicode = unicodehelper.decode_utf8(self.space, content_utf8)
+                self.last_type = TYPE_STRING
+                self.pos = i
+                return self.space.wrap(content_unicode)
+            elif ch == '\\':
+                i = self.decode_escape_sequence(i, builder)
+            elif ch == '\0':
+                self._raise("Unterminated string starting at char %d", start)
+            else:
+                builder.append_multiple_char(ch, 1) # we should implement append_char
+
+    def decode_escape_sequence(self, i, builder):
+        ch = self.ll_chars[i]
+        i += 1
+        put = builder.append_multiple_char
+        if ch == '\\':  put('\\', 1)
+        elif ch == '"': put('"' , 1)
+        elif ch == '/': put('/' , 1)
+        elif ch == 'b': put('\b', 1)
+        elif ch == 'f': put('\f', 1)
+        elif ch == 'n': put('\n', 1)
+        elif ch == 'r': put('\r', 1)
+        elif ch == 't': put('\t', 1)
+        elif ch == 'u':
+            return self.decode_escape_sequence_unicode(i, builder)
+        else:
+            self._raise("Invalid \\escape: %s (char %d)", ch, self.pos-1)
+        return i
+
+    def decode_escape_sequence_unicode(self, i, builder):
+        # at this point we are just after the 'u' of the \u1234 sequence.
+        start = i
+        i += 4
+        hexdigits = self.getslice(start, i)
+        try:
+            val = int(hexdigits, 16)
+            if val & 0xfc00 == 0xd800:
+                # surrogate pair
+                val = self.decode_surrogate_pair(i, val)
+                i += 6
+        except ValueError:
+            self._raise("Invalid \uXXXX escape (char %d)", i-1)
+            return # help the annotator to know that we'll never go beyond
+                   # this point
+        #
+        uchr = unichr(val)
+        utf8_ch = unicodehelper.encode_utf8(self.space, uchr)
+        builder.append(utf8_ch)
+        return i
+
+    def decode_surrogate_pair(self, i, highsurr):
+        if self.ll_chars[i] != '\\' or self.ll_chars[i+1] != 'u':
+            self._raise("Unpaired high surrogate at char %d", i)
+        i += 2
+        hexdigits = self.getslice(i, i+4)
+        lowsurr = int(hexdigits, 16) # the possible ValueError is caugth by the caller
+        return 0x10000 + (((highsurr - 0xd800) << 10) | (lowsurr - 0xdc00))
+
+def loads(space, w_s):
+    if space.isinstance_w(w_s, space.w_unicode):
+        raise OperationError(space.w_TypeError,
+                             space.wrap("Expected utf8-encoded str, got unicode"))
+    s = space.str_w(w_s)
+    decoder = JSONDecoder(space, s)
+    try:
+        w_res = decoder.decode_any(0)
+        i = decoder.skip_whitespace(decoder.pos)
+        if i < len(s):
+            start = i
+            end = len(s) - 1
+            raise operationerrfmt(space.w_ValueError, "Extra data: char %d - %d", start, end)
+        return w_res
+    finally:
+        decoder.close()

File pypy/module/_pypyjson/test/test__pypyjson.py

+# -*- encoding: utf-8 -*-
+import py
+from pypy.module._pypyjson.interp_decoder import JSONDecoder
+
+def test_skip_whitespace():
+    s = '   hello   '
+    dec = JSONDecoder('fake space', s)
+    assert dec.pos == 0
+    assert dec.skip_whitespace(0) == 3
+    assert dec.skip_whitespace(3) == 3
+    assert dec.skip_whitespace(8) == len(s)
+    dec.close()
+
+    
+
+class AppTest(object):
+    spaceconfig = {"objspace.usemodules._pypyjson": True}
+
+    def test_raise_on_unicode(self):
+        import _pypyjson
+        raises(TypeError, _pypyjson.loads, u"42")
+
+
+    def test_decode_constants(self):
+        import _pypyjson
+        assert _pypyjson.loads('null') is None
+        raises(ValueError, _pypyjson.loads, 'nul')
+        raises(ValueError, _pypyjson.loads, 'nu')
+        raises(ValueError, _pypyjson.loads, 'n')
+        raises(ValueError, _pypyjson.loads, 'nuXX')
+        #
+        assert _pypyjson.loads('true') is True
+        raises(ValueError, _pypyjson.loads, 'tru')
+        raises(ValueError, _pypyjson.loads, 'tr')
+        raises(ValueError, _pypyjson.loads, 't')
+        raises(ValueError, _pypyjson.loads, 'trXX')
+        #
+        assert _pypyjson.loads('false') is False
+        raises(ValueError, _pypyjson.loads, 'fals')
+        raises(ValueError, _pypyjson.loads, 'fal')
+        raises(ValueError, _pypyjson.loads, 'fa')
+        raises(ValueError, _pypyjson.loads, 'f')
+        raises(ValueError, _pypyjson.loads, 'falXX')
+        
+
+    def test_decode_string(self):
+        import _pypyjson
+        res = _pypyjson.loads('"hello"')
+        assert res == u'hello'
+        assert type(res) is unicode
+
+    def test_decode_string_utf8(self):
+        import _pypyjson
+        s = u'àèìòù'
+        res = _pypyjson.loads('"%s"' % s.encode('utf-8'))
+        assert res == s
+
+    def test_skip_whitespace(self):
+        import _pypyjson
+        s = '   "hello"   '
+        assert _pypyjson.loads(s) == u'hello'
+        s = '   "hello"   extra'
+        raises(ValueError, "_pypyjson.loads(s)")
+
+    def test_unterminated_string(self):
+        import _pypyjson
+        s = '"hello' # missing the trailing "
+        raises(ValueError, "_pypyjson.loads(s)")
+
+    def test_escape_sequence(self):
+        import _pypyjson
+        assert _pypyjson.loads(r'"\\"') == u'\\'
+        assert _pypyjson.loads(r'"\""') == u'"'
+        assert _pypyjson.loads(r'"\/"') == u'/'       
+        assert _pypyjson.loads(r'"\b"') == u'\b'
+        assert _pypyjson.loads(r'"\f"') == u'\f'
+        assert _pypyjson.loads(r'"\n"') == u'\n'
+        assert _pypyjson.loads(r'"\r"') == u'\r'
+        assert _pypyjson.loads(r'"\t"') == u'\t'
+
+    def test_escape_sequence_in_the_middle(self):
+        import _pypyjson
+        s = r'"hello\nworld"'
+        assert _pypyjson.loads(s) == "hello\nworld"
+
+    def test_unterminated_string_after_escape_sequence(self):
+        import _pypyjson
+        s = r'"hello\nworld' # missing the trailing "
+        raises(ValueError, "_pypyjson.loads(s)")
+        
+    def test_escape_sequence_unicode(self):
+        import _pypyjson
+        s = r'"\u1234"'
+        assert _pypyjson.loads(s) == u'\u1234'
+
+    def test_invalid_utf_8(self):
+        import _pypyjson
+        s = '"\xe0"' # this is an invalid UTF8 sequence inside a string
+        raises(UnicodeDecodeError, "_pypyjson.loads(s)")
+
+    def test_decode_numeric(self):
+        import sys
+        import _pypyjson
+        def check(s, val):
+            res = _pypyjson.loads(s)
+            assert type(res) is type(val)
+            assert res == val
+        #
+        check('42', 42)
+        check('-42', -42)
+        check('42.123', 42.123)
+        check('42E0', 42.0)
+        check('42E3', 42000.0)
+        check('42E-1', 4.2)
+        check('42E+1', 420.0)
+        check('42.123E3', 42123.0)
+        check('0', 0)
+        check('-0', 0)
+        check('0.123', 0.123)
+        check('0E3', 0.0)
+        check('5E0001', 50.0)
+        check(str(1 << 32), 1 << 32)
+        check(str(1 << 64), 1 << 64)
+        #
+        x = str(sys.maxint+1) + '.123'
+        check(x, float(x))
+        x = str(sys.maxint+1) + 'E1'
+        check(x, float(x))
+        x = str(sys.maxint+1) + 'E-1'
+        check(x, float(x))
+        #
+        check('1E400', float('inf'))
+        ## # these are non-standard but supported by CPython json
+        check('Infinity', float('inf'))
+        check('-Infinity', float('-inf'))
+
+    def test_nan(self):
+        import math
+        import _pypyjson
+        res = _pypyjson.loads('NaN')
+        assert math.isnan(res)
+
+    def test_decode_numeric_invalid(self):
+        import _pypyjson
+        def error(s):
+            raises(ValueError, _pypyjson.loads, s)
+        #
+        error('  42   abc')
+        error('.123')
+        error('+123')
+        error('12.')
+        error('12.-3')
+        error('12E')
+        error('12E-')
+        error('0123') # numbers can't start with 0
+
+    def test_decode_object(self):
+        import _pypyjson
+        assert _pypyjson.loads('{}') == {}
+        assert _pypyjson.loads('{  }') == {}
+        #
+        s = '{"hello": "world", "aaa": "bbb"}'
+        assert _pypyjson.loads(s) == {'hello': 'world',
+                                      'aaa': 'bbb'}
+        raises(ValueError, _pypyjson.loads, '{"key"')
+        raises(ValueError, _pypyjson.loads, '{"key": 42')
+
+    def test_decode_object_nonstring_key(self):
+        import _pypyjson
+        raises(ValueError, "_pypyjson.loads('{42: 43}')")
+        
+    def test_decode_array(self):
+        import _pypyjson
+        assert _pypyjson.loads('[]') == []
+        assert _pypyjson.loads('[  ]') == []
+        assert _pypyjson.loads('[1]') == [1]
+        assert _pypyjson.loads('[1, 2]') == [1, 2]
+        raises(ValueError, "_pypyjson.loads('[1: 2]')")
+        raises(ValueError, "_pypyjson.loads('[1, 2')")
+        raises(ValueError, """_pypyjson.loads('["extra comma",]')""")
+        
+    def test_unicode_surrogate_pair(self):
+        import _pypyjson
+        expected = u'z\U0001d120x'
+        res = _pypyjson.loads('"z\\ud834\\udd20x"')
+        assert res == expected
+
+