Commits

Tyler Wade  committed aeafcd4

Fix cpyext, but better this time

  • Participants
  • Parent commits cc1160f
  • Branches utf8-unicode2

Comments (0)

Files changed (11)

File pypy/interpreter/test/test_utf8.py

         assert s == u
 
     rffi.free_wcharp(wcharp)
+
+def test_from_wcharpsize():
+    u = u'A\u010F\0\u20AC\U0001F63D'
+    wcharp = rffi.unicode2wcharp(u)
+    s = Utf8Str.from_wcharpsize(wcharp, 4)
+    assert s == u[:4]
+
+    rffi.free_wcharp(wcharp)

File pypy/interpreter/test/test_utf8_codecs.py

 
     def test_encode_decimal(self):
         encoder = self.getencoder('decimal')
-        assert encoder(u' 12, 34 ', 8, None) == ' 12, 34 '
-        py.test.raises(UnicodeEncodeError, encoder, u' 12, \u1234 ', 7, None)
-        assert encoder(u'u\u1234', 2, 'replace') == 'u?'
+        assert encoder(Utf8Str(' 12, 34 '), 8, None) == ' 12, 34 '
+        py.test.raises(UnicodeEncodeError, encoder,
+                       Utf8Str.from_unicode(u' 12, \u1234 '), 7, None)
+        assert encoder(Utf8Str.from_unicode(u'u\u1234'), 2, 'replace') == 'u?'
 
 
 class TestTranslation(object):

File pypy/interpreter/utf8.py

 
         return builder.build()
 
+    @staticmethod
+    def from_wcharpsize(wcharp, size):
+        array = rffi.cast(WCHAR_INTP, wcharp)
+        builder = Utf8Builder()
+        i = 0;
+        while i < size:
+            c = int(array[i])
+
+            if rffi.sizeof(rffi.WCHAR_T) == 2:
+                if i != size - 1 and 0xD800 <= c <= 0xDBFF:
+                    i += 1
+                    c2 = int(array[i])
+                    if not (0xDC00 <= c2 <= 0xDFFF):
+                        builder.append(c)
+                        c = c2
+                    else:
+                        c = (((c & 0x3FF)<<10) | (c2 & 0x3FF)) + 0x10000;
+
+            builder.append(c)
+            i += 1
+
+        return builder.build()
+
 class Utf8Builder(object):
     @specialize.argtype(1)
     def __init__(self, init_size=None):

File pypy/interpreter/utf8_codecs.py

             errorhandler('strict', 'mbcs', msg, s, 0, 0)
 
         if size == 0:
-            return u"", 0
+            return Utf8Str(""), 0
 
         if force_ignore or errors == 'ignore':
             flags = 0
     result = StringBuilder(size)
     pos = 0
     while pos < size:
-        ch = ord(s[pos])
+        ch = utf8ord(s, pos)
         if unicodedb.isspace(ch):
             result.append(' ')
             pos += 1
         collstart = pos
         collend = collstart + 1
         while collend < size:
-            ch = ord(s[collend])
+            ch = utf8ord(s, collend)
             try:
                 if (0 < ch < 256 or
                     unicodedb.isspace(ch) or

File pypy/module/_codecs/interp_codecs.py

         start = space.int_w(space.getattr(w_exc, space.wrap('start')))
         w_end = space.getattr(w_exc, space.wrap('end'))
         end = space.int_w(w_end)
-        builder = UnicodeBuilder()
+        builder = Utf8Builder()
         pos = start
         while pos < end:
             code = utf8ord(obj, pos)
-            if (MAXUNICODE == 0xffff and 0xD800 <= code <= 0xDBFF and
-                       pos + 1 < end and 0xDC00 <= ord(obj[pos+1]) <= 0xDFFF):
-                code = (code & 0x03FF) << 10
-                code |= ord(obj[pos+1]) & 0x03FF
-                code += 0x10000
-                pos += 1
-            builder.append(u"&#")
-            builder.append(unicode(str(code)))
-            builder.append(u";")
+            builder.append("&#")
+            builder.append(str(code))
+            builder.append(";")
             pos += 1
         return space.newtuple([space.wrap(builder.build()), w_end])
     else:

File pypy/module/cpyext/object.py

 from pypy.module.cpyext.pyerrors import PyErr_NoMemory, PyErr_BadInternalCall
 from pypy.objspace.std.typeobject import W_TypeObject
 from pypy.interpreter.error import OperationError
+from pypy.interpreter.utf8 import Utf8Str
 import pypy.module.__builtin__.operation as operation
 
 
     the Python expression unicode(o).  Called by the unicode() built-in
     function."""
     if w_obj is None:
-        return space.wrap(u"<NULL>")
+        return space.wrap(Utf8Str("<NULL>"))
     return space.call_function(space.w_unicode, w_obj)
 
 @cpython_api([PyObject, PyObject], rffi.INT_real, error=-1)

File pypy/module/cpyext/test/test_codecs.py

 # encoding: iso-8859-15
+from pypy.interpreter.utf8 import Utf8Str
 from pypy.module.cpyext.test.test_api import BaseApiTest
 from rpython.rtyper.lltypesystem import rffi, lltype
 
     def test_incremental(self, space, api):
         utf8 = rffi.str2charp('utf-8')
         w_encoder = api.PyCodec_IncrementalEncoder(utf8, None)
-        w_encoded = space.call_method(w_encoder, 'encode', space.wrap(u'späm'))
+        w_encoded = space.call_method(w_encoder, 'encode',
+                                     space.wrap(Utf8Str.from_unicode(u'späm')))
         w_decoder = api.PyCodec_IncrementalDecoder(utf8, None)
         w_decoded = space.call_method(w_decoder, 'decode', w_encoded)
         assert space.unwrap(w_decoded) == u'späm'

File pypy/module/cpyext/test/test_object.py

 import py
 
+from pypy.interpreter.utf8 import Utf8Str
 from pypy.module.cpyext.test.test_api import BaseApiTest
 from pypy.module.cpyext.test.test_cpyext import AppTestCpythonExtensionBase
 from rpython.rtyper.lltypesystem import rffi, lltype
             assert ptr[0] == -1
             assert api.PyObject_Cmp(w("a"), w("a"), ptr) == 0
             assert ptr[0] == 0
-            assert api.PyObject_Cmp(w(u"\xe9"), w("\xe9"), ptr) < 0
+            assert api.PyObject_Cmp(w(Utf8Str.from_unicode(u"\xe9")), w("\xe9"), ptr) < 0
             assert api.PyErr_Occurred()
             api.PyErr_Clear()
 

File pypy/module/cpyext/test/test_sequence.py

 from rpython.rtyper.lltypesystem import rffi, lltype
 from pypy.interpreter.error import OperationError
+from pypy.interpreter.utf8 import Utf8Str
 from pypy.module.cpyext.test.test_api import BaseApiTest
 from pypy.module.cpyext import sequence
 import py.test
 
     def test_contains(self, space, api):
         w_t = space.wrap((1, 'ha'))
-        assert api.PySequence_Contains(w_t, space.wrap(u'ha'))
+        assert api.PySequence_Contains(w_t, space.wrap(Utf8Str.from_unicode('ha')))
         assert not api.PySequence_Contains(w_t, space.wrap(2))
         assert api.PySequence_Contains(space.w_None, space.wrap(2)) == -1
         assert api.PyErr_Occurred()

File pypy/module/cpyext/test/test_unicodeobject.py

 # encoding: iso-8859-15
+from pypy.interpreter.utf8 import Utf8Str
 from pypy.module.cpyext.test.test_api import BaseApiTest
 from pypy.module.cpyext.test.test_cpyext import AppTestCpythonExtensionBase
 from pypy.module.cpyext.unicodeobject import (
 
 class TestUnicode(BaseApiTest):
     def test_unicodeobject(self, space, api):
-        assert api.PyUnicode_GET_SIZE(space.wrap(u'späm')) == 4
-        assert api.PyUnicode_GetSize(space.wrap(u'späm')) == 4
+        wrap_u = lambda x: space.wrap(Utf8Str.from_unicode(x))
+        assert api.PyUnicode_GET_SIZE(wrap_u(u'späm')) == 4
+        assert api.PyUnicode_GetSize(wrap_u(u'späm')) == 4
         unichar = rffi.sizeof(Py_UNICODE)
-        assert api.PyUnicode_GET_DATA_SIZE(space.wrap(u'späm')) == 4 * unichar
+        assert api.PyUnicode_GET_DATA_SIZE(wrap_u(u'späm')) == 4 * unichar
 
         encoding = rffi.charp2str(api.PyUnicode_GetDefaultEncoding())
         w_default_encoding = space.call_function(
         rffi.free_charp(prev_encoding)
 
     def test_AS(self, space, api):
-        word = space.wrap(u'spam')
+        wrap_u = lambda x: space.wrap(Utf8Str.from_unicode(x))
+        word = space.wrap(Utf8Str('spam'))
         array = rffi.cast(rffi.CWCHARP, api.PyUnicode_AS_DATA(word))
         array2 = api.PyUnicode_AS_UNICODE(word)
         array3 = api.PyUnicode_AsUnicode(word)
                     space.wrap('spam'))
 
         utf_8 = rffi.str2charp('utf-8')
-        encoded = api.PyUnicode_AsEncodedString(space.wrap(u'späm'),
+        encoded = api.PyUnicode_AsEncodedString(wrap_u(u'späm'),
                                                 utf_8, None)
         assert space.unwrap(encoded) == 'sp\xc3\xa4m'
-        encoded_obj = api.PyUnicode_AsEncodedObject(space.wrap(u'späm'),
+        encoded_obj = api.PyUnicode_AsEncodedObject(wrap_u(u'späm'),
                                                 utf_8, None)
         assert space.eq_w(encoded, encoded_obj)
         self.raises(space, api, TypeError, api.PyUnicode_AsEncodedString,
                space.wrap(''), None, None)
         ascii = rffi.str2charp('ascii')
         replace = rffi.str2charp('replace')
-        encoded = api.PyUnicode_AsEncodedString(space.wrap(u'späm'),
+        encoded = api.PyUnicode_AsEncodedString(wrap_u(u'späm'),
                                                 ascii, replace)
         assert space.unwrap(encoded) == 'sp?m'
         rffi.free_charp(utf_8)
         rffi.free_charp(ascii)
 
         buf = rffi.unicode2wcharp(u"12345")
-        api.PyUnicode_AsWideChar(space.wrap(u'longword'), buf, 5)
+        api.PyUnicode_AsWideChar(wrap_u(u'longword'), buf, 5)
         assert rffi.wcharp2unicode(buf) == 'longw'
-        api.PyUnicode_AsWideChar(space.wrap(u'a'), buf, 5)
+        api.PyUnicode_AsWideChar(wrap_u(u'a'), buf, 5)
         assert rffi.wcharp2unicode(buf) == 'a'
         rffi.free_wcharp(buf)
 
         lltype.free(ar, flavor='raw')
 
     def test_AsUTF8String(self, space, api):
-        w_u = space.wrap(u'späm')
+        w_u = space.wrap(Utf8Str.from_unicode(u'späm'))
         w_res = api.PyUnicode_AsUTF8String(w_u)
         assert space.type(w_res) is space.w_str
         assert space.unwrap(w_res) == 'sp\xc3\xa4m'
         assert api.Py_UNICODE_TONUMERIC(u'\N{VULGAR FRACTION ONE HALF}') == .5
 
     def test_fromobject(self, space, api):
-        w_u = space.wrap(u'a')
+        w_u = space.wrap(Utf8Str('a'))
         assert api.PyUnicode_FromObject(w_u) is w_u
         assert space.unwrap(
             api.PyUnicode_FromObject(space.wrap('test'))) == u'test'
         assert space.isinstance_w(w_text, space.w_unicode)
         assert space.unwrap(w_text) == u"test"
 
-        assert api.PyUnicode_FromEncodedObject(space.wrap(u"test"), b_encoding, None) is None
+        assert api.PyUnicode_FromEncodedObject(space.wrap(Utf8Str("test")),
+                                               b_encoding, None) is None
         assert api.PyErr_Occurred() is space.w_TypeError
         assert api.PyUnicode_FromEncodedObject(space.wrap(1), b_encoding, None) is None
         assert api.PyErr_Occurred() is space.w_TypeError
 
     def test_decode_null_encoding(self, space, api):
         null_charp = lltype.nullptr(rffi.CCHARP.TO)
-        u_text = u'abcdefg'
+        u_text = Utf8Str('abcdefg')
         s_text = space.str_w(api.PyUnicode_AsEncodedString(space.wrap(u_text), null_charp, null_charp))
         b_text = rffi.str2charp(s_text)
         assert space.unwrap(api.PyUnicode_Decode(b_text, len(s_text), null_charp, null_charp)) == u_text
 
     def test_escape(self, space, api):
         def test(ustr):
-            w_ustr = space.wrap(ustr.decode('Unicode-Escape'))
+            w_ustr = space.wrap(Utf8Str.from_unicode(ustr.decode('Unicode-Escape')))
             result = api.PyUnicode_AsUnicodeEscapeString(w_ustr)
             assert space.eq_w(space.wrap(ustr), result)
 
 
     def test_ascii(self, space, api):
         ustr = "abcdef"
-        w_ustr = space.wrap(ustr.decode("ascii"))
+        w_ustr = space.wrap(Utf8Str.from_unicode(ustr.decode("ascii")))
         result = api.PyUnicode_AsASCIIString(w_ustr)
         
         assert space.eq_w(space.wrap(ustr), result)
 
-        w_ustr = space.wrap(u"abcd\xe9f")
+        w_ustr = space.wrap(Utf8Str.from_unicode(u"abcd\xe9f"))
         self.raises(space, api, UnicodeEncodeError, api.PyUnicode_AsASCIIString, w_ustr)
 
     def test_decode_utf16(self, space, api):
         assert api.PyUnicode_Compare(space.wrap('a'), space.wrap('b')) == -1
 
     def test_copy(self, space, api):
-        w_x = space.wrap(u"abcd\u0660")
+        w_x = space.wrap(Utf8Str.from_unicode(u"abcd\u0660"))
         target_chunk, _ = rffi.alloc_unicodebuffer(space.int_w(space.len(w_x)))
         #lltype.malloc(Py_UNICODE, space.int_w(space.len(w_x)), flavor='raw')
 
         x_chunk = api.PyUnicode_AS_UNICODE(w_x)
         api.Py_UNICODE_COPY(target_chunk, x_chunk, 4)
-        w_y = space.wrap(rffi.wcharpsize2unicode(target_chunk, 4))
+        w_y = space.wrap(Utf8Str.from_wcharpsize(target_chunk, 4))
 
-        assert space.eq_w(w_y, space.wrap(u"abcd"))
+        assert space.eq_w(w_y, space.wrap(Utf8Str("abcd")))
 
         size = api.PyUnicode_GET_SIZE(w_x)
         api.Py_UNICODE_COPY(target_chunk, x_chunk, size)
-        w_y = space.wrap(rffi.wcharpsize2unicode(target_chunk, size))
+        w_y = space.wrap(Utf8Str.from_wcharpsize(target_chunk, size))
 
         assert space.eq_w(w_y, w_x)
 
         s = 'abcdefg'
         data = rffi.str2charp(s)
         w_u = api.PyUnicode_DecodeASCII(data, len(s), lltype.nullptr(rffi.CCHARP.TO))
-        assert space.eq_w(w_u, space.wrap(u"abcdefg"))
+        assert space.eq_w(w_u, space.wrap(Utf8Str("abcdefg")))
         rffi.free_charp(data)
 
         s = 'abcd\xFF'
         s = 'abcdefg'
         data = rffi.str2charp(s)
         w_u = api.PyUnicode_DecodeLatin1(data, len(s), lltype.nullptr(rffi.CCHARP.TO))
-        assert space.eq_w(w_u, space.wrap(u"abcdefg"))
+        assert space.eq_w(w_u, space.wrap(Utf8Str("abcdefg")))
         rffi.free_charp(data)
 
         uni = u'abcdefg'
         rffi.free_wcharp(data)
 
         ustr = "abcdef"
-        w_ustr = space.wrap(ustr.decode("ascii"))
+        w_ustr = space.wrap(Utf8Str.from_unicode(ustr.decode("ascii")))
         result = api.PyUnicode_AsLatin1String(w_ustr)
         assert space.eq_w(space.wrap(ustr), result)
 
     def test_format(self, space, api):
-        w_format = space.wrap(u'hi %s')
-        w_args = space.wrap((u'test',))
+        w_format = space.wrap(Utf8Str('hi %s'))
+        w_args = space.wrap((Utf8Str('test'),))
         w_formated = api.PyUnicode_Format(w_format, w_args)
         assert space.unwrap(w_formated) == space.unwrap(space.mod(w_format, w_args))
 
     def test_join(self, space, api):
-        w_sep = space.wrap(u'<sep>')
-        w_seq = space.wrap([u'a', u'b'])
+        w_sep = space.wrap(Utf8Str('<sep>'))
+        w_seq = space.wrap([Utf8Str('a'), Utf8Str('b')])
         w_joined = api.PyUnicode_Join(w_sep, w_seq)
         assert space.unwrap(w_joined) == u'a<sep>b'
 
         assert space.unwrap(w_char) == u'\uFFFF'
 
     def test_replace(self, space, api):
-        w_str = space.wrap(u"abababab")
-        w_substr = space.wrap(u"a")
-        w_replstr = space.wrap(u"z")
+        w_str = space.wrap(Utf8Str("abababab"))
+        w_substr = space.wrap(Utf8Str("a"))
+        w_replstr = space.wrap(Utf8Str("z"))
         assert u"zbzbabab" == space.unwrap(
             api.PyUnicode_Replace(w_str, w_substr, w_replstr, 2))
         assert u"zbzbzbzb" == space.unwrap(
             api.PyUnicode_Replace(w_str, w_substr, w_replstr, -1))
 
     def test_tailmatch(self, space, api):
-        w_str = space.wrap(u"abcdef")
+        w_str = space.wrap(Utf8Str("abcdef"))
         # prefix match
         assert api.PyUnicode_Tailmatch(w_str, space.wrap("cde"), 2, 9, -1) == 1
         assert api.PyUnicode_Tailmatch(w_str, space.wrap("cde"), 2, 4, -1) == 0 # ends at 'd'
                     2, 10, 1)
 
     def test_count(self, space, api):
-        w_str = space.wrap(u"abcabdab")
-        assert api.PyUnicode_Count(w_str, space.wrap(u"ab"), 0, -1) == 2
-        assert api.PyUnicode_Count(w_str, space.wrap(u"ab"), 0, 2) == 1
-        assert api.PyUnicode_Count(w_str, space.wrap(u"ab"), -5, 30) == 2
+        wrap_u = lambda x: space.wrap(Utf8Str.from_unicode(x))
+        w_str = wrap_u(u"abcabdab")
+        assert api.PyUnicode_Count(w_str, wrap_u(u"ab"), 0, -1) == 2
+        assert api.PyUnicode_Count(w_str, wrap_u(u"ab"), 0, 2) == 1
+        assert api.PyUnicode_Count(w_str, wrap_u(u"ab"), -5, 30) == 2
 
     def test_find(self, space, api):
-        w_str = space.wrap(u"abcabcd")
-        assert api.PyUnicode_Find(w_str, space.wrap(u"c"), 0, 7, 1) == 2
-        assert api.PyUnicode_Find(w_str, space.wrap(u"c"), 3, 7, 1) == 5
-        assert api.PyUnicode_Find(w_str, space.wrap(u"c"), 0, 7, -1) == 5
-        assert api.PyUnicode_Find(w_str, space.wrap(u"c"), 3, 7, -1) == 5
-        assert api.PyUnicode_Find(w_str, space.wrap(u"c"), 0, 4, -1) == 2
-        assert api.PyUnicode_Find(w_str, space.wrap(u"z"), 0, 4, -1) == -1
+        wrap_u = lambda x: space.wrap(Utf8Str.from_unicode(x))
+        w_str = wrap_u("abcabcd")
+        assert api.PyUnicode_Find(w_str, wrap_u(u"c"), 0, 7, 1) == 2
+        assert api.PyUnicode_Find(w_str, wrap_u(u"c"), 3, 7, 1) == 5
+        assert api.PyUnicode_Find(w_str, wrap_u(u"c"), 0, 7, -1) == 5
+        assert api.PyUnicode_Find(w_str, wrap_u(u"c"), 3, 7, -1) == 5
+        assert api.PyUnicode_Find(w_str, wrap_u(u"c"), 0, 4, -1) == 2
+        assert api.PyUnicode_Find(w_str, wrap_u(u"z"), 0, 4, -1) == -1
 
     def test_split(self, space, api):
-        w_str = space.wrap(u"a\nb\nc\nd")
+        w_str = space.wrap(Utf8Str("a\nb\nc\nd"))
         assert "[u'a', u'b', u'c', u'd']" == space.unwrap(space.repr(
                 api.PyUnicode_Split(w_str, space.wrap('\n'), -1)))
         assert r"[u'a', u'b', u'c\nd']" == space.unwrap(space.repr(
                 api.PyUnicode_Split(w_str, space.wrap('\n'), 2)))
         assert r"[u'a', u'b', u'c d']" == space.unwrap(space.repr(
-                api.PyUnicode_Split(space.wrap(u'a\nb  c d'), None, 2)))
+                api.PyUnicode_Split(space.wrap(Utf8Str('a\nb  c d')), None, 2)))
         assert "[u'a', u'b', u'c', u'd']" == space.unwrap(space.repr(
                 api.PyUnicode_Splitlines(w_str, 0)))
         assert r"[u'a\n', u'b\n', u'c\n', u'd']" == space.unwrap(space.repr(

File pypy/module/cpyext/unicodeobject.py

 from pypy.interpreter.error import OperationError
+from pypy.interpreter.utf8 import Utf8Str
 from pypy.interpreter import utf8_codecs
 from rpython.rtyper.lltypesystem import rffi, lltype
 from pypy.module.unicodedata import unicodedb
     be modified after this call.
     """
     py_uni = rffi.cast(PyUnicodeObject, py_obj)
-    s = rffi.wcharpsize2unicode(py_uni.c_buffer, py_uni.c_size)
+    s = Utf8Str.from_wcharpsize(py_uni.c_buffer, py_uni.c_size)
     w_obj = space.wrap(s)
     track_reference(space, py_obj, w_obj)
     return w_obj
     Therefore, modification of the resulting Unicode object is only allowed when u
     is NULL."""
     if wchar_p:
-        s = rffi.wcharpsize2unicode(wchar_p, length)
+        s = rffi.Utf8Str.from_wcharpsize(wchar_p, length)
         return make_ref(space, space.wrap(s))
     else:
         return rffi.cast(PyObject, new_empty_unicode(space, length))
         """Encode the Py_UNICODE buffer of the given size and return a
         Python string object.  Return NULL if an exception was raised
         by the codec."""
-        w_u = space.wrap(rffi.wcharpsize2unicode(s, size))
+        w_u = space.wrap(Utf8Str.from_wcharpsize(s, size))
         if errors:
             w_errors = space.wrap(rffi.charp2str(errors))
         else:
 
     Returns 0 on success, -1 on failure.
     """
-    u = rffi.wcharpsize2unicode(s, length)
+    u = Utf8Str.from_wcharpsize(s, length)
     if llerrors:
         errors = rffi.charp2str(llerrors)
     else: