Commits

Tyler Wade  committed cc1160f

Fix _rawffi module

  • Participants
  • Parent commits f19e309
  • Branches utf8-unicode2

Comments (0)

Files changed (8)

File pypy/interpreter/test/test_utf8.py

     if sys.maxunicode < 65536:
         assert l[:3] == [u'A', u'\u010F', u'\u20AC']
     else:
-        assert l == [u'A', u'\u010F', u'\u20AC', u'\U00001F63D']
+        assert l == [u'A', u'\u010F', u'\u20AC', u'\U0001F63D']
 
 def test_reverse_iterator():
     s = build_utf8str()
 
 def test_copy_to_wcharp():
     s = build_utf8str()
-    if sys.maxunicode < 0x10000:
+    if sys.maxunicode < 0x10000 and rffi.sizeof(rffi.WCHAR_T) == 4:
         # The last character requires a surrogate pair on narrow builds and
         # so won't be converted correctly by rffi.wcharp2unicode
         s = s[:-1]
     u = rffi.wcharp2unicode(wcharp)
     rffi.free_wcharp(wcharp)
     assert s == u
+
+def test_from_wcharp():
+    # Round-trip each sample through a raw wchar_t buffer and check that
+    # Utf8Str.from_wcharp() decodes it back to an equal string.
+    # NOTE(review): the last two samples are the literal ASCII text
+    # "0xDCC0", not the lone surrogate U+DCC0. If a lone surrogate was the
+    # intended input, the literals would have to be u'\uDCC0 ' and
+    # u'\uDCC0' -- confirm with the author.
+    samples = (u'A\u010F\u20AC\U0001F63D', u'0xDCC0 ', u'0xDCC0')
+    for expected in samples:
+        raw = rffi.unicode2wcharp(expected)
+        decoded = Utf8Str.from_wcharp(raw)
+        # Release the buffer before asserting so a failure does not leak it.
+        rffi.free_wcharp(raw)
+        assert decoded == expected
+
+def test_from_wcharpn():
+    # Decode a bounded number of wchar_t units with Utf8Str.from_wcharpn().
+    # The buffer is released in a ``finally`` so that a failing assertion
+    # does not leave the raw allocation behind.
+    u = u'A\u010F\u20AC\U0001F63D'
+    wcharp = rffi.unicode2wcharp(u)
+    try:
+        # The first three units are all BMP characters.
+        s = Utf8Str.from_wcharpn(wcharp, 3)
+        assert s == u[:3]
+
+        s = Utf8Str.from_wcharpn(wcharp, 4)
+        if sys.maxunicode == 0xFFFF:
+            # Narrow build: the last character is a surrogate pair, so four
+            # units end after its first half.
+            assert s == u[:4]
+        else:
+            assert s == u
+    finally:
+        rffi.free_wcharp(wcharp)

File pypy/interpreter/utf8.py

 from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
 from rpython.rlib.rarithmetic import r_uint
 from rpython.rtyper.lltypesystem import rffi
+from rpython.rtyper.lltypesystem import lltype
+
+# Unsigned integer type (and matching pointer type) used to read and write
+# individual wchar_t units: 16-bit when sizeof(wchar_t) == 2, otherwise the
+# unsigned int types.
+if rffi.sizeof(rffi.WCHAR_T) == 2:
+    wchar_rint = rffi.r_ushort
+    WCHAR_INTP = rffi.USHORTP
+else:
+    wchar_rint = rffi.r_uint
+    WCHAR_INTP = rffi.UINTP
+
 
 def utf8chr(value):
     # Like unichr, but returns a Utf8Str object
             byte_pos -= 1
         return byte_pos
 
-    def copy_to_wcharp(self):
-        # XXX Temporary solution. This won't work on correctly on systems
-        #     where sizeof(wchar_t) == 2. Also, it copies twice.
-        from pypy.interpreter.utf8_codecs import unicode_encode_unicode_internal
-        from rpython.rlib.runicode import MAXUNICODE
-        bytes = unicode_encode_unicode_internal(self, len(self), 'strict')
-        return rffi.cast(rffi.CWCHARP, rffi.str2charp(bytes))
+    def copy_to_wcharp(self, track_allocation=True):
+        # Copy this string into a freshly raw-malloc'ed, zero-terminated
+        # array of wchar_t units and return it cast to rffi.CWCHARP.
+        # ``track_allocation`` is forwarded unchanged to lltype.malloc().
+        # When sizeof(wchar_t) == 2, codepoints above 0xFFFF are written as
+        # a surrogate pair and therefore occupy two units.
+
+        # One unit per codepoint plus the terminator ...
+        n_units = len(self) + 1
+        if rffi.sizeof(rffi.WCHAR_T) == 2:
+            # ... plus one extra unit for every codepoint needing a pair.
+            for codepoint in self.codepoint_iter():
+                if codepoint > 0xFFFF:
+                    n_units += 1
+
+        buf = lltype.malloc(WCHAR_INTP.TO, n_units, flavor='raw',
+                            track_allocation=track_allocation)
+        from pypy.interpreter.utf8_codecs import create_surrogate_pair
+
+        pos = 0
+        for codepoint in self.codepoint_iter():
+            if rffi.sizeof(rffi.WCHAR_T) == 2:
+                first, second = create_surrogate_pair(codepoint)
+                buf[pos] = wchar_rint(first)
+                if second:
+                    # A second half is only present for non-BMP codepoints.
+                    pos += 1
+                    buf[pos] = wchar_rint(second)
+            else:
+                buf[pos] = wchar_rint(codepoint)
+
+            pos += 1
+
+        # Terminating zero unit.
+        buf[pos] = wchar_rint(0)
+        return rffi.cast(rffi.CWCHARP, buf)
+
+    @staticmethod
+    def from_wcharp(wcharp):
+        # Build a string from a zero-terminated array of wchar_t units.
+        # When sizeof(wchar_t) == 2 a high surrogate immediately followed by
+        # a low surrogate is combined into one codepoint; any other
+        # surrogate is appended as-is.
+        array = rffi.cast(WCHAR_INTP, wcharp)
+        builder = Utf8Builder()
+        i = 0
+        while True:
+            c = int(array[i])
+            if c == 0:
+                break
+            i += 1
+
+            if rffi.sizeof(rffi.WCHAR_T) == 2 and 0xD800 <= c <= 0xDBFF:
+                # Only peek at the next unit (at worst it is the
+                # terminator).  It is consumed here solely when it is a low
+                # surrogate; otherwise the next iteration handles it, so a
+                # high surrogate following an unpaired one can still pair
+                # with the unit after it.
+                c2 = int(array[i])
+                if 0xDC00 <= c2 <= 0xDFFF:
+                    c = (((c & 0x3FF) << 10) | (c2 & 0x3FF)) + 0x10000
+                    i += 1
+
+            builder.append(c)
+
+        return builder.build()
+
+    @staticmethod
+    def from_wcharpn(wcharp, size):
+        # Build a string from at most ``size`` wchar_t units, stopping early
+        # at a zero unit.  When sizeof(wchar_t) == 2 a high surrogate
+        # immediately followed (within ``size``) by a low surrogate is
+        # combined into one codepoint; any other surrogate is appended
+        # as-is.
+        array = rffi.cast(WCHAR_INTP, wcharp)
+        builder = Utf8Builder()
+        i = 0
+        while i < size:
+            c = int(array[i])
+            if c == 0:
+                break
+            i += 1
+
+            if (rffi.sizeof(rffi.WCHAR_T) == 2 and i < size and
+                    0xD800 <= c <= 0xDBFF):
+                # Only peek at the next unit.  It is consumed here solely
+                # when it is a low surrogate; otherwise the next iteration
+                # handles it (including the zero-unit stop), so a high
+                # surrogate following an unpaired one can still pair with
+                # the unit after it.
+                c2 = int(array[i])
+                if 0xDC00 <= c2 <= 0xDFFF:
+                    c = (((c & 0x3FF) << 10) | (c2 & 0x3FF)) + 0x10000
+                    i += 1
+
+            builder.append(c)
+
+        return builder.build()
 
 class Utf8Builder(object):
     @specialize.argtype(1)

File pypy/interpreter/utf8_codecs.py

             result.append(r)
     return result.build(), pos, bo
 
+def create_surrogate_pair(val):
+    # Split a codepoint into UTF-16 code units.  Returns a 2-tuple: for
+    # values below 0x10000 it is (val, 0); otherwise it is the
+    # (high surrogate, low surrogate) pair encoding val.
+    if val < 0x10000:
+        return val, 0
+
+    offset = val - 0x10000
+    high = 0xD800 | (offset >> 10)     # upper bits of the offset
+    low = 0xDC00 | (offset & 0x3FF)    # lower 10 bits of the offset
+    return (high, low)
+
 def unicode_encode_utf_16_helper(s, size, errors,
                                  errorhandler=None,
                                  byteorder='little'):
     while i < size:
         ch = utf8ord(s, i)
         i += 1
-        ch2 = 0
-        if ch >= 0x10000:
-            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF)
-            ch  = 0xD800 | ((ch-0x10000) >> 10)
+        ch, ch2 = create_surrogate_pair(ch)
 
         _STORECHAR(result, ch, byteorder)
         if ch2:

File pypy/module/_rawffi/alt/interp_funcptr.py

         self.argchain.arg(addr)
 
     def handle_unichar_p(self, w_ffitype, w_obj, unicodeval):
-        buf = rffi.unicode2wcharp(unicodeval)
+        buf = unicodeval.copy_to_wcharp()
         self.w_func.to_free.append(rffi.cast(rffi.VOIDP, buf))
         addr = rffi.cast(rffi.ULONG, buf)
         self.argchain.arg(addr)

File pypy/module/_rawffi/alt/test/test_type_converter.py

 import sys
 from rpython.rlib.rarithmetic import r_uint, r_singlefloat, r_longlong, r_ulonglong
 from rpython.rlib.libffi import IS_32_BIT
+from pypy.interpreter.utf8 import Utf8Str
 from pypy.module._rawffi.alt.interp_ffitype import app_types, descr_new_pointer
 from pypy.module._rawffi.alt.type_converter import FromAppLevelConverter, ToAppLevelConverter
 
     def test_char(self):
         space = self.space
         self.check(app_types.char, space.wrap('a'), ord('a'))
-        self.check(app_types.unichar, space.wrap(u'\u1234'), 0x1234)
+        self.check(app_types.unichar,
+                   space.wrap(Utf8Str.from_unicode(u'\u1234')), 0x1234)
 
     def test_signed_longlong(self):
         space = self.space
     def test_strings(self):
         # first, try automatic conversion from applevel
         self.check(app_types.char_p, self.space.wrap('foo'), 'foo')
-        self.check(app_types.unichar_p, self.space.wrap(u'foo\u1234'), u'foo\u1234')    
-        self.check(app_types.unichar_p, self.space.wrap('foo'), u'foo')    
+        self.check(app_types.unichar_p,
+                   self.space.wrap(Utf8Str.from_unicode(u'foo\u1234')),
+                   Utf8Str.from_unicode(u'foo\u1234'))
+        self.check(app_types.unichar_p, self.space.wrap('foo'),
+                   Utf8Str.from_unicode(u'foo'))
         # then, try to pass explicit pointers
         self.check(app_types.char_p, self.space.wrap(42), 42)
         self.check(app_types.unichar_p, self.space.wrap(42), 42)        

File pypy/module/_rawffi/alt/type_converter.py

 from rpython.rlib import jit
 from rpython.rlib.rarithmetic import r_uint
 from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.utf8 import utf8chr
 from pypy.module._rawffi.structure import W_StructureInstance, W_Structure
 from pypy.module._rawffi.alt.interp_ffitype import app_types
 
             return space.wrap(chr(ucharval))
         elif w_ffitype.is_unichar():
             wcharval = self.get_unichar(w_ffitype)
-            return space.wrap(unichr(wcharval))
+            return space.wrap(utf8chr(int(wcharval)))
         elif w_ffitype.is_double():
             return self._float(w_ffitype)
         elif w_ffitype.is_singlefloat():

File pypy/module/_rawffi/array.py

         if not space.is_none(w_items):
             items_w = space.unpackiterable(w_items)
             iterlength = len(items_w)
-            if iterlength > length:
+
+            double_length_items = 0
+            if rffi.sizeof(rffi.WCHAR_T) == 2:
+                # On systems where sizeof(wchar_t) = 2, the resulting array
+                # needs to be encoded in utf-16. As a result, codepoints larger
+                # than 0xFFFF will occupy two array values
+                for w_i in items_w:
+                    if space.isinstance_w(w_i, space.w_unicode):
+                        u = space.unicode_w(w_i)
+                        # Only a one-character string can be a wide
+                        # character item; an empty string has no codepoint
+                        # to inspect.
+                        if len(u) == 1 and utf8ord(u) > 0xFFFF:
+                            double_length_items += 1
+
+            if iterlength + double_length_items > length:
                 raise OperationError(space.w_ValueError,
                                      space.wrap("too many items for specified"
                                                 " array length"))
-            for num in range(iterlength):
-                w_item = items_w[num]
-                unwrap_value(space, write_ptr, result.ll_buffer, num,
-                             self.itemcode, w_item)
+            i = 0
+            for w_item in items_w:
+                i += unwrap_value(space, write_ptr, result.ll_buffer, i,
+                                  self.itemcode, w_item)
+
         return space.wrap(result)
 
     def descr_repr(self, space):

File pypy/module/_rawffi/interp_rawffi.py

 from pypy.interpreter.error import OperationError, oefmt, wrap_oserror
 from pypy.interpreter.gateway import interp2app, unwrap_spec
 from pypy.interpreter.typedef import TypeDef, GetSetProperty
+from pypy.interpreter.utf8 import (
+    Utf8Str, utf8ord, utf8chr, WCHAR_INTP, wchar_rint)
+from pypy.interpreter.utf8_codecs import create_surrogate_pair
 
 from rpython.rlib.clibffi import *
 from rpython.rtyper.lltypesystem import lltype, rffi
     LL_TYPEMAP['X'] = rffi.CCHARP
     LL_TYPEMAP['v'] = rffi.SHORT
 
+
 def letter2tp(space, key):
     from pypy.module._rawffi.array import PRIMITIVE_ARRAY_TYPES
     try:
                 ptr_val = t_array[0]
                 return ptr_val
     else:
+        if T is rffi.CWCHARP:
+            return utf8chr(int(rffi.cast(WCHAR_INTP, ptr)[ofs]))
         return rffi.cast(T, ptr)[ofs]
 read_ptr._annspecialcase_ = 'specialize:arg(2)'
 
         else:
             ptr = unwrap_truncate_int(rffi.VOIDP, space, w_arg)
         push_func(add_arg, argdesc, ptr)
+        return 1
     elif letter == "d":
         push_func(add_arg, argdesc, space.float_w(w_arg))
+        return 1
     elif letter == "f":
         push_func(add_arg, argdesc, rffi.cast(rffi.FLOAT,
                                               space.float_w(w_arg)))
+        return 1
     elif letter == "g":
         push_func(add_arg, argdesc, rffi.cast(rffi.LONGDOUBLE,
                                               space.float_w(w_arg)))
+        return 1
     elif letter == "c":
         s = space.str_w(w_arg)
         if len(s) != 1:
                 "Expected string of length one as character"))
         val = s[0]
         push_func(add_arg, argdesc, val)
+        return 1
     elif letter == 'u':
         s = space.unicode_w(w_arg)
         if len(s) != 1:
             raise OperationError(space.w_TypeError, w(
                 "Expected unicode string of length one as wide character"))
-        val = s[0]
-        push_func(add_arg, argdesc, val)
+
+        val = utf8ord(s)
+        if rffi.sizeof(rffi.WCHAR_T) == 2 and val > 0xFFFF:
+            # Utf-16 must be used on systems with a 2 byte wchar_t to
+            # encode codepoints > 0xFFFF
+            c1, c2 = create_surrogate_pair(val)
+            push_func(add_arg, argdesc, wchar_rint(c1))
+            push_func(add_arg, argdesc+1, wchar_rint(c2))
+            return 2
+        else:
+            push_func(add_arg, argdesc, wchar_rint(val))
+            return 1
     else:
         for c in unroll_letters_for_numbers:
             if letter == c:
                 TP = LL_TYPEMAP[c]
                 val = unwrap_truncate_int(TP, space, w_arg)
                 push_func(add_arg, argdesc, val)
-                return
+                return 1
         else:
             raise OperationError(space.w_TypeError,
                                  space.wrap("cannot directly write value"))
         return space.w_None
     wcharp_addr = rffi.cast(rffi.CWCHARP, address)
     if maxlength == -1:
-        s = rffi.wcharp2unicode(wcharp_addr)
+        s = Utf8Str.from_wcharp(wcharp_addr)
     else:
-        s = rffi.wcharp2unicoden(wcharp_addr, maxlength)
+        s = Utf8Str.from_wcharpn(wcharp_addr, maxlength)
     return space.wrap(s)
 
 @unwrap_spec(address=r_uint, maxlength=int)