Commits

Amaury Forgeot d'Arc committed 7fc6072

Consider utf16 surrogates when encoding to raw_unicode_escape,
like the unicode_escape, but in both case this must be done
only in narrow unicode build!

Comments (0)

Files changed (2)

pypy/rlib/runicode.py

             pos += 1
             continue
 
-        if 0xD800 <= oc < 0xDC00 and pos + 1 < size:
+        if MAXUNICODE < 65536 and 0xD800 <= oc < 0xDC00 and pos + 1 < size:
             # Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes
             pos += 1
             oc2 = ord(s[pos])
     pos = 0
     while pos < size:
         oc = ord(s[pos])
+
+        if MAXUNICODE < 65536 and 0xD800 <= oc < 0xDC00 and pos + 1 < size:
+            # Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes
+            pos += 1
+            oc2 = ord(s[pos])
+
+            if 0xDC00 <= oc2 <= 0xDFFF:
+                ucs = (((oc & 0x03FF) << 10) | (oc2 & 0x03FF)) + 0x00010000
+                raw_unicode_escape_helper(result, ucs)
+                pos += 1
+                continue
+            # Fall through: isolated surrogates are copied as-is
+            pos -= 1
+
         if oc < 0x100:
             result.append(chr(oc))
         else:

pypy/rlib/test/test_runicode.py

 
         res = interpret(f, [0x10140])
         assert res == 0x10140
+
+    def test_encode_surrogate_pair(self):
+        u = runicode.UNICHR(0xD800) + runicode.UNICHR(0xDC00)
+        if runicode.MAXUNICODE < 65536:
+            # Narrow unicode build, consider utf16 surrogate pairs
+            assert runicode.unicode_encode_unicode_escape(
+                u, len(u), True) == r'\U00010000'
+            assert runicode.unicode_encode_raw_unicode_escape(
+                u, len(u), True) == r'\U00010000'
+        else:
+            # Wide unicode build, don't merge utf16 surrogate pairs
+            assert runicode.unicode_encode_unicode_escape(
+                u, len(u), True) == r'\ud800\udc00'
+            assert runicode.unicode_encode_raw_unicode_escape(
+                u, len(u), True) == r'\ud800\udc00'