Commits

Philip Jenvey committed 1b48d48

cpython issue3297: fix parsing of surrogates w/ wide builds

  • Participants
  • Parent commits 0dacceb
  • Branches py3k

Comments (0)

Files changed (2)

File pypy/interpreter/pyparser/parsestring.py

+# coding: utf-8
 from pypy.interpreter.error import OperationError
 from pypy.interpreter import unicodehelper
 from rpython.rlib.rstring import StringBuilder
             # latin-1, so multibyte sequences must be escaped.
             lis = [] # using a list to assemble the value
             end = q
-            # Worst case: "\XX" may become "\u005c\uHHLL" (12 bytes)
+            # Worst case:
+            # "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
+            # "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes),
+            # or ~1:6
             while ps < end:
                 if s[ps] == '\\':
                     lis.append(s[ps])
                         # instead.
                         lis.append("u005c")
                 if ord(s[ps]) & 0x80: # XXX inefficient
-                    w, ps = decode_utf8(space, s, ps, end, "utf-16-be")
+                    w, ps = decode_utf8(space, s, ps, end, "utf-32-be")
                     rn = len(w)
-                    assert rn % 2 == 0
-                    for i in range(0, rn, 2):
-                        lis.append('\\u')
+                    assert rn % 4 == 0
+                    for i in range(0, rn, 4):
+                        lis.append('\\U')
                         lis.append(hexbyte(ord(w[i])))
                         lis.append(hexbyte(ord(w[i+1])))
+                        lis.append(hexbyte(ord(w[i+2])))
+                        lis.append(hexbyte(ord(w[i+3])))
                 else:
                     lis.append(s[ps])
                     ps += 1

File pypy/interpreter/test/test_exec.py

         x = ns['x']
         assert len(x) == 6
         assert ord(x[0]) == 0x0439
+
+    def test_issue3297(self):
+        c = compile("a, b = '\U0001010F', '\\U0001010F'", "dummy", "exec")
+        d = {}
+        exec(c, d)
+        assert d['a'] == d['b']
+        assert len(d['a']) == len(d['b'])
+        assert ascii(d['a']) == ascii(d['b'])