Commits

Anonymous committed b49adef

Fixed bug in UTF-7 decoder (surrogate pairs)
Also added a test to test_codecs.py to exercise decoding pairs.

  • Participants
  • Parent commits c0072e7

Comments (0)

Files changed (2)

Lib/test/test_codecs.py

             ]
         )
 
+    # Jython extra (test supplementary characters)
+    @unittest.skipIf(not test_support.is_jython, "Jython supports surrogate pairs")
+    def test_partial_supp(self):
+        # Check the encoding is what we think it is
+        ustr = u"x\U00023456.\u0177\U00023456\u017az"
+        bstr = b'x+2E3cVg.+AXfYTdxWAXo-z'
+        self.assertEqual(ustr.encode(self.encoding), bstr)
+
+        self.check_partial(
+            ustr,
+            [
+                u"x",
+                u"x",   # '+' added: begins Base64
+                u"x",
+                u"x",
+                u"x",
+                u"x",
+                u"x",
+                u"x",
+                u"x\U00023456.",    # '.' added: ends Base64
+                u"x\U00023456.",    # '+' added: begins Base64
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.",
+                u"x\U00023456.\u0177\U00023456\u017a",  # '-' added: ends Base64
+                u"x\U00023456.\u0177\U00023456\u017az",
+            ]
+        )
+
 class UTF16ExTest(unittest.TestCase):
 
     def test_errors(self):

src/org/python/core/codecs.java

      * Decode (perhaps partially) a sequence of bytes representing the UTF-7 encoded form of a
      * Unicode string and return (the Jython internal representation of) the unicode object, and
      * amount of input consumed. The only state we preserve is our read position, i.e. how many
-     * characters we have consumed. So if the input ends part way through a Base64 sequence the data
-     * reported as consumed is only that up to and not including the Base64 start marker ('+').
+     * bytes we have consumed. So if the input ends part way through a Base64 sequence the data
+     * reported as consumed is just that up to and not including the Base64 start marker ('+').
      * Performance will be poor (quadratic cost) on runs of Base64 data long enough to exceed the
-     * input quantum in incremental decoding. The retruned Java String is a UTF-16 representation of
+     * input quantum in incremental decoding. The returned Java String is a UTF-16 representation of
      * the Unicode result, in line with Java conventions. Unicode characters above the BMP are
      * represented as surrogate pairs.
      *
                 if ((unit & 0x0400) == 0) {
                     // This is a lead surrogate as expected ... get the trail surrogate.
                     int unit2 = (int)(buffer >>> (n - 32));
-                    if ((unit2 & 0xFC00) == 0xD800) {
+                    if ((unit2 & 0xFC00) == 0xDC00) {
                         // And this is the trail surrogate we expected
                         v.appendCodePoint(0x10000 + ((unit & 0x3ff) << 10) + (unit2 & 0x3ff));
                         n -= 32;
                 if ((unit & 0x0400) == 0) {
                     // This is a lead surrogate, which is valid: check the next 16 bits.
                     int unit2 = ((int)(buffer >>> (n - 32))) & 0xffff;
-                    if ((unit2 & 0xFC00) == 0xD800) {
+                    if ((unit2 & 0xFC00) == 0xDC00) {
+                        // Hmm ... why was I called?
+                        return UTF7Error.NONE;
+                    } else {
                         // Not trail surrogate: that's the problem
                         return UTF7Error.MISSING;
-                    } else {
-                        // Hmm ... why was I called?
-                        return UTF7Error.NONE;
                     }
 
                 } else {
      * PyString.)
      *
      * This method differs from the CPython equivalent (in <code>Objects/unicodeobject.c</code>)
-     * which works with an array of point codes that are, in a wide build, Unicode code points.
+     * which works with an array of code points that are, in a wide build, Unicode code points.
      *
      * @param unicode
      * @param base64SetO
                  * representation.
                  */
                 // XXX see issue #2002: we should only count surrogate pairs as one character
-                // if ((ch & 0xFC00)==0xC800) { count++; }
+                // if ((ch & 0xFC00)==0xD800) { count++; }
 
                 if (base64bits > 48) {
                     // No room for the next 16 bits: emit all we have
     }
 
     /**
-     * Handler errors encountered during decoding, adjusting the output buffer contents and
-     * returning the correct position to resume decoding (if the handler does not siomply raise an
+     * Handler for errors encountered during decoding, adjusting the output buffer contents and
+     * returning the correct position to resume decoding (if the handler does not simply raise an
      * exception).
      *
      * @param partialDecode output buffer of unicode (as UTF-16) that the codec is building
      * Invoke a user-defined error-handling mechanism, for errors encountered during decoding, as
      * registered through {@link #register_error(String, PyObject)}. The return value is the return
      * from the error handler indicating the replacement codec output and the position at which
-     * to resume decoding. invokes the mechanism described in PEP-293.
+     * to resume decoding. Invokes the mechanism described in PEP-293.
      *
      * @param errors name of the error policy (or null meaning "strict")
      * @param encoding name of encoding that encountered the error