Commits

Victor Stinner committed 0e53821 Draft

Convert OS codecs to a C file
***
oscodecs

Comments (0)

Files changed (7)

 		Objects/unicodeformat.o \
 		Objects/unicodelegacy.o \
 		Objects/unicodeobject.o \
+		Objects/unicodeoscodecs.o \
 		Objects/unicodectype.o \
 		Objects/weakrefobject.o
 
 
 Objects/bytearrayobject.o: $(srcdir)/Objects/bytearrayobject.c $(BYTESTR_DEPS)
 
-Objects/unicodecodecs.o: $(srcdir)/Objects/unicodecodecs.c $(srcdir)/Objects/unicodeoscodecs.h $(UNICODE_DEPS)
+Objects/unicodecodecs.o: $(srcdir)/Objects/unicodecodecs.c $(UNICODE_DEPS)
+Objects/unicodeoscodecs.o: $(srcdir)/Objects/unicodeoscodecs.c $(UNICODE_DEPS)
 Objects/unicodeformat.o: $(srcdir)/Objects/unicodeformat.c $(UNICODE_DEPS)
 Objects/unicodelegacy.o: $(srcdir)/Objects/unicodelegacy.c $(UNICODE_DEPS)
 Objects/unicodeobject.o: $(srcdir)/Objects/unicodemethods.h $(srcdir)/Objects/unicodeobject.c $(UNICODE_DEPS)

Objects/stringlib/codecs.h

             Py_ssize_t newpos;
             Py_ssize_t repsize, k, startpos;
             startpos = i-1;
-            rep = unicode_encode_call_errorhandler(
+            rep = _PyUnicode_EncodeCallErrorHandler(
                   errors, &errorHandler, "utf-8", "surrogates not allowed",
                   unicode, &exc, startpos, startpos+1, &newpos);
             if (!rep)
                 for(k=0; k<repsize; k++) {
                     Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
                     if (0x80 <= c) {
-                        raise_encode_exception(&exc, "utf-8",
-                                               unicode,
-                                               i-1, i,
-                                               "surrogates not allowed");
+                        _PyUnicode_RaiseEncodeException(&exc,
+                            "utf-8",
+                            unicode,
+                            i-1, i,
+                            "surrogates not allowed");
                         goto error;
                     }
                     *p++ = (char)c;

Objects/unicodecodecs.c

 #include "ucnhash.h"
 #include "unicodeimpl.h"
 
-static PyObject *
-unicode_encode_call_errorhandler(const char *errors,
-       PyObject **errorHandler,const char *encoding, const char *reason,
-       PyObject *unicode, PyObject **exceptionObject,
-       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
-
-static void
-raise_encode_exception(PyObject **exceptionObject,
-                       const char *encoding,
-                       PyObject *unicode,
-                       Py_ssize_t startpos, Py_ssize_t endpos,
-                       const char *reason);
-
 static int
 unicode_widen(PyObject **p_unicode, Py_ssize_t length,
                  unsigned int maxchar)
    return 0 on success, -1 on error
 */
 
-static int
-unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
-                                 const char *encoding, const char *reason,
-                                 const char **input, const char **inend, Py_ssize_t *startinpos,
-                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
-                                 PyObject **output, Py_ssize_t *outpos)
+int
+_PyUnicode_DecodeCallErrorHandler(
+    const char *errors, PyObject **errorHandler,
+    const char *encoding, const char *reason,
+    const char **input, const char **inend, Py_ssize_t *startinpos,
+    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
+    PyObject **output, Py_ssize_t *outpos)
 {
     static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
 
         continue;
 utf7Error:
         endinpos = s-starts;
-        if (unicode_decode_call_errorhandler(
+        if (_PyUnicode_DecodeCallErrorHandler(
                 errors, &errorHandler,
                 "utf7", errmsg,
                 &starts, &e, &startinpos, &endinpos, &exc, &s,
                 (base64bits >= 6) ||
                 (base64bits > 0 && base64buffer != 0)) {
             endinpos = size;
-            if (unicode_decode_call_errorhandler(
+            if (_PyUnicode_DecodeCallErrorHandler(
                     errors, &errorHandler,
                     "utf7", "unterminated shift sequence",
                     &starts, &e, &startinpos, &endinpos, &exc, &s,
             continue;
         }
 
-        if (unicode_decode_call_errorhandler(
+        if (_PyUnicode_DecodeCallErrorHandler(
                 errors, &errorHandler,
                 "utf-8", errmsg,
                 &starts, &end, &startinpos, &endinpos, &exc, &s,
         q += 4;
         continue;
       utf32Error:
-        if (unicode_decode_call_errorhandler(
+        if (_PyUnicode_DecodeCallErrorHandler(
                 errors, &errorHandler,
                 "utf32", errmsg,
                 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
             continue;
         }
 
-        if (unicode_decode_call_errorhandler(
+        if (_PyUnicode_DecodeCallErrorHandler(
                 errors,
                 &errorHandler,
                 "utf16", errmsg,
             chr = 0;
             if (s+digits>end) {
                 endinpos = size;
-                if (unicode_decode_call_errorhandler(
+                if (_PyUnicode_DecodeCallErrorHandler(
                         errors, &errorHandler,
                         "unicodeescape", "end of string in escape sequence",
                         &starts, &end, &startinpos, &endinpos, &exc, &s,
                 c = (unsigned char) s[j];
                 if (!Py_ISXDIGIT(c)) {
                     endinpos = (s+j+1)-starts;
-                    if (unicode_decode_call_errorhandler(
+                    if (_PyUnicode_DecodeCallErrorHandler(
                             errors, &errorHandler,
                             "unicodeescape", message,
                             &starts, &end, &startinpos, &endinpos, &exc, &s,
                 WRITECHAR(chr);
             } else {
                 endinpos = s-starts;
-                if (unicode_decode_call_errorhandler(
+                if (_PyUnicode_DecodeCallErrorHandler(
                         errors, &errorHandler,
                         "unicodeescape", "illegal Unicode character",
                         &starts, &end, &startinpos, &endinpos, &exc, &s,
                 }
             }
             endinpos = s-starts;
-            if (unicode_decode_call_errorhandler(
+            if (_PyUnicode_DecodeCallErrorHandler(
                     errors, &errorHandler,
                     "unicodeescape", message,
                     &starts, &end, &startinpos, &endinpos, &exc, &s,
                 message = "\\ at end of string";
                 s--;
                 endinpos = s-starts;
-                if (unicode_decode_call_errorhandler(
+                if (_PyUnicode_DecodeCallErrorHandler(
                         errors, &errorHandler,
                         "unicodeescape", message,
                         &starts, &end, &startinpos, &endinpos, &exc, &s,
             c = (unsigned char)*s;
             if (!Py_ISXDIGIT(c)) {
                 endinpos = s-starts;
-                if (unicode_decode_call_errorhandler(
+                if (_PyUnicode_DecodeCallErrorHandler(
                         errors, &errorHandler,
                         "rawunicodeescape", "truncated \\uXXXX",
                         &starts, &end, &startinpos, &endinpos, &exc, &s,
                 goto onError;
         } else {
             endinpos = s-starts;
-            if (unicode_decode_call_errorhandler(
+            if (_PyUnicode_DecodeCallErrorHandler(
                     errors, &errorHandler,
                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
                     &starts, &end, &startinpos, &endinpos, &exc, &s,
                 endinpos = s - starts + Py_UNICODE_SIZE;
                 reason = "illegal code point (> 0x10FFFF)";
             }
-            if (unicode_decode_call_errorhandler(
+            if (_PyUnicode_DecodeCallErrorHandler(
                     errors, &errorHandler,
                     "unicode_internal", reason,
                     &starts, &end, &startinpos, &endinpos, &exc, &s,
     }
 }
 
-/* raises a UnicodeEncodeError */
-static void
-raise_encode_exception(PyObject **exceptionObject,
+void
+_PyUnicode_RaiseEncodeException(PyObject **exceptionObject,
                        const char *encoding,
                        PyObject *unicode,
                        Py_ssize_t startpos, Py_ssize_t endpos,
    build arguments, call the callback and check the arguments,
    put the result into newpos and return the replacement string, which
    has to be freed by the caller */
-static PyObject *
-unicode_encode_call_errorhandler(const char *errors,
-                                 PyObject **errorHandler,
-                                 const char *encoding, const char *reason,
-                                 PyObject *unicode, PyObject **exceptionObject,
-                                 Py_ssize_t startpos, Py_ssize_t endpos,
-                                 Py_ssize_t *newpos)
+PyObject *
+_PyUnicode_EncodeCallErrorHandler(
+    const char *errors,
+    PyObject **errorHandler,
+    const char *encoding, const char *reason,
+    PyObject *unicode, PyObject **exceptionObject,
+    Py_ssize_t startpos, Py_ssize_t endpos,
+    Py_ssize_t *newpos)
 {
     static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
     Py_ssize_t len;
             }
             switch (known_errorHandler) {
             case 1: /* strict */
-                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
+                _PyUnicode_RaiseEncodeException(&exc, encoding, unicode, collstart, collend, reason);
                 goto onError;
             case 2: /* replace */
                 while (collstart++<collend)
                 pos = collend;
                 break;
             default:
-                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
-                                                              encoding, reason, unicode, &exc,
-                                                              collstart, collend, &newpos);
+                repunicode = _PyUnicode_EncodeCallErrorHandler(errors, &errorHandler,
+                                                               encoding, reason, unicode, &exc,
+                                                               collstart, collend, &newpos);
                 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
                                            PyUnicode_READY(repunicode) == -1))
                     goto onError;
                 for (i = 0; repsize-->0; ++i, ++str) {
                     c = PyUnicode_READ_CHAR(repunicode, i);
                     if (c >= limit) {
-                        raise_encode_exception(&exc, encoding, unicode,
+                        _PyUnicode_RaiseEncodeException(&exc, encoding, unicode,
                                                pos, pos+1, reason);
                         Py_DECREF(repunicode);
                         goto onError;
         else {
             startinpos = s-starts;
             endinpos = startinpos + 1;
-            if (unicode_decode_call_errorhandler(
+            if (_PyUnicode_DecodeCallErrorHandler(
                     errors, &errorHandler,
                     "ascii", "ordinal not in range(128)",
                     &starts, &e, &startinpos, &endinpos, &exc, &s,
                 /* undefined mapping */
                 startinpos = s-starts;
                 endinpos = startinpos+1;
-                if (unicode_decode_call_errorhandler(
+                if (_PyUnicode_DecodeCallErrorHandler(
                         errors, &errorHandler,
                         "charmap", "character maps to <undefined>",
                         &starts, &e, &startinpos, &endinpos, &exc, &s,
                 /* undefined mapping */
                 startinpos = s-starts;
                 endinpos = startinpos+1;
-                if (unicode_decode_call_errorhandler(
+                if (_PyUnicode_DecodeCallErrorHandler(
                         errors, &errorHandler,
                         "charmap", "character maps to <undefined>",
                         &starts, &e, &startinpos, &endinpos, &exc, &s,
     }
     switch (*known_errorHandler) {
     case 1: /* strict */
-        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
+        _PyUnicode_RaiseEncodeException(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
         return -1;
     case 2: /* replace */
         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
                 return -1;
             }
             else if (x==enc_FAILED) {
-                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
+                _PyUnicode_RaiseEncodeException(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                 return -1;
             }
         }
                 if (x==enc_EXCEPTION)
                     return -1;
                 else if (x==enc_FAILED) {
-                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
+                    _PyUnicode_RaiseEncodeException(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                     return -1;
                 }
             }
         *inpos = collendpos;
         break;
     default:
-        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
-                                                      encoding, reason, unicode, exceptionObject,
-                                                      collstartpos, collendpos, &newpos);
+        repunicode = _PyUnicode_EncodeCallErrorHandler(errors, errorHandler,
+                                                       encoding, reason, unicode, exceptionObject,
+                                                       collstartpos, collendpos, &newpos);
         if (repunicode == NULL)
             return -1;
         if (PyBytes_Check(repunicode)) {
             }
             else if (x==enc_FAILED) {
                 Py_DECREF(repunicode);
-                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
+                _PyUnicode_RaiseEncodeException(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                 return -1;
             }
         }
 
         startpos = i;
         exc = NULL;
-        raise_encode_exception(&exc, "decimal", unicode,
+        _PyUnicode_RaiseEncodeException(&exc, "decimal", unicode,
                                startpos, startpos+1,
                                "invalid decimal Unicode string");
         Py_XDECREF(exc);
     return 0;
 }
 
-/* Operating system codecs: PyUnicode_EncodeFSDefault(),
-   PyUnicode_DecodeFSDefault(), PyUnicode_DecodeMBCSStateful(),
-   PyUnicode_EncodeCodePage(), etc. */
-#include "unicodeoscodecs.h"
-
 int _PyUnicode_InitCodecs(void)
 {
     PyType_Ready(&EncodingMapType);

Objects/unicodeimpl.h

 
 PyAPI_FUNC(int) _PyUnicode_InitCodecs(void);
 
+PyAPI_FUNC(int)
+_PyUnicode_DecodeCallErrorHandler(
+    const char *errors, PyObject **errorHandler,
+    const char *encoding, const char *reason,
+    const char **input, const char **inend, Py_ssize_t *startinpos,
+    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
+    PyObject **output, Py_ssize_t *outpos);
+
+PyAPI_FUNC(PyObject *)
+_PyUnicode_EncodeCallErrorHandler(
+    const char *errors,
+    PyObject **errorHandler,const char *encoding, const char *reason,
+    PyObject *unicode, PyObject **exceptionObject,
+    Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
+
+/* raises a UnicodeEncodeError */
+PyAPI_FUNC(void)
+_PyUnicode_RaiseEncodeException(
+    PyObject **exceptionObject,
+    const char *encoding,
+    PyObject *unicode,
+    Py_ssize_t startpos, Py_ssize_t endpos,
+    const char *reason);
+
 #ifdef __cplusplus
 }
 #endif

Objects/unicodeoperators.h

 {
     int kind;
     void *data1, *data2;
-    Py_ssize_t len, i;
+    Py_ssize_t len;
     int cmp;
 
     /* a string is equal to itself */
             return NULL;
 
         if (op == Py_EQ || op == Py_NE) {
-            eq = unicode_compare_eq(left, right);
-            if (op Py_EQ)
+            int eq = unicode_compare_eq(left, right);
+            if (op == Py_EQ)
                 v = TEST_COND(eq);
             else
                 v = TEST_COND(!eq);

Objects/unicodeoscodecs.c

+/* Operating system codecs: PyUnicode_EncodeFSDefault(),
+   PyUnicode_DecodeFSDefault(), PyUnicode_DecodeMBCSStateful(),
+   PyUnicode_EncodeCodePage(), etc. */
+
+#include "Python.h"
+#include "unicodeimpl.h"
+
+#ifdef HAVE_MBCS
+static OSVERSIONINFOEX winver;
+#endif
+
+#ifdef HAVE_MBCS
+
+/* --- MBCS codecs for Windows -------------------------------------------- */
+
+#if SIZEOF_INT < SIZEOF_SIZE_T
+#define NEED_RETRY
+#endif
+
+#ifndef WC_ERR_INVALID_CHARS
+#  define WC_ERR_INVALID_CHARS 0x0080
+#endif
+
+static char*
+code_page_name(UINT code_page, PyObject **obj)
+{
+    *obj = NULL;
+    if (code_page == CP_ACP)
+        return "mbcs";
+    if (code_page == CP_UTF7)
+        return "CP_UTF7";
+    if (code_page == CP_UTF8)
+        return "CP_UTF8";
+
+    *obj = PyBytes_FromFormat("cp%u", code_page);
+    if (*obj == NULL)
+        return NULL;
+    return PyBytes_AS_STRING(*obj);
+}
+
+/* Tell whether the byte at s[offset] is a DBCS lead byte that begins a
+   character of its own in code_page, rather than being the second byte of
+   the character that precedes it.  Returns 1 if it is, 0 otherwise. */
+static int
+is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
+{
+    const char *const target = s + offset;
+    const char *before;
+
+    if (!IsDBCSLeadByteEx(code_page, *target))
+        return 0;
+
+    /* Locate the start of the character preceding the target byte.  The
+       target begins a character when there is no such predecessor, when the
+       predecessor is a complete two-byte character, or when the predecessor
+       is not itself a lead byte. */
+    before = CharPrevExA(code_page, s, target, 0);
+    /* FIXME: This code is limited to "true" double-byte encodings,
+       as it assumes an incomplete character consists of a single
+       byte. */
+    return (before == target
+            || target - before == 2
+            || !IsDBCSLeadByteEx(code_page, *before));
+}
+
+/* Select the flags value handed to MultiByteToWideChar() when decoding from
+   code_page. */
+static DWORD
+decode_code_page_flags(UINT code_page)
+{
+    /* The CP_UTF7 decoder only supports flags=0; every other code page is
+       decoded with MB_ERR_INVALID_CHARS. */
+    return (code_page == CP_UTF7) ? 0 : MB_ERR_INVALID_CHARS;
+}
+
+/*
+ * Strict-mode decoder: convert the insize bytes at in from a Windows code
+ * page with MultiByteToWideChar() and store the text in *v (the unicode
+ * object is created when *v is NULL, and extended otherwise).
+ *
+ * Returns insize when the input was decoded, -2 when Windows reports
+ * ERROR_NO_UNICODE_TRANSLATION (a decode error), or -1 on any other failure
+ * (a WindowsError is raised when the Windows call itself failed).
+ */
+static int
+decode_code_page_strict(UINT code_page,
+                        PyObject **v,
+                        const char *in,
+                        int insize)
+{
+    const DWORD flags = decode_code_page_flags(code_page);
+    wchar_t *dest;
+    DWORD nchars;
+
+    assert(insize > 0);
+
+    /* Sizing pass: ask how many wide characters the input decodes to */
+    nchars = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
+    if (nchars == 0)
+        goto error;
+
+    if (*v != NULL) {
+        /* Append after the text already stored in *v */
+        Py_ssize_t oldlen = PyUnicode_GET_SIZE(*v);
+        if (PyUnicode_Resize(v, oldlen + nchars) < 0)
+            return -1;
+        dest = PyUnicode_AS_UNICODE(*v) + oldlen;
+    }
+    else {
+        /* No object yet: create one of exactly the required length */
+        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
+        *v = (PyObject*)_PyUnicode_New(nchars);
+        if (*v == NULL)
+            return -1;
+        dest = PyUnicode_AS_UNICODE(*v);
+    }
+
+    /* Conversion pass: write the characters into the reserved space */
+    nchars = MultiByteToWideChar(code_page, flags, in, insize, dest, nchars);
+    if (nchars == 0)
+        goto error;
+    return insize;
+
+error:
+    if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
+        PyErr_SetFromWindowsErr(0);
+        return -1;
+    }
+    return -2;
+}
+
+/*
+ * Decode a byte string from a code page into unicode object with an error
+ * handler.
+ *
+ * The input is decoded one character at a time: for each position, 1 to 4
+ * input bytes are offered to MultiByteToWideChar() until it produces output.
+ * When no length works, the error handler named by errors is invoked through
+ * _PyUnicode_DecodeCallErrorHandler().  With errors == NULL or "strict" no
+ * decoding is attempted: a UnicodeDecodeError is raised immediately.
+ *
+ * The result is stored in *v (created when *v is NULL, appended to
+ * otherwise).
+ *
+ * Returns consumed size if succeed, or raise a WindowsError or
+ * UnicodeDecodeError exception and returns -1 on error.
+ */
+static int
+decode_code_page_errors(UINT code_page,
+                        PyObject **v,
+                        const char *in, const int size,
+                        const char *errors)
+{
+    const char *startin = in;
+    const char *endin = in + size;
+    const DWORD flags = decode_code_page_flags(code_page);
+    /* Ideally, we should get reason from FormatMessage. This is the Windows
+       2000 English version of the message. */
+    const char *reason = "No mapping for the Unicode character exists "
+                         "in the target code page.";
+    /* each step cannot decode more than 1 character, but a character can be
+       represented as a surrogate pair */
+    wchar_t buffer[2], *startout, *out;
+    int insize, outsize;
+    Py_ssize_t outlen;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+    PyObject *encoding_obj = NULL;
+    char *encoding;
+    DWORD err;
+    int ret = -1;
+
+    assert(size > 0);
+
+    encoding = code_page_name(code_page, &encoding_obj);
+    if (encoding == NULL)
+        return -1;
+
+    if (errors == NULL || strcmp(errors, "strict") == 0) {
+        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
+           UnicodeDecodeError. */
+        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
+        if (exc != NULL) {
+            PyCodec_StrictErrors(exc);
+            Py_CLEAR(exc);
+        }
+        goto error;
+    }
+
+    /* Reserve room for the worst case: Py_ARRAY_LENGTH(buffer) wide
+       characters per input byte.  The object is cut back to the real length
+       once decoding is done. */
+    if (*v == NULL) {
+        /* Create unicode object */
+        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
+            PyErr_NoMemory();
+            goto error;
+        }
+        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
+        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
+        if (*v == NULL)
+            goto error;
+        startout = PyUnicode_AS_UNICODE(*v);
+    }
+    else {
+        /* Extend unicode object */
+        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
+        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
+            PyErr_NoMemory();
+            goto error;
+        }
+        if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
+            goto error;
+        startout = PyUnicode_AS_UNICODE(*v) + n;
+    }
+
+    /* Decode the byte string character per character */
+    out = startout;
+    while (in < endin)
+    {
+        /* Decode a character: retry with one more input byte each time the
+           conversion fails with a translation/buffer error */
+        insize = 1;
+        do
+        {
+            outsize = MultiByteToWideChar(code_page, flags,
+                                          in, insize,
+                                          buffer, Py_ARRAY_LENGTH(buffer));
+            if (outsize > 0)
+                break;
+            err = GetLastError();
+            if (err != ERROR_NO_UNICODE_TRANSLATION
+                && err != ERROR_INSUFFICIENT_BUFFER)
+            {
+                PyErr_SetFromWindowsErr(0);
+                goto error;
+            }
+            insize++;
+        }
+        /* 4=maximum length of a UTF-8 sequence */
+        while (insize <= 4 && (in + insize) <= endin);
+
+        if (outsize <= 0) {
+            Py_ssize_t startinpos, endinpos, outpos;
+
+            /* No input length could be decoded: report a one-byte error at
+               the current position to the error handler */
+            startinpos = in - startin;
+            endinpos = startinpos + 1;
+            outpos = out - PyUnicode_AS_UNICODE(*v);
+            if (_PyUnicode_DecodeCallErrorHandler(
+                    errors, &errorHandler,
+                    encoding, reason,
+                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
+                    v, &outpos))
+            {
+                goto error;
+            }
+            /* The handler receives v and outpos by reference, so recompute
+               the write pointer from the current buffer */
+            out = PyUnicode_AS_UNICODE(*v) + outpos;
+        }
+        else {
+            in += insize;
+            memcpy(out, buffer, outsize * sizeof(wchar_t));
+            out += outsize;
+        }
+    }
+
+    /* write a NUL character at the end */
+    *out = 0;
+
+    /* Cut the over-allocated object back to what was actually written.  The
+       length is measured from the start of the current buffer, not from
+       startout: startout skips the characters that were already in *v when
+       appending, and it no longer points into the buffer if the error
+       handler replaced it. */
+    outlen = out - PyUnicode_AS_UNICODE(*v);
+    assert(outlen <= PyUnicode_WSTR_LENGTH(*v));
+    if (PyUnicode_Resize(v, outlen) < 0)
+        goto error;
+    ret = size;
+
+error:
+    Py_XDECREF(encoding_obj);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return ret;
+}
+
+/* Decode the size bytes at s from a Windows code page into a unicode object.
+
+   The input is first handed to decode_code_page_strict(); when that reports
+   a decode error (-2), the same chunk is decoded again with
+   decode_code_page_errors() using the errors handler.  When NEED_RETRY is
+   defined, inputs larger than INT_MAX are processed in several chunks.
+
+   If consumed is not NULL, it receives the number of bytes decoded, and a
+   trailing DBCS lead byte is left unconsumed instead of being decoded.
+
+   Returns the decoded string, or NULL on error.  A negative code_page raises
+   ValueError. */
+static PyObject *
+decode_code_page_stateful(int code_page,
+                          const char *s, Py_ssize_t size,
+                          const char *errors, Py_ssize_t *consumed)
+{
+    PyObject *v = NULL;
+    int chunk_size, final, converted, done;
+
+    if (code_page < 0) {
+        PyErr_SetString(PyExc_ValueError, "invalid code page number");
+        return NULL;
+    }
+
+    if (consumed)
+        *consumed = 0;
+
+    do
+    {
+#ifdef NEED_RETRY
+        if (size > INT_MAX) {
+            chunk_size = INT_MAX;
+            final = 0;
+            done = 0;
+        }
+        else
+#endif
+        {
+            chunk_size = (int)size;
+            final = (consumed == NULL);
+            done = 1;
+        }
+
+        /* Skip trailing lead-byte unless 'final' is set.  An empty chunk has
+           no last byte to inspect: probing s[chunk_size - 1] would read
+           before the start of the buffer. */
+        if (!final && chunk_size > 0
+            && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
+            --chunk_size;
+
+        /* Nothing left to decode: return what was built so far, or an empty
+           string if nothing was decoded at all */
+        if (chunk_size == 0 && done) {
+            if (v != NULL)
+                break;
+            return PyUnicode_New(0, 0);
+        }
+
+
+        converted = decode_code_page_strict(code_page, &v,
+                                            s, chunk_size);
+        if (converted == -2)
+            converted = decode_code_page_errors(code_page, &v,
+                                                s, chunk_size,
+                                                errors);
+        assert(converted != 0);
+
+        if (converted < 0) {
+            Py_XDECREF(v);
+            return NULL;
+        }
+
+        if (consumed)
+            *consumed += converted;
+
+        s += converted;
+        size -= converted;
+    } while (!done);
+
+    return _PyUnicode_Result(v);
+}
+
+/* Decode size bytes at s from the Windows code page code_page.  All
+   arguments are forwarded unchanged to decode_code_page_stateful(), whose
+   result is returned as is. */
+PyObject *
+PyUnicode_DecodeCodePageStateful(int code_page,
+                                 const char *s,
+                                 Py_ssize_t size,
+                                 const char *errors,
+                                 Py_ssize_t *consumed)
+{
+    PyObject *decoded;
+
+    decoded = decode_code_page_stateful(code_page, s, size, errors, consumed);
+    return decoded;
+}
+
+/* Decode size bytes at s with the "mbcs" codec: same as
+   decode_code_page_stateful() with the code page fixed to CP_ACP. */
+PyObject *
+PyUnicode_DecodeMBCSStateful(const char *s,
+                             Py_ssize_t size,
+                             const char *errors,
+                             Py_ssize_t *consumed)
+{
+    PyObject *decoded;
+
+    decoded = decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
+    return decoded;
+}
+
+/* Decode size bytes at s with the "mbcs" codec, without reporting how many
+   bytes were consumed: PyUnicode_DecodeMBCSStateful() is called with a NULL
+   consumed pointer. */
+PyObject *
+PyUnicode_DecodeMBCS(const char *s,
+                     Py_ssize_t size,
+                     const char *errors)
+{
+    PyObject *decoded;
+
+    decoded = PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
+    return decoded;
+}
+
+/* Select the flags value handed to WideCharToMultiByte() when encoding to
+   code_page with the given error handler name (errors may be NULL).
+   NOTE(review): relies on the file-level winver having been filled in; that
+   initialisation is not visible in this part of the file. */
+static DWORD
+encode_code_page_flags(UINT code_page, const char *errors)
+{
+    int is_replace;
+
+    /* CP_UTF7 only supports flags=0 */
+    if (code_page == CP_UTF7)
+        return 0;
+
+    if (code_page == CP_UTF8) {
+        /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista and later,
+           and only flags=0 on Windows older than Vista */
+        return (winver.dwMajorVersion >= 6) ? WC_ERR_INVALID_CHARS : 0;
+    }
+
+    /* Any other code page: no flag for the "replace" handler,
+       WC_NO_BEST_FIT_CHARS for every other handler */
+    is_replace = (errors != NULL && strcmp(errors, "replace") == 0);
+    return is_replace ? 0 : WC_NO_BEST_FIT_CHARS;
+}
+
+/*
+ * Encode a Unicode string to a Windows code page into a byte string in strict
+ * mode.
+ *
+ * The slice unicode[offset:offset+len] is converted with
+ * WideCharToMultiByte() and the bytes are stored in *outbytes (created when
+ * *outbytes is NULL, appended to otherwise).
+ *
+ * Returns 0 on success, returns -2 on encode error (Windows had to use a
+ * default character, or reported ERROR_NO_UNICODE_TRANSLATION), or raise a
+ * WindowsError and returns -1 on other error.
+ */
+static int
+encode_code_page_strict(UINT code_page, PyObject **outbytes,
+                        PyObject *unicode, Py_ssize_t offset, int len,
+                        const char* errors)
+{
+    BOOL usedDefaultChar = FALSE;
+    BOOL *pusedDefaultChar;
+    int outsize;
+    wchar_t *p;
+    Py_ssize_t size;
+    /* Strict mode always uses the flags computed for errors=NULL; the errors
+       argument is not consulted by this function. */
+    const DWORD flags = encode_code_page_flags(code_page, NULL);
+    char *out;
+    /* Create a substring so that we can get the UTF-16 representation
+       of just the slice under consideration. */
+    PyObject *substring;
+    DWORD err;
+
+    assert(len > 0);
+
+    /* The "used default char" indicator is only requested for code pages
+       other than CP_UTF8 and CP_UTF7 */
+    if (code_page != CP_UTF8 && code_page != CP_UTF7)
+        pusedDefaultChar = &usedDefaultChar;
+    else
+        pusedDefaultChar = NULL;
+
+    substring = PyUnicode_Substring(unicode, offset, offset+len);
+    if (substring == NULL)
+        return -1;
+    p = PyUnicode_AsUnicodeAndSize(substring, &size);
+    if (p == NULL) {
+        Py_DECREF(substring);
+        return -1;
+    }
+
+    /* First get the size of the result */
+    outsize = WideCharToMultiByte(code_page, flags,
+                                  p, size,
+                                  NULL, 0,
+                                  NULL, pusedDefaultChar);
+    if (outsize <= 0)
+        goto error;
+    /* If we used a default char, then we failed! */
+    if (pusedDefaultChar && *pusedDefaultChar) {
+        Py_DECREF(substring);
+        return -2;
+    }
+
+    if (*outbytes == NULL) {
+        /* Create string object */
+        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
+        if (*outbytes == NULL) {
+            Py_DECREF(substring);
+            return -1;
+        }
+        out = PyBytes_AS_STRING(*outbytes);
+    }
+    else {
+        /* Extend string object */
+        const Py_ssize_t n = PyBytes_Size(*outbytes);
+        if (outsize > PY_SSIZE_T_MAX - n) {
+            PyErr_NoMemory();
+            Py_DECREF(substring);
+            return -1;
+        }
+        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
+            Py_DECREF(substring);
+            return -1;
+        }
+        out = PyBytes_AS_STRING(*outbytes) + n;
+    }
+
+    /* Do the conversion */
+    outsize = WideCharToMultiByte(code_page, flags,
+                                  p, size,
+                                  out, outsize,
+                                  NULL, pusedDefaultChar);
+    if (outsize <= 0)
+        goto error;
+    Py_DECREF(substring);
+    if (pusedDefaultChar && *pusedDefaultChar)
+        return -2;
+    return 0;
+
+error:
+    /* Read the Windows error code before anything else: releasing the
+       substring may call into the allocator, which could overwrite the
+       thread's last-error value. */
+    err = GetLastError();
+    Py_DECREF(substring);
+    if (err == ERROR_NO_UNICODE_TRANSLATION)
+        return -2;
+    PyErr_SetFromWindowsErr(err);
+    return -1;
+}
+
+/*
+ * Encode a Unicode string to a Windows code page into a byte string using an
+ * error handler.
+ *
+ * Returns consumed characters if succeed, or raise a WindowsError and returns
+ * -1 on other error.
+ */
+static int
+encode_code_page_errors(UINT code_page, PyObject **outbytes,
+                        PyObject *unicode, Py_ssize_t unicode_offset,
+                        Py_ssize_t insize, const char* errors)
+{
+    const DWORD flags = encode_code_page_flags(code_page, errors);
+    Py_ssize_t pos = unicode_offset;
+    Py_ssize_t endin = unicode_offset + insize;
+    /* Ideally, we should get reason from FormatMessage. This is the Windows
+       2000 English version of the message. */
+    const char *reason = "invalid character";
+    /* 4=maximum length of a UTF-8 sequence */
+    char buffer[4];
+    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
+    Py_ssize_t outsize;
+    char *out;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+    PyObject *encoding_obj = NULL;
+    char *encoding;
+    Py_ssize_t newpos, newoutsize;
+    PyObject *rep;
+    int ret = -1;
+
+    assert(insize > 0);
+
+    encoding = code_page_name(code_page, &encoding_obj);
+    if (encoding == NULL)
+        return -1;
+
+    if (errors == NULL || strcmp(errors, "strict") == 0) {
+        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
+           then we raise a UnicodeEncodeError. */
+        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
+        if (exc != NULL) {
+            PyCodec_StrictErrors(exc);
+            Py_DECREF(exc);
+        }
+        Py_XDECREF(encoding_obj);
+        return -1;
+    }
+
+    if (code_page != CP_UTF8 && code_page != CP_UTF7)
+        pusedDefaultChar = &usedDefaultChar;
+    else
+        pusedDefaultChar = NULL;
+
+    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
+        PyErr_NoMemory();
+        goto error;
+    }
+    outsize = insize * Py_ARRAY_LENGTH(buffer);
+
+    if (*outbytes == NULL) {
+        /* Create string object */
+        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
+        if (*outbytes == NULL)
+            goto error;
+        out = PyBytes_AS_STRING(*outbytes);
+    }
+    else {
+        /* Extend string object */
+        Py_ssize_t n = PyBytes_Size(*outbytes);
+        if (n > PY_SSIZE_T_MAX - outsize) {
+            PyErr_NoMemory();
+            goto error;
+        }
+        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
+            goto error;
+        out = PyBytes_AS_STRING(*outbytes) + n;
+    }
+
+    /* Encode the string character per character */
+    while (pos < endin)
+    {
+        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
+        wchar_t chars[2];
+        int charsize;
+        if (ch < 0x10000) {
+            chars[0] = (wchar_t)ch;
+            charsize = 1;
+        }
+        else {
+            ch -= 0x10000;
+            chars[0] = 0xd800 + (ch >> 10);
+            chars[1] = 0xdc00 + (ch & 0x3ff);
+            charsize = 2;
+        }
+
+        outsize = WideCharToMultiByte(code_page, flags,
+                                      chars, charsize,
+                                      buffer, Py_ARRAY_LENGTH(buffer),
+                                      NULL, pusedDefaultChar);
+        if (outsize > 0) {
+            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
+            {
+                pos++;
+                memcpy(out, buffer, outsize);
+                out += outsize;
+                continue;
+            }
+        }
+        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
+            PyErr_SetFromWindowsErr(0);
+            goto error;
+        }
+
+        rep = _PyUnicode_EncodeCallErrorHandler(
+                  errors, &errorHandler, encoding, reason,
+                  unicode, &exc,
+                  pos, pos + 1, &newpos);
+        if (rep == NULL)
+            goto error;
+        pos = newpos;
+
+        if (PyBytes_Check(rep)) {
+            outsize = PyBytes_GET_SIZE(rep);
+            if (outsize != 1) {
+                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
+                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
+                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
+                    Py_DECREF(rep);
+                    goto error;
+                }
+                out = PyBytes_AS_STRING(*outbytes) + offset;
+            }
+            memcpy(out, PyBytes_AS_STRING(rep), outsize);
+            out += outsize;
+        }
+        else {
+            Py_ssize_t i;
+            enum PyUnicode_Kind kind;
+            void *data;
+
+            if (PyUnicode_READY(rep) == -1) {
+                Py_DECREF(rep);
+                goto error;
+            }
+
+            outsize = PyUnicode_GET_LENGTH(rep);
+            if (outsize != 1) {
+                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
+                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
+                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
+                    Py_DECREF(rep);
+                    goto error;
+                }
+                out = PyBytes_AS_STRING(*outbytes) + offset;
+            }
+            kind = PyUnicode_KIND(rep);
+            data = PyUnicode_DATA(rep);
+            for (i=0; i < outsize; i++) {
+                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+                if (ch > 127) {
+                    _PyUnicode_RaiseEncodeException(&exc,
+                        encoding, unicode,
+                        pos, pos + 1,
+                        "unable to encode error handler result to ASCII");
+                    Py_DECREF(rep);
+                    goto error;
+                }
+                *out = (unsigned char)ch;
+                out++;
+            }
+        }
+        Py_DECREF(rep);
+    }
+    /* write a NUL byte */
+    *out = 0;
+    outsize = out - PyBytes_AS_STRING(*outbytes);
+    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
+    if (_PyBytes_Resize(outbytes, outsize) < 0)
+        goto error;
+    ret = 0;
+
+error:
+    Py_XDECREF(encoding_obj);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return ret;
+}
+
+/* Encode `unicode` to the Windows code page `code_page`.
+
+   The string is processed in chunks whose length fits in an int. Each chunk
+   is first handed to encode_code_page_strict(); when that reports an encode
+   error (-2), the chunk is encoded again by encode_code_page_errors() using
+   the `errors` handler.
+
+   Returns a new bytes object, or NULL with an exception set. */
+static PyObject *
+encode_code_page(int code_page,
+                 PyObject *unicode,
+                 const char *errors)
+{
+    PyObject *result = NULL;
+    Py_ssize_t remaining;
+    Py_ssize_t start = 0;
+    int chunk, status, last;
+
+    if (PyUnicode_READY(unicode) == -1)
+        return NULL;
+    remaining = PyUnicode_GET_LENGTH(unicode);
+
+    if (code_page < 0) {
+        PyErr_SetString(PyExc_ValueError, "invalid code page number");
+        return NULL;
+    }
+
+    if (remaining == 0)
+        return PyBytes_FromStringAndSize(NULL, 0);
+
+    for (;;) {
+#ifdef NEED_RETRY
+        /* The UTF-16 form may be twice as long as the string, so a chunk
+           never exceeds INT_MAX/2 characters. */
+        last = (remaining <= INT_MAX/2);
+        chunk = last ? (int)remaining : INT_MAX/2;
+#else
+        last = 1;
+        chunk = (int)remaining;
+#endif
+
+        status = encode_code_page_strict(code_page, &result,
+                                         unicode, start, chunk,
+                                         errors);
+        if (status == -2) {
+            /* Encode error: retry the chunk with the error handler */
+            status = encode_code_page_errors(code_page, &result,
+                                             unicode, start,
+                                             chunk, errors);
+        }
+        if (status < 0) {
+            Py_XDECREF(result);
+            return NULL;
+        }
+
+        if (last)
+            break;
+        start += chunk;
+        remaining -= chunk;
+    }
+
+    return result;
+}
+
+/* Encode `unicode` to the Windows code page `code_page` using the `errors`
+   error handler. Public entry point: forwards its arguments unchanged to
+   encode_code_page() and returns that function's result. */
+PyObject *
+PyUnicode_EncodeCodePage(int code_page,
+                         PyObject *unicode,
+                         const char *errors)
+{
+    return encode_code_page(code_page, unicode, errors);
+}
+
+PyObject *
+PyUnicode_AsMBCSString(PyObject *unicode)
+{
+    if (!PyUnicode_Check(unicode)) {
+        PyErr_BadArgument();
+        return NULL;
+    }
+    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
+}
+
+#undef NEED_RETRY
+
+#endif /* HAVE_MBCS */
+
+/* Find the first wide character of `wstr` that wcstombs() fails to convert.
+
+   The string is converted one character at a time (a surrogate pair counts as
+   one character when wchar_t is 16 bits wide). Returns the index, in wchar_t
+   units, of the first character rejected by wcstombs(), or 0 if every
+   character converts. */
+static size_t
+wcstombs_errorpos(const wchar_t *wstr)
+{
+    const wchar_t *cursor = wstr;
+    char encoded[MB_LEN_MAX];
+    /* NUL-terminated scratch string holding the character under test */
+#if SIZEOF_WCHAR_T == 2
+    wchar_t unit[3] = {0, 0, 0};
+#else
+    wchar_t unit[2] = {0, 0};
+#endif
+
+    while (*cursor != L'\0') {
+        const wchar_t *mark = cursor;
+        size_t rc;
+
+#if SIZEOF_WCHAR_T == 2
+        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
+            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
+        {
+            /* keep both halves of the surrogate pair together */
+            unit[0] = cursor[0];
+            unit[1] = cursor[1];
+            cursor += 2;
+        }
+        else {
+            unit[0] = cursor[0];
+            unit[1] = 0;
+            cursor += 1;
+        }
+#else
+        unit[0] = cursor[0];
+        cursor += 1;
+#endif
+        rc = wcstombs(encoded, unit, sizeof(encoded));
+        if (rc == (size_t)-1)
+            return mark - wstr;
+    }
+
+    /* no unencodable character was found */
+    return 0;
+}
+
+static int
+locale_error_handler(const char *errors, int *surrogateescape)
+{
+    if (errors == NULL) {
+        *surrogateescape = 0;
+        return 0;
+    }
+
+    if (strcmp(errors, "strict") == 0) {
+        *surrogateescape = 0;
+        return 0;
+    }
+    if (strcmp(errors, "surrogateescape") == 0) {
+        *surrogateescape = 1;
+        return 0;
+    }
+    PyErr_Format(PyExc_ValueError,
+                 "only 'strict' and 'surrogateescape' error handlers "
+                 "are supported, not '%s'",
+                 errors);
+    return -1;
+}
+
+/* Encode `unicode` to a bytes object using the current locale encoding.
+
+   `errors` must be NULL, "strict" or "surrogateescape" (see
+   locale_error_handler()). With "surrogateescape" the conversion is done by
+   _Py_wchar2char(), otherwise by wcstombs().
+
+   Returns a new bytes object, or NULL with an exception set: ValueError for
+   an unsupported error handler, TypeError if the string contains an embedded
+   null character, MemoryError, or UnicodeEncodeError when a character cannot
+   be encoded. */
+PyObject *
+PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
+{
+    Py_ssize_t wlen, wlen2;
+    wchar_t *wstr;
+    PyObject *bytes = NULL;
+    char *errmsg;
+    PyObject *reason;
+    PyObject *exc;
+    size_t error_pos;
+    int surrogateescape;
+
+    if (locale_error_handler(errors, &surrogateescape) < 0)
+        return NULL;
+
+    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
+    if (wstr == NULL)
+        return NULL;
+
+    /* The C conversion functions stop at the first null character: reject
+       strings whose wcslen() differs from their real length. */
+    wlen2 = wcslen(wstr);
+    if (wlen2 != wlen) {
+        PyMem_Free(wstr);
+        PyErr_SetString(PyExc_TypeError, "embedded null character");
+        return NULL;
+    }
+
+    if (surrogateescape) {
+        /* locale encoding with surrogateescape */
+        char *str;
+
+        str = _Py_wchar2char(wstr, &error_pos);
+        if (str == NULL) {
+            /* error_pos == (size_t)-1 is treated as a memory error; any
+               other value is the position used for the UnicodeEncodeError */
+            if (error_pos == (size_t)-1) {
+                PyErr_NoMemory();
+                PyMem_Free(wstr);
+                return NULL;
+            }
+            else {
+                goto encode_error;
+            }
+        }
+        PyMem_Free(wstr);
+
+        bytes = PyBytes_FromString(str);
+        PyMem_Free(str);
+    }
+    else {
+        size_t len, len2;
+
+        /* First call: only compute the size of the result */
+        len = wcstombs(NULL, wstr, 0);
+        if (len == (size_t)-1) {
+            error_pos = (size_t)-1;
+            goto encode_error;
+        }
+
+        bytes = PyBytes_FromStringAndSize(NULL, len);
+        if (bytes == NULL) {
+            PyMem_Free(wstr);
+            return NULL;
+        }
+
+        /* Second call: convert into the bytes buffer (len+1 leaves room for
+           the terminating null byte) */
+        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
+        if (len2 == (size_t)-1 || len2 > len) {
+            error_pos = (size_t)-1;
+            goto encode_error;
+        }
+        PyMem_Free(wstr);
+    }
+    return bytes;
+
+encode_error:
+    /* wstr is still owned here; bytes may or may not have been allocated */
+    errmsg = strerror(errno);
+    assert(errmsg != NULL);
+
+    /* (size_t)-1 means the position is unknown: search for it */
+    if (error_pos == (size_t)-1)
+        error_pos = wcstombs_errorpos(wstr);
+
+    PyMem_Free(wstr);
+    Py_XDECREF(bytes);
+
+    /* Build the exception reason from the strerror() message; fall back to a
+       fixed message if the message cannot be decoded. */
+    if (errmsg != NULL) {
+        size_t errlen;
+        wstr = _Py_char2wchar(errmsg, &errlen);
+        if (wstr != NULL) {
+            reason = PyUnicode_FromWideChar(wstr, errlen);
+            PyMem_Free(wstr);
+        } else
+            errmsg = NULL;
+    }
+    if (errmsg == NULL)
+        reason = PyUnicode_FromString(
+            "wcstombs() encountered an unencodable "
+            "wide character");
+    if (reason == NULL)
+        return NULL;
+
+    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
+                                "locale", unicode,
+                                (Py_ssize_t)error_pos,
+                                (Py_ssize_t)(error_pos+1),
+                                reason);
+    Py_DECREF(reason);
+    if (exc != NULL) {
+        /* The strict error handler is used to raise the exception object */
+        PyCodec_StrictErrors(exc);
+        Py_XDECREF(exc);
+    }
+    return NULL;
+}
+
+/* Encode `unicode` for use as a filesystem name.
+
+   The codec is chosen at compile time: the CP_ACP code page when HAVE_MBCS is
+   defined, UTF-8 with "surrogateescape" on __APPLE__, otherwise
+   Py_FileSystemDefaultEncoding or the locale codec (both with
+   "surrogateescape"), depending on whether the filesystem codec has been
+   initialized.
+
+   Returns the result of the selected encoder. */
+PyObject *
+PyUnicode_EncodeFSDefault(PyObject *unicode)
+{
+#ifdef HAVE_MBCS
+    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
+#elif defined(__APPLE__)
+    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
+#else
+    PyInterpreterState *interp = PyThreadState_GET()->interp;
+    /* Bootstrap check: if the filesystem codec is implemented in Python, we
+       cannot use it to encode and decode filenames before it is loaded. Load
+       the Python codec requires to encode at least its own filename. Use the C
+       version of the locale codec until the codec registry is initialized and
+       the Python codec is loaded.
+
+       Py_FileSystemDefaultEncoding is shared between all interpreters, we
+       cannot only rely on it: check also interp->fscodec_initialized for
+       subinterpreters. */
+    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
+        return PyUnicode_AsEncodedString(unicode,
+                                         Py_FileSystemDefaultEncoding,
+                                         "surrogateescape");
+    }
+    else {
+        return PyUnicode_EncodeLocale(unicode, "surrogateescape");
+    }
+#endif
+}
+
+/* Find the offset of the first byte sequence in str[0:len] that cannot be
+   decoded.
+
+   When HAVE_MBRTOWC is defined, the string is decoded one character at a time
+   with mbrtowc() and the byte offset of the first invalid or incomplete
+   sequence is returned. Returns 0 if no such sequence is found, or if
+   mbrtowc() is not available. */
+static size_t
+mbstowcs_errorpos(const char *str, size_t len)
+{
+#ifdef HAVE_MBRTOWC
+    const char *cursor = str;
+    size_t left = len;
+    mbstate_t state;
+    wchar_t wc;
+
+    memset(&state, 0, sizeof state);
+    while (left != 0) {
+        size_t n = mbrtowc(&wc, cursor, left, &state);
+
+        if (n == 0) {
+            /* Reached end of string */
+            break;
+        }
+        if (n == (size_t)-1 || n == (size_t)-2) {
+            /* Conversion error or incomplete character */
+            return cursor - str;
+        }
+        cursor += n;
+        left -= n;
+    }
+    /* failed to find the undecodable byte sequence */
+    return 0;
+#else
+    return 0;
+#endif
+}
+
+/* Decode the byte string str[0:len] from the current locale encoding.
+
+   `str` must be null-terminated at str[len] and must not contain embedded
+   null bytes. `errors` must be NULL, "strict" or "surrogateescape" (see
+   locale_error_handler()). With "surrogateescape" the conversion is done by
+   _Py_char2wchar(), otherwise by mbstowcs().
+
+   Returns a new str object, or NULL with an exception set: ValueError for an
+   unsupported error handler, TypeError for an embedded null character,
+   MemoryError, OSError, or UnicodeDecodeError when the bytes cannot be
+   decoded. */
+PyObject*
+PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
+                              const char *errors)
+{
+    /* Stack buffer used for short strings to avoid a heap allocation */
+    wchar_t smallbuf[256];
+    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
+    wchar_t *wstr;
+    size_t wlen, wlen2;
+    PyObject *unicode;
+    int surrogateescape;
+    size_t error_pos;
+    char *errmsg;
+    PyObject *reason, *exc;
+
+    if (locale_error_handler(errors, &surrogateescape) < 0)
+        return NULL;
+
+    /* The C conversion functions work on null-terminated strings: the string
+       must end at str[len] and nowhere before. */
+    if (str[len] != '\0' || len != strlen(str)) {
+        PyErr_SetString(PyExc_TypeError, "embedded null character");
+        return NULL;
+    }
+
+    if (surrogateescape)
+    {
+        wstr = _Py_char2wchar(str, &wlen);
+        if (wstr == NULL) {
+            /* wlen == (size_t)-1 is treated as a memory error */
+            if (wlen == (size_t)-1)
+                PyErr_NoMemory();
+            else
+                PyErr_SetFromErrno(PyExc_OSError);
+            return NULL;
+        }
+
+        unicode = PyUnicode_FromWideChar(wstr, wlen);
+        PyMem_Free(wstr);
+    }
+    else {
+        /* First pass: compute the number of wide characters needed */
+#ifndef HAVE_BROKEN_MBSTOWCS
+        wlen = mbstowcs(NULL, str, 0);
+#else
+        wlen = len;
+#endif
+        if (wlen == (size_t)-1)
+            goto decode_error;
+        if (wlen+1 <= smallbuf_len) {
+            wstr = smallbuf;
+        }
+        else {
+            /* Guard the (wlen+1) * sizeof(wchar_t) computation below
+               against overflow */
+            if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
+                return PyErr_NoMemory();
+
+            wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
+            if (!wstr)
+                return PyErr_NoMemory();
+        }
+
+        /* This shouldn't fail now */
+        wlen2 = mbstowcs(wstr, str, wlen+1);
+        if (wlen2 == (size_t)-1) {
+            if (wstr != smallbuf)
+                PyMem_Free(wstr);
+            goto decode_error;
+        }
+#ifdef HAVE_BROKEN_MBSTOWCS
+        assert(wlen2 == wlen);
+#endif
+        unicode = PyUnicode_FromWideChar(wstr, wlen2);
+        if (wstr != smallbuf)
+            PyMem_Free(wstr);
+    }
+    return unicode;
+
+decode_error:
+    /* No buffer is owned when this label is reached */
+    errmsg = strerror(errno);
+    assert(errmsg != NULL);
+
+    error_pos = mbstowcs_errorpos(str, len);
+    /* Build the exception reason from the strerror() message; fall back to a
+       fixed message if the message cannot be decoded. */
+    if (errmsg != NULL) {
+        size_t errlen;
+        wstr = _Py_char2wchar(errmsg, &errlen);
+        if (wstr != NULL) {
+            reason = PyUnicode_FromWideChar(wstr, errlen);
+            PyMem_Free(wstr);
+        } else
+            errmsg = NULL;
+    }
+    if (errmsg == NULL)
+        reason = PyUnicode_FromString(
+            "mbstowcs() encountered an invalid multibyte sequence");
+    if (reason == NULL)
+        return NULL;
+
+    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
+                                "locale", str, len,
+                                (Py_ssize_t)error_pos,
+                                (Py_ssize_t)(error_pos+1),
+                                reason);
+    Py_DECREF(reason);
+    if (exc != NULL) {
+        /* The strict error handler is used to raise the exception object */
+        PyCodec_StrictErrors(exc);
+        Py_XDECREF(exc);
+    }
+    return NULL;
+}
+
+/* Decode the null-terminated byte string `str` from the current locale
+   encoding: computes its length with strlen() and calls
+   PyUnicode_DecodeLocaleAndSize(). */
+PyObject*
+PyUnicode_DecodeLocale(const char *str, const char *errors)
+{
+    Py_ssize_t size = (Py_ssize_t)strlen(str);
+    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
+}
+
+
+/* Decode the null-terminated byte string `s` with the filesystem codec:
+   computes its length with strlen() and calls
+   PyUnicode_DecodeFSDefaultAndSize(). */
+PyObject*
+PyUnicode_DecodeFSDefault(const char *s) {
+    Py_ssize_t size = (Py_ssize_t)strlen(s);
+    return PyUnicode_DecodeFSDefaultAndSize(s, size);
+}
+
+/* Decode the byte string s[0:size] with the filesystem codec.
+
+   The codec is chosen at compile time: MBCS when HAVE_MBCS is defined, UTF-8
+   with "surrogateescape" on __APPLE__, otherwise
+   Py_FileSystemDefaultEncoding or the locale codec (both with
+   "surrogateescape"), depending on whether the filesystem codec has been
+   initialized.
+
+   Returns the result of the selected decoder. */
+PyObject*
+PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
+{
+#ifdef HAVE_MBCS
+    return PyUnicode_DecodeMBCS(s, size, NULL);
+#elif defined(__APPLE__)
+    return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
+#else
+    PyInterpreterState *interp = PyThreadState_GET()->interp;
+    /* Bootstrap check: if the filesystem codec is implemented in Python, we
+       cannot use it to encode and decode filenames before it is loaded. Load
+       the Python codec requires to encode at least its own filename. Use the C
+       version of the locale codec until the codec registry is initialized and
+       the Python codec is loaded.
+
+       Py_FileSystemDefaultEncoding is shared between all interpreters, we
+       cannot only rely on it: check also interp->fscodec_initialized for
+       subinterpreters. */
+    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
+        return PyUnicode_Decode(s, size,
+                                Py_FileSystemDefaultEncoding,
+                                "surrogateescape");
+    }
+    else {
+        return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
+    }
+#endif
+}
+
+/* Converter storing a bytes filename in *(PyObject**)addr.
+
+   - arg == NULL: cleanup call; releases the object previously stored at
+     `addr` and returns 1.
+   - bytes objects are stored as is (with a new reference).
+   - anything else is converted with PyUnicode_FromObject() and encoded with
+     PyUnicode_EncodeFSDefault().
+
+   A result containing a NUL byte is rejected with TypeError. Returns
+   Py_CLEANUP_SUPPORTED on success and 0 on error (exception set). */
+int
+PyUnicode_FSConverter(PyObject* arg, void* addr)
+{
+    PyObject **target = (PyObject **)addr;
+    PyObject *encoded = NULL;
+    Py_ssize_t nbytes;
+    void *raw;
+
+    if (arg == NULL) {
+        Py_DECREF(*target);
+        return 1;
+    }
+
+    if (!PyBytes_Check(arg)) {
+        PyObject *text = PyUnicode_FromObject(arg);
+        if (text == NULL)
+            return 0;
+        encoded = PyUnicode_EncodeFSDefault(text);
+        Py_DECREF(text);
+        if (encoded == NULL)
+            return 0;
+        if (!PyBytes_Check(encoded)) {
+            Py_DECREF(encoded);
+            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
+            return 0;
+        }
+    }
+    else {
+        Py_INCREF(arg);
+        encoded = arg;
+    }
+
+    /* strlen() stops at the first NUL byte: a mismatch with the real size
+       means that the filename has an embedded NUL */
+    nbytes = PyBytes_GET_SIZE(encoded);
+    raw = PyBytes_AS_STRING(encoded);
+    if (nbytes != strlen(raw)) {
+        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
+        Py_DECREF(encoded);
+        return 0;
+    }
+
+    *target = encoded;
+    return Py_CLEANUP_SUPPORTED;
+}
+
+
+/* Converter storing a str filename in *(PyObject**)addr.
+
+   - arg == NULL: cleanup call; releases the object previously stored at
+     `addr` and returns 1.
+   - str objects are stored as is (with a new reference).
+   - anything else is converted with PyBytes_FromObject() and decoded with
+     PyUnicode_DecodeFSDefaultAndSize().
+
+   A result containing a NUL character is rejected with TypeError. Returns
+   Py_CLEANUP_SUPPORTED on success and 0 on error (exception set). */
+int
+PyUnicode_FSDecoder(PyObject* arg, void* addr)
+{
+    PyObject **target = (PyObject **)addr;
+    PyObject *decoded = NULL;
+    Py_ssize_t nul_index;
+
+    if (arg == NULL) {
+        Py_DECREF(*target);
+        return 1;
+    }
+
+    if (!PyUnicode_Check(arg)) {
+        PyObject *raw = PyBytes_FromObject(arg);
+        if (raw == NULL)
+            return 0;
+        decoded = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(raw),
+                                                   PyBytes_GET_SIZE(raw));
+        Py_DECREF(raw);
+        if (decoded == NULL)
+            return 0;
+        if (!PyUnicode_Check(decoded)) {
+            Py_DECREF(decoded);
+            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
+            return 0;
+        }
+    }
+    else {
+        if (PyUnicode_READY(arg) == -1)
+            return 0;
+        Py_INCREF(arg);
+        decoded = arg;
+    }
+
+    if (PyUnicode_READY(decoded) == -1) {
+        Py_DECREF(decoded);
+        return 0;
+    }
+
+    /* -2 is handled as an error, any index >= 0 as a found NUL character */
+    nul_index = PyUnicode_FindChar(decoded, '\0',
+                                   0, PyUnicode_GET_LENGTH(decoded), 1);
+    if (nul_index == -2) {
+        Py_DECREF(decoded);
+        return 0;
+    }
+    if (nul_index >= 0) {
+        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
+        Py_DECREF(decoded);
+        return 0;
+    }
+
+    *target = decoded;
+    return Py_CLEANUP_SUPPORTED;
+}
+

Objects/unicodeoscodecs.h

-/* Operating system codecs: PyUnicode_EncodeFSDefault(),
-   PyUnicode_DecodeFSDefault(), PyUnicode_DecodeMBCSStateful(),
-   PyUnicode_EncodeCodePage(), etc. */
-
-#ifdef HAVE_MBCS
-static OSVERSIONINFOEX winver;
-#endif
-
-#ifdef HAVE_MBCS
-
-/* --- MBCS codecs for Windows -------------------------------------------- */
-
-#if SIZEOF_INT < SIZEOF_SIZE_T
-#define NEED_RETRY
-#endif
-
-#ifndef WC_ERR_INVALID_CHARS
-#  define WC_ERR_INVALID_CHARS 0x0080
-#endif
-
-static char*
-code_page_name(UINT code_page, PyObject **obj)
-{
-    *obj = NULL;
-    if (code_page == CP_ACP)
-        return "mbcs";
-    if (code_page == CP_UTF7)
-        return "CP_UTF7";
-    if (code_page == CP_UTF8)
-        return "CP_UTF8";
-
-    *obj = PyBytes_FromFormat("cp%u", code_page);
-    if (*obj == NULL)
-        return NULL;
-    return PyBytes_AS_STRING(*obj);
-}
-
-static int
-is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
-{
-    const char *curr = s + offset;
-    const char *prev;
-
-    if (!IsDBCSLeadByteEx(code_page, *curr))
-        return 0;
-
-    prev = CharPrevExA(code_page, s, curr, 0);
-    if (prev == curr)
-        return 1;
-    /* FIXME: This code is limited to "true" double-byte encodings,
-       as it assumes an incomplete character consists of a single
-       byte. */
-    if (curr - prev == 2)
-        return 1;
-    if (!IsDBCSLeadByteEx(code_page, *prev))
-        return 1;
-    return 0;
-}
-
-static DWORD
-decode_code_page_flags(UINT code_page)
-{
-    if (code_page == CP_UTF7) {
-        /* The CP_UTF7 decoder only supports flags=0 */
-        return 0;
-    }
-    else
-        return MB_ERR_INVALID_CHARS;
-}
-
-/*
- * Decode a byte string from a Windows code page into unicode object in strict
- * mode.
- *
- * Returns consumed size if succeed, returns -2 on decode error, or raise a
- * WindowsError and returns -1 on other error.
- */
-static int
-decode_code_page_strict(UINT code_page,
-                        PyObject **v,
-                        const char *in,
-                        int insize)
-{
-    const DWORD flags = decode_code_page_flags(code_page);
-    wchar_t *out;
-    DWORD outsize;
-
-    /* First get the size of the result */
-    assert(insize > 0);
-    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
-    if (outsize <= 0)
-        goto error;
-
-    if (*v == NULL) {
-        /* Create unicode object */
-        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
-        *v = (PyObject*)_PyUnicode_New(outsize);
-        if (*v == NULL)
-            return -1;
-        out = PyUnicode_AS_UNICODE(*v);
-    }
-    else {
-        /* Extend unicode object */
-        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
-        if (PyUnicode_Resize(v, n + outsize) < 0)
-            return -1;
-        out = PyUnicode_AS_UNICODE(*v) + n;
-    }
-
-    /* Do the conversion */
-    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
-    if (outsize <= 0)
-        goto error;
-    return insize;
-
-error:
-    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
-        return -2;
-    PyErr_SetFromWindowsErr(0);
-    return -1;
-}
-
-/*
- * Decode a byte string from a code page into unicode object with an error
- * handler.
- *
- * Returns consumed size if succeed, or raise a WindowsError or
- * UnicodeDecodeError exception and returns -1 on error.
- */
-static int
-decode_code_page_errors(UINT code_page,
-                        PyObject **v,
-                        const char *in, const int size,
-                        const char *errors)
-{
-    const char *startin = in;
-    const char *endin = in + size;
-    const DWORD flags = decode_code_page_flags(code_page);
-    /* Ideally, we should get reason from FormatMessage. This is the Windows
-       2000 English version of the message. */
-    const char *reason = "No mapping for the Unicode character exists "
-                         "in the target code page.";
-    /* each step cannot decode more than 1 character, but a character can be
-       represented as a surrogate pair */
-    wchar_t buffer[2], *startout, *out;
-    int insize, outsize;
-    PyObject *errorHandler = NULL;
-    PyObject *exc = NULL;
-    PyObject *encoding_obj = NULL;
-    char *encoding;
-    DWORD err;
-    int ret = -1;
-
-    assert(size > 0);
-
-    encoding = code_page_name(code_page, &encoding_obj);
-    if (encoding == NULL)
-        return -1;
-
-    if (errors == NULL || strcmp(errors, "strict") == 0) {
-        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
-           UnicodeDecodeError. */
-        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
-        if (exc != NULL) {
-            PyCodec_StrictErrors(exc);
-            Py_CLEAR(exc);
-        }
-        goto error;
-    }
-
-    if (*v == NULL) {
-        /* Create unicode object */
-        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
-            PyErr_NoMemory();
-            goto error;
-        }
-        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
-        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
-        if (*v == NULL)
-            goto error;
-        startout = PyUnicode_AS_UNICODE(*v);
-    }
-    else {
-        /* Extend unicode object */
-        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
-        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
-            PyErr_NoMemory();
-            goto error;
-        }
-        if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
-            goto error;
-        startout = PyUnicode_AS_UNICODE(*v) + n;
-    }
-
-    /* Decode the byte string character per character */
-    out = startout;
-    while (in < endin)
-    {
-        /* Decode a character */
-        insize = 1;
-        do
-        {
-            outsize = MultiByteToWideChar(code_page, flags,
-                                          in, insize,
-                                          buffer, Py_ARRAY_LENGTH(buffer));
-            if (outsize > 0)
-                break;
-            err = GetLastError();
-            if (err != ERROR_NO_UNICODE_TRANSLATION
-                && err != ERROR_INSUFFICIENT_BUFFER)
-            {
-                PyErr_SetFromWindowsErr(0);
-                goto error;
-            }
-            insize++;
-        }
-        /* 4=maximum length of a UTF-8 sequence */
-        while (insize <= 4 && (in + insize) <= endin);
-
-        if (outsize <= 0) {
-            Py_ssize_t startinpos, endinpos, outpos;
-
-            startinpos = in - startin;
-            endinpos = startinpos + 1;
-            outpos = out - PyUnicode_AS_UNICODE(*v);
-            if (unicode_decode_call_errorhandler(
-                    errors, &errorHandler,
-                    encoding, reason,
-                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
-                    v, &outpos))
-            {
-                goto error;
-            }
-            out = PyUnicode_AS_UNICODE(*v) + outpos;
-        }
-        else {
-            in += insize;
-            memcpy(out, buffer, outsize * sizeof(wchar_t));
-            out += outsize;
-        }
-    }
-
-    /* write a NUL character at the end */
-    *out = 0;
-
-    /* Extend unicode object */
-    outsize = out - startout;
-    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
-    if (PyUnicode_Resize(v, outsize) < 0)
-        goto error;
-    ret = size;
-
-error:
-    Py_XDECREF(encoding_obj);
-    Py_XDECREF(errorHandler);
-    Py_XDECREF(exc);
-    return ret;
-}
-
-static PyObject *
-decode_code_page_stateful(int code_page,
-                          const char *s, Py_ssize_t size,
-                          const char *errors, Py_ssize_t *consumed)
-{
-    PyObject *v = NULL;
-    int chunk_size, final, converted, done;
-
-    if (code_page < 0) {
-        PyErr_SetString(PyExc_ValueError, "invalid code page number");
-        return NULL;
-    }
-
-    if (consumed)
-        *consumed = 0;
-
-    do
-    {
-#ifdef NEED_RETRY
-        if (size > INT_MAX) {
-            chunk_size = INT_MAX;
-            final = 0;
-            done = 0;
-        }
-        else
-#endif
-        {
-            chunk_size = (int)size;
-            final = (consumed == NULL);
-            done = 1;
-        }
-
-        /* Skip trailing lead-byte unless 'final' is set */
-        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
-            --chunk_size;
-
-        if (chunk_size == 0 && done) {
-            if (v != NULL)
-                break;
-            return PyUnicode_New(0, 0);
-        }
-
-
-        converted = decode_code_page_strict(code_page, &v,
-                                            s, chunk_size);
-        if (converted == -2)
-            converted = decode_code_page_errors(code_page, &v,
-                                                s, chunk_size,
-                                                errors);
-        assert(converted != 0);
-
-        if (converted < 0) {
-            Py_XDECREF(v);
-            return NULL;
-        }
-
-        if (consumed)
-            *consumed += converted;
-
-        s += converted;
-        size -= converted;
-    } while (!done);
-
-    return _PyUnicode_Result(v);
-}
-
-PyObject *
-PyUnicode_DecodeCodePageStateful(int code_page,
-                                 const char *s,
-                                 Py_ssize_t size,
-                                 const char *errors,
-                                 Py_ssize_t *consumed)
-{
-    return decode_code_page_stateful(code_page, s, size, errors, consumed);
-}
-
-PyObject *
-PyUnicode_DecodeMBCSStateful(const char *s,
-                             Py_ssize_t size,
-                             const char *errors,
-                             Py_ssize_t *consumed)
-{
-    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
-}
-
-PyObject *
-PyUnicode_DecodeMBCS(const char *s,
-                     Py_ssize_t size,
-                     const char *errors)
-{
-    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
-}
-
-static DWORD
-encode_code_page_flags(UINT code_page, const char *errors)
-{
-    if (code_page == CP_UTF8) {
-        if (winver.dwMajorVersion >= 6)
-            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
-               and later */
-            return WC_ERR_INVALID_CHARS;
-        else
-            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
-            return 0;
-    }
-    else if (code_page == CP_UTF7) {
-        /* CP_UTF7 only supports flags=0 */
-        return 0;
-    }
-    else {
-        if (errors != NULL && strcmp(errors, "replace") == 0)
-            return 0;
-        else
-            return WC_NO_BEST_FIT_CHARS;
-    }
-}
-
-/*
- * Encode the slice unicode[offset:offset+len] to a Windows code page in strict
- * mode and append the encoded bytes to *outbytes (the bytes object is created
- * if *outbytes is NULL, resized otherwise).
- *
- * The errors argument is not used: the conversion flags are always computed
- * as for errors=NULL.
- *
- * Returns 0 on success, -2 on encode error (a default character was used or
- * Windows reported ERROR_NO_UNICODE_TRANSLATION), or sets an exception (a
- * WindowsError when WideCharToMultiByte() fails) and returns -1 on any other
- * error.
- */
-static int
-encode_code_page_strict(UINT code_page, PyObject **outbytes,
-                        PyObject *unicode, Py_ssize_t offset, int len,
-                        const char* errors)
-{
-    BOOL usedDefaultChar = FALSE;
-    BOOL *pusedDefaultChar;
-    int outsize;
-    int result = -1;
-    DWORD last_error;
-    wchar_t *p;
-    Py_ssize_t size;
-    const DWORD flags = encode_code_page_flags(code_page, NULL);
-    char *out;
-    /* Create a substring so that we can get the UTF-16 representation
-       of just the slice under consideration. */
-    PyObject *substring;
-
-    (void)errors;
-    assert(len > 0);
-
-    /* The "used default char" output is only requested for code pages other
-       than UTF-8 and UTF-7 */
-    if (code_page != CP_UTF8 && code_page != CP_UTF7)
-        pusedDefaultChar = &usedDefaultChar;
-    else
-        pusedDefaultChar = NULL;
-
-    substring = PyUnicode_Substring(unicode, offset, offset+len);
-    if (substring == NULL)
-        return -1;
-    p = PyUnicode_AsUnicodeAndSize(substring, &size);
-    if (p == NULL)
-        goto done;
-
-    /* First get the size of the result */
-    outsize = WideCharToMultiByte(code_page, flags,
-                                  p, size,
-                                  NULL, 0,
-                                  NULL, pusedDefaultChar);
-    if (outsize <= 0)
-        goto error;
-    /* If we used a default char, then we failed! */
-    if (pusedDefaultChar && *pusedDefaultChar) {
-        result = -2;
-        goto done;
-    }
-
-    if (*outbytes == NULL) {
-        /* Create string object */
-        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
-        if (*outbytes == NULL)
-            goto done;
-        out = PyBytes_AS_STRING(*outbytes);
-    }
-    else {
-        /* Extend string object */
-        const Py_ssize_t n = PyBytes_Size(*outbytes);
-        if (outsize > PY_SSIZE_T_MAX - n) {
-            PyErr_NoMemory();
-            goto done;
-        }
-        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
-            goto done;
-        out = PyBytes_AS_STRING(*outbytes) + n;
-    }
-
-    /* Do the conversion */
-    outsize = WideCharToMultiByte(code_page, flags,
-                                  p, size,
-                                  out, outsize,
-                                  NULL, pusedDefaultChar);
-    if (outsize <= 0)
-        goto error;
-    if (pusedDefaultChar && *pusedDefaultChar)
-        result = -2;
-    else
-        result = 0;
-    goto done;
-
-error:
-    /* Read the error code first: releasing the substring (or any other call)
-       could overwrite the thread's last-error value */
-    last_error = GetLastError();
-    if (last_error == ERROR_NO_UNICODE_TRANSLATION)
-        result = -2;
-    else
-        PyErr_SetFromWindowsErr((int)last_error);
-
-done:
-    Py_DECREF(substring);
-    return result;
-}
-
-/*
- * Encode a Unicode string to a Windows code page into a byte string using an
- * error handler.
- *
- * Returns consumed characters if succeed, or raise a WindowsError and returns
- * -1 on other error.
- */
-static int
-encode_code_page_errors(UINT code_page, PyObject **outbytes,
-                        PyObject *unicode, Py_ssize_t unicode_offset,
-                        Py_ssize_t insize, const char* errors)
-{
-    const DWORD flags = encode_code_page_flags(code_page, errors);
-    Py_ssize_t pos = unicode_offset;
-    Py_ssize_t endin = unicode_offset + insize;
-    /* Ideally, we should get reason from FormatMessage. This is the Windows
-       2000 English version of the message. */
-    const char *reason = "invalid character";
-    /* 4=maximum length of a UTF-8 sequence */
-    char buffer[4];
-    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
-    Py_ssize_t outsize;
-    char *out;
-    PyObject *errorHandler = NULL;
-    PyObject *exc = NULL;