Commits

Victor Stinner committed edf029f

Close #17694: Add minimum length to _PyUnicodeWriter

* Add also min_char attribute to _PyUnicodeWriter structure (currently unused)
* _PyUnicodeWriter_Init() has no more argument (except the writer itself):
min_length and overallocate must be set explicitly
* In error handlers, only enable overallocation if the replacement string
is longer than 1 character
* CJK decoders don't use overallocation anymore
* Set min_length, instead of preallocating memory using
_PyUnicodeWriter_Prepare(), in many decoders
* _PyUnicode_DecodeUnicodeInternal() checks for integer overflow

Comments (0)

Files changed (7)

Include/unicodeobject.h

     Py_UCS4 maxchar;
     Py_ssize_t size;
     Py_ssize_t pos;
-    /* minimum length of the buffer when overallocation is enabled,
-       see _PyUnicodeWriter_Init() */
+
+    /* minimum number of allocated characters (default: 0) */
     Py_ssize_t min_length;
+
+    /* minimum character (default: 127, ASCII) */
+    Py_UCS4 min_char;
+
+    /* If non-zero, overallocate the buffer by 25% (default: 0). */
     unsigned char overallocate;
+
     /* If readonly is 1, buffer is a shared string (cannot be modified)
        and size is set to 0. */
     unsigned char readonly;
 } _PyUnicodeWriter ;
 
 /* Initialize a Unicode writer.
-
-   If min_length is greater than zero, _PyUnicodeWriter_Prepare()
-   overallocates the buffer and min_length is the minimum length in characters
-   of the buffer. */
+ *
+ * By default, the minimum buffer size is 0 character and overallocation is
+ * disabled. Set min_length, min_char and overallocate attributes to control
+ * the allocation of the buffer. */
 PyAPI_FUNC(void)
-_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length);
+_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
 
 /* Prepare the buffer to write 'length' characters
    with the specified maximum character.

Modules/cjkcodecs/multibytecodec.c

         return make_tuple(PyUnicode_New(0, 0), 0);
     }
 
-    _PyUnicodeWriter_Init(&buf.writer, datalen);
+    _PyUnicodeWriter_Init(&buf.writer);
+    buf.writer.min_length = datalen;
     buf.excobj = NULL;
     buf.inbuf = buf.inbuf_top = (unsigned char *)data;
     buf.inbuf_end = buf.inbuf_top + datalen;
 {
     buf->inbuf = buf->inbuf_top = (const unsigned char *)data;
     buf->inbuf_end = buf->inbuf_top + size;
-    _PyUnicodeWriter_Init(&buf->writer, size);
+    buf->writer.min_length += size;
     return 0;
 }
 
     data = pdata.buf;
     size = pdata.len;
 
-    _PyUnicodeWriter_Init(&buf.writer, 1);
+    _PyUnicodeWriter_Init(&buf.writer);
     buf.excobj = NULL;
     origpending = self->pendingsize;
 
     if (sizehint == 0)
         return PyUnicode_New(0, 0);
 
-    _PyUnicodeWriter_Init(&buf.writer, 1);
+    _PyUnicodeWriter_Init(&buf.writer);
     buf.excobj = NULL;
     cres = NULL;
 

Objects/complexobject.c

     if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
         return NULL;
 
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
     ret = _PyComplex_FormatAdvancedWriter(
         &writer,
         self,

Objects/floatobject.c

     if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
         return NULL;
 
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
     ret = _PyFloat_FormatAdvancedWriter(
         &writer,
         self,

Objects/longobject.c

     if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
         return NULL;
 
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
     ret = _PyLong_FormatAdvancedWriter(
         &writer,
         self,

Objects/stringlib/unicode_format.h

              int recursion_depth, AutoNumber *auto_number)
 {
     _PyUnicodeWriter writer;
-    Py_ssize_t minlen;
 
     /* check the recursion level */
     if (recursion_depth <= 0) {
         return NULL;
     }
 
-    minlen = PyUnicode_GET_LENGTH(input->str) + 100;
-    _PyUnicodeWriter_Init(&writer, minlen);
+    _PyUnicodeWriter_Init(&writer);
+    writer.overallocate = 1;
+    writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
 
     if (!do_markup(input, args, kwargs, &writer, recursion_depth,
                    auto_number)) {

Objects/unicodeobject.c

     const char *f;
     _PyUnicodeWriter writer;
 
-    _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
+    _PyUnicodeWriter_Init(&writer);
+    writer.min_length = strlen(format) + 100;
+    writer.overallocate = 1;
 
     /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
        Copy it to be able to pass a reference to a subfunction. */
         goto onError;
     }
 
-    writer->overallocate = 1;
+    if (PyUnicode_READY(repunicode) < 0)
+        goto onError;
+    if (PyUnicode_GET_LENGTH(repunicode) > 1)
+        writer->overallocate = 1;
     if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
         return
 
     }
 
     /* Start off assuming it's all ASCII. Widen later as necessary. */
-    _PyUnicodeWriter_Init(&writer, 0);
-    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
-        goto onError;
+    _PyUnicodeWriter_Init(&writer);
+    writer.min_length = size;
 
     shiftOutStart = 0;
     e = s + size;
         return get_latin1_char((unsigned char)s[0]);
     }
 
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
         goto onError;
 
     le = bo <= 0;
 #endif
 
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
     if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
         goto onError;
 
 
     /* Note: size will always be longer than the resulting Unicode
        character count */
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
     if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
         goto onError;
 
        and we determined it's exact size (common case)
        or it contains \x, \u, ... escape sequences.  then we create a
        legacy wchar string and resize it at the end of this function. */
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
     if (len > 0) {
-        if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
-            goto onError;
-        assert(writer.kind == PyUnicode_1BYTE_KIND);
+        writer.min_length = len;
     }
     else {
         /* Escaped strings will always be longer than the resulting
            length after conversion to the true value.
            (but if the error callback returns a long replacement string
            we'll have to allocate more space) */
-        if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
-            goto onError;
+        writer.min_length = size;
     }
 
     if (size == 0)
         if (s > end)
             c = '\0'; /* Invalid after \ */
 
-        /* The only case in which i == ascii_length is a backslash
-           followed by a newline. */
-        assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));
-
         switch (c) {
 
             /* \x escapes */
        Unicode string, so we start with size here and then reduce the
        length after conversion to the true value. (But decoding error
        handler might have to resize the string) */
-    _PyUnicodeWriter_Init(&writer, 1);
-    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
-        goto onError;
+    _PyUnicodeWriter_Init(&writer);
+    writer.min_length = size;
 
     end = s + size;
     while (s < end) {
     if (size == 0)
         _Py_RETURN_UNICODE_EMPTY();
 
-    /* XXX overflow detection missing */
-    _PyUnicodeWriter_Init(&writer, 0);
-    if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
+    _PyUnicodeWriter_Init(&writer);
+    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
+        PyErr_NoMemory();
         goto onError;
+    }
+    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
+
     end = s + size;
-
     while (s < end) {
         Py_UNICODE uch;
         Py_UCS4 ch;
     if (size == 1 && (unsigned char)s[0] < 128)
         return get_latin1_char((unsigned char)s[0]);
 
-    _PyUnicodeWriter_Init(&writer, 0);
-    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
-        goto onError;
+    _PyUnicodeWriter_Init(&writer);
+    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0)
+        return NULL;
 
     e = s + size;
     data = writer.data;
 
     if (size == 0)
         _Py_RETURN_UNICODE_EMPTY();
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
         goto onError;
 
                 ch = *s;
                 x = mapdata_ucs1[ch];
                 if (x > maxchar) {
-                    if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1)
+                    if (_PyUnicodeWriter_Prepare(&writer, 1, 0xff) == -1)
                         goto onError;
                     maxchar = writer.maxchar;
                     outdata = (Py_UCS1 *)writer.data;
 Py_LOCAL_INLINE(void)
 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
 {
-    writer->size = PyUnicode_GET_LENGTH(writer->buffer);
+    if (!writer->readonly)
+        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
+    else {
+        /* Copy-on-write mode: set buffer size to 0 so
+         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
+         * next write. */
+        writer->size = 0;
+    }
     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
     writer->data = PyUnicode_DATA(writer->buffer);
     writer->kind = PyUnicode_KIND(writer->buffer);
 }
 
 void
-_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
+_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
 {
     memset(writer, 0, sizeof(*writer));
 #ifdef Py_DEBUG
     writer->kind = 5;    /* invalid kind */
 #endif
-    writer->min_length = Py_MAX(min_length, 100);
-    writer->overallocate = (min_length > 0);
+    writer->min_char = 127;
 }
 
 int
     }
     newlen = writer->pos + length;
 
+    maxchar = MAX_MAXCHAR(maxchar, writer->min_char);
+
     if (writer->buffer == NULL) {
-        if (writer->overallocate) {
+        assert(!writer->readonly);
+        if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
             /* overallocate 25% to limit the number of resize */
-            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
-                newlen += newlen / 4;
-            if (newlen < writer->min_length)
-                newlen = writer->min_length;
-        }
+            newlen += newlen / 4;
+        }
+        if (newlen < writer->min_length)
+            newlen = writer->min_length;
+
         writer->buffer = PyUnicode_New(newlen, maxchar);
         if (writer->buffer == NULL)
             return -1;
-        _PyUnicodeWriter_Update(writer);
-        return 0;
-    }
-
-    if (newlen > writer->size) {
-        if (writer->overallocate) {
+    }
+    else if (newlen > writer->size) {
+        if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
             /* overallocate 25% to limit the number of resize */
-            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
-                newlen += newlen / 4;
-            if (newlen < writer->min_length)
-                newlen = writer->min_length;
-        }
+            newlen += newlen / 4;
+        }
+        if (newlen < writer->min_length)
+            newlen = writer->min_length;
 
         if (maxchar > writer->maxchar || writer->readonly) {
             /* resize + widen */
                 return -1;
         }
         writer->buffer = newbuffer;
-        _PyUnicodeWriter_Update(writer);
     }
     else if (maxchar > writer->maxchar) {
         assert(!writer->readonly);
                                       writer->buffer, 0, writer->pos);
         Py_DECREF(writer->buffer);
         writer->buffer = newbuffer;
-        _PyUnicodeWriter_Update(writer);
-    }
+    }
+    _PyUnicodeWriter_Update(writer);
     return 0;
 }
 
     maxchar = PyUnicode_MAX_CHAR_VALUE(str);
     if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
         if (writer->buffer == NULL && !writer->overallocate) {
+            writer->readonly = 1;
             Py_INCREF(str);
             writer->buffer = str;
             _PyUnicodeWriter_Update(writer);
-            writer->readonly = 1;
-            writer->size = 0;
             writer->pos += len;
             return 0;
         }
 
     if (PyUnicode_READY(self) == -1)
         return NULL;
-    _PyUnicodeWriter_Init(&writer, 0);
+    _PyUnicodeWriter_Init(&writer);
     ret = _PyUnicode_FormatAdvancedWriter(&writer,
                                           self, format_spec, 0,
                                           PyUnicode_GET_LENGTH(format_spec));
     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
     ctx.fmtpos = 0;
 
-    _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
+    _PyUnicodeWriter_Init(&ctx.writer);
+    ctx.writer.min_length = ctx.fmtcnt + 100;
+    ctx.writer.overallocate = 1;
 
     if (PyTuple_Check(args)) {
         ctx.arglen = PyTuple_Size(args);
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.