gh-139156: Use PyBytesWriter in UTF-32 encoder (#139157)

Replace PyBytes_FromStringAndSize() and _PyBytes_Resize() with the PyBytesWriter API.
2025-10-25 10:44:55 +00:00 · 2025-09-22 22:05:35 +02:00 · 2025-09-22 22:05:35 +02:00 · 92ba2c92c4
commit 92ba2c92c4
parent f0d8583303
1 changed files with 60 additions and 51 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -6089,45 +6089,61 @@ _PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
 {
-    int kind;
+    if (!PyUnicode_Check(str)) {
-    const void *data;
+        PyErr_BadArgument();
-    Py_ssize_t len;
+        return NULL;
-    PyObject *v;
+    }
-    uint32_t *out;
+    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
        return PyErr_NoMemory();
    Py_ssize_t nsize = len + (byteorder == 0);
 #if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
 #else
    int native_ordering = byteorder >= 0;
 #endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;
-    if (!PyUnicode_Check(str)) {
+    if (kind == PyUnicode_1BYTE_KIND) {
-        PyErr_BadArgument();
+        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 4);
        if (v == NULL) {
            return NULL;
        }
        /* output buffer is 4-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
        uint32_t *out = (uint32_t *)PyBytes_AS_STRING(v);
        if (byteorder == 0) {
            *out++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
                                 &out, native_ordering);
        }
        return v;
    }
    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
    if (writer == NULL) {
        return NULL;
    }
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
        return PyErr_NoMemory();
    nsize = len + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
    if (v == NULL)
        return NULL;
    /* output buffer is 4-bytes aligned */
-    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
+    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
-    out = (uint32_t *)PyBytes_AS_STRING(v);
+    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
-    if (byteorder == 0)
+    if (byteorder == 0) {
        *out++ = 0xFEFF;
-    if (len == 0)
+    }
-        goto done;
+    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }
    const char *encoding;
    if (byteorder == -1)
        encoding = "utf-32-le";
    else if (byteorder == 1)
@ -6135,15 +6151,11 @@ _PyUnicode_EncodeUTF32(PyObject *str,
    else
        encoding = "utf-32";
-    if (kind == PyUnicode_1BYTE_KIND) {
+    PyObject *errorHandler = NULL;
-        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
+    PyObject *exc = NULL;
-        goto done;
+    PyObject *rep = NULL;
    }
    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;
    for (Py_ssize_t pos = 0; pos < len; ) {
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
@ -6156,6 +6168,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
        if (pos == len)
            break;
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
@ -6163,6 +6176,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
        if (!rep)
            goto error;
        Py_ssize_t repsize, moreunits;
        if (PyBytes_Check(rep)) {
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 3) {
@ -6188,21 +6202,18 @@ _PyUnicode_EncodeUTF32(PyObject *str,
        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
-            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
+            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
-            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
+            if (out == NULL) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
                goto error;
            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
        }
        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
-        } else /* rep is unicode */ {
+        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
@ -6211,21 +6222,19 @@ _PyUnicode_EncodeUTF32(PyObject *str,
        Py_CLEAR(rep);
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
-    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
+    return PyBytesWriter_FinishWithPointer(writer, out);
-    if (nsize != PyBytes_GET_SIZE(v))
+
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
-    Py_XDECREF(v);
+    PyBytesWriter_Discard(writer);
    return NULL;
 }