gh-139156: Use PyBytesWriter in UTF-32 encoder (#139157)

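Replace PyBytes_FromStringAndSize() and _PyBytes_Resize() with the
PyBytesWriter API, except for 1-byte kind strings: that path still calls
PyBytes_FromStringAndSize() directly, because the PyBytesWriter API has an
overhead on short strings.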
This commit is contained in:
Victor Stinner 2025-09-22 22:05:35 +02:00 committed by GitHub
parent f0d8583303
commit 92ba2c92c4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -6089,45 +6089,61 @@ _PyUnicode_EncodeUTF32(PyObject *str,
const char *errors, const char *errors,
int byteorder) int byteorder)
{ {
int kind; if (!PyUnicode_Check(str)) {
const void *data; PyErr_BadArgument();
Py_ssize_t len; return NULL;
PyObject *v; }
uint32_t *out; int kind = PyUnicode_KIND(str);
const void *data = PyUnicode_DATA(str);
Py_ssize_t len = PyUnicode_GET_LENGTH(str);
if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
return PyErr_NoMemory();
Py_ssize_t nsize = len + (byteorder == 0);
#if PY_LITTLE_ENDIAN #if PY_LITTLE_ENDIAN
int native_ordering = byteorder <= 0; int native_ordering = byteorder <= 0;
#else #else
int native_ordering = byteorder >= 0; int native_ordering = byteorder >= 0;
#endif #endif
const char *encoding;
Py_ssize_t nsize, pos;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
PyObject *rep = NULL;
if (!PyUnicode_Check(str)) { if (kind == PyUnicode_1BYTE_KIND) {
PyErr_BadArgument(); // gh-139156: Don't use PyBytesWriter API here since it has an overhead
// on short strings
PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 4);
if (v == NULL) {
return NULL;
}
/* output buffer is 4-bytes aligned */
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
uint32_t *out = (uint32_t *)PyBytes_AS_STRING(v);
if (byteorder == 0) {
*out++ = 0xFEFF;
}
if (len > 0) {
ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
&out, native_ordering);
}
return v;
}
PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
if (writer == NULL) {
return NULL; return NULL;
} }
kind = PyUnicode_KIND(str);
data = PyUnicode_DATA(str);
len = PyUnicode_GET_LENGTH(str);
if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
return PyErr_NoMemory();
nsize = len + (byteorder == 0);
v = PyBytes_FromStringAndSize(NULL, nsize * 4);
if (v == NULL)
return NULL;
/* output buffer is 4-bytes aligned */ /* output buffer is 4-bytes aligned */
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4)); assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
out = (uint32_t *)PyBytes_AS_STRING(v); uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
if (byteorder == 0) if (byteorder == 0) {
*out++ = 0xFEFF; *out++ = 0xFEFF;
if (len == 0) }
goto done; if (len == 0) {
return PyBytesWriter_Finish(writer);
}
const char *encoding;
if (byteorder == -1) if (byteorder == -1)
encoding = "utf-32-le"; encoding = "utf-32-le";
else if (byteorder == 1) else if (byteorder == 1)
@ -6135,15 +6151,11 @@ _PyUnicode_EncodeUTF32(PyObject *str,
else else
encoding = "utf-32"; encoding = "utf-32";
if (kind == PyUnicode_1BYTE_KIND) { PyObject *errorHandler = NULL;
ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering); PyObject *exc = NULL;
goto done; PyObject *rep = NULL;
}
pos = 0;
while (pos < len) {
Py_ssize_t newpos, repsize, moreunits;
for (Py_ssize_t pos = 0; pos < len; ) {
if (kind == PyUnicode_2BYTE_KIND) { if (kind == PyUnicode_2BYTE_KIND) {
pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos, pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
&out, native_ordering); &out, native_ordering);
@ -6156,6 +6168,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
if (pos == len) if (pos == len)
break; break;
Py_ssize_t newpos;
rep = unicode_encode_call_errorhandler( rep = unicode_encode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
encoding, "surrogates not allowed", encoding, "surrogates not allowed",
@ -6163,6 +6176,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
if (!rep) if (!rep)
goto error; goto error;
Py_ssize_t repsize, moreunits;
if (PyBytes_Check(rep)) { if (PyBytes_Check(rep)) {
repsize = PyBytes_GET_SIZE(rep); repsize = PyBytes_GET_SIZE(rep);
if (repsize & 3) { if (repsize & 3) {
@ -6188,21 +6202,18 @@ _PyUnicode_EncodeUTF32(PyObject *str,
/* four bytes are reserved for each surrogate */ /* four bytes are reserved for each surrogate */
if (moreunits > 0) { if (moreunits > 0) {
Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v); out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) { if (out == NULL) {
/* integer overflow */
PyErr_NoMemory();
goto error; goto error;
} }
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
goto error;
out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
} }
if (PyBytes_Check(rep)) { if (PyBytes_Check(rep)) {
memcpy(out, PyBytes_AS_STRING(rep), repsize); memcpy(out, PyBytes_AS_STRING(rep), repsize);
out += repsize / 4; out += repsize / 4;
} else /* rep is unicode */ { }
else {
/* rep is unicode */
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize, ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
&out, native_ordering); &out, native_ordering);
@ -6211,21 +6222,19 @@ _PyUnicode_EncodeUTF32(PyObject *str,
Py_CLEAR(rep); Py_CLEAR(rep);
} }
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
/* Cut back to size actually needed. This is necessary for, for example, /* Cut back to size actually needed. This is necessary for, for example,
encoding of a string containing isolated surrogates and the 'ignore' encoding of a string containing isolated surrogates and the 'ignore'
handler is used. */ handler is used. */
nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); return PyBytesWriter_FinishWithPointer(writer, out);
if (nsize != PyBytes_GET_SIZE(v))
_PyBytes_Resize(&v, nsize);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
done:
return v;
error: error:
Py_XDECREF(rep); Py_XDECREF(rep);
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
Py_XDECREF(v); PyBytesWriter_Discard(writer);
return NULL; return NULL;
} }