mirror of
https://github.com/python/cpython.git
synced 2025-10-25 10:44:55 +00:00
gh-139156: Use PyBytesWriter in UTF-32 encoder (#139157)
Replace PyBytes_FromStringAndSize() and _PyBytes_Resize() with the PyBytesWriter API.
This commit is contained in:
parent
f0d8583303
commit
92ba2c92c4
1 changed file with 60 additions and 51 deletions
|
|
@ -6089,45 +6089,61 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
const char *errors,
|
const char *errors,
|
||||||
int byteorder)
|
int byteorder)
|
||||||
{
|
{
|
||||||
int kind;
|
if (!PyUnicode_Check(str)) {
|
||||||
const void *data;
|
PyErr_BadArgument();
|
||||||
Py_ssize_t len;
|
return NULL;
|
||||||
PyObject *v;
|
}
|
||||||
uint32_t *out;
|
int kind = PyUnicode_KIND(str);
|
||||||
|
const void *data = PyUnicode_DATA(str);
|
||||||
|
Py_ssize_t len = PyUnicode_GET_LENGTH(str);
|
||||||
|
|
||||||
|
if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
|
||||||
|
return PyErr_NoMemory();
|
||||||
|
Py_ssize_t nsize = len + (byteorder == 0);
|
||||||
|
|
||||||
#if PY_LITTLE_ENDIAN
|
#if PY_LITTLE_ENDIAN
|
||||||
int native_ordering = byteorder <= 0;
|
int native_ordering = byteorder <= 0;
|
||||||
#else
|
#else
|
||||||
int native_ordering = byteorder >= 0;
|
int native_ordering = byteorder >= 0;
|
||||||
#endif
|
#endif
|
||||||
const char *encoding;
|
|
||||||
Py_ssize_t nsize, pos;
|
|
||||||
PyObject *errorHandler = NULL;
|
|
||||||
PyObject *exc = NULL;
|
|
||||||
PyObject *rep = NULL;
|
|
||||||
|
|
||||||
if (!PyUnicode_Check(str)) {
|
if (kind == PyUnicode_1BYTE_KIND) {
|
||||||
PyErr_BadArgument();
|
// gh-139156: Don't use PyBytesWriter API here since it has an overhead
|
||||||
|
// on short strings
|
||||||
|
PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 4);
|
||||||
|
if (v == NULL) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* output buffer is 4-bytes aligned */
|
||||||
|
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
|
||||||
|
uint32_t *out = (uint32_t *)PyBytes_AS_STRING(v);
|
||||||
|
if (byteorder == 0) {
|
||||||
|
*out++ = 0xFEFF;
|
||||||
|
}
|
||||||
|
if (len > 0) {
|
||||||
|
ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
|
||||||
|
&out, native_ordering);
|
||||||
|
}
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
|
PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
|
||||||
|
if (writer == NULL) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
kind = PyUnicode_KIND(str);
|
|
||||||
data = PyUnicode_DATA(str);
|
|
||||||
len = PyUnicode_GET_LENGTH(str);
|
|
||||||
|
|
||||||
if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
|
|
||||||
return PyErr_NoMemory();
|
|
||||||
nsize = len + (byteorder == 0);
|
|
||||||
v = PyBytes_FromStringAndSize(NULL, nsize * 4);
|
|
||||||
if (v == NULL)
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
/* output buffer is 4-bytes aligned */
|
/* output buffer is 4-bytes aligned */
|
||||||
assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
|
assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
|
||||||
out = (uint32_t *)PyBytes_AS_STRING(v);
|
uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
|
||||||
if (byteorder == 0)
|
if (byteorder == 0) {
|
||||||
*out++ = 0xFEFF;
|
*out++ = 0xFEFF;
|
||||||
if (len == 0)
|
}
|
||||||
goto done;
|
if (len == 0) {
|
||||||
|
return PyBytesWriter_Finish(writer);
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *encoding;
|
||||||
if (byteorder == -1)
|
if (byteorder == -1)
|
||||||
encoding = "utf-32-le";
|
encoding = "utf-32-le";
|
||||||
else if (byteorder == 1)
|
else if (byteorder == 1)
|
||||||
|
|
@ -6135,15 +6151,11 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
else
|
else
|
||||||
encoding = "utf-32";
|
encoding = "utf-32";
|
||||||
|
|
||||||
if (kind == PyUnicode_1BYTE_KIND) {
|
PyObject *errorHandler = NULL;
|
||||||
ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
|
PyObject *exc = NULL;
|
||||||
goto done;
|
PyObject *rep = NULL;
|
||||||
}
|
|
||||||
|
|
||||||
pos = 0;
|
|
||||||
while (pos < len) {
|
|
||||||
Py_ssize_t newpos, repsize, moreunits;
|
|
||||||
|
|
||||||
|
for (Py_ssize_t pos = 0; pos < len; ) {
|
||||||
if (kind == PyUnicode_2BYTE_KIND) {
|
if (kind == PyUnicode_2BYTE_KIND) {
|
||||||
pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
|
pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
|
||||||
&out, native_ordering);
|
&out, native_ordering);
|
||||||
|
|
@ -6156,6 +6168,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
if (pos == len)
|
if (pos == len)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
Py_ssize_t newpos;
|
||||||
rep = unicode_encode_call_errorhandler(
|
rep = unicode_encode_call_errorhandler(
|
||||||
errors, &errorHandler,
|
errors, &errorHandler,
|
||||||
encoding, "surrogates not allowed",
|
encoding, "surrogates not allowed",
|
||||||
|
|
@ -6163,6 +6176,7 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
if (!rep)
|
if (!rep)
|
||||||
goto error;
|
goto error;
|
||||||
|
|
||||||
|
Py_ssize_t repsize, moreunits;
|
||||||
if (PyBytes_Check(rep)) {
|
if (PyBytes_Check(rep)) {
|
||||||
repsize = PyBytes_GET_SIZE(rep);
|
repsize = PyBytes_GET_SIZE(rep);
|
||||||
if (repsize & 3) {
|
if (repsize & 3) {
|
||||||
|
|
@ -6188,21 +6202,18 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
|
|
||||||
/* four bytes are reserved for each surrogate */
|
/* four bytes are reserved for each surrogate */
|
||||||
if (moreunits > 0) {
|
if (moreunits > 0) {
|
||||||
Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
|
out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
|
||||||
if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
|
if (out == NULL) {
|
||||||
/* integer overflow */
|
|
||||||
PyErr_NoMemory();
|
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
|
|
||||||
goto error;
|
|
||||||
out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (PyBytes_Check(rep)) {
|
if (PyBytes_Check(rep)) {
|
||||||
memcpy(out, PyBytes_AS_STRING(rep), repsize);
|
memcpy(out, PyBytes_AS_STRING(rep), repsize);
|
||||||
out += repsize / 4;
|
out += repsize / 4;
|
||||||
} else /* rep is unicode */ {
|
}
|
||||||
|
else {
|
||||||
|
/* rep is unicode */
|
||||||
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
|
assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
|
||||||
ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
|
ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
|
||||||
&out, native_ordering);
|
&out, native_ordering);
|
||||||
|
|
@ -6211,21 +6222,19 @@ _PyUnicode_EncodeUTF32(PyObject *str,
|
||||||
Py_CLEAR(rep);
|
Py_CLEAR(rep);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Py_XDECREF(errorHandler);
|
||||||
|
Py_XDECREF(exc);
|
||||||
|
|
||||||
/* Cut back to size actually needed. This is necessary for, for example,
|
/* Cut back to size actually needed. This is necessary for, for example,
|
||||||
encoding of a string containing isolated surrogates and the 'ignore'
|
encoding of a string containing isolated surrogates and the 'ignore'
|
||||||
handler is used. */
|
handler is used. */
|
||||||
nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
|
return PyBytesWriter_FinishWithPointer(writer, out);
|
||||||
if (nsize != PyBytes_GET_SIZE(v))
|
|
||||||
_PyBytes_Resize(&v, nsize);
|
|
||||||
Py_XDECREF(errorHandler);
|
|
||||||
Py_XDECREF(exc);
|
|
||||||
done:
|
|
||||||
return v;
|
|
||||||
error:
|
error:
|
||||||
Py_XDECREF(rep);
|
Py_XDECREF(rep);
|
||||||
Py_XDECREF(errorHandler);
|
Py_XDECREF(errorHandler);
|
||||||
Py_XDECREF(exc);
|
Py_XDECREF(exc);
|
||||||
Py_XDECREF(v);
|
PyBytesWriter_Discard(writer);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue