mirror of
				https://github.com/python/cpython.git
				synced 2025-10-26 03:04:41 +00:00 
			
		
		
		
	gh-139156: Use PyBytesWriter in UTF-32 encoder (#139157)
Replace PyBytes_FromStringAndSize() and _PyBytes_Resize() with the PyBytesWriter API.
This commit is contained in:
		
							parent
							
								
									f0d8583303
								
							
						
					
					
						commit
						92ba2c92c4
					
				
					 1 changed files with 60 additions and 51 deletions
				
			
		|  | @ -6089,45 +6089,61 @@ _PyUnicode_EncodeUTF32(PyObject *str, | ||||||
|                        const char *errors, |                        const char *errors, | ||||||
|                        int byteorder) |                        int byteorder) | ||||||
| { | { | ||||||
|     int kind; |     if (!PyUnicode_Check(str)) { | ||||||
|     const void *data; |         PyErr_BadArgument(); | ||||||
|     Py_ssize_t len; |         return NULL; | ||||||
|     PyObject *v; |     } | ||||||
|     uint32_t *out; |     int kind = PyUnicode_KIND(str); | ||||||
|  |     const void *data = PyUnicode_DATA(str); | ||||||
|  |     Py_ssize_t len = PyUnicode_GET_LENGTH(str); | ||||||
|  | 
 | ||||||
|  |     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) | ||||||
|  |         return PyErr_NoMemory(); | ||||||
|  |     Py_ssize_t nsize = len + (byteorder == 0); | ||||||
|  | 
 | ||||||
| #if PY_LITTLE_ENDIAN | #if PY_LITTLE_ENDIAN | ||||||
|     int native_ordering = byteorder <= 0; |     int native_ordering = byteorder <= 0; | ||||||
| #else | #else | ||||||
|     int native_ordering = byteorder >= 0; |     int native_ordering = byteorder >= 0; | ||||||
| #endif | #endif | ||||||
|     const char *encoding; |  | ||||||
|     Py_ssize_t nsize, pos; |  | ||||||
|     PyObject *errorHandler = NULL; |  | ||||||
|     PyObject *exc = NULL; |  | ||||||
|     PyObject *rep = NULL; |  | ||||||
| 
 | 
 | ||||||
|     if (!PyUnicode_Check(str)) { |     if (kind == PyUnicode_1BYTE_KIND) { | ||||||
|         PyErr_BadArgument(); |         // gh-139156: Don't use PyBytesWriter API here since it has an overhead
 | ||||||
|  |         // on short strings
 | ||||||
|  |         PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 4); | ||||||
|  |         if (v == NULL) { | ||||||
|             return NULL; |             return NULL; | ||||||
|         } |         } | ||||||
|     kind = PyUnicode_KIND(str); |  | ||||||
|     data = PyUnicode_DATA(str); |  | ||||||
|     len = PyUnicode_GET_LENGTH(str); |  | ||||||
| 
 |  | ||||||
|     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) |  | ||||||
|         return PyErr_NoMemory(); |  | ||||||
|     nsize = len + (byteorder == 0); |  | ||||||
|     v = PyBytes_FromStringAndSize(NULL, nsize * 4); |  | ||||||
|     if (v == NULL) |  | ||||||
|         return NULL; |  | ||||||
| 
 | 
 | ||||||
|         /* output buffer is 4-bytes aligned */ |         /* output buffer is 4-bytes aligned */ | ||||||
|         assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4)); |         assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4)); | ||||||
|     out = (uint32_t *)PyBytes_AS_STRING(v); |         uint32_t *out = (uint32_t *)PyBytes_AS_STRING(v); | ||||||
|     if (byteorder == 0) |         if (byteorder == 0) { | ||||||
|             *out++ = 0xFEFF; |             *out++ = 0xFEFF; | ||||||
|     if (len == 0) |         } | ||||||
|         goto done; |         if (len > 0) { | ||||||
|  |             ucs1lib_utf32_encode((const Py_UCS1 *)data, len, | ||||||
|  |                                  &out, native_ordering); | ||||||
|  |         } | ||||||
|  |         return v; | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|  |     PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4); | ||||||
|  |     if (writer == NULL) { | ||||||
|  |         return NULL; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     /* output buffer is 4-bytes aligned */ | ||||||
|  |     assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4)); | ||||||
|  |     uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer); | ||||||
|  |     if (byteorder == 0) { | ||||||
|  |         *out++ = 0xFEFF; | ||||||
|  |     } | ||||||
|  |     if (len == 0) { | ||||||
|  |         return PyBytesWriter_Finish(writer); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     const char *encoding; | ||||||
|     if (byteorder == -1) |     if (byteorder == -1) | ||||||
|         encoding = "utf-32-le"; |         encoding = "utf-32-le"; | ||||||
|     else if (byteorder == 1) |     else if (byteorder == 1) | ||||||
|  | @ -6135,15 +6151,11 @@ _PyUnicode_EncodeUTF32(PyObject *str, | ||||||
|     else |     else | ||||||
|         encoding = "utf-32"; |         encoding = "utf-32"; | ||||||
| 
 | 
 | ||||||
|     if (kind == PyUnicode_1BYTE_KIND) { |     PyObject *errorHandler = NULL; | ||||||
|         ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering); |     PyObject *exc = NULL; | ||||||
|         goto done; |     PyObject *rep = NULL; | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     pos = 0; |  | ||||||
|     while (pos < len) { |  | ||||||
|         Py_ssize_t newpos, repsize, moreunits; |  | ||||||
| 
 | 
 | ||||||
|  |     for (Py_ssize_t pos = 0; pos < len; ) { | ||||||
|         if (kind == PyUnicode_2BYTE_KIND) { |         if (kind == PyUnicode_2BYTE_KIND) { | ||||||
|             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos, |             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos, | ||||||
|                                         &out, native_ordering); |                                         &out, native_ordering); | ||||||
|  | @ -6156,6 +6168,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, | ||||||
|         if (pos == len) |         if (pos == len) | ||||||
|             break; |             break; | ||||||
| 
 | 
 | ||||||
|  |         Py_ssize_t newpos; | ||||||
|         rep = unicode_encode_call_errorhandler( |         rep = unicode_encode_call_errorhandler( | ||||||
|                 errors, &errorHandler, |                 errors, &errorHandler, | ||||||
|                 encoding, "surrogates not allowed", |                 encoding, "surrogates not allowed", | ||||||
|  | @ -6163,6 +6176,7 @@ _PyUnicode_EncodeUTF32(PyObject *str, | ||||||
|         if (!rep) |         if (!rep) | ||||||
|             goto error; |             goto error; | ||||||
| 
 | 
 | ||||||
|  |         Py_ssize_t repsize, moreunits; | ||||||
|         if (PyBytes_Check(rep)) { |         if (PyBytes_Check(rep)) { | ||||||
|             repsize = PyBytes_GET_SIZE(rep); |             repsize = PyBytes_GET_SIZE(rep); | ||||||
|             if (repsize & 3) { |             if (repsize & 3) { | ||||||
|  | @ -6188,21 +6202,18 @@ _PyUnicode_EncodeUTF32(PyObject *str, | ||||||
| 
 | 
 | ||||||
|         /* four bytes are reserved for each surrogate */ |         /* four bytes are reserved for each surrogate */ | ||||||
|         if (moreunits > 0) { |         if (moreunits > 0) { | ||||||
|             Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v); |             out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out); | ||||||
|             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) { |             if (out == NULL) { | ||||||
|                 /* integer overflow */ |  | ||||||
|                 PyErr_NoMemory(); |  | ||||||
|                 goto error; |                 goto error; | ||||||
|             } |             } | ||||||
|             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0) |  | ||||||
|                 goto error; |  | ||||||
|             out = (uint32_t*) PyBytes_AS_STRING(v) + outpos; |  | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         if (PyBytes_Check(rep)) { |         if (PyBytes_Check(rep)) { | ||||||
|             memcpy(out, PyBytes_AS_STRING(rep), repsize); |             memcpy(out, PyBytes_AS_STRING(rep), repsize); | ||||||
|             out += repsize / 4; |             out += repsize / 4; | ||||||
|         } else /* rep is unicode */ { |         } | ||||||
|  |         else { | ||||||
|  |             /* rep is unicode */ | ||||||
|             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); |             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); | ||||||
|             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize, |             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize, | ||||||
|                                  &out, native_ordering); |                                  &out, native_ordering); | ||||||
|  | @ -6211,21 +6222,19 @@ _PyUnicode_EncodeUTF32(PyObject *str, | ||||||
|         Py_CLEAR(rep); |         Py_CLEAR(rep); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     Py_XDECREF(errorHandler); | ||||||
|  |     Py_XDECREF(exc); | ||||||
|  | 
 | ||||||
|     /* Cut back to size actually needed. This is necessary for, for example,
 |     /* Cut back to size actually needed. This is necessary for, for example,
 | ||||||
|        encoding of a string containing isolated surrogates and the 'ignore' |        encoding of a string containing isolated surrogates and the 'ignore' | ||||||
|        handler is used. */ |        handler is used. */ | ||||||
|     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); |     return PyBytesWriter_FinishWithPointer(writer, out); | ||||||
|     if (nsize != PyBytes_GET_SIZE(v)) | 
 | ||||||
|       _PyBytes_Resize(&v, nsize); |  | ||||||
|     Py_XDECREF(errorHandler); |  | ||||||
|     Py_XDECREF(exc); |  | ||||||
|   done: |  | ||||||
|     return v; |  | ||||||
|   error: |   error: | ||||||
|     Py_XDECREF(rep); |     Py_XDECREF(rep); | ||||||
|     Py_XDECREF(errorHandler); |     Py_XDECREF(errorHandler); | ||||||
|     Py_XDECREF(exc); |     Py_XDECREF(exc); | ||||||
|     Py_XDECREF(v); |     PyBytesWriter_Discard(writer); | ||||||
|     return NULL; |     return NULL; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Victor Stinner
						Victor Stinner