mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 21:51:50 +00:00 
			
		
		
		
	gh-139156: Use PyBytesWriter in UTF-16 encoder (#139233)
Replace PyBytes_FromStringAndSize() and _PyBytes_Resize() with the PyBytesWriter API.
This commit is contained in:
		
							parent
							
								
									e578a9e6a5
								
							
						
					
					
						commit
						c497694f77
					
				
					 1 changed file with 52 additions and 52 deletions
				
			
		|  | @ -6407,32 +6407,15 @@ _PyUnicode_EncodeUTF16(PyObject *str, | |||
|                        const char *errors, | ||||
|                        int byteorder) | ||||
| { | ||||
|     int kind; | ||||
|     const void *data; | ||||
|     Py_ssize_t len; | ||||
|     PyObject *v; | ||||
|     unsigned short *out; | ||||
|     Py_ssize_t pairs; | ||||
| #if PY_BIG_ENDIAN | ||||
|     int native_ordering = byteorder >= 0; | ||||
| #else | ||||
|     int native_ordering = byteorder <= 0; | ||||
| #endif | ||||
|     const char *encoding; | ||||
|     Py_ssize_t nsize, pos; | ||||
|     PyObject *errorHandler = NULL; | ||||
|     PyObject *exc = NULL; | ||||
|     PyObject *rep = NULL; | ||||
| 
 | ||||
|     if (!PyUnicode_Check(str)) { | ||||
|         PyErr_BadArgument(); | ||||
|         return NULL; | ||||
|     } | ||||
|     kind = PyUnicode_KIND(str); | ||||
|     data = PyUnicode_DATA(str); | ||||
|     len = PyUnicode_GET_LENGTH(str); | ||||
|     int kind = PyUnicode_KIND(str); | ||||
|     const void *data = PyUnicode_DATA(str); | ||||
|     Py_ssize_t len = PyUnicode_GET_LENGTH(str); | ||||
| 
 | ||||
|     pairs = 0; | ||||
|     Py_ssize_t pairs = 0; | ||||
|     if (kind == PyUnicode_4BYTE_KIND) { | ||||
|         const Py_UCS4 *in = (const Py_UCS4 *)data; | ||||
|         const Py_UCS4 *end = in + len; | ||||
|  | @ -6445,27 +6428,48 @@ _PyUnicode_EncodeUTF16(PyObject *str, | |||
|     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) { | ||||
|         return PyErr_NoMemory(); | ||||
|     } | ||||
|     nsize = len + pairs + (byteorder == 0); | ||||
|     v = PyBytes_FromStringAndSize(NULL, nsize * 2); | ||||
|     if (v == NULL) { | ||||
|     Py_ssize_t nsize = len + pairs + (byteorder == 0); | ||||
| 
 | ||||
| #if PY_BIG_ENDIAN | ||||
|     int native_ordering = byteorder >= 0; | ||||
| #else | ||||
|     int native_ordering = byteorder <= 0; | ||||
| #endif | ||||
| 
 | ||||
|     if (kind == PyUnicode_1BYTE_KIND) { | ||||
|         PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 2); | ||||
|         if (v == NULL) { | ||||
|             return NULL; | ||||
|         } | ||||
| 
 | ||||
|         /* output buffer is 2-bytes aligned */ | ||||
|         assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); | ||||
|         unsigned short *out = (unsigned short *)PyBytes_AS_STRING(v); | ||||
|         if (byteorder == 0) { | ||||
|             *out++ = 0xFEFF; | ||||
|         } | ||||
|         if (len > 0) { | ||||
|             ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering); | ||||
|         } | ||||
|         return v; | ||||
|     } | ||||
| 
 | ||||
|     PyBytesWriter *writer = PyBytesWriter_Create(nsize * 2); | ||||
|     if (writer == NULL) { | ||||
|         return NULL; | ||||
|     } | ||||
| 
 | ||||
|     /* output buffer is 2-bytes aligned */ | ||||
|     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); | ||||
|     out = (unsigned short *)PyBytes_AS_STRING(v); | ||||
|     assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2)); | ||||
|     unsigned short *out = PyBytesWriter_GetData(writer); | ||||
|     if (byteorder == 0) { | ||||
|         *out++ = 0xFEFF; | ||||
|     } | ||||
|     if (len == 0) { | ||||
|         goto done; | ||||
|     } | ||||
| 
 | ||||
|     if (kind == PyUnicode_1BYTE_KIND) { | ||||
|         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering); | ||||
|         goto done; | ||||
|         return PyBytesWriter_Finish(writer); | ||||
|     } | ||||
| 
 | ||||
|     const char *encoding; | ||||
|     if (byteorder < 0) { | ||||
|         encoding = "utf-16-le"; | ||||
|     } | ||||
|  | @ -6476,10 +6480,11 @@ _PyUnicode_EncodeUTF16(PyObject *str, | |||
|         encoding = "utf-16"; | ||||
|     } | ||||
| 
 | ||||
|     pos = 0; | ||||
|     while (pos < len) { | ||||
|         Py_ssize_t newpos, repsize, moreunits; | ||||
|     PyObject *errorHandler = NULL; | ||||
|     PyObject *exc = NULL; | ||||
|     PyObject *rep = NULL; | ||||
| 
 | ||||
|     for (Py_ssize_t pos = 0; pos < len; ) { | ||||
|         if (kind == PyUnicode_2BYTE_KIND) { | ||||
|             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos, | ||||
|                                         &out, native_ordering); | ||||
|  | @ -6492,6 +6497,7 @@ _PyUnicode_EncodeUTF16(PyObject *str, | |||
|         if (pos == len) | ||||
|             break; | ||||
| 
 | ||||
|         Py_ssize_t newpos; | ||||
|         rep = unicode_encode_call_errorhandler( | ||||
|                 errors, &errorHandler, | ||||
|                 encoding, "surrogates not allowed", | ||||
|  | @ -6499,6 +6505,7 @@ _PyUnicode_EncodeUTF16(PyObject *str, | |||
|         if (!rep) | ||||
|             goto error; | ||||
| 
 | ||||
|         Py_ssize_t repsize, moreunits; | ||||
|         if (PyBytes_Check(rep)) { | ||||
|             repsize = PyBytes_GET_SIZE(rep); | ||||
|             if (repsize & 1) { | ||||
|  | @ -6524,21 +6531,17 @@ _PyUnicode_EncodeUTF16(PyObject *str, | |||
| 
 | ||||
|         /* two bytes are reserved for each surrogate */ | ||||
|         if (moreunits > 0) { | ||||
|             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v); | ||||
|             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) { | ||||
|                 /* integer overflow */ | ||||
|                 PyErr_NoMemory(); | ||||
|             out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits, out); | ||||
|             if (out == NULL) { | ||||
|                 goto error; | ||||
|             } | ||||
|             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0) | ||||
|                 goto error; | ||||
|             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos; | ||||
|         } | ||||
| 
 | ||||
|         if (PyBytes_Check(rep)) { | ||||
|             memcpy(out, PyBytes_AS_STRING(rep), repsize); | ||||
|             out += repsize / 2; | ||||
|         } else /* rep is unicode */ { | ||||
|         } else { | ||||
|             /* rep is unicode */ | ||||
|             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); | ||||
|             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize, | ||||
|                                  &out, native_ordering); | ||||
|  | @ -6547,23 +6550,20 @@ _PyUnicode_EncodeUTF16(PyObject *str, | |||
|         Py_CLEAR(rep); | ||||
|     } | ||||
| 
 | ||||
|     Py_XDECREF(errorHandler); | ||||
|     Py_XDECREF(exc); | ||||
| 
 | ||||
|     /* Cut back to the size actually needed. This is necessary, for example,
 | ||||
|     when encoding a string containing isolated surrogates with the 'ignore' | ||||
|     error handler. */ | ||||
|     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); | ||||
|     if (nsize != PyBytes_GET_SIZE(v)) | ||||
|       _PyBytes_Resize(&v, nsize); | ||||
|     Py_XDECREF(errorHandler); | ||||
|     Py_XDECREF(exc); | ||||
|   done: | ||||
|     return v; | ||||
|     return PyBytesWriter_FinishWithPointer(writer, out); | ||||
| 
 | ||||
|   error: | ||||
|     Py_XDECREF(rep); | ||||
|     Py_XDECREF(errorHandler); | ||||
|     Py_XDECREF(exc); | ||||
|     Py_XDECREF(v); | ||||
|     PyBytesWriter_Discard(writer); | ||||
|     return NULL; | ||||
| #undef STORECHAR | ||||
| } | ||||
| 
 | ||||
| PyObject * | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Victor Stinner
						Victor Stinner