mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 13:41:24 +00:00 
			
		
		
		
	bpo-37348: optimize decoding ASCII string (GH-14283)
`_PyUnicode_Writer` is a relatively complex structure. Initializing it is significant overhead when decoding short ASCII string.
This commit is contained in:
		
							parent
							
								
									b3ca7972c8
								
							
						
					
					
						commit
						770847a7db
					
				
					 2 changed files with 53 additions and 34 deletions
				
			
		|  | @ -265,6 +265,8 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value, | |||
| /* Forward declaration */ | ||||
| static inline int | ||||
| _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); | ||||
| static inline void | ||||
| _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer); | ||||
| static PyObject * | ||||
| unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler, | ||||
|                     const char *errors); | ||||
|  | @ -4877,16 +4879,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, | |||
|                     _Py_error_handler error_handler, const char *errors, | ||||
|                     Py_ssize_t *consumed) | ||||
| { | ||||
|     _PyUnicodeWriter writer; | ||||
|     const char *starts = s; | ||||
|     const char *end = s + size; | ||||
| 
 | ||||
|     Py_ssize_t startinpos; | ||||
|     Py_ssize_t endinpos; | ||||
|     const char *errmsg = ""; | ||||
|     PyObject *error_handler_obj = NULL; | ||||
|     PyObject *exc = NULL; | ||||
| 
 | ||||
|     if (size == 0) { | ||||
|         if (consumed) | ||||
|             *consumed = 0; | ||||
|  | @ -4900,13 +4892,29 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, | |||
|         return get_latin1_char((unsigned char)s[0]); | ||||
|     } | ||||
| 
 | ||||
|     _PyUnicodeWriter_Init(&writer); | ||||
|     writer.min_length = size; | ||||
|     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) | ||||
|         goto onError; | ||||
|     const char *starts = s; | ||||
|     const char *end = s + size; | ||||
| 
 | ||||
|     // fast path: try ASCII string.
 | ||||
|     PyObject *u = PyUnicode_New(size, 127); | ||||
|     if (u == NULL) { | ||||
|         return NULL; | ||||
|     } | ||||
|     s += ascii_decode(s, end, PyUnicode_DATA(u)); | ||||
|     if (s == end) { | ||||
|         return u; | ||||
|     } | ||||
| 
 | ||||
|     // Use _PyUnicodeWriter after fast path is failed.
 | ||||
|     _PyUnicodeWriter writer; | ||||
|     _PyUnicodeWriter_InitWithBuffer(&writer, u); | ||||
|     writer.pos = s - starts; | ||||
| 
 | ||||
|     Py_ssize_t startinpos, endinpos; | ||||
|     const char *errmsg = ""; | ||||
|     PyObject *error_handler_obj = NULL; | ||||
|     PyObject *exc = NULL; | ||||
| 
 | ||||
|     writer.pos = ascii_decode(s, end, writer.data); | ||||
|     s += writer.pos; | ||||
|     while (s < end) { | ||||
|         Py_UCS4 ch; | ||||
|         int kind = writer.kind; | ||||
|  | @ -6451,7 +6459,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, | |||
|        length after conversion to the true value. (But decoding error | ||||
|        handler might have to resize the string) */ | ||||
|     _PyUnicodeWriter_Init(&writer); | ||||
|      writer.min_length = size; | ||||
|     writer.min_length = size; | ||||
|     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) { | ||||
|         goto onError; | ||||
|     } | ||||
|  | @ -6975,13 +6983,7 @@ PyUnicode_DecodeASCII(const char *s, | |||
|                       const char *errors) | ||||
| { | ||||
|     const char *starts = s; | ||||
|     _PyUnicodeWriter writer; | ||||
|     int kind; | ||||
|     void *data; | ||||
|     Py_ssize_t startinpos; | ||||
|     Py_ssize_t endinpos; | ||||
|     Py_ssize_t outpos; | ||||
|     const char *e; | ||||
|     const char *e = s + size; | ||||
|     PyObject *error_handler_obj = NULL; | ||||
|     PyObject *exc = NULL; | ||||
|     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; | ||||
|  | @ -6993,20 +6995,25 @@ PyUnicode_DecodeASCII(const char *s, | |||
|     if (size == 1 && (unsigned char)s[0] < 128) | ||||
|         return get_latin1_char((unsigned char)s[0]); | ||||
| 
 | ||||
|     _PyUnicodeWriter_Init(&writer); | ||||
|     writer.min_length = size; | ||||
|     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) | ||||
|     // Shortcut for simple case
 | ||||
|     PyObject *u = PyUnicode_New(size, 127); | ||||
|     if (u == NULL) { | ||||
|         return NULL; | ||||
|     } | ||||
|     Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u)); | ||||
|     if (outpos == size) { | ||||
|         return u; | ||||
|     } | ||||
| 
 | ||||
|     e = s + size; | ||||
|     data = writer.data; | ||||
|     outpos = ascii_decode(s, e, (Py_UCS1 *)data); | ||||
|     _PyUnicodeWriter writer; | ||||
|     _PyUnicodeWriter_InitWithBuffer(&writer, u); | ||||
|     writer.pos = outpos; | ||||
|     if (writer.pos == size) | ||||
|         return _PyUnicodeWriter_Finish(&writer); | ||||
| 
 | ||||
|     s += writer.pos; | ||||
|     kind = writer.kind; | ||||
|     s += outpos; | ||||
|     int kind = writer.kind; | ||||
|     void *data = writer.data; | ||||
|     Py_ssize_t startinpos, endinpos; | ||||
| 
 | ||||
|     while (s < e) { | ||||
|         unsigned char c = (unsigned char)*s; | ||||
|         if (c < 128) { | ||||
|  | @ -13506,6 +13513,16 @@ _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) | |||
|     assert(writer->kind <= PyUnicode_1BYTE_KIND); | ||||
| } | ||||
| 
 | ||||
| // Initialize _PyUnicodeWriter with initial buffer
 | ||||
| static inline void | ||||
| _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer) | ||||
| { | ||||
|     memset(writer, 0, sizeof(*writer)); | ||||
|     writer->buffer = buffer; | ||||
|     _PyUnicodeWriter_Update(writer); | ||||
|     writer->min_length = writer->size; | ||||
| } | ||||
| 
 | ||||
| int | ||||
| _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, | ||||
|                                  Py_ssize_t length, Py_UCS4 maxchar) | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Inada Naoki
						Inada Naoki