mirror of
				https://github.com/python/cpython.git
				synced 2025-10-30 21:21:22 +00:00 
			
		
		
		
	Optimize bytes.fromhex() and bytearray.fromhex()
Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now between 2x and 3.5x faster.

Changes:

* Use a fast path working on a char* string for ASCII strings
* Use a slow path for non-ASCII strings
* Replace the slow hex_digit_to_int() function with an O(1) lookup in the precomputed _PyLong_DigitValue table
* Use the _PyBytesWriter API to handle the buffer
* Add unit tests to check the error position in error messages
This commit is contained in:
		
							parent
							
								
									ebcf9edc05
								
							
						
					
					
						commit
						2bf8993db9
					
				
					 7 changed files with 101 additions and 95 deletions
				
			
		|  | @ -161,6 +161,9 @@ Optimizations | |||
| * ``bytearray % args`` is now between 2.5 and 5 times faster. (Contributed by | ||||
|   Victor Stinner in :issue:`25399`). | ||||
| 
 | ||||
| * Optimize :meth:`bytes.fromhex` and :meth:`bytearray.fromhex`: they are now | ||||
|   between 2x and 3.5x faster. (Contributed by Victor Stinner in :issue:`25401`). | ||||
| 
 | ||||
| 
 | ||||
| Build and C API Changes | ||||
| ======================= | ||||
|  |  | |||
|  | @ -67,6 +67,9 @@ PyAPI_FUNC(PyObject*) _PyBytes_FormatEx( | |||
|     Py_ssize_t format_len, | ||||
|     PyObject *args, | ||||
|     int use_bytearray); | ||||
| PyAPI_FUNC(PyObject*) _PyBytes_FromHex( | ||||
|     PyObject *string, | ||||
|     int use_bytearray); | ||||
| #endif | ||||
| PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t, | ||||
| 						   const char *, Py_ssize_t, | ||||
|  |  | |||
|  | @ -65,7 +65,7 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void); | |||
| #  error "void* different in size from int, long and long long" | ||||
| #endif /* SIZEOF_VOID_P */ | ||||
| 
 | ||||
| /* Used by Python/mystrtoul.c. */ | ||||
| /* Used by Python/mystrtoul.c and _PyBytes_FromHex(). */ | ||||
| #ifndef Py_LIMITED_API | ||||
| PyAPI_DATA(unsigned char) _PyLong_DigitValue[256]; | ||||
| #endif | ||||
|  |  | |||
|  | @ -301,6 +301,20 @@ def test_fromhex(self): | |||
|         self.assertRaises(ValueError, self.type2test.fromhex, '\x00') | ||||
|         self.assertRaises(ValueError, self.type2test.fromhex, '12   \x00   34') | ||||
| 
 | ||||
|         for data, pos in ( | ||||
|             # invalid first hexadecimal character | ||||
|             ('12 x4 56', 3), | ||||
|             # invalid second hexadecimal character | ||||
|             ('12 3x 56', 4), | ||||
|             # two invalid hexadecimal characters | ||||
|             ('12 xy 56', 3), | ||||
|             # test non-ASCII string | ||||
|             ('12 3\xff 56', 4), | ||||
|         ): | ||||
|             with self.assertRaises(ValueError) as cm: | ||||
|                 self.type2test.fromhex(data) | ||||
|             self.assertIn('at position %s' % pos, str(cm.exception)) | ||||
| 
 | ||||
|     def test_hex(self): | ||||
|         self.assertRaises(TypeError, self.type2test.hex) | ||||
|         self.assertRaises(TypeError, self.type2test.hex, 1) | ||||
|  |  | |||
|  | @ -10,6 +10,9 @@ Release date: XXXX-XX-XX | |||
| Core and Builtins | ||||
| ----------------- | ||||
| 
 | ||||
| - Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now | ||||
|   between 2x and 3.5x faster. | ||||
| 
 | ||||
| - Issue #25399: Optimize bytearray % args using the new private _PyBytesWriter | ||||
|   API. Formatting is now between 2.5 and 5 times faster. | ||||
| 
 | ||||
|  |  | |||
|  | @ -2823,48 +2823,7 @@ static PyObject * | |||
| bytearray_fromhex_impl(PyObject*cls, PyObject *string) | ||||
| /*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/ | ||||
| { | ||||
|     PyObject *newbytes; | ||||
|     char *buf; | ||||
|     Py_ssize_t hexlen, byteslen, i, j; | ||||
|     int top, bot; | ||||
|     void *data; | ||||
|     unsigned int kind; | ||||
| 
 | ||||
|     assert(PyUnicode_Check(string)); | ||||
|     if (PyUnicode_READY(string)) | ||||
|         return NULL; | ||||
|     kind = PyUnicode_KIND(string); | ||||
|     data = PyUnicode_DATA(string); | ||||
|     hexlen = PyUnicode_GET_LENGTH(string); | ||||
| 
 | ||||
|     byteslen = hexlen/2; /* This overestimates if there are spaces */ | ||||
|     newbytes = PyByteArray_FromStringAndSize(NULL, byteslen); | ||||
|     if (!newbytes) | ||||
|         return NULL; | ||||
|     buf = PyByteArray_AS_STRING(newbytes); | ||||
|     for (i = j = 0; i < hexlen; i += 2) { | ||||
|         /* skip over spaces in the input */ | ||||
|         while (PyUnicode_READ(kind, data, i) == ' ') | ||||
|             i++; | ||||
|         if (i >= hexlen) | ||||
|             break; | ||||
|         top = hex_digit_to_int(PyUnicode_READ(kind, data, i)); | ||||
|         bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1)); | ||||
|         if (top == -1 || bot == -1) { | ||||
|             PyErr_Format(PyExc_ValueError, | ||||
|                          "non-hexadecimal number found in " | ||||
|                          "fromhex() arg at position %zd", i); | ||||
|             goto error; | ||||
|         } | ||||
|         buf[j++] = (top << 4) + bot; | ||||
|     } | ||||
|     if (PyByteArray_Resize(newbytes, j) < 0) | ||||
|         goto error; | ||||
|     return newbytes; | ||||
| 
 | ||||
|   error: | ||||
|     Py_DECREF(newbytes); | ||||
|     return NULL; | ||||
|     return _PyBytes_FromHex(string, 1); | ||||
| } | ||||
| 
 | ||||
| PyDoc_STRVAR(hex__doc__, | ||||
|  |  | |||
|  | @ -30,6 +30,10 @@ static PyBytesObject *nullstring; | |||
| */ | ||||
| #define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1) | ||||
| 
 | ||||
| /* Forward declaration */ | ||||
| Py_LOCAL_INLINE(Py_ssize_t) _PyBytesWriter_GetSize(_PyBytesWriter *writer, | ||||
|                                                    char *str); | ||||
| 
 | ||||
| /*
 | ||||
|    For PyBytes_FromString(), the parameter `str' points to a null-terminated | ||||
|    string containing exactly `size' bytes. | ||||
|  | @ -3078,22 +3082,6 @@ bytes_splitlines_impl(PyBytesObject*self, int keepends) | |||
|         ); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| hex_digit_to_int(Py_UCS4 c) | ||||
| { | ||||
|     if (c >= 128) | ||||
|         return -1; | ||||
|     if (Py_ISDIGIT(c)) | ||||
|         return c - '0'; | ||||
|     else { | ||||
|         if (Py_ISUPPER(c)) | ||||
|             c = Py_TOLOWER(c); | ||||
|         if (c >= 'a' && c <= 'f') | ||||
|             return c - 'a' + 10; | ||||
|     } | ||||
|     return -1; | ||||
| } | ||||
| 
 | ||||
| /*[clinic input]
 | ||||
| @classmethod | ||||
| bytes.fromhex | ||||
|  | @ -3111,47 +3099,83 @@ static PyObject * | |||
| bytes_fromhex_impl(PyTypeObject *type, PyObject *string) | ||||
| /*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/ | ||||
| { | ||||
|     PyObject *newstring; | ||||
|     return _PyBytes_FromHex(string, 0); | ||||
| } | ||||
| 
 | ||||
| PyObject* | ||||
| _PyBytes_FromHex(PyObject *string, int use_bytearray) | ||||
| { | ||||
|     char *buf; | ||||
|     Py_ssize_t hexlen, byteslen, i, j; | ||||
|     int top, bot; | ||||
|     void *data; | ||||
|     unsigned int kind; | ||||
|     Py_ssize_t hexlen, invalid_char; | ||||
|     unsigned int top, bot; | ||||
|     Py_UCS1 *str, *end; | ||||
|     _PyBytesWriter writer; | ||||
| 
 | ||||
|     _PyBytesWriter_Init(&writer); | ||||
|     writer.use_bytearray = use_bytearray; | ||||
| 
 | ||||
|     assert(PyUnicode_Check(string)); | ||||
|     if (PyUnicode_READY(string)) | ||||
|         return NULL; | ||||
|     kind = PyUnicode_KIND(string); | ||||
|     data = PyUnicode_DATA(string); | ||||
|     hexlen = PyUnicode_GET_LENGTH(string); | ||||
| 
 | ||||
|     byteslen = hexlen/2; /* This overestimates if there are spaces */ | ||||
|     newstring = PyBytes_FromStringAndSize(NULL, byteslen); | ||||
|     if (!newstring) | ||||
|         return NULL; | ||||
|     buf = PyBytes_AS_STRING(newstring); | ||||
|     for (i = j = 0; i < hexlen; i += 2) { | ||||
|         /* skip over spaces in the input */ | ||||
|         while (PyUnicode_READ(kind, data, i) == ' ') | ||||
|             i++; | ||||
|         if (i >= hexlen) | ||||
|     if (!PyUnicode_IS_ASCII(string)) { | ||||
|         void *data = PyUnicode_DATA(string); | ||||
|         unsigned int kind = PyUnicode_KIND(string); | ||||
|         Py_ssize_t i; | ||||
| 
 | ||||
|         /* search for the first non-ASCII character */ | ||||
|         for (i = 0; i < hexlen; i++) { | ||||
|             if (PyUnicode_READ(kind, data, i) >= 128) | ||||
|                 break; | ||||
|         top = hex_digit_to_int(PyUnicode_READ(kind, data, i)); | ||||
|         bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1)); | ||||
|         if (top == -1 || bot == -1) { | ||||
|             PyErr_Format(PyExc_ValueError, | ||||
|                          "non-hexadecimal number found in " | ||||
|                          "fromhex() arg at position %zd", i); | ||||
|         } | ||||
|         invalid_char = i; | ||||
|         goto error; | ||||
|     } | ||||
|         buf[j++] = (top << 4) + bot; | ||||
| 
 | ||||
|     assert(PyUnicode_KIND(string) == PyUnicode_1BYTE_KIND); | ||||
|     str = PyUnicode_1BYTE_DATA(string); | ||||
| 
 | ||||
|     /* This overestimates if there are spaces */ | ||||
|     buf = _PyBytesWriter_Alloc(&writer, hexlen / 2); | ||||
|     if (buf == NULL) | ||||
|         return NULL; | ||||
| 
 | ||||
|     end = str + hexlen; | ||||
|     while (str < end) { | ||||
|         /* skip over spaces in the input */ | ||||
|         if (*str == ' ') { | ||||
|             do { | ||||
|                 str++; | ||||
|             } while (*str == ' '); | ||||
|             if (str >= end) | ||||
|                 break; | ||||
|         } | ||||
|     if (j != byteslen && _PyBytes_Resize(&newstring, j) < 0) | ||||
| 
 | ||||
|         top = _PyLong_DigitValue[*str]; | ||||
|         if (top >= 16) { | ||||
|             invalid_char = str - PyUnicode_1BYTE_DATA(string); | ||||
|             goto error; | ||||
|     return newstring; | ||||
|         } | ||||
|         str++; | ||||
| 
 | ||||
|         bot = _PyLong_DigitValue[*str]; | ||||
|         if (bot >= 16) { | ||||
|             invalid_char = str - PyUnicode_1BYTE_DATA(string); | ||||
|             goto error; | ||||
|         } | ||||
|         str++; | ||||
| 
 | ||||
|         *buf++ = (unsigned char)((top << 4) + bot); | ||||
|     } | ||||
| 
 | ||||
|     return _PyBytesWriter_Finish(&writer, buf); | ||||
| 
 | ||||
|   error: | ||||
|     Py_XDECREF(newstring); | ||||
|     PyErr_Format(PyExc_ValueError, | ||||
|                  "non-hexadecimal number found in " | ||||
|                  "fromhex() arg at position %zd", invalid_char); | ||||
|     _PyBytesWriter_Dealloc(&writer); | ||||
|     return NULL; | ||||
| } | ||||
| 
 | ||||
|  | @ -3888,7 +3912,7 @@ _PyBytesWriter_AsString(_PyBytesWriter *writer) | |||
| } | ||||
| 
 | ||||
| Py_LOCAL_INLINE(Py_ssize_t) | ||||
| _PyBytesWriter_GetPos(_PyBytesWriter *writer, char *str) | ||||
| _PyBytesWriter_GetSize(_PyBytesWriter *writer, char *str) | ||||
| { | ||||
|     char *start = _PyBytesWriter_AsString(writer); | ||||
|     assert(str != NULL); | ||||
|  | @ -3963,7 +3987,7 @@ _PyBytesWriter_Prepare(_PyBytesWriter *writer, void *str, Py_ssize_t size) | |||
|         allocated += allocated / OVERALLOCATE_FACTOR; | ||||
|     } | ||||
| 
 | ||||
|     pos = _PyBytesWriter_GetPos(writer, str); | ||||
|     pos = _PyBytesWriter_GetSize(writer, str); | ||||
|     if (!writer->use_small_buffer) { | ||||
|         if (writer->use_bytearray) { | ||||
|             if (PyByteArray_Resize(writer->buffer, allocated)) | ||||
|  | @ -4041,33 +4065,33 @@ _PyBytesWriter_Alloc(_PyBytesWriter *writer, Py_ssize_t size) | |||
| PyObject * | ||||
| _PyBytesWriter_Finish(_PyBytesWriter *writer, void *str) | ||||
| { | ||||
|     Py_ssize_t pos; | ||||
|     Py_ssize_t size; | ||||
|     PyObject *result; | ||||
| 
 | ||||
|     _PyBytesWriter_CheckConsistency(writer, str); | ||||
| 
 | ||||
|     pos = _PyBytesWriter_GetPos(writer, str); | ||||
|     if (pos == 0 && !writer->use_bytearray) { | ||||
|     size = _PyBytesWriter_GetSize(writer, str); | ||||
|     if (size == 0 && !writer->use_bytearray) { | ||||
|         Py_CLEAR(writer->buffer); | ||||
|         /* Get the empty byte string singleton */ | ||||
|         result = PyBytes_FromStringAndSize(NULL, 0); | ||||
|     } | ||||
|     else if (writer->use_small_buffer) { | ||||
|         result = PyBytes_FromStringAndSize(writer->small_buffer, pos); | ||||
|         result = PyBytes_FromStringAndSize(writer->small_buffer, size); | ||||
|     } | ||||
|     else { | ||||
|         result = writer->buffer; | ||||
|         writer->buffer = NULL; | ||||
| 
 | ||||
|         if (pos != writer->allocated) { | ||||
|         if (size != writer->allocated) { | ||||
|             if (writer->use_bytearray) { | ||||
|                 if (PyByteArray_Resize(result, pos)) { | ||||
|                 if (PyByteArray_Resize(result, size)) { | ||||
|                     Py_DECREF(result); | ||||
|                     return NULL; | ||||
|                 } | ||||
|             } | ||||
|             else { | ||||
|                 if (_PyBytes_Resize(&result, pos)) { | ||||
|                 if (_PyBytes_Resize(&result, size)) { | ||||
|                     assert(result == NULL); | ||||
|                     return NULL; | ||||
|                 } | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Victor Stinner
						Victor Stinner