mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 13:41:24 +00:00 
			
		
		
		
	Optimize bytes.fromhex() and bytearray.fromhex()
Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now between 2x and 3.5x faster.
Changes:
* Use a fast path working on a char* string for ASCII strings
* Use a slow path for non-ASCII strings
* Replace the slow hex_digit_to_int() function with an O(1) lookup in the precomputed _PyLong_DigitValue table
* Use the _PyBytesWriter API to handle the buffer
* Add unit tests to check the error position in error messages
This commit is contained in:
		
							parent
							
								
									ebcf9edc05
								
							
						
					
					
						commit
						2bf8993db9
					
				
					 7 changed files with 101 additions and 95 deletions
				
			
		|  | @ -161,6 +161,9 @@ Optimizations | ||||||
| * ``bytearray % args`` is now between 2.5 and 5 times faster. (Contributed by | * ``bytearray % args`` is now between 2.5 and 5 times faster. (Contributed by | ||||||
|   Victor Stinner in :issue:`25399`). |   Victor Stinner in :issue:`25399`). | ||||||
| 
 | 
 | ||||||
|  | * Optimize :meth:`bytes.fromhex` and :meth:`bytearray.fromhex`: they are now | ||||||
|  |   between 2x and 3.5x faster. (Contributed by Victor Stinner in :issue:`25401`). | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| Build and C API Changes | Build and C API Changes | ||||||
| ======================= | ======================= | ||||||
|  |  | ||||||
|  | @ -67,6 +67,9 @@ PyAPI_FUNC(PyObject*) _PyBytes_FormatEx( | ||||||
|     Py_ssize_t format_len, |     Py_ssize_t format_len, | ||||||
|     PyObject *args, |     PyObject *args, | ||||||
|     int use_bytearray); |     int use_bytearray); | ||||||
|  | PyAPI_FUNC(PyObject*) _PyBytes_FromHex( | ||||||
|  |     PyObject *string, | ||||||
|  |     int use_bytearray); | ||||||
| #endif | #endif | ||||||
| PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t, | PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t, | ||||||
| 						   const char *, Py_ssize_t, | 						   const char *, Py_ssize_t, | ||||||
|  |  | ||||||
|  | @ -65,7 +65,7 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void); | ||||||
| #  error "void* different in size from int, long and long long" | #  error "void* different in size from int, long and long long" | ||||||
| #endif /* SIZEOF_VOID_P */ | #endif /* SIZEOF_VOID_P */ | ||||||
| 
 | 
 | ||||||
| /* Used by Python/mystrtoul.c. */ | /* Used by Python/mystrtoul.c and _PyBytes_FromHex(). */ | ||||||
| #ifndef Py_LIMITED_API | #ifndef Py_LIMITED_API | ||||||
| PyAPI_DATA(unsigned char) _PyLong_DigitValue[256]; | PyAPI_DATA(unsigned char) _PyLong_DigitValue[256]; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | @ -301,6 +301,20 @@ def test_fromhex(self): | ||||||
|         self.assertRaises(ValueError, self.type2test.fromhex, '\x00') |         self.assertRaises(ValueError, self.type2test.fromhex, '\x00') | ||||||
|         self.assertRaises(ValueError, self.type2test.fromhex, '12   \x00   34') |         self.assertRaises(ValueError, self.type2test.fromhex, '12   \x00   34') | ||||||
| 
 | 
 | ||||||
|  |         for data, pos in ( | ||||||
|  |             # invalid first hexadecimal character | ||||||
|  |             ('12 x4 56', 3), | ||||||
|  |             # invalid second hexadecimal character | ||||||
|  |             ('12 3x 56', 4), | ||||||
|  |             # two invalid hexadecimal characters | ||||||
|  |             ('12 xy 56', 3), | ||||||
|  |             # test non-ASCII string | ||||||
|  |             ('12 3\xff 56', 4), | ||||||
|  |         ): | ||||||
|  |             with self.assertRaises(ValueError) as cm: | ||||||
|  |                 self.type2test.fromhex(data) | ||||||
|  |             self.assertIn('at position %s' % pos, str(cm.exception)) | ||||||
|  | 
 | ||||||
|     def test_hex(self): |     def test_hex(self): | ||||||
|         self.assertRaises(TypeError, self.type2test.hex) |         self.assertRaises(TypeError, self.type2test.hex) | ||||||
|         self.assertRaises(TypeError, self.type2test.hex, 1) |         self.assertRaises(TypeError, self.type2test.hex, 1) | ||||||
|  |  | ||||||
|  | @ -10,6 +10,9 @@ Release date: XXXX-XX-XX | ||||||
| Core and Builtins | Core and Builtins | ||||||
| ----------------- | ----------------- | ||||||
| 
 | 
 | ||||||
|  | - Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now | ||||||
|  |   between 2x and 3.5x faster. | ||||||
|  | 
 | ||||||
| - Issue #25399: Optimize bytearray % args using the new private _PyBytesWriter | - Issue #25399: Optimize bytearray % args using the new private _PyBytesWriter | ||||||
|   API. Formatting is now between 2.5 and 5 times faster. |   API. Formatting is now between 2.5 and 5 times faster. | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -2823,48 +2823,7 @@ static PyObject * | ||||||
| bytearray_fromhex_impl(PyObject*cls, PyObject *string) | bytearray_fromhex_impl(PyObject*cls, PyObject *string) | ||||||
| /*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/ | /*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/ | ||||||
| { | { | ||||||
|     PyObject *newbytes; |     return _PyBytes_FromHex(string, 1); | ||||||
|     char *buf; |  | ||||||
|     Py_ssize_t hexlen, byteslen, i, j; |  | ||||||
|     int top, bot; |  | ||||||
|     void *data; |  | ||||||
|     unsigned int kind; |  | ||||||
| 
 |  | ||||||
|     assert(PyUnicode_Check(string)); |  | ||||||
|     if (PyUnicode_READY(string)) |  | ||||||
|         return NULL; |  | ||||||
|     kind = PyUnicode_KIND(string); |  | ||||||
|     data = PyUnicode_DATA(string); |  | ||||||
|     hexlen = PyUnicode_GET_LENGTH(string); |  | ||||||
| 
 |  | ||||||
|     byteslen = hexlen/2; /* This overestimates if there are spaces */ |  | ||||||
|     newbytes = PyByteArray_FromStringAndSize(NULL, byteslen); |  | ||||||
|     if (!newbytes) |  | ||||||
|         return NULL; |  | ||||||
|     buf = PyByteArray_AS_STRING(newbytes); |  | ||||||
|     for (i = j = 0; i < hexlen; i += 2) { |  | ||||||
|         /* skip over spaces in the input */ |  | ||||||
|         while (PyUnicode_READ(kind, data, i) == ' ') |  | ||||||
|             i++; |  | ||||||
|         if (i >= hexlen) |  | ||||||
|             break; |  | ||||||
|         top = hex_digit_to_int(PyUnicode_READ(kind, data, i)); |  | ||||||
|         bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1)); |  | ||||||
|         if (top == -1 || bot == -1) { |  | ||||||
|             PyErr_Format(PyExc_ValueError, |  | ||||||
|                          "non-hexadecimal number found in " |  | ||||||
|                          "fromhex() arg at position %zd", i); |  | ||||||
|             goto error; |  | ||||||
|         } |  | ||||||
|         buf[j++] = (top << 4) + bot; |  | ||||||
|     } |  | ||||||
|     if (PyByteArray_Resize(newbytes, j) < 0) |  | ||||||
|         goto error; |  | ||||||
|     return newbytes; |  | ||||||
| 
 |  | ||||||
|   error: |  | ||||||
|     Py_DECREF(newbytes); |  | ||||||
|     return NULL; |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| PyDoc_STRVAR(hex__doc__, | PyDoc_STRVAR(hex__doc__, | ||||||
|  |  | ||||||
|  | @ -30,6 +30,10 @@ static PyBytesObject *nullstring; | ||||||
| */ | */ | ||||||
| #define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1) | #define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1) | ||||||
| 
 | 
 | ||||||
|  | /* Forward declaration */ | ||||||
|  | Py_LOCAL_INLINE(Py_ssize_t) _PyBytesWriter_GetSize(_PyBytesWriter *writer, | ||||||
|  |                                                    char *str); | ||||||
|  | 
 | ||||||
| /*
 | /*
 | ||||||
|    For PyBytes_FromString(), the parameter `str' points to a null-terminated |    For PyBytes_FromString(), the parameter `str' points to a null-terminated | ||||||
|    string containing exactly `size' bytes. |    string containing exactly `size' bytes. | ||||||
|  | @ -3078,22 +3082,6 @@ bytes_splitlines_impl(PyBytesObject*self, int keepends) | ||||||
|         ); |         ); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static int |  | ||||||
| hex_digit_to_int(Py_UCS4 c) |  | ||||||
| { |  | ||||||
|     if (c >= 128) |  | ||||||
|         return -1; |  | ||||||
|     if (Py_ISDIGIT(c)) |  | ||||||
|         return c - '0'; |  | ||||||
|     else { |  | ||||||
|         if (Py_ISUPPER(c)) |  | ||||||
|             c = Py_TOLOWER(c); |  | ||||||
|         if (c >= 'a' && c <= 'f') |  | ||||||
|             return c - 'a' + 10; |  | ||||||
|     } |  | ||||||
|     return -1; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| /*[clinic input]
 | /*[clinic input]
 | ||||||
| @classmethod | @classmethod | ||||||
| bytes.fromhex | bytes.fromhex | ||||||
|  | @ -3111,47 +3099,83 @@ static PyObject * | ||||||
| bytes_fromhex_impl(PyTypeObject *type, PyObject *string) | bytes_fromhex_impl(PyTypeObject *type, PyObject *string) | ||||||
| /*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/ | /*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/ | ||||||
| { | { | ||||||
|     PyObject *newstring; |     return _PyBytes_FromHex(string, 0); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | PyObject* | ||||||
|  | _PyBytes_FromHex(PyObject *string, int use_bytearray) | ||||||
|  | { | ||||||
|     char *buf; |     char *buf; | ||||||
|     Py_ssize_t hexlen, byteslen, i, j; |     Py_ssize_t hexlen, invalid_char; | ||||||
|     int top, bot; |     unsigned int top, bot; | ||||||
|     void *data; |     Py_UCS1 *str, *end; | ||||||
|     unsigned int kind; |     _PyBytesWriter writer; | ||||||
|  | 
 | ||||||
|  |     _PyBytesWriter_Init(&writer); | ||||||
|  |     writer.use_bytearray = use_bytearray; | ||||||
| 
 | 
 | ||||||
|     assert(PyUnicode_Check(string)); |     assert(PyUnicode_Check(string)); | ||||||
|     if (PyUnicode_READY(string)) |     if (PyUnicode_READY(string)) | ||||||
|         return NULL; |         return NULL; | ||||||
|     kind = PyUnicode_KIND(string); |  | ||||||
|     data = PyUnicode_DATA(string); |  | ||||||
|     hexlen = PyUnicode_GET_LENGTH(string); |     hexlen = PyUnicode_GET_LENGTH(string); | ||||||
| 
 | 
 | ||||||
|     byteslen = hexlen/2; /* This overestimates if there are spaces */ |     if (!PyUnicode_IS_ASCII(string)) { | ||||||
|     newstring = PyBytes_FromStringAndSize(NULL, byteslen); |         void *data = PyUnicode_DATA(string); | ||||||
|     if (!newstring) |         unsigned int kind = PyUnicode_KIND(string); | ||||||
|         return NULL; |         Py_ssize_t i; | ||||||
|     buf = PyBytes_AS_STRING(newstring); | 
 | ||||||
|     for (i = j = 0; i < hexlen; i += 2) { |         /* search for the first non-ASCII character */ | ||||||
|         /* skip over spaces in the input */ |         for (i = 0; i < hexlen; i++) { | ||||||
|         while (PyUnicode_READ(kind, data, i) == ' ') |             if (PyUnicode_READ(kind, data, i) >= 128) | ||||||
|             i++; |  | ||||||
|         if (i >= hexlen) |  | ||||||
|                 break; |                 break; | ||||||
|         top = hex_digit_to_int(PyUnicode_READ(kind, data, i)); |         } | ||||||
|         bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1)); |         invalid_char = i; | ||||||
|         if (top == -1 || bot == -1) { |  | ||||||
|             PyErr_Format(PyExc_ValueError, |  | ||||||
|                          "non-hexadecimal number found in " |  | ||||||
|                          "fromhex() arg at position %zd", i); |  | ||||||
|         goto error; |         goto error; | ||||||
|     } |     } | ||||||
|         buf[j++] = (top << 4) + bot; | 
 | ||||||
|  |     assert(PyUnicode_KIND(string) == PyUnicode_1BYTE_KIND); | ||||||
|  |     str = PyUnicode_1BYTE_DATA(string); | ||||||
|  | 
 | ||||||
|  |     /* This overestimates if there are spaces */ | ||||||
|  |     buf = _PyBytesWriter_Alloc(&writer, hexlen / 2); | ||||||
|  |     if (buf == NULL) | ||||||
|  |         return NULL; | ||||||
|  | 
 | ||||||
|  |     end = str + hexlen; | ||||||
|  |     while (str < end) { | ||||||
|  |         /* skip over spaces in the input */ | ||||||
|  |         if (*str == ' ') { | ||||||
|  |             do { | ||||||
|  |                 str++; | ||||||
|  |             } while (*str == ' '); | ||||||
|  |             if (str >= end) | ||||||
|  |                 break; | ||||||
|         } |         } | ||||||
|     if (j != byteslen && _PyBytes_Resize(&newstring, j) < 0) | 
 | ||||||
|  |         top = _PyLong_DigitValue[*str]; | ||||||
|  |         if (top >= 16) { | ||||||
|  |             invalid_char = str - PyUnicode_1BYTE_DATA(string); | ||||||
|             goto error; |             goto error; | ||||||
|     return newstring; |         } | ||||||
|  |         str++; | ||||||
|  | 
 | ||||||
|  |         bot = _PyLong_DigitValue[*str]; | ||||||
|  |         if (bot >= 16) { | ||||||
|  |             invalid_char = str - PyUnicode_1BYTE_DATA(string); | ||||||
|  |             goto error; | ||||||
|  |         } | ||||||
|  |         str++; | ||||||
|  | 
 | ||||||
|  |         *buf++ = (unsigned char)((top << 4) + bot); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     return _PyBytesWriter_Finish(&writer, buf); | ||||||
| 
 | 
 | ||||||
|   error: |   error: | ||||||
|     Py_XDECREF(newstring); |     PyErr_Format(PyExc_ValueError, | ||||||
|  |                  "non-hexadecimal number found in " | ||||||
|  |                  "fromhex() arg at position %zd", invalid_char); | ||||||
|  |     _PyBytesWriter_Dealloc(&writer); | ||||||
|     return NULL; |     return NULL; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -3888,7 +3912,7 @@ _PyBytesWriter_AsString(_PyBytesWriter *writer) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| Py_LOCAL_INLINE(Py_ssize_t) | Py_LOCAL_INLINE(Py_ssize_t) | ||||||
| _PyBytesWriter_GetPos(_PyBytesWriter *writer, char *str) | _PyBytesWriter_GetSize(_PyBytesWriter *writer, char *str) | ||||||
| { | { | ||||||
|     char *start = _PyBytesWriter_AsString(writer); |     char *start = _PyBytesWriter_AsString(writer); | ||||||
|     assert(str != NULL); |     assert(str != NULL); | ||||||
|  | @ -3963,7 +3987,7 @@ _PyBytesWriter_Prepare(_PyBytesWriter *writer, void *str, Py_ssize_t size) | ||||||
|         allocated += allocated / OVERALLOCATE_FACTOR; |         allocated += allocated / OVERALLOCATE_FACTOR; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     pos = _PyBytesWriter_GetPos(writer, str); |     pos = _PyBytesWriter_GetSize(writer, str); | ||||||
|     if (!writer->use_small_buffer) { |     if (!writer->use_small_buffer) { | ||||||
|         if (writer->use_bytearray) { |         if (writer->use_bytearray) { | ||||||
|             if (PyByteArray_Resize(writer->buffer, allocated)) |             if (PyByteArray_Resize(writer->buffer, allocated)) | ||||||
|  | @ -4041,33 +4065,33 @@ _PyBytesWriter_Alloc(_PyBytesWriter *writer, Py_ssize_t size) | ||||||
| PyObject * | PyObject * | ||||||
| _PyBytesWriter_Finish(_PyBytesWriter *writer, void *str) | _PyBytesWriter_Finish(_PyBytesWriter *writer, void *str) | ||||||
| { | { | ||||||
|     Py_ssize_t pos; |     Py_ssize_t size; | ||||||
|     PyObject *result; |     PyObject *result; | ||||||
| 
 | 
 | ||||||
|     _PyBytesWriter_CheckConsistency(writer, str); |     _PyBytesWriter_CheckConsistency(writer, str); | ||||||
| 
 | 
 | ||||||
|     pos = _PyBytesWriter_GetPos(writer, str); |     size = _PyBytesWriter_GetSize(writer, str); | ||||||
|     if (pos == 0 && !writer->use_bytearray) { |     if (size == 0 && !writer->use_bytearray) { | ||||||
|         Py_CLEAR(writer->buffer); |         Py_CLEAR(writer->buffer); | ||||||
|         /* Get the empty byte string singleton */ |         /* Get the empty byte string singleton */ | ||||||
|         result = PyBytes_FromStringAndSize(NULL, 0); |         result = PyBytes_FromStringAndSize(NULL, 0); | ||||||
|     } |     } | ||||||
|     else if (writer->use_small_buffer) { |     else if (writer->use_small_buffer) { | ||||||
|         result = PyBytes_FromStringAndSize(writer->small_buffer, pos); |         result = PyBytes_FromStringAndSize(writer->small_buffer, size); | ||||||
|     } |     } | ||||||
|     else { |     else { | ||||||
|         result = writer->buffer; |         result = writer->buffer; | ||||||
|         writer->buffer = NULL; |         writer->buffer = NULL; | ||||||
| 
 | 
 | ||||||
|         if (pos != writer->allocated) { |         if (size != writer->allocated) { | ||||||
|             if (writer->use_bytearray) { |             if (writer->use_bytearray) { | ||||||
|                 if (PyByteArray_Resize(result, pos)) { |                 if (PyByteArray_Resize(result, size)) { | ||||||
|                     Py_DECREF(result); |                     Py_DECREF(result); | ||||||
|                     return NULL; |                     return NULL; | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|             else { |             else { | ||||||
|                 if (_PyBytes_Resize(&result, pos)) { |                 if (_PyBytes_Resize(&result, size)) { | ||||||
|                     assert(result == NULL); |                     assert(result == NULL); | ||||||
|                     return NULL; |                     return NULL; | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Victor Stinner
						Victor Stinner