mirror of
				https://github.com/python/cpython.git
				synced 2025-10-24 18:33:49 +00:00 
			
		
		
		
	Issue #13155: Optimize finding the optimal character width of a Unicode string
This commit is contained in:
		
							parent
							
								
									49a0a21f37
								
							
						
					
					
						commit
						dd4e2f0153
					
				
					 3 changed files with 206 additions and 87 deletions
				
			
		|  | @ -631,6 +631,7 @@ BYTESTR_DEPS = \ | ||||||
| 		$(srcdir)/Objects/stringlib/eq.h \ | 		$(srcdir)/Objects/stringlib/eq.h \ | ||||||
| 		$(srcdir)/Objects/stringlib/fastsearch.h \ | 		$(srcdir)/Objects/stringlib/fastsearch.h \ | ||||||
| 		$(srcdir)/Objects/stringlib/find.h \ | 		$(srcdir)/Objects/stringlib/find.h \ | ||||||
|  | 		$(srcdir)/Objects/stringlib/find_max_char.h \ | ||||||
| 		$(srcdir)/Objects/stringlib/partition.h \ | 		$(srcdir)/Objects/stringlib/partition.h \ | ||||||
| 		$(srcdir)/Objects/stringlib/split.h \ | 		$(srcdir)/Objects/stringlib/split.h \ | ||||||
| 		$(srcdir)/Objects/stringlib/stringdefs.h \ | 		$(srcdir)/Objects/stringlib/stringdefs.h \ | ||||||
|  |  | ||||||
							
								
								
									
										136
									
								
								Objects/stringlib/find_max_char.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										136
									
								
								Objects/stringlib/find_max_char.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,136 @@ | ||||||
|  | /* Finding the optimal width of unicode characters in a buffer */ | ||||||
|  | 
 | ||||||
|  | #if STRINGLIB_IS_UNICODE | ||||||
|  | 
 | ||||||
|  | /* Mask to check or force alignment of a pointer to C 'long' boundaries */ | ||||||
|  | #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) | ||||||
|  | 
 | ||||||
|  | /* Mask to quickly check whether a C 'long' contains a byte with its | ||||||
|  |    high bit set, i.e. a non-ASCII char when the long is read from a | ||||||
|  |    UCS1 (one byte per char) buffer. */ | ||||||
|  | #if (SIZEOF_LONG == 8) | ||||||
|  | # define UCS1_ASCII_CHAR_MASK 0x8080808080808080L | ||||||
|  | #elif (SIZEOF_LONG == 4) | ||||||
|  | # define UCS1_ASCII_CHAR_MASK 0x80808080L | ||||||
|  | #else | ||||||
|  | # error C 'long' size should be either 4 or 8! | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | #if STRINGLIB_SIZEOF_CHAR == 1 | ||||||
|  | 
 | ||||||
|  | /* Return an upper bound for the chars stored in the one-byte-per-char | ||||||
|  |    buffer [begin, end): 127 when no byte has its high bit set (this is | ||||||
|  |    also the result for an empty buffer), 255 as soon as one such byte | ||||||
|  |    is found.  Whenever the cursor sits on a C 'long' boundary, whole | ||||||
|  |    longs are tested against UCS1_ASCII_CHAR_MASK; all other bytes are | ||||||
|  |    tested one at a time. */ | ||||||
|  | Py_LOCAL_INLINE(Py_UCS4) | ||||||
|  | STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) | ||||||
|  | { | ||||||
|  |     const unsigned char *p = (const unsigned char *) begin; | ||||||
|  |     /* 'end' rounded down to a C 'long' boundary: no whole-long read may | ||||||
|  |        start at or after this address. */ | ||||||
|  |     const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); | ||||||
|  | 
 | ||||||
|  |     while (p < end) { | ||||||
|  |         if (!((size_t) p & LONG_PTR_MASK)) { | ||||||
|  |             /* Help register allocation */ | ||||||
|  |             register const unsigned char *_p = p; | ||||||
|  |             while (_p < aligned_end) { | ||||||
|  |                 /* Read through a const-qualified pointer: the buffer is | ||||||
|  |                    only inspected, never written. */ | ||||||
|  |                 unsigned long value = *(const unsigned long *) _p; | ||||||
|  |                 if (value & UCS1_ASCII_CHAR_MASK) | ||||||
|  |                     return 255; | ||||||
|  |                 _p += SIZEOF_LONG; | ||||||
|  |             } | ||||||
|  |             p = _p; | ||||||
|  |             /* The buffer ended exactly on a long boundary: nothing left */ | ||||||
|  |             if (p == end) | ||||||
|  |                 break; | ||||||
|  |         } | ||||||
|  |         /* Unaligned head or trailing bytes: test a single byte */ | ||||||
|  |         if (*p++ & 0x80) | ||||||
|  |             return 255; | ||||||
|  |     } | ||||||
|  |     return 127; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* Drop the helper macros defined earlier in this header.  The mask | ||||||
|  |    macro is spelled UCS1_ASCII_CHAR_MASK here, so that is the name | ||||||
|  |    that has to be undefined. */ | ||||||
|  | #undef LONG_PTR_MASK | ||||||
|  | #undef UCS1_ASCII_CHAR_MASK | ||||||
|  | 
 | ||||||
|  | #else /* STRINGLIB_SIZEOF_CHAR == 1 */ | ||||||
|  | 
 | ||||||
|  | #define MASK_ASCII 0xFFFFFF80 /* bits set only in chars above 0x7f */ | ||||||
|  | #define MASK_UCS1 0xFFFFFF00 /* bits set only in chars above 0xff */ | ||||||
|  | #define MASK_UCS2 0xFFFF0000 /* bits set only in chars above 0xffff */ | ||||||
|  | 
 | ||||||
|  | #define MAX_CHAR_ASCII 0x7f /* result while no char matched MASK_ASCII */ | ||||||
|  | #define MAX_CHAR_UCS1  0xff /* result once a char matched MASK_ASCII */ | ||||||
|  | #define MAX_CHAR_UCS2  0xffff /* result once a char matched MASK_UCS1 */ | ||||||
|  | #define MAX_CHAR_UCS4  0x10ffff /* result once a char matched MASK_UCS2 */ | ||||||
|  | 
 | ||||||
|  | /* Return an upper bound (0x7f, 0xff, 0xffff or 0x10ffff) for the chars | ||||||
|  |    stored in [begin, end).  The scan starts with the ASCII mask and | ||||||
|  |    widens it each time a char does not fit; it stops early once the | ||||||
|  |    widest bound available for this char size has been reached. */ | ||||||
|  | Py_LOCAL_INLINE(Py_UCS4) | ||||||
|  | STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) | ||||||
|  | { | ||||||
|  | #if STRINGLIB_SIZEOF_CHAR == 2 | ||||||
|  |     const Py_UCS4 widest_mask = MASK_UCS1; | ||||||
|  |     const Py_UCS4 widest_max = MAX_CHAR_UCS2; | ||||||
|  | #elif STRINGLIB_SIZEOF_CHAR == 4 | ||||||
|  |     const Py_UCS4 widest_mask = MASK_UCS2; | ||||||
|  |     const Py_UCS4 widest_max = MAX_CHAR_UCS4; | ||||||
|  | #else | ||||||
|  | #error Invalid STRINGLIB_SIZEOF_CHAR (must be 1, 2 or 4) | ||||||
|  | #endif | ||||||
|  |     const Py_ssize_t length = end - begin; | ||||||
|  |     /* End of the part of the buffer that splits into groups of 4 chars */ | ||||||
|  |     const STRINGLIB_CHAR *group_end = begin + (length & ~ (Py_ssize_t) 3); | ||||||
|  |     const STRINGLIB_CHAR *cur = begin; | ||||||
|  |     Py_UCS4 cur_mask = MASK_ASCII; | ||||||
|  |     Py_UCS4 best = MAX_CHAR_ASCII; | ||||||
|  | 
 | ||||||
|  |     /* Fast path: OR four chars together and test them in one go */ | ||||||
|  |     while (cur < group_end) { | ||||||
|  |         const STRINGLIB_CHAR merged = cur[0] | cur[1] | cur[2] | cur[3]; | ||||||
|  |         if (!(merged & cur_mask)) { | ||||||
|  |             cur += 4; | ||||||
|  |             continue; | ||||||
|  |         } | ||||||
|  |         if (cur_mask == widest_mask) { | ||||||
|  |             /* Nothing wider exists for this char size */ | ||||||
|  |             return widest_max; | ||||||
|  |         } | ||||||
|  |         if (cur_mask != MASK_ASCII) { | ||||||
|  |             /* MASK_UCS2 is excluded by the widest_mask test above */ | ||||||
|  |             assert(cur_mask == MASK_UCS1); | ||||||
|  |             best = MAX_CHAR_UCS2; | ||||||
|  |             cur_mask = MASK_UCS2; | ||||||
|  |         } | ||||||
|  |         else { | ||||||
|  |             best = MAX_CHAR_UCS1; | ||||||
|  |             cur_mask = MASK_UCS1; | ||||||
|  |         } | ||||||
|  |         /* 'cur' is not advanced: the same group is re-tested against | ||||||
|  |            the widened mask on the next pass */ | ||||||
|  |     } | ||||||
|  |     /* Tail: the (at most three) chars left over, one at a time */ | ||||||
|  |     while (cur < end) { | ||||||
|  |         if (!(cur[0] & cur_mask)) { | ||||||
|  |             cur++; | ||||||
|  |             continue; | ||||||
|  |         } | ||||||
|  |         if (cur_mask == widest_mask) { | ||||||
|  |             /* Nothing wider exists for this char size */ | ||||||
|  |             return widest_max; | ||||||
|  |         } | ||||||
|  |         if (cur_mask != MASK_ASCII) { | ||||||
|  |             /* MASK_UCS2 is excluded by the widest_mask test above */ | ||||||
|  |             assert(cur_mask == MASK_UCS1); | ||||||
|  |             best = MAX_CHAR_UCS2; | ||||||
|  |             cur_mask = MASK_UCS2; | ||||||
|  |         } | ||||||
|  |         else { | ||||||
|  |             best = MAX_CHAR_UCS1; | ||||||
|  |             cur_mask = MASK_UCS1; | ||||||
|  |         } | ||||||
|  |         /* 'cur' is not advanced: the same char is re-tested against | ||||||
|  |            the widened mask on the next pass */ | ||||||
|  |     } | ||||||
|  |     return best; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #undef MASK_ASCII | ||||||
|  | #undef MASK_UCS1 | ||||||
|  | #undef MASK_UCS2 | ||||||
|  | #undef MAX_CHAR_ASCII | ||||||
|  | #undef MAX_CHAR_UCS1 | ||||||
|  | #undef MAX_CHAR_UCS2 | ||||||
|  | #undef MAX_CHAR_UCS4 | ||||||
|  | 
 | ||||||
|  | #endif /* STRINGLIB_SIZEOF_CHAR == 1 */ | ||||||
|  | #endif /* STRINGLIB_IS_UNICODE */ | ||||||
|  | 
 | ||||||
|  | @ -259,6 +259,15 @@ static void copy_characters( | ||||||
| static int unicode_is_singleton(PyObject *unicode); | static int unicode_is_singleton(PyObject *unicode); | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | static PyObject * | ||||||
|  | unicode_fromascii(const unsigned char *s, Py_ssize_t size); | ||||||
|  | static PyObject * | ||||||
|  | _PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size); | ||||||
|  | static PyObject * | ||||||
|  | _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); | ||||||
|  | static PyObject * | ||||||
|  | _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); | ||||||
|  | 
 | ||||||
| static PyObject * | static PyObject * | ||||||
| unicode_encode_call_errorhandler(const char *errors, | unicode_encode_call_errorhandler(const char *errors, | ||||||
|        PyObject **errorHandler,const char *encoding, const char *reason, |        PyObject **errorHandler,const char *encoding, const char *reason, | ||||||
|  | @ -468,6 +477,48 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len) | ||||||
|     (BLOOM(mask, chr) \ |     (BLOOM(mask, chr) \ | ||||||
|      && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) |      && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) | ||||||
| 
 | 
 | ||||||
|  | /* Compilation of templated routines */ | ||||||
|  | 
 | ||||||
|  | #include "stringlib/asciilib.h" | ||||||
|  | #include "stringlib/fastsearch.h" | ||||||
|  | #include "stringlib/partition.h" | ||||||
|  | #include "stringlib/split.h" | ||||||
|  | #include "stringlib/count.h" | ||||||
|  | #include "stringlib/find.h" | ||||||
|  | #include "stringlib/find_max_char.h" | ||||||
|  | #include "stringlib/localeutil.h" | ||||||
|  | #include "stringlib/undef.h" | ||||||
|  | 
 | ||||||
|  | #include "stringlib/ucs1lib.h" | ||||||
|  | #include "stringlib/fastsearch.h" | ||||||
|  | #include "stringlib/partition.h" | ||||||
|  | #include "stringlib/split.h" | ||||||
|  | #include "stringlib/count.h" | ||||||
|  | #include "stringlib/find.h" | ||||||
|  | #include "stringlib/find_max_char.h" | ||||||
|  | #include "stringlib/localeutil.h" | ||||||
|  | #include "stringlib/undef.h" | ||||||
|  | 
 | ||||||
|  | #include "stringlib/ucs2lib.h" | ||||||
|  | #include "stringlib/fastsearch.h" | ||||||
|  | #include "stringlib/partition.h" | ||||||
|  | #include "stringlib/split.h" | ||||||
|  | #include "stringlib/count.h" | ||||||
|  | #include "stringlib/find.h" | ||||||
|  | #include "stringlib/find_max_char.h" | ||||||
|  | #include "stringlib/localeutil.h" | ||||||
|  | #include "stringlib/undef.h" | ||||||
|  | 
 | ||||||
|  | #include "stringlib/ucs4lib.h" | ||||||
|  | #include "stringlib/fastsearch.h" | ||||||
|  | #include "stringlib/partition.h" | ||||||
|  | #include "stringlib/split.h" | ||||||
|  | #include "stringlib/count.h" | ||||||
|  | #include "stringlib/find.h" | ||||||
|  | #include "stringlib/find_max_char.h" | ||||||
|  | #include "stringlib/localeutil.h" | ||||||
|  | #include "stringlib/undef.h" | ||||||
|  | 
 | ||||||
| /* --- Unicode Object ----------------------------------------------------- */ | /* --- Unicode Object ----------------------------------------------------- */ | ||||||
| 
 | 
 | ||||||
| static PyObject * | static PyObject * | ||||||
|  | @ -1689,17 +1740,11 @@ _PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) | ||||||
| { | { | ||||||
|     PyObject *res; |     PyObject *res; | ||||||
|     unsigned char max_char = 127; |     unsigned char max_char = 127; | ||||||
|     Py_ssize_t i; |  | ||||||
| 
 | 
 | ||||||
|     assert(size >= 0); |     assert(size >= 0); | ||||||
|     if (size == 1) |     if (size == 1) | ||||||
|         return get_latin1_char(u[0]); |         return get_latin1_char(u[0]); | ||||||
|     for (i = 0; i < size; i++) { |     max_char = ucs1lib_find_max_char(u, u + size); | ||||||
|         if (u[i] & 0x80) { |  | ||||||
|             max_char = 255; |  | ||||||
|             break; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|     res = PyUnicode_New(size, max_char); |     res = PyUnicode_New(size, max_char); | ||||||
|     if (!res) |     if (!res) | ||||||
|         return NULL; |         return NULL; | ||||||
|  | @ -1713,26 +1758,20 @@ _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) | ||||||
| { | { | ||||||
|     PyObject *res; |     PyObject *res; | ||||||
|     Py_UCS2 max_char = 0; |     Py_UCS2 max_char = 0; | ||||||
|     Py_ssize_t i; |  | ||||||
| 
 | 
 | ||||||
|     assert(size >= 0); |     assert(size >= 0); | ||||||
|     if (size == 1 && u[0] < 256) |     if (size == 1 && u[0] < 256) | ||||||
|         return get_latin1_char((unsigned char)u[0]); |         return get_latin1_char((unsigned char)u[0]); | ||||||
|     for (i = 0; i < size; i++) { |     max_char = ucs2lib_find_max_char(u, u + size); | ||||||
|         if (u[i] > max_char) { |  | ||||||
|             max_char = u[i]; |  | ||||||
|             if (max_char >= 256) |  | ||||||
|                 break; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|     res = PyUnicode_New(size, max_char); |     res = PyUnicode_New(size, max_char); | ||||||
|     if (!res) |     if (!res) | ||||||
|         return NULL; |         return NULL; | ||||||
|     if (max_char >= 256) |     if (max_char >= 256) | ||||||
|         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); |         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); | ||||||
|     else |     else { | ||||||
|         for (i = 0; i < size; i++) |         _PyUnicode_CONVERT_BYTES( | ||||||
|             PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i]; |             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); | ||||||
|  |     } | ||||||
|     assert(_PyUnicode_CheckConsistency(res, 1)); |     assert(_PyUnicode_CheckConsistency(res, 1)); | ||||||
|     return res; |     return res; | ||||||
| } | } | ||||||
|  | @ -1742,18 +1781,11 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) | ||||||
| { | { | ||||||
|     PyObject *res; |     PyObject *res; | ||||||
|     Py_UCS4 max_char = 0; |     Py_UCS4 max_char = 0; | ||||||
|     Py_ssize_t i; |  | ||||||
| 
 | 
 | ||||||
|     assert(size >= 0); |     assert(size >= 0); | ||||||
|     if (size == 1 && u[0] < 256) |     if (size == 1 && u[0] < 256) | ||||||
|         return get_latin1_char(u[0]); |         return get_latin1_char(u[0]); | ||||||
|     for (i = 0; i < size; i++) { |     max_char = ucs4lib_find_max_char(u, u + size); | ||||||
|         if (u[i] > max_char) { |  | ||||||
|             max_char = u[i]; |  | ||||||
|             if (max_char >= 0x10000) |  | ||||||
|                 break; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|     res = PyUnicode_New(size, max_char); |     res = PyUnicode_New(size, max_char); | ||||||
|     if (!res) |     if (!res) | ||||||
|         return NULL; |         return NULL; | ||||||
|  | @ -1794,7 +1826,7 @@ unicode_adjust_maxchar(PyObject **p_unicode) | ||||||
| { | { | ||||||
|     PyObject *unicode, *copy; |     PyObject *unicode, *copy; | ||||||
|     Py_UCS4 max_char; |     Py_UCS4 max_char; | ||||||
|     Py_ssize_t i, len; |     Py_ssize_t len; | ||||||
|     unsigned int kind; |     unsigned int kind; | ||||||
| 
 | 
 | ||||||
|     assert(p_unicode != NULL); |     assert(p_unicode != NULL); | ||||||
|  | @ -1807,37 +1839,23 @@ unicode_adjust_maxchar(PyObject **p_unicode) | ||||||
|     kind = PyUnicode_KIND(unicode); |     kind = PyUnicode_KIND(unicode); | ||||||
|     if (kind == PyUnicode_1BYTE_KIND) { |     if (kind == PyUnicode_1BYTE_KIND) { | ||||||
|         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); |         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); | ||||||
|         for (i = 0; i < len; i++) { |         max_char = ucs1lib_find_max_char(u, u + len); | ||||||
|             if (u[i] & 0x80) |         if (max_char >= 128) | ||||||
|             return; |             return; | ||||||
|     } |     } | ||||||
|         max_char = 127; |  | ||||||
|     } |  | ||||||
|     else if (kind == PyUnicode_2BYTE_KIND) { |     else if (kind == PyUnicode_2BYTE_KIND) { | ||||||
|         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); |         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); | ||||||
|         max_char = 0; |         max_char = ucs2lib_find_max_char(u, u + len); | ||||||
|         for (i = 0; i < len; i++) { |  | ||||||
|             if (u[i] > max_char) { |  | ||||||
|                 max_char = u[i]; |  | ||||||
|         if (max_char >= 256) |         if (max_char >= 256) | ||||||
|             return; |             return; | ||||||
|     } |     } | ||||||
|         } |  | ||||||
|     } |  | ||||||
|     else { |     else { | ||||||
|         const Py_UCS4 *u; |         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); | ||||||
|         assert(kind == PyUnicode_4BYTE_KIND); |         assert(kind == PyUnicode_4BYTE_KIND); | ||||||
|         u = PyUnicode_4BYTE_DATA(unicode); |         max_char = ucs4lib_find_max_char(u, u + len); | ||||||
|         max_char = 0; |  | ||||||
|         for (i = 0; i < len; i++) { |  | ||||||
|             if (u[i] > max_char) { |  | ||||||
|                 max_char = u[i]; |  | ||||||
|         if (max_char >= 0x10000) |         if (max_char >= 0x10000) | ||||||
|             return; |             return; | ||||||
|     } |     } | ||||||
|         } |  | ||||||
|     } |  | ||||||
|     assert(max_char < PyUnicode_MAX_CHAR_VALUE(unicode)); |  | ||||||
|     copy = PyUnicode_New(len, max_char); |     copy = PyUnicode_New(len, max_char); | ||||||
|     copy_characters(copy, 0, unicode, 0, len); |     copy_characters(copy, 0, unicode, 0, len); | ||||||
|     Py_DECREF(unicode); |     Py_DECREF(unicode); | ||||||
|  | @ -8508,42 +8526,6 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s, | ||||||
| 
 | 
 | ||||||
| /* --- Helpers ------------------------------------------------------------ */ | /* --- Helpers ------------------------------------------------------------ */ | ||||||
| 
 | 
 | ||||||
| #include "stringlib/asciilib.h" |  | ||||||
| #include "stringlib/fastsearch.h" |  | ||||||
| #include "stringlib/partition.h" |  | ||||||
| #include "stringlib/split.h" |  | ||||||
| #include "stringlib/count.h" |  | ||||||
| #include "stringlib/find.h" |  | ||||||
| #include "stringlib/localeutil.h" |  | ||||||
| #include "stringlib/undef.h" |  | ||||||
| 
 |  | ||||||
| #include "stringlib/ucs1lib.h" |  | ||||||
| #include "stringlib/fastsearch.h" |  | ||||||
| #include "stringlib/partition.h" |  | ||||||
| #include "stringlib/split.h" |  | ||||||
| #include "stringlib/count.h" |  | ||||||
| #include "stringlib/find.h" |  | ||||||
| #include "stringlib/localeutil.h" |  | ||||||
| #include "stringlib/undef.h" |  | ||||||
| 
 |  | ||||||
| #include "stringlib/ucs2lib.h" |  | ||||||
| #include "stringlib/fastsearch.h" |  | ||||||
| #include "stringlib/partition.h" |  | ||||||
| #include "stringlib/split.h" |  | ||||||
| #include "stringlib/count.h" |  | ||||||
| #include "stringlib/find.h" |  | ||||||
| #include "stringlib/localeutil.h" |  | ||||||
| #include "stringlib/undef.h" |  | ||||||
| 
 |  | ||||||
| #include "stringlib/ucs4lib.h" |  | ||||||
| #include "stringlib/fastsearch.h" |  | ||||||
| #include "stringlib/partition.h" |  | ||||||
| #include "stringlib/split.h" |  | ||||||
| #include "stringlib/count.h" |  | ||||||
| #include "stringlib/find.h" |  | ||||||
| #include "stringlib/localeutil.h" |  | ||||||
| #include "stringlib/undef.h" |  | ||||||
| 
 |  | ||||||
| static Py_ssize_t | static Py_ssize_t | ||||||
| any_find_slice(int direction, PyObject* s1, PyObject* s2, | any_find_slice(int direction, PyObject* s1, PyObject* s2, | ||||||
|                Py_ssize_t start, |                Py_ssize_t start, | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Antoine Pitrou
						Antoine Pitrou