mirror of
				https://github.com/python/cpython.git
				synced 2025-10-26 11:14:33 +00:00 
			
		
		
		
	Issue #13155: Optimize finding the optimal character width of a unicode string
This commit is contained in:
		
							parent
							
								
									49a0a21f37
								
							
						
					
					
						commit
						dd4e2f0153
					
				
					 3 changed files with 206 additions and 87 deletions
				
			
		|  | @ -631,6 +631,7 @@ BYTESTR_DEPS = \ | |||
| 		$(srcdir)/Objects/stringlib/eq.h \ | ||||
| 		$(srcdir)/Objects/stringlib/fastsearch.h \ | ||||
| 		$(srcdir)/Objects/stringlib/find.h \ | ||||
| 		$(srcdir)/Objects/stringlib/find_max_char.h \ | ||||
| 		$(srcdir)/Objects/stringlib/partition.h \ | ||||
| 		$(srcdir)/Objects/stringlib/split.h \ | ||||
| 		$(srcdir)/Objects/stringlib/stringdefs.h \ | ||||
|  |  | |||
							
								
								
									
										136
									
								
								Objects/stringlib/find_max_char.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										136
									
								
								Objects/stringlib/find_max_char.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,136 @@ | |||
| /* Finding the optimal width of unicode characters in a buffer */ | ||||
| 
 | ||||
| #if STRINGLIB_IS_UNICODE | ||||
| 
 | ||||
| /* Mask to check or force alignment of a pointer to C 'long' boundaries */ | ||||
| #define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) | ||||
| 
 | ||||
| /* Mask to quickly check whether a C 'long' contains a | ||||
|    non-ASCII byte, i.e. a byte with its high bit (0x80) set. */ | ||||
| #if (SIZEOF_LONG == 8) | ||||
| # define UCS1_ASCII_CHAR_MASK 0x8080808080808080L | ||||
| #elif (SIZEOF_LONG == 4) | ||||
| # define UCS1_ASCII_CHAR_MASK 0x80808080L | ||||
| #else | ||||
| # error C 'long' size should be either 4 or 8! | ||||
| #endif | ||||
| 
 | ||||
| #if STRINGLIB_SIZEOF_CHAR == 1 | ||||
| 
 | ||||
| Py_LOCAL_INLINE(Py_UCS4) | ||||
| STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) | ||||
| { | ||||
|     const unsigned char *p = (const unsigned char *) begin; | ||||
|     const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); | ||||
| 
 | ||||
|     while (p < end) { | ||||
|         if (!((size_t) p & LONG_PTR_MASK)) { | ||||
|             /* Help register allocation */ | ||||
|             register const unsigned char *_p = p; | ||||
|             while (_p < aligned_end) { | ||||
|                 unsigned long value = *(unsigned long *) _p; | ||||
|                 if (value & UCS1_ASCII_CHAR_MASK) | ||||
|                     return 255; | ||||
|                 _p += SIZEOF_LONG; | ||||
|             } | ||||
|             p = _p; | ||||
|             if (p == end) | ||||
|                 break; | ||||
|         } | ||||
|         if (*p++ & 0x80) | ||||
|             return 255; | ||||
|     } | ||||
|     return 127; | ||||
| } | ||||
| 
 | ||||
/* Undefine the helper macros defined at the top of this header.  The second
   name must be UCS1_ASCII_CHAR_MASK: that is the macro actually defined
   above, whereas ASCII_CHAR_MASK is never defined in this header. */
#undef LONG_PTR_MASK
#undef UCS1_ASCII_CHAR_MASK
| 
 | ||||
| #else /* STRINGLIB_SIZEOF_CHAR == 1 */ | ||||
| 
 | ||||
| #define MASK_ASCII 0xFFFFFF80 | ||||
| #define MASK_UCS1 0xFFFFFF00 | ||||
| #define MASK_UCS2 0xFFFF0000 | ||||
| 
 | ||||
| #define MAX_CHAR_ASCII 0x7f | ||||
| #define MAX_CHAR_UCS1  0xff | ||||
| #define MAX_CHAR_UCS2  0xffff | ||||
| #define MAX_CHAR_UCS4  0x10ffff | ||||
| 
 | ||||
| Py_LOCAL_INLINE(Py_UCS4) | ||||
| STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) | ||||
| { | ||||
| #if STRINGLIB_SIZEOF_CHAR == 2 | ||||
|     const Py_UCS4 mask_limit = MASK_UCS1; | ||||
|     const Py_UCS4 max_char_limit = MAX_CHAR_UCS2; | ||||
| #elif STRINGLIB_SIZEOF_CHAR == 4 | ||||
|     const Py_UCS4 mask_limit = MASK_UCS2; | ||||
|     const Py_UCS4 max_char_limit = MAX_CHAR_UCS4; | ||||
| #else | ||||
| #error Invalid STRINGLIB_SIZEOF_CHAR (must be 1, 2 or 4) | ||||
| #endif | ||||
|     register Py_UCS4 mask; | ||||
|     Py_ssize_t n = end - begin; | ||||
|     const STRINGLIB_CHAR *p = begin; | ||||
|     const STRINGLIB_CHAR *unrolled_end = begin + (n & ~ (Py_ssize_t) 3); | ||||
|     Py_UCS4 max_char; | ||||
| 
 | ||||
|     max_char = MAX_CHAR_ASCII; | ||||
|     mask = MASK_ASCII; | ||||
|     while (p < unrolled_end) { | ||||
|         STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3]; | ||||
|         if (bits & mask) { | ||||
|             if (mask == mask_limit) { | ||||
|                 /* Limit reached */ | ||||
|                 return max_char_limit; | ||||
|             } | ||||
|             if (mask == MASK_ASCII) { | ||||
|                 max_char = MAX_CHAR_UCS1; | ||||
|                 mask = MASK_UCS1; | ||||
|             } | ||||
|             else { | ||||
|                 /* mask can't be MASK_UCS2 because of mask_limit above */ | ||||
|                 assert(mask == MASK_UCS1); | ||||
|                 max_char = MAX_CHAR_UCS2; | ||||
|                 mask = MASK_UCS2; | ||||
|             } | ||||
|             /* We check the new mask on the same chars in the next iteration */ | ||||
|             continue; | ||||
|         } | ||||
|         p += 4; | ||||
|     } | ||||
|     while (p < end) { | ||||
|         if (p[0] & mask) { | ||||
|             if (mask == mask_limit) { | ||||
|                 /* Limit reached */ | ||||
|                 return max_char_limit; | ||||
|             } | ||||
|             if (mask == MASK_ASCII) { | ||||
|                 max_char = MAX_CHAR_UCS1; | ||||
|                 mask = MASK_UCS1; | ||||
|             } | ||||
|             else { | ||||
|                 /* mask can't be MASK_UCS2 because of mask_limit above */ | ||||
|                 assert(mask == MASK_UCS1); | ||||
|                 max_char = MAX_CHAR_UCS2; | ||||
|                 mask = MASK_UCS2; | ||||
|             } | ||||
|             /* We check the new mask on the same chars in the next iteration */ | ||||
|             continue; | ||||
|         } | ||||
|         p++; | ||||
|     } | ||||
|     return max_char; | ||||
| } | ||||
| 
 | ||||
| #undef MASK_ASCII | ||||
| #undef MASK_UCS1 | ||||
| #undef MASK_UCS2 | ||||
| #undef MAX_CHAR_ASCII | ||||
| #undef MAX_CHAR_UCS1 | ||||
| #undef MAX_CHAR_UCS2 | ||||
| #undef MAX_CHAR_UCS4 | ||||
| 
 | ||||
| #endif /* STRINGLIB_SIZEOF_CHAR == 1 */ | ||||
| #endif /* STRINGLIB_IS_UNICODE */ | ||||
| 
 | ||||
|  | @ -259,6 +259,15 @@ static void copy_characters( | |||
| static int unicode_is_singleton(PyObject *unicode); | ||||
| #endif | ||||
| 
 | ||||
| static PyObject * | ||||
| unicode_fromascii(const unsigned char *s, Py_ssize_t size); | ||||
| static PyObject * | ||||
| _PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size); | ||||
| static PyObject * | ||||
| _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); | ||||
| static PyObject * | ||||
| _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); | ||||
| 
 | ||||
| static PyObject * | ||||
| unicode_encode_call_errorhandler(const char *errors, | ||||
|        PyObject **errorHandler,const char *encoding, const char *reason, | ||||
|  | @ -468,6 +477,48 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len) | |||
|     (BLOOM(mask, chr) \ | ||||
|      && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) | ||||
| 
 | ||||
| /* Compilation of templated routines */ | ||||
| 
 | ||||
| #include "stringlib/asciilib.h" | ||||
| #include "stringlib/fastsearch.h" | ||||
| #include "stringlib/partition.h" | ||||
| #include "stringlib/split.h" | ||||
| #include "stringlib/count.h" | ||||
| #include "stringlib/find.h" | ||||
| #include "stringlib/find_max_char.h" | ||||
| #include "stringlib/localeutil.h" | ||||
| #include "stringlib/undef.h" | ||||
| 
 | ||||
| #include "stringlib/ucs1lib.h" | ||||
| #include "stringlib/fastsearch.h" | ||||
| #include "stringlib/partition.h" | ||||
| #include "stringlib/split.h" | ||||
| #include "stringlib/count.h" | ||||
| #include "stringlib/find.h" | ||||
| #include "stringlib/find_max_char.h" | ||||
| #include "stringlib/localeutil.h" | ||||
| #include "stringlib/undef.h" | ||||
| 
 | ||||
| #include "stringlib/ucs2lib.h" | ||||
| #include "stringlib/fastsearch.h" | ||||
| #include "stringlib/partition.h" | ||||
| #include "stringlib/split.h" | ||||
| #include "stringlib/count.h" | ||||
| #include "stringlib/find.h" | ||||
| #include "stringlib/find_max_char.h" | ||||
| #include "stringlib/localeutil.h" | ||||
| #include "stringlib/undef.h" | ||||
| 
 | ||||
| #include "stringlib/ucs4lib.h" | ||||
| #include "stringlib/fastsearch.h" | ||||
| #include "stringlib/partition.h" | ||||
| #include "stringlib/split.h" | ||||
| #include "stringlib/count.h" | ||||
| #include "stringlib/find.h" | ||||
| #include "stringlib/find_max_char.h" | ||||
| #include "stringlib/localeutil.h" | ||||
| #include "stringlib/undef.h" | ||||
| 
 | ||||
| /* --- Unicode Object ----------------------------------------------------- */ | ||||
| 
 | ||||
| static PyObject * | ||||
|  | @ -1689,17 +1740,11 @@ _PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) | |||
| { | ||||
|     PyObject *res; | ||||
|     unsigned char max_char = 127; | ||||
|     Py_ssize_t i; | ||||
| 
 | ||||
|     assert(size >= 0); | ||||
|     if (size == 1) | ||||
|         return get_latin1_char(u[0]); | ||||
|     for (i = 0; i < size; i++) { | ||||
|         if (u[i] & 0x80) { | ||||
|             max_char = 255; | ||||
|             break; | ||||
|         } | ||||
|     } | ||||
|     max_char = ucs1lib_find_max_char(u, u + size); | ||||
|     res = PyUnicode_New(size, max_char); | ||||
|     if (!res) | ||||
|         return NULL; | ||||
|  | @ -1713,26 +1758,20 @@ _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) | |||
| { | ||||
|     PyObject *res; | ||||
|     Py_UCS2 max_char = 0; | ||||
|     Py_ssize_t i; | ||||
| 
 | ||||
|     assert(size >= 0); | ||||
|     if (size == 1 && u[0] < 256) | ||||
|         return get_latin1_char((unsigned char)u[0]); | ||||
|     for (i = 0; i < size; i++) { | ||||
|         if (u[i] > max_char) { | ||||
|             max_char = u[i]; | ||||
|             if (max_char >= 256) | ||||
|                 break; | ||||
|         } | ||||
|     } | ||||
|     max_char = ucs2lib_find_max_char(u, u + size); | ||||
|     res = PyUnicode_New(size, max_char); | ||||
|     if (!res) | ||||
|         return NULL; | ||||
|     if (max_char >= 256) | ||||
|         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); | ||||
|     else | ||||
|         for (i = 0; i < size; i++) | ||||
|             PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i]; | ||||
|     else { | ||||
|         _PyUnicode_CONVERT_BYTES( | ||||
|             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); | ||||
|     } | ||||
|     assert(_PyUnicode_CheckConsistency(res, 1)); | ||||
|     return res; | ||||
| } | ||||
|  | @ -1742,18 +1781,11 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) | |||
| { | ||||
|     PyObject *res; | ||||
|     Py_UCS4 max_char = 0; | ||||
|     Py_ssize_t i; | ||||
| 
 | ||||
|     assert(size >= 0); | ||||
|     if (size == 1 && u[0] < 256) | ||||
|         return get_latin1_char(u[0]); | ||||
|     for (i = 0; i < size; i++) { | ||||
|         if (u[i] > max_char) { | ||||
|             max_char = u[i]; | ||||
|             if (max_char >= 0x10000) | ||||
|                 break; | ||||
|         } | ||||
|     } | ||||
|     max_char = ucs4lib_find_max_char(u, u + size); | ||||
|     res = PyUnicode_New(size, max_char); | ||||
|     if (!res) | ||||
|         return NULL; | ||||
|  | @ -1794,7 +1826,7 @@ unicode_adjust_maxchar(PyObject **p_unicode) | |||
| { | ||||
|     PyObject *unicode, *copy; | ||||
|     Py_UCS4 max_char; | ||||
|     Py_ssize_t i, len; | ||||
|     Py_ssize_t len; | ||||
|     unsigned int kind; | ||||
| 
 | ||||
|     assert(p_unicode != NULL); | ||||
|  | @ -1807,37 +1839,23 @@ unicode_adjust_maxchar(PyObject **p_unicode) | |||
|     kind = PyUnicode_KIND(unicode); | ||||
|     if (kind == PyUnicode_1BYTE_KIND) { | ||||
|         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); | ||||
|         for (i = 0; i < len; i++) { | ||||
|             if (u[i] & 0x80) | ||||
|                 return; | ||||
|         } | ||||
|         max_char = 127; | ||||
|         max_char = ucs1lib_find_max_char(u, u + len); | ||||
|         if (max_char >= 128) | ||||
|             return; | ||||
|     } | ||||
|     else if (kind == PyUnicode_2BYTE_KIND) { | ||||
|         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); | ||||
|         max_char = 0; | ||||
|         for (i = 0; i < len; i++) { | ||||
|             if (u[i] > max_char) { | ||||
|                 max_char = u[i]; | ||||
|                 if (max_char >= 256) | ||||
|                     return; | ||||
|             } | ||||
|         } | ||||
|         max_char = ucs2lib_find_max_char(u, u + len); | ||||
|         if (max_char >= 256) | ||||
|             return; | ||||
|     } | ||||
|     else { | ||||
|         const Py_UCS4 *u; | ||||
|         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); | ||||
|         assert(kind == PyUnicode_4BYTE_KIND); | ||||
|         u = PyUnicode_4BYTE_DATA(unicode); | ||||
|         max_char = 0; | ||||
|         for (i = 0; i < len; i++) { | ||||
|             if (u[i] > max_char) { | ||||
|                 max_char = u[i]; | ||||
|                 if (max_char >= 0x10000) | ||||
|                     return; | ||||
|             } | ||||
|         } | ||||
|         max_char = ucs4lib_find_max_char(u, u + len); | ||||
|         if (max_char >= 0x10000) | ||||
|             return; | ||||
|     } | ||||
|     assert(max_char < PyUnicode_MAX_CHAR_VALUE(unicode)); | ||||
|     copy = PyUnicode_New(len, max_char); | ||||
|     copy_characters(copy, 0, unicode, 0, len); | ||||
|     Py_DECREF(unicode); | ||||
|  | @ -8508,42 +8526,6 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s, | |||
| 
 | ||||
| /* --- Helpers ------------------------------------------------------------ */ | ||||
| 
 | ||||
| #include "stringlib/asciilib.h" | ||||
| #include "stringlib/fastsearch.h" | ||||
| #include "stringlib/partition.h" | ||||
| #include "stringlib/split.h" | ||||
| #include "stringlib/count.h" | ||||
| #include "stringlib/find.h" | ||||
| #include "stringlib/localeutil.h" | ||||
| #include "stringlib/undef.h" | ||||
| 
 | ||||
| #include "stringlib/ucs1lib.h" | ||||
| #include "stringlib/fastsearch.h" | ||||
| #include "stringlib/partition.h" | ||||
| #include "stringlib/split.h" | ||||
| #include "stringlib/count.h" | ||||
| #include "stringlib/find.h" | ||||
| #include "stringlib/localeutil.h" | ||||
| #include "stringlib/undef.h" | ||||
| 
 | ||||
| #include "stringlib/ucs2lib.h" | ||||
| #include "stringlib/fastsearch.h" | ||||
| #include "stringlib/partition.h" | ||||
| #include "stringlib/split.h" | ||||
| #include "stringlib/count.h" | ||||
| #include "stringlib/find.h" | ||||
| #include "stringlib/localeutil.h" | ||||
| #include "stringlib/undef.h" | ||||
| 
 | ||||
| #include "stringlib/ucs4lib.h" | ||||
| #include "stringlib/fastsearch.h" | ||||
| #include "stringlib/partition.h" | ||||
| #include "stringlib/split.h" | ||||
| #include "stringlib/count.h" | ||||
| #include "stringlib/find.h" | ||||
| #include "stringlib/localeutil.h" | ||||
| #include "stringlib/undef.h" | ||||
| 
 | ||||
| static Py_ssize_t | ||||
| any_find_slice(int direction, PyObject* s1, PyObject* s2, | ||||
|                Py_ssize_t start, | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Antoine Pitrou
						Antoine Pitrou