mirror of
				https://github.com/python/cpython.git
				synced 2025-10-25 18:54:53 +00:00 
			
		
		
		
	Close #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster.
Patch written by Serhiy Storchaka.
This commit is contained in:
		
							parent
							
								
									d4156c1693
								
							
						
					
					
						commit
						e64322e034
					
				
					 3 changed files with 73 additions and 74 deletions
				
			
		|  | @ -157,7 +157,7 @@ Optimizations | |||
| 
 | ||||
| Major performance enhancements have been added: | ||||
| 
 | ||||
| * None yet. | ||||
| * The UTF-32 decoder is now 3x to 4x faster. | ||||
| 
 | ||||
| 
 | ||||
| Build and C API Changes | ||||
|  |  | |||
|  | @ -10,6 +10,9 @@ What's New in Python 3.4.0 Alpha 1? | |||
| Core and Builtins | ||||
| ----------------- | ||||
| 
 | ||||
| - Issue #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster. Patch | ||||
|   written by Serhiy Storchaka. | ||||
| 
 | ||||
| - Issue #16197: Update winreg docstrings and documentation to match code. | ||||
|   Patch by Zachary Ware. | ||||
| 
 | ||||
|  |  | |||
|  | @ -4804,14 +4804,8 @@ PyUnicode_DecodeUTF32Stateful(const char *s, | |||
|     Py_ssize_t outpos; | ||||
|     PyObject *unicode; | ||||
|     const unsigned char *q, *e; | ||||
|     int bo = 0;       /* assume native ordering by default */ | ||||
|     int le, bo = 0;       /* assume native ordering by default */ | ||||
|     const char *errmsg = ""; | ||||
|     /* Offsets from q for retrieving bytes in the right order. */ | ||||
| #if PY_LITTLE_ENDIAN | ||||
|     int iorder[] = {0, 1, 2, 3}; | ||||
| #else | ||||
|     int iorder[] = {3, 2, 1, 0}; | ||||
| #endif | ||||
|     PyObject *errorHandler = NULL; | ||||
|     PyObject *exc = NULL; | ||||
| 
 | ||||
|  | @ -4825,83 +4819,88 @@ PyUnicode_DecodeUTF32Stateful(const char *s, | |||
|        byte order setting accordingly. In native mode, the leading BOM | ||||
|        mark is skipped, in all other modes, it is copied to the output | ||||
|        stream as-is (giving a ZWNBSP character). */ | ||||
|     if (bo == 0) { | ||||
|         if (size >= 4) { | ||||
|             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | | ||||
|                 (q[iorder[1]] << 8) | q[iorder[0]]; | ||||
| #if PY_LITTLE_ENDIAN | ||||
|     if (bo == 0 && size >= 4) { | ||||
|         Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; | ||||
|         if (bom == 0x0000FEFF) { | ||||
|                 q += 4; | ||||
|             bo = -1; | ||||
|             q += 4; | ||||
|         } | ||||
|         else if (bom == 0xFFFE0000) { | ||||
|                 q += 4; | ||||
|             bo = 1; | ||||
|             q += 4; | ||||
|         } | ||||
|         if (byteorder) | ||||
|             *byteorder = bo; | ||||
|     } | ||||
| 
 | ||||
|     if (q == e) { | ||||
|         if (consumed) | ||||
|             *consumed = size; | ||||
|         Py_INCREF(unicode_empty); | ||||
|         return unicode_empty; | ||||
|     } | ||||
| 
 | ||||
| #ifdef WORDS_BIGENDIAN | ||||
|     le = bo < 0; | ||||
| #else | ||||
|             if (bom == 0x0000FEFF) { | ||||
|                 q += 4; | ||||
|                 bo = 1; | ||||
|             } | ||||
|             else if (bom == 0xFFFE0000) { | ||||
|                 q += 4; | ||||
|                 bo = -1; | ||||
|             } | ||||
|     le = bo <= 0; | ||||
| #endif | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     if (bo == -1) { | ||||
|         /* force LE */ | ||||
|         iorder[0] = 0; | ||||
|         iorder[1] = 1; | ||||
|         iorder[2] = 2; | ||||
|         iorder[3] = 3; | ||||
|     } | ||||
|     else if (bo == 1) { | ||||
|         /* force BE */ | ||||
|         iorder[0] = 3; | ||||
|         iorder[1] = 2; | ||||
|         iorder[2] = 1; | ||||
|         iorder[3] = 0; | ||||
|     } | ||||
| 
 | ||||
|     /* This might be one too many, because of a BOM */ | ||||
|     unicode = PyUnicode_New((size+3)/4, 127); | ||||
|     unicode = PyUnicode_New((e - q + 3) / 4, 127); | ||||
|     if (!unicode) | ||||
|         return NULL; | ||||
|     if (size == 0) | ||||
|         return unicode; | ||||
|     outpos = 0; | ||||
| 
 | ||||
|     while (q < e) { | ||||
|         Py_UCS4 ch; | ||||
|         /* remaining bytes at the end? (size should be divisible by 4) */ | ||||
|         if (e-q<4) { | ||||
|             if (consumed) | ||||
|     outpos = 0; | ||||
|     while (1) { | ||||
|         Py_UCS4 ch = 0; | ||||
|         Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(unicode); | ||||
| 
 | ||||
|         if (e - q >= 4) { | ||||
|             enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); | ||||
|             void *data = PyUnicode_DATA(unicode); | ||||
|             const unsigned char *last = e - 4; | ||||
|             if (le) { | ||||
|                 do { | ||||
|                     ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; | ||||
|                     if (ch > maxch) | ||||
|                         break; | ||||
|                     PyUnicode_WRITE(kind, data, outpos++, ch); | ||||
|                     q += 4; | ||||
|                 } while (q <= last); | ||||
|             } | ||||
|             else { | ||||
|                 do { | ||||
|                     ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; | ||||
|                     if (ch > maxch) | ||||
|                         break; | ||||
|                     PyUnicode_WRITE(kind, data, outpos++, ch); | ||||
|                     q += 4; | ||||
|                 } while (q <= last); | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         if (ch <= maxch) { | ||||
|             if (q == e || consumed) | ||||
|                 break; | ||||
|             /* remaining bytes at the end? (size should be divisible by 4) */ | ||||
|             errmsg = "truncated data"; | ||||
|             startinpos = ((const char *)q) - starts; | ||||
|             endinpos = ((const char *)e) - starts; | ||||
|             goto utf32Error; | ||||
|             /* The remaining input chars are ignored if the callback
 | ||||
|                chooses to skip the input */ | ||||
|         } | ||||
|         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | | ||||
|             (q[iorder[1]] << 8) | q[iorder[0]]; | ||||
| 
 | ||||
|         if (ch >= 0x110000) | ||||
|         { | ||||
|             errmsg = "codepoint not in range(0x110000)"; | ||||
|             startinpos = ((const char *)q)-starts; | ||||
|             endinpos = startinpos+4; | ||||
|             goto utf32Error; | ||||
|         } | ||||
|         else { | ||||
|             if (ch < 0x110000) { | ||||
|                 if (unicode_putchar(&unicode, &outpos, ch) < 0) | ||||
|                     goto onError; | ||||
|                 q += 4; | ||||
|                 continue; | ||||
|       utf32Error: | ||||
|             } | ||||
|             errmsg = "codepoint not in range(0x110000)"; | ||||
|             startinpos = ((const char *)q) - starts; | ||||
|             endinpos = startinpos + 4; | ||||
|         } | ||||
| 
 | ||||
|         /* The remaining input chars are ignored if the callback
 | ||||
|            chooses to skip the input */ | ||||
|         if (unicode_decode_call_errorhandler( | ||||
|                 errors, &errorHandler, | ||||
|                 "utf32", errmsg, | ||||
|  | @ -4910,9 +4909,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s, | |||
|             goto onError; | ||||
|     } | ||||
| 
 | ||||
|     if (byteorder) | ||||
|         *byteorder = bo; | ||||
| 
 | ||||
|     if (consumed) | ||||
|         *consumed = (const char *)q-starts; | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Victor Stinner
						Victor Stinner