Mirror of https://github.com/python/cpython.git (synced 2025-10-31 21:51:50 +00:00)
			
		
		
		
	Issue #11461: Fix the incremental UTF-16 decoder. Original patch by
Amaury Forgeot d'Arc. Added tests for partial decoding of non-BMP characters.
This commit is contained in:
		
						commit
						ae3b32ad6b
					
				
					 3 changed files with 47 additions and 9 deletions
				
			
		|  | @ -330,7 +330,7 @@ def test_badbom(self): | |||
| 
 | ||||
|     def test_partial(self): | ||||
|         self.check_partial( | ||||
|             "\x00\xff\u0100\uffff", | ||||
|             "\x00\xff\u0100\uffff\U00010000", | ||||
|             [ | ||||
|                 "", # first byte of BOM read | ||||
|                 "", # second byte of BOM read | ||||
|  | @ -352,6 +352,10 @@ def test_partial(self): | |||
|                 "\x00\xff\u0100", | ||||
|                 "\x00\xff\u0100", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff\U00010000", | ||||
|             ] | ||||
|         ) | ||||
| 
 | ||||
|  | @ -386,7 +390,7 @@ class UTF32LETest(ReadTest): | |||
| 
 | ||||
|     def test_partial(self): | ||||
|         self.check_partial( | ||||
|             "\x00\xff\u0100\uffff", | ||||
|             "\x00\xff\u0100\uffff\U00010000", | ||||
|             [ | ||||
|                 "", | ||||
|                 "", | ||||
|  | @ -404,6 +408,10 @@ def test_partial(self): | |||
|                 "\x00\xff\u0100", | ||||
|                 "\x00\xff\u0100", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff\U00010000", | ||||
|             ] | ||||
|         ) | ||||
| 
 | ||||
|  | @ -426,7 +434,7 @@ class UTF32BETest(ReadTest): | |||
| 
 | ||||
|     def test_partial(self): | ||||
|         self.check_partial( | ||||
|             "\x00\xff\u0100\uffff", | ||||
|             "\x00\xff\u0100\uffff\U00010000", | ||||
|             [ | ||||
|                 "", | ||||
|                 "", | ||||
|  | @ -444,6 +452,10 @@ def test_partial(self): | |||
|                 "\x00\xff\u0100", | ||||
|                 "\x00\xff\u0100", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff\U00010000", | ||||
|             ] | ||||
|         ) | ||||
| 
 | ||||
|  | @ -494,7 +506,7 @@ def test_badbom(self): | |||
| 
 | ||||
|     def test_partial(self): | ||||
|         self.check_partial( | ||||
|             "\x00\xff\u0100\uffff", | ||||
|             "\x00\xff\u0100\uffff\U00010000", | ||||
|             [ | ||||
|                 "", # first byte of BOM read | ||||
|                 "", # second byte of BOM read => byteorder known | ||||
|  | @ -506,6 +518,10 @@ def test_partial(self): | |||
|                 "\x00\xff\u0100", | ||||
|                 "\x00\xff\u0100", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff\U00010000", | ||||
|             ] | ||||
|         ) | ||||
| 
 | ||||
|  | @ -543,7 +559,7 @@ class UTF16LETest(ReadTest): | |||
| 
 | ||||
|     def test_partial(self): | ||||
|         self.check_partial( | ||||
|             "\x00\xff\u0100\uffff", | ||||
|             "\x00\xff\u0100\uffff\U00010000", | ||||
|             [ | ||||
|                 "", | ||||
|                 "\x00", | ||||
|  | @ -553,6 +569,10 @@ def test_partial(self): | |||
|                 "\x00\xff\u0100", | ||||
|                 "\x00\xff\u0100", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff\U00010000", | ||||
|             ] | ||||
|         ) | ||||
| 
 | ||||
|  | @ -582,7 +602,7 @@ class UTF16BETest(ReadTest): | |||
| 
 | ||||
|     def test_partial(self): | ||||
|         self.check_partial( | ||||
|             "\x00\xff\u0100\uffff", | ||||
|             "\x00\xff\u0100\uffff\U00010000", | ||||
|             [ | ||||
|                 "", | ||||
|                 "\x00", | ||||
|  | @ -592,6 +612,10 @@ def test_partial(self): | |||
|                 "\x00\xff\u0100", | ||||
|                 "\x00\xff\u0100", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff", | ||||
|                 "\x00\xff\u0100\uffff\U00010000", | ||||
|             ] | ||||
|         ) | ||||
| 
 | ||||
|  | @ -621,7 +645,7 @@ class UTF8Test(ReadTest): | |||
| 
 | ||||
|     def test_partial(self): | ||||
|         self.check_partial( | ||||
|             "\x00\xff\u07ff\u0800\uffff", | ||||
|             "\x00\xff\u07ff\u0800\uffff\U00010000", | ||||
|             [ | ||||
|                 "\x00", | ||||
|                 "\x00", | ||||
|  | @ -634,6 +658,10 @@ def test_partial(self): | |||
|                 "\x00\xff\u07ff\u0800", | ||||
|                 "\x00\xff\u07ff\u0800", | ||||
|                 "\x00\xff\u07ff\u0800\uffff", | ||||
|                 "\x00\xff\u07ff\u0800\uffff", | ||||
|                 "\x00\xff\u07ff\u0800\uffff", | ||||
|                 "\x00\xff\u07ff\u0800\uffff", | ||||
|                 "\x00\xff\u07ff\u0800\uffff\U00010000", | ||||
|             ] | ||||
|         ) | ||||
| 
 | ||||
|  | @ -816,7 +844,7 @@ class UTF8SigTest(ReadTest): | |||
| 
 | ||||
|     def test_partial(self): | ||||
|         self.check_partial( | ||||
|             "\ufeff\x00\xff\u07ff\u0800\uffff", | ||||
|             "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000", | ||||
|             [ | ||||
|                 "", | ||||
|                 "", | ||||
|  | @ -835,6 +863,10 @@ def test_partial(self): | |||
|                 "\ufeff\x00\xff\u07ff\u0800", | ||||
|                 "\ufeff\x00\xff\u07ff\u0800", | ||||
|                 "\ufeff\x00\xff\u07ff\u0800\uffff", | ||||
|                 "\ufeff\x00\xff\u07ff\u0800\uffff", | ||||
|                 "\ufeff\x00\xff\u07ff\u0800\uffff", | ||||
|                 "\ufeff\x00\xff\u07ff\u0800\uffff", | ||||
|                 "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000", | ||||
|             ] | ||||
|         ) | ||||
| 
 | ||||
|  |  | |||
|  | @ -12,6 +12,9 @@ What's New in Python 3.3.1? | |||
| Core and Builtins | ||||
| ----------------- | ||||
| 
 | ||||
| - Issue #11461: Fix the incremental UTF-16 decoder. Original patch by | ||||
|   Amaury Forgeot d'Arc. | ||||
| 
 | ||||
| - Issue #16881: Fix Py_ARRAY_LENGTH macro for GCC < 3.1. | ||||
| 
 | ||||
| - Issue #16856: Fix a segmentation fault from calling repr() on a dict with | ||||
|  |  | |||
|  | @ -5284,8 +5284,11 @@ PyUnicode_DecodeUTF16Stateful(const char *s, | |||
|             /* The remaining input chars are ignored if the callback
 | ||||
|                chooses to skip the input */ | ||||
|         case 1: | ||||
|             q -= 2; | ||||
|             if (consumed) | ||||
|                 goto End; | ||||
|             errmsg = "unexpected end of data"; | ||||
|             startinpos = ((const char *)q) - 2 - starts; | ||||
|             startinpos = ((const char *)q) - starts; | ||||
|             endinpos = ((const char *)e) - starts; | ||||
|             break; | ||||
|         case 2: | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Serhiy Storchaka
						Serhiy Storchaka