mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 21:51:50 +00:00 
			
		
		
		
	#1477: ur'\U0010FFFF' raised in narrow unicode builds.
Corrected the raw-unicode-escape codec to use UTF-16 surrogates in this case, just like the unicode-escape codec.
This commit is contained in:
		
							parent
							
								
									61854332b9
								
							
						
					
					
						commit
						9a0d3462fc
					
				
					 3 changed files with 63 additions and 6 deletions
				
			
		|  | @ -736,12 +736,25 @@ def write(self, text): | ||||||
|         print >>out, u'def\n' |         print >>out, u'def\n' | ||||||
| 
 | 
 | ||||||
|     def test_ucs4(self): |     def test_ucs4(self): | ||||||
|         if sys.maxunicode == 0xFFFF: |  | ||||||
|             return |  | ||||||
|         x = u'\U00100000' |         x = u'\U00100000' | ||||||
|         y = x.encode("raw-unicode-escape").decode("raw-unicode-escape") |         y = x.encode("raw-unicode-escape").decode("raw-unicode-escape") | ||||||
|         self.assertEqual(x, y) |         self.assertEqual(x, y) | ||||||
| 
 | 
 | ||||||
|  |         y = r'\U00100000' | ||||||
|  |         x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") | ||||||
|  |         self.assertEqual(x, y) | ||||||
|  |         y = r'\U00010000' | ||||||
|  |         x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") | ||||||
|  |         self.assertEqual(x, y) | ||||||
|  | 
 | ||||||
|  |         try: | ||||||
|  |             '\U11111111'.decode("raw-unicode-escape") | ||||||
|  |         except UnicodeDecodeError as e: | ||||||
|  |             self.assertEqual(e.start, 0) | ||||||
|  |             self.assertEqual(e.end, 10) | ||||||
|  |         else: | ||||||
|  |             self.fail("Should have raised UnicodeDecodeError") | ||||||
|  | 
 | ||||||
|     def test_conversion(self): |     def test_conversion(self): | ||||||
|         # Make sure __unicode__() works properly |         # Make sure __unicode__() works properly | ||||||
|         class Foo0: |         class Foo0: | ||||||
|  |  | ||||||
|  | @ -12,6 +12,12 @@ What's New in Python 2.6 alpha 2? | ||||||
| Core and builtins | Core and builtins | ||||||
| ----------------- | ----------------- | ||||||
|   |   | ||||||
|  | - Issue #1477: With narrow Unicode builds, the unicode escape sequence | ||||||
|  |   \Uxxxxxxxx did not accept values outside the Basic Multilingual Plane.  This | ||||||
|  |   affected raw unicode literals and the 'raw-unicode-escape' codec.  Now | ||||||
|  |   UTF-16 surrogates are generated in this case, like normal unicode literals | ||||||
|  |   and the 'unicode-escape' codec. | ||||||
|  | 
 | ||||||
| - Issue #2348: add Py3k warning for file.softspace. | - Issue #2348: add Py3k warning for file.softspace. | ||||||
| 
 | 
 | ||||||
| - Issue #2346/#2347: add Py3k warnings for __methods__ and __members__. | - Issue #2346/#2347: add Py3k warnings for __methods__ and __members__. | ||||||
|  |  | ||||||
|  | @ -3088,8 +3088,22 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, | ||||||
| 	    else | 	    else | ||||||
| 		x += 10 + c - 'A'; | 		x += 10 + c - 'A'; | ||||||
| 	} | 	} | ||||||
| #ifndef Py_UNICODE_WIDE |         if (x <= 0xffff) | ||||||
|         if (x > 0x10000) { |                 /* UCS-2 character */ | ||||||
|  |                 *p++ = (Py_UNICODE) x; | ||||||
|  |         else if (x <= 0x10ffff) { | ||||||
|  |                 /* UCS-4 character. Either store directly, or as
 | ||||||
|  |                    surrogate pair. */ | ||||||
|  | #ifdef Py_UNICODE_WIDE | ||||||
|  |                 *p++ = (Py_UNICODE) x; | ||||||
|  | #else | ||||||
|  |                 x -= 0x10000L; | ||||||
|  |                 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); | ||||||
|  |                 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); | ||||||
|  | #endif | ||||||
|  |         } else { | ||||||
|  |             endinpos = s-starts; | ||||||
|  |             outpos = p-PyUnicode_AS_UNICODE(v); | ||||||
|             if (unicode_decode_call_errorhandler( |             if (unicode_decode_call_errorhandler( | ||||||
|                     errors, &errorHandler, |                     errors, &errorHandler, | ||||||
|                     "rawunicodeescape", "\\Uxxxxxxxx out of range", |                     "rawunicodeescape", "\\Uxxxxxxxx out of range", | ||||||
|  | @ -3097,8 +3111,6 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, | ||||||
| 		    (PyObject **)&v, &outpos, &p)) | 		    (PyObject **)&v, &outpos, &p)) | ||||||
| 		    goto onError; | 		    goto onError; | ||||||
|         } |         } | ||||||
| #endif |  | ||||||
| 	*p++ = x; |  | ||||||
| 	nextByte: | 	nextByte: | ||||||
| 	; | 	; | ||||||
|     } |     } | ||||||
|  | @ -3152,6 +3164,32 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, | ||||||
|             *p++ = hexdigit[ch & 15]; |             *p++ = hexdigit[ch & 15]; | ||||||
|         } |         } | ||||||
|         else |         else | ||||||
|  | #else | ||||||
|  | 	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ | ||||||
|  | 	if (ch >= 0xD800 && ch < 0xDC00) { | ||||||
|  | 	    Py_UNICODE ch2; | ||||||
|  | 	    Py_UCS4 ucs; | ||||||
|  | 
 | ||||||
|  | 	    ch2 = *s++; | ||||||
|  | 	    size--; | ||||||
|  | 	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { | ||||||
|  | 		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; | ||||||
|  | 		*p++ = '\\'; | ||||||
|  | 		*p++ = 'U'; | ||||||
|  | 		*p++ = hexdigit[(ucs >> 28) & 0xf]; | ||||||
|  | 		*p++ = hexdigit[(ucs >> 24) & 0xf]; | ||||||
|  | 		*p++ = hexdigit[(ucs >> 20) & 0xf]; | ||||||
|  | 		*p++ = hexdigit[(ucs >> 16) & 0xf]; | ||||||
|  | 		*p++ = hexdigit[(ucs >> 12) & 0xf]; | ||||||
|  | 		*p++ = hexdigit[(ucs >> 8) & 0xf]; | ||||||
|  | 		*p++ = hexdigit[(ucs >> 4) & 0xf]; | ||||||
|  | 		*p++ = hexdigit[ucs & 0xf]; | ||||||
|  | 		continue; | ||||||
|  | 	    } | ||||||
|  | 	    /* Fall through: isolated surrogates are copied as-is */ | ||||||
|  | 	    s--; | ||||||
|  | 	    size++; | ||||||
|  | 	} | ||||||
| #endif | #endif | ||||||
| 	/* Map 16-bit characters to '\uxxxx' */ | 	/* Map 16-bit characters to '\uxxxx' */ | ||||||
| 	if (ch >= 256) { | 	if (ch >= 256) { | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Amaury Forgeot d'Arc
						Amaury Forgeot d'Arc