mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 05:31:20 +00:00 
			
		
		
		
	#1477: ur'\U0010FFFF' raised in narrow unicode builds.
Corrected the raw-unicode-escape codec to use UTF-16 surrogates in this case, just like the unicode-escape codec.
This commit is contained in:
		
							parent
							
								
									61854332b9
								
							
						
					
					
						commit
						9a0d3462fc
					
				
					 3 changed files with 63 additions and 6 deletions
				
			
		|  | @ -736,12 +736,25 @@ def write(self, text): | |||
|         print >>out, u'def\n' | ||||
| 
 | ||||
|     def test_ucs4(self): | ||||
|         if sys.maxunicode == 0xFFFF: | ||||
|             return | ||||
|         x = u'\U00100000' | ||||
|         y = x.encode("raw-unicode-escape").decode("raw-unicode-escape") | ||||
|         self.assertEqual(x, y) | ||||
| 
 | ||||
|         y = r'\U00100000' | ||||
|         x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") | ||||
|         self.assertEqual(x, y) | ||||
|         y = r'\U00010000' | ||||
|         x = y.decode("raw-unicode-escape").encode("raw-unicode-escape") | ||||
|         self.assertEqual(x, y) | ||||
| 
 | ||||
|         try: | ||||
|             '\U11111111'.decode("raw-unicode-escape") | ||||
|         except UnicodeDecodeError as e: | ||||
|             self.assertEqual(e.start, 0) | ||||
|             self.assertEqual(e.end, 10) | ||||
|         else: | ||||
|             self.fail("Should have raised UnicodeDecodeError") | ||||
| 
 | ||||
|     def test_conversion(self): | ||||
|         # Make sure __unicode__() works properly | ||||
|         class Foo0: | ||||
|  |  | |||
|  | @ -12,6 +12,12 @@ What's New in Python 2.6 alpha 2? | |||
| Core and builtins | ||||
| ----------------- | ||||
|   | ||||
| - Issue #1477: With narrow Unicode builds, the unicode escape sequence | ||||
|   \Uxxxxxxxx did not accept values outside the Basic Multilingual Plane.  This | ||||
|   affected raw unicode literals and the 'raw-unicode-escape' codec.  Now | ||||
|   UTF-16 surrogates are generated in this case, like normal unicode literals | ||||
|   and the 'unicode-escape' codec. | ||||
| 
 | ||||
| - Issue #2348: add Py3k warning for file.softspace. | ||||
| 
 | ||||
| - Issue #2346/#2347: add Py3k warnings for __methods__ and __members__. | ||||
|  |  | |||
|  | @ -3088,8 +3088,22 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, | |||
| 	    else | ||||
| 		x += 10 + c - 'A'; | ||||
| 	} | ||||
| #ifndef Py_UNICODE_WIDE | ||||
|         if (x > 0x10000) { | ||||
|         if (x <= 0xffff) | ||||
|                 /* UCS-2 character */ | ||||
|                 *p++ = (Py_UNICODE) x; | ||||
|         else if (x <= 0x10ffff) { | ||||
|                 /* UCS-4 character. Either store directly, or as | ||||
|                    surrogate pair. */ | ||||
| #ifdef Py_UNICODE_WIDE | ||||
|                 *p++ = (Py_UNICODE) x; | ||||
| #else | ||||
|                 x -= 0x10000L; | ||||
|                 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); | ||||
|                 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); | ||||
| #endif | ||||
|         } else { | ||||
|             endinpos = s-starts; | ||||
|             outpos = p-PyUnicode_AS_UNICODE(v); | ||||
|             if (unicode_decode_call_errorhandler( | ||||
|                     errors, &errorHandler, | ||||
|                     "rawunicodeescape", "\\Uxxxxxxxx out of range", | ||||
|  | @ -3097,8 +3111,6 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, | |||
| 		    (PyObject **)&v, &outpos, &p)) | ||||
| 		    goto onError; | ||||
|         } | ||||
| #endif | ||||
| 	*p++ = x; | ||||
| 	nextByte: | ||||
| 	; | ||||
|     } | ||||
|  | @ -3152,6 +3164,32 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, | |||
|             *p++ = hexdigit[ch & 15]; | ||||
|         } | ||||
|         else | ||||
| #else | ||||
| 	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ | ||||
| 	if (ch >= 0xD800 && ch < 0xDC00) { | ||||
| 	    Py_UNICODE ch2; | ||||
| 	    Py_UCS4 ucs; | ||||
| 
 | ||||
| 	    ch2 = *s++; | ||||
| 	    size--; | ||||
| 	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { | ||||
| 		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; | ||||
| 		*p++ = '\\'; | ||||
| 		*p++ = 'U'; | ||||
| 		*p++ = hexdigit[(ucs >> 28) & 0xf]; | ||||
| 		*p++ = hexdigit[(ucs >> 24) & 0xf]; | ||||
| 		*p++ = hexdigit[(ucs >> 20) & 0xf]; | ||||
| 		*p++ = hexdigit[(ucs >> 16) & 0xf]; | ||||
| 		*p++ = hexdigit[(ucs >> 12) & 0xf]; | ||||
| 		*p++ = hexdigit[(ucs >> 8) & 0xf]; | ||||
| 		*p++ = hexdigit[(ucs >> 4) & 0xf]; | ||||
| 		*p++ = hexdigit[ucs & 0xf]; | ||||
| 		continue; | ||||
| 	    } | ||||
| 	    /* Fall through: isolated surrogates are copied as-is */ | ||||
| 	    s--; | ||||
| 	    size++; | ||||
| 	} | ||||
| #endif | ||||
| 	/* Map 16-bit characters to '\uxxxx' */ | ||||
| 	if (ch >= 256) { | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Amaury Forgeot d'Arc
						Amaury Forgeot d'Arc