mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 05:31:20 +00:00 
			
		
		
		
	#1477: ur'\U0010FFFF' raised an error in narrow unicode builds.
Corrected the raw-unicode-escape codec to use UTF-16 surrogates in this case, just like the unicode-escape codec.
This commit is contained in:
		
							parent
							
								
									61854332b9
								
							
						
					
					
						commit
						9a0d3462fc
					
				
					 3 changed files with 63 additions and 6 deletions
				
			
		|  | @ -3088,8 +3088,22 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, | |||
| 	    else | ||||
| 		x += 10 + c - 'A'; | ||||
| 	} | ||||
| #ifndef Py_UNICODE_WIDE | ||||
|         if (x > 0x10000) { | ||||
|         if (x <= 0xffff) | ||||
|                 /* UCS-2 character */ | ||||
|                 *p++ = (Py_UNICODE) x; | ||||
|         else if (x <= 0x10ffff) { | ||||
|                 /* UCS-4 character. Either store directly, or as | ||||
|                    surrogate pair. */ | ||||
| #ifdef Py_UNICODE_WIDE | ||||
|                 *p++ = (Py_UNICODE) x; | ||||
| #else | ||||
|                 x -= 0x10000L; | ||||
|                 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); | ||||
|                 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); | ||||
| #endif | ||||
|         } else { | ||||
|             endinpos = s-starts; | ||||
|             outpos = p-PyUnicode_AS_UNICODE(v); | ||||
|             if (unicode_decode_call_errorhandler( | ||||
|                     errors, &errorHandler, | ||||
|                     "rawunicodeescape", "\\Uxxxxxxxx out of range", | ||||
|  | @ -3097,8 +3111,6 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, | |||
| 		    (PyObject **)&v, &outpos, &p)) | ||||
| 		    goto onError; | ||||
|         } | ||||
| #endif | ||||
| 	*p++ = x; | ||||
| 	nextByte: | ||||
| 	; | ||||
|     } | ||||
|  | @ -3152,6 +3164,32 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, | |||
|             *p++ = hexdigit[ch & 15]; | ||||
|         } | ||||
|         else | ||||
| #else | ||||
| 	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ | ||||
| 	if (ch >= 0xD800 && ch < 0xDC00) { | ||||
| 	    Py_UNICODE ch2; | ||||
| 	    Py_UCS4 ucs; | ||||
|  | ||||
| 	    ch2 = *s++; | ||||
| 	    size--; | ||||
| 	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { | ||||
| 		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; | ||||
| 		*p++ = '\\'; | ||||
| 		*p++ = 'U'; | ||||
| 		*p++ = hexdigit[(ucs >> 28) & 0xf]; | ||||
| 		*p++ = hexdigit[(ucs >> 24) & 0xf]; | ||||
| 		*p++ = hexdigit[(ucs >> 20) & 0xf]; | ||||
| 		*p++ = hexdigit[(ucs >> 16) & 0xf]; | ||||
| 		*p++ = hexdigit[(ucs >> 12) & 0xf]; | ||||
| 		*p++ = hexdigit[(ucs >> 8) & 0xf]; | ||||
| 		*p++ = hexdigit[(ucs >> 4) & 0xf]; | ||||
| 		*p++ = hexdigit[ucs & 0xf]; | ||||
| 		continue; | ||||
| 	    } | ||||
| 	    /* Fall through: isolated surrogates are copied as-is */ | ||||
| 	    s--; | ||||
| 	    size++; | ||||
| 	} | ||||
| #endif | ||||
| 	/* Map 16-bit characters to '\uxxxx' */ | ||||
| 	if (ch >= 256) { | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Amaury Forgeot d'Arc
						Amaury Forgeot d'Arc