mirror of
				https://github.com/python/cpython.git
				synced 2025-11-03 23:21:29 +00:00 
			
		
		
		
	#1477: ur'\U0010FFFF' raised in narrow unicode builds.
Corrected the raw-unicode-escape codec to use UTF-16 surrogates in this case, just like the unicode-escape codec.
This commit is contained in:
		
							parent
							
								
									61854332b9
								
							
						
					
					
						commit
						9a0d3462fc
					
				
					 3 changed files with 63 additions and 6 deletions
				
			
		| 
						 | 
					@ -736,12 +736,25 @@ def write(self, text):
 | 
				
			||||||
        print >>out, u'def\n'
 | 
					        print >>out, u'def\n'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_ucs4(self):
 | 
					    def test_ucs4(self):
 | 
				
			||||||
        if sys.maxunicode == 0xFFFF:
 | 
					 | 
				
			||||||
            return
 | 
					 | 
				
			||||||
        x = u'\U00100000'
 | 
					        x = u'\U00100000'
 | 
				
			||||||
        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
 | 
					        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
 | 
				
			||||||
        self.assertEqual(x, y)
 | 
					        self.assertEqual(x, y)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        y = r'\U00100000'
 | 
				
			||||||
 | 
					        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
 | 
				
			||||||
 | 
					        self.assertEqual(x, y)
 | 
				
			||||||
 | 
					        y = r'\U00010000'
 | 
				
			||||||
 | 
					        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
 | 
				
			||||||
 | 
					        self.assertEqual(x, y)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            '\U11111111'.decode("raw-unicode-escape")
 | 
				
			||||||
 | 
					        except UnicodeDecodeError as e:
 | 
				
			||||||
 | 
					            self.assertEqual(e.start, 0)
 | 
				
			||||||
 | 
					            self.assertEqual(e.end, 10)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            self.fail("Should have raised UnicodeDecodeError")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_conversion(self):
 | 
					    def test_conversion(self):
 | 
				
			||||||
        # Make sure __unicode__() works properly
 | 
					        # Make sure __unicode__() works properly
 | 
				
			||||||
        class Foo0:
 | 
					        class Foo0:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -12,6 +12,12 @@ What's New in Python 2.6 alpha 2?
 | 
				
			||||||
Core and builtins
 | 
					Core and builtins
 | 
				
			||||||
-----------------
 | 
					-----------------
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
 | 
					- Issue #1477: With narrow Unicode builds, the unicode escape sequence
 | 
				
			||||||
 | 
					  \Uxxxxxxxx did not accept values outside the Basic Multilingual Plane.  This
 | 
				
			||||||
 | 
					  affected raw unicode literals and the 'raw-unicode-escape' codec.  Now
 | 
				
			||||||
 | 
					  UTF-16 surrogates are generated in this case, like normal unicode literals
 | 
				
			||||||
 | 
					  and the 'unicode-escape' codec.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
- Issue #2348: add Py3k warning for file.softspace.
 | 
					- Issue #2348: add Py3k warning for file.softspace.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
- Issue #2346/#2347: add Py3k warnings for __methods__ and __members__.
 | 
					- Issue #2346/#2347: add Py3k warnings for __methods__ and __members__.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3088,8 +3088,22 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
 | 
				
			||||||
	    else
 | 
						    else
 | 
				
			||||||
		x += 10 + c - 'A';
 | 
							x += 10 + c - 'A';
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
#ifndef Py_UNICODE_WIDE
 | 
					        if (x <= 0xffff)
 | 
				
			||||||
        if (x > 0x10000) {
 | 
					                /* UCS-2 character */
 | 
				
			||||||
 | 
					                *p++ = (Py_UNICODE) x;
 | 
				
			||||||
 | 
					        else if (x <= 0x10ffff) {
 | 
				
			||||||
 | 
					                /* UCS-4 character. Either store directly, or as
 | 
				
			||||||
 | 
					                   surrogate pair. */
 | 
				
			||||||
 | 
					#ifdef Py_UNICODE_WIDE
 | 
				
			||||||
 | 
					                *p++ = (Py_UNICODE) x;
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					                x -= 0x10000L;
 | 
				
			||||||
 | 
					                *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
 | 
				
			||||||
 | 
					                *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					        } else {
 | 
				
			||||||
 | 
					            endinpos = s-starts;
 | 
				
			||||||
 | 
					            outpos = p-PyUnicode_AS_UNICODE(v);
 | 
				
			||||||
            if (unicode_decode_call_errorhandler(
 | 
					            if (unicode_decode_call_errorhandler(
 | 
				
			||||||
                    errors, &errorHandler,
 | 
					                    errors, &errorHandler,
 | 
				
			||||||
                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
 | 
					                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
 | 
				
			||||||
| 
						 | 
					@ -3097,8 +3111,6 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
 | 
				
			||||||
		    (PyObject **)&v, &outpos, &p))
 | 
							    (PyObject **)&v, &outpos, &p))
 | 
				
			||||||
		    goto onError;
 | 
							    goto onError;
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
	*p++ = x;
 | 
					 | 
				
			||||||
	nextByte:
 | 
						nextByte:
 | 
				
			||||||
	;
 | 
						;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
| 
						 | 
					@ -3152,6 +3164,32 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
 | 
				
			||||||
            *p++ = hexdigit[ch & 15];
 | 
					            *p++ = hexdigit[ch & 15];
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
        else
 | 
					        else
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
						/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
 | 
				
			||||||
 | 
						if (ch >= 0xD800 && ch < 0xDC00) {
 | 
				
			||||||
 | 
						    Py_UNICODE ch2;
 | 
				
			||||||
 | 
						    Py_UCS4 ucs;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						    ch2 = *s++;
 | 
				
			||||||
 | 
						    size--;
 | 
				
			||||||
 | 
						    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
 | 
				
			||||||
 | 
							ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
 | 
				
			||||||
 | 
							*p++ = '\\';
 | 
				
			||||||
 | 
							*p++ = 'U';
 | 
				
			||||||
 | 
							*p++ = hexdigit[(ucs >> 28) & 0xf];
 | 
				
			||||||
 | 
							*p++ = hexdigit[(ucs >> 24) & 0xf];
 | 
				
			||||||
 | 
							*p++ = hexdigit[(ucs >> 20) & 0xf];
 | 
				
			||||||
 | 
							*p++ = hexdigit[(ucs >> 16) & 0xf];
 | 
				
			||||||
 | 
							*p++ = hexdigit[(ucs >> 12) & 0xf];
 | 
				
			||||||
 | 
							*p++ = hexdigit[(ucs >> 8) & 0xf];
 | 
				
			||||||
 | 
							*p++ = hexdigit[(ucs >> 4) & 0xf];
 | 
				
			||||||
 | 
							*p++ = hexdigit[ucs & 0xf];
 | 
				
			||||||
 | 
							continue;
 | 
				
			||||||
 | 
						    }
 | 
				
			||||||
 | 
						    /* Fall through: isolated surrogates are copied as-is */
 | 
				
			||||||
 | 
						    s--;
 | 
				
			||||||
 | 
						    size++;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
	/* Map 16-bit characters to '\uxxxx' */
 | 
						/* Map 16-bit characters to '\uxxxx' */
 | 
				
			||||||
	if (ch >= 256) {
 | 
						if (ch >= 256) {
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue