mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 21:51:50 +00:00 
			
		
		
		
	SF bug #1251300: On UCS-4 builds the "unicode-internal" codec will now complain
about illegal code points. The codec now supports PEP 293 style error handlers. (This is a variant of Nik Haldimann's patch that also detects truncated data.)
This commit is contained in:
		
							parent
							
								
									523c9f0709
								
							
						
					
					
						commit
						a47d1c08d0
					
				
					 6 changed files with 173 additions and 5 deletions
				
			
		|  | @ -797,6 +797,16 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( | |||
|     int length	 		/* Number of Py_UNICODE chars to encode */ | ||||
|     ); | ||||
| 
 | ||||
| /* --- Unicode Internal Codec ---------------------------------------------
 | ||||
| 
 | ||||
|     Only for internal use in _codecsmodule.c */ | ||||
| 
 | ||||
| PyObject *_PyUnicode_DecodeUnicodeInternal( | ||||
|     const char *string, | ||||
|     int length, | ||||
|     const char *errors | ||||
|     ); | ||||
| 
 | ||||
| /* --- Latin-1 Codecs ----------------------------------------------------- 
 | ||||
| 
 | ||||
|    Note: Latin-1 corresponds to the first 256 Unicode ordinals. | ||||
|  |  | |||
|  | @ -111,7 +111,7 @@ def test_backslashescape(self): | |||
|             sout += "\\U%08x" % sys.maxunicode | ||||
|         self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout) | ||||
| 
 | ||||
|     def test_relaxedutf8(self): | ||||
|     def test_decoderelaxedutf8(self): | ||||
|         # This is the test for a decoding callback handler, | ||||
|         # that relaxes the UTF-8 minimal encoding restriction. | ||||
|         # A null byte that is encoded as "\xc0\x80" will be | ||||
|  | @ -158,6 +158,35 @@ def test_charmapencode(self): | |||
|         charmap[ord("?")] = u"XYZ" | ||||
|         self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap) | ||||
| 
 | ||||
|     def test_decodeunicodeinternal(self): | ||||
|         self.assertRaises( | ||||
|             UnicodeDecodeError, | ||||
|             "\x00\x00\x00\x00\x00".decode, | ||||
|             "unicode-internal", | ||||
|         ) | ||||
|         if sys.maxunicode > 0xffff: | ||||
|             def handler_unicodeinternal(exc): | ||||
|                 if not isinstance(exc, UnicodeDecodeError): | ||||
|                     raise TypeError("don't know how to handle %r" % exc) | ||||
|                 return (u"\x01", 1) | ||||
| 
 | ||||
|             self.assertEqual( | ||||
|                 "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"), | ||||
|                 u"\u0000" | ||||
|             ) | ||||
| 
 | ||||
|             self.assertEqual( | ||||
|                 "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"), | ||||
|                 u"\u0000\ufffd" | ||||
|             ) | ||||
| 
 | ||||
|             codecs.register_error("test.hui", handler_unicodeinternal) | ||||
| 
 | ||||
|             self.assertEqual( | ||||
|                 "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"), | ||||
|                 u"\u0000\u0001\u0000" | ||||
|             ) | ||||
| 
 | ||||
|     def test_callbacks(self): | ||||
|         def handler1(exc): | ||||
|             if not isinstance(exc, UnicodeEncodeError) \ | ||||
|  | @ -503,7 +532,8 @@ def test_badhandlerresults(self): | |||
|             for (enc, bytes) in ( | ||||
|                 ("ascii", "\xff"), | ||||
|                 ("utf-8", "\xff"), | ||||
|                 ("utf-7", "+x-") | ||||
|                 ("utf-7", "+x-"), | ||||
|                 ("unicode-internal", "\x00"), | ||||
|             ): | ||||
|                 self.assertRaises( | ||||
|                     TypeError, | ||||
|  |  | |||
|  | @ -1,7 +1,7 @@ | |||
| from test import test_support | ||||
| import unittest | ||||
| import codecs | ||||
| import StringIO | ||||
| import sys, StringIO | ||||
| 
 | ||||
| class Queue(object): | ||||
|     """ | ||||
|  | @ -453,6 +453,54 @@ def test_decode(self): | |||
|         for uni, puny in punycode_testcases: | ||||
|             self.assertEquals(uni, puny.decode("punycode")) | ||||
| 
 | ||||
| class UnicodeInternalTest(unittest.TestCase): | ||||
|     def test_bug1251300(self): | ||||
|         # Decoding with unicode_internal used to not correctly handle "code | ||||
|         # points" above 0x10ffff on UCS-4 builds. | ||||
|         if sys.maxunicode > 0xffff: | ||||
|             ok = [ | ||||
|                 ("\x00\x10\xff\xff", u"\U0010ffff"), | ||||
|                 ("\x00\x00\x01\x01", u"\U00000101"), | ||||
|                 ("", u""), | ||||
|             ] | ||||
|             not_ok = [ | ||||
|                 "\x7f\xff\xff\xff", | ||||
|                 "\x80\x00\x00\x00", | ||||
|                 "\x81\x00\x00\x00", | ||||
|                 "\x00", | ||||
|                 "\x00\x00\x00\x00\x00", | ||||
|             ] | ||||
|             for internal, uni in ok: | ||||
|                 if sys.byteorder == "little": | ||||
|                     internal = "".join(reversed(internal)) | ||||
|                 self.assertEquals(uni, internal.decode("unicode_internal")) | ||||
|             for internal in not_ok: | ||||
|                 if sys.byteorder == "little": | ||||
|                     internal = "".join(reversed(internal)) | ||||
|                 self.assertRaises(UnicodeDecodeError, internal.decode, | ||||
|                     "unicode_internal") | ||||
| 
 | ||||
|     def test_decode_error_attributes(self): | ||||
|         if sys.maxunicode > 0xffff: | ||||
|             try: | ||||
|                 "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal") | ||||
|             except UnicodeDecodeError, ex: | ||||
|                 self.assertEquals("unicode_internal", ex.encoding) | ||||
|                 self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object) | ||||
|                 self.assertEquals(4, ex.start) | ||||
|                 self.assertEquals(8, ex.end) | ||||
|             else: | ||||
|                 self.fail() | ||||
| 
 | ||||
|     def test_decode_callback(self): | ||||
|         if sys.maxunicode > 0xffff: | ||||
|             codecs.register_error("UnicodeInternalTest", codecs.ignore_errors) | ||||
|             decoder = codecs.getdecoder("unicode_internal") | ||||
|             ab = u"ab".encode("unicode_internal") | ||||
|             ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]), | ||||
|                 "UnicodeInternalTest") | ||||
|             self.assertEquals((u"ab", 12), ignored) | ||||
| 
 | ||||
| # From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html | ||||
| nameprep_tests = [ | ||||
|     # 3.1 Map to nothing. | ||||
|  | @ -885,6 +933,7 @@ def test_main(): | |||
|         EscapeDecodeTest, | ||||
|         RecodingTest, | ||||
|         PunycodeTest, | ||||
|         UnicodeInternalTest, | ||||
|         NameprepTest, | ||||
|         CodecTest, | ||||
|         CodecsModuleTest, | ||||
|  |  | |||
|  | @ -435,6 +435,10 @@ Library | |||
|   line ending. Remove the special handling of a "\r\n" that has been split | ||||
|   between two lines. | ||||
| 
 | ||||
| - Bug #1251300: On UCS-4 builds the "unicode-internal" codec will now complain | ||||
|   about illegal code points. The codec now supports PEP 293 style error | ||||
|   handlers. | ||||
| 
 | ||||
| 
 | ||||
| Build | ||||
| ----- | ||||
|  |  | |||
|  | @ -254,8 +254,8 @@ unicode_internal_decode(PyObject *self, | |||
|     else { | ||||
| 	if (PyObject_AsReadBuffer(obj, (const void **)&data, &size)) | ||||
| 	    return NULL; | ||||
| 	return codec_tuple(PyUnicode_FromUnicode((Py_UNICODE *)data, | ||||
| 						 size / sizeof(Py_UNICODE)), | ||||
| 
 | ||||
| 	return codec_tuple(_PyUnicode_DecodeUnicodeInternal(data, size, errors), | ||||
| 			   size); | ||||
|     } | ||||
| } | ||||
|  |  | |||
|  | @ -2273,6 +2273,81 @@ PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) | |||
| 					    PyUnicode_GET_SIZE(unicode)); | ||||
| } | ||||
| 
 | ||||
| /* --- Unicode Internal Codec ------------------------------------------- */ | ||||
| 
 | ||||
| PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, | ||||
| 					   int size, | ||||
| 					   const char *errors) | ||||
| { | ||||
|     const char *starts = s; | ||||
|     int startinpos; | ||||
|     int endinpos; | ||||
|     int outpos; | ||||
|     Py_UNICODE unimax; | ||||
|     PyUnicodeObject *v; | ||||
|     Py_UNICODE *p; | ||||
|     const char *end; | ||||
|     const char *reason; | ||||
|     PyObject *errorHandler = NULL; | ||||
|     PyObject *exc = NULL; | ||||
| 
 | ||||
|     unimax = PyUnicode_GetMax(); | ||||
|     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); | ||||
|     if (v == NULL) | ||||
| 	goto onError; | ||||
|     if (PyUnicode_GetSize((PyObject *)v) == 0) | ||||
| 	return (PyObject *)v; | ||||
|     p = PyUnicode_AS_UNICODE(v); | ||||
|     end = s + size; | ||||
| 
 | ||||
|     while (s < end) { | ||||
|         *p = *(Py_UNICODE *)s; | ||||
|         /* We have to sanity check the raw data, otherwise doom looms for
 | ||||
|            some malformed UCS-4 data. */ | ||||
|         if ( | ||||
|             #ifdef Py_UNICODE_WIDE | ||||
|             *p > unimax || *p < 0 || | ||||
|             #endif | ||||
|             end-s < Py_UNICODE_SIZE | ||||
|             ) | ||||
|             { | ||||
|             startinpos = s - starts; | ||||
|             if (end-s < Py_UNICODE_SIZE) { | ||||
|                 endinpos = end-starts; | ||||
|                 reason = "truncated input"; | ||||
|             } | ||||
|             else { | ||||
|                 endinpos = s - starts + Py_UNICODE_SIZE; | ||||
|                 reason = "illegal code point (> 0x10FFFF)"; | ||||
|             } | ||||
|             outpos = p - PyUnicode_AS_UNICODE(v); | ||||
|             if (unicode_decode_call_errorhandler( | ||||
|                     errors, &errorHandler, | ||||
|                     "unicode_internal", reason, | ||||
|                     starts, size, &startinpos, &endinpos, &exc, &s, | ||||
|                     (PyObject **)&v, &outpos, &p)) { | ||||
|                 goto onError; | ||||
|             } | ||||
|         } | ||||
|         else { | ||||
|             p++; | ||||
|             s += Py_UNICODE_SIZE; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0) | ||||
|         goto onError; | ||||
|     Py_XDECREF(errorHandler); | ||||
|     Py_XDECREF(exc); | ||||
|     return (PyObject *)v; | ||||
| 
 | ||||
|  onError: | ||||
|     Py_XDECREF(v); | ||||
|     Py_XDECREF(errorHandler); | ||||
|     Py_XDECREF(exc); | ||||
|     return NULL; | ||||
| } | ||||
| 
 | ||||
| /* --- Latin-1 Codec ------------------------------------------------------ */ | ||||
| 
 | ||||
| PyObject *PyUnicode_DecodeLatin1(const char *s, | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Walter Dörwald
						Walter Dörwald