mirror of
				https://github.com/python/cpython.git
				synced 2025-10-30 21:21:22 +00:00 
			
		
		
		
	[3.11] gh-99612: Fix PyUnicode_DecodeUTF8Stateful() for ASCII-only data (GH-99613) (GH-107224)
Previously *consumed was not set in this case.
(cherry picked from commit f08e52ccb0)
			
			
This commit is contained in:
		
							parent
							
								
									058741cc39
								
							
						
					
					
						commit
						b8b3e6afc0
					
				
					 4 changed files with 95 additions and 1 deletion
				
			
		
							
								
								
									
										54
									
								
								Lib/test/test_capi/test_codecs.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								Lib/test/test_capi/test_codecs.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,54 @@ | |||
| import unittest | ||||
| from test.support import import_helper | ||||
| 
 | ||||
| _testcapi = import_helper.import_module('_testcapi') | ||||
| 
 | ||||
| 
 | ||||
| class CAPITest(unittest.TestCase): | ||||
| 
 | ||||
|     def test_decodeutf8(self): | ||||
|         """Test PyUnicode_DecodeUTF8(): valid input, malformed input, 'replace' and unknown error handlers.""" | ||||
|         decodeutf8 = _testcapi.unicode_decodeutf8 | ||||
| 
 | ||||
|         for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600']: | ||||
|             b = s.encode('utf-8') | ||||
|             self.assertEqual(decodeutf8(b), s) | ||||
|             self.assertEqual(decodeutf8(b, 'strict'), s) | ||||
| 
 | ||||
|         self.assertRaises(UnicodeDecodeError, decodeutf8, b'\x80') | ||||
|         self.assertRaises(UnicodeDecodeError, decodeutf8, b'\xc0') | ||||
|         self.assertRaises(UnicodeDecodeError, decodeutf8, b'\xff') | ||||
|         self.assertRaises(UnicodeDecodeError, decodeutf8, b'a\xf0\x9f') | ||||
|         self.assertEqual(decodeutf8(b'a\xf0\x9f', 'replace'), 'a\ufffd') | ||||
|         self.assertEqual(decodeutf8(b'a\xf0\x9fb', 'replace'), 'a\ufffdb') | ||||
| 
 | ||||
|         self.assertRaises(LookupError, decodeutf8, b'a\x80', 'foo') | ||||
|         # TODO: Test PyUnicode_DecodeUTF8() with NULL as data and | ||||
|         # negative size. | ||||
| 
 | ||||
|     def test_decodeutf8stateful(self): | ||||
|         """Test PyUnicode_DecodeUTF8Stateful(): the decoded text and the consumed byte count, including a truncated trailing sequence left unconsumed.""" | ||||
|         decodeutf8stateful = _testcapi.unicode_decodeutf8stateful | ||||
| 
 | ||||
|         for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600']: | ||||
|             b = s.encode('utf-8') | ||||
|             self.assertEqual(decodeutf8stateful(b), (s, len(b))) | ||||
|             self.assertEqual(decodeutf8stateful(b, 'strict'), (s, len(b))) | ||||
| 
 | ||||
|         self.assertRaises(UnicodeDecodeError, decodeutf8stateful, b'\x80') | ||||
|         self.assertRaises(UnicodeDecodeError, decodeutf8stateful, b'\xc0') | ||||
|         self.assertRaises(UnicodeDecodeError, decodeutf8stateful, b'\xff') | ||||
|         self.assertEqual(decodeutf8stateful(b'a\xf0\x9f'), ('a', 1)) | ||||
|         self.assertEqual(decodeutf8stateful(b'a\xf0\x9f', 'replace'), ('a', 1)) | ||||
|         self.assertRaises(UnicodeDecodeError, decodeutf8stateful, b'a\xf0\x9fb') | ||||
|         self.assertEqual(decodeutf8stateful(b'a\xf0\x9fb', 'replace'), ('a\ufffdb', 4)) | ||||
| 
 | ||||
|         self.assertRaises(LookupError, decodeutf8stateful, b'a\x80', 'foo') | ||||
|         # TODO: Test PyUnicode_DecodeUTF8Stateful() with NULL as data and | ||||
|         # negative size. | ||||
|         # TODO: Test PyUnicode_DecodeUTF8Stateful() with NULL as the address of | ||||
|         # "consumed". | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     unittest.main() | ||||
|  | @ -0,0 +1,2 @@ | |||
| Fix :c:func:`PyUnicode_DecodeUTF8Stateful` for ASCII-only data: | ||||
| ``*consumed`` was not set. | ||||
|  | @ -2307,6 +2307,40 @@ unicode_asutf8andsize(PyObject *self, PyObject *args) | |||
|     return Py_BuildValue("(Nn)", result, utf8_len); | ||||
| } | ||||
| 
 | ||||
| /* Test PyUnicode_DecodeUTF8(): parses (data[, errors]) and returns the call's result unchanged; errors stays NULL when omitted. */ | ||||
| static PyObject * | ||||
| unicode_decodeutf8(PyObject *self, PyObject *args) | ||||
| { | ||||
|     const char *data; | ||||
|     Py_ssize_t size; | ||||
|     const char *errors = NULL; | ||||
| 
 | ||||
|     if (!PyArg_ParseTuple(args, "y#|z", &data, &size, &errors)) | ||||
|         return NULL; | ||||
| 
 | ||||
|     return PyUnicode_DecodeUTF8(data, size, errors); | ||||
| } | ||||
| 
 | ||||
| /* Test PyUnicode_DecodeUTF8Stateful(): returns a (result, consumed) tuple, or NULL if decoding fails; consumed is preset to 123456789, presumably so a call that never writes *consumed is visible to the caller. */ | ||||
| static PyObject * | ||||
| unicode_decodeutf8stateful(PyObject *self, PyObject *args) | ||||
| { | ||||
|     const char *data; | ||||
|     Py_ssize_t size; | ||||
|     const char *errors = NULL; | ||||
|     Py_ssize_t consumed = 123456789; | ||||
|     PyObject *result; | ||||
| 
 | ||||
|     if (!PyArg_ParseTuple(args, "y#|z", &data, &size, &errors)) | ||||
|         return NULL; | ||||
| 
 | ||||
|     result = PyUnicode_DecodeUTF8Stateful(data, size, errors, &consumed); | ||||
|     if (!result) { | ||||
|         return NULL; | ||||
|     } | ||||
|     return Py_BuildValue("(Nn)", result, consumed); | ||||
| } | ||||
| 
 | ||||
| static PyObject * | ||||
| unicode_findchar(PyObject *self, PyObject *args) | ||||
| { | ||||
|  | @ -6562,7 +6596,8 @@ static PyMethodDef TestMethods[] = { | |||
|     {"unicode_asucs4",          unicode_asucs4,                  METH_VARARGS}, | ||||
|     {"unicode_asutf8",          unicode_asutf8,                  METH_VARARGS}, | ||||
|     {"unicode_asutf8andsize",   unicode_asutf8andsize,           METH_VARARGS}, | ||||
|     {"unicode_findchar",        unicode_findchar,                METH_VARARGS}, | ||||
|     {"unicode_decodeutf8",       unicode_decodeutf8,             METH_VARARGS}, | ||||
|     {"unicode_decodeutf8stateful",unicode_decodeutf8stateful,    METH_VARARGS},    {"unicode_findchar",        unicode_findchar,                METH_VARARGS}, | ||||
|     {"unicode_copycharacters",  unicode_copycharacters,          METH_VARARGS}, | ||||
| #if USE_UNICODE_WCHAR_CACHE | ||||
|     {"unicode_legacy_string",   unicode_legacy_string,           METH_VARARGS}, | ||||
|  |  | |||
|  | @ -5120,6 +5120,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, | |||
|     } | ||||
|     s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u)); | ||||
|     if (s == end) { | ||||
|         if (consumed) { | ||||
|             *consumed = size; | ||||
|         } | ||||
|         return u; | ||||
|     } | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Serhiy Storchaka
						Serhiy Storchaka