mirror of
				https://github.com/python/cpython.git
				synced 2025-11-03 23:21:29 +00:00 
			
		
		
		
	Issue #24848: Fixed bugs in UTF-7 decoding of malformed data:
1. Non-ASCII bytes were accepted after a shift sequence. 2. A low surrogate could be emitted in case of an error in the high surrogate.
This commit is contained in:
		
							parent
							
								
									223349cfb8
								
							
						
					
					
						commit
						28b21e50c8
					
				
					 4 changed files with 75 additions and 11 deletions
				
			
		| 
						 | 
					@ -898,6 +898,32 @@ def test_readline(self):
 | 
				
			||||||
class UTF7Test(ReadTest, unittest.TestCase):
 | 
					class UTF7Test(ReadTest, unittest.TestCase):
 | 
				
			||||||
    encoding = "utf-7"
 | 
					    encoding = "utf-7"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_ascii(self):
 | 
				
			||||||
 | 
					        # Set D (directly encoded characters)
 | 
				
			||||||
 | 
					        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 | 
				
			||||||
 | 
					                 'abcdefghijklmnopqrstuvwxyz'
 | 
				
			||||||
 | 
					                 '0123456789'
 | 
				
			||||||
 | 
					                 '\'(),-./:?')
 | 
				
			||||||
 | 
					        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
 | 
				
			||||||
 | 
					        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
 | 
				
			||||||
 | 
					        # Set O (optional direct characters)
 | 
				
			||||||
 | 
					        set_o = ' !"#$%&*;<=>@[]^_`{|}'
 | 
				
			||||||
 | 
					        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
 | 
				
			||||||
 | 
					        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
 | 
				
			||||||
 | 
					        # +
 | 
				
			||||||
 | 
					        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
 | 
				
			||||||
 | 
					        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
 | 
				
			||||||
 | 
					        # White spaces
 | 
				
			||||||
 | 
					        ws = ' \t\n\r'
 | 
				
			||||||
 | 
					        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
 | 
				
			||||||
 | 
					        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
 | 
				
			||||||
 | 
					        # Other ASCII characters
 | 
				
			||||||
 | 
					        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
 | 
				
			||||||
 | 
					                                     set(set_d + set_o + '+' + ws)))
 | 
				
			||||||
 | 
					        self.assertEqual(other_ascii.encode(self.encoding),
 | 
				
			||||||
 | 
					                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
 | 
				
			||||||
 | 
					                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_partial(self):
 | 
					    def test_partial(self):
 | 
				
			||||||
        self.check_partial(
 | 
					        self.check_partial(
 | 
				
			||||||
            'a+-b\x00c\x80d\u0100e\U00010000f',
 | 
					            'a+-b\x00c\x80d\u0100e\U00010000f',
 | 
				
			||||||
| 
						 | 
					@ -939,7 +965,9 @@ def test_partial(self):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_errors(self):
 | 
					    def test_errors(self):
 | 
				
			||||||
        tests = [
 | 
					        tests = [
 | 
				
			||||||
 | 
					            (b'\xffb', '\ufffdb'),
 | 
				
			||||||
            (b'a\xffb', 'a\ufffdb'),
 | 
					            (b'a\xffb', 'a\ufffdb'),
 | 
				
			||||||
 | 
					            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
 | 
				
			||||||
            (b'a+IK', 'a\ufffd'),
 | 
					            (b'a+IK', 'a\ufffd'),
 | 
				
			||||||
            (b'a+IK-b', 'a\ufffdb'),
 | 
					            (b'a+IK-b', 'a\ufffdb'),
 | 
				
			||||||
            (b'a+IK,b', 'a\ufffdb'),
 | 
					            (b'a+IK,b', 'a\ufffdb'),
 | 
				
			||||||
| 
						 | 
					@ -955,6 +983,8 @@ def test_errors(self):
 | 
				
			||||||
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
 | 
					            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
 | 
				
			||||||
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
 | 
					            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
 | 
				
			||||||
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
 | 
					            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
 | 
				
			||||||
 | 
					            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
 | 
				
			||||||
 | 
					            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
 | 
				
			||||||
        ]
 | 
					        ]
 | 
				
			||||||
        for raw, expected in tests:
 | 
					        for raw, expected in tests:
 | 
				
			||||||
            with self.subTest(raw=raw):
 | 
					            with self.subTest(raw=raw):
 | 
				
			||||||
| 
						 | 
					@ -966,8 +996,36 @@ def test_nonbmp(self):
 | 
				
			||||||
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
 | 
					        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
 | 
				
			||||||
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
 | 
					        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
 | 
				
			||||||
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
 | 
					        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
 | 
				
			||||||
 | 
					        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
 | 
				
			||||||
 | 
					        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
 | 
				
			||||||
 | 
					        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
 | 
				
			||||||
 | 
					        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
 | 
				
			||||||
 | 
					        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
 | 
				
			||||||
 | 
					                         b'+IKwgrNgB3KA-')
 | 
				
			||||||
 | 
					        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
 | 
				
			||||||
 | 
					                         '\u20ac\u20ac\U000104A0')
 | 
				
			||||||
 | 
					        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
 | 
				
			||||||
 | 
					                         '\u20ac\u20ac\U000104A0')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    test_lone_surrogates = None
 | 
					    def test_lone_surrogates(self):
 | 
				
			||||||
 | 
					        tests = [
 | 
				
			||||||
 | 
					            (b'a+2AE-b', 'a\ud801b'),
 | 
				
			||||||
 | 
					            (b'a+2AE\xffb', 'a\ufffdb'),
 | 
				
			||||||
 | 
					            (b'a+2AE', 'a\ufffd'),
 | 
				
			||||||
 | 
					            (b'a+2AEA-b', 'a\ufffdb'),
 | 
				
			||||||
 | 
					            (b'a+2AH-b', 'a\ufffdb'),
 | 
				
			||||||
 | 
					            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
 | 
				
			||||||
 | 
					            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
 | 
				
			||||||
 | 
					            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
 | 
				
			||||||
 | 
					            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
 | 
				
			||||||
 | 
					            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
 | 
				
			||||||
 | 
					            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
 | 
				
			||||||
 | 
					            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
 | 
				
			||||||
 | 
					            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
 | 
				
			||||||
 | 
					        ]
 | 
				
			||||||
 | 
					        for raw, expected in tests:
 | 
				
			||||||
 | 
					            with self.subTest(raw=raw):
 | 
				
			||||||
 | 
					                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class UTF16ExTest(unittest.TestCase):
 | 
					class UTF16ExTest(unittest.TestCase):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1524,7 +1524,7 @@ def test_codecs_utf7(self):
 | 
				
			||||||
        self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
 | 
					        self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Issue #2242: crash on some Windows/MSVC versions
 | 
					        # Issue #2242: crash on some Windows/MSVC versions
 | 
				
			||||||
        self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')
 | 
					        self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Direct encoded characters
 | 
					        # Direct encoded characters
 | 
				
			||||||
        set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
 | 
					        set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
 | 
				
			||||||
| 
						 | 
					@ -1966,6 +1966,7 @@ def test_codecs_errors(self):
 | 
				
			||||||
        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
 | 
					        self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
 | 
				
			||||||
        self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
 | 
					        self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
 | 
				
			||||||
        self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
 | 
					        self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
 | 
				
			||||||
 | 
					        self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Error handling (unknown character names)
 | 
					        # Error handling (unknown character names)
 | 
				
			||||||
        self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
 | 
					        self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -10,6 +10,8 @@ Release date: tba
 | 
				
			||||||
Core and Builtins
 | 
					Core and Builtins
 | 
				
			||||||
-----------------
 | 
					-----------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					- Issue #24848: Fixed a number of bugs in UTF-7 decoding of malformed data.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
- Issue #25280: Import trace messages emitted in verbose (-v) mode are no
 | 
					- Issue #25280: Import trace messages emitted in verbose (-v) mode are no
 | 
				
			||||||
  longer formatted twice.
 | 
					  longer formatted twice.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4381,31 +4381,31 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
            else { /* now leaving a base-64 section */
 | 
					            else { /* now leaving a base-64 section */
 | 
				
			||||||
                inShift = 0;
 | 
					                inShift = 0;
 | 
				
			||||||
                s++;
 | 
					 | 
				
			||||||
                if (surrogate) {
 | 
					 | 
				
			||||||
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
 | 
					 | 
				
			||||||
                        goto onError;
 | 
					 | 
				
			||||||
                    surrogate = 0;
 | 
					 | 
				
			||||||
                }
 | 
					 | 
				
			||||||
                if (base64bits > 0) { /* left-over bits */
 | 
					                if (base64bits > 0) { /* left-over bits */
 | 
				
			||||||
                    if (base64bits >= 6) {
 | 
					                    if (base64bits >= 6) {
 | 
				
			||||||
                        /* We've seen at least one base-64 character */
 | 
					                        /* We've seen at least one base-64 character */
 | 
				
			||||||
 | 
					                        s++;
 | 
				
			||||||
                        errmsg = "partial character in shift sequence";
 | 
					                        errmsg = "partial character in shift sequence";
 | 
				
			||||||
                        goto utf7Error;
 | 
					                        goto utf7Error;
 | 
				
			||||||
                    }
 | 
					                    }
 | 
				
			||||||
                    else {
 | 
					                    else {
 | 
				
			||||||
                        /* Some bits remain; they should be zero */
 | 
					                        /* Some bits remain; they should be zero */
 | 
				
			||||||
                        if (base64buffer != 0) {
 | 
					                        if (base64buffer != 0) {
 | 
				
			||||||
 | 
					                            s++;
 | 
				
			||||||
                            errmsg = "non-zero padding bits in shift sequence";
 | 
					                            errmsg = "non-zero padding bits in shift sequence";
 | 
				
			||||||
                            goto utf7Error;
 | 
					                            goto utf7Error;
 | 
				
			||||||
                        }
 | 
					                        }
 | 
				
			||||||
                    }
 | 
					                    }
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
                if (ch != '-') {
 | 
					                if (surrogate && DECODE_DIRECT(ch)) {
 | 
				
			||||||
 | 
					                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
 | 
				
			||||||
 | 
					                        goto onError;
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					                surrogate = 0;
 | 
				
			||||||
 | 
					                if (ch == '-') {
 | 
				
			||||||
                    /* '-' is absorbed; other terminating
 | 
					                    /* '-' is absorbed; other terminating
 | 
				
			||||||
                       characters are preserved */
 | 
					                       characters are preserved */
 | 
				
			||||||
                    if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
 | 
					                    s++;
 | 
				
			||||||
                        goto onError;
 | 
					 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
| 
						 | 
					@ -4419,6 +4419,7 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
            else { /* begin base64-encoded section */
 | 
					            else { /* begin base64-encoded section */
 | 
				
			||||||
                inShift = 1;
 | 
					                inShift = 1;
 | 
				
			||||||
 | 
					                surrogate = 0;
 | 
				
			||||||
                shiftOutStart = writer.pos;
 | 
					                shiftOutStart = writer.pos;
 | 
				
			||||||
                base64bits = 0;
 | 
					                base64bits = 0;
 | 
				
			||||||
                base64buffer = 0;
 | 
					                base64buffer = 0;
 | 
				
			||||||
| 
						 | 
					@ -4450,6 +4451,7 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
 | 
					    if (inShift && !consumed) { /* in shift sequence, no more to follow */
 | 
				
			||||||
        /* if we're in an inconsistent state, that's an error */
 | 
					        /* if we're in an inconsistent state, that's an error */
 | 
				
			||||||
 | 
					        inShift = 0;
 | 
				
			||||||
        if (surrogate ||
 | 
					        if (surrogate ||
 | 
				
			||||||
                (base64bits >= 6) ||
 | 
					                (base64bits >= 6) ||
 | 
				
			||||||
                (base64bits > 0 && base64buffer != 0)) {
 | 
					                (base64bits > 0 && base64buffer != 0)) {
 | 
				
			||||||
| 
						 | 
					@ -13337,6 +13339,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if (maxchar > writer->maxchar || writer->readonly) {
 | 
					        if (maxchar > writer->maxchar || writer->readonly) {
 | 
				
			||||||
            /* resize + widen */
 | 
					            /* resize + widen */
 | 
				
			||||||
 | 
					            maxchar = Py_MAX(maxchar, writer->maxchar);
 | 
				
			||||||
            newbuffer = PyUnicode_New(newlen, maxchar);
 | 
					            newbuffer = PyUnicode_New(newlen, maxchar);
 | 
				
			||||||
            if (newbuffer == NULL)
 | 
					            if (newbuffer == NULL)
 | 
				
			||||||
                return -1;
 | 
					                return -1;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue