Mirror of https://github.com/python/cpython.git (synced 2025-10-31 05:31:20 +00:00)
			
		
		
		
	[3.13] gh-63161: Add more tests for source encoding (GH-139440) (#139443)
(cherry picked from commit b2f5ad0c6d)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
			
			
This commit is contained in:
		
parent 7bb51bd5be
commit 4216ea7c81
					
				
					 2 changed files with 179 additions and 22 deletions
				
			
		|  | @ -173,6 +173,8 @@ def test_tokenizer_fstring_warning_in_first_line(self): | ||||||
|             os.unlink(TESTFN) |             os.unlink(TESTFN) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | BUFSIZ = 2**13 | ||||||
|  | 
 | ||||||
| class AbstractSourceEncodingTest: | class AbstractSourceEncodingTest: | ||||||
| 
 | 
 | ||||||
|     def test_default_coding(self): |     def test_default_coding(self): | ||||||
|  | @ -185,14 +187,20 @@ def test_first_coding_line(self): | ||||||
|         self.check_script_output(src, br"'\xc3\u20ac'") |         self.check_script_output(src, br"'\xc3\u20ac'") | ||||||
| 
 | 
 | ||||||
|     def test_second_coding_line(self): |     def test_second_coding_line(self): | ||||||
|         src = (b'#\n' |         src = (b'#!/usr/bin/python\n' | ||||||
|  |                b'#coding:iso8859-15\n' | ||||||
|  |                b'print(ascii("\xc3\xa4"))\n') | ||||||
|  |         self.check_script_output(src, br"'\xc3\u20ac'") | ||||||
|  | 
 | ||||||
|  |     def test_second_coding_line_empty_first_line(self): | ||||||
|  |         src = (b'\n' | ||||||
|                b'#coding:iso8859-15\n' |                b'#coding:iso8859-15\n' | ||||||
|                b'print(ascii("\xc3\xa4"))\n') |                b'print(ascii("\xc3\xa4"))\n') | ||||||
|         self.check_script_output(src, br"'\xc3\u20ac'") |         self.check_script_output(src, br"'\xc3\u20ac'") | ||||||
| 
 | 
 | ||||||
|     def test_third_coding_line(self): |     def test_third_coding_line(self): | ||||||
|         # Only first two lines are tested for a magic comment. |         # Only first two lines are tested for a magic comment. | ||||||
|         src = (b'#\n' |         src = (b'#!/usr/bin/python\n' | ||||||
|                b'#\n' |                b'#\n' | ||||||
|                b'#coding:iso8859-15\n' |                b'#coding:iso8859-15\n' | ||||||
|                b'print(ascii("\xc3\xa4"))\n') |                b'print(ascii("\xc3\xa4"))\n') | ||||||
|  | @ -210,13 +218,52 @@ def test_double_coding_same_line(self): | ||||||
|                b'print(ascii("\xc3\xa4"))\n') |                b'print(ascii("\xc3\xa4"))\n') | ||||||
|         self.check_script_output(src, br"'\xc3\u20ac'") |         self.check_script_output(src, br"'\xc3\u20ac'") | ||||||
| 
 | 
 | ||||||
|  |     def test_double_coding_utf8(self): | ||||||
|  |         src = (b'#coding:utf-8\n' | ||||||
|  |                b'#coding:latin1\n' | ||||||
|  |                b'print(ascii("\xc3\xa4"))\n') | ||||||
|  |         self.check_script_output(src, br"'\xe4'") | ||||||
|  | 
 | ||||||
|  |     def test_long_first_coding_line(self): | ||||||
|  |         src = (b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n' | ||||||
|  |                b'print(ascii("\xc3\xa4"))\n') | ||||||
|  |         self.check_script_output(src, br"'\xc3\u20ac'") | ||||||
|  | 
 | ||||||
|  |     def test_long_second_coding_line(self): | ||||||
|  |         src = (b'#!/usr/bin/python\n' | ||||||
|  |                b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n' | ||||||
|  |                b'print(ascii("\xc3\xa4"))\n') | ||||||
|  |         self.check_script_output(src, br"'\xc3\u20ac'") | ||||||
|  | 
 | ||||||
|  |     def test_long_coding_line(self): | ||||||
|  |         src = (b'#coding:iso-8859-15' + b' '*BUFSIZ + b'\n' | ||||||
|  |                b'print(ascii("\xc3\xa4"))\n') | ||||||
|  |         self.check_script_output(src, br"'\xc3\u20ac'") | ||||||
|  | 
 | ||||||
|  |     def test_long_coding_name(self): | ||||||
|  |         src = (b'#coding:iso-8859-1-' + b'x'*BUFSIZ + b'\n' | ||||||
|  |                b'print(ascii("\xc3\xa4"))\n') | ||||||
|  |         self.check_script_output(src, br"'\xc3\xa4'") | ||||||
|  | 
 | ||||||
|  |     def test_long_first_utf8_line(self): | ||||||
|  |         src = b'#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n' | ||||||
|  |         self.check_script_output(src, b'') | ||||||
|  |         src = b'# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n' | ||||||
|  |         self.check_script_output(src, b'') | ||||||
|  | 
 | ||||||
|  |     def test_long_second_utf8_line(self): | ||||||
|  |         src = b'\n#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n' | ||||||
|  |         self.check_script_output(src, b'') | ||||||
|  |         src = b'\n# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n' | ||||||
|  |         self.check_script_output(src, b'') | ||||||
|  | 
 | ||||||
|     def test_first_non_utf8_coding_line(self): |     def test_first_non_utf8_coding_line(self): | ||||||
|         src = (b'#coding:iso-8859-15 \xa4\n' |         src = (b'#coding:iso-8859-15 \xa4\n' | ||||||
|                b'print(ascii("\xc3\xa4"))\n') |                b'print(ascii("\xc3\xa4"))\n') | ||||||
|         self.check_script_output(src, br"'\xc3\u20ac'") |         self.check_script_output(src, br"'\xc3\u20ac'") | ||||||
| 
 | 
 | ||||||
|     def test_second_non_utf8_coding_line(self): |     def test_second_non_utf8_coding_line(self): | ||||||
|         src = (b'\n' |         src = (b'#!/usr/bin/python\n' | ||||||
|                b'#coding:iso-8859-15 \xa4\n' |                b'#coding:iso-8859-15 \xa4\n' | ||||||
|                b'print(ascii("\xc3\xa4"))\n') |                b'print(ascii("\xc3\xa4"))\n') | ||||||
|         self.check_script_output(src, br"'\xc3\u20ac'") |         self.check_script_output(src, br"'\xc3\u20ac'") | ||||||
|  | @ -225,27 +272,56 @@ def test_utf8_bom(self): | ||||||
|         src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n') |         src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n') | ||||||
|         self.check_script_output(src, br"'\xe4'") |         self.check_script_output(src, br"'\xe4'") | ||||||
| 
 | 
 | ||||||
|  |     def test_utf8_bom_utf8_comments(self): | ||||||
|  |         src = (b'\xef\xbb\xbf#\xc3\xa4\n' | ||||||
|  |                b'#\xc3\xa4\n' | ||||||
|  |                b'print(ascii("\xc3\xa4"))\n') | ||||||
|  |         self.check_script_output(src, br"'\xe4'") | ||||||
|  | 
 | ||||||
|     def test_utf8_bom_and_utf8_coding_line(self): |     def test_utf8_bom_and_utf8_coding_line(self): | ||||||
|         src = (b'\xef\xbb\xbf#coding:utf-8\n' |         src = (b'\xef\xbb\xbf#coding:utf-8\n' | ||||||
|                b'print(ascii("\xc3\xa4"))\n') |                b'print(ascii("\xc3\xa4"))\n') | ||||||
|         self.check_script_output(src, br"'\xe4'") |         self.check_script_output(src, br"'\xe4'") | ||||||
| 
 | 
 | ||||||
|  |     def test_utf8_non_utf8_comment_line_error(self): | ||||||
|  |         src = (b'#coding: utf8\n' | ||||||
|  |                b'#\n' | ||||||
|  |                b'#\xa4\n' | ||||||
|  |                b'raise RuntimeError\n') | ||||||
|  |         self.check_script_error(src, | ||||||
|  |                 br"'utf-8' codec can't decode byte|" | ||||||
|  |                 br"encoding problem: utf8") | ||||||
|  | 
 | ||||||
|     def test_crlf(self): |     def test_crlf(self): | ||||||
|         src = (b'print(ascii("""\r\n"""))\n') |         src = (b'print(ascii("""\r\n"""))\n') | ||||||
|         out = self.check_script_output(src, br"'\n'") |         self.check_script_output(src, br"'\n'") | ||||||
| 
 | 
 | ||||||
|     def test_crcrlf(self): |     def test_crcrlf(self): | ||||||
|         src = (b'print(ascii("""\r\r\n"""))\n') |         src = (b'print(ascii("""\r\r\n"""))\n') | ||||||
|         out = self.check_script_output(src, br"'\n\n'") |         self.check_script_output(src, br"'\n\n'") | ||||||
| 
 | 
 | ||||||
|     def test_crcrcrlf(self): |     def test_crcrcrlf(self): | ||||||
|         src = (b'print(ascii("""\r\r\r\n"""))\n') |         src = (b'print(ascii("""\r\r\r\n"""))\n') | ||||||
|         out = self.check_script_output(src, br"'\n\n\n'") |         self.check_script_output(src, br"'\n\n\n'") | ||||||
| 
 | 
 | ||||||
|     def test_crcrcrlf2(self): |     def test_crcrcrlf2(self): | ||||||
|         src = (b'#coding:iso-8859-1\n' |         src = (b'#coding:iso-8859-1\n' | ||||||
|                b'print(ascii("""\r\r\r\n"""))\n') |                b'print(ascii("""\r\r\r\n"""))\n') | ||||||
|         out = self.check_script_output(src, br"'\n\n\n'") |         self.check_script_output(src, br"'\n\n\n'") | ||||||
|  | 
 | ||||||
|  |     def test_nul_in_first_coding_line(self): | ||||||
|  |         src = (b'#coding:iso8859-15\x00\n' | ||||||
|  |                b'\n' | ||||||
|  |                b'\n' | ||||||
|  |                b'raise RuntimeError\n') | ||||||
|  |         self.check_script_error(src, br"source code (string )?cannot contain null bytes") | ||||||
|  | 
 | ||||||
|  |     def test_nul_in_second_coding_line(self): | ||||||
|  |         src = (b'#!/usr/bin/python\n' | ||||||
|  |                b'#coding:iso8859-15\x00\n' | ||||||
|  |                b'\n' | ||||||
|  |                b'raise RuntimeError\n') | ||||||
|  |         self.check_script_error(src, br"source code (string )?cannot contain null bytes") | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class UTF8ValidatorTest(unittest.TestCase): | class UTF8ValidatorTest(unittest.TestCase): | ||||||
|  | @ -325,6 +401,10 @@ def check_script_output(self, src, expected): | ||||||
|         out = stdout.getvalue().encode('latin1') |         out = stdout.getvalue().encode('latin1') | ||||||
|         self.assertEqual(out.rstrip(), expected) |         self.assertEqual(out.rstrip(), expected) | ||||||
| 
 | 
 | ||||||
|  |     def check_script_error(self, src, expected): | ||||||
|  |         with self.assertRaisesRegex(SyntaxError, expected.decode()) as cm: | ||||||
|  |             exec(src) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase): | class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase): | ||||||
| 
 | 
 | ||||||
|  | @ -336,6 +416,14 @@ def check_script_output(self, src, expected): | ||||||
|             res = script_helper.assert_python_ok(fn) |             res = script_helper.assert_python_ok(fn) | ||||||
|         self.assertEqual(res.out.rstrip(), expected) |         self.assertEqual(res.out.rstrip(), expected) | ||||||
| 
 | 
 | ||||||
|  |     def check_script_error(self, src, expected): | ||||||
|  |         with tempfile.TemporaryDirectory() as tmpd: | ||||||
|  |             fn = os.path.join(tmpd, 'test.py') | ||||||
|  |             with open(fn, 'wb') as fp: | ||||||
|  |                 fp.write(src) | ||||||
|  |             res = script_helper.assert_python_failure(fn) | ||||||
|  |         self.assertRegex(res.err.rstrip().splitlines()[-1], b'SyntaxError.*?' + expected) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||||
|     unittest.main() |     unittest.main() | ||||||
|  |  | ||||||
|  | @ -1342,7 +1342,8 @@ def readline(): | ||||||
| 
 | 
 | ||||||
|     def test_no_bom_no_encoding_cookie(self): |     def test_no_bom_no_encoding_cookie(self): | ||||||
|         lines = ( |         lines = ( | ||||||
|             b'# something\n', |             b'#!/home/\xc3\xa4/bin/python\n', | ||||||
|  |             b'# something \xe2\x82\xac\n', | ||||||
|             b'print(something)\n', |             b'print(something)\n', | ||||||
|             b'do_something(else)\n' |             b'do_something(else)\n' | ||||||
|         ) |         ) | ||||||
|  | @ -1350,16 +1351,54 @@ def test_no_bom_no_encoding_cookie(self): | ||||||
|         self.assertEqual(encoding, 'utf-8') |         self.assertEqual(encoding, 'utf-8') | ||||||
|         self.assertEqual(consumed_lines, list(lines[:2])) |         self.assertEqual(consumed_lines, list(lines[:2])) | ||||||
| 
 | 
 | ||||||
|  |     def test_no_bom_no_encoding_cookie_first_line_error(self): | ||||||
|  |         lines = ( | ||||||
|  |             b'#!/home/\xa4/bin/python\n\n', | ||||||
|  |             b'print(something)\n', | ||||||
|  |             b'do_something(else)\n' | ||||||
|  |         ) | ||||||
|  |         with self.assertRaises(SyntaxError): | ||||||
|  |             tokenize.detect_encoding(self.get_readline(lines)) | ||||||
|  | 
 | ||||||
|  |     def test_no_bom_no_encoding_cookie_second_line_error(self): | ||||||
|  |         lines = ( | ||||||
|  |             b'#!/usr/bin/python\n', | ||||||
|  |             b'# something \xe2\n', | ||||||
|  |             b'print(something)\n', | ||||||
|  |             b'do_something(else)\n' | ||||||
|  |         ) | ||||||
|  |         with self.assertRaises(SyntaxError): | ||||||
|  |             tokenize.detect_encoding(self.get_readline(lines)) | ||||||
|  | 
 | ||||||
|     def test_bom_no_cookie(self): |     def test_bom_no_cookie(self): | ||||||
|         lines = ( |         lines = ( | ||||||
|             b'\xef\xbb\xbf# something\n', |             b'\xef\xbb\xbf#!/home/\xc3\xa4/bin/python\n', | ||||||
|             b'print(something)\n', |             b'print(something)\n', | ||||||
|             b'do_something(else)\n' |             b'do_something(else)\n' | ||||||
|         ) |         ) | ||||||
|         encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) |         encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) | ||||||
|         self.assertEqual(encoding, 'utf-8-sig') |         self.assertEqual(encoding, 'utf-8-sig') | ||||||
|         self.assertEqual(consumed_lines, |         self.assertEqual(consumed_lines, | ||||||
|                          [b'# something\n', b'print(something)\n']) |                          [b'#!/home/\xc3\xa4/bin/python\n', b'print(something)\n']) | ||||||
|  | 
 | ||||||
|  |     def test_bom_no_cookie_first_line_error(self): | ||||||
|  |         lines = ( | ||||||
|  |             b'\xef\xbb\xbf#!/home/\xa4/bin/python\n', | ||||||
|  |             b'print(something)\n', | ||||||
|  |             b'do_something(else)\n' | ||||||
|  |         ) | ||||||
|  |         with self.assertRaises(SyntaxError): | ||||||
|  |             tokenize.detect_encoding(self.get_readline(lines)) | ||||||
|  | 
 | ||||||
|  |     def test_bom_no_cookie_second_line_error(self): | ||||||
|  |         lines = ( | ||||||
|  |             b'\xef\xbb\xbf#!/usr/bin/python\n', | ||||||
|  |             b'# something \xe2\n', | ||||||
|  |             b'print(something)\n', | ||||||
|  |             b'do_something(else)\n' | ||||||
|  |         ) | ||||||
|  |         with self.assertRaises(SyntaxError): | ||||||
|  |             tokenize.detect_encoding(self.get_readline(lines)) | ||||||
| 
 | 
 | ||||||
|     def test_cookie_first_line_no_bom(self): |     def test_cookie_first_line_no_bom(self): | ||||||
|         lines = ( |         lines = ( | ||||||
|  | @ -1435,17 +1474,6 @@ def test_cookie_second_line_noncommented_first_line(self): | ||||||
|         expected = [b"print('\xc2\xa3')\n"] |         expected = [b"print('\xc2\xa3')\n"] | ||||||
|         self.assertEqual(consumed_lines, expected) |         self.assertEqual(consumed_lines, expected) | ||||||
| 
 | 
 | ||||||
|     def test_cookie_second_line_commented_first_line(self): |  | ||||||
|         lines = ( |  | ||||||
|             b"#print('\xc2\xa3')\n", |  | ||||||
|             b'# vim: set fileencoding=iso8859-15 :\n', |  | ||||||
|             b"print('\xe2\x82\xac')\n" |  | ||||||
|         ) |  | ||||||
|         encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) |  | ||||||
|         self.assertEqual(encoding, 'iso8859-15') |  | ||||||
|         expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n'] |  | ||||||
|         self.assertEqual(consumed_lines, expected) |  | ||||||
| 
 |  | ||||||
|     def test_cookie_second_line_empty_first_line(self): |     def test_cookie_second_line_empty_first_line(self): | ||||||
|         lines = ( |         lines = ( | ||||||
|             b'\n', |             b'\n', | ||||||
|  | @ -1457,6 +1485,48 @@ def test_cookie_second_line_empty_first_line(self): | ||||||
|         expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n'] |         expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n'] | ||||||
|         self.assertEqual(consumed_lines, expected) |         self.assertEqual(consumed_lines, expected) | ||||||
| 
 | 
 | ||||||
|  |     def test_cookie_third_line(self): | ||||||
|  |         lines = ( | ||||||
|  |             b'#!/home/\xc3\xa4/bin/python\n', | ||||||
|  |             b'# something\n', | ||||||
|  |             b'# vim: set fileencoding=ascii :\n', | ||||||
|  |             b'print(something)\n', | ||||||
|  |             b'do_something(else)\n' | ||||||
|  |         ) | ||||||
|  |         encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) | ||||||
|  |         self.assertEqual(encoding, 'utf-8') | ||||||
|  |         self.assertEqual(consumed_lines, list(lines[:2])) | ||||||
|  | 
 | ||||||
|  |     def test_double_coding_line(self): | ||||||
|  |         # If the first line matches the second line is ignored. | ||||||
|  |         lines = ( | ||||||
|  |             b'#coding:iso8859-15\n', | ||||||
|  |             b'#coding:latin1\n', | ||||||
|  |             b'print(something)\n' | ||||||
|  |         ) | ||||||
|  |         encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) | ||||||
|  |         self.assertEqual(encoding, 'iso8859-15') | ||||||
|  |         self.assertEqual(consumed_lines, list(lines[:1])) | ||||||
|  | 
 | ||||||
|  |     def test_double_coding_same_line(self): | ||||||
|  |         lines = ( | ||||||
|  |             b'#coding:iso8859-15 coding:latin1\n', | ||||||
|  |             b'print(something)\n' | ||||||
|  |         ) | ||||||
|  |         encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) | ||||||
|  |         self.assertEqual(encoding, 'iso8859-15') | ||||||
|  |         self.assertEqual(consumed_lines, list(lines[:1])) | ||||||
|  | 
 | ||||||
|  |     def test_double_coding_utf8(self): | ||||||
|  |         lines = ( | ||||||
|  |             b'#coding:utf-8\n', | ||||||
|  |             b'#coding:latin1\n', | ||||||
|  |             b'print(something)\n' | ||||||
|  |         ) | ||||||
|  |         encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) | ||||||
|  |         self.assertEqual(encoding, 'utf-8') | ||||||
|  |         self.assertEqual(consumed_lines, list(lines[:1])) | ||||||
|  | 
 | ||||||
|     def test_latin1_normalization(self): |     def test_latin1_normalization(self): | ||||||
|         # See get_normal_name() in Parser/tokenizer/helpers.c. |         # See get_normal_name() in Parser/tokenizer/helpers.c. | ||||||
|         encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix", |         encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix", | ||||||
|  | @ -1481,7 +1551,6 @@ def test_syntaxerror_latin1(self): | ||||||
|         readline = self.get_readline(lines) |         readline = self.get_readline(lines) | ||||||
|         self.assertRaises(SyntaxError, tokenize.detect_encoding, readline) |         self.assertRaises(SyntaxError, tokenize.detect_encoding, readline) | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
|     def test_utf8_normalization(self): |     def test_utf8_normalization(self): | ||||||
|         # See get_normal_name() in Parser/tokenizer/helpers.c. |         # See get_normal_name() in Parser/tokenizer/helpers.c. | ||||||
|         encodings = ("utf-8", "utf-8-mac", "utf-8-unix") |         encodings = ("utf-8", "utf-8-mac", "utf-8-unix") | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Miss Islington (bot)
						Miss Islington (bot)