[3.13] gh-63161: Add more tests for source encoding (GH-139440) (#139443)

(cherry picked from commit b2f5ad0c6d)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Miss Islington (bot) authored 2025-10-07 13:34:08 +02:00, committed by GitHub
commit 4216ea7c81 (parent 7bb51bd5be)
2 changed files with 179 additions and 22 deletions
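
The new tests exercise PEP 263 source-encoding detection: coding cookies are only honoured on the first two lines, a UTF-8 BOM overrides or must agree with the cookie, and malformed lines raise SyntaxError. As a minimal sketch (not part of this commit; the sample source bytes are illustrative), the behaviour under test can be observed with tokenize.detect_encoding:

    import io
    import tokenize

    # Illustrative source: shebang on line 1, PEP 263 cookie on line 2.
    src = (b'#!/usr/bin/python\n'
           b'#coding:iso8859-15\n'
           b'print(ascii("\xc3\xa4"))\n')

    encoding, consumed = tokenize.detect_encoding(io.BytesIO(src).readline)
    print(encoding)   # 'iso8859-15' -- the cookie on line 2 is honoured
    print(consumed)   # the first two lines, read while detecting the encoding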

Lib/test/test_source_encoding.py

@@ -173,6 +173,8 @@ def test_tokenizer_fstring_warning_in_first_line(self):
             os.unlink(TESTFN)
 
 
+BUFSIZ = 2**13
+
 class AbstractSourceEncodingTest:
 
     def test_default_coding(self):
@@ -185,14 +187,20 @@ def test_first_coding_line(self):
         self.check_script_output(src, br"'\xc3\u20ac'")
 
     def test_second_coding_line(self):
-        src = (b'#\n'
+        src = (b'#!/usr/bin/python\n'
+               b'#coding:iso8859-15\n'
+               b'print(ascii("\xc3\xa4"))\n')
+        self.check_script_output(src, br"'\xc3\u20ac'")
+
+    def test_second_coding_line_empty_first_line(self):
+        src = (b'\n'
                b'#coding:iso8859-15\n'
                b'print(ascii("\xc3\xa4"))\n')
         self.check_script_output(src, br"'\xc3\u20ac'")
 
     def test_third_coding_line(self):
         # Only first two lines are tested for a magic comment.
-        src = (b'#\n'
+        src = (b'#!/usr/bin/python\n'
                b'#\n'
                b'#coding:iso8859-15\n'
                b'print(ascii("\xc3\xa4"))\n')
@@ -210,13 +218,52 @@ def test_double_coding_same_line(self):
                b'print(ascii("\xc3\xa4"))\n')
         self.check_script_output(src, br"'\xc3\u20ac'")
 
+    def test_double_coding_utf8(self):
+        src = (b'#coding:utf-8\n'
+               b'#coding:latin1\n'
+               b'print(ascii("\xc3\xa4"))\n')
+        self.check_script_output(src, br"'\xe4'")
+
+    def test_long_first_coding_line(self):
+        src = (b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n'
+               b'print(ascii("\xc3\xa4"))\n')
+        self.check_script_output(src, br"'\xc3\u20ac'")
+
+    def test_long_second_coding_line(self):
+        src = (b'#!/usr/bin/python\n'
+               b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n'
+               b'print(ascii("\xc3\xa4"))\n')
+        self.check_script_output(src, br"'\xc3\u20ac'")
+
+    def test_long_coding_line(self):
+        src = (b'#coding:iso-8859-15' + b' '*BUFSIZ + b'\n'
+               b'print(ascii("\xc3\xa4"))\n')
+        self.check_script_output(src, br"'\xc3\u20ac'")
+
+    def test_long_coding_name(self):
+        src = (b'#coding:iso-8859-1-' + b'x'*BUFSIZ + b'\n'
+               b'print(ascii("\xc3\xa4"))\n')
+        self.check_script_output(src, br"'\xc3\xa4'")
+
+    def test_long_first_utf8_line(self):
+        src = b'#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
+        self.check_script_output(src, b'')
+        src = b'# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
+        self.check_script_output(src, b'')
+
+    def test_long_second_utf8_line(self):
+        src = b'\n#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
+        self.check_script_output(src, b'')
+        src = b'\n# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
+        self.check_script_output(src, b'')
+
     def test_first_non_utf8_coding_line(self):
         src = (b'#coding:iso-8859-15 \xa4\n'
                b'print(ascii("\xc3\xa4"))\n')
         self.check_script_output(src, br"'\xc3\u20ac'")
 
     def test_second_non_utf8_coding_line(self):
-        src = (b'\n'
+        src = (b'#!/usr/bin/python\n'
                b'#coding:iso-8859-15 \xa4\n'
                b'print(ascii("\xc3\xa4"))\n')
         self.check_script_output(src, br"'\xc3\u20ac'")
@@ -225,27 +272,56 @@ def test_utf8_bom(self):
         src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
         self.check_script_output(src, br"'\xe4'")
 
+    def test_utf8_bom_utf8_comments(self):
+        src = (b'\xef\xbb\xbf#\xc3\xa4\n'
+               b'#\xc3\xa4\n'
+               b'print(ascii("\xc3\xa4"))\n')
+        self.check_script_output(src, br"'\xe4'")
+
     def test_utf8_bom_and_utf8_coding_line(self):
         src = (b'\xef\xbb\xbf#coding:utf-8\n'
                b'print(ascii("\xc3\xa4"))\n')
         self.check_script_output(src, br"'\xe4'")
 
+    def test_utf8_non_utf8_comment_line_error(self):
+        src = (b'#coding: utf8\n'
+               b'#\n'
+               b'#\xa4\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src,
+                                br"'utf-8' codec can't decode byte|"
+                                br"encoding problem: utf8")
+
     def test_crlf(self):
         src = (b'print(ascii("""\r\n"""))\n')
-        out = self.check_script_output(src, br"'\n'")
+        self.check_script_output(src, br"'\n'")
 
     def test_crcrlf(self):
         src = (b'print(ascii("""\r\r\n"""))\n')
-        out = self.check_script_output(src, br"'\n\n'")
+        self.check_script_output(src, br"'\n\n'")
 
     def test_crcrcrlf(self):
         src = (b'print(ascii("""\r\r\r\n"""))\n')
-        out = self.check_script_output(src, br"'\n\n\n'")
+        self.check_script_output(src, br"'\n\n\n'")
 
     def test_crcrcrlf2(self):
         src = (b'#coding:iso-8859-1\n'
                b'print(ascii("""\r\r\r\n"""))\n')
-        out = self.check_script_output(src, br"'\n\n\n'")
+        self.check_script_output(src, br"'\n\n\n'")
+
+    def test_nul_in_first_coding_line(self):
+        src = (b'#coding:iso8859-15\x00\n'
+               b'\n'
+               b'\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src, br"source code (string )?cannot contain null bytes")
+
+    def test_nul_in_second_coding_line(self):
+        src = (b'#!/usr/bin/python\n'
+               b'#coding:iso8859-15\x00\n'
+               b'\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src, br"source code (string )?cannot contain null bytes")
 
 
 class UTF8ValidatorTest(unittest.TestCase):
@@ -325,6 +401,10 @@ def check_script_output(self, src, expected):
         out = stdout.getvalue().encode('latin1')
         self.assertEqual(out.rstrip(), expected)
 
+    def check_script_error(self, src, expected):
+        with self.assertRaisesRegex(SyntaxError, expected.decode()) as cm:
+            exec(src)
+
 
 class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
@@ -336,6 +416,14 @@ def check_script_output(self, src, expected):
             res = script_helper.assert_python_ok(fn)
             self.assertEqual(res.out.rstrip(), expected)
 
+    def check_script_error(self, src, expected):
+        with tempfile.TemporaryDirectory() as tmpd:
+            fn = os.path.join(tmpd, 'test.py')
+            with open(fn, 'wb') as fp:
+                fp.write(src)
+            res = script_helper.assert_python_failure(fn)
+            self.assertRegex(res.err.rstrip().splitlines()[-1], b'SyntaxError.*?' + expected)
+
 
 if __name__ == "__main__":
     unittest.main()

Lib/test/test_tokenize.py

@@ -1342,7 +1342,8 @@ def readline():
 
     def test_no_bom_no_encoding_cookie(self):
         lines = (
-            b'# something\n',
+            b'#!/home/\xc3\xa4/bin/python\n',
+            b'# something \xe2\x82\xac\n',
             b'print(something)\n',
             b'do_something(else)\n'
         )
@@ -1350,16 +1351,54 @@ def test_no_bom_no_encoding_cookie(self):
         self.assertEqual(encoding, 'utf-8')
         self.assertEqual(consumed_lines, list(lines[:2]))
 
+    def test_no_bom_no_encoding_cookie_first_line_error(self):
+        lines = (
+            b'#!/home/\xa4/bin/python\n\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_no_bom_no_encoding_cookie_second_line_error(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'# something \xe2\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
     def test_bom_no_cookie(self):
         lines = (
-            b'\xef\xbb\xbf# something\n',
+            b'\xef\xbb\xbf#!/home/\xc3\xa4/bin/python\n',
             b'print(something)\n',
             b'do_something(else)\n'
         )
         encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
         self.assertEqual(encoding, 'utf-8-sig')
         self.assertEqual(consumed_lines,
-                         [b'# something\n', b'print(something)\n'])
+                         [b'#!/home/\xc3\xa4/bin/python\n', b'print(something)\n'])
+
+    def test_bom_no_cookie_first_line_error(self):
+        lines = (
+            b'\xef\xbb\xbf#!/home/\xa4/bin/python\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_bom_no_cookie_second_line_error(self):
+        lines = (
+            b'\xef\xbb\xbf#!/usr/bin/python\n',
+            b'# something \xe2\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
 
     def test_cookie_first_line_no_bom(self):
         lines = (
@@ -1435,17 +1474,6 @@ def test_cookie_second_line_noncommented_first_line(self):
         expected = [b"print('\xc2\xa3')\n"]
         self.assertEqual(consumed_lines, expected)
 
-    def test_cookie_second_line_commented_first_line(self):
-        lines = (
-            b"#print('\xc2\xa3')\n",
-            b'# vim: set fileencoding=iso8859-15 :\n',
-            b"print('\xe2\x82\xac')\n"
-        )
-        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
-        self.assertEqual(encoding, 'iso8859-15')
-        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
-        self.assertEqual(consumed_lines, expected)
-
     def test_cookie_second_line_empty_first_line(self):
         lines = (
             b'\n',
@@ -1457,6 +1485,48 @@ def test_cookie_second_line_empty_first_line(self):
         expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
         self.assertEqual(consumed_lines, expected)
 
+    def test_cookie_third_line(self):
+        lines = (
+            b'#!/home/\xc3\xa4/bin/python\n',
+            b'# something\n',
+            b'# vim: set fileencoding=ascii :\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'utf-8')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_double_coding_line(self):
+        # If the first line matches the second line is ignored.
+        lines = (
+            b'#coding:iso8859-15\n',
+            b'#coding:latin1\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso8859-15')
+        self.assertEqual(consumed_lines, list(lines[:1]))
+
+    def test_double_coding_same_line(self):
+        lines = (
+            b'#coding:iso8859-15 coding:latin1\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso8859-15')
+        self.assertEqual(consumed_lines, list(lines[:1]))
+
+    def test_double_coding_utf8(self):
+        lines = (
+            b'#coding:utf-8\n',
+            b'#coding:latin1\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'utf-8')
+        self.assertEqual(consumed_lines, list(lines[:1]))
+
     def test_latin1_normalization(self):
         # See get_normal_name() in Parser/tokenizer/helpers.c.
         encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
@@ -1481,7 +1551,6 @@ def test_syntaxerror_latin1(self):
         readline = self.get_readline(lines)
         self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
-
 
     def test_utf8_normalization(self):
         # See get_normal_name() in Parser/tokenizer/helpers.c.
         encodings = ("utf-8", "utf-8-mac", "utf-8-unix")