gh-63161: Add more tests for source encoding (#139440)

2025-12-08 06:10:17 +00:00 · 2025-09-30 12:20:17 +03:00 · 2025-09-30 12:20:17 +03:00 · b2f5ad0c6d
commit b2f5ad0c6d
parent bc172ee830
2 changed files with 179 additions and 22 deletions
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@ -172,6 +172,8 @@ def test_tokenizer_fstring_warning_in_first_line(self):
            os.unlink(TESTFN)


+BUFSIZ = 2**13
+
 class AbstractSourceEncodingTest:

    def test_default_coding(self):
@ -184,14 +186,20 @@ def test_first_coding_line(self):
        self.check_script_output(src, br"'\xc3\u20ac'")

    def test_second_coding_line(self):
-        src = (b'#\n'
+        src = (b'#!/usr/bin/python\n'
+               b'#coding:iso8859-15\n'
+               b'print(ascii("\xc3\xa4"))\n')
+        self.check_script_output(src, br"'\xc3\u20ac'")
+
+    def test_second_coding_line_empty_first_line(self):
+        src = (b'\n'
               b'#coding:iso8859-15\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")

    def test_third_coding_line(self):
        # Only first two lines are tested for a magic comment.
-        src = (b'#\n'
+        src = (b'#!/usr/bin/python\n'
               b'#\n'
               b'#coding:iso8859-15\n'
               b'print(ascii("\xc3\xa4"))\n')
@ -209,13 +217,52 @@ def test_double_coding_same_line(self):
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")

+    def test_double_coding_utf8(self):
+        src = (b'#coding:utf-8\n'
+               b'#coding:latin1\n'
+               b'print(ascii("\xc3\xa4"))\n')
+        self.check_script_output(src, br"'\xe4'")
+
+    def test_long_first_coding_line(self):
+        src = (b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n'
+               b'print(ascii("\xc3\xa4"))\n')
+        self.check_script_output(src, br"'\xc3\u20ac'")
+
+    def test_long_second_coding_line(self):
+        src = (b'#!/usr/bin/python\n'
+               b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n'
+               b'print(ascii("\xc3\xa4"))\n')
+        self.check_script_output(src, br"'\xc3\u20ac'")
+
+    def test_long_coding_line(self):
+        src = (b'#coding:iso-8859-15' + b' '*BUFSIZ + b'\n'
+               b'print(ascii("\xc3\xa4"))\n')
+        self.check_script_output(src, br"'\xc3\u20ac'")
+
+    def test_long_coding_name(self):
+        src = (b'#coding:iso-8859-1-' + b'x'*BUFSIZ + b'\n'
+               b'print(ascii("\xc3\xa4"))\n')
+        self.check_script_output(src, br"'\xc3\xa4'")
+
+    def test_long_first_utf8_line(self):
+        src = b'#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
+        self.check_script_output(src, b'')
+        src = b'# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
+        self.check_script_output(src, b'')
+
+    def test_long_second_utf8_line(self):
+        src = b'\n#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
+        self.check_script_output(src, b'')
+        src = b'\n# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
+        self.check_script_output(src, b'')
+
    def test_first_non_utf8_coding_line(self):
        src = (b'#coding:iso-8859-15 \xa4\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")

    def test_second_non_utf8_coding_line(self):
-        src = (b'\n'
+        src = (b'#!/usr/bin/python\n'
               b'#coding:iso-8859-15 \xa4\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")
@ -224,27 +271,56 @@ def test_utf8_bom(self):
        src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xe4'")

+    def test_utf8_bom_utf8_comments(self):
+        src = (b'\xef\xbb\xbf#\xc3\xa4\n'
+               b'#\xc3\xa4\n'
+               b'print(ascii("\xc3\xa4"))\n')
+        self.check_script_output(src, br"'\xe4'")
+
    def test_utf8_bom_and_utf8_coding_line(self):
        src = (b'\xef\xbb\xbf#coding:utf-8\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xe4'")

+    def test_utf8_non_utf8_comment_line_error(self):
+        src = (b'#coding: utf8\n'
+               b'#\n'
+               b'#\xa4\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src,
+                br"'utf-8' codec can't decode byte|"
+                br"encoding problem: utf8")
+
    def test_crlf(self):
        src = (b'print(ascii("""\r\n"""))\n')
-        out = self.check_script_output(src, br"'\n'")
+        self.check_script_output(src, br"'\n'")

    def test_crcrlf(self):
        src = (b'print(ascii("""\r\r\n"""))\n')
-        out = self.check_script_output(src, br"'\n\n'")
+        self.check_script_output(src, br"'\n\n'")

    def test_crcrcrlf(self):
        src = (b'print(ascii("""\r\r\r\n"""))\n')
-        out = self.check_script_output(src, br"'\n\n\n'")
+        self.check_script_output(src, br"'\n\n\n'")

    def test_crcrcrlf2(self):
        src = (b'#coding:iso-8859-1\n'
               b'print(ascii("""\r\r\r\n"""))\n')
-        out = self.check_script_output(src, br"'\n\n\n'")
+        self.check_script_output(src, br"'\n\n\n'")
+
+    def test_nul_in_first_coding_line(self):
+        src = (b'#coding:iso8859-15\x00\n'
+               b'\n'
+               b'\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src, br"source code (string )?cannot contain null bytes")
+
+    def test_nul_in_second_coding_line(self):
+        src = (b'#!/usr/bin/python\n'
+               b'#coding:iso8859-15\x00\n'
+               b'\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src, br"source code (string )?cannot contain null bytes")


 class UTF8ValidatorTest(unittest.TestCase):
@ -324,6 +400,10 @@ def check_script_output(self, src, expected):
        out = stdout.getvalue().encode('latin1')
        self.assertEqual(out.rstrip(), expected)

+    def check_script_error(self, src, expected):  # expected: bytes regex
+        with self.assertRaisesRegex(SyntaxError, expected.decode()):
+            exec(src)
+

 class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):

@ -335,6 +415,14 @@ def check_script_output(self, src, expected):
            res = script_helper.assert_python_ok(fn)
        self.assertEqual(res.out.rstrip(), expected)

+    def check_script_error(self, src, expected):  # expected: bytes regex, may use '|'
+        with tempfile.TemporaryDirectory() as tmpd:
+            fn = os.path.join(tmpd, 'test.py')
+            with open(fn, 'wb') as fp:
+                fp.write(src)
+            res = script_helper.assert_python_failure(fn)
+        self.assertRegex(res.err.rstrip().splitlines()[-1], b'SyntaxError.*?(?:' + expected + b')')
+

 if __name__ == "__main__":
    unittest.main()