gh-63161: Fix PEP 263 support (GH-139481)

* Support non-UTF-8 shebang and comments if non-UTF-8 encoding is specified.
* Detect decoding error in comments for UTF-8 encoding.
* Include the decoding error position for default encoding in SyntaxError.
This commit is contained in:
Serhiy Storchaka 2025-10-10 15:51:19 +03:00 committed by GitHub
parent d0b18b19fa
commit 5c942f11cd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 210 additions and 46 deletions

View file

@ -1,7 +1,8 @@
# -*- coding: utf-8 -*-
import unittest
from test.support import script_helper, captured_stdout, requires_subprocess, requires_resource
from test import support
from test.support import script_helper
from test.support.os_helper import TESTFN, unlink, rmtree
from test.support.import_helper import unload
import importlib
@ -64,7 +65,7 @@ def test_issue7820(self):
# two bytes in common with the UTF-8 BOM
self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')
@requires_subprocess()
@support.requires_subprocess()
def test_20731(self):
sub = subprocess.Popen([sys.executable,
os.path.join(os.path.dirname(__file__),
@ -267,6 +268,17 @@ def test_second_non_utf8_coding_line(self):
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xc3\u20ac'")
def test_first_utf8_coding_line_error(self):
src = (b'#coding:ascii \xc3\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte")
def test_second_utf8_coding_line_error(self):
src = (b'#!/usr/bin/python\n'
b'#coding:ascii \xc3\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte")
def test_utf8_bom(self):
src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xe4'")
@ -282,10 +294,80 @@ def test_utf8_bom_and_utf8_coding_line(self):
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xe4'")
def test_utf8_non_utf8_comment_line_error(self):
def test_utf8_bom_and_non_utf8_first_coding_line(self):
src = (b'\xef\xbb\xbf#coding:iso-8859-15\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"encoding problem: iso-8859-15 with BOM",
lineno=1)
def test_utf8_bom_and_non_utf8_second_coding_line(self):
src = (b'\xef\xbb\xbf#first\n'
b'#coding:iso-8859-15\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"encoding problem: iso-8859-15 with BOM",
lineno=2)
def test_non_utf8_shebang(self):
src = (b'#!/home/\xa4/bin/python\n'
b'#coding:iso-8859-15\n'
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xc3\u20ac'")
def test_utf8_shebang_error(self):
src = (b'#!/home/\xc3\xa4/bin/python\n'
b'#coding:ascii\n'
b'raise RuntimeError\n')
self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte")
def test_non_utf8_shebang_error(self):
src = (b'#!/home/\xa4/bin/python\n'
b'raise RuntimeError\n')
self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 1",
lineno=1)
def test_non_utf8_second_line_error(self):
src = (b'#first\n'
b'#second\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"Non-UTF-8 code starting with .* on line 2",
lineno=2)
def test_non_utf8_third_line_error(self):
src = (b'#first\n'
b'#second\n'
b'#third\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"Non-UTF-8 code starting with .* on line 3",
lineno=3)
def test_utf8_bom_non_utf8_third_line_error(self):
src = (b'\xef\xbb\xbf#first\n'
b'#second\n'
b'#third\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"Non-UTF-8 code starting with .* on line 3|"
br"'utf-8' codec can't decode byte",
lineno=3)
def test_utf_8_non_utf8_third_line_error(self):
src = (b'#coding: utf-8\n'
b'#second\n'
b'#third\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"Non-UTF-8 code starting with .* on line 3|"
br"'utf-8' codec can't decode byte",
lineno=3)
def test_utf8_non_utf8_third_line_error(self):
src = (b'#coding: utf8\n'
b'#\n'
b'#\xa4\n'
b'#second\n'
b'#third\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"'utf-8' codec can't decode byte|"
@ -326,7 +408,7 @@ def test_nul_in_second_coding_line(self):
class UTF8ValidatorTest(unittest.TestCase):
@unittest.skipIf(not sys.platform.startswith("linux"),
"Too slow to run on non-Linux platforms")
@requires_resource('cpu')
@support.requires_resource('cpu')
def test_invalid_utf8(self):
# This is a port of test_utf8_decode_invalid_sequences in
# test_unicode.py to exercise the separate utf8 validator in
@ -392,19 +474,29 @@ def check(content):
check(b'\xF4'+cb+b'\xBF\xBF')
@support.force_not_colorized_test_class
class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
def check_script_output(self, src, expected):
with captured_stdout() as stdout:
with support.captured_stdout() as stdout:
exec(src)
out = stdout.getvalue().encode('latin1')
self.assertEqual(out.rstrip(), expected)
def check_script_error(self, src, expected):
with self.assertRaisesRegex(SyntaxError, expected.decode()) as cm:
def check_script_error(self, src, expected, lineno=...):
with self.assertRaises(SyntaxError) as cm:
exec(src)
exc = cm.exception
self.assertRegex(str(exc), expected.decode())
if lineno is not ...:
self.assertEqual(exc.lineno, lineno)
line = src.splitlines()[lineno-1].decode(errors='replace')
if lineno == 1:
line = line.removeprefix('\ufeff')
self.assertEqual(line, exc.text)
@support.force_not_colorized_test_class
class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
def check_script_output(self, src, expected):
@ -415,13 +507,22 @@ def check_script_output(self, src, expected):
res = script_helper.assert_python_ok(fn)
self.assertEqual(res.out.rstrip(), expected)
def check_script_error(self, src, expected):
def check_script_error(self, src, expected, lineno=...):
with tempfile.TemporaryDirectory() as tmpd:
fn = os.path.join(tmpd, 'test.py')
with open(fn, 'wb') as fp:
fp.write(src)
res = script_helper.assert_python_failure(fn)
self.assertRegex(res.err.rstrip().splitlines()[-1], b'SyntaxError.*?' + expected)
err = res.err.rstrip()
self.assertRegex(err.splitlines()[-1], b'SyntaxError: ' + expected)
if lineno is not ...:
self.assertIn(f', line {lineno}\n'.encode(),
err.replace(os.linesep.encode(), b'\n'))
line = src.splitlines()[lineno-1].decode(errors='replace')
if lineno == 1:
line = line.removeprefix('\ufeff')
self.assertIn(line.encode(), err)
if __name__ == "__main__":