Mirror of https://github.com/python/cpython.git, synced 2025-12-08 06:10:17 +00:00.
gh-63161: Fix tokenize.detect_encoding() (GH-139446)

* Support non-UTF-8 shebang lines and comments when a non-UTF-8 encoding is specified.
* Detect decoding errors for non-UTF-8 encodings.
* Detect null bytes in source code.
This commit is contained in:
parent
3222ea0f14
commit
38d4b436ca
3 changed files with 94 additions and 8 deletions
|
|
@ -1495,6 +1495,61 @@ def test_cookie_second_line_noncommented_first_line(self):
|
|||
expected = [b"print('\xc2\xa3')\n"]
|
||||
self.assertEqual(consumed_lines, expected)
|
||||
|
||||
def test_first_non_utf8_coding_line(self):
    """Bytes invalid as UTF-8 after a line-1 coding cookie are accepted.

    The trailing b'\xa4' is valid iso-8859-15, so the declared codec —
    not UTF-8 — governs the decodability check.
    """
    source_lines = [
        b'#coding:iso-8859-15 \xa4\n',
        b'print(something)\n',
    ]
    readline = self.get_readline(tuple(source_lines))
    encoding, consumed = tokenize.detect_encoding(readline)
    self.assertEqual(encoding, 'iso-8859-15')
    # Only the cookie line should have been read ahead.
    self.assertEqual(consumed, source_lines[:1])
def test_first_utf8_coding_line_error(self):
    """A line-1 cookie declaring a codec that cannot decode that line raises.

    b'\xc3\xa4' is valid UTF-8 but not valid ASCII, and the cookie says
    ascii, so detect_encoding() must report a SyntaxError.
    """
    source = (
        b'#coding:ascii \xc3\xa4\n',
        b'print(something)\n',
    )
    with self.assertRaises(SyntaxError):
        tokenize.detect_encoding(self.get_readline(source))
def test_second_non_utf8_coding_line(self):
    """Bytes invalid as UTF-8 after a line-2 coding cookie are accepted."""
    source_lines = [
        b'#!/usr/bin/python\n',
        b'#coding:iso-8859-15 \xa4\n',
        b'print(something)\n',
    ]
    readline = self.get_readline(tuple(source_lines))
    encoding, consumed = tokenize.detect_encoding(readline)
    self.assertEqual(encoding, 'iso-8859-15')
    # Shebang plus cookie line are consumed; the code line is not.
    self.assertEqual(consumed, source_lines[:2])
def test_second_utf8_coding_line_error(self):
    """A line-2 cookie declaring a codec that cannot decode that line raises."""
    source = (
        b'#!/usr/bin/python\n',
        b'#coding:ascii \xc3\xa4\n',
        b'print(something)\n',
    )
    with self.assertRaises(SyntaxError):
        tokenize.detect_encoding(self.get_readline(source))
def test_non_utf8_shebang(self):
    """A shebang containing non-UTF-8 bytes is fine when the cookie's codec decodes it.

    b'\xa4' in the shebang is invalid UTF-8 but valid iso-8859-15, the
    encoding declared on line 2.
    """
    source_lines = [
        b'#!/home/\xa4/bin/python\n',
        b'#coding:iso-8859-15\n',
        b'print(something)\n',
    ]
    readline = self.get_readline(tuple(source_lines))
    encoding, consumed = tokenize.detect_encoding(readline)
    self.assertEqual(encoding, 'iso-8859-15')
    self.assertEqual(consumed, source_lines[:2])
def test_utf8_shebang_error(self):
    """A shebang the declared codec cannot decode raises SyntaxError.

    b'\xc3\xa4' is UTF-8 but not ASCII; line 2 declares ascii.
    """
    source = (
        b'#!/home/\xc3\xa4/bin/python\n',
        b'#coding:ascii\n',
        b'print(something)\n',
    )
    with self.assertRaises(SyntaxError):
        tokenize.detect_encoding(self.get_readline(source))
def test_cookie_second_line_empty_first_line(self):
|
||||
lines = (
|
||||
b'\n',
|
||||
|
|
@ -1548,6 +1603,28 @@ def test_double_coding_utf8(self):
|
|||
self.assertEqual(encoding, 'utf-8')
|
||||
self.assertEqual(consumed_lines, list(lines[:1]))
|
||||
|
||||
def test_nul_in_first_coding_line(self):
    """A NUL byte on the line-1 coding cookie line is rejected outright."""
    source = (
        b'#coding:iso8859-15\x00\n',
        b'\n',
        b'\n',
        b'print(something)\n',
    )
    expected_message = "source code cannot contain null bytes"
    with self.assertRaisesRegex(SyntaxError, expected_message):
        tokenize.detect_encoding(self.get_readline(source))
def test_nul_in_second_coding_line(self):
    """A NUL byte on a line-2 coding cookie line is rejected outright."""
    source = (
        b'#!/usr/bin/python\n',
        b'#coding:iso8859-15\x00\n',
        b'\n',
        b'print(something)\n',
    )
    expected_message = "source code cannot contain null bytes"
    with self.assertRaisesRegex(SyntaxError, expected_message):
        tokenize.detect_encoding(self.get_readline(source))
def test_latin1_normalization(self):
|
||||
# See get_normal_name() in Parser/tokenizer/helpers.c.
|
||||
encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue