gh-63161: Fix PEP 263 support (GH-139481)

* Support non-UTF-8 shebang and comments if non-UTF-8 encoding is specified.
* Detect decoding error in comments for UTF-8 encoding.
* Include the decoding error position for default encoding in SyntaxError.
This commit is contained in:
Serhiy Storchaka 2025-10-10 15:51:19 +03:00 committed by GitHub
parent d0b18b19fa
commit 5c942f11cd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 210 additions and 46 deletions

View file

@ -47,8 +47,10 @@ _syntaxerror_range(struct tok_state *tok, const char *format,
goto error;
}
args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
col_offset, errtext, tok->lineno, end_col_offset);
args = Py_BuildValue("(O(OiiNii))", errmsg,
tok->filename ? tok->filename : Py_None,
tok->lineno, col_offset, errtext,
tok->lineno, end_col_offset);
if (args) {
PyErr_SetObject(PyExc_SyntaxError, args);
Py_DECREF(args);
@ -422,10 +424,13 @@ _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_sta
tok->encoding = cs;
} else { /* then, compare cs with BOM */
if (strcmp(tok->encoding, cs) != 0) {
_PyTokenizer_error_ret(tok);
PyErr_Format(PyExc_SyntaxError,
"encoding problem: %s with BOM", cs);
tok->line_start = line;
tok->cur = (char *)line;
assert(size <= INT_MAX);
_PyTokenizer_syntaxerror_known_range(tok, 0, (int)size,
"encoding problem: %s with BOM", cs);
PyMem_Free(cs);
_PyTokenizer_error_ret(tok);
return 0;
}
PyMem_Free(cs);
@ -496,24 +501,38 @@ valid_utf8(const unsigned char* s)
}
int
_PyTokenizer_ensure_utf8(char *line, struct tok_state *tok)
_PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno)
{
int badchar = 0;
unsigned char *c;
const char *badchar = NULL;
const char *c;
int length;
for (c = (unsigned char *)line; *c; c += length) {
if (!(length = valid_utf8(c))) {
badchar = *c;
int col_offset = 0;
const char *line_start = line;
for (c = line; *c; c += length) {
if (!(length = valid_utf8((const unsigned char *)c))) {
badchar = c;
break;
}
col_offset++;
if (*c == '\n') {
lineno++;
col_offset = 0;
line_start = c + 1;
}
}
if (badchar) {
PyErr_Format(PyExc_SyntaxError,
"Non-UTF-8 code starting with '\\x%.2x' "
"in file %U on line %i, "
"but no encoding declared; "
"see https://peps.python.org/pep-0263/ for details",
badchar, tok->filename, tok->lineno);
tok->lineno = lineno;
tok->line_start = line_start;
tok->cur = (char *)badchar;
_PyTokenizer_syntaxerror_known_range(tok,
col_offset + 1, col_offset + 1,
"Non-UTF-8 code starting with '\\x%.2x'"
"%s%V on line %i, "
"but no encoding declared; "
"see https://peps.python.org/pep-0263/ for details",
(unsigned char)*badchar,
tok->filename ? " in file " : "", tok->filename, "",
lineno);
return 0;
}
return 1;