Mirror of https://github.com/python/cpython.git, synced 2025-12-08 06:10:17 +00:00
gh-63161: Fix PEP 263 support (GH-139481)
* Support non-UTF-8 shebang and comments if non-UTF-8 encoding is specified.
* Detect decoding error in comments for UTF-8 encoding.
* Include the decoding error position for default encoding in SyntaxError.
Parent: d0b18b19fa
Commit: 5c942f11cd
9 changed files with 210 additions and 46 deletions
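
A hedged sketch of what the three bullet points above mean in practice; the
file names are placeholders, and the exact error text and offsets depend on
the interpreter version:

    # With an explicit non-UTF-8 coding declaration, non-UTF-8 bytes in a
    # comment decode with that encoding; interpreters without this fix may
    # reject the source instead.
    compile(b"# coding: latin-1\n# caf\xe9\nx = 1\n", "<latin-1 source>", "exec")

    # Without a declaration the default is UTF-8, so the stray byte is a
    # SyntaxError, and the fix attaches its position to the exception.
    try:
        compile(b"# caf\xe9\nx = 1\n", "<default source>", "exec")
    except SyntaxError as exc:
        print(exc.msg)
        print(exc.lineno, exc.offset)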
@@ -47,8 +47,10 @@ _syntaxerror_range(struct tok_state *tok, const char *format,
         goto error;
     }
 
-    args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
-                         col_offset, errtext, tok->lineno, end_col_offset);
+    args = Py_BuildValue("(O(OiiNii))", errmsg,
+                         tok->filename ? tok->filename : Py_None,
+                         tok->lineno, col_offset, errtext,
+                         tok->lineno, end_col_offset);
     if (args) {
         PyErr_SetObject(PyExc_SyntaxError, args);
         Py_DECREF(args);
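
Read from the Python side, this hunk builds the standard SyntaxError detail
tuple, with None standing in for a missing filename, and only raises if
Py_BuildValue succeeded. A rough sketch assuming Python 3.10+, where
SyntaxError accepts the six-item detail tuple; raise_tokenizer_error is a
hypothetical helper, not a CPython API:

    def raise_tokenizer_error(msg, lineno, col_offset, text, end_col_offset,
                              filename=None):
        # Same shape as Py_BuildValue("(O(OiiNii))", ...) above:
        # (msg, (filename, lineno, offset, text, end_lineno, end_offset)).
        # A missing filename is passed as None, mirroring the Py_None fallback.
        raise SyntaxError(msg, (filename, lineno, col_offset, text,
                                lineno, end_col_offset))

    try:
        raise_tokenizer_error("encoding problem: latin-1 with BOM",
                              1, 1, "# -*- coding: latin-1 -*-\n", 26)
    except SyntaxError as exc:
        print(exc.filename, exc.lineno, exc.offset, exc.end_lineno, exc.end_offset)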
@@ -422,10 +424,13 @@ _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_sta
         tok->encoding = cs;
     } else {                /* then, compare cs with BOM */
         if (strcmp(tok->encoding, cs) != 0) {
-            _PyTokenizer_error_ret(tok);
-            PyErr_Format(PyExc_SyntaxError,
-                         "encoding problem: %s with BOM", cs);
+            tok->line_start = line;
+            tok->cur = (char *)line;
+            assert(size <= INT_MAX);
+            _PyTokenizer_syntaxerror_known_range(tok, 0, (int)size,
+                         "encoding problem: %s with BOM", cs);
             PyMem_Free(cs);
+            _PyTokenizer_error_ret(tok);
             return 0;
         }
         PyMem_Free(cs);
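
This hunk routes the BOM/cookie mismatch through
_PyTokenizer_syntaxerror_known_range, so the exception carries the range of
the offending line instead of being a bare message. A hedged illustration
from the Python side; the exact message text and reported columns are
version-dependent:

    # UTF-8 BOM followed by a conflicting coding cookie.
    bom_source = b"\xef\xbb\xbf# -*- coding: latin-1 -*-\nx = 1\n"
    try:
        compile(bom_source, "<bom source>", "exec")
    except SyntaxError as exc:
        # Something like "encoding problem: iso-8859-1 with BOM", now with
        # a line/column range attached.
        print(exc.msg)
        print(exc.lineno, exc.offset, exc.end_offset)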
@@ -496,24 +501,38 @@ valid_utf8(const unsigned char* s)
 }
 
 int
-_PyTokenizer_ensure_utf8(char *line, struct tok_state *tok)
+_PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno)
 {
-    int badchar = 0;
-    unsigned char *c;
+    const char *badchar = NULL;
+    const char *c;
     int length;
-    for (c = (unsigned char *)line; *c; c += length) {
-        if (!(length = valid_utf8(c))) {
-            badchar = *c;
+    int col_offset = 0;
+    const char *line_start = line;
+    for (c = line; *c; c += length) {
+        if (!(length = valid_utf8((const unsigned char *)c))) {
+            badchar = c;
             break;
         }
+        col_offset++;
+        if (*c == '\n') {
+            lineno++;
+            col_offset = 0;
+            line_start = c + 1;
+        }
     }
     if (badchar) {
-        PyErr_Format(PyExc_SyntaxError,
-                     "Non-UTF-8 code starting with '\\x%.2x' "
-                     "in file %U on line %i, "
-                     "but no encoding declared; "
-                     "see https://peps.python.org/pep-0263/ for details",
-                     badchar, tok->filename, tok->lineno);
+        tok->lineno = lineno;
+        tok->line_start = line_start;
+        tok->cur = (char *)badchar;
+        _PyTokenizer_syntaxerror_known_range(tok,
+                     col_offset + 1, col_offset + 1,
+                     "Non-UTF-8 code starting with '\\x%.2x'"
+                     "%s%V on line %i, "
+                     "but no encoding declared; "
+                     "see https://peps.python.org/pep-0263/ for details",
+                     (unsigned char)*badchar,
+                     tok->filename ? " in file " : "", tok->filename, "",
+                     lineno);
         return 0;
     }
     return 1;
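
The last hunk makes _PyTokenizer_ensure_utf8 count lines and columns while it
scans, so the "Non-UTF-8 code starting with ..." error can point at the first
offending byte. A rough Python analogue of that bookkeeping, with the caveat
that it counts bytes within the line while the C loop counts decoded
characters (the two agree for ASCII prefixes); first_invalid_utf8_position is
an illustrative helper, not part of CPython:

    def first_invalid_utf8_position(data: bytes):
        # Return (lineno, col_offset) of the first byte that is not part of
        # valid UTF-8, or None if the whole buffer decodes cleanly.
        try:
            data.decode("utf-8")
        except UnicodeDecodeError as exc:
            prefix = data[:exc.start]
            lineno = prefix.count(b"\n") + 1
            col_offset = exc.start - (prefix.rfind(b"\n") + 1)
            return lineno, col_offset
        return None

    print(first_invalid_utf8_position(b"x = 1\n# caf\xe9\n"))  # (2, 5)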