[3.14] gh-63161: Fix PEP 263 support (GH-139481) (GH-139898)

* Support non-UTF-8 shebang lines and comments if a non-UTF-8 encoding is specified.
* Detect decoding errors in comments for the UTF-8 encoding.
* Include the decoding error position for the default encoding in SyntaxError.
(cherry picked from commit 5c942f11cd)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Miss Islington (bot) 2025-10-19 21:16:33 +02:00 committed by GitHub
parent 69d263cfe1
commit 9ff705c6c5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 211 additions and 46 deletions

View file

@ -86,15 +86,18 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr
/* need to check line 1 and 2 separately since check_coding_spec
assumes a single line as input */
if (newl[0]) {
tok->lineno = 1;
if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
return NULL;
}
if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
tok->lineno = 2;
if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
tok, buf_setreadl))
return NULL;
}
}
tok->lineno = 0;
if (tok->enc != NULL) {
assert(utf8 == NULL);
utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
@ -102,6 +105,9 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr
return _PyTokenizer_error_ret(tok);
str = PyBytes_AS_STRING(utf8);
}
else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) {
return _PyTokenizer_error_ret(tok);
}
assert(tok->decoding_buffer == NULL);
tok->decoding_buffer = utf8; /* CAUTION */
return str;