[3.14] gh-63161: Fix PEP 263 support (GH-139481) (GH-139898)

* Support non-UTF-8 shebang and comments if a non-UTF-8 encoding is specified.
* Detect decoding errors in comments for the UTF-8 encoding.
* Include the decoding error position for the default encoding in SyntaxError.

(cherry picked from commit 5c942f11cd)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
2025-12-08 06:10:17 +00:00 · 2025-10-19 21:16:33 +02:00 · 2025-10-19 21:16:33 +02:00 · 9ff705c6c5
commit 9ff705c6c5
parent 69d263cfe1
9 changed files with 211 additions and 46 deletions
--- a/Parser/tokenizer/string_tokenizer.c
+++ b/Parser/tokenizer/string_tokenizer.c
@@ -86,15 +86,18 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
+        tok->lineno = 1;
        if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            return NULL;
        }
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
+            tok->lineno = 2;
            if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return NULL;
        }
    }
+    tok->lineno = 0;
    if (tok->enc != NULL) {
        assert(utf8 == NULL);
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
@@ -102,6 +105,9 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
+    else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) {
+        return _PyTokenizer_error_ret(tok);
+    }
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;