Mirror of https://github.com/python/cpython.git, synced 2026-01-06 15:32:22 +00:00
gh-104169: Refactor tokenizer into lexer and wrappers (#110684)
* The lexer, which includes the actual lexeme-producing logic, goes into the `lexer` directory.
* The wrappers, one wrapper per input mode (file, string, utf-8, and readline), go into the `tokenizer` directory and include logic for creating a lexer instance and managing the buffer for the different modes.

---------

Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
This commit is contained in:
parent eb50cd37ea, commit 01481f2dc1
29 changed files with 3185 additions and 2988 deletions
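For orientation, here is a minimal toy sketch (all names hypothetical, not CPython's real types) of the split described above: the shared lexer core pulls bytes through a per-mode refill hook, so each wrapper only has to supply buffer management for its input mode, exactly as the string wrapper below does by installing tok_underflow_string.

struct lex_state {
    const char *cur, *inp;                  /* read cursor and end of valid data */
    int (*underflow)(struct lex_state *);   /* per-mode buffer refill hook */
};

static int lex_next_byte(struct lex_state *st) {
    if (st->cur == st->inp && !st->underflow(st)) {
        return -1;                          /* refill produced nothing: EOF */
    }
    return (unsigned char)*st->cur++;
}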
Parser/tokenizer/string_tokenizer.c (new file, 129 lines)

@@ -0,0 +1,129 @@
#include "Python.h"
#include "errcode.h"

#include "helpers.h"
#include "../lexer/state.h"

/* Underflow hook for string input: extend tok->inp past the next
   newline (or to the end of the buffer), making one more line visible
   to the lexer; return 0 at end of input. */
static int
tok_underflow_string(struct tok_state *tok) {
    char *end = strchr(tok->inp, '\n');
    if (end != NULL) {
        end++;
    }
    else {
        end = strchr(tok->inp, '\0');
        if (end == tok->inp) {
            tok->done = E_EOF;
            return 0;
        }
    }
    if (tok->start == NULL) {
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    ADVANCE_LINENO();
    tok->inp = end;
    return 1;
}

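tok_underflow_string refills the lexer one line at a time. A standalone sketch of the same scan, with hypothetical names (only the scanning logic mirrors the function above):

#include <stdio.h>
#include <string.h>

/* Return a pointer one past the end of the current line (including the
   '\n' if present), or NULL when the remainder of the buffer is empty. */
static const char *next_line_end(const char *cursor) {
    const char *end = strchr(cursor, '\n');
    if (end != NULL) {
        return end + 1;                    /* keep the newline, as the lexer expects */
    }
    end = strchr(cursor, '\0');
    return (end == cursor) ? NULL : end;   /* NULL plays the role of E_EOF */
}

int main(void) {
    const char *buf = "x = 1\ny = 2";      /* note: last line has no '\n' */
    const char *cur = buf;
    const char *end;
    while ((end = next_line_end(cur)) != NULL) {
        printf("line of %d bytes\n", (int)(end - cur));
        cur = end;
    }
    return 0;
}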
/* Fetch a byte from TOK, using the string buffer. */
static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}

/* Unfetch a byte from TOK, using the string buffer. */
static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
}

/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */
static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}

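The assert in buf_ungetc encodes the contract these callbacks share with the BOM and coding-spec checks: only the byte just read may be pushed back, so un-reading is a pointer decrement rather than a write (the buffer may live in read-only memory). A toy pair, with hypothetical names, honoring the same contract:

#include <assert.h>

struct sbuf { const char *p; };            /* hypothetical string cursor */

static int sbuf_getc(struct sbuf *b) {
    return (unsigned char)*b->p++;         /* fetch one byte, advance */
}

static void sbuf_ungetc(int c, struct sbuf *b) {
    b->p--;                                /* step back instead of storing c: */
    assert((unsigned char)*b->p == c);     /* the buffer may be read-only */
}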
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK. */
static char *
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return _PyTokenizer_error_ret(tok);
    str = tok->str; /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            return NULL;
        }
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
            if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                                tok, buf_setreadl))
                return NULL;
        }
    }
    if (tok->enc != NULL) {
        assert(utf8 == NULL);
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}

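decode_str checks lines 1 and 2 separately because _PyTokenizer_check_coding_spec takes a single line, and PEP 263 only permits an encoding declaration on one of the first two lines of the source. A standalone sketch of that two-newline scan and the spans it would hand to the checker (hypothetical driver; only the scan mirrors the code above):

#include <stdio.h>

int main(void) {
    const char *str = "# -*- coding: latin-1 -*-\nx = 1\ny = 2\n";
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    for (const char *s = str; *s != '\0'; s++) {
        if (*s == '\n') {
            newl[lineno++] = s;            /* record at most two newlines */
            if (lineno == 2) break;
        }
    }
    if (newl[0]) {
        printf("check line 1: %.*s\n", (int)(newl[0] - str), str);
    }
    if (newl[0] && newl[1]) {
        printf("check line 2: %.*s\n", (int)(newl[1] - newl[0] - 1), newl[0] + 1);
    }
    return 0;
}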
/* Set up tokenizer for string */
struct tok_state *
_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
{
    struct tok_state *tok = _PyTokenizer_tok_new();
    char *decoded;

    if (tok == NULL)
        return NULL;
    decoded = decode_str(str, exec_input, tok, preserve_crlf);
    if (decoded == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }

    tok->buf = tok->cur = tok->inp = decoded;
    tok->end = decoded;
    tok->underflow = &tok_underflow_string;
    return tok;
}
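A hedged usage sketch of the wrapper above. This is internal, underscore-prefixed API compiled inside CPython itself, not the public C API; only _PyTokenizer_FromString and _PyTokenizer_Free are taken from this diff, the enclosing function is hypothetical, and the tokenize loop is elided:

static int
tokenize_source_string(void)
{
    struct tok_state *tok = _PyTokenizer_FromString("x = 1\n",
                                                    1 /* exec_input */,
                                                    0 /* preserve_crlf */);
    if (tok == NULL) {
        return -1;                /* allocation or decoding failed */
    }
    /* ... drive the shared lexer with tok to produce tokens ... */
    _PyTokenizer_Free(tok);
    return 0;
}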