mirror of
				https://github.com/python/cpython.git
				synced 2025-11-04 07:31:38 +00:00 
			
		
		
		
	(cherry picked from commit 8549559f38)
Co-authored-by: Lysandros Nikolaou <lisandrosnik@gmail.com>
Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
		
	
			
		
			
				
	
	
		
			443 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			443 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
#include "Python.h"
 | 
						|
#include "errcode.h"
 | 
						|
#include "internal/pycore_critical_section.h"   // Py_BEGIN_CRITICAL_SECTION
 | 
						|
#include "../Parser/lexer/state.h"
 | 
						|
#include "../Parser/lexer/lexer.h"
 | 
						|
#include "../Parser/tokenizer/tokenizer.h"
 | 
						|
#include "../Parser/pegen.h"                    // _PyPegen_byte_offset_to_character_offset()
 | 
						|
 | 
						|
static struct PyModuleDef _tokenizemodule;
 | 
						|
 | 
						|
typedef struct {
 | 
						|
    PyTypeObject *TokenizerIter;
 | 
						|
} tokenize_state;
 | 
						|
 | 
						|
static tokenize_state *
 | 
						|
get_tokenize_state(PyObject *module) {
 | 
						|
    return (tokenize_state *)PyModule_GetState(module);
 | 
						|
}
 | 
						|
 | 
						|
/* Fetch the module state starting from a type: the owning module is looked
   up through the _tokenizemodule definition, then its state is returned. */
#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(PyType_GetModuleByDef((type), &_tokenizemodule))
 | 
						|
 | 
						|
#include "pycore_runtime.h"
 | 
						|
#include "clinic/Python-tokenize.c.h"
 | 
						|
 | 
						|
/*[clinic input]
 | 
						|
module _tokenizer
 | 
						|
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
 | 
						|
[clinic start generated code]*/
 | 
						|
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
 | 
						|
 | 
						|
typedef struct
 | 
						|
{
 | 
						|
    PyObject_HEAD struct tok_state *tok;
 | 
						|
    int done;
 | 
						|
 | 
						|
    /* Needed to cache line for performance */
 | 
						|
    PyObject *last_line;
 | 
						|
    Py_ssize_t last_lineno;
 | 
						|
    Py_ssize_t last_end_lineno;
 | 
						|
    Py_ssize_t byte_col_offset_diff;
 | 
						|
} tokenizeriterobject;
 | 
						|
 | 
						|
/*[clinic input]
 | 
						|
@classmethod
 | 
						|
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
 | 
						|
 | 
						|
    readline: object
 | 
						|
    /
 | 
						|
    *
 | 
						|
    extra_tokens: bool
 | 
						|
    encoding: str(c_default="NULL") = 'utf-8'
 | 
						|
[clinic start generated code]*/
 | 
						|
 | 
						|
static PyObject *
 | 
						|
tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
 | 
						|
                       int extra_tokens, const char *encoding)
 | 
						|
/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
 | 
						|
{
 | 
						|
    tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
 | 
						|
    if (self == NULL) {
 | 
						|
        return NULL;
 | 
						|
    }
 | 
						|
    PyObject *filename = PyUnicode_FromString("<string>");
 | 
						|
    if (filename == NULL) {
 | 
						|
        return NULL;
 | 
						|
    }
 | 
						|
    self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
 | 
						|
    if (self->tok == NULL) {
 | 
						|
        Py_DECREF(filename);
 | 
						|
        return NULL;
 | 
						|
    }
 | 
						|
    self->tok->filename = filename;
 | 
						|
    if (extra_tokens) {
 | 
						|
        self->tok->tok_extra_tokens = 1;
 | 
						|
    }
 | 
						|
    self->done = 0;
 | 
						|
 | 
						|
    self->last_line = NULL;
 | 
						|
    self->byte_col_offset_diff = 0;
 | 
						|
    self->last_lineno = 0;
 | 
						|
    self->last_end_lineno = 0;
 | 
						|
 | 
						|
    return (PyObject *)self;
 | 
						|
}
 | 
						|
 | 
						|
static int
 | 
						|
_tokenizer_error(tokenizeriterobject *it)
 | 
						|
{
 | 
						|
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
 | 
						|
    if (PyErr_Occurred()) {
 | 
						|
        return -1;
 | 
						|
    }
 | 
						|
 | 
						|
    const char *msg = NULL;
 | 
						|
    PyObject* errtype = PyExc_SyntaxError;
 | 
						|
    struct tok_state *tok = it->tok;
 | 
						|
    switch (tok->done) {
 | 
						|
        case E_TOKEN:
 | 
						|
            msg = "invalid token";
 | 
						|
            break;
 | 
						|
        case E_EOF:
 | 
						|
            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
 | 
						|
            PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
 | 
						|
                                       tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf));
 | 
						|
            return -1;
 | 
						|
        case E_DEDENT:
 | 
						|
            msg = "unindent does not match any outer indentation level";
 | 
						|
            errtype = PyExc_IndentationError;
 | 
						|
            break;
 | 
						|
        case E_INTR:
 | 
						|
            if (!PyErr_Occurred()) {
 | 
						|
                PyErr_SetNone(PyExc_KeyboardInterrupt);
 | 
						|
            }
 | 
						|
            return -1;
 | 
						|
        case E_NOMEM:
 | 
						|
            PyErr_NoMemory();
 | 
						|
            return -1;
 | 
						|
        case E_TABSPACE:
 | 
						|
            errtype = PyExc_TabError;
 | 
						|
            msg = "inconsistent use of tabs and spaces in indentation";
 | 
						|
            break;
 | 
						|
        case E_TOODEEP:
 | 
						|
            errtype = PyExc_IndentationError;
 | 
						|
            msg = "too many levels of indentation";
 | 
						|
            break;
 | 
						|
        case E_LINECONT: {
 | 
						|
            msg = "unexpected character after line continuation character";
 | 
						|
            break;
 | 
						|
        }
 | 
						|
        default:
 | 
						|
            msg = "unknown tokenization error";
 | 
						|
    }
 | 
						|
 | 
						|
    PyObject* errstr = NULL;
 | 
						|
    PyObject* error_line = NULL;
 | 
						|
    PyObject* tmp = NULL;
 | 
						|
    PyObject* value = NULL;
 | 
						|
    int result = 0;
 | 
						|
 | 
						|
    Py_ssize_t size = tok->inp - tok->buf;
 | 
						|
    assert(tok->buf[size-1] == '\n');
 | 
						|
    size -= 1; // Remove the newline character from the end of the line
 | 
						|
    error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
 | 
						|
    if (!error_line) {
 | 
						|
        result = -1;
 | 
						|
        goto exit;
 | 
						|
    }
 | 
						|
 | 
						|
    Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
 | 
						|
    if (offset == -1) {
 | 
						|
        result = -1;
 | 
						|
        goto exit;
 | 
						|
    }
 | 
						|
    tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
 | 
						|
    if (!tmp) {
 | 
						|
        result = -1;
 | 
						|
        goto exit;
 | 
						|
    }
 | 
						|
 | 
						|
    errstr = PyUnicode_FromString(msg);
 | 
						|
    if (!errstr) {
 | 
						|
        result = -1;
 | 
						|
        goto exit;
 | 
						|
    }
 | 
						|
 | 
						|
    value = PyTuple_Pack(2, errstr, tmp);
 | 
						|
    if (!value) {
 | 
						|
        result = -1;
 | 
						|
        goto exit;
 | 
						|
    }
 | 
						|
 | 
						|
    PyErr_SetObject(errtype, value);
 | 
						|
 | 
						|
exit:
 | 
						|
    Py_XDECREF(errstr);
 | 
						|
    Py_XDECREF(error_line);
 | 
						|
    Py_XDECREF(tmp);
 | 
						|
    Py_XDECREF(value);
 | 
						|
    return result;
 | 
						|
}
 | 
						|
 | 
						|
static PyObject *
 | 
						|
_get_current_line(tokenizeriterobject *it, const char *line_start, Py_ssize_t size,
 | 
						|
                  int *line_changed)
 | 
						|
{
 | 
						|
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
 | 
						|
    PyObject *line;
 | 
						|
    if (it->tok->lineno != it->last_lineno) {
 | 
						|
        // Line has changed since last token, so we fetch the new line and cache it
 | 
						|
        // in the iter object.
 | 
						|
        Py_XDECREF(it->last_line);
 | 
						|
        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
 | 
						|
        it->last_line = line;
 | 
						|
        it->byte_col_offset_diff = 0;
 | 
						|
    }
 | 
						|
    else {
 | 
						|
        line = it->last_line;
 | 
						|
        *line_changed = 0;
 | 
						|
    }
 | 
						|
    return line;
 | 
						|
}
 | 
						|
 | 
						|
static void
 | 
						|
_get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_start,
 | 
						|
                 PyObject *line, int line_changed, Py_ssize_t lineno, Py_ssize_t end_lineno,
 | 
						|
                 Py_ssize_t *col_offset, Py_ssize_t *end_col_offset)
 | 
						|
{
 | 
						|
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
 | 
						|
    Py_ssize_t byte_offset = -1;
 | 
						|
    if (token.start != NULL && token.start >= line_start) {
 | 
						|
        byte_offset = token.start - line_start;
 | 
						|
        if (line_changed) {
 | 
						|
            *col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
 | 
						|
            it->byte_col_offset_diff = byte_offset - *col_offset;
 | 
						|
        }
 | 
						|
        else {
 | 
						|
            *col_offset = byte_offset - it->byte_col_offset_diff;
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    if (token.end != NULL && token.end >= it->tok->line_start) {
 | 
						|
        Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
 | 
						|
        if (lineno == end_lineno) {
 | 
						|
            // If the whole token is at the same line, we can just use the token.start
 | 
						|
            // buffer for figuring out the new column offset, since using line is not
 | 
						|
            // performant for very long lines.
 | 
						|
            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
 | 
						|
            *end_col_offset = *col_offset + token_col_offset;
 | 
						|
            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
 | 
						|
        }
 | 
						|
        else {
 | 
						|
            *end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
 | 
						|
            it->byte_col_offset_diff += end_byte_offset - *end_col_offset;
 | 
						|
        }
 | 
						|
    }
 | 
						|
    it->last_lineno = lineno;
 | 
						|
    it->last_end_lineno = end_lineno;
 | 
						|
}
 | 
						|
 | 
						|
static PyObject *
 | 
						|
tokenizeriter_next(tokenizeriterobject *it)
 | 
						|
{
 | 
						|
    PyObject* result = NULL;
 | 
						|
 | 
						|
    Py_BEGIN_CRITICAL_SECTION(it);
 | 
						|
 | 
						|
    struct token token;
 | 
						|
    _PyToken_Init(&token);
 | 
						|
 | 
						|
    int type = _PyTokenizer_Get(it->tok, &token);
 | 
						|
    if (type == ERRORTOKEN) {
 | 
						|
        if(!PyErr_Occurred()) {
 | 
						|
            _tokenizer_error(it);
 | 
						|
            assert(PyErr_Occurred());
 | 
						|
        }
 | 
						|
        goto exit;
 | 
						|
    }
 | 
						|
    if (it->done || type == ERRORTOKEN) {
 | 
						|
        PyErr_SetString(PyExc_StopIteration, "EOF");
 | 
						|
        it->done = 1;
 | 
						|
        goto exit;
 | 
						|
    }
 | 
						|
    PyObject *str = NULL;
 | 
						|
    if (token.start == NULL || token.end == NULL) {
 | 
						|
        str = PyUnicode_FromString("");
 | 
						|
    }
 | 
						|
    else {
 | 
						|
        str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
 | 
						|
    }
 | 
						|
    if (str == NULL) {
 | 
						|
        goto exit;
 | 
						|
    }
 | 
						|
 | 
						|
    int is_trailing_token = 0;
 | 
						|
    if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
 | 
						|
        is_trailing_token = 1;
 | 
						|
    }
 | 
						|
 | 
						|
    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
 | 
						|
    PyObject* line = NULL;
 | 
						|
    int line_changed = 1;
 | 
						|
    if (it->tok->tok_extra_tokens && is_trailing_token) {
 | 
						|
        line = PyUnicode_FromString("");
 | 
						|
    } else {
 | 
						|
        Py_ssize_t size = it->tok->inp - line_start;
 | 
						|
        if (size >= 1 && it->tok->implicit_newline) {
 | 
						|
            size -= 1;
 | 
						|
        }
 | 
						|
 | 
						|
        line = _get_current_line(it, line_start, size, &line_changed);
 | 
						|
    }
 | 
						|
    if (line == NULL) {
 | 
						|
        Py_DECREF(str);
 | 
						|
        goto exit;
 | 
						|
    }
 | 
						|
 | 
						|
    Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
 | 
						|
    Py_ssize_t end_lineno = it->tok->lineno;
 | 
						|
    Py_ssize_t col_offset = -1;
 | 
						|
    Py_ssize_t end_col_offset = -1;
 | 
						|
    _get_col_offsets(it, token, line_start, line, line_changed,
 | 
						|
                     lineno, end_lineno, &col_offset, &end_col_offset);
 | 
						|
 | 
						|
    if (it->tok->tok_extra_tokens) {
 | 
						|
        if (is_trailing_token) {
 | 
						|
            lineno = end_lineno = lineno + 1;
 | 
						|
            col_offset = end_col_offset = 0;
 | 
						|
        }
 | 
						|
        // Necessary adjustments to match the original Python tokenize
 | 
						|
        // implementation
 | 
						|
        if (type > DEDENT && type < OP) {
 | 
						|
            type = OP;
 | 
						|
        }
 | 
						|
        else if (type == NEWLINE) {
 | 
						|
            Py_DECREF(str);
 | 
						|
            if (!it->tok->implicit_newline) {
 | 
						|
                if (it->tok->start[0] == '\r') {
 | 
						|
                    str = PyUnicode_FromString("\r\n");
 | 
						|
                } else {
 | 
						|
                    str = PyUnicode_FromString("\n");
 | 
						|
                }
 | 
						|
            }
 | 
						|
            end_col_offset++;
 | 
						|
        }
 | 
						|
        else if (type == NL) {
 | 
						|
            if (it->tok->implicit_newline) {
 | 
						|
                Py_DECREF(str);
 | 
						|
                str = PyUnicode_FromString("");
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        if (str == NULL) {
 | 
						|
            Py_DECREF(line);
 | 
						|
            goto exit;
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    result = Py_BuildValue("(iN(nn)(nn)O)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
 | 
						|
exit:
 | 
						|
    _PyToken_Free(&token);
 | 
						|
    if (type == ENDMARKER) {
 | 
						|
        it->done = 1;
 | 
						|
    }
 | 
						|
 | 
						|
    Py_END_CRITICAL_SECTION();
 | 
						|
    return result;
 | 
						|
}
 | 
						|
 | 
						|
static void
 | 
						|
tokenizeriter_dealloc(tokenizeriterobject *it)
 | 
						|
{
 | 
						|
    PyTypeObject *tp = Py_TYPE(it);
 | 
						|
    Py_XDECREF(it->last_line);
 | 
						|
    _PyTokenizer_Free(it->tok);
 | 
						|
    tp->tp_free(it);
 | 
						|
    Py_DECREF(tp);
 | 
						|
}
 | 
						|
 | 
						|
static PyType_Slot tokenizeriter_slots[] = {
 | 
						|
    {Py_tp_new, tokenizeriter_new},
 | 
						|
    {Py_tp_dealloc, tokenizeriter_dealloc},
 | 
						|
    {Py_tp_getattro, PyObject_GenericGetAttr},
 | 
						|
    {Py_tp_iter, PyObject_SelfIter},
 | 
						|
    {Py_tp_iternext, tokenizeriter_next},
 | 
						|
    {0, NULL},
 | 
						|
};
 | 
						|
 | 
						|
static PyType_Spec tokenizeriter_spec = {
 | 
						|
    .name = "_tokenize.TokenizerIter",
 | 
						|
    .basicsize = sizeof(tokenizeriterobject),
 | 
						|
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
 | 
						|
    .slots = tokenizeriter_slots,
 | 
						|
};
 | 
						|
 | 
						|
static int
 | 
						|
tokenizemodule_exec(PyObject *m)
 | 
						|
{
 | 
						|
    tokenize_state *state = get_tokenize_state(m);
 | 
						|
    if (state == NULL) {
 | 
						|
        return -1;
 | 
						|
    }
 | 
						|
 | 
						|
    state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
 | 
						|
    if (state->TokenizerIter == NULL) {
 | 
						|
        return -1;
 | 
						|
    }
 | 
						|
    if (PyModule_AddType(m, state->TokenizerIter) < 0) {
 | 
						|
        return -1;
 | 
						|
    }
 | 
						|
 | 
						|
    return 0;
 | 
						|
}
 | 
						|
 | 
						|
static PyMethodDef tokenize_methods[] = {
 | 
						|
    {NULL, NULL, 0, NULL} /* Sentinel */
 | 
						|
};
 | 
						|
 | 
						|
static PyModuleDef_Slot tokenizemodule_slots[] = {
 | 
						|
    {Py_mod_exec, tokenizemodule_exec},
 | 
						|
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
 | 
						|
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
 | 
						|
    {0, NULL}
 | 
						|
};
 | 
						|
 | 
						|
static int
 | 
						|
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
 | 
						|
{
 | 
						|
    tokenize_state *state = get_tokenize_state(m);
 | 
						|
    Py_VISIT(state->TokenizerIter);
 | 
						|
    return 0;
 | 
						|
}
 | 
						|
 | 
						|
static int
 | 
						|
tokenizemodule_clear(PyObject *m)
 | 
						|
{
 | 
						|
    tokenize_state *state = get_tokenize_state(m);
 | 
						|
    Py_CLEAR(state->TokenizerIter);
 | 
						|
    return 0;
 | 
						|
}
 | 
						|
 | 
						|
static void
 | 
						|
tokenizemodule_free(void *m)
 | 
						|
{
 | 
						|
    tokenizemodule_clear((PyObject *)m);
 | 
						|
}
 | 
						|
 | 
						|
static struct PyModuleDef _tokenizemodule = {
 | 
						|
    PyModuleDef_HEAD_INIT,
 | 
						|
    .m_name = "_tokenize",
 | 
						|
    .m_size = sizeof(tokenize_state),
 | 
						|
    .m_slots = tokenizemodule_slots,
 | 
						|
    .m_methods = tokenize_methods,
 | 
						|
    .m_traverse = tokenizemodule_traverse,
 | 
						|
    .m_clear = tokenizemodule_clear,
 | 
						|
    .m_free = tokenizemodule_free,
 | 
						|
};
 | 
						|
 | 
						|
PyMODINIT_FUNC
 | 
						|
PyInit__tokenize(void)
 | 
						|
{
 | 
						|
    return PyModuleDef_Init(&_tokenizemodule);
 | 
						|
}
 |