mirror of
				https://github.com/python/cpython.git
				synced 2025-10-26 19:24:34 +00:00 
			
		
		
		
	 01481f2dc1
			
		
	
	
		01481f2dc1
		
			
		
	
	
	
	
		
			
			* The lexer, which includes the actual lexeme-producing logic, goes into the `lexer` directory. * The wrappers, one wrapper per input mode (file, string, utf-8, and readline), go into the `tokenizer` directory and include logic for creating a lexer instance and managing the buffer for different modes. --------- Co-authored-by: Pablo Galindo <pablogsal@gmail.com> Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
		
			
				
	
	
		
			366 lines
		
	
	
	
		
			9.8 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			366 lines
		
	
	
	
		
			9.8 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| #include "Python.h"
 | |
| #include "errcode.h"
 | |
| #include "../Parser/lexer/state.h"
 | |
| #include "../Parser/lexer/lexer.h"
 | |
| #include "../Parser/tokenizer/tokenizer.h"
 | |
| #include "../Parser/pegen.h"      // _PyPegen_byte_offset_to_character_offset()
 | |
| #include "../Parser/pegen.h"      // _PyPegen_byte_offset_to_character_offset()
 | |
| 
 | |
| static struct PyModuleDef _tokenizemodule;
 | |
| 
 | |
| typedef struct {
 | |
|     PyTypeObject *TokenizerIter;
 | |
| } tokenize_state;
 | |
| 
 | |
| static tokenize_state *
 | |
| get_tokenize_state(PyObject *module) {
 | |
|     return (tokenize_state *)PyModule_GetState(module);
 | |
| }
 | |
| 
 | |
/* Reach the module state from a type object: find the module through
   PyType_GetModuleByDef() and pass it to get_tokenize_state(). */
#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(PyType_GetModuleByDef((type), &_tokenizemodule))
 | |
| 
 | |
| #include "pycore_runtime.h"
 | |
| #include "clinic/Python-tokenize.c.h"
 | |
| 
 | |
| /*[clinic input]
 | |
| module _tokenizer
 | |
| class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
 | |
| [clinic start generated code]*/
 | |
| /*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
 | |
| 
 | |
| typedef struct
 | |
| {
 | |
|     PyObject_HEAD struct tok_state *tok;
 | |
|     int done;
 | |
| } tokenizeriterobject;
 | |
| 
 | |
| /*[clinic input]
 | |
| @classmethod
 | |
| _tokenizer.tokenizeriter.__new__ as tokenizeriter_new
 | |
| 
 | |
|     readline: object
 | |
|     /
 | |
|     *
 | |
|     extra_tokens: bool
 | |
|     encoding: str(c_default="NULL") = 'utf-8'
 | |
| [clinic start generated code]*/
 | |
| 
 | |
| static PyObject *
 | |
| tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
 | |
|                        int extra_tokens, const char *encoding)
 | |
| /*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
 | |
| {
 | |
|     tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
 | |
|     if (self == NULL) {
 | |
|         return NULL;
 | |
|     }
 | |
|     PyObject *filename = PyUnicode_FromString("<string>");
 | |
|     if (filename == NULL) {
 | |
|         return NULL;
 | |
|     }
 | |
|     self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
 | |
|     if (self->tok == NULL) {
 | |
|         Py_DECREF(filename);
 | |
|         return NULL;
 | |
|     }
 | |
|     self->tok->filename = filename;
 | |
|     if (extra_tokens) {
 | |
|         self->tok->tok_extra_tokens = 1;
 | |
|     }
 | |
|     self->done = 0;
 | |
|     return (PyObject *)self;
 | |
| }
 | |
| 
 | |
| static int
 | |
| _tokenizer_error(struct tok_state *tok)
 | |
| {
 | |
|     if (PyErr_Occurred()) {
 | |
|         return -1;
 | |
|     }
 | |
| 
 | |
|     const char *msg = NULL;
 | |
|     PyObject* errtype = PyExc_SyntaxError;
 | |
|     switch (tok->done) {
 | |
|         case E_TOKEN:
 | |
|             msg = "invalid token";
 | |
|             break;
 | |
|         case E_EOF:
 | |
|             PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
 | |
|             PyErr_SyntaxLocationObject(tok->filename, tok->lineno,
 | |
|                                        tok->inp - tok->buf < 0 ? 0 : (int)(tok->inp - tok->buf));
 | |
|             return -1;
 | |
|         case E_DEDENT:
 | |
|             msg = "unindent does not match any outer indentation level";
 | |
|             errtype = PyExc_IndentationError;
 | |
|             break;
 | |
|         case E_INTR:
 | |
|             if (!PyErr_Occurred()) {
 | |
|                 PyErr_SetNone(PyExc_KeyboardInterrupt);
 | |
|             }
 | |
|             return -1;
 | |
|         case E_NOMEM:
 | |
|             PyErr_NoMemory();
 | |
|             return -1;
 | |
|         case E_TABSPACE:
 | |
|             errtype = PyExc_TabError;
 | |
|             msg = "inconsistent use of tabs and spaces in indentation";
 | |
|             break;
 | |
|         case E_TOODEEP:
 | |
|             errtype = PyExc_IndentationError;
 | |
|             msg = "too many levels of indentation";
 | |
|             break;
 | |
|         case E_LINECONT: {
 | |
|             msg = "unexpected character after line continuation character";
 | |
|             break;
 | |
|         }
 | |
|         default:
 | |
|             msg = "unknown tokenization error";
 | |
|     }
 | |
| 
 | |
|     PyObject* errstr = NULL;
 | |
|     PyObject* error_line = NULL;
 | |
|     PyObject* tmp = NULL;
 | |
|     PyObject* value = NULL;
 | |
|     int result = 0;
 | |
| 
 | |
|     Py_ssize_t size = tok->inp - tok->buf;
 | |
|     assert(tok->buf[size-1] == '\n');
 | |
|     size -= 1; // Remove the newline character from the end of the line
 | |
|     error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
 | |
|     if (!error_line) {
 | |
|         result = -1;
 | |
|         goto exit;
 | |
|     }
 | |
| 
 | |
|     Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset(error_line, tok->inp - tok->buf);
 | |
|     if (offset == -1) {
 | |
|         result = -1;
 | |
|         goto exit;
 | |
|     }
 | |
|     tmp = Py_BuildValue("(OnnOOO)", tok->filename, tok->lineno, offset, error_line, Py_None, Py_None);
 | |
|     if (!tmp) {
 | |
|         result = -1;
 | |
|         goto exit;
 | |
|     }
 | |
| 
 | |
|     errstr = PyUnicode_FromString(msg);
 | |
|     if (!errstr) {
 | |
|         result = -1;
 | |
|         goto exit;
 | |
|     }
 | |
| 
 | |
|     value = PyTuple_Pack(2, errstr, tmp);
 | |
|     if (!value) {
 | |
|         result = -1;
 | |
|         goto exit;
 | |
|     }
 | |
| 
 | |
|     PyErr_SetObject(errtype, value);
 | |
| 
 | |
| exit:
 | |
|     Py_XDECREF(errstr);
 | |
|     Py_XDECREF(error_line);
 | |
|     Py_XDECREF(tmp);
 | |
|     Py_XDECREF(value);
 | |
|     return result;
 | |
| }
 | |
| 
 | |
| static PyObject *
 | |
| tokenizeriter_next(tokenizeriterobject *it)
 | |
| {
 | |
|     PyObject* result = NULL;
 | |
|     struct token token;
 | |
|     _PyToken_Init(&token);
 | |
| 
 | |
|     int type = _PyTokenizer_Get(it->tok, &token);
 | |
|     if (type == ERRORTOKEN) {
 | |
|         if(!PyErr_Occurred()) {
 | |
|             _tokenizer_error(it->tok);
 | |
|             assert(PyErr_Occurred());
 | |
|         }
 | |
|         goto exit;
 | |
|     }
 | |
|     if (it->done || type == ERRORTOKEN) {
 | |
|         PyErr_SetString(PyExc_StopIteration, "EOF");
 | |
|         it->done = 1;
 | |
|         goto exit;
 | |
|     }
 | |
|     PyObject *str = NULL;
 | |
|     if (token.start == NULL || token.end == NULL) {
 | |
|         str = PyUnicode_FromString("");
 | |
|     }
 | |
|     else {
 | |
|         str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
 | |
|     }
 | |
|     if (str == NULL) {
 | |
|         goto exit;
 | |
|     }
 | |
| 
 | |
|     int is_trailing_token = 0;
 | |
|     if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
 | |
|         is_trailing_token = 1;
 | |
|     }
 | |
| 
 | |
|     const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
 | |
|     PyObject* line = NULL;
 | |
|     if (it->tok->tok_extra_tokens && is_trailing_token) {
 | |
|         line = PyUnicode_FromString("");
 | |
|     } else {
 | |
|         Py_ssize_t size = it->tok->inp - line_start;
 | |
|         if (size >= 1 && it->tok->implicit_newline) {
 | |
|             size -= 1;
 | |
|         }
 | |
|         line = PyUnicode_DecodeUTF8(line_start, size, "replace");
 | |
|     }
 | |
|     if (line == NULL) {
 | |
|         Py_DECREF(str);
 | |
|         goto exit;
 | |
|     }
 | |
| 
 | |
|     Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
 | |
|     Py_ssize_t end_lineno = it->tok->lineno;
 | |
|     Py_ssize_t col_offset = -1;
 | |
|     Py_ssize_t end_col_offset = -1;
 | |
|     if (token.start != NULL && token.start >= line_start) {
 | |
|         col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
 | |
|     }
 | |
|     if (token.end != NULL && token.end >= it->tok->line_start) {
 | |
|         end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
 | |
|     }
 | |
| 
 | |
|     if (it->tok->tok_extra_tokens) {
 | |
|         if (is_trailing_token) {
 | |
|             lineno = end_lineno = lineno + 1;
 | |
|             col_offset = end_col_offset = 0;
 | |
|         }
 | |
|         // Necessary adjustments to match the original Python tokenize
 | |
|         // implementation
 | |
|         if (type > DEDENT && type < OP) {
 | |
|             type = OP;
 | |
|         }
 | |
|         else if (type == NEWLINE) {
 | |
|             Py_DECREF(str);
 | |
|             if (!it->tok->implicit_newline) {
 | |
|                 if (it->tok->start[0] == '\r') {
 | |
|                     str = PyUnicode_FromString("\r\n");
 | |
|                 } else {
 | |
|                     str = PyUnicode_FromString("\n");
 | |
|                 }
 | |
|             }
 | |
|             end_col_offset++;
 | |
|         }
 | |
|         else if (type == NL) {
 | |
|             if (it->tok->implicit_newline) {
 | |
|                 Py_DECREF(str);
 | |
|                 str = PyUnicode_FromString("");
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         if (str == NULL) {
 | |
|             Py_DECREF(line);
 | |
|             goto exit;
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
 | |
| exit:
 | |
|     _PyToken_Free(&token);
 | |
|     if (type == ENDMARKER) {
 | |
|         it->done = 1;
 | |
|     }
 | |
|     return result;
 | |
| }
 | |
| 
 | |
| static void
 | |
| tokenizeriter_dealloc(tokenizeriterobject *it)
 | |
| {
 | |
|     PyTypeObject *tp = Py_TYPE(it);
 | |
|     _PyTokenizer_Free(it->tok);
 | |
|     tp->tp_free(it);
 | |
|     Py_DECREF(tp);
 | |
| }
 | |
| 
 | |
| static PyType_Slot tokenizeriter_slots[] = {
 | |
|     {Py_tp_new, tokenizeriter_new},
 | |
|     {Py_tp_dealloc, tokenizeriter_dealloc},
 | |
|     {Py_tp_getattro, PyObject_GenericGetAttr},
 | |
|     {Py_tp_iter, PyObject_SelfIter},
 | |
|     {Py_tp_iternext, tokenizeriter_next},
 | |
|     {0, NULL},
 | |
| };
 | |
| 
 | |
| static PyType_Spec tokenizeriter_spec = {
 | |
|     .name = "_tokenize.TokenizerIter",
 | |
|     .basicsize = sizeof(tokenizeriterobject),
 | |
|     .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
 | |
|     .slots = tokenizeriter_slots,
 | |
| };
 | |
| 
 | |
| static int
 | |
| tokenizemodule_exec(PyObject *m)
 | |
| {
 | |
|     tokenize_state *state = get_tokenize_state(m);
 | |
|     if (state == NULL) {
 | |
|         return -1;
 | |
|     }
 | |
| 
 | |
|     state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
 | |
|     if (state->TokenizerIter == NULL) {
 | |
|         return -1;
 | |
|     }
 | |
|     if (PyModule_AddType(m, state->TokenizerIter) < 0) {
 | |
|         return -1;
 | |
|     }
 | |
| 
 | |
|     return 0;
 | |
| }
 | |
| 
 | |
| static PyMethodDef tokenize_methods[] = {
 | |
|     {NULL, NULL, 0, NULL} /* Sentinel */
 | |
| };
 | |
| 
 | |
| static PyModuleDef_Slot tokenizemodule_slots[] = {
 | |
|     {Py_mod_exec, tokenizemodule_exec},
 | |
|     {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
 | |
|     {0, NULL}
 | |
| };
 | |
| 
 | |
| static int
 | |
| tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
 | |
| {
 | |
|     tokenize_state *state = get_tokenize_state(m);
 | |
|     Py_VISIT(state->TokenizerIter);
 | |
|     return 0;
 | |
| }
 | |
| 
 | |
| static int
 | |
| tokenizemodule_clear(PyObject *m)
 | |
| {
 | |
|     tokenize_state *state = get_tokenize_state(m);
 | |
|     Py_CLEAR(state->TokenizerIter);
 | |
|     return 0;
 | |
| }
 | |
| 
 | |
| static void
 | |
| tokenizemodule_free(void *m)
 | |
| {
 | |
|     tokenizemodule_clear((PyObject *)m);
 | |
| }
 | |
| 
 | |
| static struct PyModuleDef _tokenizemodule = {
 | |
|     PyModuleDef_HEAD_INIT,
 | |
|     .m_name = "_tokenize",
 | |
|     .m_size = sizeof(tokenize_state),
 | |
|     .m_slots = tokenizemodule_slots,
 | |
|     .m_methods = tokenize_methods,
 | |
|     .m_traverse = tokenizemodule_traverse,
 | |
|     .m_clear = tokenizemodule_clear,
 | |
|     .m_free = tokenizemodule_free,
 | |
| };
 | |
| 
 | |
| PyMODINIT_FUNC
 | |
| PyInit__tokenize(void)
 | |
| {
 | |
|     return PyModuleDef_Init(&_tokenizemodule);
 | |
| }
 |