mirror of
				https://github.com/python/cpython.git
				synced 2025-10-25 10:44:55 +00:00 
			
		
		
		
	 cbf0afd8a1
			
		
	
	
		cbf0afd8a1
		
			
		
	
	
	
	
		
			
			Right now, the tokenizer only returns the token type and two pointers to the start and end of the token. This PR modifies the tokenizer to return the type and set all of the necessary information, so that the parser does not have to do this.
		
			
				
	
	
		
			194 lines
		
	
	
	
		
			5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			194 lines
		
	
	
	
		
			5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| #include "Python.h"
 | |
| #include "../Parser/tokenizer.h"
 | |
| 
 | |
| static struct PyModuleDef _tokenizemodule;
 | |
| 
 | |
| typedef struct {
 | |
|     PyTypeObject *TokenizerIter;
 | |
| } tokenize_state;
 | |
| 
 | |
| static tokenize_state *
 | |
| get_tokenize_state(PyObject *module) {
 | |
|     return (tokenize_state *)PyModule_GetState(module);
 | |
| }
 | |
| 
 | |
/* Fetch the module state starting from a type object: the owning module is
   looked up through the module definition, then its state is returned. */
#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(PyType_GetModuleByDef((type), &_tokenizemodule))
 | |
| 
 | |
| #include "pycore_runtime.h"
 | |
| #include "clinic/Python-tokenize.c.h"
 | |
| 
 | |
| /*[clinic input]
 | |
| module _tokenizer
 | |
| class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
 | |
| [clinic start generated code]*/
 | |
| /*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
 | |
| 
 | |
| typedef struct
 | |
| {
 | |
|     PyObject_HEAD struct tok_state *tok;
 | |
| } tokenizeriterobject;
 | |
| 
 | |
| /*[clinic input]
 | |
| @classmethod
 | |
| _tokenizer.tokenizeriter.__new__ as tokenizeriter_new
 | |
| 
 | |
|     source: str
 | |
| [clinic start generated code]*/
 | |
| 
 | |
| static PyObject *
 | |
| tokenizeriter_new_impl(PyTypeObject *type, const char *source)
 | |
| /*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
 | |
| {
 | |
|     tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
 | |
|     if (self == NULL) {
 | |
|         return NULL;
 | |
|     }
 | |
|     PyObject *filename = PyUnicode_FromString("<string>");
 | |
|     if (filename == NULL) {
 | |
|         return NULL;
 | |
|     }
 | |
|     self->tok = _PyTokenizer_FromUTF8(source, 1);
 | |
|     if (self->tok == NULL) {
 | |
|         Py_DECREF(filename);
 | |
|         return NULL;
 | |
|     }
 | |
|     self->tok->filename = filename;
 | |
|     return (PyObject *)self;
 | |
| }
 | |
| 
 | |
| static PyObject *
 | |
| tokenizeriter_next(tokenizeriterobject *it)
 | |
| {
 | |
|     struct token token;
 | |
|     int type = _PyTokenizer_Get(it->tok, &token);
 | |
|     if (type == ERRORTOKEN && PyErr_Occurred()) {
 | |
|         return NULL;
 | |
|     }
 | |
|     if (type == ERRORTOKEN || type == ENDMARKER) {
 | |
|         PyErr_SetString(PyExc_StopIteration, "EOF");
 | |
|         return NULL;
 | |
|     }
 | |
|     PyObject *str = NULL;
 | |
|     if (token.start == NULL || token.end == NULL) {
 | |
|         str = PyUnicode_FromString("");
 | |
|     }
 | |
|     else {
 | |
|         str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
 | |
|     }
 | |
|     if (str == NULL) {
 | |
|         return NULL;
 | |
|     }
 | |
| 
 | |
|     Py_ssize_t size = it->tok->inp - it->tok->buf;
 | |
|     PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
 | |
|     if (line == NULL) {
 | |
|         Py_DECREF(str);
 | |
|         return NULL;
 | |
|     }
 | |
|     const char *line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
 | |
|     int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
 | |
|     int end_lineno = it->tok->lineno;
 | |
|     int col_offset = -1;
 | |
|     int end_col_offset = -1;
 | |
|     if (token.start != NULL && token.start >= line_start) {
 | |
|         col_offset = (int)(token.start - line_start);
 | |
|     }
 | |
|     if (token.end != NULL && token.end >= it->tok->line_start) {
 | |
|         end_col_offset = (int)(token.end - it->tok->line_start);
 | |
|     }
 | |
| 
 | |
|     return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
 | |
| }
 | |
| 
 | |
| static void
 | |
| tokenizeriter_dealloc(tokenizeriterobject *it)
 | |
| {
 | |
|     PyTypeObject *tp = Py_TYPE(it);
 | |
|     _PyTokenizer_Free(it->tok);
 | |
|     tp->tp_free(it);
 | |
|     Py_DECREF(tp);
 | |
| }
 | |
| 
 | |
| static PyType_Slot tokenizeriter_slots[] = {
 | |
|     {Py_tp_new, tokenizeriter_new},
 | |
|     {Py_tp_dealloc, tokenizeriter_dealloc},
 | |
|     {Py_tp_getattro, PyObject_GenericGetAttr},
 | |
|     {Py_tp_iter, PyObject_SelfIter},
 | |
|     {Py_tp_iternext, tokenizeriter_next},
 | |
|     {0, NULL},
 | |
| };
 | |
| 
 | |
| static PyType_Spec tokenizeriter_spec = {
 | |
|     .name = "_tokenize.TokenizerIter",
 | |
|     .basicsize = sizeof(tokenizeriterobject),
 | |
|     .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
 | |
|     .slots = tokenizeriter_slots,
 | |
| };
 | |
| 
 | |
| static int
 | |
| tokenizemodule_exec(PyObject *m)
 | |
| {
 | |
|     tokenize_state *state = get_tokenize_state(m);
 | |
|     if (state == NULL) {
 | |
|         return -1;
 | |
|     }
 | |
| 
 | |
|     state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
 | |
|     if (state->TokenizerIter == NULL) {
 | |
|         return -1;
 | |
|     }
 | |
|     if (PyModule_AddType(m, state->TokenizerIter) < 0) {
 | |
|         return -1;
 | |
|     }
 | |
| 
 | |
|     return 0;
 | |
| }
 | |
| 
 | |
| static PyMethodDef tokenize_methods[] = {
 | |
|     {NULL, NULL, 0, NULL} /* Sentinel */
 | |
| };
 | |
| 
 | |
| static PyModuleDef_Slot tokenizemodule_slots[] = {
 | |
|     {Py_mod_exec, tokenizemodule_exec},
 | |
|     {0, NULL}
 | |
| };
 | |
| 
 | |
| static int
 | |
| tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
 | |
| {
 | |
|     tokenize_state *state = get_tokenize_state(m);
 | |
|     Py_VISIT(state->TokenizerIter);
 | |
|     return 0;
 | |
| }
 | |
| 
 | |
| static int
 | |
| tokenizemodule_clear(PyObject *m)
 | |
| {
 | |
|     tokenize_state *state = get_tokenize_state(m);
 | |
|     Py_CLEAR(state->TokenizerIter);
 | |
|     return 0;
 | |
| }
 | |
| 
 | |
| static void
 | |
| tokenizemodule_free(void *m)
 | |
| {
 | |
|     tokenizemodule_clear((PyObject *)m);
 | |
| }
 | |
| 
 | |
| static struct PyModuleDef _tokenizemodule = {
 | |
|     PyModuleDef_HEAD_INIT,
 | |
|     .m_name = "_tokenize",
 | |
|     .m_size = sizeof(tokenize_state),
 | |
|     .m_slots = tokenizemodule_slots,
 | |
|     .m_methods = tokenize_methods,
 | |
|     .m_traverse = tokenizemodule_traverse,
 | |
|     .m_clear = tokenizemodule_clear,
 | |
|     .m_free = tokenizemodule_free,
 | |
| };
 | |
| 
 | |
| PyMODINIT_FUNC
 | |
| PyInit__tokenize(void)
 | |
| {
 | |
|     return PyModuleDef_Init(&_tokenizemodule);
 | |
| }
 |