Mirror of https://github.com/python/cpython.git, synced 2026-01-06 15:32:22 +00:00
gh-104169: Refactor tokenizer into lexer and wrappers (#110684)
* The lexer, which includes the actual lexeme-producing logic, goes into the `lexer` directory.
* The wrappers, one wrapper per input mode (file, string, utf-8, and readline), go into the `tokenizer` directory and include logic for creating a lexer instance and managing the buffer for the different modes.

---------

Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
This commit is contained in:
parent eb50cd37ea, commit 01481f2dc1
29 changed files with 3185 additions and 2988 deletions
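For orientation, here is a minimal toy sketch (all names hypothetical, not CPython's real types) of the split described above: the shared lexer core pulls bytes through a per-mode refill hook, so each wrapper only has to supply buffer management for its input mode, exactly as the string wrapper below does by installing tok_underflow_string.

struct lex_state {
    const char *cur, *inp;                  /* read cursor and end of valid data */
    int (*underflow)(struct lex_state *);   /* per-mode buffer refill hook */
};

static int lex_next_byte(struct lex_state *st) {
    if (st->cur == st->inp && !st->underflow(st)) {
        return -1;                          /* refill produced nothing: EOF */
    }
    return (unsigned char)*st->cur++;
}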
Parser/tokenizer/string_tokenizer.c (new file, 129 lines)

@@ -0,0 +1,129 @@
#include "Python.h"
#include "errcode.h"

#include "helpers.h"
#include "../lexer/state.h"

/* Underflow hook for string input: extend tok->inp past the next
   newline (or to the end of the buffer), making one more line visible
   to the lexer; return 0 at end of input. */
static int
tok_underflow_string(struct tok_state *tok) {
    char *end = strchr(tok->inp, '\n');
    if (end != NULL) {
        end++;
    }
    else {
        end = strchr(tok->inp, '\0');
        if (end == tok->inp) {
            tok->done = E_EOF;
            return 0;
        }
    }
    if (tok->start == NULL) {
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    ADVANCE_LINENO();
    tok->inp = end;
    return 1;
}

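tok_underflow_string refills the lexer one line at a time. A standalone sketch of the same scan, with hypothetical names (only the scanning logic mirrors the function above):

#include <stdio.h>
#include <string.h>

/* Return a pointer one past the end of the current line (including the
   '\n' if present), or NULL when the remainder of the buffer is empty. */
static const char *next_line_end(const char *cursor) {
    const char *end = strchr(cursor, '\n');
    if (end != NULL) {
        return end + 1;                    /* keep the newline, as the lexer expects */
    }
    end = strchr(cursor, '\0');
    return (end == cursor) ? NULL : end;   /* NULL plays the role of E_EOF */
}

int main(void) {
    const char *buf = "x = 1\ny = 2";      /* note: last line has no '\n' */
    const char *cur = buf;
    const char *end;
    while ((end = next_line_end(cur)) != NULL) {
        printf("line of %d bytes\n", (int)(end - cur));
        cur = end;
    }
    return 0;
}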
/* Fetch a byte from TOK, using the string buffer. */
static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}

/* Unfetch a byte from TOK, using the string buffer. */
static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
}

/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */
static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}

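The assert in buf_ungetc encodes the contract these callbacks share with the BOM and coding-spec checks: only the byte just read may be pushed back, so un-reading is a pointer decrement rather than a write (the buffer may live in read-only memory). A toy pair, with hypothetical names, honoring the same contract:

#include <assert.h>

struct sbuf { const char *p; };            /* hypothetical string cursor */

static int sbuf_getc(struct sbuf *b) {
    return (unsigned char)*b->p++;         /* fetch one byte, advance */
}

static void sbuf_ungetc(int c, struct sbuf *b) {
    b->p--;                                /* step back instead of storing c: */
    assert((unsigned char)*b->p == c);     /* the buffer may be read-only */
}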
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK. */
static char *
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return _PyTokenizer_error_ret(tok);
    str = tok->str; /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            return NULL;
        }
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
            if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                                tok, buf_setreadl))
                return NULL;
        }
    }
    if (tok->enc != NULL) {
        assert(utf8 == NULL);
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}

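decode_str checks lines 1 and 2 separately because _PyTokenizer_check_coding_spec takes a single line, and PEP 263 only permits an encoding declaration on one of the first two lines of the source. A standalone sketch of that two-newline scan and the spans it would hand to the checker (hypothetical driver; only the scan mirrors the code above):

#include <stdio.h>

int main(void) {
    const char *str = "# -*- coding: latin-1 -*-\nx = 1\ny = 2\n";
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    for (const char *s = str; *s != '\0'; s++) {
        if (*s == '\n') {
            newl[lineno++] = s;            /* record at most two newlines */
            if (lineno == 2) break;
        }
    }
    if (newl[0]) {
        printf("check line 1: %.*s\n", (int)(newl[0] - str), str);
    }
    if (newl[0] && newl[1]) {
        printf("check line 2: %.*s\n", (int)(newl[1] - newl[0] - 1), newl[0] + 1);
    }
    return 0;
}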
/* Set up tokenizer for string */
struct tok_state *
_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
{
    struct tok_state *tok = _PyTokenizer_tok_new();
    char *decoded;

    if (tok == NULL)
        return NULL;
    decoded = decode_str(str, exec_input, tok, preserve_crlf);
    if (decoded == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }

    tok->buf = tok->cur = tok->inp = decoded;
    tok->end = decoded;
    tok->underflow = &tok_underflow_string;
    return tok;
}
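A hedged usage sketch of the wrapper above. This is internal, underscore-prefixed API compiled inside CPython itself, not the public C API; only _PyTokenizer_FromString and _PyTokenizer_Free are taken from this diff, the enclosing function is hypothetical, and the tokenize loop is elided:

static int
tokenize_source_string(void)
{
    struct tok_state *tok = _PyTokenizer_FromString("x = 1\n",
                                                    1 /* exec_input */,
                                                    0 /* preserve_crlf */);
    if (tok == NULL) {
        return -1;                /* allocation or decoding failed */
    }
    /* ... drive the shared lexer with tok to produce tokens ... */
    _PyTokenizer_Free(tok);
    return 0;
}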