mirror of
				https://github.com/python/cpython.git
				synced 2025-11-04 07:31:38 +00:00 
			
		
		
		
	
		
			
	
	
		
			130 lines
		
	
	
	
		
			3.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
		
		
			
		
	
	
			130 lines
		
	
	
	
		
			3.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| 
								 | 
							
								#include "Python.h"
							 | 
						||
| 
								 | 
							
								#include "errcode.h"
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#include "helpers.h"
							 | 
						||
| 
								 | 
							
								#include "../lexer/state.h"
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static int
							 | 
						||
| 
								 | 
							
								tok_underflow_string(struct tok_state *tok) {
							 | 
						||
| 
								 | 
							
								    char *end = strchr(tok->inp, '\n');
							 | 
						||
| 
								 | 
							
								    if (end != NULL) {
							 | 
						||
| 
								 | 
							
								        end++;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    else {
							 | 
						||
| 
								 | 
							
								        end = strchr(tok->inp, '\0');
							 | 
						||
| 
								 | 
							
								        if (end == tok->inp) {
							 | 
						||
| 
								 | 
							
								            tok->done = E_EOF;
							 | 
						||
| 
								 | 
							
								            return 0;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    if (tok->start == NULL) {
							 | 
						||
| 
								 | 
							
								        tok->buf = tok->cur;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    tok->line_start = tok->cur;
							 | 
						||
| 
								 | 
							
								    ADVANCE_LINENO();
							 | 
						||
| 
								 | 
							
								    tok->inp = end;
							 | 
						||
| 
								 | 
							
								    return 1;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* Fetch a byte from TOK, using the string buffer. */
							 | 
						||
| 
								 | 
							
								static int
							 | 
						||
| 
								 | 
							
								buf_getc(struct tok_state *tok) {
							 | 
						||
| 
								 | 
							
								    return Py_CHARMASK(*tok->str++);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* Unfetch a byte from TOK, using the string buffer. */
							 | 
						||
| 
								 | 
							
								static void
							 | 
						||
| 
								 | 
							
								buf_ungetc(int c, struct tok_state *tok) {
							 | 
						||
| 
								 | 
							
								    tok->str--;
							 | 
						||
| 
								 | 
							
								    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* Set the readline function for TOK to ENC. For the string-based
							 | 
						||
| 
								 | 
							
								   tokenizer, this means to just record the encoding. */
							 | 
						||
| 
								 | 
							
								static int
							 | 
						||
| 
								 | 
							
								buf_setreadl(struct tok_state *tok, const char* enc) {
							 | 
						||
| 
								 | 
							
								    tok->enc = enc;
							 | 
						||
| 
								 | 
							
								    return 1;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* Decode a byte string STR for use as the buffer of TOK.
							 | 
						||
| 
								 | 
							
								   Look for encoding declarations inside STR, and record them
							 | 
						||
| 
								 | 
							
								   inside TOK.  */
							 | 
						||
| 
								 | 
							
								static char *
							 | 
						||
| 
								 | 
							
								decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								    PyObject* utf8 = NULL;
							 | 
						||
| 
								 | 
							
								    char *str;
							 | 
						||
| 
								 | 
							
								    const char *s;
							 | 
						||
| 
								 | 
							
								    const char *newl[2] = {NULL, NULL};
							 | 
						||
| 
								 | 
							
								    int lineno = 0;
							 | 
						||
| 
								 | 
							
								    tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);
							 | 
						||
| 
								 | 
							
								    if (str == NULL)
							 | 
						||
| 
								 | 
							
								        return NULL;
							 | 
						||
| 
								 | 
							
								    tok->enc = NULL;
							 | 
						||
| 
								 | 
							
								    tok->str = str;
							 | 
						||
| 
								 | 
							
								    if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
							 | 
						||
| 
								 | 
							
								        return _PyTokenizer_error_ret(tok);
							 | 
						||
| 
								 | 
							
								    str = tok->str;             /* string after BOM if any */
							 | 
						||
| 
								 | 
							
								    assert(str);
							 | 
						||
| 
								 | 
							
								    if (tok->enc != NULL) {
							 | 
						||
| 
								 | 
							
								        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
							 | 
						||
| 
								 | 
							
								        if (utf8 == NULL)
							 | 
						||
| 
								 | 
							
								            return _PyTokenizer_error_ret(tok);
							 | 
						||
| 
								 | 
							
								        str = PyBytes_AsString(utf8);
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    for (s = str;; s++) {
							 | 
						||
| 
								 | 
							
								        if (*s == '\0') break;
							 | 
						||
| 
								 | 
							
								        else if (*s == '\n') {
							 | 
						||
| 
								 | 
							
								            assert(lineno < 2);
							 | 
						||
| 
								 | 
							
								            newl[lineno] = s;
							 | 
						||
| 
								 | 
							
								            lineno++;
							 | 
						||
| 
								 | 
							
								            if (lineno == 2) break;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    tok->enc = NULL;
							 | 
						||
| 
								 | 
							
								    /* need to check line 1 and 2 separately since check_coding_spec
							 | 
						||
| 
								 | 
							
								       assumes a single line as input */
							 | 
						||
| 
								 | 
							
								    if (newl[0]) {
							 | 
						||
| 
								 | 
							
								        if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
							 | 
						||
| 
								 | 
							
								            return NULL;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
							 | 
						||
| 
								 | 
							
								            if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
							 | 
						||
| 
								 | 
							
								                                   tok, buf_setreadl))
							 | 
						||
| 
								 | 
							
								                return NULL;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    if (tok->enc != NULL) {
							 | 
						||
| 
								 | 
							
								        assert(utf8 == NULL);
							 | 
						||
| 
								 | 
							
								        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
							 | 
						||
| 
								 | 
							
								        if (utf8 == NULL)
							 | 
						||
| 
								 | 
							
								            return _PyTokenizer_error_ret(tok);
							 | 
						||
| 
								 | 
							
								        str = PyBytes_AS_STRING(utf8);
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    assert(tok->decoding_buffer == NULL);
							 | 
						||
| 
								 | 
							
								    tok->decoding_buffer = utf8; /* CAUTION */
							 | 
						||
| 
								 | 
							
								    return str;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* Set up tokenizer for string */
							 | 
						||
| 
								 | 
							
								struct tok_state *
							 | 
						||
| 
								 | 
							
								_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								    struct tok_state *tok = _PyTokenizer_tok_new();
							 | 
						||
| 
								 | 
							
								    char *decoded;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    if (tok == NULL)
							 | 
						||
| 
								 | 
							
								        return NULL;
							 | 
						||
| 
								 | 
							
								    decoded = decode_str(str, exec_input, tok, preserve_crlf);
							 | 
						||
| 
								 | 
							
								    if (decoded == NULL) {
							 | 
						||
| 
								 | 
							
								        _PyTokenizer_Free(tok);
							 | 
						||
| 
								 | 
							
								        return NULL;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    tok->buf = tok->cur = tok->inp = decoded;
							 | 
						||
| 
								 | 
							
								    tok->end = decoded;
							 | 
						||
| 
								 | 
							
								    tok->underflow = &tok_underflow_string;
							 | 
						||
| 
								 | 
							
								    return tok;
							 | 
						||
| 
								 | 
							
								}
							 |