Mirror of https://github.com/python/cpython.git, synced 2025-10-31 13:41:24 +00:00

Commit 01481f2dc1
* The lexer, which includes the actual lexeme-producing logic, goes into the `lexer` directory.
* The wrappers, one wrapper per input mode (file, string, utf-8, and readline), go into the `tokenizer` directory and include the logic for creating a lexer instance and managing the buffer for the different modes.

---------

Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
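To illustrate the split in this file's terms, here is a minimal, hypothetical driver (a sketch, not code from this commit): it uses only the entry points defined below, `_PyTokenizer_FromString()` and `_PyTokenizer_Free()`, and leaves the lexer-side token getter as a comment, since that half lives in the `lexer` directory.

/* Hypothetical driver for the string wrapper; for illustration only. */
#include "Python.h"
#include "../lexer/state.h"

static int
tokenize_string_example(const char *source)
{
    /* The wrapper decodes the input, resolves any coding spec, and
       installs tok_underflow_string as the buffer-refill hook. */
    struct tok_state *tok = _PyTokenizer_FromString(source,
                                                    /*exec_input=*/1,
                                                    /*preserve_crlf=*/0);
    if (tok == NULL) {
        return -1;
    }
    /* The lexer side would now pull tokens from tok (via the internal
       token getter, e.g. _PyTokenizer_Get()) until ENDMARKER or an
       error is reported. */
    _PyTokenizer_Free(tok);
    return 0;
}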
129 lines | 3.6 KiB | C
#include "Python.h"
#include "errcode.h"

#include "helpers.h"
#include "../lexer/state.h"

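/* Underflow handler for the string tokenizer: extend the valid region of
   the buffer by one line, advancing tok->inp past the next '\n' (or to
   the terminating NUL).  Returns 0 at end of input, 1 otherwise. */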
static int
tok_underflow_string(struct tok_state *tok) {
    char *end = strchr(tok->inp, '\n');
    if (end != NULL) {
        end++;
    }
    else {
        end = strchr(tok->inp, '\0');
        if (end == tok->inp) {
            tok->done = E_EOF;
            return 0;
        }
    }
    if (tok->start == NULL) {
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    ADVANCE_LINENO();
    tok->inp = end;
    return 1;
}

/* Fetch a byte from TOK, using the string buffer. */
static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}

/* Unfetch a byte from TOK, using the string buffer. */
static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
}

/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */
static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}

/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK.  */
static char *
decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return _PyTokenizer_error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            return NULL;
        }
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
            if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return NULL;
        }
    }
    if (tok->enc != NULL) {
        assert(utf8 == NULL);
        utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return _PyTokenizer_error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}

/* Set up tokenizer for string */
struct tok_state *
_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
{
    struct tok_state *tok = _PyTokenizer_tok_new();
    char *decoded;

    if (tok == NULL)
        return NULL;
    decoded = decode_str(str, exec_input, tok, preserve_crlf);
    if (decoded == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }

    /* The whole decoded string is already in memory; the valid region
       [tok->cur, tok->inp) starts empty, and tok_underflow_string()
       extends it one line at a time. */
    tok->buf = tok->cur = tok->inp = decoded;
    tok->end = decoded;
    tok->underflow = &tok_underflow_string;
    return tok;
}