Mirror of https://github.com/python/cpython.git (synced 2025-10-31 21:51:50 +00:00)
	gh-104169: Refactor tokenizer into lexer and wrappers (#110684)
* The lexer, which includes the actual lexeme-producing logic, goes into the `lexer` directory.
* The wrappers, one wrapper per input mode (file, string, utf-8, and readline), go into the `tokenizer` directory and include logic for creating a lexer instance and managing the buffer for the different modes.

---------

Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
This commit is contained in:
parent eb50cd37ea
commit 01481f2dc1

29 changed files with 3185 additions and 2988 deletions
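For orientation, here is a minimal sketch (not part of the commit) of how the two halves cooperate after the split, pieced together from the declarations added in this diff. These are private CPython interfaces; error handling is elided, and the assumption that the `_PyTokenizer_From*` constructors now live in the new `Parser/tokenizer/tokenizer.h` is inferred from the header lists below, since that file's contents are not shown in this extract.

```c
#include "Python.h"
#include "errcode.h"              // E_OK
#include "lexer/state.h"          // struct tok_state, _PyTokenizer_Free()
#include "lexer/lexer.h"          // _PyTokenizer_Get()
#include "tokenizer/tokenizer.h"  // mode-specific constructors (assumed)

/* Drive the file-mode wrapper: the wrapper half builds the tok_state and
   installs the refill callback; the lexer half produces the tokens. */
static void
scan_file(FILE *fp)
{
    struct tok_state *tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
    if (tok == NULL) {
        return;
    }
    struct token token;
    while (tok->done == E_OK) {      // done flips to E_EOF or an error code
        _PyToken_Init(&token);
        (void)_PyTokenizer_Get(tok, &token);  // the lexer half does the work
        _PyToken_Free(&token);       // token.metadata is an owned reference
    }
    _PyTokenizer_Free(tok);
}
```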
@@ -347,20 +347,36 @@ PEGEN_OBJS=		\
 		Parser/string_parser.o \
 		Parser/peg_api.o

+TOKENIZER_OBJS=		\
+		Parser/lexer/buffer.o \
+		Parser/lexer/lexer.o \
+		Parser/lexer/state.o \
+		Parser/tokenizer/file_tokenizer.o \
+		Parser/tokenizer/readline_tokenizer.o \
+		Parser/tokenizer/string_tokenizer.o \
+		Parser/tokenizer/utf8_tokenizer.o \
+		Parser/tokenizer/helpers.o
+
 PEGEN_HEADERS= \
 		$(srcdir)/Include/internal/pycore_parser.h \
 		$(srcdir)/Parser/pegen.h \
 		$(srcdir)/Parser/string_parser.h

+TOKENIZER_HEADERS= \
+		Parser/lexer/buffer.h \
+		Parser/lexer/lexer.h \
+		Parser/lexer/state.h \
+		Parser/tokenizer/tokenizer.h \
+		Parser/tokenizer/helpers.h
+
 POBJS=		\
 		Parser/token.o \

-PARSER_OBJS=	$(POBJS) $(PEGEN_OBJS) Parser/myreadline.o Parser/tokenizer.o
+PARSER_OBJS=	$(POBJS) $(PEGEN_OBJS) $(TOKENIZER_OBJS) Parser/myreadline.o

 PARSER_HEADERS= \
 		$(PEGEN_HEADERS) \
-		$(srcdir)/Parser/tokenizer.h
+		$(TOKENIZER_HEADERS)

 ##########################################################################
 # Python

@@ -1397,6 +1413,8 @@ regen-pegen-metaparser:
 .PHONY: regen-pegen
 regen-pegen:
 	@$(MKDIR_P) $(srcdir)/Parser
+	@$(MKDIR_P) $(srcdir)/Parser/tokenizer
+	@$(MKDIR_P) $(srcdir)/Parser/lexer
 	PYTHONPATH=$(srcdir)/Tools/peg_generator $(PYTHON_FOR_REGEN) -m pegen -q c \
 		$(srcdir)/Grammar/python.gram \
 		$(srcdir)/Grammar/Tokens \
@@ -0,0 +1,4 @@
Split the tokenizer into two separate directories:
- One part includes the actual lexeme producing logic and lives in ``Parser/lexer``.
- The second part wraps the lexer according to the different tokenization modes
  we have (string, utf-8, file, interactive, readline) and lives in ``Parser/tokenizer``.
@@ -172,7 +172,14 @@
     <ClCompile Include="..\Parser\action_helpers.c" />
     <ClCompile Include="..\Parser\string_parser.c" />
     <ClCompile Include="..\Parser\token.c" />
-    <ClCompile Include="..\Parser\tokenizer.c" />
+    <ClCompile Include="..\Parser\lexer\buffer.c" />
+    <ClCompile Include="..\Parser\lexer\state.c" />
+    <ClCompile Include="..\Parser\lexer\lexer.c" />
+    <ClCompile Include="..\Parser\tokenizer\string_tokenizer.c" />
+    <ClCompile Include="..\Parser\tokenizer\file_tokenizer.c" />
+    <ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c" />
+    <ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c" />
+    <ClCompile Include="..\Parser\tokenizer\helpers.c" />
     <ClCompile Include="..\PC\invalid_parameter_handler.c" />
     <ClCompile Include="..\PC\msvcrtmodule.c" />
     <ClCompile Include="..\PC\winreg.c" />
@@ -397,7 +397,28 @@
     <ClCompile Include="..\Parser\token.c">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\Parser\tokenizer.c">
+    <ClCompile Include="..\Parser\lexer\lexer.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\Parser\lexer\buffer.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Parser\lexer\state.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Parser\tokenizer\string_tokenizer.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Parser\tokenizer\file_tokenizer.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Parser\tokenizer\helpers.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\Python\traceback.c">
@@ -362,7 +362,11 @@
     <ClInclude Include="..\Objects\stringlib\replace.h" />
     <ClInclude Include="..\Objects\stringlib\split.h" />
     <ClInclude Include="..\Objects\unicodetype_db.h" />
-    <ClInclude Include="..\Parser\tokenizer.h" />
+    <ClInclude Include="..\Parser\lexer\state.h" />
+    <ClInclude Include="..\Parser\lexer\lexer.h" />
+    <ClInclude Include="..\Parser\lexer\buffer.h" />
+    <ClInclude Include="..\Parser\tokenizer\helpers.h" />
+    <ClInclude Include="..\Parser\tokenizer\tokenizer.h" />
     <ClInclude Include="..\Parser\string_parser.h" />
     <ClInclude Include="..\Parser\pegen.h" />
     <ClInclude Include="..\PC\errmap.h" />

@@ -507,7 +511,14 @@
     <ClCompile Include="..\Objects\unionobject.c" />
     <ClCompile Include="..\Objects\weakrefobject.c" />
     <ClCompile Include="..\Parser\myreadline.c" />
-    <ClCompile Include="..\Parser\tokenizer.c" />
+    <ClCompile Include="..\Parser\lexer\state.c" />
+    <ClCompile Include="..\Parser\lexer\lexer.c" />
+    <ClCompile Include="..\Parser\lexer\buffer.c" />
+    <ClCompile Include="..\Parser\tokenizer\string_tokenizer.c" />
+    <ClCompile Include="..\Parser\tokenizer\file_tokenizer.c" />
+    <ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c" />
+    <ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c" />
+    <ClCompile Include="..\Parser\tokenizer\helpers.c" />
     <ClCompile Include="..\Parser\token.c" />
     <ClCompile Include="..\Parser\pegen.c" />
     <ClCompile Include="..\Parser\pegen_errors.c" />
@@ -291,7 +291,19 @@
     <ClInclude Include="..\Objects\unicodetype_db.h">
       <Filter>Objects</Filter>
     </ClInclude>
-    <ClInclude Include="..\Parser\tokenizer.h">
+    <ClInclude Include="..\Parser\lexer\lexer.h">
       <Filter>Parser</Filter>
     </ClInclude>
+    <ClInclude Include="..\Parser\lexer\state.h">
+      <Filter>Parser</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Parser\lexer\buffer.h">
+      <Filter>Parser</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Parser\tokenizer\tokenizer.h">
+      <Filter>Parser</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Parser\tokenizer\helpers.h">
+      <Filter>Parser</Filter>
+    </ClInclude>
     <ClInclude Include="..\PC\errmap.h">

@@ -1139,7 +1151,28 @@
     <ClCompile Include="..\Parser\myreadline.c">
       <Filter>Parser</Filter>
     </ClCompile>
-    <ClCompile Include="..\Parser\tokenizer.c">
+    <ClCompile Include="..\Parser\lexer\lexer.c">
       <Filter>Parser</Filter>
     </ClCompile>
+    <ClCompile Include="..\Parser\lexer\state.c">
+      <Filter>Parser</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Parser\lexer\buffer.c">
+      <Filter>Parser</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Parser\tokenizer\string_tokenizer.c">
+      <Filter>Parser</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Parser\tokenizer\file_tokenizer.c">
+      <Filter>Parser</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c">
+      <Filter>Parser</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c">
+      <Filter>Parser</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Parser\tokenizer\helpers.c">
+      <Filter>Parser</Filter>
+    </ClCompile>
     <ClCompile Include="..\Parser\token.c">
@@ -1,7 +1,6 @@
 #include <Python.h>

 #include "pegen.h"
-#include "tokenizer.h"
 #include "string_parser.h"
 #include "pycore_runtime.h"         // _PyRuntime

Parser/lexer/buffer.c (new file, 76 lines)
@@ -0,0 +1,76 @@
#include "Python.h"
#include "errcode.h"

#include "state.h"

/* Traverse and remember all f-string buffers, in order to be able to restore
   them after reallocating tok->buf */
void
_PyLexer_remember_fstring_buffers(struct tok_state *tok)
{
    int index;
    tokenizer_mode *mode;

    for (index = tok->tok_mode_stack_index; index >= 0; --index) {
        mode = &(tok->tok_mode_stack[index]);
        mode->f_string_start_offset = mode->f_string_start - tok->buf;
        mode->f_string_multi_line_start_offset = mode->f_string_multi_line_start - tok->buf;
    }
}

/* Traverse and restore all f-string buffers after reallocating tok->buf */
void
_PyLexer_restore_fstring_buffers(struct tok_state *tok)
{
    int index;
    tokenizer_mode *mode;

    for (index = tok->tok_mode_stack_index; index >= 0; --index) {
        mode = &(tok->tok_mode_stack[index]);
        mode->f_string_start = tok->buf + mode->f_string_start_offset;
        mode->f_string_multi_line_start = tok->buf + mode->f_string_multi_line_start_offset;
    }
}

/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
       stored the result in tok->decoding_buffer
     3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
       (in the s buffer) to copy entire contents of the line read
       by tok->decoding_readline.  tok->decoding_buffer has the overflow.
       In this case, tok_readline_recode is called in a loop (with an expanded buffer)
       until the buffer ends with a '\n' (or until the end of the file is
       reached): see tok_nextc and its calls to tok_reserve_buf.
*/
int
_PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
    if (newsize > tok->end - tok->buf) {
        char *newbuf = tok->buf;
        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
        Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
        _PyLexer_remember_fstring_buffers(tok);
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
        if (newbuf == NULL) {
            tok->done = E_NOMEM;
            return 0;
        }
        tok->buf = newbuf;
        tok->cur = tok->buf + cur;
        tok->inp = tok->buf + oldsize;
        tok->end = tok->buf + newsize;
        tok->start = start < 0 ? NULL : tok->buf + start;
        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
        _PyLexer_restore_fstring_buffers(tok);
    }
    return 1;
}
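Two details of `_PyLexer_tok_reserve_buf` above are easy to miss. The growth policy `newsize = oldsize + Py_MAX(size, oldsize >> 1)` extends the buffer by at least 50% of the bytes already consumed, so long inputs cause O(log n) reallocations rather than one per line; and because `PyMem_Realloc` can move the block, every pointer into the buffer (including the f-string start pointers) is saved as an offset and rebased afterwards. Below is a standalone illustration of the growth arithmetic only, with hypothetical sizes; the real function reallocates only when `newsize` exceeds the current capacity:

```c
#include <stdio.h>

#define Py_MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
    /* Iterate the growth formula for repeated 8192-byte reservations,
       pretending the buffer is full each time (hypothetical sizes). */
    long oldsize = 0, request = 8192;
    for (int i = 1; i <= 5; i++) {
        long newsize = oldsize + Py_MAX(request, oldsize >> 1);
        printf("reserve #%d: %ld -> %ld\n", i, oldsize, newsize);
        oldsize = newsize;
    }
    return 0;  /* prints 8192, 16384, 24576, 36864, 55296 */
}
```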
Parser/lexer/buffer.h (new file, 10 lines)
@@ -0,0 +1,10 @@
#ifndef _LEXER_BUFFER_H_
#define _LEXER_BUFFER_H_

#include "pyport.h"

void _PyLexer_remember_fstring_buffers(struct tok_state *tok);
void _PyLexer_restore_fstring_buffers(struct tok_state *tok);
int _PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size);

#endif
Parser/lexer/lexer.c (new file, 1419 lines)
File diff suppressed because it is too large
Parser/lexer/lexer.h (new file, 10 lines)
@@ -0,0 +1,10 @@
#ifndef _PY_LEXER_LEXER_H_
#define _PY_LEXER_LEXER_H_

#include "state.h"

int _PyLexer_update_fstring_expr(struct tok_state *tok, char cur);

int _PyTokenizer_Get(struct tok_state *, struct token *);

#endif
Parser/lexer/state.c (new file, 149 lines)
@@ -0,0 +1,149 @@
#include "Python.h"
#include "pycore_pystate.h"
#include "pycore_token.h"
#include "errcode.h"

#include "state.h"

/* Never change this */
#define TABSIZE 8

/* Create and initialize a new tok_state structure */
struct tok_state *
_PyTokenizer_tok_new(void)
{
    struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
                                            sizeof(struct tok_state));
    if (tok == NULL)
        return NULL;
    tok->buf = tok->cur = tok->inp = NULL;
    tok->fp_interactive = 0;
    tok->interactive_src_start = NULL;
    tok->interactive_src_end = NULL;
    tok->start = NULL;
    tok->end = NULL;
    tok->done = E_OK;
    tok->fp = NULL;
    tok->input = NULL;
    tok->tabsize = TABSIZE;
    tok->indent = 0;
    tok->indstack[0] = 0;
    tok->atbol = 1;
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->starting_col_offset = -1;
    tok->col_offset = -1;
    tok->level = 0;
    tok->altindstack[0] = 0;
    tok->decoding_state = STATE_INIT;
    tok->decoding_erred = 0;
    tok->enc = NULL;
    tok->encoding = NULL;
    tok->cont_line = 0;
    tok->filename = NULL;
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
    tok->readline = NULL;
    tok->type_comments = 0;
    tok->interactive_underflow = IUNDERFLOW_NORMAL;
    tok->underflow = NULL;
    tok->str = NULL;
    tok->report_warnings = 1;
    tok->tok_extra_tokens = 0;
    tok->comment_newline = 0;
    tok->implicit_newline = 0;
    tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
    tok->tok_mode_stack_index = 0;
#ifdef Py_DEBUG
    tok->debug = _Py_GetConfig()->parser_debug;
#endif
    return tok;
}

static void
free_fstring_expressions(struct tok_state *tok)
{
    int index;
    tokenizer_mode *mode;

    for (index = tok->tok_mode_stack_index; index >= 0; --index) {
        mode = &(tok->tok_mode_stack[index]);
        if (mode->last_expr_buffer != NULL) {
            PyMem_Free(mode->last_expr_buffer);
            mode->last_expr_buffer = NULL;
            mode->last_expr_size = 0;
            mode->last_expr_end = -1;
        }
    }
}

/* Free a tok_state structure */
void
_PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL) {
        PyMem_Free(tok->encoding);
    }
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
    Py_XDECREF(tok->readline);
    Py_XDECREF(tok->filename);
    if ((tok->readline != NULL || tok->fp != NULL ) && tok->buf != NULL) {
        PyMem_Free(tok->buf);
    }
    if (tok->input) {
        PyMem_Free(tok->input);
    }
    if (tok->interactive_src_start != NULL) {
        PyMem_Free(tok->interactive_src_start);
    }
    free_fstring_expressions(tok);
    PyMem_Free(tok);
}

void
_PyToken_Free(struct token *token) {
    Py_XDECREF(token->metadata);
}

void
_PyToken_Init(struct token *token) {
    token->metadata = NULL;
}

int
_PyLexer_type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
                         int end_col_offset, const char *start, const char *end)
{
    token->level = tok->level;
    token->lineno = token->end_lineno = tok->lineno;
    token->col_offset = col_offset;
    token->end_col_offset = end_col_offset;
    token->start = start;
    token->end = end;
    return type;
}

int
_PyLexer_token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
{
    assert((start == NULL && end == NULL) || (start != NULL && end != NULL));
    token->level = tok->level;
    if (ISSTRINGLIT(type)) {
        token->lineno = tok->first_lineno;
    }
    else {
        token->lineno = tok->lineno;
    }
    token->end_lineno = tok->lineno;
    token->col_offset = token->end_col_offset = -1;
    token->start = start;
    token->end = end;

    if (start != NULL && end != NULL) {
        token->col_offset = tok->starting_col_offset;
        token->end_col_offset = tok->col_offset;
    }
    return type;
}
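Both `*_token_setup` helpers above are return-value shims: a lexer rule computes the lexeme's `start`/`end` pointers into `tok->buf` and then returns through `_PyLexer_token_setup()`, so the token struct is filled and the token type is produced in one expression. A hypothetical call site for illustration only; the real ones are in `Parser/lexer/lexer.c`, whose diff is suppressed above:

```c
/* Hypothetical lexer-rule ending: p_start/p_end would point at the
   lexeme boundaries inside tok->buf, as the real lexer.c maintains. */
static int
emit_number(struct tok_state *tok, struct token *token,
            const char *p_start, const char *p_end)
{
    /* Copies level/lineno/col_offset out of tok and returns NUMBER,
       so a rule can simply end with `return emit_number(...);`. */
    return _PyLexer_token_setup(tok, token, NUMBER, p_start, p_end);
}
```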
@@ -1,19 +1,15 @@
-#ifndef Py_TOKENIZER_H
-#define Py_TOKENIZER_H
-#ifdef __cplusplus
-extern "C" {
-#endif
+#ifndef _PY_LEXER_H_
+#define _PY_LEXER_H_

 #include "object.h"

 /* Tokenizer interface */

 #include "pycore_token.h" /* For token types */

 #define MAXINDENT 100       /* Max indentation level */
 #define MAXLEVEL 200        /* Max parentheses level */
 #define MAXFSTRINGLEVEL 150 /* Max f-string nesting level */

 #define INSIDE_FSTRING(tok) (tok->tok_mode_stack_index > 0)
 #define INSIDE_FSTRING_EXPR(tok) (tok->curly_bracket_expr_start_depth >= 0)

 enum decoding_state {
     STATE_INIT,
     STATE_SEEK_CODING,

@@ -118,6 +114,8 @@ struct tok_state {

     /* How to proceed when asked for a new token in interactive mode */
     enum interactive_underflow_t interactive_underflow;
+    int (*underflow)(struct tok_state *); /* Function to call when buffer is empty and we need to refill it*/

     int report_warnings;
     // TODO: Factor this into its own thing
     tokenizer_mode tok_mode_stack[MAXFSTRINGLEVEL];

@@ -130,19 +128,14 @@ struct tok_state {
 #endif
 };

-extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
-extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
-extern struct tok_state *_PyTokenizer_FromReadline(PyObject*, const char*, int, int);
-extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
-                                              const char *, const char *);
-extern void _PyTokenizer_Free(struct tok_state *);
-extern void _PyToken_Free(struct token *);
-extern void _PyToken_Init(struct token *);
-extern int _PyTokenizer_Get(struct tok_state *, struct token *);
+int _PyLexer_type_comment_token_setup(struct tok_state *tok, struct token *token, int type, int col_offset,
+                         int end_col_offset, const char *start, const char *end);
+int _PyLexer_token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end);
+
+struct tok_state *_PyTokenizer_tok_new(void);
+void _PyTokenizer_Free(struct tok_state *);
+void _PyToken_Free(struct token *);
+void _PyToken_Init(struct token *);

 #define tok_dump _Py_tok_dump

-#ifdef __cplusplus
-}
-#endif
-#endif /* !Py_TOKENIZER_H */
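The `underflow` member added to `struct tok_state` above is the seam between the two new directories: each wrapper installs its own refill callback at construction time (`_PyTokenizer_FromFile` later in this diff picks `tok_underflow_file` or `tok_underflow_interactive`), and the lexer calls it whenever the buffer runs dry, without knowing which input mode it is serving. A schematic sketch; `lexer_refill` is a hypothetical name, since the real call sites are in the suppressed `Parser/lexer/lexer.c` diff:

```c
/* Schematic only: the real refill logic lives in Parser/lexer/lexer.c. */
static int
lexer_refill(struct tok_state *tok)
{
    /* Mode-agnostic dispatch: the file, string, utf-8, and readline
       wrappers each installed their own tok->underflow when they
       built the state, so the lexer never branches on input mode. */
    return tok->underflow(tok);
}
```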
@@ -1,6 +1,5 @@
 #include "Python.h"

-#include "tokenizer.h"
 #include "pegen.h"

 mod_ty
@@ -3,7 +3,8 @@
 #include "pycore_pystate.h"       // _PyThreadState_GET()
 #include <errcode.h>

-#include "tokenizer.h"
+#include "lexer/lexer.h"
+#include "tokenizer/tokenizer.h"
 #include "pegen.h"

 // Internal parser functions
@@ -2,7 +2,8 @@
 #include <errcode.h>

 #include "pycore_pyerrors.h"      // _PyErr_ProgramDecodedTextObject()
-#include "tokenizer.h"
+#include "lexer/state.h"
+#include "lexer/lexer.h"
 #include "pegen.h"

 // TOKENIZER ERRORS
@@ -4,7 +4,7 @@
 #include "pycore_bytesobject.h"   // _PyBytes_DecodeEscape()
 #include "pycore_unicodeobject.h" // _PyUnicode_DecodeUnicodeEscapeInternal()

-#include "tokenizer.h"
+#include "lexer/state.h"
 #include "pegen.h"
 #include "string_parser.h"

Parser/tokenizer.c (deleted, 2951 lines)
File diff suppressed because it is too large
Parser/tokenizer/file_tokenizer.c (new file, 470 lines)
@@ -0,0 +1,470 @@
#ifdef HAVE_UNISTD_H
#  include <unistd.h>             // read()
#endif

#include "Python.h"
#include "pycore_call.h"
#include "pycore_import.h"
#include "pycore_fileutils.h"
#include "errcode.h"

#include "helpers.h"
#include "../lexer/state.h"
#include "../lexer/lexer.h"
#include "../lexer/buffer.h"

static int
tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
    assert(tok->fp_interactive);

    if (!line) {
        return 0;
    }

    Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
    Py_ssize_t line_size = strlen(line);
    char last_char = line[line_size > 0 ? line_size - 1 : line_size];
    if (last_char != '\n') {
        line_size += 1;
    }
    char* new_str = tok->interactive_src_start;

    new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
    if (!new_str) {
        if (tok->interactive_src_start) {
            PyMem_Free(tok->interactive_src_start);
        }
        tok->interactive_src_start = NULL;
        tok->interactive_src_end = NULL;
        tok->done = E_NOMEM;
        return -1;
    }
    strcpy(new_str + current_size, line);
    tok->implicit_newline = 0;
    if (last_char != '\n') {
        /* Last line does not end in \n, fake one */
        new_str[current_size + line_size - 1] = '\n';
        new_str[current_size + line_size] = '\0';
        tok->implicit_newline = 1;
    }
    tok->interactive_src_start = new_str;
    tok->interactive_src_end = new_str + current_size + line_size;
    return 0;
}

static int
tok_readline_raw(struct tok_state *tok)
{
    do {
        if (!_PyLexer_tok_reserve_buf(tok, BUFSIZ)) {
            return 0;
        }
        int n_chars = (int)(tok->end - tok->inp);
        size_t line_size = 0;
        char *line = _Py_UniversalNewlineFgetsWithSize(tok->inp, n_chars, tok->fp, NULL, &line_size);
        if (line == NULL) {
            return 1;
        }
        if (tok->fp_interactive &&
            tok_concatenate_interactive_new_line(tok, line) == -1) {
            return 0;
        }
        tok->inp += line_size;
        if (tok->inp == tok->buf) {
            return 0;
        }
    } while (tok->inp[-1] != '\n');
    return 1;
}

static int
tok_readline_recode(struct tok_state *tok) {
    PyObject *line;
    const  char *buf;
    Py_ssize_t buflen;
    line = tok->decoding_buffer;
    if (line == NULL) {
        line = PyObject_CallNoArgs(tok->decoding_readline);
        if (line == NULL) {
            _PyTokenizer_error_ret(tok);
            goto error;
        }
    }
    else {
        tok->decoding_buffer = NULL;
    }
    buf = PyUnicode_AsUTF8AndSize(line, &buflen);
    if (buf == NULL) {
        _PyTokenizer_error_ret(tok);
        goto error;
    }
    // Make room for the null terminator *and* potentially
    // an extra newline character that we may need to artificially
    // add.
    size_t buffer_size = buflen + 2;
    if (!_PyLexer_tok_reserve_buf(tok, buffer_size)) {
        goto error;
    }
    memcpy(tok->inp, buf, buflen);
    tok->inp += buflen;
    *tok->inp = '\0';
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, buf) == -1) {
        goto error;
    }
    Py_DECREF(line);
    return 1;
error:
    Py_XDECREF(line);
    return 0;
}

/* Fetch the next byte from TOK. */
static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}

/* Unfetch the last byte back into TOK.  */
static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}

/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from _PyTokenizer_check_bom and _PyTokenizer_check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline, *open, *stream;
    int fd;
    long pos;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp.  If tok->fp was opened in text mode on Windows,
     * its file position counts CRLF as one char and can't be directly mapped
     * to the file offset for fd.  Instead we step back one byte and read to
     * the end of line.*/
    pos = ftell(tok->fp);
    if (pos == -1 ||
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        return 0;
    }

    open = _PyImport_GetModuleAttrString("io", "open");
    if (open == NULL) {
        return 0;
    }
    stream = PyObject_CallFunction(open, "isisOOO",
                    fd, "r", -1, enc, Py_None, Py_None, Py_False);
    Py_DECREF(open);
    if (stream == NULL) {
        return 0;
    }

    readline = PyObject_GetAttr(stream, &_Py_ID(readline));
    Py_DECREF(stream);
    if (readline == NULL) {
        return 0;
    }
    Py_XSETREF(tok->decoding_readline, readline);

    if (pos > 0) {
        PyObject *bufobj = _PyObject_CallNoArgs(readline);
        if (bufobj == NULL) {
            return 0;
        }
        Py_DECREF(bufobj);
    }

    return 1;
}

static int
tok_underflow_interactive(struct tok_state *tok) {
    if (tok->interactive_underflow == IUNDERFLOW_STOP) {
        tok->done = E_INTERACT_STOP;
        return 1;
    }
    char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
    if (newtok != NULL) {
        char *translated = _PyTokenizer_translate_newlines(newtok, 0, 0, tok);
        PyMem_Free(newtok);
        if (translated == NULL) {
            return 0;
        }
        newtok = translated;
    }
    if (tok->encoding && newtok && *newtok) {
        /* Recode to UTF-8 */
        Py_ssize_t buflen;
        const char* buf;
        PyObject *u = _PyTokenizer_translate_into_utf8(newtok, tok->encoding);
        PyMem_Free(newtok);
        if (u == NULL) {
            tok->done = E_DECODE;
            return 0;
        }
        buflen = PyBytes_GET_SIZE(u);
        buf = PyBytes_AS_STRING(u);
        newtok = PyMem_Malloc(buflen+1);
        if (newtok == NULL) {
            Py_DECREF(u);
            tok->done = E_NOMEM;
            return 0;
        }
        strcpy(newtok, buf);
        Py_DECREF(u);
    }
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, newtok) == -1) {
        PyMem_Free(newtok);
        return 0;
    }
    if (tok->nextprompt != NULL) {
        tok->prompt = tok->nextprompt;
    }
    if (newtok == NULL) {
        tok->done = E_INTR;
    }
    else if (*newtok == '\0') {
        PyMem_Free(newtok);
        tok->done = E_EOF;
    }
    else if (tok->start != NULL) {
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
        _PyLexer_remember_fstring_buffers(tok);
        size_t size = strlen(newtok);
        ADVANCE_LINENO();
        if (!_PyLexer_tok_reserve_buf(tok, size + 1)) {
            PyMem_Free(tok->buf);
            tok->buf = NULL;
            PyMem_Free(newtok);
            return 0;
        }
        memcpy(tok->cur, newtok, size + 1);
        PyMem_Free(newtok);
        tok->inp += size;
        tok->multi_line_start = tok->buf + cur_multi_line_start;
        _PyLexer_restore_fstring_buffers(tok);
    }
    else {
        _PyLexer_remember_fstring_buffers(tok);
        ADVANCE_LINENO();
        PyMem_Free(tok->buf);
        tok->buf = newtok;
        tok->cur = tok->buf;
        tok->line_start = tok->buf;
        tok->inp = strchr(tok->buf, '\0');
        tok->end = tok->inp + 1;
        _PyLexer_restore_fstring_buffers(tok);
    }
    if (tok->done != E_OK) {
        if (tok->prompt != NULL) {
            PySys_WriteStderr("\n");
        }
        return 0;
    }

    if (tok->tok_mode_stack_index && !_PyLexer_update_fstring_expr(tok, 0)) {
        return 0;
    }
    return 1;
}

static int
tok_underflow_file(struct tok_state *tok) {
    if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
        tok->cur = tok->inp = tok->buf;
    }
    if (tok->decoding_state == STATE_INIT) {
        /* We have not yet determined the encoding.
           If an encoding is found, use the file-pointer
           reader functions from now on. */
        if (!_PyTokenizer_check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
            _PyTokenizer_error_ret(tok);
            return 0;
        }
        assert(tok->decoding_state != STATE_INIT);
    }
    /* Read until '\n' or EOF */
    if (tok->decoding_readline != NULL) {
        /* We already have a codec associated with this input. */
        if (!tok_readline_recode(tok)) {
            return 0;
        }
    }
    else {
        /* We want a 'raw' read. */
        if (!tok_readline_raw(tok)) {
            return 0;
        }
    }
    if (tok->inp == tok->cur) {
        tok->done = E_EOF;
        return 0;
    }
    tok->implicit_newline = 0;
    if (tok->inp[-1] != '\n') {
        assert(tok->inp + 1 < tok->end);
        /* Last line does not end in \n, fake one */
        *tok->inp++ = '\n';
        *tok->inp = '\0';
        tok->implicit_newline = 1;
    }

    if (tok->tok_mode_stack_index && !_PyLexer_update_fstring_expr(tok, 0)) {
        return 0;
    }

    ADVANCE_LINENO();
    if (tok->decoding_state != STATE_NORMAL) {
        if (tok->lineno > 2) {
            tok->decoding_state = STATE_NORMAL;
        }
        else if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
                                    tok, fp_setreadl))
        {
            return 0;
        }
    }
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
        _PyTokenizer_error_ret(tok);
        return 0;
    }
    assert(tok->done == E_OK);
    return tok->done == E_OK;
}

/* Set up tokenizer for file */
struct tok_state *
_PyTokenizer_FromFile(FILE *fp, const char* enc,
                      const char *ps1, const char *ps2)
{
    struct tok_state *tok = _PyTokenizer_tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
        _PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    if (ps1 || ps2) {
        tok->underflow = &tok_underflow_interactive;
    } else {
        tok->underflow = &tok_underflow_file;
    }
    if (enc != NULL) {
        /* Must copy encoding declaration since it
           gets copied into the parse tree. */
        tok->encoding = _PyTokenizer_new_string(enc, strlen(enc), tok);
        if (!tok->encoding) {
            _PyTokenizer_Free(tok);
            return NULL;
        }
        tok->decoding_state = STATE_NORMAL;
    }
    return tok;
}

#if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3))
// fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's
// dup() emulation with open() is slow.
typedef union {
    void *cookie;
    int fd;
} borrowed;

static ssize_t
borrow_read(void *cookie, char *buf, size_t size)
{
    borrowed b = {.cookie = cookie};
    return read(b.fd, (void *)buf, size);
}

static FILE *
fdopen_borrow(int fd) {
    // supports only reading. seek fails. close and write are no-ops.
    cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL};
    borrowed b = {.fd = fd};
    return fopencookie(b.cookie, "r", io_cb);
}
#else
static FILE *
fdopen_borrow(int fd) {
    fd = _Py_dup(fd);
    if (fd < 0) {
        return NULL;
    }
    return fdopen(fd, "r");
}
#endif

/* Get the encoding of a Python file. Check for the coding cookie and check if
   the file starts with a BOM.

   _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
   encoding in the first or second line of the file (in which case the encoding
   should be assumed to be UTF-8).

   The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
   by the caller. */
char *
_PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
{
    struct tok_state *tok;
    FILE *fp;
    char *encoding = NULL;

    fp = fdopen_borrow(fd);
    if (fp == NULL) {
        return NULL;
    }
    tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
    if (tok == NULL) {
        fclose(fp);
        return NULL;
    }
    if (filename != NULL) {
        tok->filename = Py_NewRef(filename);
    }
    else {
        tok->filename = PyUnicode_FromString("<string>");
        if (tok->filename == NULL) {
            fclose(fp);
            _PyTokenizer_Free(tok);
            return encoding;
        }
    }
    struct token token;
    // We don't want to report warnings here because it could cause infinite recursion
    // if fetching the encoding shows a warning.
    tok->report_warnings = 0;
    while (tok->lineno < 2 && tok->done == E_OK) {
        _PyToken_Init(&token);
        _PyTokenizer_Get(tok, &token);
        _PyToken_Free(&token);
    }
    fclose(fp);
    if (tok->encoding) {
        encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
        if (encoding) {
            strcpy(encoding, tok->encoding);
        }
    }
    _PyTokenizer_Free(tok);
    return encoding;
}
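Per the doc comment above, the string returned by `_PyTokenizer_FindEncodingFilename` is owned by the caller. A minimal usage sketch; the `fd`/`filename` parameters and the printing are illustrative, not from the diff:

```c
#include <stdio.h>

/* Sketch of the intended call pattern for the helper above. */
static void
report_declared_encoding(int fd, PyObject *filename)
{
    char *enc = _PyTokenizer_FindEncodingFilename(fd, filename);
    if (enc == NULL) {
        /* No cookie or BOM in the first two lines (or an error):
           the source should be assumed to be UTF-8. */
        return;
    }
    printf("declared encoding: %s\n", enc);
    PyMem_Free(enc);   /* caller must free, see doc comment above */
}
```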
							
								
								
									
										552
									
								
								Parser/tokenizer/helpers.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										552
									
								
								Parser/tokenizer/helpers.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,552 @@ | |||
| #include "Python.h" | ||||
| #include "errcode.h" | ||||
| #include "pycore_token.h" | ||||
| 
 | ||||
| #include "../lexer/state.h" | ||||
| 
 | ||||
| 
 | ||||
| /* ############## ERRORS ############## */ | ||||
| 
 | ||||
| static int | ||||
| _syntaxerror_range(struct tok_state *tok, const char *format, | ||||
|                    int col_offset, int end_col_offset, | ||||
|                    va_list vargs) | ||||
| { | ||||
|     // In release builds, we don't want to overwrite a previous error, but in debug builds we
 | ||||
|     // want to fail if we are not doing it so we can fix it.
 | ||||
|     assert(tok->done != E_ERROR); | ||||
|     if (tok->done == E_ERROR) { | ||||
|         return ERRORTOKEN; | ||||
|     } | ||||
|     PyObject *errmsg, *errtext, *args; | ||||
|     errmsg = PyUnicode_FromFormatV(format, vargs); | ||||
|     if (!errmsg) { | ||||
|         goto error; | ||||
|     } | ||||
| 
 | ||||
|     errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start, | ||||
|                                    "replace"); | ||||
|     if (!errtext) { | ||||
|         goto error; | ||||
|     } | ||||
| 
 | ||||
|     if (col_offset == -1) { | ||||
|         col_offset = (int)PyUnicode_GET_LENGTH(errtext); | ||||
|     } | ||||
|     if (end_col_offset == -1) { | ||||
|         end_col_offset = col_offset; | ||||
|     } | ||||
| 
 | ||||
|     Py_ssize_t line_len = strcspn(tok->line_start, "\n"); | ||||
|     if (line_len != tok->cur - tok->line_start) { | ||||
|         Py_DECREF(errtext); | ||||
|         errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len, | ||||
|                                        "replace"); | ||||
|     } | ||||
|     if (!errtext) { | ||||
|         goto error; | ||||
|     } | ||||
| 
 | ||||
|     args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno, | ||||
|                          col_offset, errtext, tok->lineno, end_col_offset); | ||||
|     if (args) { | ||||
|         PyErr_SetObject(PyExc_SyntaxError, args); | ||||
|         Py_DECREF(args); | ||||
|     } | ||||
| 
 | ||||
| error: | ||||
|     Py_XDECREF(errmsg); | ||||
|     tok->done = E_ERROR; | ||||
|     return ERRORTOKEN; | ||||
| } | ||||
| 
 | ||||
| int | ||||
| _PyTokenizer_syntaxerror(struct tok_state *tok, const char *format, ...) | ||||
| { | ||||
|     // This errors are cleaned on startup. Todo: Fix it.
 | ||||
|     va_list vargs; | ||||
|     va_start(vargs, format); | ||||
|     int ret = _syntaxerror_range(tok, format, -1, -1, vargs); | ||||
|     va_end(vargs); | ||||
|     return ret; | ||||
| } | ||||
| 
 | ||||
| int | ||||
| _PyTokenizer_syntaxerror_known_range(struct tok_state *tok, | ||||
|                         int col_offset, int end_col_offset, | ||||
|                         const char *format, ...) | ||||
| { | ||||
|     va_list vargs; | ||||
|     va_start(vargs, format); | ||||
|     int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs); | ||||
|     va_end(vargs); | ||||
|     return ret; | ||||
| } | ||||
| 
 | ||||
| int | ||||
| _PyTokenizer_indenterror(struct tok_state *tok) | ||||
| { | ||||
|     tok->done = E_TABSPACE; | ||||
|     tok->cur = tok->inp; | ||||
|     return ERRORTOKEN; | ||||
| } | ||||
| 
 | ||||
| char * | ||||
| _PyTokenizer_error_ret(struct tok_state *tok) /* XXX */ | ||||
| { | ||||
|     tok->decoding_erred = 1; | ||||
|     if ((tok->fp != NULL || tok->readline != NULL) && tok->buf != NULL) {/* see _PyTokenizer_Free */ | ||||
|         PyMem_Free(tok->buf); | ||||
|     } | ||||
|     tok->buf = tok->cur = tok->inp = NULL; | ||||
|     tok->start = NULL; | ||||
|     tok->end = NULL; | ||||
|     tok->done = E_DECODE; | ||||
|     return NULL;                /* as if it were EOF */ | ||||
| } | ||||
| 
 | ||||
| int | ||||
| _PyTokenizer_warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char) | ||||
| { | ||||
|     if (!tok->report_warnings) { | ||||
|         return 0; | ||||
|     } | ||||
| 
 | ||||
|     PyObject *msg = PyUnicode_FromFormat( | ||||
|         "invalid escape sequence '\\%c'", | ||||
|         (char) first_invalid_escape_char | ||||
|     ); | ||||
| 
 | ||||
|     if (msg == NULL) { | ||||
|         return -1; | ||||
|     } | ||||
| 
 | ||||
|     if (PyErr_WarnExplicitObject(PyExc_SyntaxWarning, msg, tok->filename, | ||||
|                                  tok->lineno, NULL, NULL) < 0) { | ||||
|         Py_DECREF(msg); | ||||
| 
 | ||||
|         if (PyErr_ExceptionMatches(PyExc_SyntaxWarning)) { | ||||
|             /* Replace the SyntaxWarning exception with a SyntaxError
 | ||||
|                to get a more accurate error report */ | ||||
|             PyErr_Clear(); | ||||
|             return _PyTokenizer_syntaxerror(tok, "invalid escape sequence '\\%c'", (char) first_invalid_escape_char); | ||||
|         } | ||||
| 
 | ||||
|         return -1; | ||||
|     } | ||||
| 
 | ||||
|     Py_DECREF(msg); | ||||
|     return 0; | ||||
| } | ||||
| 
 | ||||
| int | ||||
| _PyTokenizer_parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...) | ||||
| { | ||||
|     if (!tok->report_warnings) { | ||||
|         return 0; | ||||
|     } | ||||
| 
 | ||||
|     PyObject *errmsg; | ||||
|     va_list vargs; | ||||
|     va_start(vargs, format); | ||||
|     errmsg = PyUnicode_FromFormatV(format, vargs); | ||||
|     va_end(vargs); | ||||
|     if (!errmsg) { | ||||
|         goto error; | ||||
|     } | ||||
| 
 | ||||
|     if (PyErr_WarnExplicitObject(category, errmsg, tok->filename, | ||||
|                                  tok->lineno, NULL, NULL) < 0) { | ||||
|         if (PyErr_ExceptionMatches(category)) { | ||||
|             /* Replace the DeprecationWarning exception with a SyntaxError
 | ||||
|                to get a more accurate error report */ | ||||
|             PyErr_Clear(); | ||||
|             _PyTokenizer_syntaxerror(tok, "%U", errmsg); | ||||
|         } | ||||
|         goto error; | ||||
|     } | ||||
|     Py_DECREF(errmsg); | ||||
|     return 0; | ||||
| 
 | ||||
| error: | ||||
|     Py_XDECREF(errmsg); | ||||
|     tok->done = E_ERROR; | ||||
|     return -1; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /* ############## STRING MANIPULATION ############## */ | ||||
| 
 | ||||
| char * | ||||
| _PyTokenizer_new_string(const char *s, Py_ssize_t len, struct tok_state *tok) | ||||
| { | ||||
|     char* result = (char *)PyMem_Malloc(len + 1); | ||||
|     if (!result) { | ||||
|         tok->done = E_NOMEM; | ||||
|         return NULL; | ||||
|     } | ||||
|     memcpy(result, s, len); | ||||
|     result[len] = '\0'; | ||||
|     return result; | ||||
| } | ||||
| 
 | ||||
| PyObject * | ||||
| _PyTokenizer_translate_into_utf8(const char* str, const char* enc) { | ||||
|     PyObject *utf8; | ||||
|     PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); | ||||
|     if (buf == NULL) | ||||
|         return NULL; | ||||
|     utf8 = PyUnicode_AsUTF8String(buf); | ||||
|     Py_DECREF(buf); | ||||
|     return utf8; | ||||
| } | ||||
| 
 | ||||
| char * | ||||
| _PyTokenizer_translate_newlines(const char *s, int exec_input, int preserve_crlf, | ||||
|                    struct tok_state *tok) { | ||||
|     int skip_next_lf = 0; | ||||
|     size_t needed_length = strlen(s) + 2, final_length; | ||||
|     char *buf, *current; | ||||
|     char c = '\0'; | ||||
|     buf = PyMem_Malloc(needed_length); | ||||
|     if (buf == NULL) { | ||||
|         tok->done = E_NOMEM; | ||||
|         return NULL; | ||||
|     } | ||||
|     for (current = buf; *s; s++, current++) { | ||||
|         c = *s; | ||||
|         if (skip_next_lf) { | ||||
|             skip_next_lf = 0; | ||||
|             if (c == '\n') { | ||||
|                 c = *++s; | ||||
|                 if (!c) | ||||
|                     break; | ||||
|             } | ||||
|         } | ||||
|         if (!preserve_crlf && c == '\r') { | ||||
|             skip_next_lf = 1; | ||||
|             c = '\n'; | ||||
|         } | ||||
|         *current = c; | ||||
|     } | ||||
|     /* If this is exec input, add a newline to the end of the string if
 | ||||
|        there isn't one already. */ | ||||
|     if (exec_input && c != '\n' && c != '\0') { | ||||
|         *current = '\n'; | ||||
|         current++; | ||||
|     } | ||||
|     *current = '\0'; | ||||
|     final_length = current - buf + 1; | ||||
|     if (final_length < needed_length && final_length) { | ||||
|         /* should never fail */ | ||||
|         char* result = PyMem_Realloc(buf, final_length); | ||||
|         if (result == NULL) { | ||||
|             PyMem_Free(buf); | ||||
|         } | ||||
|         buf = result; | ||||
|     } | ||||
|     return buf; | ||||
| } | ||||
| 
 | ||||
| /* ############## ENCODING STUFF ############## */ | ||||
| 
 | ||||
| 
 | ||||
| /* See whether the file starts with a BOM. If it does,
 | ||||
|    invoke the set_readline function with the new encoding. | ||||
|    Return 1 on success, 0 on failure.  */ | ||||
| int | ||||
| _PyTokenizer_check_bom(int get_char(struct tok_state *), | ||||
|           void unget_char(int, struct tok_state *), | ||||
|           int set_readline(struct tok_state *, const char *), | ||||
|           struct tok_state *tok) | ||||
| { | ||||
|     int ch1, ch2, ch3; | ||||
|     ch1 = get_char(tok); | ||||
|     tok->decoding_state = STATE_SEEK_CODING; | ||||
|     if (ch1 == EOF) { | ||||
|         return 1; | ||||
|     } else if (ch1 == 0xEF) { | ||||
|         ch2 = get_char(tok); | ||||
|         if (ch2 != 0xBB) { | ||||
|             unget_char(ch2, tok); | ||||
|             unget_char(ch1, tok); | ||||
|             return 1; | ||||
|         } | ||||
|         ch3 = get_char(tok); | ||||
|         if (ch3 != 0xBF) { | ||||
|             unget_char(ch3, tok); | ||||
|             unget_char(ch2, tok); | ||||
|             unget_char(ch1, tok); | ||||
|             return 1; | ||||
|         } | ||||
|     } else { | ||||
|         unget_char(ch1, tok); | ||||
|         return 1; | ||||
|     } | ||||
|     if (tok->encoding != NULL) | ||||
|         PyMem_Free(tok->encoding); | ||||
|     tok->encoding = _PyTokenizer_new_string("utf-8", 5, tok); | ||||
|     if (!tok->encoding) | ||||
|         return 0; | ||||
|     /* No need to set_readline: input is already utf-8 */ | ||||
|     return 1; | ||||
| } | ||||
| 
 | ||||
| static const char * | ||||
| get_normal_name(const char *s)  /* for utf-8 and latin-1 */ | ||||
| { | ||||
|     char buf[13]; | ||||
|     int i; | ||||
|     for (i = 0; i < 12; i++) { | ||||
|         int c = s[i]; | ||||
|         if (c == '\0') | ||||
|             break; | ||||
|         else if (c == '_') | ||||
|             buf[i] = '-'; | ||||
|         else | ||||
|             buf[i] = Py_TOLOWER(c); | ||||
|     } | ||||
|     buf[i] = '\0'; | ||||
|     if (strcmp(buf, "utf-8") == 0 || | ||||
|         strncmp(buf, "utf-8-", 6) == 0) | ||||
|         return "utf-8"; | ||||
|     else if (strcmp(buf, "latin-1") == 0 || | ||||
|              strcmp(buf, "iso-8859-1") == 0 || | ||||
|              strcmp(buf, "iso-latin-1") == 0 || | ||||
|              strncmp(buf, "latin-1-", 8) == 0 || | ||||
|              strncmp(buf, "iso-8859-1-", 11) == 0 || | ||||
|              strncmp(buf, "iso-latin-1-", 12) == 0) | ||||
|         return "iso-8859-1"; | ||||
|     else | ||||
|         return s; | ||||
| } | ||||
| 
 | ||||
| /* Return the coding spec in S, or NULL if none is found.  */ | ||||
| static int | ||||
| get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok) | ||||
| { | ||||
|     Py_ssize_t i; | ||||
|     *spec = NULL; | ||||
|     /* Coding spec must be in a comment, and that comment must be
 | ||||
|      * the only statement on the source code line. */ | ||||
|     for (i = 0; i < size - 6; i++) { | ||||
|         if (s[i] == '#') | ||||
|             break; | ||||
|         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') | ||||
|             return 1; | ||||
|     } | ||||
|     for (; i < size - 6; i++) { /* XXX inefficient search */ | ||||
|         const char* t = s + i; | ||||
|         if (memcmp(t, "coding", 6) == 0) { | ||||
|             const char* begin = NULL; | ||||
|             t += 6; | ||||
|             if (t[0] != ':' && t[0] != '=') | ||||
|                 continue; | ||||
|             do { | ||||
|                 t++; | ||||
|             } while (t[0] == ' ' || t[0] == '\t'); | ||||
| 
 | ||||
|             begin = t; | ||||
|             while (Py_ISALNUM(t[0]) || | ||||
|                    t[0] == '-' || t[0] == '_' || t[0] == '.') | ||||
|                 t++; | ||||
| 
 | ||||
|             if (begin < t) { | ||||
|                 char* r = _PyTokenizer_new_string(begin, t - begin, tok); | ||||
|                 const char* q; | ||||
|                 if (!r) | ||||
|                     return 0; | ||||
|                 q = get_normal_name(r); | ||||
|                 if (r != q) { | ||||
|                     PyMem_Free(r); | ||||
|                     r = _PyTokenizer_new_string(q, strlen(q), tok); | ||||
|                     if (!r) | ||||
|                         return 0; | ||||
|                 } | ||||
|                 *spec = r; | ||||
|                 break; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|     return 1; | ||||
| } | ||||
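A hedged sketch of the inputs this recognizes, per PEP 263 (the source line is hypothetical, not from the tree):

    char *spec = NULL;
    const char *line = "# -*- coding: latin-1 -*-\n";
    if (!get_coding_spec(line, &spec, (Py_ssize_t)strlen(line), tok)) {
        return 0;  /* allocation failure */
    }
    /* spec is now "iso-8859-1" (after get_normal_name); the caller owns it.
       A line with non-blank text before the '#' yields *spec == NULL. */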
| 
 | ||||
| /* Check whether the line contains a coding spec. If it does,
 | ||||
|    invoke the set_readline function for the new encoding. | ||||
|    This function receives the tok_state and the new encoding. | ||||
|    Return 1 on success, 0 on failure.  */ | ||||
| int | ||||
| _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, | ||||
|                   int set_readline(struct tok_state *, const char *)) | ||||
| { | ||||
|     char *cs; | ||||
|     if (tok->cont_line) { | ||||
|         /* It's a continuation line, so it can't be a coding spec. */ | ||||
|         tok->decoding_state = STATE_NORMAL; | ||||
|         return 1; | ||||
|     } | ||||
|     if (!get_coding_spec(line, &cs, size, tok)) { | ||||
|         return 0; | ||||
|     } | ||||
|     if (!cs) { | ||||
|         Py_ssize_t i; | ||||
|         for (i = 0; i < size; i++) { | ||||
|             if (line[i] == '#' || line[i] == '\n' || line[i] == '\r') | ||||
|                 break; | ||||
|             if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') { | ||||
|                 /* Stop checking coding spec after a line containing
 | ||||
|                  * anything except a comment. */ | ||||
|                 tok->decoding_state = STATE_NORMAL; | ||||
|                 break; | ||||
|             } | ||||
|         } | ||||
|         return 1; | ||||
|     } | ||||
|     tok->decoding_state = STATE_NORMAL; | ||||
|     if (tok->encoding == NULL) { | ||||
|         assert(tok->decoding_readline == NULL); | ||||
|         if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) { | ||||
|             _PyTokenizer_error_ret(tok); | ||||
|             PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs); | ||||
|             PyMem_Free(cs); | ||||
|             return 0; | ||||
|         } | ||||
|         tok->encoding = cs; | ||||
|     } else {                /* then, compare cs with BOM */ | ||||
|         if (strcmp(tok->encoding, cs) != 0) { | ||||
|             _PyTokenizer_error_ret(tok); | ||||
|             PyErr_Format(PyExc_SyntaxError, | ||||
|                          "encoding problem: %s with BOM", cs); | ||||
|             PyMem_Free(cs); | ||||
|             return 0; | ||||
|         } | ||||
|         PyMem_Free(cs); | ||||
|     } | ||||
|     return 1; | ||||
| } | ||||
| 
 | ||||
| /* Check whether the characters at s start a valid
 | ||||
|    UTF-8 sequence. Return the number of characters forming | ||||
|    the sequence if yes, 0 if not.  The special cases match | ||||
|    those in stringlib/codecs.h:utf8_decode. | ||||
| */ | ||||
| static int | ||||
| valid_utf8(const unsigned char* s) | ||||
| { | ||||
|     int expected = 0; | ||||
|     int length; | ||||
|     if (*s < 0x80) { | ||||
|         /* single-byte code */ | ||||
|         return 1; | ||||
|     } | ||||
|     else if (*s < 0xE0) { | ||||
|         /* \xC2\x80-\xDF\xBF -- 0080-07FF */ | ||||
|         if (*s < 0xC2) { | ||||
|             /* invalid sequence
 | ||||
|                \x80-\xBF -- continuation byte | ||||
|                \xC0-\xC1 -- fake 0000-007F */ | ||||
|             return 0; | ||||
|         } | ||||
|         expected = 1; | ||||
|     } | ||||
|     else if (*s < 0xF0) { | ||||
|         /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ | ||||
|         if (*s == 0xE0 && *(s + 1) < 0xA0) { | ||||
|             /* invalid sequence
 | ||||
|                \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ | ||||
|             return 0; | ||||
|         } | ||||
|         else if (*s == 0xED && *(s + 1) >= 0xA0) { | ||||
|             /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
 | ||||
|                will result in surrogates in range D800-DFFF. Surrogates are | ||||
|                not valid UTF-8 so they are rejected. | ||||
|                See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
 | ||||
|                (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
 | ||||
|             return 0; | ||||
|         } | ||||
|         expected = 2; | ||||
|     } | ||||
|     else if (*s < 0xF5) { | ||||
|         /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ | ||||
|         if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) { | ||||
|             /* invalid sequence -- one of:
 | ||||
|                \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF | ||||
|                \xF4\x90\x80\x80- -- 110000- overflow */ | ||||
|             return 0; | ||||
|         } | ||||
|         expected = 3; | ||||
|     } | ||||
|     else { | ||||
|         /* invalid start byte */ | ||||
|         return 0; | ||||
|     } | ||||
|     length = expected + 1; | ||||
|     for (; expected; expected--) | ||||
|         if (s[expected] < 0x80 || s[expected] >= 0xC0) | ||||
|             return 0; | ||||
|     return length; | ||||
| } | ||||
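For illustration, some byte sequences and the values this returns (examples assumed, derived from the ranges in the comments above):

    valid_utf8((const unsigned char *)"A");             /* 1: single-byte ASCII       */
    valid_utf8((const unsigned char *)"\xC3\xA9");      /* 2: U+00E9, two-byte form   */
    valid_utf8((const unsigned char *)"\xED\xA0\x80");  /* 0: encoded surrogate       */
    valid_utf8((const unsigned char *)"\xC0\xAF");      /* 0: overlong encoding       */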
| 
 | ||||
| int | ||||
| _PyTokenizer_ensure_utf8(char *line, struct tok_state *tok) | ||||
| { | ||||
|     int badchar = 0; | ||||
|     unsigned char *c; | ||||
|     int length; | ||||
|     for (c = (unsigned char *)line; *c; c += length) { | ||||
|         if (!(length = valid_utf8(c))) { | ||||
|             badchar = *c; | ||||
|             break; | ||||
|         } | ||||
|     } | ||||
|     if (badchar) { | ||||
|         PyErr_Format(PyExc_SyntaxError, | ||||
|                      "Non-UTF-8 code starting with '\\x%.2x' " | ||||
|                      "in file %U on line %i, " | ||||
|                      "but no encoding declared; " | ||||
|                      "see https://peps.python.org/pep-0263/ for details", | ||||
|                      badchar, tok->filename, tok->lineno); | ||||
|         return 0; | ||||
|     } | ||||
|     return 1; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /* ############## DEBUGGING STUFF ############## */ | ||||
| 
 | ||||
| #ifdef Py_DEBUG | ||||
| void | ||||
| _PyTokenizer_print_escape(FILE *f, const char *s, Py_ssize_t size) | ||||
| { | ||||
|     if (s == NULL) { | ||||
|         fputs("NULL", f); | ||||
|         return; | ||||
|     } | ||||
|     putc('"', f); | ||||
|     while (size-- > 0) { | ||||
|         unsigned char c = *s++; | ||||
|         switch (c) { | ||||
|             case '\n': fputs("\\n", f); break; | ||||
|             case '\r': fputs("\\r", f); break; | ||||
|             case '\t': fputs("\\t", f); break; | ||||
|             case '\f': fputs("\\f", f); break; | ||||
|             case '\'': fputs("\\'", f); break; | ||||
|             case '"': fputs("\\\"", f); break; | ||||
|             default: | ||||
|                 if (0x20 <= c && c <= 0x7f) | ||||
|                     putc(c, f); | ||||
|                 else | ||||
|                     fprintf(f, "\\x%02x", c); | ||||
|         } | ||||
|     } | ||||
|     putc('"', f); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| _PyTokenizer_tok_dump(int type, char *start, char *end) | ||||
| { | ||||
|     fprintf(stderr, "%s", _PyParser_TokenNames[type]); | ||||
|     if (type == NAME || type == NUMBER || type == STRING || type == OP) | ||||
|         fprintf(stderr, "(%.*s)", (int)(end - start), start); | ||||
| } | ||||
| #endif | ||||
							
								
								
									
Parser/tokenizer/helpers.h (new file, 37 lines)
							|  | @ -0,0 +1,37 @@ | |||
| #ifndef _PY_TOKENIZER_HELPERS_H_ | ||||
| #define _PY_TOKENIZER_HELPERS_H_ | ||||
| 
 | ||||
| #include "Python.h" | ||||
| 
 | ||||
| #include "../lexer/state.h" | ||||
| 
 | ||||
| #define ADVANCE_LINENO() \ | ||||
|             tok->lineno++; \ | ||||
|             tok->col_offset = 0; | ||||
| 
 | ||||
| int _PyTokenizer_syntaxerror(struct tok_state *tok, const char *format, ...); | ||||
| int _PyTokenizer_syntaxerror_known_range(struct tok_state *tok, int col_offset, int end_col_offset, const char *format, ...); | ||||
| int _PyTokenizer_indenterror(struct tok_state *tok); | ||||
| int _PyTokenizer_warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char); | ||||
| int _PyTokenizer_parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...); | ||||
| char *_PyTokenizer_error_ret(struct tok_state *tok); | ||||
| 
 | ||||
| char *_PyTokenizer_new_string(const char *s, Py_ssize_t len, struct tok_state *tok); | ||||
| char *_PyTokenizer_translate_newlines(const char *s, int exec_input, int preserve_crlf, struct tok_state *tok); | ||||
| PyObject *_PyTokenizer_translate_into_utf8(const char* str, const char* enc); | ||||
| 
 | ||||
| int _PyTokenizer_check_bom(int get_char(struct tok_state *), | ||||
|           void unget_char(int, struct tok_state *), | ||||
|           int set_readline(struct tok_state *, const char *), | ||||
|           struct tok_state *tok); | ||||
| int _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, | ||||
|                   int set_readline(struct tok_state *, const char *)); | ||||
| int _PyTokenizer_ensure_utf8(char *line, struct tok_state *tok); | ||||
| 
 | ||||
| #ifdef Py_DEBUG | ||||
| void _PyTokenizer_print_escape(FILE *f, const char *s, Py_ssize_t size); | ||||
| void _PyTokenizer_tok_dump(int type, char *start, char *end); | ||||
| #endif | ||||
| 
 | ||||
| 
 | ||||
| #endif | ||||
							
								
								
									
Parser/tokenizer/readline_tokenizer.c (new file, 134 lines)
							|  | @ -0,0 +1,134 @@ | |||
| #include "Python.h" | ||||
| #include "errcode.h" | ||||
| 
 | ||||
| #include "helpers.h" | ||||
| #include "../lexer/lexer.h" | ||||
| #include "../lexer/state.h" | ||||
| #include "../lexer/buffer.h" | ||||
| 
 | ||||
| static int | ||||
| tok_readline_string(struct tok_state* tok) { | ||||
|     PyObject* line = NULL; | ||||
|     PyObject* raw_line = PyObject_CallNoArgs(tok->readline); | ||||
|     if (raw_line == NULL) { | ||||
|         if (PyErr_ExceptionMatches(PyExc_StopIteration)) { | ||||
|             PyErr_Clear(); | ||||
|             return 1; | ||||
|         } | ||||
|         _PyTokenizer_error_ret(tok); | ||||
|         goto error; | ||||
|     } | ||||
|     if(tok->encoding != NULL) { | ||||
|         if (!PyBytes_Check(raw_line)) { | ||||
|             PyErr_Format(PyExc_TypeError, "readline() returned a non-bytes object"); | ||||
|             _PyTokenizer_error_ret(tok); | ||||
|             goto error; | ||||
|         } | ||||
|         line = PyUnicode_Decode(PyBytes_AS_STRING(raw_line), PyBytes_GET_SIZE(raw_line), | ||||
|                                 tok->encoding, "replace"); | ||||
|         Py_CLEAR(raw_line); | ||||
|         if (line == NULL) { | ||||
|             _PyTokenizer_error_ret(tok); | ||||
|             goto error; | ||||
|         } | ||||
|     } else { | ||||
|         if(!PyUnicode_Check(raw_line)) { | ||||
|             PyErr_Format(PyExc_TypeError, "readline() returned a non-string object"); | ||||
|             _PyTokenizer_error_ret(tok); | ||||
|             goto error; | ||||
|         } | ||||
|         line = raw_line; | ||||
|         raw_line = NULL; | ||||
|     } | ||||
|     Py_ssize_t buflen; | ||||
|     const char* buf = PyUnicode_AsUTF8AndSize(line, &buflen); | ||||
|     if (buf == NULL) { | ||||
|         _PyTokenizer_error_ret(tok); | ||||
|         goto error; | ||||
|     } | ||||
| 
 | ||||
|     // Make room for the null terminator *and* potentially
 | ||||
|     // an extra newline character that we may need to artificially
 | ||||
|     // add.
 | ||||
|     size_t buffer_size = buflen + 2; | ||||
|     if (!_PyLexer_tok_reserve_buf(tok, buffer_size)) { | ||||
|         goto error; | ||||
|     } | ||||
|     memcpy(tok->inp, buf, buflen); | ||||
|     tok->inp += buflen; | ||||
|     *tok->inp = '\0'; | ||||
| 
 | ||||
|     tok->line_start = tok->cur; | ||||
|     Py_DECREF(line); | ||||
|     return 1; | ||||
| error: | ||||
|     Py_XDECREF(raw_line); | ||||
|     Py_XDECREF(line); | ||||
|     return 0; | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| tok_underflow_readline(struct tok_state* tok) { | ||||
|     assert(tok->decoding_state == STATE_NORMAL); | ||||
|     assert(tok->fp == NULL && tok->input == NULL && tok->decoding_readline == NULL); | ||||
|     if (tok->start == NULL && !INSIDE_FSTRING(tok)) { | ||||
|         tok->cur = tok->inp = tok->buf; | ||||
|     } | ||||
|     if (!tok_readline_string(tok)) { | ||||
|         return 0; | ||||
|     } | ||||
|     if (tok->inp == tok->cur) { | ||||
|         tok->done = E_EOF; | ||||
|         return 0; | ||||
|     } | ||||
|     tok->implicit_newline = 0; | ||||
|     if (tok->inp[-1] != '\n') { | ||||
|         assert(tok->inp + 1 < tok->end); | ||||
|         /* Last line does not end in \n, fake one */ | ||||
|         *tok->inp++ = '\n'; | ||||
|         *tok->inp = '\0'; | ||||
|         tok->implicit_newline = 1; | ||||
|     } | ||||
| 
 | ||||
|     if (tok->tok_mode_stack_index && !_PyLexer_update_fstring_expr(tok, 0)) { | ||||
|         return 0; | ||||
|     } | ||||
| 
 | ||||
|     ADVANCE_LINENO(); | ||||
|     /* The default encoding is UTF-8, so make sure we don't have any
 | ||||
|        non-UTF-8 sequences in it. */ | ||||
|     if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) { | ||||
|         _PyTokenizer_error_ret(tok); | ||||
|         return 0; | ||||
|     } | ||||
|     assert(tok->done == E_OK); | ||||
|     return tok->done == E_OK; | ||||
| } | ||||
| 
 | ||||
| struct tok_state * | ||||
| _PyTokenizer_FromReadline(PyObject* readline, const char* enc, | ||||
|                           int exec_input, int preserve_crlf) | ||||
| { | ||||
|     struct tok_state *tok = _PyTokenizer_tok_new(); | ||||
|     if (tok == NULL) | ||||
|         return NULL; | ||||
|     if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) { | ||||
|         _PyTokenizer_Free(tok); | ||||
|         return NULL; | ||||
|     } | ||||
|     tok->cur = tok->inp = tok->buf; | ||||
|     tok->end = tok->buf + BUFSIZ; | ||||
|     tok->fp = NULL; | ||||
|     if (enc != NULL) { | ||||
|         tok->encoding = _PyTokenizer_new_string(enc, strlen(enc), tok); | ||||
|         if (!tok->encoding) { | ||||
|             _PyTokenizer_Free(tok); | ||||
|             return NULL; | ||||
|         } | ||||
|     } | ||||
|     tok->decoding_state = STATE_NORMAL; | ||||
|     tok->underflow = &tok_underflow_readline; | ||||
|     Py_INCREF(readline); | ||||
|     tok->readline = readline; | ||||
|     return tok; | ||||
| } | ||||
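A sketch of how a caller might build this tokenizer from a file-like object's readline method (file_obj and the error handling are illustrative; the constructor takes its own reference to the callable):

    PyObject *readline = PyObject_GetAttrString(file_obj, "readline");
    if (readline == NULL) {
        return NULL;
    }
    struct tok_state *tok = _PyTokenizer_FromReadline(readline, NULL, 1, 0);
    Py_DECREF(readline);  /* _PyTokenizer_FromReadline INCREFs it */
    if (tok == NULL) {
        return NULL;
    }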
							
								
								
									
Parser/tokenizer/string_tokenizer.c (new file, 129 lines)
							|  | @ -0,0 +1,129 @@ | |||
| #include "Python.h" | ||||
| #include "errcode.h" | ||||
| 
 | ||||
| #include "helpers.h" | ||||
| #include "../lexer/state.h" | ||||
| 
 | ||||
| static int | ||||
| tok_underflow_string(struct tok_state *tok) { | ||||
|     char *end = strchr(tok->inp, '\n'); | ||||
|     if (end != NULL) { | ||||
|         end++; | ||||
|     } | ||||
|     else { | ||||
|         end = strchr(tok->inp, '\0'); | ||||
|         if (end == tok->inp) { | ||||
|             tok->done = E_EOF; | ||||
|             return 0; | ||||
|         } | ||||
|     } | ||||
|     if (tok->start == NULL) { | ||||
|         tok->buf = tok->cur; | ||||
|     } | ||||
|     tok->line_start = tok->cur; | ||||
|     ADVANCE_LINENO(); | ||||
|     tok->inp = end; | ||||
|     return 1; | ||||
| } | ||||
| 
 | ||||
| /* Fetch a byte from TOK, using the string buffer. */ | ||||
| static int | ||||
| buf_getc(struct tok_state *tok) { | ||||
|     return Py_CHARMASK(*tok->str++); | ||||
| } | ||||
| 
 | ||||
| /* Unfetch a byte from TOK, using the string buffer. */ | ||||
| static void | ||||
| buf_ungetc(int c, struct tok_state *tok) { | ||||
|     tok->str--; | ||||
|     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */ | ||||
| } | ||||
| 
 | ||||
| /* Set the readline function for TOK to ENC. For the string-based | ||||
|    tokenizer, this just records the encoding. */ | ||||
| static int | ||||
| buf_setreadl(struct tok_state *tok, const char* enc) { | ||||
|     tok->enc = enc; | ||||
|     return 1; | ||||
| } | ||||
| 
 | ||||
| /* Decode a byte string STR for use as the buffer of TOK.
 | ||||
|    Look for encoding declarations inside STR, and record them | ||||
|    inside TOK.  */ | ||||
| static char * | ||||
| decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf) | ||||
| { | ||||
|     PyObject* utf8 = NULL; | ||||
|     char *str; | ||||
|     const char *s; | ||||
|     const char *newl[2] = {NULL, NULL}; | ||||
|     int lineno = 0; | ||||
|     tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok); | ||||
|     if (str == NULL) | ||||
|         return NULL; | ||||
|     tok->enc = NULL; | ||||
|     tok->str = str; | ||||
|     if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) | ||||
|         return _PyTokenizer_error_ret(tok); | ||||
|     str = tok->str;             /* string after BOM if any */ | ||||
|     assert(str); | ||||
|     if (tok->enc != NULL) { | ||||
|         utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc); | ||||
|         if (utf8 == NULL) | ||||
|             return _PyTokenizer_error_ret(tok); | ||||
|         str = PyBytes_AsString(utf8); | ||||
|     } | ||||
|     for (s = str;; s++) { | ||||
|         if (*s == '\0') break; | ||||
|         else if (*s == '\n') { | ||||
|             assert(lineno < 2); | ||||
|             newl[lineno] = s; | ||||
|             lineno++; | ||||
|             if (lineno == 2) break; | ||||
|         } | ||||
|     } | ||||
|     tok->enc = NULL; | ||||
|     /* need to check line 1 and 2 separately since check_coding_spec
 | ||||
|        assumes a single line as input */ | ||||
|     if (newl[0]) { | ||||
|         if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) { | ||||
|             return NULL; | ||||
|         } | ||||
|         if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) { | ||||
|             if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0], | ||||
|                                    tok, buf_setreadl)) | ||||
|                 return NULL; | ||||
|         } | ||||
|     } | ||||
|     if (tok->enc != NULL) { | ||||
|         assert(utf8 == NULL); | ||||
|         utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc); | ||||
|         if (utf8 == NULL) | ||||
|             return _PyTokenizer_error_ret(tok); | ||||
|         str = PyBytes_AS_STRING(utf8); | ||||
|     } | ||||
|     assert(tok->decoding_buffer == NULL); | ||||
|     tok->decoding_buffer = utf8; /* CAUTION */ | ||||
|     return str; | ||||
| } | ||||
| 
 | ||||
| /* Set up tokenizer for string */ | ||||
| struct tok_state * | ||||
| _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf) | ||||
| { | ||||
|     struct tok_state *tok = _PyTokenizer_tok_new(); | ||||
|     char *decoded; | ||||
| 
 | ||||
|     if (tok == NULL) | ||||
|         return NULL; | ||||
|     decoded = decode_str(str, exec_input, tok, preserve_crlf); | ||||
|     if (decoded == NULL) { | ||||
|         _PyTokenizer_Free(tok); | ||||
|         return NULL; | ||||
|     } | ||||
| 
 | ||||
|     tok->buf = tok->cur = tok->inp = decoded; | ||||
|     tok->end = decoded; | ||||
|     tok->underflow = &tok_underflow_string; | ||||
|     return tok; | ||||
| } | ||||
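A sketch of typical use; because decode_str above checks the first two lines, a PEP 263 coding spec embedded in the source string is honored (the source literal is hypothetical):

    const char *src = "# -*- coding: iso-8859-1 -*-\nx = 1\n";
    struct tok_state *tok = _PyTokenizer_FromString(src, /*exec_input=*/1,
                                                    /*preserve_crlf=*/0);
    if (tok == NULL) {
        return NULL;  /* decoding or allocation error */
    }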
							
								
								
									
Parser/tokenizer/tokenizer.h (new file, 14 lines)
							|  | @ -0,0 +1,14 @@ | |||
| #ifndef Py_TOKENIZER_H | ||||
| #define Py_TOKENIZER_H | ||||
| 
 | ||||
| #include "Python.h" | ||||
| 
 | ||||
| struct tok_state *_PyTokenizer_FromString(const char *, int, int); | ||||
| struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int); | ||||
| struct tok_state *_PyTokenizer_FromReadline(PyObject*, const char*, int, int); | ||||
| struct tok_state *_PyTokenizer_FromFile(FILE *, const char*, | ||||
|                                               const char *, const char *); | ||||
| 
 | ||||
| #define tok_dump _Py_tok_dump | ||||
| 
 | ||||
| #endif /* !Py_TOKENIZER_H */ | ||||
							
								
								
									
Parser/tokenizer/utf8_tokenizer.c (new file, 55 lines)
							|  | @ -0,0 +1,55 @@ | |||
| #include "Python.h" | ||||
| #include "errcode.h" | ||||
| 
 | ||||
| #include "helpers.h" | ||||
| #include "../lexer/state.h" | ||||
| 
 | ||||
| static int | ||||
| tok_underflow_string(struct tok_state *tok) { | ||||
|     char *end = strchr(tok->inp, '\n'); | ||||
|     if (end != NULL) { | ||||
|         end++; | ||||
|     } | ||||
|     else { | ||||
|         end = strchr(tok->inp, '\0'); | ||||
|         if (end == tok->inp) { | ||||
|             tok->done = E_EOF; | ||||
|             return 0; | ||||
|         } | ||||
|     } | ||||
|     if (tok->start == NULL) { | ||||
|         tok->buf = tok->cur; | ||||
|     } | ||||
|     tok->line_start = tok->cur; | ||||
|     ADVANCE_LINENO(); | ||||
|     tok->inp = end; | ||||
|     return 1; | ||||
| } | ||||
| 
 | ||||
| /* Set up tokenizer for UTF-8 string */ | ||||
| struct tok_state * | ||||
| _PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf) | ||||
| { | ||||
|     struct tok_state *tok = _PyTokenizer_tok_new(); | ||||
|     char *translated; | ||||
|     if (tok == NULL) | ||||
|         return NULL; | ||||
|     tok->input = translated = _PyTokenizer_translate_newlines(str, exec_input, preserve_crlf, tok); | ||||
|     if (translated == NULL) { | ||||
|         _PyTokenizer_Free(tok); | ||||
|         return NULL; | ||||
|     } | ||||
|     tok->decoding_state = STATE_NORMAL; | ||||
|     tok->enc = NULL; | ||||
|     tok->str = translated; | ||||
|     tok->encoding = _PyTokenizer_new_string("utf-8", 5, tok); | ||||
|     if (!tok->encoding) { | ||||
|         _PyTokenizer_Free(tok); | ||||
|         return NULL; | ||||
|     } | ||||
| 
 | ||||
|     tok->buf = tok->cur = tok->inp = translated; | ||||
|     tok->end = translated; | ||||
|     tok->underflow = &tok_underflow_string; | ||||
|     return tok; | ||||
| } | ||||
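Unlike _PyTokenizer_FromString, this constructor performs no BOM or coding-spec detection: the caller guarantees UTF-8 input, so essentially only newline translation happens. A minimal usage sketch (utf8_src is illustrative):

    struct tok_state *tok = _PyTokenizer_FromUTF8(utf8_src,
                                                  /*exec_input=*/1,
                                                  /*preserve_crlf=*/0);

Note that its tok_underflow_string is a byte-for-byte copy of the one in string_tokenizer.c: after decoding, both modes walk a single in-memory buffer line by line.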
|  | @ -1,6 +1,8 @@ | |||
| #include "Python.h" | ||||
| #include "errcode.h" | ||||
| #include "../Parser/tokenizer.h" | ||||
| #include "../Parser/lexer/state.h" | ||||
| #include "../Parser/lexer/lexer.h" | ||||
| #include "../Parser/tokenizer/tokenizer.h" | ||||
| #include "../Parser/pegen.h"      // _PyPegen_byte_offset_to_character_offset() | ||||
| #include "../Parser/pegen.h"      // _PyPegen_byte_offset_to_character_offset() | ||||
| 
 | ||||
|  |  | |||
|  | @ -335,7 +335,7 @@ Objects/unicodeobject.c	unicode_encode_call_errorhandler	argparse	- | |||
| Objects/unicodeobject.c	unicode_translate_call_errorhandler	argparse	- | ||||
| Parser/parser.c	-	reserved_keywords	- | ||||
| Parser/parser.c	-	soft_keywords	- | ||||
| Parser/tokenizer.c	-	type_comment_prefix	- | ||||
| Parser/lexer/lexer.c	-	type_comment_prefix	- | ||||
| Python/ast_opt.c	fold_unaryop	ops	- | ||||
| Python/ceval.c	-	_PyEval_BinaryOps	- | ||||
| Python/ceval.c	-	_Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS	- | ||||
|  |  | |||
							
								
								
									
configure (generated, vendored; 2 additions)
							|  | @ -26679,6 +26679,8 @@ SRCDIRS="\ | |||
|   Modules/expat \ | ||||
|   Objects \ | ||||
|   Parser \ | ||||
|   Parser/tokenizer \ | ||||
|   Parser/lexer \ | ||||
|   Programs \ | ||||
|   Python \ | ||||
|   Python/frozen_modules \ | ||||
|  |  | |||
|  | @ -6526,6 +6526,8 @@ SRCDIRS="\ | |||
|   Modules/expat \ | ||||
|   Objects \ | ||||
|   Parser \ | ||||
|   Parser/tokenizer \ | ||||
|   Parser/lexer \ | ||||
|   Programs \ | ||||
|   Python \ | ||||
|   Python/frozen_modules \ | ||||
|  |  | |||