mirror of
				https://github.com/python/cpython.git
				synced 2025-11-01 06:01:29 +00:00 
			
		
		
		
	bpo-42827: Fix crash on SyntaxError in multiline expressions (GH-24140)
When trying to extract the error line for the error message there
are two distinct cases:
1. The input comes from a file, which means that we can extract the
   error line by using `PyErr_ProgramTextObject`, which we already
   do.
2. The input does not come from a file, at which point we need to get
   the source code from the tokenizer:
   * If the tokenizer's current line number is the same as the line
     of the error, we get the line from `tok->buf` and we're ready.
   * Else, we can extract the error line from the source code in the
     following two ways:
     * If the input comes from a string we have all the input
       in `tok->str` and we can extract the error line from it.
     * If the input comes from stdin, i.e. the interactive prompt, we
       do not have access to the previous line. That's why a new
       field `tok->stdin_content` is added which holds the whole input for the
       current (multiline) statement or expression. We can then extract the
       error line from `tok->stdin_content` like we do in the string case above.
Co-authored-by: Pablo Galindo <Pablogsal@gmail.com>
			
			
This commit is contained in:
		
							parent
							
								
									9712358277
								
							
						
					
					
						commit
						e5fe509054
					
				
					 5 changed files with 64 additions and 2 deletions
				
			
		|  | @ -209,6 +209,9 @@ def testSyntaxErrorOffset(self): | ||||||
|         check('x = "a', 1, 7) |         check('x = "a', 1, 7) | ||||||
|         check('lambda x: x = 2', 1, 1) |         check('lambda x: x = 2', 1, 1) | ||||||
|         check('f{a + b + c}', 1, 2) |         check('f{a + b + c}', 1, 2) | ||||||
|  |         check('[file for str(file) in []\n])', 1, 11) | ||||||
|  |         check('[\nfile\nfor str(file)\nin\n[]\n]', 3, 5) | ||||||
|  |         check('[file for\n str(file) in []]', 2, 2) | ||||||
| 
 | 
 | ||||||
|         # Errors thrown by compile.c |         # Errors thrown by compile.c | ||||||
|         check('class foo:return 1', 1, 11) |         check('class foo:return 1', 1, 11) | ||||||
|  |  | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | Fix a crash when working out the error line of a :exc:`SyntaxError` in some | ||||||
|  | multi-line expressions. | ||||||
|  | @ -380,6 +380,27 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...) | ||||||
|     return NULL; |     return NULL; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static PyObject * | ||||||
|  | get_error_line(Parser *p, Py_ssize_t lineno) | ||||||
|  | { | ||||||
|  |     /* If p->tok->fp == NULL, then we're parsing from a string, which means that
 | ||||||
|  |        the whole source is stored in p->tok->str. If not, then we're parsing | ||||||
|  |        from the REPL, so the source lines of the current (multi-line) statement | ||||||
|  |        are stored in p->tok->stdin_content */ | ||||||
|  |     assert(p->tok->fp == NULL || p->tok->fp == stdin); | ||||||
|  | 
 | ||||||
|  |     char *cur_line = p->tok->fp == NULL ? p->tok->str : p->tok->stdin_content; | ||||||
|  |     for (int i = 0; i < lineno - 1; i++) { | ||||||
|  |         cur_line = strchr(cur_line, '\n') + 1; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     char *next_newline; | ||||||
|  |     if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
 | ||||||
|  |         next_newline = cur_line + strlen(cur_line); | ||||||
|  |     } | ||||||
|  |     return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace"); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| void * | void * | ||||||
| _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, | _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, | ||||||
|                                     Py_ssize_t lineno, Py_ssize_t col_offset, |                                     Py_ssize_t lineno, Py_ssize_t col_offset, | ||||||
|  | @ -416,8 +437,22 @@ _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     if (!error_line) { |     if (!error_line) { | ||||||
|         Py_ssize_t size = p->tok->inp - p->tok->buf; |         /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
 | ||||||
|         error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace"); |            then we need to find the error line from some other source, because | ||||||
|  |            p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly | ||||||
|  |            failed or we're parsing from a string or the REPL. There's a third edge case where | ||||||
|  |            we're actually parsing from a file, which has an E_EOF SyntaxError and in that case | ||||||
|  |            `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which | ||||||
|  |            does not physically exist */ | ||||||
|  |         assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF); | ||||||
|  | 
 | ||||||
|  |         if (p->tok->lineno == lineno) { | ||||||
|  |             Py_ssize_t size = p->tok->inp - p->tok->buf; | ||||||
|  |             error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace"); | ||||||
|  |         } | ||||||
|  |         else { | ||||||
|  |             error_line = get_error_line(p, lineno); | ||||||
|  |         } | ||||||
|         if (!error_line) { |         if (!error_line) { | ||||||
|             goto error; |             goto error; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|  | @ -81,6 +81,7 @@ tok_new(void) | ||||||
|     tok->decoding_readline = NULL; |     tok->decoding_readline = NULL; | ||||||
|     tok->decoding_buffer = NULL; |     tok->decoding_buffer = NULL; | ||||||
|     tok->type_comments = 0; |     tok->type_comments = 0; | ||||||
|  |     tok->stdin_content = NULL; | ||||||
| 
 | 
 | ||||||
|     tok->async_hacks = 0; |     tok->async_hacks = 0; | ||||||
|     tok->async_def = 0; |     tok->async_def = 0; | ||||||
|  | @ -816,6 +817,8 @@ PyTokenizer_Free(struct tok_state *tok) | ||||||
|         PyMem_Free(tok->buf); |         PyMem_Free(tok->buf); | ||||||
|     if (tok->input) |     if (tok->input) | ||||||
|         PyMem_Free(tok->input); |         PyMem_Free(tok->input); | ||||||
|  |     if (tok->stdin_content) | ||||||
|  |         PyMem_Free(tok->stdin_content); | ||||||
|     PyMem_Free(tok); |     PyMem_Free(tok); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -856,6 +859,24 @@ tok_nextc(struct tok_state *tok) | ||||||
|                 if (translated == NULL) |                 if (translated == NULL) | ||||||
|                     return EOF; |                     return EOF; | ||||||
|                 newtok = translated; |                 newtok = translated; | ||||||
|  |                 if (tok->stdin_content == NULL) { | ||||||
|  |                     tok->stdin_content = PyMem_Malloc(strlen(translated) + 1); | ||||||
|  |                     if (tok->stdin_content == NULL) { | ||||||
|  |                         tok->done = E_NOMEM; | ||||||
|  |                         return EOF; | ||||||
|  |                     } | ||||||
|  |                     sprintf(tok->stdin_content, "%s", translated); | ||||||
|  |                 } | ||||||
|  |                 else { | ||||||
|  |                     char *new_str = PyMem_Malloc(strlen(tok->stdin_content) + strlen(translated) + 1); | ||||||
|  |                     if (new_str == NULL) { | ||||||
|  |                         tok->done = E_NOMEM; | ||||||
|  |                         return EOF; | ||||||
|  |                     } | ||||||
|  |                     sprintf(new_str, "%s%s", tok->stdin_content, translated); | ||||||
|  |                     PyMem_Free(tok->stdin_content); | ||||||
|  |                     tok->stdin_content = new_str; | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|             if (tok->encoding && newtok && *newtok) { |             if (tok->encoding && newtok && *newtok) { | ||||||
|                 /* Recode to UTF-8 */ |                 /* Recode to UTF-8 */ | ||||||
|  |  | ||||||
|  | @ -37,6 +37,7 @@ struct tok_state { | ||||||
|     int atbol;          /* Nonzero if at begin of new line */ |     int atbol;          /* Nonzero if at begin of new line */ | ||||||
|     int pendin;         /* Pending indents (if > 0) or dedents (if < 0) */ |     int pendin;         /* Pending indents (if > 0) or dedents (if < 0) */ | ||||||
|     const char *prompt, *nextprompt;          /* For interactive prompting */ |     const char *prompt, *nextprompt;          /* For interactive prompting */ | ||||||
|  |     char *stdin_content; | ||||||
|     int lineno;         /* Current line number */ |     int lineno;         /* Current line number */ | ||||||
|     int first_lineno;   /* First line of a single line or multi line string
 |     int first_lineno;   /* First line of a single line or multi line string
 | ||||||
|                            expression (cf. issue 16806) */ |                            expression (cf. issue 16806) */ | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Lysandros Nikolaou
						Lysandros Nikolaou