mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 05:31:20 +00:00 
			
		
		
		
	Patch #1031213: Decode source line in SyntaxErrors back to its original
source encoding. Will backport to 2.5.
This commit is contained in:
		
							parent
							
								
									58bd49f5fe
								
							
						
					
					
						commit
						a5136196bc
					
				
					 6 changed files with 107 additions and 5 deletions
				
			
		|  | @ -155,6 +155,32 @@ def testWithAss(self): | ||||||
|         self.assertEquals(dct.get('result'), 1) |         self.assertEquals(dct.get('result'), 1) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
    def _testErrEnc(self, src, text, offset):
        """Compile *src* expecting a SyntaxError; check that the error's
        .text is *text* (the source line decoded back to its original
        encoding) and its .offset column is *offset*.
        """
        # NOTE(review): if compile() does not raise, the assertions are
        # skipped and the test silently passes -- confirm that is intended.
        try:
            compile(src, "", "exec")
        except SyntaxError, e:
            self.assertEquals(e.offset, offset)
            self.assertEquals(e.text, text)
|  |     def testSourceCodeEncodingsError(self): | ||||||
|  |         # Test SyntaxError with encoding definition | ||||||
|  |         sjis = "print '\x83\x70\x83\x43\x83\x5c\x83\x93', '\n" | ||||||
|  |         ascii = "print '12345678', '\n" | ||||||
|  |         encdef = "#! -*- coding: ShiftJIS -*-\n" | ||||||
|  | 
 | ||||||
|  |         # ascii source without encdef | ||||||
|  |         self._testErrEnc(ascii, ascii, 19) | ||||||
|  | 
 | ||||||
|  |         # ascii source with encdef | ||||||
|  |         self._testErrEnc(encdef+ascii, ascii, 19) | ||||||
|  | 
 | ||||||
|  |         # non-ascii source with encdef | ||||||
|  |         self._testErrEnc(encdef+sjis, sjis, 19) | ||||||
|  | 
 | ||||||
|  |         # ShiftJIS source without encdef | ||||||
|  |         self._testErrEnc(sjis, sjis, 19) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| NOLINENO = (compiler.ast.Module, compiler.ast.Stmt, compiler.ast.Discard) | NOLINENO = (compiler.ast.Module, compiler.ast.Stmt, compiler.ast.Discard) | ||||||
| 
 | 
 | ||||||
| ############################################################################### | ############################################################################### | ||||||
|  |  | ||||||
|  | @ -320,6 +320,7 @@ Lars Immisch | ||||||
| Tony Ingraldi | Tony Ingraldi | ||||||
| John Interrante | John Interrante | ||||||
| Bob Ippolito | Bob Ippolito | ||||||
|  | Atsuo Ishimoto | ||||||
| Ben Jackson | Ben Jackson | ||||||
| Paul Jackson | Paul Jackson | ||||||
| David Jacobs | David Jacobs | ||||||
|  |  | ||||||
|  | @ -12,6 +12,9 @@ What's New in Python 2.6 alpha 1? | ||||||
| Core and builtins | Core and builtins | ||||||
| ----------------- | ----------------- | ||||||
| 
 | 
 | ||||||
|  | - Patch #1031213: Decode source line in SyntaxErrors back to its original source | ||||||
|  |   encoding. | ||||||
|  | 
 | ||||||
| - Py_ssize_t fields work in structmember when HAVE_LONG_LONG is not defined. | - Py_ssize_t fields work in structmember when HAVE_LONG_LONG is not defined. | ||||||
| 
 | 
 | ||||||
| - PEP 3123: Provide forward compatibility with Python 3.0, while keeping | - PEP 3123: Provide forward compatibility with Python 3.0, while keeping | ||||||
|  |  | ||||||
|  | @ -218,16 +218,24 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, | ||||||
| 			err_ret->error = E_EOF; | 			err_ret->error = E_EOF; | ||||||
| 		err_ret->lineno = tok->lineno; | 		err_ret->lineno = tok->lineno; | ||||||
| 		if (tok->buf != NULL) { | 		if (tok->buf != NULL) { | ||||||
|  | 			char *text = NULL; | ||||||
| 			size_t len; | 			size_t len; | ||||||
| 			assert(tok->cur - tok->buf < INT_MAX); | 			assert(tok->cur - tok->buf < INT_MAX); | ||||||
| 			err_ret->offset = (int)(tok->cur - tok->buf); | 			err_ret->offset = (int)(tok->cur - tok->buf); | ||||||
| 			len = tok->inp - tok->buf; | 			len = tok->inp - tok->buf; | ||||||
| 			err_ret->text = (char *) PyObject_MALLOC(len + 1); | #ifdef Py_USING_UNICODE | ||||||
| 			if (err_ret->text != NULL) { | 			text = PyTokenizer_RestoreEncoding(tok, len, &err_ret->offset); | ||||||
| 				if (len > 0) | 
 | ||||||
| 					strncpy(err_ret->text, tok->buf, len); | #endif | ||||||
| 				err_ret->text[len] = '\0'; | 			if (text == NULL) { | ||||||
|  | 				text = (char *) PyObject_MALLOC(len + 1); | ||||||
|  | 				if (text != NULL) { | ||||||
|  | 					if (len > 0) | ||||||
|  | 						strncpy(text, tok->buf, len); | ||||||
|  | 					text[len] = '\0'; | ||||||
|  | 				} | ||||||
| 			} | 			} | ||||||
|  | 			err_ret->text = text; | ||||||
| 		} | 		} | ||||||
| 	} else if (tok->encoding != NULL) { | 	} else if (tok->encoding != NULL) { | ||||||
| 		node* r = PyNode_New(encoding_decl); | 		node* r = PyNode_New(encoding_decl); | ||||||
|  |  | ||||||
|  | @ -1522,6 +1522,68 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) | ||||||
| 	return result; | 	return result; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
/* This function is only called from parsetok. However, it cannot live
   there, as it must be empty for PGEN, and we can check for PGEN only
   in this file. */

#ifdef PGEN
/* pgen is built without unicode support: report "no decoded text". */
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
	return NULL;
}
#else
/* Decode the UTF-8 bytes text[0:len] and re-encode them as *enc*.
   Both codec steps use the "replace" error handler, so this never
   raises into the caller; on failure the pending exception is printed
   via PyErr_Print() and NULL is returned.  Returns a new reference. */
static PyObject *
dec_utf8(const char *enc, const char *text, size_t len) {
	PyObject *ret = NULL;
	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
	if (unicode_text) {
		ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
		Py_DECREF(unicode_text);
	}
	if (!ret) {
		PyErr_Print();
	}
	return ret;
}

/* Re-encode the first *len* bytes of the tokenizer buffer (held as
   UTF-8 internally) back into the source file's declared encoding so
   that a SyntaxError can show the line as the user wrote it.
   *offset* (a 1-based column, counted in UTF-8 bytes on entry) is
   rewritten to count bytes of the re-encoded prefix instead.
   Returns a PyObject_MALLOC'ed NUL-terminated string that the caller
   owns, or NULL when the source declared no encoding or conversion
   failed (the caller then falls back to the raw buffer). */
char *
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
{
	char *text = NULL;
	if (tok->encoding) {
		/* convert source to original encoding */
		PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
		if (lineobj != NULL) {
			int linelen = PyString_Size(lineobj);
			const char *line = PyString_AsString(lineobj);
			/* copy out of the str object so the result survives
			   the Py_DECREF below */
			text = PyObject_MALLOC(linelen + 1);
			if (text != NULL && line != NULL) {
				if (linelen)
					strncpy(text, line, linelen);
				text[linelen] = '\0';
			}
			Py_DECREF(lineobj);

			/* adjust error offset: re-encode just the prefix
			   before the error column and use its length */
			if (*offset > 1) {
				PyObject *offsetobj = dec_utf8(tok->encoding,
							       tok->buf, *offset-1);
				if (offsetobj) {
					*offset = PyString_Size(offsetobj) + 1;
					Py_DECREF(offsetobj);
				}
			}

		}
	}
	return text;

}
#endif
|  | 			    | ||||||
|  | 
 | ||||||
| #ifdef Py_DEBUG | #ifdef Py_DEBUG | ||||||
| 
 | 
 | ||||||
| void | void | ||||||
|  |  | ||||||
|  | @ -58,6 +58,8 @@ extern struct tok_state *PyTokenizer_FromString(const char *); | ||||||
| extern struct tok_state *PyTokenizer_FromFile(FILE *, char *, char *); | extern struct tok_state *PyTokenizer_FromFile(FILE *, char *, char *); | ||||||
| extern void PyTokenizer_Free(struct tok_state *); | extern void PyTokenizer_Free(struct tok_state *); | ||||||
| extern int PyTokenizer_Get(struct tok_state *, char **, char **); | extern int PyTokenizer_Get(struct tok_state *, char **, char **); | ||||||
|  | extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok,  | ||||||
|  | 					  int len, int *offset); | ||||||
| 
 | 
 | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
| } | } | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Martin v. Löwis
						Martin v. Löwis