mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 05:31:20 +00:00 
			
		
		
		
	Patch #534304: Implement phase 1 of PEP 263.
This commit is contained in:
		
							parent
							
								
									a729daf2e4
								
							
						
					
					
						commit
						00f1e3f5a5
					
				
					 13 changed files with 656 additions and 31 deletions
				
			
		|  | @ -7,11 +7,14 @@ chapter describes how the lexical analyzer breaks a file into tokens. | ||||||
| \index{parser} | \index{parser} | ||||||
| \index{token} | \index{token} | ||||||
| 
 | 
 | ||||||
| Python uses the 7-bit \ASCII{} character set for program text and string | Python uses the 7-bit \ASCII{} character set for program text. | ||||||
| literals. 8-bit characters may be used in string literals and comments | \versionadded[An encoding declaration can be used to indicate that  | ||||||
| but their interpretation is platform dependent; the proper way to | string literals and comments use an encoding different from ASCII.]{2.3} | ||||||
| insert 8-bit characters in string literals is by using octal or | For compatibility with older versions, Python only warns if it finds | ||||||
| hexadecimal escape sequences. | 8-bit characters; those warnings should be corrected by either declaring | ||||||
|  | an explicit encoding, or using escape sequences if those bytes are binary | ||||||
|  | data, instead of characters. | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| The run-time character set depends on the I/O devices connected to the | The run-time character set depends on the I/O devices connected to the | ||||||
| program but is generally a superset of \ASCII. | program but is generally a superset of \ASCII. | ||||||
|  | @ -69,6 +72,37 @@ Comments are ignored by the syntax; they are not tokens. | ||||||
| \index{hash character} | \index{hash character} | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | \subsection{Encoding declarations\label{encodings}} | ||||||
|  | 
 | ||||||
|  | If a comment in the first or second line of the Python script matches | ||||||
|  | the regular expression "coding[=:]\s*([-\w.]+)", this comment is | ||||||
|  | processed as an encoding declaration; the first group of this | ||||||
|  | expression names the encoding of the source code file. The recommended | ||||||
|  | forms of this expression are | ||||||
|  | 
 | ||||||
|  | \begin{verbatim} | ||||||
|  | # -*- coding: <encoding-name> -*- | ||||||
|  | \end{verbatim} | ||||||
|  | 
 | ||||||
|  | which is recognized also by GNU Emacs, and | ||||||
|  | 
 | ||||||
|  | \begin{verbatim} | ||||||
|  | # vim:fileencoding=<encoding-name> | ||||||
|  | \end{verbatim} | ||||||
|  | 
 | ||||||
|  | which is recognized by Bram Moolenaar's VIM. In addition, if the first | ||||||
|  | bytes of the file are the UTF-8 signature ($'\xef\xbb\xbf'$), the | ||||||
|  | declared file encoding is UTF-8 (this is supported, among others, by | ||||||
|  | Microsoft's notepad.exe). | ||||||
|  | 
 | ||||||
|  | If an encoding is declared, the encoding name must be recognized by | ||||||
|  | Python. % XXX there should be a list of supported encodings. | ||||||
|  | The encoding is used for all lexical analysis, in particular to find | ||||||
|  | the end of a string, and to interpret the contents of Unicode literals. | ||||||
|  | String literals are converted to Unicode for syntactical analysis, | ||||||
|  | then converted back to their original encoding before interpretation | ||||||
|  | starts. | ||||||
|  | 
 | ||||||
| \subsection{Explicit line joining\label{explicit-joining}} | \subsection{Explicit line joining\label{explicit-joining}} | ||||||
| 
 | 
 | ||||||
| Two or more physical lines may be joined into logical lines using | Two or more physical lines may be joined into logical lines using | ||||||
|  |  | ||||||
|  | @ -102,3 +102,6 @@ list_for: 'for' exprlist 'in' testlist_safe [list_iter] | ||||||
| list_if: 'if' test [list_iter] | list_if: 'if' test [list_iter] | ||||||
| 
 | 
 | ||||||
| testlist1: test (',' test)* | testlist1: test (',' test)* | ||||||
|  | 
 | ||||||
|  | # not used in grammar, but may appear in "node" passed from Parser to Compiler | ||||||
|  | encoding_decl: NAME | ||||||
|  |  | ||||||
|  | @ -25,6 +25,7 @@ extern "C" { | ||||||
| #define E_OVERFLOW      19	/* Node had too many children */ | #define E_OVERFLOW      19	/* Node had too many children */ | ||||||
| #define E_TOODEEP	20	/* Too many indentation levels */ | #define E_TOODEEP	20	/* Too many indentation levels */ | ||||||
| #define E_DEDENT	21	/* No matching outer block for dedent */ | #define E_DEDENT	21	/* No matching outer block for dedent */ | ||||||
|  | #define E_DECODE	22	/* Error in decoding into Unicode */ | ||||||
| 
 | 
 | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -65,3 +65,4 @@ | ||||||
| #define list_for 320 | #define list_for 320 | ||||||
| #define list_if 321 | #define list_if 321 | ||||||
| #define testlist1 322 | #define testlist1 322 | ||||||
|  | #define encoding_decl 323 | ||||||
|  |  | ||||||
|  | @ -190,15 +190,15 @@ POBJS=		\ | ||||||
| 		Parser/node.o \ | 		Parser/node.o \ | ||||||
| 		Parser/parser.o \ | 		Parser/parser.o \ | ||||||
| 		Parser/parsetok.o \ | 		Parser/parsetok.o \ | ||||||
| 		Parser/tokenizer.o \ |  | ||||||
| 		Parser/bitset.o \ | 		Parser/bitset.o \ | ||||||
| 		Parser/metagrammar.o | 		Parser/metagrammar.o | ||||||
| 
 | 
 | ||||||
| PARSER_OBJS=	$(POBJS) Parser/myreadline.o | PARSER_OBJS=	$(POBJS) Parser/myreadline.o Parser/tokenizer.o | ||||||
| 
 | 
 | ||||||
| PGOBJS=		\ | PGOBJS=		\ | ||||||
| 		Objects/obmalloc.o \ | 		Objects/obmalloc.o \ | ||||||
| 		Python/mysnprintf.o \ | 		Python/mysnprintf.o \ | ||||||
|  | 		Parser/tokenizer_pgen.o \ | ||||||
| 		Parser/firstsets.o \ | 		Parser/firstsets.o \ | ||||||
| 		Parser/grammar.o \ | 		Parser/grammar.o \ | ||||||
| 		Parser/pgen.o \ | 		Parser/pgen.o \ | ||||||
|  | @ -434,6 +434,8 @@ Parser/grammar.o:	$(srcdir)/Parser/grammar.c \ | ||||||
| 				$(srcdir)/Include/grammar.h | 				$(srcdir)/Include/grammar.h | ||||||
| Parser/metagrammar.o:	$(srcdir)/Parser/metagrammar.c | Parser/metagrammar.o:	$(srcdir)/Parser/metagrammar.c | ||||||
| 
 | 
 | ||||||
|  | Parser/tokenizer_pgen.o:	$(srcdir)/Parser/tokenizer.c | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| Python/compile.o Python/symtable.o: $(GRAMMAR_H) | Python/compile.o Python/symtable.o: $(GRAMMAR_H) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -6,6 +6,8 @@ Type/class unification and new-style classes | ||||||
| 
 | 
 | ||||||
| Core and builtins | Core and builtins | ||||||
| 
 | 
 | ||||||
|  | - Encoding declarations (PEP 263, phase 1) have been implemented. | ||||||
|  | 
 | ||||||
| - list.sort() has a new implementation.  While cross-platform results | - list.sort() has a new implementation.  While cross-platform results | ||||||
|   may vary, and in data-dependent ways, this is much faster on many |   may vary, and in data-dependent ways, this is much faster on many | ||||||
|   kinds of partially ordered lists than the previous implementation, |   kinds of partially ordered lists than the previous implementation, | ||||||
|  |  | ||||||
|  | @ -8,6 +8,7 @@ | ||||||
| #include "parser.h" | #include "parser.h" | ||||||
| #include "parsetok.h" | #include "parsetok.h" | ||||||
| #include "errcode.h" | #include "errcode.h" | ||||||
|  | #include "graminit.h" | ||||||
| 
 | 
 | ||||||
| int Py_TabcheckFlag; | int Py_TabcheckFlag; | ||||||
| 
 | 
 | ||||||
|  | @ -45,8 +46,8 @@ PyParser_ParseStringFlagsFilename(char *s, char *filename, | ||||||
| 		return NULL; | 		return NULL; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  |         tok->filename = filename ? filename : "<string>"; | ||||||
| 	if (Py_TabcheckFlag || Py_VerboseFlag) { | 	if (Py_TabcheckFlag || Py_VerboseFlag) { | ||||||
| 		tok->filename = filename ? filename : "<string>"; |  | ||||||
| 		tok->altwarning = (tok->filename != NULL); | 		tok->altwarning = (tok->filename != NULL); | ||||||
| 		if (Py_TabcheckFlag >= 2) | 		if (Py_TabcheckFlag >= 2) | ||||||
| 			tok->alterror++; | 			tok->alterror++; | ||||||
|  | @ -78,8 +79,8 @@ PyParser_ParseFileFlags(FILE *fp, char *filename, grammar *g, int start, | ||||||
| 		err_ret->error = E_NOMEM; | 		err_ret->error = E_NOMEM; | ||||||
| 		return NULL; | 		return NULL; | ||||||
| 	} | 	} | ||||||
|  | 	tok->filename = filename; | ||||||
| 	if (Py_TabcheckFlag || Py_VerboseFlag) { | 	if (Py_TabcheckFlag || Py_VerboseFlag) { | ||||||
| 		tok->filename = filename; |  | ||||||
| 		tok->altwarning = (filename != NULL); | 		tok->altwarning = (filename != NULL); | ||||||
| 		if (Py_TabcheckFlag >= 2) | 		if (Py_TabcheckFlag >= 2) | ||||||
| 			tok->alterror++; | 			tok->alterror++; | ||||||
|  | @ -185,6 +186,13 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, | ||||||
| 				err_ret->text[len] = '\0'; | 				err_ret->text[len] = '\0'; | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
|  | 	} else if (tok->encoding != NULL) { | ||||||
|  | 		node* r = PyNode_New(encoding_decl); | ||||||
|  | 		r->n_str = tok->encoding; | ||||||
|  | 		r->n_nchildren = 1; | ||||||
|  | 		r->n_child = n; | ||||||
|  | 		tok->encoding = NULL; | ||||||
|  | 		n = r; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	PyTokenizer_Free(tok); | 	PyTokenizer_Free(tok); | ||||||
|  |  | ||||||
|  | @ -5,10 +5,19 @@ | ||||||
| #include "pgenheaders.h" | #include "pgenheaders.h" | ||||||
| 
 | 
 | ||||||
| #include <ctype.h> | #include <ctype.h> | ||||||
|  | #include <assert.h> | ||||||
| 
 | 
 | ||||||
| #include "tokenizer.h" | #include "tokenizer.h" | ||||||
| #include "errcode.h" | #include "errcode.h" | ||||||
| 
 | 
 | ||||||
|  | #ifndef PGEN | ||||||
|  | #include "unicodeobject.h" | ||||||
|  | #include "stringobject.h" | ||||||
|  | #include "fileobject.h" | ||||||
|  | #include "codecs.h" | ||||||
|  | #include "abstract.h" | ||||||
|  | #endif /* PGEN */ | ||||||
|  | 
 | ||||||
| extern char *PyOS_Readline(char *); | extern char *PyOS_Readline(char *); | ||||||
| /* Return malloc'ed string including trailing \n;
 | /* Return malloc'ed string including trailing \n;
 | ||||||
|    empty malloc'ed string for EOF; |    empty malloc'ed string for EOF; | ||||||
|  | @ -114,9 +123,416 @@ tok_new(void) | ||||||
| 	tok->alterror = 0; | 	tok->alterror = 0; | ||||||
| 	tok->alttabsize = 1; | 	tok->alttabsize = 1; | ||||||
| 	tok->altindstack[0] = 0; | 	tok->altindstack[0] = 0; | ||||||
|  | 	tok->decoding_state = 0; | ||||||
|  | 	tok->decoding_erred = 0; | ||||||
|  | 	tok->read_coding_spec = 0; | ||||||
|  | 	tok->issued_encoding_warning = 0; | ||||||
|  | 	tok->encoding = NULL; | ||||||
|  | 	tok->decoding_readline = NULL; | ||||||
|  | 	tok->decoding_buffer = NULL; | ||||||
| 	return tok; | 	return tok; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | #ifdef PGEN | ||||||
|  | 
 | ||||||
|  | static char * | ||||||
|  | decoding_fgets(char *s, int size, struct tok_state *tok) | ||||||
|  | { | ||||||
|  | 	return fgets(s, size, tok->fp); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int | ||||||
|  | decoding_feof(struct tok_state *tok) | ||||||
|  | { | ||||||
|  | 	return feof(tok->fp); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static const char * | ||||||
|  | decode_str(const char *str, struct tok_state *tok) | ||||||
|  | { | ||||||
|  | 	return str; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #else /* PGEN */ | ||||||
|  | 
 | ||||||
|  | static char * | ||||||
|  | error_ret(struct tok_state *tok) /* XXX */ | ||||||
|  | { | ||||||
|  | 	tok->decoding_erred = 1; | ||||||
|  | 	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */ | ||||||
|  | 		PyMem_DEL(tok->buf); | ||||||
|  | 	tok->buf = NULL; | ||||||
|  | 	return NULL;		/* as if it were EOF */ | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static char * | ||||||
|  | new_string(const char *s, int len) | ||||||
|  | { | ||||||
|  | 	char* result = PyMem_NEW(char, len + 1); | ||||||
|  | 	if (result != NULL) { | ||||||
|  | 		memcpy(result, s, len); | ||||||
|  | 		result[len] = '\0'; | ||||||
|  | 	} | ||||||
|  | 	return result; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static char * | ||||||
|  | get_normal_name(char *s)	/* for utf-8 and latin-1 */ | ||||||
|  | { | ||||||
|  | 	char buf[13]; | ||||||
|  | 	int i; | ||||||
|  | 	for (i = 0; i < 12; i++) { | ||||||
|  | 		int c = s[i]; | ||||||
|  | 		if (c == '\0') break; | ||||||
|  | 		else if (c == '_') buf[i] = '-'; | ||||||
|  | 		else buf[i] = tolower(c); | ||||||
|  | 	} | ||||||
|  | 	buf[i] = '\0'; | ||||||
|  | 	if (strcmp(buf, "utf-8") == 0 || | ||||||
|  | 	    strncmp(buf, "utf-8-", 6) == 0) return "utf-8"; | ||||||
|  | 	else if (strcmp(buf, "latin-1") == 0 || | ||||||
|  | 		 strcmp(buf, "iso-8859-1") == 0 || | ||||||
|  | 		 strcmp(buf, "iso-latin-1") == 0 || | ||||||
|  | 		 strncmp(buf, "latin-1-", 8) == 0 || | ||||||
|  | 		 strncmp(buf, "iso-8859-1-", 11) == 0 || | ||||||
|  | 		 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1"; | ||||||
|  | 	else return s; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* Return the coding spec in S, or NULL if none is found.  */ | ||||||
|  | 
 | ||||||
|  | static char * | ||||||
|  | get_coding_spec(const char *s, int size) | ||||||
|  | { | ||||||
|  | 	int i; | ||||||
|  | 	for (i = 0; i < size - 6; i++) { /* XXX inefficient search */ | ||||||
|  | 		const char* t = s + i; | ||||||
|  | 		if (strncmp(t, "coding", 6) == 0) { | ||||||
|  | 			const char* begin = NULL; | ||||||
|  | 			t += 6; | ||||||
|  | 			if (t[0] != ':' && t[0] != '=') | ||||||
|  | 				continue; | ||||||
|  | 			do { | ||||||
|  | 				t++; | ||||||
|  | 			} while (t[0] == '\x20' || t[0] == '\t'); | ||||||
|  | 
 | ||||||
|  | 			begin = t; | ||||||
|  | 			while (isalnum(t[0]) || t[0] == '-' || t[0] == '_' || | ||||||
|  | 			       t[0] == '.') | ||||||
|  | 				t++; | ||||||
|  | 
 | ||||||
|  | 			if (begin < t) { | ||||||
|  | 				char* r = new_string(begin, t - begin); | ||||||
|  | 				char* q = get_normal_name(r); | ||||||
|  | 				if (r != q) { | ||||||
|  | 					assert(strlen(r) >= strlen(q)); | ||||||
|  | 					strcpy(r, q); | ||||||
|  | 				} | ||||||
|  | 				return r; | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	return NULL; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* Check whether the line contains a coding spec. If it does,
 | ||||||
|  |    invoke the set_readline function for the new encoding. | ||||||
|  |    This function receives the tok_state and the new encoding. | ||||||
|  |    Return 1 on success, 0 on failure.  */ | ||||||
|  | 
 | ||||||
|  | static int | ||||||
|  | check_coding_spec(const char* line, int size, struct tok_state *tok, | ||||||
|  | 		  int set_readline(struct tok_state *, const char *)) | ||||||
|  | { | ||||||
|  | 	int r = 1; | ||||||
|  | 	char* cs = get_coding_spec(line, size); | ||||||
|  | 	if (cs != NULL) { | ||||||
|  | 		tok->read_coding_spec = 1; | ||||||
|  | 		if (tok->encoding == NULL) { | ||||||
|  | 			assert(tok->decoding_state == 1); /* raw */ | ||||||
|  | 			if (strcmp(cs, "utf-8") == 0 || | ||||||
|  | 			    strcmp(cs, "iso-8859-1") == 0) { | ||||||
|  | 				tok->encoding = cs; | ||||||
|  | 			} else { | ||||||
|  | 				r = set_readline(tok, cs); | ||||||
|  | 				if (r) { | ||||||
|  | 					tok->encoding = cs; | ||||||
|  | 					tok->decoding_state = -1; | ||||||
|  | 				} | ||||||
|  | 			} | ||||||
|  | 		} else {	/* then, compare cs with BOM */ | ||||||
|  | 			r = (strcmp(tok->encoding, cs) == 0); | ||||||
|  | 			PyMem_DEL(cs); | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	return r; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* See whether the file starts with a BOM. If it does,
 | ||||||
|  |    invoke the set_readline function with the new encoding. | ||||||
|  |    Return 1 on success, 0 on failure.  */ | ||||||
|  | 
 | ||||||
|  | static int | ||||||
|  | check_bom(int get_char(struct tok_state *), | ||||||
|  | 	  void unget_char(int, struct tok_state *), | ||||||
|  | 	  int set_readline(struct tok_state *, const char *), | ||||||
|  | 	  struct tok_state *tok) | ||||||
|  | { | ||||||
|  | 	int ch = get_char(tok); | ||||||
|  | 	tok->decoding_state = 1; | ||||||
|  | 	if (ch == EOF) { | ||||||
|  | 		return 1; | ||||||
|  | 	} else if (ch == 0xEF) { | ||||||
|  | 		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM; | ||||||
|  | 		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM; | ||||||
|  | #if 0 | ||||||
|  | 	/* Disable support for UTF-16 BOMs until a decision
 | ||||||
|  | 	   is made whether this needs to be supported.  */ | ||||||
|  | 	} else if (ch == 0xFE) { | ||||||
|  | 		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM; | ||||||
|  | 		if (!set_readline(tok, "utf-16-be")) return 0; | ||||||
|  | 		tok->decoding_state = -1; | ||||||
|  | 	} else if (ch == 0xFF) { | ||||||
|  | 		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM; | ||||||
|  | 		if (!set_readline(tok, "utf-16-le")) return 0; | ||||||
|  | 		tok->decoding_state = -1; | ||||||
|  | #endif | ||||||
|  | 	} else { | ||||||
|  | 		unget_char(ch, tok); | ||||||
|  | 		return 1; | ||||||
|  | 	} | ||||||
|  | 	tok->encoding = new_string("utf-8", 5);	/* resulting is in utf-8 */ | ||||||
|  | 	return 1; | ||||||
|  |   NON_BOM: | ||||||
|  | 	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */ | ||||||
|  | 	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */ | ||||||
|  | 	return 1; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* Read a line of text from TOK into S, using the stream in TOK.
 | ||||||
|  |    Return NULL on failure, else S.  */ | ||||||
|  | 
 | ||||||
|  | static char * | ||||||
|  | fp_readl(char *s, int size, struct tok_state *tok) | ||||||
|  | { | ||||||
|  | 	PyObject* utf8; | ||||||
|  | 	PyObject* buf = tok->decoding_buffer; | ||||||
|  | 	if (buf == NULL) { | ||||||
|  | 		buf = PyObject_CallObject(tok->decoding_readline, NULL); | ||||||
|  | 		if (buf == NULL) return error_ret(tok); | ||||||
|  | 	} else { | ||||||
|  | 		tok->decoding_buffer = NULL; | ||||||
|  | 	} | ||||||
|  | 	utf8 = PyUnicode_AsUTF8String(buf); | ||||||
|  | 	Py_DECREF(buf); | ||||||
|  | 	if (utf8 == NULL) return error_ret(tok); | ||||||
|  | 	else { | ||||||
|  | 		const char* str = PyString_AsString(utf8); | ||||||
|  | 		assert(strlen(str) < size); /* XXX */ | ||||||
|  | 		strcpy(s, str); | ||||||
|  | 		Py_DECREF(utf8); | ||||||
|  | 		if (s[0] == '\0') return NULL; /* EOF */ | ||||||
|  | 		return s; | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* Set the readline function for TOK to a StreamReader's
 | ||||||
|  |    readline function. The StreamReader is named ENC. | ||||||
|  | 
 | ||||||
|  |    This function is called from check_bom and check_coding_spec. | ||||||
|  | 
 | ||||||
|  |    ENC is usually identical to the future value of tok->encoding, | ||||||
|  |    except for the (currently unsupported) case of UTF-16. | ||||||
|  | 
 | ||||||
|  |    Return 1 on success, 0 on failure. */ | ||||||
|  | 
 | ||||||
|  | static int | ||||||
|  | fp_setreadl(struct tok_state *tok, const char* enc) | ||||||
|  | { | ||||||
|  | 	PyObject *reader, *stream, *readline; | ||||||
|  | 
 | ||||||
|  | 	stream = PyFile_FromFile(tok->fp, tok->filename, "rb", NULL); | ||||||
|  | 	if (stream == NULL) return 0; | ||||||
|  | 
 | ||||||
|  | 	reader = PyCodec_StreamReader(enc, stream, NULL); | ||||||
|  | 	Py_DECREF(stream); | ||||||
|  | 	if (reader == NULL) return 0; | ||||||
|  | 
 | ||||||
|  | 	readline = PyObject_GetAttrString(reader, "readline"); | ||||||
|  | 	Py_DECREF(reader); | ||||||
|  | 	if (readline == NULL) return 0; | ||||||
|  | 
 | ||||||
|  | 	tok->decoding_readline = readline; | ||||||
|  | 	return 1; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* Fetch the next byte from TOK. */ | ||||||
|  | 
 | ||||||
|  | static int fp_getc(struct tok_state *tok) { | ||||||
|  | 	return getc(tok->fp); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* Unfetch the last byte back into TOK.  */ | ||||||
|  | 
 | ||||||
|  | static void fp_ungetc(int c, struct tok_state *tok) { | ||||||
|  | 	ungetc(c, tok->fp); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* Read a line of input from TOK. Determine encoding
 | ||||||
|  |    if necessary.  */ | ||||||
|  | 
 | ||||||
|  | static char * | ||||||
|  | decoding_fgets(char *s, int size, struct tok_state *tok) | ||||||
|  | { | ||||||
|  | 	char *line; | ||||||
|  | 	int warn = 0, badchar = 0; | ||||||
|  | 	for (;;) | ||||||
|  | 		if (tok->decoding_state < 0) { | ||||||
|  | 			/* We already have a codec associated with
 | ||||||
|  | 			   this input. */ | ||||||
|  | 			line = fp_readl(s, size, tok); | ||||||
|  | 			break; | ||||||
|  | 		} else if (tok->decoding_state > 0) { | ||||||
|  | 			/* We want a 'raw' read. */ | ||||||
|  | 			line = Py_UniversalNewlineFgets(s, size,  | ||||||
|  | 							tok->fp, NULL); | ||||||
|  | 			warn = 1; | ||||||
|  | 			break; | ||||||
|  | 		} else { | ||||||
|  | 			/* We have not yet determined the encoding.
 | ||||||
|  | 			   If an encoding is found, use the file-pointer | ||||||
|  | 			   reader functions from now on. */ | ||||||
|  | 			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) | ||||||
|  | 				return error_ret(tok); | ||||||
|  | 			assert(tok->decoding_state != 0); | ||||||
|  | 		} | ||||||
|  | 	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { | ||||||
|  | 		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) { | ||||||
|  | 			return error_ret(tok); | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | #ifndef PGEN | ||||||
|  | 	if (warn && line && !tok->issued_encoding_warning && !tok->encoding) { | ||||||
|  | 		unsigned char *c; | ||||||
|  | 		for (c = line; *c; c++) | ||||||
|  | 			if (*c > 127) { | ||||||
|  | 				badchar = *c; | ||||||
|  | 				break; | ||||||
|  | 			} | ||||||
|  | 	} | ||||||
|  | 	if (badchar) { | ||||||
|  | 		char buf[200]; | ||||||
|  | 		sprintf(buf, "Non-ASCII character '\\x%.2x', " | ||||||
|  | 			"but no declared encoding", badchar); | ||||||
|  | 		PyErr_WarnExplicit(PyExc_DeprecationWarning, | ||||||
|  | 				   buf, tok->filename, tok->lineno,  | ||||||
|  | 				   NULL, NULL); | ||||||
|  | 		tok->issued_encoding_warning = 1; | ||||||
|  | 	} | ||||||
|  | #endif | ||||||
|  | 	return line; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int | ||||||
|  | decoding_feof(struct tok_state *tok) | ||||||
|  | { | ||||||
|  | 	if (tok->decoding_state >= 0) { | ||||||
|  | 		return feof(tok->fp); | ||||||
|  | 	} else { | ||||||
|  | 		PyObject* buf = tok->decoding_buffer; | ||||||
|  | 		if (buf == NULL) { | ||||||
|  | 			buf = PyObject_CallObject(tok->decoding_readline, NULL); | ||||||
|  | 			if (buf == NULL) { | ||||||
|  | 				error_ret(tok); | ||||||
|  | 				return 1; | ||||||
|  | 			} else { | ||||||
|  | 				tok->decoding_buffer = buf; | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 		return PyObject_Length(buf) == 0; | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* Fetch a byte from TOK, using the string buffer. */ | ||||||
|  | 
 | ||||||
|  | static int buf_getc(struct tok_state *tok) { | ||||||
|  | 	return *tok->str++; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* Unfetch a byte from TOK, using the string buffer. */ | ||||||
|  | 
 | ||||||
|  | static void buf_ungetc(int c, struct tok_state *tok) { | ||||||
|  | 	tok->str--; | ||||||
|  | 	assert(*tok->str == c);	/* tok->cur may point to read-only segment */ | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* Set the readline function for TOK to ENC. For the string-based
 | ||||||
|  |    tokenizer, this means to just record the encoding. */ | ||||||
|  | 
 | ||||||
|  | static int buf_setreadl(struct tok_state *tok, const char* enc) { | ||||||
|  | 	tok->enc = enc; | ||||||
|  | 	return 1; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* Return a UTF-8 encoded Python string object from the
 | ||||||
|  |    C byte string STR, which is encoded with ENC. */ | ||||||
|  | 
 | ||||||
|  | static PyObject * | ||||||
|  | translate_into_utf8(const char* str, const char* enc) { | ||||||
|  | 	PyObject *utf8; | ||||||
|  | 	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); | ||||||
|  | 	if (buf == NULL) | ||||||
|  | 		return NULL; | ||||||
|  | 	utf8 = PyUnicode_AsUTF8String(buf); | ||||||
|  | 	Py_DECREF(buf); | ||||||
|  | 	return utf8; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /* Decode a byte string STR for use as the buffer of TOK.
 | ||||||
|  |    Look for encoding declarations inside STR, and record them | ||||||
|  |    inside TOK.  */ | ||||||
|  | 
 | ||||||
|  | static const char * | ||||||
|  | decode_str(const char *str, struct tok_state *tok) | ||||||
|  | { | ||||||
|  | 	PyObject* utf8 = NULL; | ||||||
|  | 	const char *s; | ||||||
|  | 	int lineno = 0; | ||||||
|  | 	tok->enc = NULL; | ||||||
|  | 	tok->str = str; | ||||||
|  | 	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) | ||||||
|  | 		return NULL; | ||||||
|  | 	str = tok->str;		/* string after BOM if any */ | ||||||
|  | 	assert(r); | ||||||
|  | 	if (tok->enc != NULL) { | ||||||
|  | 		utf8 = translate_into_utf8(str, tok->enc); | ||||||
|  | 		if (utf8 == NULL) | ||||||
|  | 			return NULL; | ||||||
|  | 		str = PyString_AsString(utf8); | ||||||
|  | 	} | ||||||
|  | 	for (s = str;; s++) { | ||||||
|  | 		if (*s == '\0') break; | ||||||
|  | 		else if (*s == '\n') { | ||||||
|  | 			lineno++; | ||||||
|  | 			if (lineno == 2) break; | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	tok->enc = NULL; | ||||||
|  | 	if (!check_coding_spec(str, s - str, tok, buf_setreadl)) | ||||||
|  | 		return NULL; | ||||||
|  | 	if (tok->enc != NULL) { | ||||||
|  | 		assert(utf8 == NULL); | ||||||
|  | 		utf8 = translate_into_utf8(str, tok->enc); | ||||||
|  | 		if (utf8 == NULL) | ||||||
|  | 			return NULL; | ||||||
|  | 		str = PyString_AsString(utf8); | ||||||
|  | 	} | ||||||
|  | 	assert(tok->decoding_buffer == NULL); | ||||||
|  | 	tok->decoding_buffer = utf8; /* CAUTION */ | ||||||
|  | 	return str; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #endif /* PGEN */ | ||||||
| 
 | 
 | ||||||
| /* Set up tokenizer for string */ | /* Set up tokenizer for string */ | ||||||
| 
 | 
 | ||||||
|  | @ -126,6 +542,9 @@ PyTokenizer_FromString(char *str) | ||||||
| 	struct tok_state *tok = tok_new(); | 	struct tok_state *tok = tok_new(); | ||||||
| 	if (tok == NULL) | 	if (tok == NULL) | ||||||
| 		return NULL; | 		return NULL; | ||||||
|  | 	str = (char *)decode_str(str, tok); | ||||||
|  | 	if (str == NULL) | ||||||
|  | 		return NULL; | ||||||
| 	tok->buf = tok->cur = tok->end = tok->inp = str; | 	tok->buf = tok->cur = tok->end = tok->inp = str; | ||||||
| 	return tok; | 	return tok; | ||||||
| } | } | ||||||
|  | @ -157,6 +576,10 @@ PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2) | ||||||
| void | void | ||||||
| PyTokenizer_Free(struct tok_state *tok) | PyTokenizer_Free(struct tok_state *tok) | ||||||
| { | { | ||||||
|  | 	if (tok->encoding != NULL) | ||||||
|  | 		PyMem_DEL(tok->encoding); | ||||||
|  | 	Py_XDECREF(tok->decoding_readline); | ||||||
|  | 	Py_XDECREF(tok->decoding_buffer); | ||||||
| 	if (tok->fp != NULL && tok->buf != NULL) | 	if (tok->fp != NULL && tok->buf != NULL) | ||||||
| 		PyMem_DEL(tok->buf); | 		PyMem_DEL(tok->buf); | ||||||
| 	PyMem_DEL(tok); | 	PyMem_DEL(tok); | ||||||
|  | @ -246,8 +669,8 @@ tok_nextc(register struct tok_state *tok) | ||||||
| 					} | 					} | ||||||
| 					tok->end = tok->buf + BUFSIZ; | 					tok->end = tok->buf + BUFSIZ; | ||||||
| 				} | 				} | ||||||
| 				if (Py_UniversalNewlineFgets(tok->buf, (int)(tok->end - tok->buf), | 				if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf), | ||||||
| 					  tok->fp, NULL) == NULL) { | 					  tok) == NULL) { | ||||||
| 					tok->done = E_EOF; | 					tok->done = E_EOF; | ||||||
| 					done = 1; | 					done = 1; | ||||||
| 				} | 				} | ||||||
|  | @ -259,7 +682,7 @@ tok_nextc(register struct tok_state *tok) | ||||||
| 			} | 			} | ||||||
| 			else { | 			else { | ||||||
| 				cur = tok->cur - tok->buf; | 				cur = tok->cur - tok->buf; | ||||||
| 				if (feof(tok->fp)) { | 				if (decoding_feof(tok)) { | ||||||
| 					tok->done = E_EOF; | 					tok->done = E_EOF; | ||||||
| 					done = 1; | 					done = 1; | ||||||
| 				} | 				} | ||||||
|  | @ -285,9 +708,9 @@ tok_nextc(register struct tok_state *tok) | ||||||
| 				tok->end = tok->buf + newsize; | 				tok->end = tok->buf + newsize; | ||||||
| 				tok->start = curstart < 0 ? NULL : | 				tok->start = curstart < 0 ? NULL : | ||||||
| 					     tok->buf + curstart; | 					     tok->buf + curstart; | ||||||
| 				if (Py_UniversalNewlineFgets(tok->inp, | 				if (decoding_fgets(tok->inp, | ||||||
| 					       (int)(tok->end - tok->inp), | 					       (int)(tok->end - tok->inp), | ||||||
| 					       tok->fp, NULL) == NULL) { | 					       tok) == NULL) { | ||||||
| 					/* Last line does not end in \n,
 | 					/* Last line does not end in \n,
 | ||||||
| 					   fake one */ | 					   fake one */ | ||||||
| 					strcpy(tok->inp, "\n"); | 					strcpy(tok->inp, "\n"); | ||||||
|  | @ -506,9 +929,8 @@ indenterror(struct tok_state *tok) | ||||||
| 
 | 
 | ||||||
| /* Get next token, after space stripping etc. */ | /* Get next token, after space stripping etc. */ | ||||||
| 
 | 
 | ||||||
| int | static int | ||||||
| PyTokenizer_Get(register struct tok_state *tok, char **p_start, | tok_get(register struct tok_state *tok, char **p_start, char **p_end) | ||||||
| 		char **p_end) |  | ||||||
| { | { | ||||||
| 	register int c; | 	register int c; | ||||||
| 	int blankline; | 	int blankline; | ||||||
|  | @ -915,6 +1337,16 @@ PyTokenizer_Get(register struct tok_state *tok, char **p_start, | ||||||
| 	return PyToken_OneChar(c); | 	return PyToken_OneChar(c); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | int | ||||||
|  | PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) | ||||||
|  | { | ||||||
|  | 	int result = tok_get(tok, p_start, p_end); | ||||||
|  | 	if (tok->decoding_erred) { | ||||||
|  | 		result = ERRORTOKEN; | ||||||
|  | 		tok->done = E_DECODE; | ||||||
|  | 	} | ||||||
|  | 	return result; | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| #ifdef Py_DEBUG | #ifdef Py_DEBUG | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -4,6 +4,7 @@ | ||||||
| extern "C" { | extern "C" { | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | #include "object.h" | ||||||
| 
 | 
 | ||||||
| /* Tokenizer interface */ | /* Tokenizer interface */ | ||||||
| 
 | 
 | ||||||
|  | @ -38,6 +39,16 @@ struct tok_state { | ||||||
| 	int alterror;	/* Issue error if alternate tabs don't match */ | 	int alterror;	/* Issue error if alternate tabs don't match */ | ||||||
| 	int alttabsize;	/* Alternate tab spacing */ | 	int alttabsize;	/* Alternate tab spacing */ | ||||||
| 	int altindstack[MAXINDENT];	/* Stack of alternate indents */ | 	int altindstack[MAXINDENT];	/* Stack of alternate indents */ | ||||||
|  | 	/* Stuff for PEP 0263 */ | ||||||
|  | 	int decoding_state;	/* -1:decoding, 0:init, 1:raw */ | ||||||
|  | 	int decoding_erred;	/* whether erred in decoding  */ | ||||||
|  | 	int read_coding_spec;	/* whether 'coding:...' has been read  */ | ||||||
|  | 	int issued_encoding_warning; /* whether non-ASCII warning was issued */ | ||||||
|  | 	char *encoding; | ||||||
|  | 	PyObject *decoding_readline; /* codecs.open(...).readline */ | ||||||
|  | 	PyObject *decoding_buffer; | ||||||
|  | 	const char* enc; | ||||||
|  | 	const char* str; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| extern struct tok_state *PyTokenizer_FromString(char *); | extern struct tok_state *PyTokenizer_FromString(char *); | ||||||
|  |  | ||||||
							
								
								
									
										2
									
								
								Parser/tokenizer_pgen.c
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								Parser/tokenizer_pgen.c
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,2 @@ | ||||||
|  | #define PGEN | ||||||
|  | #include "tokenizer.c" | ||||||
							
								
								
									
										123
									
								
								Python/compile.c
									
										
									
									
									
								
							
							
						
						
									
										123
									
								
								Python/compile.c
									
										
									
									
									
								
							|  | @ -485,6 +485,7 @@ struct compiling { | ||||||
| 	int c_closure;		/* Is nested w/freevars? */ | 	int c_closure;		/* Is nested w/freevars? */ | ||||||
| 	struct symtable *c_symtable; /* pointer to module symbol table */ | 	struct symtable *c_symtable; /* pointer to module symbol table */ | ||||||
|         PyFutureFeatures *c_future; /* pointer to module's __future__ */ |         PyFutureFeatures *c_future; /* pointer to module's __future__ */ | ||||||
|  | 	char *c_encoding;	/* source encoding (a borrowed reference) */ | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| static int | static int | ||||||
|  | @ -1181,6 +1182,23 @@ parsenumber(struct compiling *co, char *s) | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static PyObject * | ||||||
|  | decode_utf8(char **sPtr, char *end, char* encoding) | ||||||
|  | { | ||||||
|  | 	PyObject *u, *v; | ||||||
|  | 	char *s, *t; | ||||||
|  | 	t = s = *sPtr; | ||||||
|  | 	/* while (s < end && *s != '\\') s++; */ /* inefficient for u".." */ | ||||||
|  | 	while (s < end && (*s & 0x80)) s++; | ||||||
|  | 	*sPtr = s; | ||||||
|  | 	u = PyUnicode_DecodeUTF8(t, s - t, NULL); | ||||||
|  | 	if (u == NULL) | ||||||
|  | 		return NULL; | ||||||
|  | 	v = PyUnicode_AsEncodedString(u, encoding, NULL); | ||||||
|  | 	Py_DECREF(u); | ||||||
|  | 	return v; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static PyObject * | static PyObject * | ||||||
| parsestr(struct compiling *com, char *s) | parsestr(struct compiling *com, char *s) | ||||||
| { | { | ||||||
|  | @ -1193,6 +1211,8 @@ parsestr(struct compiling *com, char *s) | ||||||
| 	int first = *s; | 	int first = *s; | ||||||
| 	int quote = first; | 	int quote = first; | ||||||
| 	int rawmode = 0; | 	int rawmode = 0; | ||||||
|  | 	char* encoding = ((com == NULL) ? NULL : com->c_encoding); | ||||||
|  | 	int need_encoding; | ||||||
| 	int unicode = 0; | 	int unicode = 0; | ||||||
| 
 | 
 | ||||||
| 	if (isalpha(quote) || quote == '_') { | 	if (isalpha(quote) || quote == '_') { | ||||||
|  | @ -1230,28 +1250,101 @@ parsestr(struct compiling *com, char *s) | ||||||
| 	} | 	} | ||||||
| #ifdef Py_USING_UNICODE | #ifdef Py_USING_UNICODE | ||||||
| 	if (unicode || Py_UnicodeFlag) { | 	if (unicode || Py_UnicodeFlag) { | ||||||
|  | 		PyObject *u, *w; | ||||||
|  | 		if (encoding == NULL) { | ||||||
|  | 			buf = s; | ||||||
|  | 			u = NULL; | ||||||
|  | 		} else if (strcmp(encoding, "iso-8859-1") == 0) { | ||||||
|  | 			buf = s; | ||||||
|  | 			u = NULL; | ||||||
|  | 		} else { | ||||||
|  | 			/* "\XX" may become "\u005c\uHHLL" (12 bytes) */ | ||||||
|  | 			u = PyString_FromStringAndSize((char *)NULL, len * 4); | ||||||
|  | 			if (u == NULL) | ||||||
|  | 				return NULL; | ||||||
|  | 			p = buf = PyString_AsString(u); | ||||||
|  | 			end = s + len; | ||||||
|  | 			while (s < end) { | ||||||
|  | 				if (*s == '\\') { | ||||||
|  | 					*p++ = *s++; | ||||||
|  | 					if (*s & 0x80) { | ||||||
|  | 						strcpy(p, "u005c"); | ||||||
|  | 						p += 5; | ||||||
|  | 					} | ||||||
|  | 				} | ||||||
|  | 				if (*s & 0x80) { /* XXX inefficient */ | ||||||
|  | 					char *r; | ||||||
|  | 					int rn, i; | ||||||
|  | 					w = decode_utf8(&s, end, "utf-16-be"); | ||||||
|  | 					if (w == NULL) { | ||||||
|  | 						Py_DECREF(u); | ||||||
|  | 						return NULL; | ||||||
|  | 					} | ||||||
|  | 					r = PyString_AsString(w); | ||||||
|  | 					rn = PyString_Size(w); | ||||||
|  | 					assert(rn % 2 == 0); | ||||||
|  | 					for (i = 0; i < rn; i += 2) { | ||||||
|  | 						sprintf(p, "\\u%02x%02x", | ||||||
|  | 							r[i + 0] & 0xFF, | ||||||
|  | 							r[i + 1] & 0xFF); | ||||||
|  | 						p += 6; | ||||||
|  | 					} | ||||||
|  | 					Py_DECREF(w); | ||||||
|  | 				} else { | ||||||
|  | 					*p++ = *s++; | ||||||
|  | 				} | ||||||
|  | 			} | ||||||
|  | 			len = p - buf; | ||||||
|  | 		} | ||||||
| 		if (rawmode) | 		if (rawmode) | ||||||
| 			v = PyUnicode_DecodeRawUnicodeEscape( | 			v = PyUnicode_DecodeRawUnicodeEscape(buf, len, NULL); | ||||||
| 				 s, len, NULL); |  | ||||||
| 		else | 		else | ||||||
| 			v = PyUnicode_DecodeUnicodeEscape( | 			v = PyUnicode_DecodeUnicodeEscape(buf, len, NULL); | ||||||
| 				s, len, NULL); | 		Py_XDECREF(u); | ||||||
| 		if (v == NULL) | 		if (v == NULL) | ||||||
| 			PyErr_SyntaxLocation(com->c_filename, com->c_lineno); | 			PyErr_SyntaxLocation(com->c_filename, com->c_lineno); | ||||||
| 		return v; | 		return v; | ||||||
| 			 | 			 | ||||||
| 	} | 	} | ||||||
| #endif | #endif | ||||||
| 	if (rawmode || strchr(s, '\\') == NULL) | 	need_encoding = (encoding != NULL && | ||||||
| 		return PyString_FromStringAndSize(s, len); | 			 strcmp(encoding, "utf-8") != 0 && | ||||||
| 	v = PyString_FromStringAndSize((char *)NULL, len); | 			 strcmp(encoding, "iso-8859-1") != 0); | ||||||
|  | 	if (rawmode || strchr(s, '\\') == NULL) { | ||||||
|  | 		if (need_encoding) { | ||||||
|  | 			PyObject* u = PyUnicode_DecodeUTF8(s, len, NULL); | ||||||
|  | 			if (u == NULL) | ||||||
|  | 				return NULL; | ||||||
|  | 			v = PyUnicode_AsEncodedString(u, encoding, NULL); | ||||||
|  | 			Py_DECREF(u); | ||||||
|  | 			return v; | ||||||
|  | 		} else { | ||||||
|  | 			return PyString_FromStringAndSize(s, len); | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	v = PyString_FromStringAndSize((char *)NULL, /* XXX 4 is enough? */ | ||||||
|  | 				       need_encoding ? len * 4 : len); | ||||||
| 	if (v == NULL) | 	if (v == NULL) | ||||||
| 		return NULL; | 		return NULL; | ||||||
| 	p = buf = PyString_AsString(v); | 	p = buf = PyString_AsString(v); | ||||||
| 	end = s + len; | 	end = s + len; | ||||||
| 	while (s < end) { | 	while (s < end) { | ||||||
| 		if (*s != '\\') { | 		if (*s != '\\') { | ||||||
| 			*p++ = *s++; | 		  ORDINAL:  | ||||||
|  | 			if (need_encoding && (*s & 0x80)) { | ||||||
|  | 				char *r; | ||||||
|  | 				int rn; | ||||||
|  | 				PyObject* w = decode_utf8(&s, end, encoding); | ||||||
|  | 				if (w == NULL) | ||||||
|  | 					return NULL; | ||||||
|  | 				r = PyString_AsString(w); | ||||||
|  | 				rn = PyString_Size(w); | ||||||
|  | 				memcpy(p, r, rn); | ||||||
|  | 				p += rn; | ||||||
|  | 				Py_DECREF(w); | ||||||
|  | 			} else { | ||||||
|  | 				*p++ = *s++; | ||||||
|  | 			} | ||||||
| 			continue; | 			continue; | ||||||
| 		} | 		} | ||||||
| 		s++; | 		s++; | ||||||
|  | @ -1320,8 +1413,8 @@ parsestr(struct compiling *com, char *s) | ||||||
| #endif | #endif | ||||||
| 		default: | 		default: | ||||||
| 			*p++ = '\\'; | 			*p++ = '\\'; | ||||||
| 			*p++ = s[-1]; | 			s--; | ||||||
| 			break; | 			goto ORDINAL; | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 	_PyString_Resize(&v, (int)(p - buf)); | 	_PyString_Resize(&v, (int)(p - buf)); | ||||||
|  | @ -4149,6 +4242,12 @@ jcompile(node *n, char *filename, struct compiling *base, | ||||||
| 	PyCodeObject *co; | 	PyCodeObject *co; | ||||||
| 	if (!com_init(&sc, filename)) | 	if (!com_init(&sc, filename)) | ||||||
| 		return NULL; | 		return NULL; | ||||||
|  | 	if (TYPE(n) == encoding_decl) { | ||||||
|  | 		sc.c_encoding = STR(n); | ||||||
|  | 		n = CHILD(n, 0); | ||||||
|  | 	} else { | ||||||
|  | 		sc.c_encoding = NULL; | ||||||
|  | 	} | ||||||
| 	if (base) { | 	if (base) { | ||||||
| 		sc.c_private = base->c_private; | 		sc.c_private = base->c_private; | ||||||
| 		sc.c_symtable = base->c_symtable; | 		sc.c_symtable = base->c_symtable; | ||||||
|  | @ -4157,6 +4256,10 @@ jcompile(node *n, char *filename, struct compiling *base, | ||||||
| 		    || (sc.c_symtable->st_cur->ste_type == TYPE_FUNCTION)) | 		    || (sc.c_symtable->st_cur->ste_type == TYPE_FUNCTION)) | ||||||
| 			sc.c_nested = 1; | 			sc.c_nested = 1; | ||||||
| 		sc.c_flags |= base->c_flags & PyCF_MASK; | 		sc.c_flags |= base->c_flags & PyCF_MASK; | ||||||
|  | 		if (base->c_encoding != NULL) { | ||||||
|  | 			assert(sc.c_encoding == NULL); | ||||||
|  | 			sc.c_encoding = base->c_encoding; | ||||||
|  | 		} | ||||||
| 	} else { | 	} else { | ||||||
| 		sc.c_private = NULL; | 		sc.c_private = NULL; | ||||||
| 		sc.c_future = PyNode_Future(n, filename); | 		sc.c_future = PyNode_Future(n, filename); | ||||||
|  |  | ||||||
|  | @ -1463,7 +1463,17 @@ static state states_66[2] = { | ||||||
| 	{1, arcs_66_0}, | 	{1, arcs_66_0}, | ||||||
| 	{2, arcs_66_1}, | 	{2, arcs_66_1}, | ||||||
| }; | }; | ||||||
| static dfa dfas[67] = { | static arc arcs_67_0[1] = { | ||||||
|  | 	{12, 1}, | ||||||
|  | }; | ||||||
|  | static arc arcs_67_1[1] = { | ||||||
|  | 	{0, 1}, | ||||||
|  | }; | ||||||
|  | static state states_67[2] = { | ||||||
|  | 	{1, arcs_67_0}, | ||||||
|  | 	{1, arcs_67_1}, | ||||||
|  | }; | ||||||
|  | static dfa dfas[68] = { | ||||||
| 	{256, "single_input", 0, 3, states_0, | 	{256, "single_input", 0, 3, states_0, | ||||||
| 	 "\004\030\001\000\000\000\124\360\213\011\162\000\002\000\140\210\244\005\001"}, | 	 "\004\030\001\000\000\000\124\360\213\011\162\000\002\000\140\210\244\005\001"}, | ||||||
| 	{257, "file_input", 0, 2, states_1, | 	{257, "file_input", 0, 2, states_1, | ||||||
|  | @ -1598,8 +1608,10 @@ static dfa dfas[67] = { | ||||||
| 	 "\000\000\000\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\000"}, | 	 "\000\000\000\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\000"}, | ||||||
| 	{322, "testlist1", 0, 2, states_66, | 	{322, "testlist1", 0, 2, states_66, | ||||||
| 	 "\000\020\001\000\000\000\000\000\000\000\000\000\002\000\140\210\244\005\000"}, | 	 "\000\020\001\000\000\000\000\000\000\000\000\000\002\000\140\210\244\005\000"}, | ||||||
|  | 	{323, "encoding_decl", 0, 2, states_67, | ||||||
|  | 	 "\000\020\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"}, | ||||||
| }; | }; | ||||||
| static label labels[148] = { | static label labels[149] = { | ||||||
| 	{0, "EMPTY"}, | 	{0, "EMPTY"}, | ||||||
| 	{256, 0}, | 	{256, 0}, | ||||||
| 	{4, 0}, | 	{4, 0}, | ||||||
|  | @ -1748,10 +1760,11 @@ static label labels[148] = { | ||||||
| 	{318, 0}, | 	{318, 0}, | ||||||
| 	{319, 0}, | 	{319, 0}, | ||||||
| 	{321, 0}, | 	{321, 0}, | ||||||
|  | 	{323, 0}, | ||||||
| }; | }; | ||||||
| grammar _PyParser_Grammar = { | grammar _PyParser_Grammar = { | ||||||
| 	67, | 	68, | ||||||
| 	dfas, | 	dfas, | ||||||
| 	{148, labels}, | 	{149, labels}, | ||||||
| 	256 | 	256 | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | @ -1221,6 +1221,7 @@ static void | ||||||
| err_input(perrdetail *err) | err_input(perrdetail *err) | ||||||
| { | { | ||||||
| 	PyObject *v, *w, *errtype; | 	PyObject *v, *w, *errtype; | ||||||
|  | 	PyObject* u = NULL; | ||||||
| 	char *msg = NULL; | 	char *msg = NULL; | ||||||
| 	errtype = PyExc_SyntaxError; | 	errtype = PyExc_SyntaxError; | ||||||
| 	v = Py_BuildValue("(ziiz)", err->filename, | 	v = Py_BuildValue("(ziiz)", err->filename, | ||||||
|  | @ -1272,12 +1273,24 @@ err_input(perrdetail *err) | ||||||
| 		errtype = PyExc_IndentationError; | 		errtype = PyExc_IndentationError; | ||||||
| 		msg = "too many levels of indentation"; | 		msg = "too many levels of indentation"; | ||||||
| 		break; | 		break; | ||||||
|  | 	case E_DECODE: {	/* XXX */ | ||||||
|  | 		PyThreadState* tstate = PyThreadState_Get(); | ||||||
|  | 		PyObject* value = tstate->curexc_value; | ||||||
|  | 		if (value != NULL) { | ||||||
|  | 			u = PyObject_Repr(value); | ||||||
|  | 			if (u != NULL) { | ||||||
|  | 				msg = PyString_AsString(u); | ||||||
|  | 				break; | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
| 	default: | 	default: | ||||||
| 		fprintf(stderr, "error=%d\n", err->error); | 		fprintf(stderr, "error=%d\n", err->error); | ||||||
| 		msg = "unknown parsing error"; | 		msg = "unknown parsing error"; | ||||||
| 		break; | 		break; | ||||||
| 	} | 	} | ||||||
| 	w = Py_BuildValue("(sO)", msg, v); | 	w = Py_BuildValue("(sO)", msg, v); | ||||||
|  | 	Py_XDECREF(u); | ||||||
| 	Py_XDECREF(v); | 	Py_XDECREF(v); | ||||||
| 	PyErr_SetObject(errtype, w); | 	PyErr_SetObject(errtype, w); | ||||||
| 	Py_XDECREF(w); | 	Py_XDECREF(w); | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Martin v. Löwis
						Martin v. Löwis