Patch #534304: Implement phase 1 of PEP 263.

2025-12-08 06:10:17 +00:00 · 2002-08-04 17:29:52 +00:00 · 2002-08-04 17:29:52 +00:00 · 00f1e3f5a5
commit 00f1e3f5a5
parent a729daf2e4
13 changed files with 656 additions and 31 deletions
--- a/Doc/ref/ref2.tex
+++ b/Doc/ref/ref2.tex
@ -7,11 +7,14 @@ chapter describes how the lexical analyzer breaks a file into tokens.
 \index{parser}
 \index{token}
-Python uses the 7-bit \ASCII{} character set for program text and string
+Python uses the 7-bit \ASCII{} character set for program text.
-literals. 8-bit characters may be used in string literals and comments
+\versionadded[An encoding declaration can be used to indicate that 
-but their interpretation is platform dependent; the proper way to
+string literals and comments use an encoding different from ASCII.]{2.3}
-insert 8-bit characters in string literals is by using octal or
+For compatibility with older versions, Python only warns if it finds
-hexadecimal escape sequences.
+8-bit characters; those warnings should be corrected by either declaring
 an explicit encoding, or using escape sequences if those bytes are binary
 data, instead of characters.
 The run-time character set depends on the I/O devices connected to the
 program but is generally a superset of \ASCII.
@ -69,6 +72,37 @@ Comments are ignored by the syntax; they are not tokens.
 \index{hash character}
 \subsection{Encoding declarations\label{encodings}}
 If a comment in the first or second line of the Python script matches
 the regular expression \regexp{coding[=:]\e s*([-\e w.]+)}, this comment is
 processed as an encoding declaration; the first group of this
 expression names the encoding of the source code file. The recommended
 forms of this expression are
 \begin{verbatim}
 # -*- coding: <encoding-name> -*-
 \end{verbatim}
 which is recognized also by GNU Emacs, and
 \begin{verbatim}
 # vim:fileencoding=<encoding-name>
 \end{verbatim}
 which is recognized by Bram Moolenaar's VIM. In addition, if the first
 bytes of the file are the UTF-8 signature (\code{'\e xef\e xbb\e xbf'}), the
 declared file encoding is UTF-8 (this is supported, among others, by
 Microsoft's notepad.exe).
 If an encoding is declared, the encoding name must be recognized by
 Python. % XXX there should be a list of supported encodings.
 The encoding is used for all lexical analysis, in particular to find
 the end of a string, and to interpret the contents of Unicode literals.
 String literals are converted to Unicode for syntactical analysis,
 then converted back to their original encoding before interpretation
 starts.
 \subsection{Explicit line joining\label{explicit-joining}}
 Two or more physical lines may be joined into logical lines using
--- a/Grammar/Grammar
+++ b/Grammar/Grammar
@ -102,3 +102,6 @@ list_for: 'for' exprlist 'in' testlist_safe [list_iter]
 list_if: 'if' test [list_iter]
 testlist1: test (',' test)*
 # not used in grammar, but may appear in "node" passed from Parser to Compiler
 encoding_decl: NAME
--- a/Include/errcode.h
+++ b/Include/errcode.h
@ -25,6 +25,7 @@ extern "C" {
 #define E_OVERFLOW      19	/* Node had too many children */
 #define E_TOODEEP	20	/* Too many indentation levels */
 #define E_DEDENT	21	/* No matching outer block for dedent */
 #define E_DECODE	22	/* Error in decoding into Unicode */
 #ifdef __cplusplus
 }
--- a/Include/graminit.h
+++ b/Include/graminit.h
@ -65,3 +65,4 @@
 #define list_for 320
 #define list_if 321
 #define testlist1 322
 #define encoding_decl 323
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@ -190,15 +190,15 @@ POBJS=		\
 		Parser/node.o \
 		Parser/parser.o \
 		Parser/parsetok.o \
 		Parser/tokenizer.o \
 		Parser/bitset.o \
 		Parser/metagrammar.o
-PARSER_OBJS=	$(POBJS) Parser/myreadline.o
+PARSER_OBJS=	$(POBJS) Parser/myreadline.o Parser/tokenizer.o
 PGOBJS=		\
 		Objects/obmalloc.o \
 		Python/mysnprintf.o \
 		Parser/tokenizer_pgen.o \
 		Parser/firstsets.o \
 		Parser/grammar.o \
 		Parser/pgen.o \
@ -434,6 +434,8 @@ Parser/grammar.o:	$(srcdir)/Parser/grammar.c \
 				$(srcdir)/Include/grammar.h
 Parser/metagrammar.o:	$(srcdir)/Parser/metagrammar.c
 Parser/tokenizer_pgen.o:	$(srcdir)/Parser/tokenizer.c
 Python/compile.o Python/symtable.o: $(GRAMMAR_H)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -6,6 +6,8 @@ Type/class unification and new-style classes
 Core and builtins
 - Encoding declarations (PEP 263, phase 1) have been implemented.
 - list.sort() has a new implementation.  While cross-platform results
  may vary, and in data-dependent ways, this is much faster on many
  kinds of partially ordered lists than the previous implementation,
--- a/Parser/parsetok.c
+++ b/Parser/parsetok.c
@ -8,6 +8,7 @@
 #include "parser.h"
 #include "parsetok.h"
 #include "errcode.h"
 #include "graminit.h"
 int Py_TabcheckFlag;
@ -45,8 +46,8 @@ PyParser_ParseStringFlagsFilename(char *s, char *filename,
 		return NULL;
 	}
        tok->filename = filename ? filename : "<string>";
 	if (Py_TabcheckFlag || Py_VerboseFlag) {
 		tok->filename = filename ? filename : "<string>";
 		tok->altwarning = (tok->filename != NULL);
 		if (Py_TabcheckFlag >= 2)
 			tok->alterror++;
@ -78,8 +79,8 @@ PyParser_ParseFileFlags(FILE *fp, char *filename, grammar *g, int start,
 		err_ret->error = E_NOMEM;
 		return NULL;
 	}
 	tok->filename = filename;
 	if (Py_TabcheckFlag || Py_VerboseFlag) {
 		tok->filename = filename;
 		tok->altwarning = (filename != NULL);
 		if (Py_TabcheckFlag >= 2)
 			tok->alterror++;
@ -185,6 +186,13 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
 				err_ret->text[len] = '\0';
 			}
 		}
 	} else if (tok->encoding != NULL) {
 		node* r = PyNode_New(encoding_decl);
 		r->n_str = tok->encoding;
 		r->n_nchildren = 1;
 		r->n_child = n;
 		tok->encoding = NULL;
 		n = r;
 	}
 	PyTokenizer_Free(tok);
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@ -5,10 +5,19 @@
 #include "pgenheaders.h"
 #include <ctype.h>
 #include <assert.h>
 #include "tokenizer.h"
 #include "errcode.h"
 #ifndef PGEN
 #include "unicodeobject.h"
 #include "stringobject.h"
 #include "fileobject.h"
 #include "codecs.h"
 #include "abstract.h"
 #endif /* PGEN */
 extern char *PyOS_Readline(char *);
 /* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
@ -114,9 +123,416 @@ tok_new(void)
 	tok->alterror = 0;
 	tok->alttabsize = 1;
 	tok->altindstack[0] = 0;
 	tok->decoding_state = 0;
 	tok->decoding_erred = 0;
 	tok->read_coding_spec = 0;
 	tok->issued_encoding_warning = 0;
 	tok->encoding = NULL;
 	tok->decoding_readline = NULL;
 	tok->decoding_buffer = NULL;
 	return tok;
 }
 #ifdef PGEN
 /* PGEN build: no decoding is done; read one line straight from the
    tokenizer's file pointer. */
 static char *
 decoding_fgets(char *s, int size, struct tok_state *tok)
 {
 	FILE *stream = tok->fp;
 	return fgets(s, size, stream);
 }
 /* PGEN build: end-of-file is simply the stdio EOF indicator of the
    tokenizer's file pointer. */
 static int
 decoding_feof(struct tok_state *tok)
 {
 	FILE *stream = tok->fp;
 	return feof(stream);
 }
 /* PGEN build: strings are used as-is; TOK is not consulted. */
 static const char *
 decode_str(const char *str, struct tok_state *tok)
 {
 	(void)tok;
 	return str;
 }
 #else /* PGEN */
 /* Record a decoding failure on TOK, drop its line buffer, and return
    NULL so the caller can treat the failure like end-of-file.
    The buffer is released only under the same condition that
    PyTokenizer_Free uses (file-based tokenizer with a buffer). */
 static char *
 error_ret(struct tok_state *tok) /* XXX */
 {
 	int file_based = (tok->fp != NULL);
 	tok->decoding_erred = 1;
 	if (file_based && tok->buf != NULL) {
 		PyMem_DEL(tok->buf);
 	}
 	tok->buf = NULL;
 	return NULL;		/* as if it were EOF */
 }
 /* Return a freshly allocated, NUL-terminated copy of the first LEN
    bytes of S, or NULL if the allocation fails. */
 static char *
 new_string(const char *s, int len)
 {
 	char *copy = PyMem_NEW(char, len + 1);
 	if (copy == NULL)
 		return NULL;
 	memcpy(copy, s, len);
 	copy[len] = '\0';
 	return copy;
 }
 /* Map the common spellings of UTF-8 and Latin-1 onto one canonical name.
    At most the first 12 characters of S are examined, case-insensitively
    and with '_' treated as '-'.
    Returns the static string "utf-8" or "iso-8859-1" when S names (or is
    prefixed by "<name>-" of) one of those encodings; otherwise returns S
    itself, unchanged.  The caller must not free a returned static string. */
 static char *
 get_normal_name(char *s)	/* for utf-8 and latin-1 */
 {
 	char buf[13];
 	int i;
 	for (i = 0; i < 12; i++) {
 		/* Go through unsigned char: passing a negative plain char
 		   to tolower() is undefined behaviour. */
 		int c = (unsigned char)s[i];
 		if (c == '\0') break;
 		else if (c == '_') buf[i] = '-';
 		else buf[i] = (char)tolower(c);
 	}
 	buf[i] = '\0';
 	if (strcmp(buf, "utf-8") == 0 ||
 	    strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
 	else if (strcmp(buf, "latin-1") == 0 ||
 		 strcmp(buf, "iso-8859-1") == 0 ||
 		 strcmp(buf, "iso-latin-1") == 0 ||
 		 strncmp(buf, "latin-1-", 8) == 0 ||
 		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
 		 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
 	else return s;
 }
 /* Return the coding spec in S, or NULL if none is found.
    Only start positions within the first SIZE bytes of S are searched
    for "coding" followed by ':' or '=', optional spaces/tabs, and a name
    made of alphanumerics, '-', '_' and '.'.  S must be NUL-terminated,
    since the name scan stops at the first non-matching byte.
    The result is a newly allocated string holding the normalized
    encoding name; the caller owns it.  NULL is also returned if memory
    cannot be allocated.  */
 static char *
 get_coding_spec(const char *s, int size)
 {
 	int i;
 	for (i = 0; i < size - 6; i++) { /* XXX inefficient search */
 		const char* t = s + i;
 		if (strncmp(t, "coding", 6) == 0) {
 			const char* begin = NULL;
 			t += 6;
 			if (t[0] != ':' && t[0] != '=')
 				continue;
 			do {
 				t++;
 			} while (t[0] == '\x20' || t[0] == '\t');
 			begin = t;
 			/* isalnum() on a negative plain char is undefined */
 			while (isalnum((unsigned char)t[0]) ||
 			       t[0] == '-' || t[0] == '_' || t[0] == '.')
 				t++;
 			if (begin < t) {
 				char* r = new_string(begin, t - begin);
 				char* q;
 				if (r == NULL)
 					return NULL;
 				q = get_normal_name(r);
 				if (r != q) {
 					/* q is a static canonical name and may be
 					   longer than r ("latin-1" becomes
 					   "iso-8859-1"), so copy it into a fresh
 					   allocation instead of overwriting r. */
 					PyMem_DEL(r);
 					r = new_string(q, (int)strlen(q));
 				}
 				return r;
 			}
 		}
 	}
 	return NULL;
 }
 /* Check whether the line contains a coding spec. If it does,
    invoke the set_readline function for the new encoding.
    This function receives the tok_state and the new encoding.
    For "utf-8" and "iso-8859-1" the encoding is only recorded and
    set_readline is not called.  If an encoding is already recorded
    in TOK, the spec must match it.
    On success TOK takes ownership of the encoding string; in every
    other case the string is released here.
    Return 1 on success, 0 on failure.  */
 static int
 check_coding_spec(const char* line, int size, struct tok_state *tok,
 		  int set_readline(struct tok_state *, const char *))
 {
 	int r = 1;
 	char* cs = get_coding_spec(line, size);
 	if (cs != NULL) {
 		tok->read_coding_spec = 1;
 		if (tok->encoding == NULL) {
 			assert(tok->decoding_state == 1); /* raw */
 			if (strcmp(cs, "utf-8") == 0 ||
 			    strcmp(cs, "iso-8859-1") == 0) {
 				tok->encoding = cs;
 			} else {
 				r = set_readline(tok, cs);
 				if (r) {
 					tok->encoding = cs;
 					tok->decoding_state = -1;
 				} else {
 					/* nobody took ownership of cs */
 					PyMem_DEL(cs);
 				}
 			}
 		} else {	/* then, compare cs with BOM */
 			r = (strcmp(tok->encoding, cs) == 0);
 			PyMem_DEL(cs);
 		}
 	}
 	return r;
 }
 /* See whether the input starts with a UTF-8 BOM (EF BB BF).
    Bytes are read with get_char and pushed back with unget_char, so the
    same logic serves both the file-based and the string-based tokenizer.
    tok->decoding_state is set to 1 (raw) as soon as the first byte has
    been read.  If the full BOM is present it is consumed and
    tok->encoding is set to a new copy of "utf-8".
    With the UTF-16 branches compiled out (#if 0), set_readline is never
    called and the function always returns 1.
    Return 1 on success, 0 on failure.  */
 static int
 check_bom(int get_char(struct tok_state *),
 	  void unget_char(int, struct tok_state *),
 	  int set_readline(struct tok_state *, const char *),
 	  struct tok_state *tok)
 {
 	int ch = get_char(tok);
 	tok->decoding_state = 1;
 	if (ch == EOF) {
 		/* empty input: nothing to push back */
 		return 1;
 	} else if (ch == 0xEF) {
 		/* possible UTF-8 BOM: the next two bytes must be BB BF */
 		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
 		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
#if 0
 	/* Disable support for UTF-16 BOMs until a decision
 	   is made whether this needs to be supported.  */
 	} else if (ch == 0xFE) {
 		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
 		if (!set_readline(tok, "utf-16-be")) return 0;
 		tok->decoding_state = -1;
 	} else if (ch == 0xFF) {
 		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
 		if (!set_readline(tok, "utf-16-le")) return 0;
 		tok->decoding_state = -1;
#endif
 	} else {
 		/* ordinary first byte: give it back untouched */
 		unget_char(ch, tok);
 		return 1;
 	}
 	/* A complete BOM was consumed; the remaining input is UTF-8.
 	   NOTE(review): the result of new_string is not checked here. */
 	tok->encoding = new_string("utf-8", 5);	/* resulting is in utf-8 */
 	return 1;
  NON_BOM:
 	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
 	/* Only a single 0xFF is pushed back; the bytes already consumed
 	   are not restored. */
 	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */
 	return 1;
 }
 /* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.  */
 static char *
 fp_readl(char *s, int size, struct tok_state *tok)
 {
 	PyObject* utf8;
 	PyObject* buf = tok->decoding_buffer;
 	if (buf == NULL) {
 		buf = PyObject_CallObject(tok->decoding_readline, NULL);
 		if (buf == NULL) return error_ret(tok);
 	} else {
 		tok->decoding_buffer = NULL;
 	}
 	utf8 = PyUnicode_AsUTF8String(buf);
 	Py_DECREF(buf);
 	if (utf8 == NULL) return error_ret(tok);
 	else {
 		const char* str = PyString_AsString(utf8);
 		assert(strlen(str) < size); /* XXX */
 		strcpy(s, str);
 		Py_DECREF(utf8);
 		if (s[0] == '\0') return NULL; /* EOF */
 		return s;
 	}
 }
 /* Install a decoding readline on TOK.
    A file object is wrapped around tok->fp, a StreamReader for the codec
    named ENC is created on top of it, and the reader's bound "readline"
    attribute is stored in tok->decoding_readline.
    Called from check_bom and check_coding_spec.  ENC is usually the
    future value of tok->encoding, except for the (currently unsupported)
    case of UTF-16.
    Return 1 on success, 0 on failure. */
 static int
 fp_setreadl(struct tok_state *tok, const char* enc)
 {
 	PyObject *file_obj;
 	PyObject *codec_reader;
 	PyObject *bound_readline;
 	file_obj = PyFile_FromFile(tok->fp, tok->filename, "rb", NULL);
 	if (file_obj == NULL)
 		return 0;
 	codec_reader = PyCodec_StreamReader(enc, file_obj, NULL);
 	Py_DECREF(file_obj);
 	if (codec_reader == NULL)
 		return 0;
 	bound_readline = PyObject_GetAttrString(codec_reader, "readline");
 	Py_DECREF(codec_reader);
 	if (bound_readline == NULL)
 		return 0;
 	tok->decoding_readline = bound_readline;
 	return 1;
 }
 /* Read and return the next byte from TOK's file pointer
    (whatever getc returns, including EOF). */
 static int
 fp_getc(struct tok_state *tok)
 {
 	FILE *stream = tok->fp;
 	return getc(stream);
 }
 /* Push byte C back onto TOK's file pointer; the result of ungetc
    is ignored. */
 static void
 fp_ungetc(int c, struct tok_state *tok)
 {
 	FILE *stream = tok->fp;
 	(void)ungetc(c, stream);
 }
 /* Read a line of input from TOK into S (room for SIZE bytes).
    Determine encoding if necessary:
      decoding_state == 0  the start of the input is first checked for
                           a BOM, which moves the state off 0;
      decoding_state  > 0  the line is read raw from tok->fp;
      decoding_state  < 0  the line is read through the codec readline.
    While tok->lineno < 2 and no coding spec has been seen, the line is
    scanned for an encoding declaration.  For a raw read with no declared
    encoding, the first non-ASCII byte triggers a single
    DeprecationWarning per tokenizer.
    Return the line, or NULL at end of input or on error (on error
    tok->decoding_erred is set via error_ret).  */
 static char *
 decoding_fgets(char *s, int size, struct tok_state *tok)
 {
 	char *line;
 	int warn = 0, badchar = 0;
 	for (;;) {
 		if (tok->decoding_state < 0) {
 			/* We already have a codec associated with
 			   this input. */
 			line = fp_readl(s, size, tok);
 			break;
 		} else if (tok->decoding_state > 0) {
 			/* We want a 'raw' read. */
 			line = Py_UniversalNewlineFgets(s, size,
 							tok->fp, NULL);
 			warn = 1;
 			break;
 		} else {
 			/* We have not yet determined the encoding.
 			   If an encoding is found, use the file-pointer
 			   reader functions from now on. */
 			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
 				return error_ret(tok);
 			assert(tok->decoding_state != 0);
 		}
 	}
 	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
 		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
 			return error_ret(tok);
 		}
 	}
#ifndef PGEN
 	if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
 		unsigned char *c;
 		/* scan as unsigned bytes; the explicit cast is required
 		   because char * and unsigned char * are incompatible */
 		for (c = (unsigned char *)line; *c; c++)
 			if (*c > 127) {
 				badchar = *c;
 				break;
 			}
 	}
 	if (badchar) {
 		char buf[200];
 		sprintf(buf, "Non-ASCII character '\\x%.2x', "
 			"but no declared encoding", badchar);
 		PyErr_WarnExplicit(PyExc_DeprecationWarning,
 				   buf, tok->filename, tok->lineno,
 				   NULL, NULL);
 		tok->issued_encoding_warning = 1;
 	}
#endif
 	return line;
 }
 /* Report whether TOK has reached end of input.
    Without a codec (decoding_state >= 0) this is feof() on tok->fp.
    With a codec, the next line is fetched through
    tok->decoding_readline, unless one is already stashed in
    tok->decoding_buffer, and kept there for the next read; end of
    input is a line of length 0.  A failing readline call is recorded
    via error_ret and reported as end of input. */
 static int
 decoding_feof(struct tok_state *tok)
 {
 	PyObject *pending;
 	if (tok->decoding_state >= 0)
 		return feof(tok->fp);
 	pending = tok->decoding_buffer;
 	if (pending == NULL) {
 		pending = PyObject_CallObject(tok->decoding_readline, NULL);
 		if (pending == NULL) {
 			error_ret(tok);
 			return 1;
 		}
 		tok->decoding_buffer = pending;
 	}
 	return PyObject_Length(pending) == 0;
 }
 /* Fetch a byte from TOK, using the string buffer.
    The byte is returned as a value in 0..255.  Returning the plain char
    would yield negative values for bytes >= 0x80 where char is signed,
    so check_bom's comparisons against 0xEF/0xBB/0xBF could never match. */
 static int buf_getc(struct tok_state *tok) {
 	unsigned char byte = (unsigned char)*tok->str;
 	tok->str++;
 	return byte;
 }
 /* Unfetch a byte from TOK, using the string buffer.
    The buffer itself is not written; the read position is only moved
    back by one.  The assertion compares in the same 0..255 domain that
    buf_getc returns. */
 static void buf_ungetc(int c, struct tok_state *tok) {
 	tok->str--;
 	assert((unsigned char)*tok->str == c);	/* tok->cur may point to read-only segment */
 }
 /* String-based counterpart of fp_setreadl: there is no stream to wrap,
    so the encoding name ENC is only remembered in tok->enc.  The pointer
    is stored as-is (not copied).  Always returns 1. */
 static int
 buf_setreadl(struct tok_state *tok, const char* enc)
 {
 	tok->enc = enc;
 	return 1;
 }
 /* Decode the NUL-terminated C byte string STR, which is encoded with
    ENC, and return a new Python string object holding its UTF-8
    encoding.  Returns NULL if decoding or re-encoding fails. */
 static PyObject *
 translate_into_utf8(const char* str, const char* enc) {
 	PyObject *encoded = NULL;
 	PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
 	if (decoded != NULL) {
 		encoded = PyUnicode_AsUTF8String(decoded);
 		Py_DECREF(decoded);
 	}
 	return encoded;
 }
 /* Decode a byte string STR for use as the buffer of TOK.
    Look for encoding declarations inside STR, and record them
    inside TOK.
    A leading BOM is skipped, then the first two lines are searched for
    a coding spec.  If an encoding other than utf-8/iso-8859-1 is found,
    the text is translated into UTF-8.
    Returns the string to tokenize, or NULL on failure.  When a
    translation took place, the returned pointer refers to the storage
    of a Python string object kept alive in tok->decoding_buffer.  */
 static const char *
 decode_str(const char *str, struct tok_state *tok)
 {
 	PyObject* utf8 = NULL;
 	const char *s;
 	int lineno = 0;
 	tok->enc = NULL;
 	tok->str = str;
 	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
 		return NULL;
 	str = tok->str;		/* string after BOM if any */
 	assert(str);
 	if (tok->enc != NULL) {
 		utf8 = translate_into_utf8(str, tok->enc);
 		if (utf8 == NULL)
 			return NULL;
 		str = PyString_AsString(utf8);
 	}
 	/* find the end of the second line (or of the string) */
 	for (s = str;; s++) {
 		if (*s == '\0') break;
 		else if (*s == '\n') {
 			lineno++;
 			if (lineno == 2) break;
 		}
 	}
 	tok->enc = NULL;
 	if (!check_coding_spec(str, s - str, tok, buf_setreadl)) {
 		/* do not leak a translation made for the BOM */
 		Py_XDECREF(utf8);
 		return NULL;
 	}
 	if (tok->enc != NULL) {
 		assert(utf8 == NULL);
 		utf8 = translate_into_utf8(str, tok->enc);
 		if (utf8 == NULL)
 			return NULL;
 		str = PyString_AsString(utf8);
 	}
 	assert(tok->decoding_buffer == NULL);
 	tok->decoding_buffer = utf8; /* CAUTION */
 	return str;
 }
 #endif /* PGEN */
 /* Set up tokenizer for string */
@ -126,6 +542,9 @@ PyTokenizer_FromString(char *str)
 	struct tok_state *tok = tok_new();
 	if (tok == NULL)
 		return NULL;
 	str = (char *)decode_str(str, tok);
 	if (str == NULL)
 		return NULL;
 	tok->buf = tok->cur = tok->end = tok->inp = str;
 	return tok;
 }
@ -157,6 +576,10 @@ PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
 void
 PyTokenizer_Free(struct tok_state *tok)
 {
 	if (tok->encoding != NULL)
 		PyMem_DEL(tok->encoding);
 	Py_XDECREF(tok->decoding_readline);
 	Py_XDECREF(tok->decoding_buffer);
 	if (tok->fp != NULL && tok->buf != NULL)
 		PyMem_DEL(tok->buf);
 	PyMem_DEL(tok);
@ -246,8 +669,8 @@ tok_nextc(register struct tok_state *tok)
 					}
 					tok->end = tok->buf + BUFSIZ;
 				}
-				if (Py_UniversalNewlineFgets(tok->buf, (int)(tok->end - tok->buf),
+				if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
-					  tok->fp, NULL) == NULL) {
+					  tok) == NULL) {
 					tok->done = E_EOF;
 					done = 1;
 				}
@ -259,7 +682,7 @@ tok_nextc(register struct tok_state *tok)
 			}
 			else {
 				cur = tok->cur - tok->buf;
-				if (feof(tok->fp)) {
+				if (decoding_feof(tok)) {
 					tok->done = E_EOF;
 					done = 1;
 				}
@ -285,9 +708,9 @@ tok_nextc(register struct tok_state *tok)
 				tok->end = tok->buf + newsize;
 				tok->start = curstart < 0 ? NULL :
 					     tok->buf + curstart;
-				if (Py_UniversalNewlineFgets(tok->inp,
+				if (decoding_fgets(tok->inp,
 					       (int)(tok->end - tok->inp),
-					       tok->fp, NULL) == NULL) {
+					       tok) == NULL) {
 					/* Last line does not end in \n,
 					   fake one */
 					strcpy(tok->inp, "\n");
@ -506,9 +929,8 @@ indenterror(struct tok_state *tok)
 /* Get next token, after space stripping etc. */
-int
+static int
-PyTokenizer_Get(register struct tok_state *tok, char **p_start,
+tok_get(register struct tok_state *tok, char **p_start, char **p_end)
 		char **p_end)
 {
 	register int c;
 	int blankline;
@ -915,6 +1337,16 @@ PyTokenizer_Get(register struct tok_state *tok, char **p_start,
 	return PyToken_OneChar(c);
 }
 /* Public entry point: fetch the next token via tok_get.  If a decoding
    error has been recorded on TOK, the token is replaced by ERRORTOKEN
    and tok->done is set to E_DECODE. */
 int
 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
 {
 	int token = tok_get(tok, p_start, p_end);
 	if (!tok->decoding_erred)
 		return token;
 	tok->done = E_DECODE;
 	return ERRORTOKEN;
 }
 #ifdef Py_DEBUG
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@ -4,6 +4,7 @@
 extern "C" {
 #endif
 #include "object.h"
 /* Tokenizer interface */
@ -38,6 +39,16 @@ struct tok_state {
 	int alterror;	/* Issue error if alternate tabs don't match */
 	int alttabsize;	/* Alternate tab spacing */
 	int altindstack[MAXINDENT];	/* Stack of alternate indents */
 	/* Stuff for PEP 0263 */
 	int decoding_state;	/* -1:decoding, 0:init, 1:raw */
 	int decoding_erred;	/* whether erred in decoding  */
 	int read_coding_spec;	/* whether 'coding:...' has been read  */
 	int issued_encoding_warning; /* whether non-ASCII warning was issued */
 	char *encoding;
 	PyObject *decoding_readline; /* codecs.open(...).readline */
 	PyObject *decoding_buffer;
 	const char* enc;
 	const char* str;
 };
 extern struct tok_state *PyTokenizer_FromString(char *);
--- a/Parser/tokenizer_pgen.c
+++ b/Parser/tokenizer_pgen.c
@ -0,0 +1,2 @@
 #define PGEN
 #include "tokenizer.c"
--- a/Python/compile.c
+++ b/Python/compile.c
@ -485,6 +485,7 @@ struct compiling {
 	int c_closure;		/* Is nested w/freevars? */
 	struct symtable *c_symtable; /* pointer to module symbol table */
        PyFutureFeatures *c_future; /* pointer to module's __future__ */
 	char *c_encoding;	/* source encoding (a borrowed reference) */
 };
 static int
@ -1181,6 +1182,23 @@ parsenumber(struct compiling *co, char *s)
 	}
 }
 /* Consume the run of bytes with the high bit set that starts at *sPtr
    (stopping at END), decode that run as UTF-8 and return it re-encoded
    in ENCODING as a new object.  *sPtr is advanced past the run before
    decoding is attempted.  Returns NULL if decoding or encoding fails. */
 static PyObject *
 decode_utf8(char **sPtr, char *end, char* encoding)
 {
 	PyObject *unicode, *encoded;
 	char *start = *sPtr;
 	char *stop = start;
 	for (; stop < end && (*stop & 0x80); stop++)
 		;
 	*sPtr = stop;
 	unicode = PyUnicode_DecodeUTF8(start, stop - start, NULL);
 	if (unicode == NULL)
 		return NULL;
 	encoded = PyUnicode_AsEncodedString(unicode, encoding, NULL);
 	Py_DECREF(unicode);
 	return encoded;
 }
 static PyObject *
 parsestr(struct compiling *com, char *s)
 {
@ -1193,6 +1211,8 @@ parsestr(struct compiling *com, char *s)
 	int first = *s;
 	int quote = first;
 	int rawmode = 0;
 	char* encoding = ((com == NULL) ? NULL : com->c_encoding);
 	int need_encoding;
 	int unicode = 0;
 	if (isalpha(quote) || quote == '_') {
@ -1230,28 +1250,101 @@ parsestr(struct compiling *com, char *s)
 	}
 #ifdef Py_USING_UNICODE
 	if (unicode || Py_UnicodeFlag) {
 		PyObject *u, *w;
 		if (encoding == NULL) {
 			buf = s;
 			u = NULL;
 		} else if (strcmp(encoding, "iso-8859-1") == 0) {
 			buf = s;
 			u = NULL;
 		} else {
 			/* "\XX" may become "\u005c\uHHLL" (12 bytes) */
 			u = PyString_FromStringAndSize((char *)NULL, len * 4);
 			if (u == NULL)
 				return NULL;
 			p = buf = PyString_AsString(u);
 			end = s + len;
 			while (s < end) {
 				if (*s == '\\') {
 					*p++ = *s++;
 					if (*s & 0x80) {
 						strcpy(p, "u005c");
 						p += 5;
 					}
 				}
 				if (*s & 0x80) { /* XXX inefficient */
 					char *r;
 					int rn, i;
 					w = decode_utf8(&s, end, "utf-16-be");
 					if (w == NULL) {
 						Py_DECREF(u);
 						return NULL;
 					}
 					r = PyString_AsString(w);
 					rn = PyString_Size(w);
 					assert(rn % 2 == 0);
 					for (i = 0; i < rn; i += 2) {
 						sprintf(p, "\\u%02x%02x",
 							r[i + 0] & 0xFF,
 							r[i + 1] & 0xFF);
 						p += 6;
 					}
 					Py_DECREF(w);
 				} else {
 					*p++ = *s++;
 				}
 			}
 			len = p - buf;
 		}
 		if (rawmode)
-			v = PyUnicode_DecodeRawUnicodeEscape(
+			v = PyUnicode_DecodeRawUnicodeEscape(buf, len, NULL);
 				 s, len, NULL);
 		else
-			v = PyUnicode_DecodeUnicodeEscape(
+			v = PyUnicode_DecodeUnicodeEscape(buf, len, NULL);
-				s, len, NULL);
+		Py_XDECREF(u);
 		if (v == NULL)
 			PyErr_SyntaxLocation(com->c_filename, com->c_lineno);
 		return v;
 	}
 #endif
-	if (rawmode || strchr(s, '\\') == NULL)
+	need_encoding = (encoding != NULL &&
-		return PyString_FromStringAndSize(s, len);
+			 strcmp(encoding, "utf-8") != 0 &&
-	v = PyString_FromStringAndSize((char *)NULL, len);
+			 strcmp(encoding, "iso-8859-1") != 0);
 	if (rawmode || strchr(s, '\\') == NULL) {
 		if (need_encoding) {
 			PyObject* u = PyUnicode_DecodeUTF8(s, len, NULL);
 			if (u == NULL)
 				return NULL;
 			v = PyUnicode_AsEncodedString(u, encoding, NULL);
 			Py_DECREF(u);
 			return v;
 		} else {
 			return PyString_FromStringAndSize(s, len);
 		}
 	}
 	v = PyString_FromStringAndSize((char *)NULL, /* XXX 4 is enough? */
 				       need_encoding ? len * 4 : len);
 	if (v == NULL)
 		return NULL;
 	p = buf = PyString_AsString(v);
 	end = s + len;
 	while (s < end) {
 		if (*s != '\\') {
-			*p++ = *s++;
+		  ORDINAL: 
 			if (need_encoding && (*s & 0x80)) {
 				char *r;
 				int rn;
 				PyObject* w = decode_utf8(&s, end, encoding);
 				if (w == NULL)
 					return NULL;
 				r = PyString_AsString(w);
 				rn = PyString_Size(w);
 				memcpy(p, r, rn);
 				p += rn;
 				Py_DECREF(w);
 			} else {
 				*p++ = *s++;
 			}
 			continue;
 		}
 		s++;
@ -1320,8 +1413,8 @@ parsestr(struct compiling *com, char *s)
 #endif
 		default:
 			*p++ = '\\';
-			*p++ = s[-1];
+			s--;
-			break;
+			goto ORDINAL;
 		}
 	}
 	_PyString_Resize(&v, (int)(p - buf));
@ -4149,6 +4242,12 @@ jcompile(node *n, char *filename, struct compiling *base,
 	PyCodeObject *co;
 	if (!com_init(&sc, filename))
 		return NULL;
 	if (TYPE(n) == encoding_decl) {
 		sc.c_encoding = STR(n);
 		n = CHILD(n, 0);
 	} else {
 		sc.c_encoding = NULL;
 	}
 	if (base) {
 		sc.c_private = base->c_private;
 		sc.c_symtable = base->c_symtable;
@ -4157,6 +4256,10 @@ jcompile(node *n, char *filename, struct compiling *base,
 		    || (sc.c_symtable->st_cur->ste_type == TYPE_FUNCTION))
 			sc.c_nested = 1;
 		sc.c_flags |= base->c_flags & PyCF_MASK;
 		if (base->c_encoding != NULL) {
 			assert(sc.c_encoding == NULL);
 			sc.c_encoding = base->c_encoding;
 		}
 	} else {
 		sc.c_private = NULL;
 		sc.c_future = PyNode_Future(n, filename);
--- a/Python/graminit.c
+++ b/Python/graminit.c
@ -1463,7 +1463,17 @@ static state states_66[2] = {
 	{1, arcs_66_0},
 	{2, arcs_66_1},
 };
-static dfa dfas[67] = {
+static arc arcs_67_0[1] = {
 	{12, 1},
 };
 static arc arcs_67_1[1] = {
 	{0, 1},
 };
 static state states_67[2] = {
 	{1, arcs_67_0},
 	{1, arcs_67_1},
 };
 static dfa dfas[68] = {
 	{256, "single_input", 0, 3, states_0,
 	 "\004\030\001\000\000\000\124\360\213\011\162\000\002\000\140\210\244\005\001"},
 	{257, "file_input", 0, 2, states_1,
@ -1598,8 +1608,10 @@ static dfa dfas[67] = {
 	 "\000\000\000\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\000"},
 	{322, "testlist1", 0, 2, states_66,
 	 "\000\020\001\000\000\000\000\000\000\000\000\000\002\000\140\210\244\005\000"},
 	{323, "encoding_decl", 0, 2, states_67,
 	 "\000\020\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"},
 };
-static label labels[148] = {
+static label labels[149] = {
 	{0, "EMPTY"},
 	{256, 0},
 	{4, 0},
@ -1748,10 +1760,11 @@ static label labels[148] = {
 	{318, 0},
 	{319, 0},
 	{321, 0},
 	{323, 0},
 };
 grammar _PyParser_Grammar = {
-	67,
+	68,
 	dfas,
-	{148, labels},
+	{149, labels},
 	256
 };
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@ -1221,6 +1221,7 @@ static void
 err_input(perrdetail *err)
 {
 	PyObject *v, *w, *errtype;
 	PyObject* u = NULL;
 	char *msg = NULL;
 	errtype = PyExc_SyntaxError;
 	v = Py_BuildValue("(ziiz)", err->filename,
@ -1272,12 +1273,24 @@ err_input(perrdetail *err)
 		errtype = PyExc_IndentationError;
 		msg = "too many levels of indentation";
 		break;
 	case E_DECODE: {	/* XXX */
 		PyThreadState* tstate = PyThreadState_Get();
 		PyObject* value = tstate->curexc_value;
 		if (value != NULL) {
 			u = PyObject_Repr(value);
 			if (u != NULL) {
 				msg = PyString_AsString(u);
 				break;
 			}
 		}
 	}
 	default:
 		fprintf(stderr, "error=%d\n", err->error);
 		msg = "unknown parsing error";
 		break;
 	}
 	w = Py_BuildValue("(sO)", msg, v);
 	Py_XDECREF(u);
 	Py_XDECREF(v);
 	PyErr_SetObject(errtype, w);
 	Py_XDECREF(w);