diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 21e8637a7ca..4428e8cea19 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1877,6 +1877,43 @@ def test_roundtrip(self): " print('Can not import' # comment2\n)" "else: print('Loaded')\n") + self.check_roundtrip("f'\\N{EXCLAMATION MARK}'") + self.check_roundtrip(r"f'\\N{SNAKE}'") + self.check_roundtrip(r"f'\\N{{SNAKE}}'") + self.check_roundtrip(r"f'\N{SNAKE}'") + self.check_roundtrip(r"f'\\\N{SNAKE}'") + self.check_roundtrip(r"f'\\\\\N{SNAKE}'") + self.check_roundtrip(r"f'\\\\\\\N{SNAKE}'") + + self.check_roundtrip(r"f'\\N{1}'") + self.check_roundtrip(r"f'\\\\N{2}'") + self.check_roundtrip(r"f'\\\\\\N{3}'") + self.check_roundtrip(r"f'\\\\\\\\N{4}'") + + self.check_roundtrip(r"f'\\N{{'") + self.check_roundtrip(r"f'\\\\N{{'") + self.check_roundtrip(r"f'\\\\\\N{{'") + self.check_roundtrip(r"f'\\\\\\\\N{{'") + cases = [ + """ +if 1: + "foo" +"bar" +""", + """ +if 1: + ("foo" + "bar") +""", + """ +if 1: + "foo" + "bar" +""" ] + for case in cases: + self.check_roundtrip(case) + + def test_continuation(self): # Balancing continuation self.check_roundtrip("a = (3,4, \n" @@ -1911,9 +1948,6 @@ def test_random_files(self): tempdir = os.path.dirname(__file__) or os.curdir testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py")) - # TODO: Remove this once we can untokenize PEP 701 syntax - testfiles.remove(os.path.join(tempdir, "test_fstring.py")) - if not support.is_resource_enabled("cpu"): testfiles = random.sample(testfiles, 10) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 0ab1893d42f..7f418bb7a1b 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -168,6 +168,7 @@ def __init__(self): self.tokens = [] self.prev_row = 1 self.prev_col = 0 + self.prev_type = None self.encoding = None def add_whitespace(self, start): @@ -183,6 +184,29 @@ def add_whitespace(self, start): if col_offset: self.tokens.append(" " * col_offset) + def escape_brackets(self, token): + characters = [] + consume_until_next_bracket = False + for character in token: + if character == "}": + if consume_until_next_bracket: + consume_until_next_bracket = False + else: + characters.append(character) + if character == "{": + n_backslashes = sum( + 1 for char in _itertools.takewhile( + "\\".__eq__, + characters[-2::-1] + ) + ) + if n_backslashes % 2 == 0: + characters.append(character) + else: + consume_until_next_bracket = True + characters.append(character) + return "".join(characters) + def untokenize(self, iterable): it = iter(iterable) indents = [] @@ -214,11 +238,13 @@ def untokenize(self, iterable): startline = False elif tok_type == FSTRING_MIDDLE: if '{' in token or '}' in token: + token = self.escape_brackets(token) + last_line = token.splitlines()[-1] end_line, end_col = end - end = (end_line, end_col + token.count('{') + token.count('}')) - token = re.sub('{', '{{', token) - token = re.sub('}', '}}', token) - + extra_chars = last_line.count("{{") + last_line.count("}}") + end = (end_line, end_col + extra_chars) + elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END): + self.tokens.append(" ") self.add_whitespace(start) self.tokens.append(token) @@ -226,6 +252,7 @@ def untokenize(self, iterable): if tok_type in (NEWLINE, NL): self.prev_row += 1 self.prev_col = 0 + self.prev_type = tok_type return "".join(self.tokens) def compat(self, token, iterable): @@ -233,6 +260,7 @@ def compat(self, token, iterable): toks_append = self.tokens.append startline = token[0] in (NEWLINE, NL) prevstring = False + in_fstring = 0 for tok in _itertools.chain([token], iterable): toknum, tokval = tok[:2] @@ -251,6 +279,10 @@ def compat(self, token, iterable): else: prevstring = False + if toknum == FSTRING_START: + in_fstring += 1 + elif toknum == FSTRING_END: + in_fstring -= 1 if toknum == INDENT: indents.append(tokval) continue @@ -263,11 +295,18 @@ def compat(self, token, iterable): toks_append(indents[-1]) startline = False elif toknum == FSTRING_MIDDLE: - if '{' in tokval or '}' in tokval: - tokval = re.sub('{', '{{', tokval) - tokval = re.sub('}', '}}', tokval) + tokval = self.escape_brackets(tokval) + + # Insert a space between two consecutive brackets if we are in an f-string + if tokval in {"{", "}"} and self.tokens and self.tokens[-1] == tokval and in_fstring: + tokval = ' ' + tokval + + # Insert a space between two consecutive f-strings + if toknum in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END): + self.tokens.append(" ") toks_append(tokval) + self.prev_type = toknum def untokenize(iterable): diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-02-08-16-01-18.gh-issue-115154.ji96FV.rst b/Misc/NEWS.d/next/Core and Builtins/2024-02-08-16-01-18.gh-issue-115154.ji96FV.rst new file mode 100644 index 00000000000..045596bfcdc --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2024-02-08-16-01-18.gh-issue-115154.ji96FV.rst @@ -0,0 +1,2 @@ +Fix a bug that was causing the :func:`tokenize.untokenize` function to +handle unicode named literals incorrectly. Patch by Pablo Galindo