[3.12] gh-125553: Fix backslash continuation in untokenize (GH-126010) (#130579)

(cherry picked from commit 7ad793e5db)

Co-authored-by: Tomas R. <tomas.roun8@gmail.com>
Hugo van Kemenade 2025-02-27 23:57:13 +02:00 committed by GitHub
parent 245ca2607c
commit b8f2ff08f1
3 changed files with 53 additions and 11 deletions

Lib/test/test_tokenize.py

@@ -1,20 +1,20 @@
from test import support
from test.support import os_helper
import os
import re
import token
import unittest
from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer, generate_tokens,
                      NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
                      TokenError)
from io import BytesIO, StringIO
import unittest
from textwrap import dedent
from unittest import TestCase, mock
from test import support
from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
                               INVALID_UNDERSCORE_LITERALS)
from test.support import os_helper
from test.support.script_helper import run_test_script, make_script, run_python_until_end
import os
import token

# Converts a source string into a list of textual representation
# of the tokens such as:
@@ -1816,6 +1816,22 @@ def test_iter_compat(self):
        self.assertEqual(untokenize(iter(tokens)), b'Hello ')


def contains_ambiguous_backslash(source):
    """Return `True` if the source contains a backslash on a
    line by itself. For example:

    a = (1
         \\
    )

    Code like this cannot be untokenized exactly. This is because
    the tokenizer does not produce any tokens for the line containing
    the backslash and so there is no way to know its indent.
    """
    pattern = re.compile(br'\n\s*\\\r?\n')
    return pattern.search(source) is not None


class TestRoundtrip(TestCase):

    def check_roundtrip(self, f):
@@ -1826,6 +1842,9 @@ def check_roundtrip(self, f):
        tokenize.untokenize(), and the latter tokenized again to 2-tuples.
        The test fails if the 3 pair tokenizations do not match.

        If the source code can be untokenized unambiguously, the
        untokenized code must match the original code exactly.

        When untokenize bugs are fixed, untokenize with 5-tuples should
        reproduce code that does not contain a backslash continuation
        following spaces. A proper test should test this.
@@ -1849,6 +1868,13 @@ def check_roundtrip(self, f):
        tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
        self.assertEqual(tokens2_from5, tokens2)

        if not contains_ambiguous_backslash(code):
            # The BOM does not produce a token so there is no way to preserve it.
            code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
            readline = iter(code_without_bom.splitlines(keepends=True)).__next__
            untokenized_code = untokenize(tokenize(readline))
            self.assertEqual(code_without_bom, untokenized_code)

    def check_line_extraction(self, f):
        if isinstance(f, str):
            code = f.encode('utf-8')
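
Aside: a minimal sketch (not part of the diff; the ambiguous/plain sample
strings are invented for illustration) of what the new
contains_ambiguous_backslash helper flags:

import re

# Same pattern as the helper above: a line holding only optional
# whitespace and a backslash produces no token, so its indent is
# unrecoverable after tokenization.
pattern = re.compile(br'\n\s*\\\r?\n')

ambiguous = b"a = (1\n     \\\n)\n"   # backslash alone on its line
plain = b"a = 1 + \\\n    2\n"        # backslash follows real code

print(pattern.search(ambiguous) is not None)  # True  -> exact round-trip skipped
print(pattern.search(plain) is not None)      # False -> exact match is required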

Lib/tokenize.py

@@ -171,6 +171,7 @@ def __init__(self):
        self.prev_row = 1
        self.prev_col = 0
        self.prev_type = None
        self.prev_line = ""
        self.encoding = None

    def add_whitespace(self, start):
@@ -178,14 +179,28 @@ def add_whitespace(self, start):
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        self.add_backslash_continuation(start)
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def add_backslash_continuation(self, start):
        """Add backslash continuation characters if the row has increased
        without encountering a newline token.

        This also inserts the correct amount of whitespace before the backslash.
        """
        row = start[0]
        row_offset = row - self.prev_row
        if row_offset == 0:
            return

        newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n'
        line = self.prev_line.rstrip('\\\r\n')
        ws = ''.join(_itertools.takewhile(str.isspace, reversed(line)))
        self.tokens.append(ws + f"\\{newline}" * row_offset)
        self.prev_col = 0

    def escape_brackets(self, token):
        characters = []
        consume_until_next_bracket = False
@@ -245,8 +260,6 @@ def untokenize(self, iterable):
                    end_line, end_col = end
                    extra_chars = last_line.count("{{") + last_line.count("}}")
                    end = (end_line, end_col + extra_chars)
            elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
                self.tokens.append(" ")

            self.add_whitespace(start)
            self.tokens.append(token)
@@ -255,6 +268,7 @@ def untokenize(self, iterable):
                self.prev_row += 1
                self.prev_col = 0
            self.prev_type = tok_type
            self.prev_line = line
        return "".join(self.tokens)

    def compat(self, token, iterable):
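
Aside: the heart of add_backslash_continuation is recovering the whitespace
that preceded the original backslash and matching the source's newline style.
A standalone sketch of just that step (plain itertools stands in for the
module's _itertools alias; prev_line is an invented sample):

import itertools

prev_line = "result = value + \\\r\n"   # hypothetical previous physical line
newline = '\r\n' if prev_line.endswith('\r\n') else '\n'
line = prev_line.rstrip('\\\r\n')       # strips the backslash and CRLF
ws = ''.join(itertools.takewhile(str.isspace, reversed(line)))
print(repr(ws + "\\" + newline))        # ' \\\r\n': space and CRLF preserved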

Misc/NEWS.d entry

@@ -0,0 +1,2 @@
Fix round-trip invariance for backslash continuations in
:func:`tokenize.untokenize`.
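
With the fix applied, the invariance can be checked directly; a minimal
sketch mirroring the readline pattern used by the new test (should hold on
interpreters carrying this backport):

from tokenize import tokenize, untokenize

code = b"a = 1 + \\\n    2\n"   # no lone-backslash line, so unambiguous
readline = iter(code.splitlines(keepends=True)).__next__
assert untokenize(tokenize(readline)) == code  # exact round trip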