[3.12] gh-125553: Fix backslash continuation in untokenize (GH-126010) (#130579)

(cherry picked from commit 7ad793e5db)

Co-authored-by: Tomas R. <tomas.roun8@gmail.com>
Hugo van Kemenade 2025-02-27 23:57:13 +02:00 committed by GitHub
parent 245ca2607c
commit b8f2ff08f1
3 changed files with 53 additions and 11 deletions

Lib/test/test_tokenize.py

@@ -1,20 +1,20 @@
from test import support
from test.support import os_helper
import os
import re
import token
import unittest
from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer, generate_tokens,
                      NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
                      TokenError)
from io import BytesIO, StringIO
import unittest
from textwrap import dedent
from unittest import TestCase, mock
from test import support
from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
                               INVALID_UNDERSCORE_LITERALS)
from test.support import os_helper
from test.support.script_helper import run_test_script, make_script, run_python_until_end
import os
import token

# Converts a source string into a list of textual representation
# of the tokens such as:
@@ -1816,6 +1816,22 @@ def test_iter_compat(self):
        self.assertEqual(untokenize(iter(tokens)), b'Hello ')


def contains_ambiguous_backslash(source):
    """Return `True` if the source contains a backslash on a
    line by itself. For example:

    a = (1
         \\
    )

    Code like this cannot be untokenized exactly. This is because
    the tokenizer does not produce any tokens for the line containing
    the backslash and so there is no way to know its indent.
    """
    pattern = re.compile(br'\n\s*\\\r?\n')
    return pattern.search(source) is not None


class TestRoundtrip(TestCase):

    def check_roundtrip(self, f):
@@ -1826,6 +1842,9 @@ def check_roundtrip(self, f):
        tokenize.untokenize(), and the latter tokenized again to 2-tuples.
        The test fails if the 3 pair tokenizations do not match.

        If the source code can be untokenized unambiguously, the
        untokenized code must match the original code exactly.

        When untokenize bugs are fixed, untokenize with 5-tuples should
        reproduce code that does not contain a backslash continuation
        following spaces. A proper test should test this.
@@ -1849,6 +1868,13 @@ def check_roundtrip(self, f):
        tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
        self.assertEqual(tokens2_from5, tokens2)

        if not contains_ambiguous_backslash(code):
            # The BOM does not produce a token so there is no way to preserve it.
            code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
            readline = iter(code_without_bom.splitlines(keepends=True)).__next__
            untokenized_code = untokenize(tokenize(readline))
            self.assertEqual(code_without_bom, untokenized_code)

    def check_line_extraction(self, f):
        if isinstance(f, str):
            code = f.encode('utf-8')
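
Aside: a minimal sketch (not part of the diff; the ambiguous/plain sample
strings are invented for illustration) of what the new
contains_ambiguous_backslash helper flags:

import re

# Same pattern as the helper above: a line holding only optional
# whitespace and a backslash produces no token, so its indent is
# unrecoverable after tokenization.
pattern = re.compile(br'\n\s*\\\r?\n')

ambiguous = b"a = (1\n     \\\n)\n"   # backslash alone on its line
plain = b"a = 1 + \\\n    2\n"        # backslash follows real code

print(pattern.search(ambiguous) is not None)  # True  -> exact round-trip skipped
print(pattern.search(plain) is not None)      # False -> exact match is required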

Lib/tokenize.py

@@ -171,6 +171,7 @@ def __init__(self):
        self.prev_row = 1
        self.prev_col = 0
        self.prev_type = None
        self.prev_line = ""
        self.encoding = None

    def add_whitespace(self, start):
@@ -178,14 +179,28 @@ def add_whitespace(self, start):
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        self.add_backslash_continuation(start)
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def add_backslash_continuation(self, start):
        """Add backslash continuation characters if the row has increased
        without encountering a newline token.

        This also inserts the correct amount of whitespace before the backslash.
        """
        row = start[0]
        row_offset = row - self.prev_row
        if row_offset == 0:
            return

        newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n'
        line = self.prev_line.rstrip('\\\r\n')
        ws = ''.join(_itertools.takewhile(str.isspace, reversed(line)))
        self.tokens.append(ws + f"\\{newline}" * row_offset)
        self.prev_col = 0

    def escape_brackets(self, token):
        characters = []
        consume_until_next_bracket = False
@@ -245,8 +260,6 @@ def untokenize(self, iterable):
                    end_line, end_col = end
                    extra_chars = last_line.count("{{") + last_line.count("}}")
                    end = (end_line, end_col + extra_chars)
            elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
                self.tokens.append(" ")

            self.add_whitespace(start)
            self.tokens.append(token)
@@ -255,6 +268,7 @@ def untokenize(self, iterable):
                self.prev_row += 1
                self.prev_col = 0
            self.prev_type = tok_type
            self.prev_line = line
        return "".join(self.tokens)

    def compat(self, token, iterable):
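
Aside: the heart of add_backslash_continuation is recovering the whitespace
that preceded the original backslash and matching the source's newline style.
A standalone sketch of just that step (plain itertools stands in for the
module's _itertools alias; prev_line is an invented sample):

import itertools

prev_line = "result = value + \\\r\n"   # hypothetical previous physical line
newline = '\r\n' if prev_line.endswith('\r\n') else '\n'
line = prev_line.rstrip('\\\r\n')       # strips the backslash and CRLF
ws = ''.join(itertools.takewhile(str.isspace, reversed(line)))
print(repr(ws + "\\" + newline))        # ' \\\r\n': space and CRLF preserved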

Misc/NEWS.d entry

@@ -0,0 +1,2 @@
Fix round-trip invariance for backslash continuations in
:func:`tokenize.untokenize`.
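
With the fix applied, the invariance can be checked directly; a minimal
sketch mirroring the readline pattern used by the new test (should hold on
interpreters carrying this backport):

from tokenize import tokenize, untokenize

code = b"a = 1 + \\\n    2\n"   # no lone-backslash line, so unambiguous
readline = iter(code.splitlines(keepends=True)).__next__
assert untokenize(tokenize(readline)) == code  # exact round trip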