lib2to3.pgen2.driver.load_grammar() now creates a stable cache file
between runs given the same Grammar.txt input, regardless of the hash randomization setting.
commit 0c578d62fc
6 changed files with 116 additions and 16 deletions
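Background: before this change, Grammar.dump() pickled raw dicts, and on the interpreters this commit targets (pre-3.6 dict semantics) the iteration order of string-keyed dicts depends on the hash randomization seed, so two runs could write byte-different cache files from identical input. A minimal sketch of the effect, hypothetical and not part of the commit (on 3.7+, where dicts are insertion-ordered, the two outputs will match):

    import os
    import subprocess
    import sys

    # The child process pickles the same dict; only the hash seed differs.
    CHILD = """
    import pickle, sys
    d = {name: i for i, name in enumerate(["if", "else", "while", "for"])}
    sys.stdout.buffer.write(pickle.dumps(d, 2))
    """

    def pickled_bytes(seed):
        env = dict(os.environ, PYTHONHASHSEED=seed)
        return subprocess.check_output([sys.executable, "-c", CHILD], env=env)

    # On pre-3.6 interpreters this can print False across seeds; sorting the
    # keys before pickling, as this commit does, makes it True unconditionally.
    print(pickled_bytes("1") == pickled_bytes("2"))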
Lib/lib2to3/pgen2/driver.py
@@ -106,16 +106,19 @@ def parse_string(self, text, debug=False):
         return self.parse_tokens(tokens, debug)
 
 
+def _generate_pickle_name(gt):
+    head, tail = os.path.splitext(gt)
+    if tail == ".txt":
+        tail = ""
+    return head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
+
+
 def load_grammar(gt="Grammar.txt", gp=None,
                  save=True, force=False, logger=None):
     """Load the grammar (maybe from a pickle)."""
     if logger is None:
         logger = logging.getLogger()
-    if gp is None:
-        head, tail = os.path.splitext(gt)
-        if tail == ".txt":
-            tail = ""
-        gp = head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
+    gp = _generate_pickle_name(gt) if gp is None else gp
     if force or not _newer(gp, gt):
         logger.info("Generating grammar tables from %s", gt)
         g = pgen.generate_grammar(gt)
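With the helper factored out, the cache file name embeds the full sys.version_info, so pickles written by different interpreter versions never collide. An illustrative session (the exact name depends on the running interpreter; shown here for a hypothetical 3.6.0a4 build):

    >>> from lib2to3.pgen2 import driver
    >>> driver._generate_pickle_name("Grammar.txt")   # the ".txt" suffix is dropped
    'Grammar3.6.0.alpha.4.pickle'
    >>> driver._generate_pickle_name("PatternGrammar.txt")
    'PatternGrammar3.6.0.alpha.4.pickle'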
@@ -124,7 +127,7 @@ def load_grammar(gt="Grammar.txt", gp=None,
             try:
                 g.dump(gp)
             except OSError as e:
-                logger.info("Writing failed:"+str(e))
+                logger.info("Writing failed: %s", e)
     else:
         g = grammar.Grammar()
         g.load(gp)
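The logging call also moves from eager string concatenation to logging's deferred %-style arguments, so the message is only formatted if a handler actually emits the record. A small standalone comparison:

    import logging

    logging.basicConfig(level=logging.WARNING)   # INFO records are dropped
    log = logging.getLogger("pgen2.driver")
    err = OSError("read-only file system")

    log.info("Writing failed:" + str(err))   # old style: string built regardless
    log.info("Writing failed: %s", err)      # new style: formatting deferred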
Lib/lib2to3/pgen2/grammar.py
@@ -13,6 +13,7 @@
 """
 
 # Python imports
+import collections
 import pickle
 
 # Local imports
@@ -85,9 +86,21 @@ def __init__(self):
         self.start = 256
 
     def dump(self, filename):
-        """Dump the grammar tables to a pickle file."""
+        """Dump the grammar tables to a pickle file.
+
+        dump() recursively changes all dict to OrderedDict, so the pickled file
+        is not exactly the same as what was passed in to dump(). load() uses the
+        pickled file to create the tables, but only changes OrderedDict to dict
+        at the top level; it does not recursively change OrderedDict to dict.
+        So, the loaded tables are different from the original tables that were
+        passed to load() in that some of the OrderedDict (from the pickled file)
+        are not changed back to dict. For parsing, this has no effect on
+        performance because OrderedDict uses dict's __getitem__ with nothing in
+        between.
+        """
         with open(filename, "wb") as f:
-            pickle.dump(self.__dict__, f, 2)
+            d = _make_deterministic(self.__dict__)
+            pickle.dump(d, f, 2)
 
     def load(self, filename):
         """Load the grammar tables from a pickle file."""
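The docstring's caveat can be seen directly: pickling the normalized tables and loading them back yields OrderedDict instances in place of the original nested dicts, which is harmless for lookups since OrderedDict is a dict subclass. A quick illustration using the _make_deterministic helper added in the next hunk (hypothetical table contents):

    import pickle
    from lib2to3.pgen2 import grammar

    tables = {"keywords": {"if": 1, "while": 2}, "start": 256}
    d = grammar._make_deterministic(tables)
    restored = pickle.loads(pickle.dumps(d, 2))
    # Nested mappings come back as OrderedDict; __getitem__ is unchanged.
    print(type(restored["keywords"]).__name__)   # OrderedDict
    print(restored["keywords"]["if"])            # 1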
@@ -124,6 +137,17 @@ def report(self):
         print("start", self.start)
 
 
+def _make_deterministic(top):
+    if isinstance(top, dict):
+        return collections.OrderedDict(
+            sorted(((k, _make_deterministic(v)) for k, v in top.items())))
+    if isinstance(top, list):
+        return [_make_deterministic(e) for e in top]
+    if isinstance(top, tuple):
+        return tuple(_make_deterministic(e) for e in top)
+    return top
+
+
 # Map from operator to number (since tokenize doesn't do this)
 
 opmap_raw = """
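_make_deterministic normalizes every mapping into a key-sorted OrderedDict, recursing through lists and tuples, so structurally equal tables always pickle to identical bytes no matter how their keys were inserted or hashed. A sketch:

    import pickle
    from lib2to3.pgen2 import grammar

    a = {"while": 2, "if": 1}
    b = {"if": 1, "while": 2}    # same mapping, different insertion order
    print(pickle.dumps(grammar._make_deterministic(a), 2) ==
          pickle.dumps(grammar._make_deterministic(b), 2))   # True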
Lib/lib2to3/pgen2/pgen.py
@@ -39,7 +39,7 @@ def make_grammar(self):
             states = []
             for state in dfa:
                 arcs = []
-                for label, next in state.arcs.items():
+                for label, next in sorted(state.arcs.items()):
                     arcs.append((self.make_label(c, label), dfa.index(next)))
                 if state.isfinal:
                     arcs.append((0, dfa.index(state)))
@@ -52,7 +52,7 @@ def make_grammar(self):
     def make_first(self, c, name):
         rawfirst = self.first[name]
         first = {}
-        for label in rawfirst:
+        for label in sorted(rawfirst):
             ilabel = self.make_label(c, label)
             ##assert ilabel not in first # XXX failed on <> ... !=
             first[ilabel] = 1
@@ -192,7 +192,7 @@ def addclosure(state, base):
             for label, next in nfastate.arcs:
                 if label is not None:
                     addclosure(next, arcs.setdefault(label, {}))
-        for label, nfaset in arcs.items():
+        for label, nfaset in sorted(arcs.items()):
             for st in states:
                 if st.nfaset == nfaset:
                     break
@@ -222,7 +222,7 @@ def dump_dfa(self, name, dfa):
         print("Dump of DFA for", name)
         for i, state in enumerate(dfa):
             print("  State", i, state.isfinal and "(final)" or "")
-            for label, next in state.arcs.items():
+            for label, next in sorted(state.arcs.items()):
                 print("    %s -> %d" % (label, dfa.index(next)))
 
     def simplify_dfa(self, dfa):
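Each pgen loop above swaps raw dict iteration for sorted(), fixing a single canonical order for table construction. The pattern in isolation, with a hypothetical arc table:

    arcs = {"expr": 3, "atom": 1, "term": 2}
    # .items() order varies with the dict implementation and hash seed;
    # sorted() pins it to lexicographic key order: atom, expr, term.
    for label, next_state in sorted(arcs.items()):
        print("%s -> %d" % (label, next_state))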
Lib/lib2to3/tests/support.py
@@ -9,13 +9,13 @@
 
 # Local imports
 from lib2to3 import pytree, refactor
-from lib2to3.pgen2 import driver
+from lib2to3.pgen2 import driver as pgen2_driver
 
 test_dir = os.path.dirname(__file__)
 proj_dir = os.path.normpath(os.path.join(test_dir, ".."))
 grammar_path = os.path.join(test_dir, "..", "Grammar.txt")
-grammar = driver.load_grammar(grammar_path)
-driver = driver.Driver(grammar, convert=pytree.convert)
+grammar = pgen2_driver.load_grammar(grammar_path)
+driver = pgen2_driver.Driver(grammar, convert=pytree.convert)
 
 def parse_string(string):
     return driver.parse_string(reformat(string), debug=True)
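The module is imported under the alias pgen2_driver because support.py rebinds the bare name driver to a Driver instance; without the alias, any later use of the module would hit the shadowed name. An illustration of the collision the rename avoids (assuming a Grammar.txt next to the script):

    from lib2to3.pgen2 import driver

    grammar = driver.load_grammar("Grammar.txt")   # driver is still the module here
    driver = driver.Driver(grammar)                # rebinds the name to an instance...
    # driver.load_grammar(...)   # ...so this would now raise AttributeError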
Lib/lib2to3/tests/test_parser.py
@@ -15,11 +15,15 @@
 
 # Python imports
 import os
+import shutil
+import subprocess
+import sys
+import tempfile
 import unittest
 import warnings
-import subprocess
 
 # Local imports
+from lib2to3.pgen2 import driver as pgen2_driver
 from lib2to3.pgen2 import tokenize
 from ..pgen2.parse import ParseError
 from lib2to3.pygram import python_symbols as syms
@@ -34,6 +38,71 @@ def test_formfeed(self):
         self.assertEqual(t.children[1].children[0].type, syms.print_stmt)
 
 
+class TestPgen2Caching(support.TestCase):
+    def test_load_grammar_from_txt_file(self):
+        pgen2_driver.load_grammar(support.grammar_path, save=False, force=True)
+
+    def test_load_grammar_from_pickle(self):
+        # Make a copy of the grammar file in a temp directory we are
+        # guaranteed to be able to write to.
+        tmpdir = tempfile.mkdtemp()
+        try:
+            grammar_copy = os.path.join(
+                    tmpdir, os.path.basename(support.grammar_path))
+            shutil.copy(support.grammar_path, grammar_copy)
+            pickle_name = pgen2_driver._generate_pickle_name(grammar_copy)
+
+            pgen2_driver.load_grammar(grammar_copy, save=True, force=True)
+            self.assertTrue(os.path.exists(pickle_name))
+
+            os.unlink(grammar_copy)  # Only the pickle remains...
+            pgen2_driver.load_grammar(grammar_copy, save=False, force=False)
+        finally:
+            shutil.rmtree(tmpdir)
+
+    @unittest.skipIf(sys.executable is None, 'sys.executable required')
+    def test_load_grammar_from_subprocess(self):
+        tmpdir = tempfile.mkdtemp()
+        tmpsubdir = os.path.join(tmpdir, 'subdir')
+        try:
+            os.mkdir(tmpsubdir)
+            grammar_base = os.path.basename(support.grammar_path)
+            grammar_copy = os.path.join(tmpdir, grammar_base)
+            grammar_sub_copy = os.path.join(tmpsubdir, grammar_base)
+            shutil.copy(support.grammar_path, grammar_copy)
+            shutil.copy(support.grammar_path, grammar_sub_copy)
+            pickle_name = pgen2_driver._generate_pickle_name(grammar_copy)
+            pickle_sub_name = pgen2_driver._generate_pickle_name(
+                    grammar_sub_copy)
+            self.assertNotEqual(pickle_name, pickle_sub_name)
+
+            # Generate a pickle file from this process.
+            pgen2_driver.load_grammar(grammar_copy, save=True, force=True)
+            self.assertTrue(os.path.exists(pickle_name))
+
+            # Generate a new pickle file in a subprocess with a most likely
+            # different hash randomization seed.
+            sub_env = dict(os.environ)
+            sub_env['PYTHONHASHSEED'] = 'random'
+            subprocess.check_call(
+                    [sys.executable, '-c', """
+from lib2to3.pgen2 import driver as pgen2_driver
+pgen2_driver.load_grammar(%r, save=True, force=True)
+                    """ % (grammar_sub_copy,)],
+                    env=sub_env)
+            self.assertTrue(os.path.exists(pickle_sub_name))
+
+            with open(pickle_name, 'rb') as pickle_f_1, \
+                    open(pickle_sub_name, 'rb') as pickle_f_2:
+                self.assertEqual(
+                    pickle_f_1.read(), pickle_f_2.read(),
+                    msg='Grammar caches generated using different hash seeds'
+                        ' were not identical.')
+        finally:
+            shutil.rmtree(tmpdir)
+
+
 class GrammarTest(support.TestCase):
     def validate(self, code):
         support.parse_string(code)
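The new caching tests can be exercised directly through unittest; one plausible invocation from a checkout with this patch applied (discovery details vary by Python version):

    import unittest
    from lib2to3.tests import test_parser

    suite = unittest.defaultTestLoader.loadTestsFromName(
        "TestPgen2Caching", module=test_parser)
    unittest.TextTestRunner(verbosity=2).run(suite)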
Misc/NEWS
@@ -99,6 +99,10 @@ Core and Builtins
 Library
 -------
 
+- lib2to3.pgen2.driver.load_grammar() now creates a stable cache file
+  between runs given the same Grammar.txt input regardless of the hash
+  randomization setting.
+
 - Issue #28005: Allow ImportErrors in encoding implementation to propagate.
 
 - Issue #27570: Avoid zero-length memcpy() etc calls with null source