lib2to3.pgen2.driver.load_grammar() now creates a stable cache file

between runs given the same Grammar.txt input regardless of the hash
randomization setting.
This commit is contained in:
Gregory P. Smith [Google Inc.] 2016-09-08 00:46:26 +00:00
commit 0c578d62fc
6 changed files with 116 additions and 16 deletions

View file

@ -106,16 +106,19 @@ def parse_string(self, text, debug=False):
return self.parse_tokens(tokens, debug) return self.parse_tokens(tokens, debug)
def _generate_pickle_name(gt):
    """Return the pickle cache filename derived from grammar path *gt*.

    A trailing ".txt" extension is dropped, then the running
    interpreter's full version info is appended so caches produced by
    different Python versions never collide (e.g. "Grammar.txt" ->
    "Grammar3.6.0.final.0.pickle").
    """
    head, tail = os.path.splitext(gt)
    if tail == ".txt":
        tail = ""
    # sys.version_info joined with "." yields e.g. "3.6.0.final.0".
    return head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
def load_grammar(gt="Grammar.txt", gp=None, def load_grammar(gt="Grammar.txt", gp=None,
save=True, force=False, logger=None): save=True, force=False, logger=None):
"""Load the grammar (maybe from a pickle).""" """Load the grammar (maybe from a pickle)."""
if logger is None: if logger is None:
logger = logging.getLogger() logger = logging.getLogger()
if gp is None: gp = _generate_pickle_name(gt) if gp is None else gp
head, tail = os.path.splitext(gt)
if tail == ".txt":
tail = ""
gp = head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
if force or not _newer(gp, gt): if force or not _newer(gp, gt):
logger.info("Generating grammar tables from %s", gt) logger.info("Generating grammar tables from %s", gt)
g = pgen.generate_grammar(gt) g = pgen.generate_grammar(gt)
@ -124,7 +127,7 @@ def load_grammar(gt="Grammar.txt", gp=None,
try: try:
g.dump(gp) g.dump(gp)
except OSError as e: except OSError as e:
logger.info("Writing failed:"+str(e)) logger.info("Writing failed: %s", e)
else: else:
g = grammar.Grammar() g = grammar.Grammar()
g.load(gp) g.load(gp)

View file

@ -13,6 +13,7 @@
""" """
# Python imports # Python imports
import collections
import pickle import pickle
# Local imports # Local imports
@ -85,9 +86,21 @@ def __init__(self):
self.start = 256 self.start = 256
def dump(self, filename): def dump(self, filename):
"""Dump the grammar tables to a pickle file.""" """Dump the grammar tables to a pickle file.
dump() recursively changes all dict to OrderedDict, so the pickled file
is not exactly the same as what was passed in to dump(). load() uses the
pickled file to create the tables, but only changes OrderedDict to dict
at the top level; it does not recursively change OrderedDict to dict.
So, the loaded tables are different from the original tables that were
passed to load() in that some of the OrderedDict (from the pickled file)
are not changed back to dict. For parsing, this has no effect on
performance because OrderedDict uses dict's __getitem__ with nothing in
between.
"""
with open(filename, "wb") as f: with open(filename, "wb") as f:
pickle.dump(self.__dict__, f, 2) d = _make_deterministic(self.__dict__)
pickle.dump(d, f, 2)
def load(self, filename): def load(self, filename):
"""Load the grammar tables from a pickle file.""" """Load the grammar tables from a pickle file."""
@ -124,6 +137,17 @@ def report(self):
print("start", self.start) print("start", self.start)
def _make_deterministic(top):
    """Recursively replace every dict in *top* with a key-sorted OrderedDict.

    Sorting by key makes the pickled grammar tables byte-for-byte stable
    across runs, independent of the hash randomization seed.  Lists and
    tuples are rebuilt with converted elements; any other value is
    returned unchanged.
    """
    if isinstance(top, dict):
        return collections.OrderedDict(
            sorted(((k, _make_deterministic(v)) for k, v in top.items())))
    if isinstance(top, list):
        return [_make_deterministic(e) for e in top]
    if isinstance(top, tuple):
        return tuple(_make_deterministic(e) for e in top)
    return top
# Map from operator to number (since tokenize doesn't do this) # Map from operator to number (since tokenize doesn't do this)
opmap_raw = """ opmap_raw = """

View file

@ -39,7 +39,7 @@ def make_grammar(self):
states = [] states = []
for state in dfa: for state in dfa:
arcs = [] arcs = []
for label, next in state.arcs.items(): for label, next in sorted(state.arcs.items()):
arcs.append((self.make_label(c, label), dfa.index(next))) arcs.append((self.make_label(c, label), dfa.index(next)))
if state.isfinal: if state.isfinal:
arcs.append((0, dfa.index(state))) arcs.append((0, dfa.index(state)))
@ -52,7 +52,7 @@ def make_grammar(self):
def make_first(self, c, name): def make_first(self, c, name):
rawfirst = self.first[name] rawfirst = self.first[name]
first = {} first = {}
for label in rawfirst: for label in sorted(rawfirst):
ilabel = self.make_label(c, label) ilabel = self.make_label(c, label)
##assert ilabel not in first # XXX failed on <> ... != ##assert ilabel not in first # XXX failed on <> ... !=
first[ilabel] = 1 first[ilabel] = 1
@ -192,7 +192,7 @@ def addclosure(state, base):
for label, next in nfastate.arcs: for label, next in nfastate.arcs:
if label is not None: if label is not None:
addclosure(next, arcs.setdefault(label, {})) addclosure(next, arcs.setdefault(label, {}))
for label, nfaset in arcs.items(): for label, nfaset in sorted(arcs.items()):
for st in states: for st in states:
if st.nfaset == nfaset: if st.nfaset == nfaset:
break break
@ -222,7 +222,7 @@ def dump_dfa(self, name, dfa):
print("Dump of DFA for", name) print("Dump of DFA for", name)
for i, state in enumerate(dfa): for i, state in enumerate(dfa):
print(" State", i, state.isfinal and "(final)" or "") print(" State", i, state.isfinal and "(final)" or "")
for label, next in state.arcs.items(): for label, next in sorted(state.arcs.items()):
print(" %s -> %d" % (label, dfa.index(next))) print(" %s -> %d" % (label, dfa.index(next)))
def simplify_dfa(self, dfa): def simplify_dfa(self, dfa):

View file

@ -9,13 +9,13 @@
# Local imports # Local imports
from lib2to3 import pytree, refactor from lib2to3 import pytree, refactor
from lib2to3.pgen2 import driver from lib2to3.pgen2 import driver as pgen2_driver
test_dir = os.path.dirname(__file__) test_dir = os.path.dirname(__file__)
proj_dir = os.path.normpath(os.path.join(test_dir, "..")) proj_dir = os.path.normpath(os.path.join(test_dir, ".."))
grammar_path = os.path.join(test_dir, "..", "Grammar.txt") grammar_path = os.path.join(test_dir, "..", "Grammar.txt")
grammar = driver.load_grammar(grammar_path) grammar = pgen2_driver.load_grammar(grammar_path)
driver = driver.Driver(grammar, convert=pytree.convert) driver = pgen2_driver.Driver(grammar, convert=pytree.convert)
def parse_string(string): def parse_string(string):
return driver.parse_string(reformat(string), debug=True) return driver.parse_string(reformat(string), debug=True)

View file

@ -15,11 +15,15 @@
# Python imports # Python imports
import os import os
import shutil
import subprocess
import sys
import tempfile
import unittest import unittest
import warnings import warnings
import subprocess
# Local imports # Local imports
from lib2to3.pgen2 import driver as pgen2_driver
from lib2to3.pgen2 import tokenize from lib2to3.pgen2 import tokenize
from ..pgen2.parse import ParseError from ..pgen2.parse import ParseError
from lib2to3.pygram import python_symbols as syms from lib2to3.pygram import python_symbols as syms
@ -34,6 +38,71 @@ def test_formfeed(self):
self.assertEqual(t.children[1].children[0].type, syms.print_stmt) self.assertEqual(t.children[1].children[0].type, syms.print_stmt)
class TestPgen2Caching(support.TestCase):
    """Tests for the grammar pickle cache written by pgen2's driver."""

    def test_load_grammar_from_txt_file(self):
        # force=True + save=False: parse the grammar text without
        # touching any cache file.
        pgen2_driver.load_grammar(support.grammar_path, save=False, force=True)

    def test_load_grammar_from_pickle(self):
        """A freshly written pickle alone can satisfy a later load."""
        # Make a copy of the grammar file in a temp directory we are
        # guaranteed to be able to write to.
        tmpdir = tempfile.mkdtemp()
        try:
            grammar_copy = os.path.join(
                tmpdir, os.path.basename(support.grammar_path))
            shutil.copy(support.grammar_path, grammar_copy)
            pickle_name = pgen2_driver._generate_pickle_name(grammar_copy)

            # First load regenerates and saves the pickle cache.
            pgen2_driver.load_grammar(grammar_copy, save=True, force=True)
            self.assertTrue(os.path.exists(pickle_name))

            os.unlink(grammar_copy)  # Only the pickle remains...
            # Second load must succeed from the pickle, with no .txt file.
            pgen2_driver.load_grammar(grammar_copy, save=False, force=False)
        finally:
            shutil.rmtree(tmpdir)

    @unittest.skipIf(sys.executable is None, 'sys.executable required')
    def test_load_grammar_from_subprocess(self):
        """Caches written under different hash seeds are byte-identical."""
        tmpdir = tempfile.mkdtemp()
        tmpsubdir = os.path.join(tmpdir, 'subdir')
        try:
            os.mkdir(tmpsubdir)
            grammar_base = os.path.basename(support.grammar_path)
            grammar_copy = os.path.join(tmpdir, grammar_base)
            grammar_sub_copy = os.path.join(tmpsubdir, grammar_base)
            shutil.copy(support.grammar_path, grammar_copy)
            shutil.copy(support.grammar_path, grammar_sub_copy)
            pickle_name = pgen2_driver._generate_pickle_name(grammar_copy)
            pickle_sub_name = pgen2_driver._generate_pickle_name(
                grammar_sub_copy)
            # Distinct paths so the two caches cannot overwrite each other.
            self.assertNotEqual(pickle_name, pickle_sub_name)

            # Generate a pickle file from this process.
            pgen2_driver.load_grammar(grammar_copy, save=True, force=True)
            self.assertTrue(os.path.exists(pickle_name))

            # Generate a new pickle file in a subprocess with a most likely
            # different hash randomization seed.
            sub_env = dict(os.environ)
            sub_env['PYTHONHASHSEED'] = 'random'
            subprocess.check_call(
                [sys.executable, '-c', """
from lib2to3.pgen2 import driver as pgen2_driver
pgen2_driver.load_grammar(%r, save=True, force=True)
""" % (grammar_sub_copy,)],
                env=sub_env)
            self.assertTrue(os.path.exists(pickle_sub_name))

            # The stability guarantee: both caches are byte-for-byte equal.
            with open(pickle_name, 'rb') as pickle_f_1, \
                open(pickle_sub_name, 'rb') as pickle_f_2:
                self.assertEqual(
                    pickle_f_1.read(), pickle_f_2.read(),
                    msg='Grammar caches generated using different hash seeds'
                    ' were not identical.')
        finally:
            shutil.rmtree(tmpdir)
class GrammarTest(support.TestCase): class GrammarTest(support.TestCase):
def validate(self, code): def validate(self, code):
support.parse_string(code) support.parse_string(code)

View file

@ -99,6 +99,10 @@ Core and Builtins
Library Library
------- -------
- lib2to3.pgen2.driver.load_grammar() now creates a stable cache file
between runs given the same Grammar.txt input regardless of the hash
randomization setting.
- Issue #28005: Allow ImportErrors in encoding implementation to propagate. - Issue #28005: Allow ImportErrors in encoding implementation to propagate.
- Issue #27570: Avoid zero-length memcpy() etc calls with null source - Issue #27570: Avoid zero-length memcpy() etc calls with null source