mirror of
https://github.com/python/cpython.git
synced 2026-04-15 08:11:10 +00:00
The behaviour of Cut in nested parentheses, Repeat, Opt, and similar is somewhat chaotic. Apparently even the academic papers on PEG aren't as clear as they could be. And it doesn't really matter. Python only uses top-level cuts. When that changes, we can clarify as much as necessary (and even change the implementation to make sense for what we'll need). Document that this is deliberately unspecified, and add a test to make sure any decision is deliberate, tested and documented.
1182 lines
40 KiB
Python
1182 lines
40 KiB
Python
import ast
import difflib
import io
import textwrap
import unittest

from test import test_tools
from typing import Dict, Any, List, Type
from tokenize import TokenInfo, NAME, NEWLINE, NUMBER, OP

test_tools.skip_if_missing("peg_generator")
with test_tools.imports_under_tool("peg_generator"):
    from pegen.grammar_parser import GeneratedParser as GrammarParser
    from pegen.testutil import parse_string, generate_parser, make_parser
    from pegen.grammar import GrammarVisitor, GrammarError, Grammar
    from pegen.grammar_visualizer import ASTGrammarPrinter
    from pegen.parser import Parser
    from pegen.parser_generator import compute_nullables, compute_left_recursives
    from pegen.python_generator import PythonParserGenerator
|
|
|
|
|
|
class TestPegen(unittest.TestCase):
    # Tests for parsing pegen grammars and for the parsers generated from them.
    # Grammar sources are indented triple-quoted strings; parse_string() and
    # make_parser() are relied upon to cope with that common indentation.

    def _generate_with_unreachable_marker(self, source: str) -> str:
        # Generate Python parser code for *source* using a recognisable
        # unreachable_formatting string and return the generated text.
        grammar = parse_string(source, GrammarParser)
        out = io.StringIO()
        genr = PythonParserGenerator(
            grammar, out, unreachable_formatting="This is a test"
        )
        genr.generate("<string>")
        return out.getvalue()

    def test_parse_grammar(self) -> None:
        # str() of a grammar drops variable names and actions.
        grammar_source = """
        start: sum NEWLINE
        sum: t1=term '+' t2=term { action } | term
        term: NUMBER
        """
        expected = """
        start: sum NEWLINE
        sum: term '+' term | term
        term: NUMBER
        """
        grammar: Grammar = parse_string(grammar_source, GrammarParser)
        rules = grammar.rules
        self.assertEqual(str(grammar), textwrap.dedent(expected).strip())
        # Check the str() and repr() of a few rules; AST nodes don't support ==.
        self.assertEqual(str(rules["start"]), "start: sum NEWLINE")
        self.assertEqual(str(rules["sum"]), "sum: term '+' term | term")
        expected_repr = (
            "Rule('term', None, Rhs([Alt([NamedItem(None, NameLeaf('NUMBER'))])]))"
        )
        self.assertEqual(repr(rules["term"]), expected_repr)

    def test_repeated_rules(self) -> None:
        # Defining the same rule name twice is a grammar error.
        grammar_source = """
        start: the_rule NEWLINE
        the_rule: 'b' NEWLINE
        the_rule: 'a' NEWLINE
        """
        with self.assertRaisesRegex(GrammarError, "Repeated rule 'the_rule'"):
            parse_string(grammar_source, GrammarParser)

    def test_long_rule_str(self) -> None:
        # A rule with many alternatives is rendered one alternative per line.
        grammar_source = """
        start: zero | one | one zero | one one | one zero zero | one zero one | one one zero | one one one
        """
        expected = """
        start:
            | zero
            | one
            | one zero
            | one one
            | one zero zero
            | one zero one
            | one one zero
            | one one one
        """
        grammar: Grammar = parse_string(grammar_source, GrammarParser)
        self.assertEqual(str(grammar.rules["start"]), textwrap.dedent(expected).strip())

    def test_typed_rules(self) -> None:
        # The [type] annotation shows up in repr() but not in str().
        grammar = """
        start[int]: sum NEWLINE
        sum[int]: t1=term '+' t2=term { action } | term
        term[int]: NUMBER
        """
        rules = parse_string(grammar, GrammarParser).rules
        # Check the str() and repr() of a few rules; AST nodes don't support ==.
        self.assertEqual(str(rules["start"]), "start: sum NEWLINE")
        self.assertEqual(str(rules["sum"]), "sum: term '+' term | term")
        self.assertEqual(
            repr(rules["term"]),
            "Rule('term', 'int', Rhs([Alt([NamedItem(None, NameLeaf('NUMBER'))])]))",
        )

    def test_gather(self) -> None:
        # The gather operator (sep.elem+) returns the elements without separators.
        grammar = """
        start: ','.thing+ NEWLINE
        thing: NUMBER
        """
        rules = parse_string(grammar, GrammarParser).rules
        self.assertEqual(str(rules["start"]), "start: ','.thing+ NEWLINE")
        self.assertStartsWith(repr(rules["start"]),
            "Rule('start', None, Rhs([Alt([NamedItem(None, Gather(StringLeaf(\"','\"), NameLeaf('thing'"
        )
        self.assertEqual(str(rules["thing"]), "thing: NUMBER")
        parser_class = make_parser(grammar)

        # Single element: previously parsed but the result was discarded.
        # NOTE(review): expected value inferred by analogy with the
        # two-element case below -- confirm against a real run.
        source = "42\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                [TokenInfo(NUMBER, string="42", start=(1, 0), end=(1, 2), line=source)],
                TokenInfo(NEWLINE, string="\n", start=(1, 2), end=(1, 3), line=source),
            ],
        )

        source = "1, 2\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                [
                    TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=source),
                    TokenInfo(NUMBER, string="2", start=(1, 3), end=(1, 4), line=source),
                ],
                TokenInfo(NEWLINE, string="\n", start=(1, 4), end=(1, 5), line=source),
            ],
        )

    def test_expr_grammar(self) -> None:
        # With no action, a single-item alternative returns that item directly.
        grammar = """
        start: sum NEWLINE
        sum: term '+' term | term
        term: NUMBER
        """
        parser_class = make_parser(grammar)
        source = "42\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                TokenInfo(NUMBER, string="42", start=(1, 0), end=(1, 2), line=source),
                TokenInfo(NEWLINE, string="\n", start=(1, 2), end=(1, 3), line=source),
            ],
        )

    def test_optional_operator(self) -> None:
        # (...)? yields the group's result when present and None when absent.
        grammar = """
        start: sum NEWLINE
        sum: term ('+' term)?
        term: NUMBER
        """
        parser_class = make_parser(grammar)
        source = "1 + 2\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                [
                    TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=source),
                    [
                        TokenInfo(OP, string="+", start=(1, 2), end=(1, 3), line=source),
                        TokenInfo(NUMBER, string="2", start=(1, 4), end=(1, 5), line=source),
                    ],
                ],
                TokenInfo(NEWLINE, string="\n", start=(1, 5), end=(1, 6), line=source),
            ],
        )
        source = "1\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                [
                    TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=source),
                    None,
                ],
                TokenInfo(NEWLINE, string="\n", start=(1, 1), end=(1, 2), line=source),
            ],
        )

    def test_optional_literal(self) -> None:
        # '?' applied directly to a string literal.
        grammar = """
        start: sum NEWLINE
        sum: term '+' ?
        term: NUMBER
        """
        parser_class = make_parser(grammar)
        source = "1+\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                [
                    TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=source),
                    TokenInfo(OP, string="+", start=(1, 1), end=(1, 2), line=source),
                ],
                TokenInfo(NEWLINE, string="\n", start=(1, 2), end=(1, 3), line=source),
            ],
        )
        source = "1\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                [
                    TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=source),
                    None,
                ],
                TokenInfo(NEWLINE, string="\n", start=(1, 1), end=(1, 2), line=source),
            ],
        )

    def test_alt_optional_operator(self) -> None:
        # [...] is the alternative spelling of (...)? and gives the same trees.
        grammar = """
        start: sum NEWLINE
        sum: term ['+' term]
        term: NUMBER
        """
        parser_class = make_parser(grammar)
        source = "1 + 2\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                [
                    TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=source),
                    [
                        TokenInfo(OP, string="+", start=(1, 2), end=(1, 3), line=source),
                        TokenInfo(NUMBER, string="2", start=(1, 4), end=(1, 5), line=source),
                    ],
                ],
                TokenInfo(NEWLINE, string="\n", start=(1, 5), end=(1, 6), line=source),
            ],
        )
        source = "1\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                [
                    TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=source),
                    None,
                ],
                TokenInfo(NEWLINE, string="\n", start=(1, 1), end=(1, 2), line=source),
            ],
        )

    def test_repeat_0_simple(self) -> None:
        # item* yields a (possibly empty) list.
        grammar = """
        start: thing thing* NEWLINE
        thing: NUMBER
        """
        parser_class = make_parser(grammar)
        source = "1 2 3\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=source),
                [
                    TokenInfo(NUMBER, string="2", start=(1, 2), end=(1, 3), line=source),
                    TokenInfo(NUMBER, string="3", start=(1, 4), end=(1, 5), line=source),
                ],
                TokenInfo(NEWLINE, string="\n", start=(1, 5), end=(1, 6), line=source),
            ],
        )
        source = "1\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=source),
                [],
                TokenInfo(NEWLINE, string="\n", start=(1, 1), end=(1, 2), line=source),
            ],
        )

    def test_repeat_0_complex(self) -> None:
        # (group)* yields a list with one sub-list per repetition.
        grammar = """
        start: term ('+' term)* NEWLINE
        term: NUMBER
        """
        parser_class = make_parser(grammar)
        source = "1 + 2 + 3\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=source),
                [
                    [
                        TokenInfo(OP, string="+", start=(1, 2), end=(1, 3), line=source),
                        TokenInfo(NUMBER, string="2", start=(1, 4), end=(1, 5), line=source),
                    ],
                    [
                        TokenInfo(OP, string="+", start=(1, 6), end=(1, 7), line=source),
                        TokenInfo(NUMBER, string="3", start=(1, 8), end=(1, 9), line=source),
                    ],
                ],
                TokenInfo(NEWLINE, string="\n", start=(1, 9), end=(1, 10), line=source),
            ],
        )

    def test_repeat_1_simple(self) -> None:
        # item+ needs at least one match; otherwise parsing fails.
        grammar = """
        start: thing thing+ NEWLINE
        thing: NUMBER
        """
        parser_class = make_parser(grammar)
        source = "1 2 3\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=source),
                [
                    TokenInfo(NUMBER, string="2", start=(1, 2), end=(1, 3), line=source),
                    TokenInfo(NUMBER, string="3", start=(1, 4), end=(1, 5), line=source),
                ],
                TokenInfo(NEWLINE, string="\n", start=(1, 5), end=(1, 6), line=source),
            ],
        )
        with self.assertRaises(SyntaxError):
            parse_string("1\n", parser_class)

    def test_repeat_1_complex(self) -> None:
        # (group)+ needs at least one repetition; otherwise parsing fails.
        grammar = """
        start: term ('+' term)+ NEWLINE
        term: NUMBER
        """
        parser_class = make_parser(grammar)
        source = "1 + 2 + 3\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=source),
                [
                    [
                        TokenInfo(OP, string="+", start=(1, 2), end=(1, 3), line=source),
                        TokenInfo(NUMBER, string="2", start=(1, 4), end=(1, 5), line=source),
                    ],
                    [
                        TokenInfo(OP, string="+", start=(1, 6), end=(1, 7), line=source),
                        TokenInfo(NUMBER, string="3", start=(1, 8), end=(1, 9), line=source),
                    ],
                ],
                TokenInfo(NEWLINE, string="\n", start=(1, 9), end=(1, 10), line=source),
            ],
        )
        with self.assertRaises(SyntaxError):
            parse_string("1\n", parser_class)

    def test_repeat_with_sep_simple(self) -> None:
        # Gather with three elements.
        grammar = """
        start: ','.thing+ NEWLINE
        thing: NUMBER
        """
        parser_class = make_parser(grammar)
        source = "1, 2, 3\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                [
                    TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=source),
                    TokenInfo(NUMBER, string="2", start=(1, 3), end=(1, 4), line=source),
                    TokenInfo(NUMBER, string="3", start=(1, 6), end=(1, 7), line=source),
                ],
                TokenInfo(NEWLINE, string="\n", start=(1, 7), end=(1, 8), line=source),
            ],
        )

    def test_left_recursive(self) -> None:
        # Only `expr` is flagged left-recursive; the parse tree nests to the left.
        grammar_source = """
        start: expr NEWLINE
        expr: ('-' term | expr '+' term | term)
        term: NUMBER
        foo: NAME+
        bar: NAME*
        baz: NAME?
        """
        grammar: Grammar = parse_string(grammar_source, GrammarParser)
        parser_class = generate_parser(grammar)
        rules = grammar.rules
        self.assertFalse(rules["start"].left_recursive)
        self.assertTrue(rules["expr"].left_recursive)
        self.assertFalse(rules["term"].left_recursive)
        self.assertFalse(rules["foo"].left_recursive)
        self.assertFalse(rules["bar"].left_recursive)
        self.assertFalse(rules["baz"].left_recursive)
        source = "1 + 2 + 3\n"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                [
                    [
                        TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=source),
                        TokenInfo(OP, string="+", start=(1, 2), end=(1, 3), line=source),
                        TokenInfo(NUMBER, string="2", start=(1, 4), end=(1, 5), line=source),
                    ],
                    TokenInfo(OP, string="+", start=(1, 6), end=(1, 7), line=source),
                    TokenInfo(NUMBER, string="3", start=(1, 8), end=(1, 9), line=source),
                ],
                TokenInfo(NEWLINE, string="\n", start=(1, 9), end=(1, 10), line=source),
            ],
        )

    def test_python_expr(self) -> None:
        # Actions build a real ast.Expression that can be compiled and evaluated.
        grammar = """
        start: expr NEWLINE? $ { ast.Expression(expr) }
        expr: ( expr '+' term { ast.BinOp(expr, ast.Add(), term, lineno=expr.lineno, col_offset=expr.col_offset, end_lineno=term.end_lineno, end_col_offset=term.end_col_offset) }
            | expr '-' term { ast.BinOp(expr, ast.Sub(), term, lineno=expr.lineno, col_offset=expr.col_offset, end_lineno=term.end_lineno, end_col_offset=term.end_col_offset) }
            | term { term }
            )
        term: ( l=term '*' r=factor { ast.BinOp(l, ast.Mult(), r, lineno=l.lineno, col_offset=l.col_offset, end_lineno=r.end_lineno, end_col_offset=r.end_col_offset) }
            | l=term '/' r=factor { ast.BinOp(l, ast.Div(), r, lineno=l.lineno, col_offset=l.col_offset, end_lineno=r.end_lineno, end_col_offset=r.end_col_offset) }
            | factor { factor }
            )
        factor: ( '(' expr ')' { expr }
                | atom { atom }
                )
        atom: ( n=NAME { ast.Name(id=n.string, ctx=ast.Load(), lineno=n.start[0], col_offset=n.start[1], end_lineno=n.end[0], end_col_offset=n.end[1]) }
              | n=NUMBER { ast.Constant(value=ast.literal_eval(n.string), lineno=n.start[0], col_offset=n.start[1], end_lineno=n.end[0], end_col_offset=n.end[1]) }
              )
        """
        parser_class = make_parser(grammar)
        node = parse_string("(1 + 2*3 + 5)/(6 - 2)\n", parser_class)
        code = compile(node, "", "eval")
        val = eval(code)
        self.assertEqual(val, 3.0)

    def test_f_string_in_action(self) -> None:
        # An f-string is accepted inside a grammar action.
        grammar = """
        start: n=NAME NEWLINE? $ { f"name -> {n.string}" }
        """
        parser_class = make_parser(grammar)
        node = parse_string("a", parser_class)
        self.assertEqual(node.strip(), "name -> a")

    def test_nullable(self) -> None:
        # A rule consisting of an optional group is nullable; its user is not.
        grammar_source = """
        start: sign NUMBER
        sign: ['-' | '+']
        """
        grammar: Grammar = parse_string(grammar_source, GrammarParser)
        rules = grammar.rules
        nullables = compute_nullables(rules)
        self.assertNotIn(rules["start"], nullables)  # Not None!
        self.assertIn(rules["sign"], nullables)

    def test_advanced_left_recursive(self) -> None:
        # `start` is left-recursive through the nullable `sign` rule.
        grammar_source = """
        start: NUMBER | sign start
        sign: ['-']
        """
        grammar: Grammar = parse_string(grammar_source, GrammarParser)
        rules = grammar.rules
        nullables = compute_nullables(rules)
        compute_left_recursives(rules)
        self.assertNotIn(rules["start"], nullables)  # Not None!
        self.assertIn(rules["sign"], nullables)
        self.assertTrue(rules["start"].left_recursive)
        self.assertFalse(rules["sign"].left_recursive)

    def test_mutually_left_recursive(self) -> None:
        # `foo` and `bar` refer to each other in leading position.
        grammar_source = """
        start: foo 'E'
        foo: bar 'A' | 'B'
        bar: foo 'C' | 'D'
        """
        grammar: Grammar = parse_string(grammar_source, GrammarParser)
        out = io.StringIO()
        genr = PythonParserGenerator(grammar, out)
        rules = grammar.rules
        self.assertFalse(rules["start"].left_recursive)
        self.assertTrue(rules["foo"].left_recursive)
        self.assertTrue(rules["bar"].left_recursive)
        genr.generate("<string>")
        ns: Dict[str, Any] = {}
        # The executed text is the parser source generated just above.
        exec(out.getvalue(), ns)
        parser_class: Type[Parser] = ns["GeneratedParser"]

        source = "D A C A E"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                [
                    [
                        [
                            TokenInfo(type=NAME, string="D", start=(1, 0), end=(1, 1), line=source),
                            TokenInfo(type=NAME, string="A", start=(1, 2), end=(1, 3), line=source),
                        ],
                        TokenInfo(type=NAME, string="C", start=(1, 4), end=(1, 5), line=source),
                    ],
                    TokenInfo(type=NAME, string="A", start=(1, 6), end=(1, 7), line=source),
                ],
                TokenInfo(type=NAME, string="E", start=(1, 8), end=(1, 9), line=source),
            ],
        )

        source = "B C A E"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                [
                    [
                        TokenInfo(type=NAME, string="B", start=(1, 0), end=(1, 1), line=source),
                        TokenInfo(type=NAME, string="C", start=(1, 2), end=(1, 3), line=source),
                    ],
                    TokenInfo(type=NAME, string="A", start=(1, 4), end=(1, 5), line=source),
                ],
                TokenInfo(type=NAME, string="E", start=(1, 6), end=(1, 7), line=source),
            ],
        )

    def test_nasty_mutually_left_recursive(self) -> None:
        # This grammar does not recognize 'x - + =', much to my chagrin.
        # But that's the way PEG works.
        # [Breathlessly]
        # The problem is that the toplevel target call
        # recurses into maybe, which recognizes 'x - +',
        # and then the toplevel target looks for another '+',
        # which fails, so it retreats to NAME,
        # which succeeds, so we end up just recognizing 'x',
        # and then start fails because there's no '=' after that.
        grammar_source = """
        start: target '='
        target: maybe '+' | NAME
        maybe: maybe '-' | target
        """
        grammar: Grammar = parse_string(grammar_source, GrammarParser)
        out = io.StringIO()
        genr = PythonParserGenerator(grammar, out)
        genr.generate("<string>")
        ns: Dict[str, Any] = {}
        exec(out.getvalue(), ns)
        parser_class = ns["GeneratedParser"]
        with self.assertRaises(SyntaxError):
            parse_string("x - + =", parser_class)

    def test_lookahead(self) -> None:
        # Positive (&) and negative (!) lookaheads consume no input: the
        # trailing '.' is required by &'.' but is absent from the result.
        grammar = """
        start: (expr_stmt | assign_stmt) &'.'
        expr_stmt: !(target '=') expr
        assign_stmt: target '=' expr
        expr: term ('+' term)*
        target: NAME
        term: NUMBER
        """
        parser_class = make_parser(grammar)
        source = "foo = 12 + 12 ."
        node = parse_string(source, parser_class)
        self.maxDiff = None
        self.assertEqual(
            node,
            [
                TokenInfo(NAME, string="foo", start=(1, 0), end=(1, 3), line=source),
                TokenInfo(OP, string="=", start=(1, 4), end=(1, 5), line=source),
                [
                    TokenInfo(NUMBER, string="12", start=(1, 6), end=(1, 8), line=source),
                    [
                        [
                            TokenInfo(OP, string="+", start=(1, 9), end=(1, 10), line=source),
                            TokenInfo(
                                NUMBER, string="12", start=(1, 11), end=(1, 13), line=source
                            ),
                        ]
                    ],
                ],
            ],
        )

    def test_named_lookahead_error(self) -> None:
        # Binding a name to a lookahead is rejected when the grammar is parsed.
        grammar = """
        start: foo=!'x' NAME
        """
        with self.assertRaises(SyntaxError):
            make_parser(grammar)

    def test_start_leader(self) -> None:
        grammar = """
        start: attr | NAME
        attr: start '.' NAME
        """
        # Would assert False without a special case in compute_left_recursives().
        make_parser(grammar)

    def test_opt_sequence(self) -> None:
        grammar = """
        start: [NAME*]
        """
        # This case was failing because of a double trailing comma at the end
        # of a line in the generated source. See bpo-41044
        make_parser(grammar)

    def test_left_recursion_too_complex(self) -> None:
        # Three mutually left-recursive rules cannot be handled by the generator.
        grammar = """
        start: foo
        foo: bar '+' | baz '+' | '+'
        bar: baz '-' | foo '-' | '-'
        baz: foo '*' | bar '*' | '*'
        """
        with self.assertRaises(ValueError) as errinfo:
            make_parser(grammar)
        # The check must run after the with-block (statements following the
        # raising call inside it never execute), and ValueError has no
        # `.value` attribute, so inspect the exception's own message.
        self.assertIn("no leader", str(errinfo.exception))

    def test_cut(self) -> None:
        # The cut (~) itself contributes nothing to the resulting node list.
        grammar = """
        start: '(' ~ expr ')'
        expr: NUMBER
        """
        parser_class = make_parser(grammar)
        source = "(1)"
        node = parse_string(source, parser_class)
        self.assertEqual(
            node,
            [
                TokenInfo(OP, string="(", start=(1, 0), end=(1, 1), line=source),
                TokenInfo(NUMBER, string="1", start=(1, 1), end=(1, 2), line=source),
                TokenInfo(OP, string=")", start=(1, 2), end=(1, 3), line=source),
            ],
        )

    def test_cut_is_local_in_rule(self) -> None:
        # A cut inside `inner` must not stop `start` from trying its next
        # alternative.
        grammar = """
        start:
            | inner
            | 'x' { "ok" }
        inner:
            | 'x' ~ 'y'
            | 'x'
        """
        parser_class = make_parser(grammar)
        node = parse_string("x", parser_class)
        self.assertEqual(node, 'ok')

    def test_cut_is_local_in_parens(self) -> None:
        # we currently don't guarantee this behavior, see gh-143054
        grammar = """
        start:
            | ('x' ~ 'y' | 'x')
            | 'x' { "ok" }
        """
        parser_class = make_parser(grammar)
        node = parse_string("x", parser_class)
        self.assertEqual(node, 'ok')

    def test_dangling_reference(self) -> None:
        # `bar` is referenced but never defined.
        grammar = """
        start: foo ENDMARKER
        foo: bar NAME
        """
        with self.assertRaises(GrammarError):
            make_parser(grammar)

    def test_bad_token_reference(self) -> None:
        # `NAMEE` is neither a rule nor a known token.
        grammar = """
        start: foo
        foo: NAMEE
        """
        with self.assertRaises(GrammarError):
            make_parser(grammar)

    def test_missing_start(self) -> None:
        # A grammar without a `start` rule is rejected.
        grammar = """
        foo: NAME
        """
        with self.assertRaises(GrammarError):
            make_parser(grammar)

    def test_invalid_rule_name(self) -> None:
        grammar = """
        start: _a b
        _a: 'a'
        b: 'b'
        """
        with self.assertRaisesRegex(GrammarError, "cannot start with underscore: '_a'"):
            make_parser(grammar)

    def test_invalid_variable_name(self) -> None:
        grammar = """
        start: a b
        a: _x='a'
        b: 'b'
        """
        with self.assertRaisesRegex(GrammarError, "cannot start with underscore: '_x'"):
            make_parser(grammar)

    def test_invalid_variable_name_in_temporal_rule(self) -> None:
        # Same check as above, with the bad name inside a parenthesised group.
        grammar = """
        start: a b
        a: (_x='a' | 'b') | 'c'
        b: 'b'
        """
        with self.assertRaisesRegex(GrammarError, "cannot start with underscore: '_x'"):
            make_parser(grammar)

    def test_soft_keyword(self) -> None:
        # Double-quoted literals are soft keywords: "number" and "string" also
        # match the generic SOFT_KEYWORD alternative when followed by a NAME.
        grammar = """
        start:
            | "number" n=NUMBER { eval(n.string) }
            | "string" n=STRING { n.string }
            | SOFT_KEYWORD l=NAME n=(NUMBER | NAME | STRING) { l.string + " = " + n.string }
        """
        parser_class = make_parser(grammar)
        self.assertEqual(parse_string("number 1", parser_class), 1)
        self.assertEqual(parse_string("string 'b'", parser_class), "'b'")
        self.assertEqual(
            parse_string("number test 1", parser_class), "test = 1"
        )
        # Was a bare `assert`, which is stripped when running under -O.
        self.assertEqual(
            parse_string("string test 'b'", parser_class), "test = 'b'"
        )
        with self.assertRaises(SyntaxError):
            parse_string("test 1", parser_class)

    def test_forced(self) -> None:
        # A forced token (&&) raises SyntaxError naming what was expected.
        grammar = """
        start: NAME &&':' | NAME
        """
        parser_class = make_parser(grammar)
        self.assertTrue(parse_string("number :", parser_class))
        with self.assertRaises(SyntaxError) as e:
            parse_string("a", parser_class)

        self.assertIn("expected ':'", str(e.exception))

    def test_forced_with_group(self) -> None:
        # Forcing a group reports the whole group in the error message.
        grammar = """
        start: NAME &&(':' | ';') | NAME
        """
        parser_class = make_parser(grammar)
        self.assertTrue(parse_string("number :", parser_class))
        self.assertTrue(parse_string("number ;", parser_class))
        with self.assertRaises(SyntaxError) as e:
            parse_string("a", parser_class)
        self.assertIn("expected (':' | ';')", e.exception.args[0])

    def test_unreachable_explicit(self) -> None:
        # An explicit UNREACHABLE action is replaced by the formatting string.
        source = """
        start: NAME { UNREACHABLE }
        """
        self.assertIn("This is a test", self._generate_with_unreachable_marker(source))

    def test_unreachable_implicit1(self) -> None:
        # An alternative consisting only of an invalid_ rule, with no action.
        source = """
        start: NAME | invalid_input
        invalid_input: NUMBER { None }
        """
        self.assertIn("This is a test", self._generate_with_unreachable_marker(source))

    def test_unreachable_implicit2(self) -> None:
        # An invalid_ rule between other items, with no action.
        source = """
        start: NAME | '(' invalid_input ')'
        invalid_input: NUMBER { None }
        """
        self.assertIn("This is a test", self._generate_with_unreachable_marker(source))

    def test_unreachable_implicit3(self) -> None:
        # With an explicit action on the alternative, no marker is emitted.
        source = """
        start: NAME | invalid_input { None }
        invalid_input: NUMBER
        """
        self.assertNotIn("This is a test", self._generate_with_unreachable_marker(source))

    def test_locations_in_alt_action_and_group(self) -> None:
        # LOCATIONS in actions must give the same node positions as ast.parse().
        grammar = """
        start: t=term NEWLINE? $ { ast.Expression(t) }
        term:
            | l=term '*' r=factor { ast.BinOp(l, ast.Mult(), r, LOCATIONS) }
            | l=term '/' r=factor { ast.BinOp(l, ast.Div(), r, LOCATIONS) }
            | factor
        factor:
            | (
                n=NAME { ast.Name(id=n.string, ctx=ast.Load(), LOCATIONS) } |
                n=NUMBER { ast.Constant(value=ast.literal_eval(n.string), LOCATIONS) }
            )
        """
        parser_class = make_parser(grammar)
        source = "2*3\n"
        o = ast.dump(parse_string(source, parser_class).body, include_attributes=True)
        p = ast.dump(ast.parse(source).body[0].value, include_attributes=True).replace(
            " kind=None,", ""
        )
        diff = "\n".join(
            difflib.unified_diff(
                o.split("\n"), p.split("\n"), "cpython", "python-pegen"
            )
        )
        # An empty diff means both dumps are identical.
        self.assertFalse(diff)
|
|
|
|
|
|
class TestGrammarVisitor(unittest.TestCase):
    # Counts how many nodes GrammarVisitor reaches for small grammars.
    #
    # The class previously had no base class, so unittest never collected
    # these tests (and `self.assertEqual` would not have existed if called).

    class Visitor(GrammarVisitor):
        # GrammarVisitor that counts every visit() call in `n_nodes`.

        def __init__(self) -> None:
            self.n_nodes = 0

        def visit(self, node: Any, *args: Any, **kwargs: Any) -> None:
            self.n_nodes += 1
            super().visit(node, *args, **kwargs)

    def test_parse_trivial_grammar(self) -> None:
        grammar = """
        start: 'a'
        """
        rules = parse_string(grammar, GrammarParser)
        visitor = self.Visitor()

        visitor.visit(rules)

        # Grammar/Rule/Rhs/Alt/NamedItem/StringLeaf -> 6
        self.assertEqual(visitor.n_nodes, 6)

    def test_parse_or_grammar(self) -> None:
        grammar = """
        start: rule
        rule: 'a' | 'b'
        """
        rules = parse_string(grammar, GrammarParser)
        visitor = self.Visitor()

        visitor.visit(rules)

        # Grammar/Rule/Rhs/Alt/NamedItem/NameLeaf   -> 6
        #         Rule/Rhs/                         -> 2
        #                  Alt/NamedItem/StringLeaf -> 3
        #                  Alt/NamedItem/StringLeaf -> 3

        self.assertEqual(visitor.n_nodes, 14)

    def test_parse_repeat1_grammar(self) -> None:
        grammar = """
        start: 'a'+
        """
        rules = parse_string(grammar, GrammarParser)
        visitor = self.Visitor()

        visitor.visit(rules)

        # Grammar/Rule/Rhs/Alt/NamedItem/Repeat1/StringLeaf -> 7
        self.assertEqual(visitor.n_nodes, 7)

    def test_parse_repeat0_grammar(self) -> None:
        grammar = """
        start: 'a'*
        """
        rules = parse_string(grammar, GrammarParser)
        visitor = self.Visitor()

        visitor.visit(rules)

        # Grammar/Rule/Rhs/Alt/NamedItem/Repeat0/StringLeaf -> 7

        self.assertEqual(visitor.n_nodes, 7)

    def test_parse_optional_grammar(self) -> None:
        grammar = """
        start: 'a' ['b']
        """
        rules = parse_string(grammar, GrammarParser)
        visitor = self.Visitor()

        visitor.visit(rules)

        # Grammar/Rule/Rhs/Alt/NamedItem/StringLeaf                       -> 6
        #                      NamedItem/Opt/Rhs/Alt/NamedItem/Stringleaf -> 6

        self.assertEqual(visitor.n_nodes, 12)
|
|
|
|
|
|
class TestGrammarVisualizer(unittest.TestCase):
    # Tests for ASTGrammarPrinter's tree rendering, plus rule-flag parsing.
    # Expected trees are dedented before comparison, so only the relative
    # indentation of the drawing matters.

    def test_simple_rule(self) -> None:
        grammar = """
        start: 'a' 'b'
        """
        rules = parse_string(grammar, GrammarParser)

        printer = ASTGrammarPrinter()
        lines: List[str] = []
        printer.print_grammar_ast(rules, printer=lines.append)

        output = "\n".join(lines)
        expected_output = textwrap.dedent(
            """\
        └──Rule
           └──Rhs
              └──Alt
                 ├──NamedItem
                 │  └──StringLeaf("'a'")
                 └──NamedItem
                    └──StringLeaf("'b'")
        """
        )

        self.assertEqual(output, expected_output)

    def test_multiple_rules(self) -> None:
        # One tree per rule, separated by a blank line.
        grammar = """
        start: a b
        a: 'a'
        b: 'b'
        """
        rules = parse_string(grammar, GrammarParser)

        printer = ASTGrammarPrinter()
        lines: List[str] = []
        printer.print_grammar_ast(rules, printer=lines.append)

        output = "\n".join(lines)
        expected_output = textwrap.dedent(
            """\
        └──Rule
           └──Rhs
              └──Alt
                 ├──NamedItem
                 │  └──NameLeaf('a')
                 └──NamedItem
                    └──NameLeaf('b')

        └──Rule
           └──Rhs
              └──Alt
                 └──NamedItem
                    └──StringLeaf("'a'")

        └──Rule
           └──Rhs
              └──Alt
                 └──NamedItem
                    └──StringLeaf("'b'")
        """
        )

        self.assertEqual(output, expected_output)

    def test_deep_nested_rule(self) -> None:
        # Three levels of nested optional groups.
        grammar = """
        start: 'a' ['b'['c'['d']]]
        """
        rules = parse_string(grammar, GrammarParser)

        printer = ASTGrammarPrinter()
        lines: List[str] = []
        printer.print_grammar_ast(rules, printer=lines.append)

        output = "\n".join(lines)
        expected_output = textwrap.dedent(
            """\
        └──Rule
           └──Rhs
              └──Alt
                 ├──NamedItem
                 │  └──StringLeaf("'a'")
                 └──NamedItem
                    └──Opt
                       └──Rhs
                          └──Alt
                             ├──NamedItem
                             │  └──StringLeaf("'b'")
                             └──NamedItem
                                └──Opt
                                   └──Rhs
                                      └──Alt
                                         ├──NamedItem
                                         │  └──StringLeaf("'c'")
                                         └──NamedItem
                                            └──Opt
                                               └──Rhs
                                                  └──Alt
                                                     └──NamedItem
                                                        └──StringLeaf("'d'")
        """
        )

        self.assertEqual(output, expected_output)

    def test_rule_flags(self) -> None:
        """Test the new rule flags syntax that accepts arbitrary lists of flags."""
        # Test grammar with various flag combinations
        grammar_source = """
        start: simple_rule

        simple_rule (memo):
            | "hello"

        multi_flag_rule (memo, custom, test):
            | "world"

        single_custom_flag (custom):
            | "test"

        no_flags_rule:
            | "plain"
        """

        grammar: Grammar = parse_string(grammar_source, GrammarParser)
        rules = grammar.rules

        # Test memo-only rule
        simple_rule = rules['simple_rule']
        self.assertIn('memo', simple_rule.flags,
                      "simple_rule should have memo")
        # Messages spell the expected set out literally: `{'memo'}` inside an
        # f-string interpolates the bare string, not a set display.
        self.assertEqual(simple_rule.flags, frozenset(['memo']),
                         f"simple_rule flags should be frozenset({{'memo'}}), "
                         f"got {simple_rule.flags}")

        # Test multi-flag rule (previously re-checked simple_rule by mistake)
        multi_flag_rule = rules['multi_flag_rule']
        self.assertIn('memo', multi_flag_rule.flags,
                      "multi_flag_rule should have memo")
        self.assertEqual(multi_flag_rule.flags, frozenset({'memo', 'custom', 'test'}),
                         f"multi_flag_rule flags should contain memo, custom, test, "
                         f"got {multi_flag_rule.flags}")

        # Test single custom flag rule.  The old check was a double negative
        # on simple_rule, i.e. it asserted that simple_rule *has* memo.
        single_custom_rule = rules['single_custom_flag']
        self.assertNotIn('memo', single_custom_rule.flags,
                         "single_custom_flag should not have memo")
        self.assertEqual(single_custom_rule.flags, frozenset(['custom']),
                         f"single_custom_flag flags should be frozenset({{'custom'}}), "
                         f"got {single_custom_rule.flags}")

        # Test no flags rule (same copy-paste defect as above)
        no_flags_rule = rules['no_flags_rule']
        self.assertNotIn('memo', no_flags_rule.flags,
                         "no_flags_rule should not have memo")
        self.assertEqual(no_flags_rule.flags, frozenset(),
                         f"no_flags_rule flags should be the empty set, "
                         f"got {no_flags_rule.flags}")
|