cpython/Lib/test/test_peg_generator/test_pegen.py
Petr Viktorin f0a0467c17
gh-143054: Disallow non-top-level Cut for now (GH-143622)
The behaviour of Cut in nested parentheses, Repeat, Opt, and similar
is somewhat chaotic. Apparently even the academic papers on PEG aren't
as clear as they could be.

And it doesn't really matter. Python only uses top-level cuts.
When that changes, we can clarify as much as necessary (and even
change the implementation to make sense for what we'll need).

Document that this is deliberately unspecified, and add a test to
make sure any decision is deliberate, tested and documented.
2026-01-13 13:21:59 +01:00

1182 lines
40 KiB
Python

import ast
import difflib
import io
import textwrap
import unittest
from test import test_tools
from typing import Dict, Any
from tokenize import TokenInfo, NAME, NEWLINE, NUMBER, OP
test_tools.skip_if_missing("peg_generator")
with test_tools.imports_under_tool("peg_generator"):
from pegen.grammar_parser import GeneratedParser as GrammarParser
from pegen.testutil import parse_string, generate_parser, make_parser
from pegen.grammar import GrammarVisitor, GrammarError, Grammar
from pegen.grammar_visualizer import ASTGrammarPrinter
from pegen.parser import Parser
from pegen.parser_generator import compute_nullables, compute_left_recursives
from pegen.python_generator import PythonParserGenerator
class TestPegen(unittest.TestCase):
def test_parse_grammar(self) -> None:
    """Parse a small grammar and compare str()/repr() of the grammar and its rules."""
    source = """
start: sum NEWLINE
sum: t1=term '+' t2=term { action } | term
term: NUMBER
"""
    expected_text = """
start: sum NEWLINE
sum: term '+' term | term
term: NUMBER
"""
    parsed: Grammar = parse_string(source, GrammarParser)
    by_name = parsed.rules
    self.assertEqual(str(parsed), textwrap.dedent(expected_text).strip())

    # AST nodes don't support ==, so a few rules are compared via str()/repr().
    for rule_name, rule_text in (
        ("start", "start: sum NEWLINE"),
        ("sum", "sum: term '+' term | term"),
    ):
        self.assertEqual(str(by_name[rule_name]), rule_text)
    self.assertEqual(
        repr(by_name["term"]),
        "Rule('term', None, Rhs([Alt([NamedItem(None, NameLeaf('NUMBER'))])]))",
    )
def test_repeated_rules(self) -> None:
    """A grammar that defines ``the_rule`` twice is rejected with GrammarError."""
    duplicated_source = """
start: the_rule NEWLINE
the_rule: 'b' NEWLINE
the_rule: 'a' NEWLINE
"""
    # Callable form of assertRaisesRegex: the parse itself must raise.
    self.assertRaisesRegex(
        GrammarError,
        "Repeated rule 'the_rule'",
        parse_string,
        duplicated_source,
        GrammarParser,
    )
def test_long_rule_str(self) -> None:
    """str() of a rule with many alternatives is compared against a multi-line rendering."""
    source = """
start: zero | one | one zero | one one | one zero zero | one zero one | one one zero | one one one
"""
    multiline = """
start:
| zero
| one
| one zero
| one one
| one zero zero
| one zero one
| one one zero
| one one one
"""
    parsed: Grammar = parse_string(source, GrammarParser)
    start_rule = parsed.rules["start"]
    wanted = textwrap.dedent(multiline).strip()
    self.assertEqual(str(start_rule), wanted)
def test_typed_rules(self) -> None:
    """Rules annotated with ``[int]`` keep the type in repr() but not in str()."""
    source = """
start[int]: sum NEWLINE
sum[int]: t1=term '+' t2=term { action } | term
term[int]: NUMBER
"""
    by_name = parse_string(source, GrammarParser).rules

    # AST nodes don't support ==, so a few rules are compared via str()/repr().
    wanted_str = {
        "start": "start: sum NEWLINE",
        "sum": "sum: term '+' term | term",
    }
    for rule_name, rule_text in wanted_str.items():
        self.assertEqual(str(by_name[rule_name]), rule_text)

    term_repr = repr(by_name["term"])
    self.assertEqual(
        term_repr,
        "Rule('term', 'int', Rhs([Alt([NamedItem(None, NameLeaf('NUMBER'))])]))",
    )
def test_gather(self) -> None:
    """Check str()/repr() of a gather rule (``','.thing+``) and the tree it parses to."""
    grammar = """
start: ','.thing+ NEWLINE
thing: NUMBER
"""
    by_name = parse_string(grammar, GrammarParser).rules
    start_rule = by_name["start"]
    self.assertEqual(str(start_rule), "start: ','.thing+ NEWLINE")
    self.assertStartsWith(
        repr(start_rule),
        "Rule('start', None, Rhs([Alt([NamedItem(None, Gather(StringLeaf(\"','\"), NameLeaf('thing'",
    )
    self.assertEqual(str(by_name["thing"]), "thing: NUMBER")

    parser_class = make_parser(grammar)
    # A single item is parsed as well; only the absence of an error is checked.
    parse_string("42\n", parser_class)

    src = "1, 2\n"
    tree = parse_string(src, parser_class)
    self.assertEqual(
        tree,
        [
            [
                TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=src),
                TokenInfo(NUMBER, string="2", start=(1, 3), end=(1, 4), line=src),
            ],
            TokenInfo(NEWLINE, string="\n", start=(1, 4), end=(1, 5), line=src),
        ],
    )
def test_expr_grammar(self) -> None:
    """A lone number parses to the NUMBER token followed by the NEWLINE token."""
    grammar = """
start: sum NEWLINE
sum: term '+' term | term
term: NUMBER
"""
    src = "42\n"
    tree = parse_string(src, make_parser(grammar))
    wanted = [
        TokenInfo(NUMBER, string="42", start=(1, 0), end=(1, 2), line=src),
        TokenInfo(NEWLINE, string="\n", start=(1, 2), end=(1, 3), line=src),
    ]
    self.assertEqual(tree, wanted)
def test_optional_operator(self) -> None:
    """``( ... )?`` yields the group's tokens when present and None when absent."""
    grammar = """
start: sum NEWLINE
sum: term ('+' term)?
term: NUMBER
"""
    parser_class = make_parser(grammar)

    # Optional part present.
    src = "1 + 2\n"
    self.assertEqual(
        parse_string(src, parser_class),
        [
            [
                TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=src),
                [
                    TokenInfo(OP, string="+", start=(1, 2), end=(1, 3), line=src),
                    TokenInfo(NUMBER, string="2", start=(1, 4), end=(1, 5), line=src),
                ],
            ],
            TokenInfo(NEWLINE, string="\n", start=(1, 5), end=(1, 6), line=src),
        ],
    )

    # Optional part absent.
    src = "1\n"
    self.assertEqual(
        parse_string(src, parser_class),
        [
            [
                TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=src),
                None,
            ],
            TokenInfo(NEWLINE, string="\n", start=(1, 1), end=(1, 2), line=src),
        ],
    )
def test_optional_literal(self) -> None:
    """An optional literal (``'+' ?``) yields its token when present and None when absent."""
    grammar = """
start: sum NEWLINE
sum: term '+' ?
term: NUMBER
"""
    parser_class = make_parser(grammar)

    with_plus = "1+\n"
    self.assertEqual(
        parse_string(with_plus, parser_class),
        [
            [
                TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=with_plus),
                TokenInfo(OP, string="+", start=(1, 1), end=(1, 2), line=with_plus),
            ],
            TokenInfo(NEWLINE, string="\n", start=(1, 2), end=(1, 3), line=with_plus),
        ],
    )

    without_plus = "1\n"
    self.assertEqual(
        parse_string(without_plus, parser_class),
        [
            [
                TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=without_plus),
                None,
            ],
            TokenInfo(NEWLINE, string="\n", start=(1, 1), end=(1, 2), line=without_plus),
        ],
    )
def test_alt_optional_operator(self) -> None:
    """The bracket form ``[ ... ]`` yields the group's tokens when present, None otherwise."""
    grammar = """
start: sum NEWLINE
sum: term ['+' term]
term: NUMBER
"""
    parser_class = make_parser(grammar)

    def tok(kind: int, text: str, col: int, line: str) -> TokenInfo:
        # All tokens in this test sit on row 1 and span len(text) columns.
        return TokenInfo(kind, string=text, start=(1, col), end=(1, col + len(text)), line=line)

    src = "1 + 2\n"
    self.assertEqual(
        parse_string(src, parser_class),
        [
            [
                tok(NUMBER, "1", 0, src),
                [tok(OP, "+", 2, src), tok(NUMBER, "2", 4, src)],
            ],
            tok(NEWLINE, "\n", 5, src),
        ],
    )

    src = "1\n"
    self.assertEqual(
        parse_string(src, parser_class),
        [
            [tok(NUMBER, "1", 0, src), None],
            tok(NEWLINE, "\n", 1, src),
        ],
    )
def test_repeat_0_simple(self) -> None:
    """``thing*`` collects following items into a list, which is empty when there are none."""
    grammar = """
start: thing thing* NEWLINE
thing: NUMBER
"""
    parser_class = make_parser(grammar)

    several = "1 2 3\n"
    self.assertEqual(
        parse_string(several, parser_class),
        [
            TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=several),
            [
                TokenInfo(NUMBER, string="2", start=(1, 2), end=(1, 3), line=several),
                TokenInfo(NUMBER, string="3", start=(1, 4), end=(1, 5), line=several),
            ],
            TokenInfo(NEWLINE, string="\n", start=(1, 5), end=(1, 6), line=several),
        ],
    )

    single = "1\n"
    self.assertEqual(
        parse_string(single, parser_class),
        [
            TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=single),
            [],
            TokenInfo(NEWLINE, string="\n", start=(1, 1), end=(1, 2), line=single),
        ],
    )
def test_repeat_0_complex(self) -> None:
    """``('+' term)*`` yields one ``[OP, NUMBER]`` pair per repetition."""
    grammar = """
start: term ('+' term)* NEWLINE
term: NUMBER
"""
    src = "1 + 2 + 3\n"

    def tok(kind: int, text: str, col: int) -> TokenInfo:
        # Every token here is one column wide on row 1 of ``src``.
        return TokenInfo(kind, string=text, start=(1, col), end=(1, col + 1), line=src)

    tree = parse_string(src, make_parser(grammar))
    self.assertEqual(
        tree,
        [
            tok(NUMBER, "1", 0),
            [
                [tok(OP, "+", 2), tok(NUMBER, "2", 4)],
                [tok(OP, "+", 6), tok(NUMBER, "3", 8)],
            ],
            tok(NEWLINE, "\n", 9),
        ],
    )
def test_repeat_1_simple(self) -> None:
    """``thing+`` collects following items into a list and fails when there are none."""
    grammar = """
start: thing thing+ NEWLINE
thing: NUMBER
"""
    parser_class = make_parser(grammar)

    src = "1 2 3\n"
    wanted = [
        TokenInfo(NUMBER, string="1", start=(1, 0), end=(1, 1), line=src),
        [
            TokenInfo(NUMBER, string="2", start=(1, 2), end=(1, 3), line=src),
            TokenInfo(NUMBER, string="3", start=(1, 4), end=(1, 5), line=src),
        ],
        TokenInfo(NEWLINE, string="\n", start=(1, 5), end=(1, 6), line=src),
    ]
    self.assertEqual(parse_string(src, parser_class), wanted)

    # Only one ``thing``: the mandatory repetition has nothing to match.
    self.assertRaises(SyntaxError, parse_string, "1\n", parser_class)
def test_repeat_1_complex(self) -> None:
    """``('+' term)+`` yields one pair per repetition and fails with no repetition."""
    grammar = """
start: term ('+' term)+ NEWLINE
term: NUMBER
"""
    parser_class = make_parser(grammar)
    src = "1 + 2 + 3\n"

    def tok(kind: int, text: str, col: int) -> TokenInfo:
        # Every token here is one column wide on row 1 of ``src``.
        return TokenInfo(kind, string=text, start=(1, col), end=(1, col + 1), line=src)

    self.assertEqual(
        parse_string(src, parser_class),
        [
            tok(NUMBER, "1", 0),
            [
                [tok(OP, "+", 2), tok(NUMBER, "2", 4)],
                [tok(OP, "+", 6), tok(NUMBER, "3", 8)],
            ],
            tok(NEWLINE, "\n", 9),
        ],
    )

    # A lone term has no ``'+' term`` group, so the parse must fail.
    self.assertRaises(SyntaxError, parse_string, "1\n", parser_class)
def test_repeat_with_sep_simple(self) -> None:
    """``','.thing+`` returns only the items; the separators are not in the result."""
    grammar = """
start: ','.thing+ NEWLINE
thing: NUMBER
"""
    src = "1, 2, 3\n"
    numbers = [
        TokenInfo(NUMBER, string=text, start=(1, col), end=(1, col + 1), line=src)
        for text, col in (("1", 0), ("2", 3), ("3", 6))
    ]
    newline = TokenInfo(NEWLINE, string="\n", start=(1, 7), end=(1, 8), line=src)

    tree = parse_string(src, make_parser(grammar))
    self.assertEqual(tree, [numbers, newline])
def test_left_recursive(self) -> None:
    """Only ``expr`` is flagged left-recursive, and ``1 + 2 + 3`` nests to the left."""
    grammar_source = """
start: expr NEWLINE
expr: ('-' term | expr '+' term | term)
term: NUMBER
foo: NAME+
bar: NAME*
baz: NAME?
"""
    grammar: Grammar = parse_string(grammar_source, GrammarParser)
    # The parser is generated before the left_recursive flags are inspected.
    parser_class = generate_parser(grammar)
    by_name = grammar.rules

    flag_expectations = (
        ("start", False),
        ("expr", True),
        ("term", False),
        ("foo", False),
        ("bar", False),
        ("baz", False),
    )
    for rule_name, is_left_recursive in flag_expectations:
        check = self.assertTrue if is_left_recursive else self.assertFalse
        check(by_name[rule_name].left_recursive)

    src = "1 + 2 + 3\n"

    def tok(kind: int, text: str, col: int) -> TokenInfo:
        # Every token here is one column wide on row 1 of ``src``.
        return TokenInfo(kind, string=text, start=(1, col), end=(1, col + 1), line=src)

    self.assertEqual(
        parse_string(src, parser_class),
        [
            [
                [tok(NUMBER, "1", 0), tok(OP, "+", 2), tok(NUMBER, "2", 4)],
                tok(OP, "+", 6),
                tok(NUMBER, "3", 8),
            ],
            tok(NEWLINE, "\n", 9),
        ],
    )
def test_python_expr(self) -> None:
    """Build ``ast`` nodes from grammar actions, then compile and evaluate the result."""
    grammar = """
start: expr NEWLINE? $ { ast.Expression(expr) }
expr: ( expr '+' term { ast.BinOp(expr, ast.Add(), term, lineno=expr.lineno, col_offset=expr.col_offset, end_lineno=term.end_lineno, end_col_offset=term.end_col_offset) }
| expr '-' term { ast.BinOp(expr, ast.Sub(), term, lineno=expr.lineno, col_offset=expr.col_offset, end_lineno=term.end_lineno, end_col_offset=term.end_col_offset) }
| term { term }
)
term: ( l=term '*' r=factor { ast.BinOp(l, ast.Mult(), r, lineno=l.lineno, col_offset=l.col_offset, end_lineno=r.end_lineno, end_col_offset=r.end_col_offset) }
| l=term '/' r=factor { ast.BinOp(l, ast.Div(), r, lineno=l.lineno, col_offset=l.col_offset, end_lineno=r.end_lineno, end_col_offset=r.end_col_offset) }
| factor { factor }
)
factor: ( '(' expr ')' { expr }
| atom { atom }
)
atom: ( n=NAME { ast.Name(id=n.string, ctx=ast.Load(), lineno=n.start[0], col_offset=n.start[1], end_lineno=n.end[0], end_col_offset=n.end[1]) }
| n=NUMBER { ast.Constant(value=ast.literal_eval(n.string), lineno=n.start[0], col_offset=n.start[1], end_lineno=n.end[0], end_col_offset=n.end[1]) }
)
"""
    expression_tree = parse_string("(1 + 2*3 + 5)/(6 - 2)\n", make_parser(grammar))
    # The input is a fixed literal, so evaluating the compiled tree is safe here.
    result = eval(compile(expression_tree, "", "eval"))
    self.assertEqual(result, 3.0)
def test_f_string_in_action(self) -> None:
    """A grammar action may be an f-string that refers to a named item."""
    grammar = """
start: n=NAME NEWLINE? $ { f"name -> {n.string}" }
"""
    produced = parse_string("a", make_parser(grammar))
    self.assertEqual(produced.strip(), "name -> a")
def test_nullable(self) -> None:
    """compute_nullables() reports ``sign`` as nullable and ``start`` as not."""
    grammar_source = """
start: sign NUMBER
sign: ['-' | '+']
"""
    parsed: Grammar = parse_string(grammar_source, GrammarParser)
    by_name = parsed.rules
    nullable_rules = compute_nullables(by_name)
    self.assertNotIn(by_name["start"], nullable_rules)  # Not None!
    self.assertIn(by_name["sign"], nullable_rules)
def test_advanced_left_recursive(self) -> None:
    """``start`` is left-recursive because the ``sign`` in front of it is nullable."""
    grammar_source = """
start: NUMBER | sign start
sign: ['-']
"""
    parsed: Grammar = parse_string(grammar_source, GrammarParser)
    by_name = parsed.rules
    nullable_rules = compute_nullables(by_name)
    compute_left_recursives(by_name)

    start_rule, sign_rule = by_name["start"], by_name["sign"]
    self.assertNotIn(start_rule, nullable_rules)  # Not None!
    self.assertIn(sign_rule, nullable_rules)
    self.assertTrue(start_rule.left_recursive)
    self.assertFalse(sign_rule.left_recursive)
def test_mutually_left_recursive(self) -> None:
    """``foo`` and ``bar`` refer to each other on the left; both inputs must still parse.

    The parser source is generated into a StringIO and exec'd so that the
    ``left_recursive`` flags can be checked before the parser is built.
    """
    grammar_source = """
start: foo 'E'
foo: bar 'A' | 'B'
bar: foo 'C' | 'D'
"""
    grammar: Grammar = parse_string(grammar_source, GrammarParser)
    out = io.StringIO()
    # The generator is constructed before the flags are inspected.
    genr = PythonParserGenerator(grammar, out)
    rules = grammar.rules
    self.assertFalse(rules["start"].left_recursive)
    self.assertTrue(rules["foo"].left_recursive)
    self.assertTrue(rules["bar"].left_recursive)
    genr.generate("<string>")
    ns: Dict[str, Any] = {}
    exec(out.getvalue(), ns)
    # Builtin generic: the file imports only Dict and Any from typing,
    # so the former ``Type[...]`` annotation named an undefined symbol.
    parser_class: type[Parser] = ns["GeneratedParser"]

    def name_tok(text: str, col: int, line: str) -> TokenInfo:
        # Every token in this test is a one-character NAME on row 1.
        return TokenInfo(type=NAME, string=text, start=(1, col), end=(1, col + 1), line=line)

    src = "D A C A E"
    self.assertEqual(
        parse_string(src, parser_class),
        [
            [
                [
                    [name_tok("D", 0, src), name_tok("A", 2, src)],
                    name_tok("C", 4, src),
                ],
                name_tok("A", 6, src),
            ],
            name_tok("E", 8, src),
        ],
    )

    src = "B C A E"
    self.assertEqual(
        parse_string(src, parser_class),
        [
            [
                [name_tok("B", 0, src), name_tok("C", 2, src)],
                name_tok("A", 4, src),
            ],
            name_tok("E", 6, src),
        ],
    )
def test_nasty_mutually_left_recursive(self) -> None:
    """The grammar below rejects ``x - + =``; that rejection is what is asserted."""
    # This grammar does not recognize 'x - + =', much to my chagrin.
    # But that's the way PEG works.
    # [Breathlessly]
    # The toplevel target call recurses into maybe, which recognizes
    # 'x - +'; the toplevel target then looks for another '+', which
    # fails, so it retreats to NAME, which succeeds.  We therefore end up
    # recognizing just 'x', and start fails because no '=' follows it.
    grammar_source = """
start: target '='
target: maybe '+' | NAME
maybe: maybe '-' | target
"""
    parsed: Grammar = parse_string(grammar_source, GrammarParser)
    generated = io.StringIO()
    PythonParserGenerator(parsed, generated).generate("<string>")
    namespace: Dict[str, Any] = {}
    exec(generated.getvalue(), namespace)
    parser_class = namespace["GeneratedParser"]
    self.assertRaises(SyntaxError, parse_string, "x - + =", parser_class)
def test_lookahead(self) -> None:
    """Exercise negative (``!``) and positive (``&``) lookahead on an assignment."""
    grammar = """
start: (expr_stmt | assign_stmt) &'.'
expr_stmt: !(target '=') expr
assign_stmt: target '=' expr
expr: term ('+' term)*
target: NAME
term: NUMBER
"""
    src = "foo = 12 + 12 ."

    def tok(kind: int, text: str, col: int) -> TokenInfo:
        # Tokens sit on row 1 of ``src`` and span len(text) columns.
        return TokenInfo(kind, string=text, start=(1, col), end=(1, col + len(text)), line=src)

    tree = parse_string(src, make_parser(grammar))
    self.maxDiff = None
    # The '.' is only looked at by &'.', so it does not appear in the result.
    self.assertEqual(
        tree,
        [
            tok(NAME, "foo", 0),
            tok(OP, "=", 4),
            [
                tok(NUMBER, "12", 6),
                [[tok(OP, "+", 9), tok(NUMBER, "12", 11)]],
            ],
        ],
    )
def test_named_lookahead_error(self) -> None:
    """Giving a name to a negative lookahead makes make_parser() raise SyntaxError."""
    named_lookahead = """
start: foo=!'x' NAME
"""
    self.assertRaises(SyntaxError, make_parser, named_lookahead)
def test_start_leader(self) -> None:
    """Building a parser where ``start`` itself is left-recursive must not fail."""
    left_recursive_start = """
start: attr | NAME
attr: start '.' NAME
"""
    # Would assert False without a special case in compute_left_recursives().
    make_parser(left_recursive_start)
def test_opt_sequence(self) -> None:
    """An optional wrapping a repetition (``[NAME*]``) must generate a valid parser."""
    optional_repeat = """
start: [NAME*]
"""
    # Regression check for bpo-41044: the generated source used to contain
    # a double trailing comma at the end of a line.
    make_parser(optional_repeat)
def test_left_recursion_too_complex(self) -> None:
    """make_parser() raises ValueError mentioning "no leader" for this grammar.

    ``foo``, ``bar`` and ``baz`` all refer to each other in leftmost position.
    """
    grammar = """
start: foo
foo: bar '+' | baz '+' | '+'
bar: baz '-' | foo '-' | '-'
baz: foo '*' | bar '*' | '*'
"""
    with self.assertRaises(ValueError) as errinfo:
        make_parser(grammar)
    # The message check has to run after the ``with`` block: statements that
    # follow the raising call inside the block are never executed.  The
    # exception object is ``errinfo.exception``; ValueError has no ``.value``
    # attribute (that is pytest's ``excinfo`` spelling).
    self.assertIn("no leader", str(errinfo.exception))
def test_cut(self) -> None:
    """A top-level cut (``~``) does not change the result of a successful parse."""
    grammar = """
start: '(' ~ expr ')'
expr: NUMBER
"""
    src = "(1)"
    wanted = [
        TokenInfo(kind, string=text, start=(1, col), end=(1, col + 1), line=src)
        for col, (kind, text) in enumerate(((OP, "("), (NUMBER, "1"), (OP, ")")))
    ]
    self.assertEqual(parse_string(src, make_parser(grammar)), wanted)
def test_cut_is_local_in_rule(self) -> None:
    """A cut inside ``inner`` does not stop ``start`` from trying its next alternative."""
    grammar = """
start:
| inner
| 'x' { "ok" }
inner:
| 'x' ~ 'y'
| 'x'
"""
    result = parse_string("x", make_parser(grammar))
    self.assertEqual(result, 'ok')
def test_cut_is_local_in_parens(self) -> None:
    """A cut inside a parenthesized group lets ``start`` fall through to its next alternative."""
    # we currently don't guarantee this behavior, see gh-143054
    grammar = """
start:
| ('x' ~ 'y' | 'x')
| 'x' { "ok" }
"""
    result = parse_string("x", make_parser(grammar))
    self.assertEqual(result, 'ok')
def test_dangling_reference(self) -> None:
    """Referring to ``bar``, which is never defined, raises GrammarError."""
    undefined_rule = """
start: foo ENDMARKER
foo: bar NAME
"""
    self.assertRaises(GrammarError, make_parser, undefined_rule)
def test_bad_token_reference(self) -> None:
    """Referring to the unknown uppercase name ``NAMEE`` raises GrammarError."""
    unknown_token = """
start: foo
foo: NAMEE
"""
    self.assertRaises(GrammarError, make_parser, unknown_token)
def test_missing_start(self) -> None:
    """A grammar with no ``start`` rule raises GrammarError."""
    no_start = """
foo: NAME
"""
    self.assertRaises(GrammarError, make_parser, no_start)
def test_invalid_rule_name(self) -> None:
    """A rule named ``_a`` is rejected because of its leading underscore."""
    underscore_rule = """
start: _a b
_a: 'a'
b: 'b'
"""
    self.assertRaisesRegex(
        GrammarError,
        "cannot start with underscore: '_a'",
        make_parser,
        underscore_rule,
    )
def test_invalid_variable_name(self) -> None:
    """An item named ``_x`` is rejected because of its leading underscore."""
    underscore_variable = """
start: a b
a: _x='a'
b: 'b'
"""
    self.assertRaisesRegex(
        GrammarError,
        "cannot start with underscore: '_x'",
        make_parser,
        underscore_variable,
    )
def test_invalid_variable_name_in_temporal_rule(self) -> None:
    """An item named ``_x`` inside a parenthesized group is rejected as well."""
    underscore_in_group = """
start: a b
a: (_x='a' | 'b') | 'c'
b: 'b'
"""
    self.assertRaisesRegex(
        GrammarError,
        "cannot start with underscore: '_x'",
        make_parser,
        underscore_in_group,
    )
def test_soft_keyword(self) -> None:
    """Exercise double-quoted (soft) keywords and the SOFT_KEYWORD item.

    Every result check goes through a unittest assertion method: a bare
    ``assert`` is removed under ``python -O`` and produces no unittest
    failure message.
    """
    grammar = """
start:
| "number" n=NUMBER { eval(n.string) }
| "string" n=STRING { n.string }
| SOFT_KEYWORD l=NAME n=(NUMBER | NAME | STRING) { l.string + " = " + n.string }
"""
    parser_class = make_parser(grammar)
    self.assertEqual(parse_string("number 1", parser_class), 1)
    self.assertEqual(parse_string("string 'b'", parser_class), "'b'")
    self.assertEqual(
        parse_string("number test 1", parser_class), "test = 1"
    )
    self.assertEqual(
        parse_string("string test 'b'", parser_class), "test = 'b'"
    )
    # "test" is not one of the grammar's soft keywords, so no alternative matches.
    with self.assertRaises(SyntaxError):
        parse_string("test 1", parser_class)
def test_forced(self) -> None:
    """A forced token (``&&':'``) raises SyntaxError naming the expected token when missing."""
    grammar = """
start: NAME &&':' | NAME
"""
    parser_class = make_parser(grammar)
    self.assertTrue(parse_string("number :", parser_class))

    with self.assertRaises(SyntaxError) as caught:
        parse_string("a", parser_class)
    message = str(caught.exception)
    self.assertIn("expected ':'", message)
def test_forced_with_group(self) -> None:
    """A forced group (``&&(':' | ';')``) accepts either token and is named in the error."""
    grammar = """
start: NAME &&(':' | ';') | NAME
"""
    parser_class = make_parser(grammar)
    for accepted in ("number :", "number ;"):
        self.assertTrue(parse_string(accepted, parser_class))

    with self.assertRaises(SyntaxError) as caught:
        parse_string("a", parser_class)
    first_arg = caught.exception.args[0]
    self.assertIn("expected (':' | ';')", first_arg)
def test_unreachable_explicit(self) -> None:
    """An explicit ``UNREACHABLE`` action puts the unreachable_formatting text in the output."""
    source = """
start: NAME { UNREACHABLE }
"""
    marker = "This is a test"
    parsed = parse_string(source, GrammarParser)
    generated = io.StringIO()
    PythonParserGenerator(
        parsed, generated, unreachable_formatting=marker
    ).generate("<string>")
    self.assertIn("This is a test", generated.getvalue())
def test_unreachable_implicit1(self) -> None:
    """An ``invalid_`` rule used as a whole alternative gets the unreachable text."""
    source = """
start: NAME | invalid_input
invalid_input: NUMBER { None }
"""
    marker = "This is a test"
    parsed = parse_string(source, GrammarParser)
    generated = io.StringIO()
    PythonParserGenerator(
        parsed, generated, unreachable_formatting=marker
    ).generate("<string>")
    self.assertIn("This is a test", generated.getvalue())
def test_unreachable_implicit2(self) -> None:
    """An ``invalid_`` rule between other items still gets the unreachable text."""
    source = """
start: NAME | '(' invalid_input ')'
invalid_input: NUMBER { None }
"""
    marker = "This is a test"
    parsed = parse_string(source, GrammarParser)
    generated = io.StringIO()
    PythonParserGenerator(
        parsed, generated, unreachable_formatting=marker
    ).generate("<string>")
    self.assertIn("This is a test", generated.getvalue())
def test_unreachable_implicit3(self) -> None:
    """When the ``invalid_`` alternative has its own action, no unreachable text is emitted."""
    source = """
start: NAME | invalid_input { None }
invalid_input: NUMBER
"""
    marker = "This is a test"
    parsed = parse_string(source, GrammarParser)
    generated = io.StringIO()
    PythonParserGenerator(
        parsed, generated, unreachable_formatting=marker
    ).generate("<string>")
    self.assertNotIn("This is a test", generated.getvalue())
def test_locations_in_alt_action_and_group(self) -> None:
    """``LOCATIONS`` in actions yields the same node positions as ``ast.parse``."""
    grammar = """
start: t=term NEWLINE? $ { ast.Expression(t) }
term:
| l=term '*' r=factor { ast.BinOp(l, ast.Mult(), r, LOCATIONS) }
| l=term '/' r=factor { ast.BinOp(l, ast.Div(), r, LOCATIONS) }
| factor
factor:
| (
n=NAME { ast.Name(id=n.string, ctx=ast.Load(), LOCATIONS) } |
n=NUMBER { ast.Constant(value=ast.literal_eval(n.string), LOCATIONS) }
)
"""
    parser_class = make_parser(grammar)
    source = "2*3\n"

    generated_tree = parse_string(source, parser_class).body
    reference_tree = ast.parse(source).body[0].value
    generated_dump = ast.dump(generated_tree, include_attributes=True)
    # " kind=None," is removed from the reference dump before comparing.
    reference_dump = ast.dump(reference_tree, include_attributes=True).replace(
        " kind=None,", ""
    )

    diff_lines = difflib.unified_diff(
        generated_dump.split("\n"),
        reference_dump.split("\n"),
        "cpython",
        "python-pegen",
    )
    self.assertFalse("\n".join(diff_lines))
class TestGrammarVisitor(unittest.TestCase):
    """Count the nodes a GrammarVisitor visits for several small grammars.

    The class derives from ``unittest.TestCase`` so that unittest collects
    these tests and ``self.assertEqual`` is available; without the base
    class none of the methods below is ever run.
    """

    class Visitor(GrammarVisitor):
        """GrammarVisitor that counts every node passed to ``visit()``."""

        def __init__(self) -> None:
            self.n_nodes = 0

        def visit(self, node: Any, *args: Any, **kwargs: Any) -> None:
            # Count first, then let the base class dispatch as usual.
            self.n_nodes += 1
            super().visit(node, *args, **kwargs)

    def test_parse_trivial_grammar(self) -> None:
        """A single rule with one string literal visits 6 nodes."""
        grammar = """
start: 'a'
"""
        rules = parse_string(grammar, GrammarParser)
        visitor = self.Visitor()
        visitor.visit(rules)
        self.assertEqual(visitor.n_nodes, 6)

    def test_parse_or_grammar(self) -> None:
        """Two rules, the second with two alternatives, visit 14 nodes."""
        grammar = """
start: rule
rule: 'a' | 'b'
"""
        rules = parse_string(grammar, GrammarParser)
        visitor = self.Visitor()
        visitor.visit(rules)
        # Grammar/Rule/Rhs/Alt/NamedItem/NameLeaf   -> 6
        #         Rule/Rhs/                         -> 2
        #                  Alt/NamedItem/StringLeaf -> 3
        #                  Alt/NamedItem/StringLeaf -> 3
        self.assertEqual(visitor.n_nodes, 14)

    def test_parse_repeat1_grammar(self) -> None:
        """A ``+`` repetition adds one Repeat1 node, for 7 in total."""
        grammar = """
start: 'a'+
"""
        rules = parse_string(grammar, GrammarParser)
        visitor = self.Visitor()
        visitor.visit(rules)
        # Grammar/Rule/Rhs/Alt/NamedItem/Repeat1/StringLeaf -> 7
        self.assertEqual(visitor.n_nodes, 7)

    def test_parse_repeat0_grammar(self) -> None:
        """A ``*`` repetition adds one Repeat0 node, for 7 in total."""
        grammar = """
start: 'a'*
"""
        rules = parse_string(grammar, GrammarParser)
        visitor = self.Visitor()
        visitor.visit(rules)
        # Grammar/Rule/Rhs/Alt/NamedItem/Repeat0/StringLeaf -> 7
        self.assertEqual(visitor.n_nodes, 7)

    def test_parse_optional_grammar(self) -> None:
        """A literal followed by an optional literal visits 12 nodes."""
        grammar = """
start: 'a' ['b']
"""
        rules = parse_string(grammar, GrammarParser)
        visitor = self.Visitor()
        visitor.visit(rules)
        # Grammar/Rule/Rhs/Alt/NamedItem/StringLeaf   -> 6
        # NamedItem/Opt/Rhs/Alt/NamedItem/StringLeaf  -> 6
        self.assertEqual(visitor.n_nodes, 12)
class TestGrammarVisualizer(unittest.TestCase):
def test_simple_rule(self) -> None:
    """Print the AST of a one-rule grammar and compare it with the expected text."""
    grammar = """
start: 'a' 'b'
"""
    rules = parse_string(grammar, GrammarParser)
    printer = ASTGrammarPrinter()
    # Builtin generic: ``List`` is not imported from typing in this file.
    lines: list[str] = []
    # The printer hands each output line to the callback; collect them.
    printer.print_grammar_ast(rules, printer=lines.append)
    output = "\n".join(lines)
    expected_output = textwrap.dedent(
        """\
└──Rule
└──Rhs
└──Alt
├──NamedItem
│ └──StringLeaf("'a'")
└──NamedItem
└──StringLeaf("'b'")
"""
    )
    self.assertEqual(output, expected_output)
def test_multiple_rules(self) -> None:
    """Print the AST of a three-rule grammar and compare it with the expected text."""
    grammar = """
start: a b
a: 'a'
b: 'b'
"""
    rules = parse_string(grammar, GrammarParser)
    printer = ASTGrammarPrinter()
    # Builtin generic: ``List`` is not imported from typing in this file.
    lines: list[str] = []
    # The printer hands each output line to the callback; collect them.
    printer.print_grammar_ast(rules, printer=lines.append)
    output = "\n".join(lines)
    expected_output = textwrap.dedent(
        """\
└──Rule
└──Rhs
└──Alt
├──NamedItem
│ └──NameLeaf('a')
└──NamedItem
└──NameLeaf('b')
└──Rule
└──Rhs
└──Alt
└──NamedItem
└──StringLeaf("'a'")
└──Rule
└──Rhs
└──Alt
└──NamedItem
└──StringLeaf("'b'")
"""
    )
    self.assertEqual(output, expected_output)
def test_deep_nested_rule(self) -> None:
    """Print the AST of a grammar with three nested optionals and compare the text."""
    grammar = """
start: 'a' ['b'['c'['d']]]
"""
    rules = parse_string(grammar, GrammarParser)
    printer = ASTGrammarPrinter()
    # Builtin generic: ``List`` is not imported from typing in this file.
    lines: list[str] = []
    # The printer hands each output line to the callback; collect them.
    printer.print_grammar_ast(rules, printer=lines.append)
    output = "\n".join(lines)
    expected_output = textwrap.dedent(
        """\
└──Rule
└──Rhs
└──Alt
├──NamedItem
│ └──StringLeaf("'a'")
└──NamedItem
└──Opt
└──Rhs
└──Alt
├──NamedItem
│ └──StringLeaf("'b'")
└──NamedItem
└──Opt
└──Rhs
└──Alt
├──NamedItem
│ └──StringLeaf("'c'")
└──NamedItem
└──Opt
└──Rhs
└──Alt
└──NamedItem
└──StringLeaf("'d'")
"""
    )
    self.assertEqual(output, expected_output)
def test_rule_flags(self) -> None:
    """Rule flags written as ``name (flag, ...)`` are exposed as a frozenset on ``Rule.flags``.

    Each "memo" membership check targets the rule named in its message.
    """
    grammar_source = """
start: simple_rule
simple_rule (memo):
| "hello"
multi_flag_rule (memo, custom, test):
| "world"
single_custom_flag (custom):
| "test"
no_flags_rule:
| "plain"
"""
    grammar: Grammar = parse_string(grammar_source, GrammarParser)
    rules = grammar.rules

    # Rule with only the memo flag.
    simple_rule = rules['simple_rule']
    self.assertIn('memo', simple_rule.flags,
                  "simple_rule should have memo")
    # Doubled braces so the message shows the set literal {'memo'}.
    self.assertEqual(simple_rule.flags, frozenset(['memo']),
                     f"simple_rule flags should be {{'memo'}}, got {simple_rule.flags}")

    # Rule with several flags.
    multi_flag_rule = rules['multi_flag_rule']
    self.assertIn('memo', multi_flag_rule.flags,
                  "multi_flag_rule should have memo")
    self.assertEqual(multi_flag_rule.flags, frozenset({'memo', 'custom', 'test'}),
                     f"multi_flag_rule flags should contain memo, custom, test, got {multi_flag_rule.flags}")

    # Rule with a single non-memo flag.
    single_custom_rule = rules['single_custom_flag']
    self.assertNotIn('memo', single_custom_rule.flags,
                     "single_custom_flag should not have memo")
    self.assertEqual(single_custom_rule.flags, frozenset(['custom']),
                     f"single_custom_flag flags should be {{'custom'}}, got {single_custom_rule.flags}")

    # Rule with no flags at all.
    no_flags_rule = rules['no_flags_rule']
    self.assertNotIn('memo', no_flags_rule.flags,
                     "no_flags_rule should not have memo")
    self.assertEqual(no_flags_rule.flags, frozenset(),
                     f"no_flags_rule flags should be the empty set, got {no_flags_rule.flags}")