mirror of
https://github.com/python/cpython.git
synced 2026-06-29 04:10:54 +00:00
Move the compile-time optimizations (_optimize_charset, _compile_charset, _simple, _compile_info and the literal/charset prefix helpers) out of _compiler.py into a new Lib/re/_optimizer.py. _compiler.py keeps only the bytecode emitter and imports them. This is groundwork for a follow-up optimization; there is no behavior change. Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
411 lines
14 KiB
Python
411 lines
14 KiB
Python
#
|
|
# Secret Labs' Regular Expression Engine
|
|
#
|
|
# convert template to internal format
|
|
#
|
|
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
|
|
#
|
|
# See the __init__.py file for information on usage and redistribution.
|
|
#
|
|
|
|
"""Internal support module for sre"""
|
|
|
|
import _sre
|
|
from . import _parser
|
|
from ._constants import *
|
|
from ._casefix import _EXTRA_CASES
|
|
from ._optimizer import (
|
|
_combine_flags, _compile_charset, _optimize_charset, _compile_info,
|
|
_simple, _CHARSET_ALL, _CODEBITS, MAXCODE,
|
|
)
|
|
|
|
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
|
|
|
|
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
|
|
_SUCCESS_CODES = {SUCCESS, FAILURE}
|
|
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
|
|
|
|
_REPEATING_CODES = {
|
|
MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
|
|
MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE),
|
|
POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
|
|
}
|
|
|
|
def _compile(code, pattern, flags):
    """Append the bytecode for a parsed (sub)pattern to *code*.

    internal: compile a (sub)pattern

    *code* is the list of code words being built; it is extended in place
    and nothing is returned.  *pattern* is iterated as ``(op, av)`` pairs.
    *flags* is the SRE_FLAG_* bit mask in effect for this (sub)pattern.

    Nested items (repeat bodies, groups, assertions, branch alternatives,
    conditional arms) are compiled by recursive calls.  Wherever an opcode
    is followed by a skip word, a placeholder 0 is emitted first and then
    patched once the length of the enclosed code is known.

    Raises PatternError for a look-behind whose minimum width exceeds
    MAXCODE, for a look-behind that is not fixed-width, and for an
    opcode this function does not handle.
    """
    # Local aliases: this loop runs once per pattern item.
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES

    # Case-folding helpers are only set up for non-locale IGNORECASE.
    # ``fixes`` stays None in ASCII mode; the branches below use its
    # falsiness to tell ASCII from Unicode case-insensitive matching.
    iscased = None
    tolower = None
    fixes = None
    if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
        if flags & SRE_FLAG_UNICODE:
            iscased = _sre.unicode_iscased
            tolower = _sre.unicode_tolower
            fixes = _EXTRA_CASES
        else:
            iscased = _sre.ascii_iscased
            tolower = _sre.ascii_tolower

    for op, av in pattern:
        if op in LITERAL_CODES:
            if not flags & SRE_FLAG_IGNORECASE:
                emit(op)
                emit(av)
            elif flags & SRE_FLAG_LOCALE:
                emit(OP_LOCALE_IGNORE[op])
                emit(av)
            elif not iscased(av):
                # Uncased character: a plain comparison is enough.
                emit(op)
                emit(av)
            else:
                lo = tolower(av)
                if not fixes:  # ascii
                    emit(OP_IGNORE[op])
                    emit(lo)
                elif lo not in fixes:
                    emit(OP_UNICODE_IGNORE[op])
                    emit(lo)
                else:
                    # The character has extra case variants: emit a small
                    # set holding the lowered char and all its variants.
                    emit(IN_UNI_IGNORE)
                    skip = _len(code)
                    emit(0)
                    if op is NOT_LITERAL:
                        emit(NEGATE)
                    for k in (lo,) + fixes[lo]:
                        emit(LITERAL)
                        emit(k)
                    emit(FAILURE)
                    code[skip] = _len(code) - skip
        elif op is IN:
            charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
            if not charset:
                emit(FAILURE)
            elif charset == _CHARSET_ALL:
                emit(ANY_ALL)
            else:
                if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
                    emit(IN_LOC_IGNORE)
                elif not hascased:
                    emit(IN)
                elif not fixes:  # ascii
                    emit(IN_IGNORE)
                else:
                    emit(IN_UNI_IGNORE)
                skip = _len(code)
                emit(0)
                _compile_charset(charset, flags, code)
                code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(ANY_ALL)
            else:
                emit(ANY)
        elif op in REPEATING_CODES:
            if _simple(av[2]):
                # Single-item form: body is closed by SUCCESS inside the skip.
                emit(REPEATING_CODES[op][2])
                skip = _len(code)
                emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(SUCCESS)
                code[skip] = _len(code) - skip
            else:
                # General form: the closing opcode follows the skipped body.
                emit(REPEATING_CODES[op][0])
                skip = _len(code)
                emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                emit(REPEATING_CODES[op][1])
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            if group:
                emit(MARK)
                emit((group-1)*2)
            _compile(code, p, _combine_flags(flags, add_flags, del_flags))
            if group:
                emit(MARK)
                emit((group-1)*2+1)
        elif op is ATOMIC_GROUP:
            # Atomic Groups are handled by starting with an Atomic
            # Group op code, then putting in the atomic group pattern
            # and finally a success op code to tell any repeat
            # operations within the Atomic Group to stop eating and
            # pop their stack if they reach it
            emit(ATOMIC_GROUP)
            skip = _len(code)
            emit(0)
            _compile(code, av, flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op in SUCCESS_CODES:
            emit(op)
        elif op in ASSERT_CODES:
            emit(op)
            skip = _len(code)
            emit(0)
            if av[0] >= 0:
                emit(0)  # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo > MAXCODE:
                    raise PatternError("looks too much behind")
                if lo != hi:
                    raise PatternError("look-behind requires fixed-width pattern")
                emit(lo)  # look behind
            _compile(code, av[1], flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(op)
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(av)
        elif op is BRANCH:
            emit(op)
            # Positions of the JUMP operands that must land after the
            # whole alternation; patched once its end is known.
            jump_slots = []
            for alternative in av[1]:
                skip = _len(code)
                emit(0)
                _compile(code, alternative, flags)
                emit(JUMP)
                jump_slots.append(_len(code))
                emit(0)
                code[skip] = _len(code) - skip
            emit(FAILURE)  # end of branch
            for slot in jump_slots:
                code[slot] = _len(code) - slot
        elif op is CATEGORY:
            emit(op)
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(av)
        elif op is GROUPREF:
            if not flags & SRE_FLAG_IGNORECASE:
                emit(op)
            elif flags & SRE_FLAG_LOCALE:
                emit(GROUPREF_LOC_IGNORE)
            elif not fixes:  # ascii
                emit(GROUPREF_IGNORE)
            else:
                emit(GROUPREF_UNI_IGNORE)
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(op)
            emit(av[0]-1)
            skipyes = _len(code)
            emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(JUMP)
                skipno = _len(code)
                emit(0)
                # The +1 accounts for the group-index word that precedes
                # the skip word in the emitted code.
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise PatternError(f"internal: unsupported operand type {op!r}")
|
|
|
|
def isstring(obj):
    """Return True when *obj* is a ``str`` or ``bytes`` instance."""
    return isinstance(obj, str) or isinstance(obj, bytes)
|
|
|
|
def _code(p, flags):
    """Return the complete code list for parsed pattern *p*.

    The flags recorded on ``p.state`` are OR-ed with *flags*; the result
    is used for both the info block and the pattern body.  The returned
    list is the info block, then the compiled ``p.data``, then a final
    SUCCESS opcode.
    """
    effective_flags = p.state.flags | flags
    program = []

    # Info block first, then the pattern itself.
    _compile_info(program, p, effective_flags)
    _compile(program, p.data, effective_flags)

    program.append(SUCCESS)
    return program
|
|
|
|
def _hex_code(code):
    """Format the code words in *code* as a bracketed list of hex literals.

    Each word is zero-padded to ``_sre.CODESIZE*2 + 2`` characters, a
    width that includes the ``0x`` prefix.
    """
    width = _sre.CODESIZE*2 + 2
    words = ['%#0*x' % (width, word) for word in code]
    return '[%s]' % ', '.join(words)
|
|
|
|
def dis(code):
|
|
import sys
|
|
|
|
labels = set()
|
|
level = 0
|
|
offset_width = len(str(len(code) - 1))
|
|
|
|
def dis_(start, end):
|
|
def print_(*args, to=None):
|
|
if to is not None:
|
|
labels.add(to)
|
|
args += ('(to %d)' % (to,),)
|
|
print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
|
|
end=' '*(level-1))
|
|
print(*args)
|
|
|
|
def print_2(*args):
|
|
print(end=' '*(offset_width + 2*level))
|
|
print(*args)
|
|
|
|
nonlocal level
|
|
level += 1
|
|
i = start
|
|
while i < end:
|
|
start = i
|
|
op = code[i]
|
|
i += 1
|
|
op = OPCODES[op]
|
|
if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
|
|
MAX_UNTIL, MIN_UNTIL, NEGATE):
|
|
print_(op)
|
|
elif op in (LITERAL, NOT_LITERAL,
|
|
LITERAL_IGNORE, NOT_LITERAL_IGNORE,
|
|
LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
|
|
LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
|
|
arg = code[i]
|
|
i += 1
|
|
print_(op, '%#02x (%r)' % (arg, chr(arg)))
|
|
elif op is AT:
|
|
arg = code[i]
|
|
i += 1
|
|
arg = str(ATCODES[arg])
|
|
assert arg[:3] == 'AT_'
|
|
print_(op, arg[3:])
|
|
elif op is CATEGORY:
|
|
arg = code[i]
|
|
i += 1
|
|
arg = str(CHCODES[arg])
|
|
assert arg[:9] == 'CATEGORY_'
|
|
print_(op, arg[9:])
|
|
elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
|
|
skip = code[i]
|
|
print_(op, skip, to=i+skip)
|
|
dis_(i+1, i+skip)
|
|
i += skip
|
|
elif op in (RANGE, RANGE_UNI_IGNORE):
|
|
lo, hi = code[i: i+2]
|
|
i += 2
|
|
print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
|
|
elif op is CHARSET:
|
|
print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
|
|
i += 256//_CODEBITS
|
|
elif op is BIGCHARSET:
|
|
arg = code[i]
|
|
i += 1
|
|
mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
|
|
for x in code[i: i + 256//_sre.CODESIZE]))
|
|
print_(op, arg, mapping)
|
|
i += 256//_sre.CODESIZE
|
|
level += 1
|
|
for j in range(arg):
|
|
print_2(_hex_code(code[i: i + 256//_CODEBITS]))
|
|
i += 256//_CODEBITS
|
|
level -= 1
|
|
elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
|
|
GROUPREF_LOC_IGNORE):
|
|
arg = code[i]
|
|
i += 1
|
|
print_(op, arg)
|
|
elif op is JUMP:
|
|
skip = code[i]
|
|
print_(op, skip, to=i+skip)
|
|
i += 1
|
|
elif op is BRANCH:
|
|
skip = code[i]
|
|
print_(op, skip, to=i+skip)
|
|
while skip:
|
|
dis_(i+1, i+skip)
|
|
i += skip
|
|
start = i
|
|
skip = code[i]
|
|
if skip:
|
|
print_('branch', skip, to=i+skip)
|
|
else:
|
|
print_(FAILURE)
|
|
i += 1
|
|
elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE,
|
|
POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE):
|
|
skip, min, max = code[i: i+3]
|
|
if max == MAXREPEAT:
|
|
max = 'MAXREPEAT'
|
|
print_(op, skip, min, max, to=i+skip)
|
|
dis_(i+3, i+skip)
|
|
i += skip
|
|
elif op is GROUPREF_EXISTS:
|
|
arg, skip = code[i: i+2]
|
|
print_(op, arg, skip, to=i+skip)
|
|
i += 2
|
|
elif op in (ASSERT, ASSERT_NOT):
|
|
skip, arg = code[i: i+2]
|
|
print_(op, skip, arg, to=i+skip)
|
|
dis_(i+2, i+skip)
|
|
i += skip
|
|
elif op is ATOMIC_GROUP:
|
|
skip = code[i]
|
|
print_(op, skip, to=i+skip)
|
|
dis_(i+1, i+skip)
|
|
i += skip
|
|
elif op is INFO:
|
|
skip, flags, min, max = code[i: i+4]
|
|
if max == MAXREPEAT:
|
|
max = 'MAXREPEAT'
|
|
print_(op, skip, bin(flags), min, max, to=i+skip)
|
|
start = i+4
|
|
if flags & SRE_INFO_PREFIX:
|
|
prefix_len, prefix_skip = code[i+4: i+6]
|
|
print_2(' prefix_skip', prefix_skip)
|
|
start = i + 6
|
|
prefix = code[start: start+prefix_len]
|
|
print_2(' prefix',
|
|
'[%s]' % ', '.join('%#02x' % x for x in prefix),
|
|
'(%r)' % ''.join(map(chr, prefix)))
|
|
start += prefix_len
|
|
print_2(' overlap', code[start: start+prefix_len])
|
|
start += prefix_len
|
|
if flags & SRE_INFO_CHARSET:
|
|
level += 1
|
|
print_2('in')
|
|
dis_(start, i+skip)
|
|
level -= 1
|
|
i += skip
|
|
else:
|
|
raise ValueError(op)
|
|
|
|
level -= 1
|
|
|
|
dis_(0, len(code))
|
|
|
|
|
|
def compile(p, flags=0):
    """Convert a pattern to the engine's internal format.

    internal: convert pattern list to internal format

    *p* is either pattern text (``str`` or ``bytes``), which is parsed
    first, or an already parsed pattern object.  When SRE_FLAG_DEBUG is
    set in *flags*, a disassembly of the generated code is printed.
    Returns the result of ``_sre.compile()``.
    """
    # Only textual input is handed on as the pattern source.
    pattern = None
    if isstring(p):
        pattern = p
        p = _parser.parse(pattern, flags)

    code = _code(p, flags)

    if flags & SRE_FLAG_DEBUG:
        print()
        dis(code)

    # Group lookup in both directions: name -> index and index -> name.
    state = p.state
    groupindex = state.groupdict
    names_by_index = [None] * state.groups
    for name, number in groupindex.items():
        names_by_index[number] = name

    return _sre.compile(
        pattern, flags | state.flags, code,
        state.groups-1,
        groupindex, tuple(names_by_index)
    )
|