mirror of
https://github.com/python/cpython.git
synced 2025-10-31 21:51:50 +00:00
also modified check_all function to suppress all warnings since they aren't relevant to what this test is doing (allows quiet checking of regsub, for instance)
386 lines
11 KiB
Python
386 lines
11 KiB
Python
#
|
|
# Secret Labs' Regular Expression Engine
|
|
#
|
|
# convert template to internal format
|
|
#
|
|
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
|
|
#
|
|
# See the sre.py file for information on usage and redistribution.
|
|
#
|
|
|
|
import _sre
|
|
|
|
from sre_constants import *
|
|
|
|
# Only compile() is exported by a star-import of this module.
__all__ = ["compile"]

# The constants imported above must agree with the _sre extension module;
# this is checked at import time (when assertions are enabled).
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# Upper bound used below when clamping pattern widths and when deciding
# where one bitmap word ends (65535).
MAXCODE = 0xFFFF
|
|
|
|
def _compile(code, pattern, flags):
    """Compile a (sub)pattern, appending the generated code words to *code*.

    code    -- list of code words; it is extended in place
    pattern -- iterable of (op, av) pairs (operator and its argument)
    flags   -- bitmask of SRE_FLAG_* values

    Several operators emit a placeholder word right after the opcode and
    later overwrite it with the distance from that placeholder to the end
    of the code generated for the operator (the "skip" value).

    Raises error for repeat operators when SRE_FLAG_TEMPLATE is set, for
    look-behind assertions whose minimum and maximum width differ, and
    (via _simple) for repeats with nothing to repeat.  Raises ValueError
    for an operator that is not handled here.
    """
    emit = code.append
    for op, av in pattern:
        if op in (LITERAL, NOT_LITERAL):
            if flags & SRE_FLAG_IGNORECASE:
                # use the opcode OP_IGNORE maps this one to, and store
                # the literal as returned by _sre.getlower
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                # flags is bound as a default argument so the helper does
                # not depend on nested-scope lookup
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = lambda x: x
            skip = len(code)
            emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator")
            # _simple() is evaluated first on purpose: it also raises
            # "nothing to repeat" for both greedy and minimizing repeats
            if _simple(av) and op == MAX_REPEAT:
                emit(OPCODES[REPEAT_ONE])
                skip = len(code)
                emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = len(code) - skip
            else:
                emit(OPCODES[REPEAT])
                skip = len(code)
                emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = len(code) - skip
                if op == MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # av[0] is the group number (false for a group without one);
            # marks 2*(n-1) and 2*(n-1)+1 bracket the group's code
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in (SUCCESS, FAILURE):
            emit(OPCODES[op])
        elif op in (ASSERT, ASSERT_NOT):
            emit(OPCODES[op])
            skip = len(code)
            emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = len(code)
            emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                # positions without a multiline variant map to themselves
                emit(ATCODES[AT_MULTILINE.get(av, av)])
            else:
                emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            jumps = []
            for alternative in av[1]:
                skip = len(code)
                emit(0)
                _compile(code, alternative, flags)
                emit(OPCODES[JUMP])
                # remember where this alternative's jump offset lives
                jumps.append(len(code))
                emit(0)
                code[skip] = len(code) - skip
            emit(0) # end of branch
            # every alternative jumps to the word after the branch
            for position in jumps:
                code[position] = len(code) - position
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        else:
            raise ValueError("unsupported operand type", op)
|
|
|
|
def _compile_charset(charset, flags, code, fixup=None):
    """Append the code for a character set subprogram to *code*.

    charset -- sequence of (op, av) set items; it is first passed
               through _optimize_charset
    flags   -- SRE_FLAG_* bitmask; selects the locale or unicode variant
               of CATEGORY items
    code    -- list of code words, extended in place
    fixup   -- optional callable applied to LITERAL values and RANGE
               endpoints; the identity function when omitted

    The subprogram is terminated with the FAILURE opcode.  Raises error
    for a set operator that is not handled here.
    """
    append = code.append
    if not fixup:
        def fixup(value):
            return value
    for opcode, arg in _optimize_charset(charset, fixup):
        # the opcode is written before its argument is examined
        append(OPCODES[opcode])
        if opcode is NEGATE:
            # NEGATE carries no argument words
            continue
        if opcode is LITERAL:
            append(fixup(arg))
        elif opcode is RANGE:
            append(fixup(arg[0]))
            append(fixup(arg[1]))
        elif opcode is CHARSET:
            code.extend(arg)
        elif opcode is CATEGORY:
            category = arg
            if flags & SRE_FLAG_LOCALE:
                category = CH_LOCALE[arg]
            elif flags & SRE_FLAG_UNICODE:
                category = CH_UNICODE[arg]
            append(CHCODES[category])
        else:
            raise error("internal: unsupported set operator")
    append(OPCODES[FAILURE])
|
|
|
|
def _optimize_charset(charset, fixup):
    """Return a more compact equivalent of *charset* when one is found.

    charset -- sequence of (op, av) set items
    fixup   -- callable applied to LITERAL values and RANGE endpoints
               before they are recorded

    The original *charset* is returned unchanged when it contains a
    CATEGORY item, when a character falls outside the 256-entry map
    (IndexError), or when the literal/range form is not shorter than
    the input.  Otherwise a new list is returned holding any NEGATE
    items followed by either up to two LITERAL/RANGE items or a single
    CHARSET bitmap.
    """
    optimized = []
    present = [0] * 256
    try:
        for op, av in charset:
            if op is NEGATE:
                optimized.append((op, av))
            elif op is LITERAL:
                present[fixup(av)] = 1
            elif op is RANGE:
                first = fixup(av[0])
                last = fixup(av[1])
                for ch in range(first, last + 1):
                    present[ch] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters
        return charset

    # collect runs of consecutive members as (start, length) pairs
    runs = []
    start = 0
    length = 0
    for position in range(len(present)):
        if present[position]:
            if length == 0:
                start = position
            length = length + 1
        elif length:
            runs.append((start, length))
            length = 0
    if length:
        runs.append((start, length))

    if len(runs) > 2:
        # use bitmap: pack the map into words, starting a new word once
        # the bit value would exceed MAXCODE
        words = []
        bit = 1
        word = 0
        for member in present:
            if member:
                word = word + bit
            bit = bit << 1
            if bit > MAXCODE:
                words.append(word)
                bit = 1
                word = 0
        optimized.append((CHARSET, words))
        return optimized

    # use literal/range
    for start, length in runs:
        if length == 1:
            optimized.append((LITERAL, start))
        else:
            optimized.append((RANGE, (start, start + length - 1)))
    if len(optimized) < len(charset):
        return optimized
    return charset
|
|
|
|
def _simple(av):
    """Tell whether the repeat argument *av* has a "simple" body.

    av[2] is the repeated subpattern.  The body counts as simple when
    its minimum and maximum width are both 1 and its first item is not
    a SUBPATTERN.  Raises error("nothing to repeat") when the body's
    width range is 0..MAXREPEAT.
    """
    body = av[2]
    lo, hi = body.getwidth()
    if lo == 0 and hi == MAXREPEAT:
        raise error("nothing to repeat")
    return lo == 1 and hi == 1 and body[0][0] != SUBPATTERN
|
|
|
|
def _branch_literal_charset(alternatives):
    """Collect the leading LITERAL of every branch alternative.

    Returns a list of (LITERAL, value) items, one per alternative, or an
    empty list as soon as an alternative is empty or does not start with
    a LITERAL (no charset prefix can be derived in that case).
    """
    literals = []
    for alternative in alternatives:
        if not alternative:
            return []
        op, av = alternative[0]
        if op is not LITERAL:
            return []
        literals.append((op, av))
    return literals


def _compile_info(code, pattern, flags):
    """Append an INFO block for *pattern* to *code*.

    code    -- list of code words, extended in place
    pattern -- parsed pattern providing getwidth() and a .data list of
               (op, av) items
    flags   -- SRE_FLAG_* bitmask

    The block holds a flag mask, the minimum and maximum pattern width
    (clamped using MAXCODE) and, when one can be found, either a literal
    prefix with its overlap table or a character set for the first
    character.  Nothing is emitted when the minimum width is 0.  Prefix
    and charset detection is skipped entirely for case-insensitive
    patterns.
    """
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefix_skip = 0
    charset = [] # filled in only when no literal prefix is found
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                # prefix_skip only advances while every prefix item so
                # far was a top-level literal
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefix.append(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                op, av = av[1][0]
                if op is LITERAL:
                    prefix.append(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charset.append((op, av))
                elif op is BRANCH:
                    charset = _branch_literal_charset(av[1])
            elif op is BRANCH:
                charset = _branch_literal_charset(av[1])
            elif op is IN:
                charset = av
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    # placeholder, patched at the end with the size of the block
    skip = len(code)
    emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            # the whole pattern is the literal prefix
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
        table = [-1] + ([0]*len(prefix))
        for i in range(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        # compile with the real flags so CATEGORY items pick the same
        # locale/unicode variant as the pattern body compiled by _compile
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip
|
|
|
|
# Types that compile() treats as pattern source text: the type of a
# plain string literal, plus the unicode type when the interpreter
# defines one.
STRING_TYPES = [type("")]

try:
    unicode
except NameError:
    # this interpreter has no separate unicode type
    pass
else:
    STRING_TYPES.append(type(unicode("")))
|
|
|
|
def _code(p, flags):
    """Return the complete list of code words for parsed pattern *p*.

    The flags stored on p.pattern are OR-ed with *flags*; the result is
    used for both the info block and the pattern body.  The program is
    terminated with the SUCCESS opcode.
    """
    combined_flags = p.pattern.flags | flags
    program = []

    # info block first, then the pattern itself
    _compile_info(program, p, combined_flags)
    _compile(program, p.data, combined_flags)

    program.append(OPCODES[SUCCESS])
    return program
|
|
|
|
def compile(p, flags=0):
    """Convert a pattern to the object returned by _sre.compile.

    p     -- either pattern source text (an instance of one of the
             STRING_TYPES), which is parsed with sre_parse.parse, or an
             already parsed pattern
    flags -- SRE_FLAG_* bitmask passed to the parser and the compiler

    Raises AssertionError when the pattern has more than 100 groups.
    """
    if type(p) in STRING_TYPES:
        import sre_parse
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        # no source text available for an already parsed pattern
        pattern = None

    code = _code(p, flags)

    # XXX: get rid of this limitation!
    # Raised explicitly (not via an assert statement) so the check is
    # still performed when Python runs with -O.
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for name, index in groupindex.items():
        indexgroup[index] = name

    return _sre.compile(
        pattern, flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )
|