mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 13:41:24 +00:00 
			
		
		
		
	bpo-47152: Convert the re module into a package (GH-32177)
The sre_* modules are now deprecated.
This commit is contained in:
		
							parent
							
								
									4ed8a9a589
								
							
						
					
					
						commit
						1be3260a90
					
				
					 16 changed files with 2235 additions and 2182 deletions
				
			
		|  | @ -96,14 +96,14 @@ Sample output (may vary depending on the architecture):: | ||||||
|     Loaded modules: |     Loaded modules: | ||||||
|     _types: |     _types: | ||||||
|     copyreg:  _inverted_registry,_slotnames,__all__ |     copyreg:  _inverted_registry,_slotnames,__all__ | ||||||
|     sre_compile:  isstring,_sre,_optimize_unicode |     re._compiler:  isstring,_sre,_optimize_unicode | ||||||
|     _sre: |     _sre: | ||||||
|     sre_constants:  REPEAT_ONE,makedict,AT_END_LINE |     re._constants:  REPEAT_ONE,makedict,AT_END_LINE | ||||||
|     sys: |     sys: | ||||||
|     re:  __module__,finditer,_expand |     re:  __module__,finditer,_expand | ||||||
|     itertools: |     itertools: | ||||||
|     __main__:  re,itertools,baconhameggs |     __main__:  re,itertools,baconhameggs | ||||||
|     sre_parse:  _PATTERNENDERS,SRE_FLAG_UNICODE |     re._parser:  _PATTERNENDERS,SRE_FLAG_UNICODE | ||||||
|     array: |     array: | ||||||
|     types:  __module__,IntType,TypeType |     types:  __module__,IntType,TypeType | ||||||
|     --------------------------------------------------- |     --------------------------------------------------- | ||||||
|  |  | ||||||
|  | @ -73,12 +73,12 @@ the following:: | ||||||
|    ncalls  tottime  percall  cumtime  percall filename:lineno(function) |    ncalls  tottime  percall  cumtime  percall filename:lineno(function) | ||||||
|         1    0.000    0.000    0.002    0.002 {built-in method builtins.exec} |         1    0.000    0.000    0.002    0.002 {built-in method builtins.exec} | ||||||
|         1    0.000    0.000    0.001    0.001 <string>:1(<module>) |         1    0.000    0.000    0.001    0.001 <string>:1(<module>) | ||||||
|         1    0.000    0.000    0.001    0.001 re.py:250(compile) |         1    0.000    0.000    0.001    0.001 __init__.py:250(compile) | ||||||
|         1    0.000    0.000    0.001    0.001 re.py:289(_compile) |         1    0.000    0.000    0.001    0.001 __init__.py:289(_compile) | ||||||
|         1    0.000    0.000    0.000    0.000 sre_compile.py:759(compile) |         1    0.000    0.000    0.000    0.000 _compiler.py:759(compile) | ||||||
|         1    0.000    0.000    0.000    0.000 sre_parse.py:937(parse) |         1    0.000    0.000    0.000    0.000 _parser.py:937(parse) | ||||||
|         1    0.000    0.000    0.000    0.000 sre_compile.py:598(_code) |         1    0.000    0.000    0.000    0.000 _compiler.py:598(_code) | ||||||
|         1    0.000    0.000    0.000    0.000 sre_parse.py:435(_parse_sub) |         1    0.000    0.000    0.000    0.000 _parser.py:435(_parse_sub) | ||||||
| 
 | 
 | ||||||
| The first line indicates that 214 calls were monitored.  Of those calls, 207 | The first line indicates that 214 calls were monitored.  Of those calls, 207 | ||||||
| were :dfn:`primitive`, meaning that the call was not induced via recursion. The | were :dfn:`primitive`, meaning that the call was not induced via recursion. The | ||||||
|  |  | ||||||
|  | @ -532,6 +532,10 @@ Deprecated | ||||||
|   be able to parse Python 3.10 or newer. See the :pep:`617` (New PEG parser for |   be able to parse Python 3.10 or newer. See the :pep:`617` (New PEG parser for | ||||||
|   CPython).  (Contributed by Victor Stinner in :issue:`40360`.) |   CPython).  (Contributed by Victor Stinner in :issue:`40360`.) | ||||||
| 
 | 
 | ||||||
|  | * Undocumented modules ``sre_compile``, ``sre_constants`` and ``sre_parse`` | ||||||
|  |   are now deprecated. | ||||||
|  |   (Contributed by Serhiy Storchaka in :issue:`47152`.) | ||||||
|  | 
 | ||||||
| * :class:`webbrowser.MacOSX` is deprecated and will be removed in Python 3.13. | * :class:`webbrowser.MacOSX` is deprecated and will be removed in Python 3.13. | ||||||
|   It is untested and undocumented and also not used by webbrowser itself. |   It is untested and undocumented and also not used by webbrowser itself. | ||||||
|   (Contributed by Dong-hee Na in :issue:`42255`.) |   (Contributed by Dong-hee Na in :issue:`42255`.) | ||||||
|  |  | ||||||
|  | @ -122,8 +122,7 @@ | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| import enum | import enum | ||||||
| import sre_compile | from . import _compiler, _parser | ||||||
| import sre_parse |  | ||||||
| import functools | import functools | ||||||
| try: | try: | ||||||
|     import _locale |     import _locale | ||||||
|  | @ -146,21 +145,21 @@ | ||||||
| @enum._simple_enum(enum.IntFlag, boundary=enum.KEEP) | @enum._simple_enum(enum.IntFlag, boundary=enum.KEEP) | ||||||
| class RegexFlag: | class RegexFlag: | ||||||
|     NOFLAG = 0 |     NOFLAG = 0 | ||||||
|     ASCII = A = sre_compile.SRE_FLAG_ASCII # assume ascii "locale" |     ASCII = A = _compiler.SRE_FLAG_ASCII # assume ascii "locale" | ||||||
|     IGNORECASE = I = sre_compile.SRE_FLAG_IGNORECASE # ignore case |     IGNORECASE = I = _compiler.SRE_FLAG_IGNORECASE # ignore case | ||||||
|     LOCALE = L = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale |     LOCALE = L = _compiler.SRE_FLAG_LOCALE # assume current 8-bit locale | ||||||
|     UNICODE = U = sre_compile.SRE_FLAG_UNICODE # assume unicode "locale" |     UNICODE = U = _compiler.SRE_FLAG_UNICODE # assume unicode "locale" | ||||||
|     MULTILINE = M = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline |     MULTILINE = M = _compiler.SRE_FLAG_MULTILINE # make anchors look for newline | ||||||
|     DOTALL = S = sre_compile.SRE_FLAG_DOTALL # make dot match newline |     DOTALL = S = _compiler.SRE_FLAG_DOTALL # make dot match newline | ||||||
|     VERBOSE = X = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments |     VERBOSE = X = _compiler.SRE_FLAG_VERBOSE # ignore whitespace and comments | ||||||
|     # sre extensions (experimental, don't rely on these) |     # sre extensions (experimental, don't rely on these) | ||||||
|     TEMPLATE = T = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking |     TEMPLATE = T = _compiler.SRE_FLAG_TEMPLATE # disable backtracking | ||||||
|     DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation |     DEBUG = _compiler.SRE_FLAG_DEBUG # dump pattern after compilation | ||||||
|     __str__ = object.__str__ |     __str__ = object.__str__ | ||||||
|     _numeric_repr_ = hex |     _numeric_repr_ = hex | ||||||
| 
 | 
 | ||||||
| # sre exception | # sre exception | ||||||
| error = sre_compile.error | error = _compiler.error | ||||||
| 
 | 
 | ||||||
| # -------------------------------------------------------------------- | # -------------------------------------------------------------------- | ||||||
| # public interface | # public interface | ||||||
|  | @ -257,8 +256,8 @@ def escape(pattern): | ||||||
|         pattern = str(pattern, 'latin1') |         pattern = str(pattern, 'latin1') | ||||||
|         return pattern.translate(_special_chars_map).encode('latin1') |         return pattern.translate(_special_chars_map).encode('latin1') | ||||||
| 
 | 
 | ||||||
| Pattern = type(sre_compile.compile('', 0)) | Pattern = type(_compiler.compile('', 0)) | ||||||
| Match = type(sre_compile.compile('', 0).match('')) | Match = type(_compiler.compile('', 0).match('')) | ||||||
| 
 | 
 | ||||||
| # -------------------------------------------------------------------- | # -------------------------------------------------------------------- | ||||||
| # internals | # internals | ||||||
|  | @ -279,9 +278,9 @@ def _compile(pattern, flags): | ||||||
|             raise ValueError( |             raise ValueError( | ||||||
|                 "cannot process flags argument with a compiled pattern") |                 "cannot process flags argument with a compiled pattern") | ||||||
|         return pattern |         return pattern | ||||||
|     if not sre_compile.isstring(pattern): |     if not _compiler.isstring(pattern): | ||||||
|         raise TypeError("first argument must be string or compiled pattern") |         raise TypeError("first argument must be string or compiled pattern") | ||||||
|     p = sre_compile.compile(pattern, flags) |     p = _compiler.compile(pattern, flags) | ||||||
|     if not (flags & DEBUG): |     if not (flags & DEBUG): | ||||||
|         if len(_cache) >= _MAXCACHE: |         if len(_cache) >= _MAXCACHE: | ||||||
|             # Drop the oldest item |             # Drop the oldest item | ||||||
|  | @ -295,12 +294,12 @@ def _compile(pattern, flags): | ||||||
| @functools.lru_cache(_MAXCACHE) | @functools.lru_cache(_MAXCACHE) | ||||||
| def _compile_repl(repl, pattern): | def _compile_repl(repl, pattern): | ||||||
|     # internal: compile replacement pattern |     # internal: compile replacement pattern | ||||||
|     return sre_parse.parse_template(repl, pattern) |     return _parser.parse_template(repl, pattern) | ||||||
| 
 | 
 | ||||||
| def _expand(pattern, match, template): | def _expand(pattern, match, template): | ||||||
|     # internal: Match.expand implementation hook |     # internal: Match.expand implementation hook | ||||||
|     template = sre_parse.parse_template(template, pattern) |     template = _parser.parse_template(template, pattern) | ||||||
|     return sre_parse.expand_template(template, match) |     return _parser.expand_template(template, match) | ||||||
| 
 | 
 | ||||||
| def _subx(pattern, template): | def _subx(pattern, template): | ||||||
|     # internal: Pattern.sub/subn implementation helper |     # internal: Pattern.sub/subn implementation helper | ||||||
|  | @ -309,7 +308,7 @@ def _subx(pattern, template): | ||||||
|         # literal replacement |         # literal replacement | ||||||
|         return template[1][0] |         return template[1][0] | ||||||
|     def filter(match, template=template): |     def filter(match, template=template): | ||||||
|         return sre_parse.expand_template(template, match) |         return _parser.expand_template(template, match) | ||||||
|     return filter |     return filter | ||||||
| 
 | 
 | ||||||
| # register myself for pickling | # register myself for pickling | ||||||
|  | @ -326,22 +325,22 @@ def _pickle(p): | ||||||
| 
 | 
 | ||||||
| class Scanner: | class Scanner: | ||||||
|     def __init__(self, lexicon, flags=0): |     def __init__(self, lexicon, flags=0): | ||||||
|         from sre_constants import BRANCH, SUBPATTERN |         from ._constants import BRANCH, SUBPATTERN | ||||||
|         if isinstance(flags, RegexFlag): |         if isinstance(flags, RegexFlag): | ||||||
|             flags = flags.value |             flags = flags.value | ||||||
|         self.lexicon = lexicon |         self.lexicon = lexicon | ||||||
|         # combine phrases into a compound pattern |         # combine phrases into a compound pattern | ||||||
|         p = [] |         p = [] | ||||||
|         s = sre_parse.State() |         s = _parser.State() | ||||||
|         s.flags = flags |         s.flags = flags | ||||||
|         for phrase, action in lexicon: |         for phrase, action in lexicon: | ||||||
|             gid = s.opengroup() |             gid = s.opengroup() | ||||||
|             p.append(sre_parse.SubPattern(s, [ |             p.append(_parser.SubPattern(s, [ | ||||||
|                 (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))), |                 (SUBPATTERN, (gid, 0, 0, _parser.parse(phrase, flags))), | ||||||
|                 ])) |                 ])) | ||||||
|             s.closegroup(gid, p[-1]) |             s.closegroup(gid, p[-1]) | ||||||
|         p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) |         p = _parser.SubPattern(s, [(BRANCH, (None, p))]) | ||||||
|         self.scanner = sre_compile.compile(p) |         self.scanner = _compiler.compile(p) | ||||||
|     def scan(self, string): |     def scan(self, string): | ||||||
|         result = [] |         result = [] | ||||||
|         append = result.append |         append = result.append | ||||||
							
								
								
									
										800
									
								
								Lib/re/_compiler.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										800
									
								
								Lib/re/_compiler.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,800 @@ | ||||||
|  | # | ||||||
|  | # Secret Labs' Regular Expression Engine | ||||||
|  | # | ||||||
|  | # convert template to internal format | ||||||
|  | # | ||||||
|  | # Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved. | ||||||
|  | # | ||||||
|  | # See the __init__.py file for information on usage and redistribution. | ||||||
|  | # | ||||||
|  | 
 | ||||||
|  | """Internal support module for sre""" | ||||||
|  | 
 | ||||||
|  | import _sre | ||||||
|  | from . import _parser | ||||||
|  | from ._constants import * | ||||||
|  | 
 | ||||||
# Fail at import time if the _sre extension reports a different MAGIC
# number than the MAGIC name in scope here (presumably star-imported from
# ._constants -- confirm).
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# Opcode groups.  _compile() tests membership in _LITERAL_CODES,
# _SUCCESS_CODES and _ASSERT_CODES with ``in``.
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}

# Maps each repeat operator to the opcodes _compile() emits for it:
#   [0] opcode that opens the general form,
#   [1] opcode emitted after the repeated body in the general form,
#   [2] single opcode used instead when _simple() accepts the body.
_REPEATING_CODES = {
    MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
    MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE),
    POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
}
|  | 
 | ||||||
# Sets of lowercase characters which have the same uppercase.
# Each tuple lists the code points of one such set.
_equivalences = (
    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
    (0x69, 0x131), # iı
    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
    (0x73, 0x17f), # sſ
    # MICRO SIGN, GREEK SMALL LETTER MU
    (0xb5, 0x3bc), # µμ
    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
    (0x345, 0x3b9, 0x1fbe), # \u0345ιι
    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
    (0x390, 0x1fd3), # ΐΐ
    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
    (0x3b0, 0x1fe3), # ΰΰ
    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
    (0x3b2, 0x3d0), # βϐ
    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
    (0x3b5, 0x3f5), # εϵ
    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
    (0x3b8, 0x3d1), # θϑ
    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
    (0x3ba, 0x3f0), # κϰ
    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
    (0x3c0, 0x3d6), # πϖ
    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
    (0x3c1, 0x3f1), # ρϱ
    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
    (0x3c2, 0x3c3), # ςσ
    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
    (0x3c6, 0x3d5), # φϕ
    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
    (0x1e61, 0x1e9b), # ṡẛ
    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
    (0xfb05, 0xfb06), # ﬅﬆ
)

# Maps the lowercase code to lowercase codes which have the same uppercase.
# Every member of every tuple above becomes a key; its value is the tuple of
# the *other* members of that set, e.g. 0x69 -> (0x131,) and
# 0x345 -> (0x3b9, 0x1fbe).
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
                     for t in _equivalences for i in t}
|  | 
 | ||||||
|  | def _combine_flags(flags, add_flags, del_flags, | ||||||
|  |                    TYPE_FLAGS=_parser.TYPE_FLAGS): | ||||||
|  |     if add_flags & TYPE_FLAGS: | ||||||
|  |         flags &= ~TYPE_FLAGS | ||||||
|  |     return (flags | add_flags) & ~del_flags | ||||||
|  | 
 | ||||||
def _compile(code, pattern, flags):
    """Append the opcodes for a parsed (sub)pattern to *code*.

    code    -- list that opcodes and their arguments are appended to; it is
               modified in place and nothing is returned.
    pattern -- iterable of ``(op, av)`` pairs (operator, argument).
    flags   -- SRE_FLAG_* bit mask that selects the case, locale, unicode,
               multiline and dotall variants of the emitted opcodes.

    Raises ``error`` for an operator that has no branch below, for a repeat
    operator when SRE_FLAG_TEMPLATE is set, and for a look-behind assertion
    whose getwidth() minimum and maximum differ.
    """
    # internal: compile a (sub)pattern
    # Local aliases for names used repeatedly in the loop below.
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    # Case helpers stay None unless IGNORECASE is set without LOCALE;
    # `fixes` is additionally set only in the UNICODE case.
    iscased = None
    tolower = None
    fixes = None
    if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
        if flags & SRE_FLAG_UNICODE:
            iscased = _sre.unicode_iscased
            tolower = _sre.unicode_tolower
            fixes = _ignorecase_fixes
        else:
            iscased = _sre.ascii_iscased
            tolower = _sre.ascii_tolower
    for op, av in pattern:
        if op in LITERAL_CODES:
            if not flags & SRE_FLAG_IGNORECASE:
                emit(op)
                emit(av)
            elif flags & SRE_FLAG_LOCALE:
                emit(OP_LOCALE_IGNORE[op])
                emit(av)
            elif not iscased(av):
                # Uncased character: the plain opcode is enough.
                emit(op)
                emit(av)
            else:
                lo = tolower(av)
                if not fixes:  # ascii
                    emit(OP_IGNORE[op])
                    emit(lo)
                elif lo not in fixes:
                    emit(OP_UNICODE_IGNORE[op])
                    emit(lo)
                else:
                    # The lowercase form has entries in `fixes`: emit a
                    # small set holding it and all of them.
                    emit(IN_UNI_IGNORE)
                    # Reserve a slot; it is overwritten below with the
                    # distance from the slot to the end of the set.
                    skip = _len(code); emit(0)
                    if op is NOT_LITERAL:
                        emit(NEGATE)
                    for k in (lo,) + fixes[lo]:
                        emit(LITERAL)
                        emit(k)
                    emit(FAILURE)
                    code[skip] = _len(code) - skip
        elif op is IN:
            charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
                emit(IN_LOC_IGNORE)
            elif not hascased:
                emit(IN)
            elif not fixes:  # ascii
                emit(IN_IGNORE)
            else:
                emit(IN_UNI_IGNORE)
            # Skip slot, backpatched once the charset has been emitted.
            skip = _len(code); emit(0)
            _compile_charset(charset, flags, code)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(ANY_ALL)
            else:
                emit(ANY)
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator %r" % (op,))
            # av is indexed as (av[0], av[1], av[2]); av[0] and av[1] are
            # emitted as-is and av[2] is compiled as the repeated body.
            if _simple(av[2]):
                # Single-opcode form: body followed by SUCCESS, all inside
                # the skip range.
                emit(REPEATING_CODES[op][2])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(SUCCESS)
                code[skip] = _len(code) - skip
            else:
                # General form: opening opcode, body, then the closing
                # opcode emitted after the skip range.
                emit(REPEATING_CODES[op][0])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                emit(REPEATING_CODES[op][1])
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            # A truthy group number gets a MARK pair around the body, with
            # indices (group-1)*2 and (group-1)*2+1.
            if group:
                emit(MARK)
                emit((group-1)*2)
            # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
            _compile(code, p, _combine_flags(flags, add_flags, del_flags))
            if group:
                emit(MARK)
                emit((group-1)*2+1)
        elif op is ATOMIC_GROUP:
            # Atomic Groups are handled by starting with an Atomic
            # Group op code, then putting in the atomic group pattern
            # and finally a success op code to tell any repeat
            # operations within the Atomic Group to stop eating and
            # pop their stack if they reach it
            emit(ATOMIC_GROUP)
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op in SUCCESS_CODES:
            emit(op)
        elif op in ASSERT_CODES:
            emit(op)
            skip = _len(code); emit(0)
            # av[0] >= 0 selects look-ahead (width argument 0); otherwise
            # the fixed width of the sub-pattern av[1] is emitted.
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(op)
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(op)
            # Translate the position code through the MULTILINE table
            # first, then through the LOCALE or UNICODE table; codes with
            # no entry are left unchanged.
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(av)
        elif op is BRANCH:
            emit(op)
            # Positions of the JUMP argument slots, one per alternative.
            tail = []
            tailappend = tail.append
            # NOTE: the loop variable deliberately rebinds `av`; av[1] is
            # evaluated once, before the first rebinding.
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(JUMP)
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(FAILURE) # end of branch
            # Point every JUMP at the position after the whole branch.
            # NOTE: the loop variable rebinds `tail`; the list is only
            # needed to create the iterator.
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(op)
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(av)
        elif op is GROUPREF:
            if not flags & SRE_FLAG_IGNORECASE:
                emit(op)
            elif flags & SRE_FLAG_LOCALE:
                emit(GROUPREF_LOC_IGNORE)
            elif not fixes:  # ascii
                emit(GROUPREF_IGNORE)
            else:
                emit(GROUPREF_UNI_IGNORE)
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(op)
            emit(av[0]-1)
            # av[1] is compiled first; av[2], when truthy, is compiled
            # after a JUMP that lets the first part skip over it.
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(JUMP)
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise error("internal: unsupported operand type %r" % (op,))
|  | 
 | ||||||
|  | def _compile_charset(charset, flags, code): | ||||||
|  |     # compile charset subprogram | ||||||
|  |     emit = code.append | ||||||
|  |     for op, av in charset: | ||||||
|  |         emit(op) | ||||||
|  |         if op is NEGATE: | ||||||
|  |             pass | ||||||
|  |         elif op is LITERAL: | ||||||
|  |             emit(av) | ||||||
|  |         elif op is RANGE or op is RANGE_UNI_IGNORE: | ||||||
|  |             emit(av[0]) | ||||||
|  |             emit(av[1]) | ||||||
|  |         elif op is CHARSET: | ||||||
|  |             code.extend(av) | ||||||
|  |         elif op is BIGCHARSET: | ||||||
|  |             code.extend(av) | ||||||
|  |         elif op is CATEGORY: | ||||||
|  |             if flags & SRE_FLAG_LOCALE: | ||||||
|  |                 emit(CH_LOCALE[av]) | ||||||
|  |             elif flags & SRE_FLAG_UNICODE: | ||||||
|  |                 emit(CH_UNICODE[av]) | ||||||
|  |             else: | ||||||
|  |                 emit(av) | ||||||
|  |         else: | ||||||
|  |             raise error("internal: unsupported set operator %r" % (op,)) | ||||||
|  |     emit(FAILURE) | ||||||
|  | 
 | ||||||
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
    """Rewrite a character set into a more compact list of set items.

    charset -- sequence of ``(op, av)`` pairs (its len() is used).
    iscased -- callable taking a character code; truthy for cased
               characters.  Only called when *fixup* is given.
    fixup   -- optional callable mapping a character code to the code that
               is recorded in the set (the caller in this file passes its
               ``tolower`` helper).
    fixes   -- optional mapping from a code to further codes that are added
               to the set together with it.

    Returns ``(items, hascased)``: *items* is either the original *charset*
    or a new list of ``(op, av)`` pairs, and *hascased* tells whether a
    cased character was seen while *fixup* was active.
    """
    # internal: optimize character set
    out = []    # result items; NEGATE items are collected here first
    tail = []   # items that are kept as they are and appended at the end
    charmap = bytearray(256)    # charmap[c] == 1 when code c is in the set
    hascased = False
    for op, av in charset:
        # Retry loop: an IndexError from charmap on a code >= 256 grows the
        # map to 65536 entries once and repeats the same item.
        while True:
            try:
                if op is LITERAL:
                    if fixup:
                        lo = fixup(av)
                        charmap[lo] = 1
                        if fixes and lo in fixes:
                            for k in fixes[lo]:
                                charmap[k] = 1
                        if not hascased and iscased(av):
                            hascased = True
                    else:
                        charmap[av] = 1
                elif op is RANGE:
                    r = range(av[0], av[1]+1)
                    if fixup:
                        if fixes:
                            for i in map(fixup, r):
                                charmap[i] = 1
                                if i in fixes:
                                    for k in fixes[i]:
                                        charmap[k] = 1
                        else:
                            for i in map(fixup, r):
                                charmap[i] = 1
                        if not hascased:
                            hascased = any(map(iscased, r))
                    else:
                        for i in r:
                            charmap[i] = 1
                elif op is NEGATE:
                    out.append((op, av))
                else:
                    tail.append((op, av))
            except IndexError:
                if len(charmap) == 256:
                    # character set contains non-UCS1 character codes
                    charmap += b'\0' * 0xff00
                    continue
                # Character set contains non-BMP character codes.
                if fixup:
                    hascased = True
                    # There are only two ranges of cased non-BMP characters:
                    # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
                    # and for both ranges RANGE_UNI_IGNORE works.
                    if op is RANGE:
                        op = RANGE_UNI_IGNORE
                # The item does not fit into the map: keep it as an
                # explicit item.
                tail.append((op, av))
            break

    # compress character map
    # Collect up to two runs of consecutive set entries as half-open
    # (start, end) pairs; finding a third run sets `runs` to None.
    runs = []
    q = 0
    while True:
        p = charmap.find(1, q)
        if p < 0:
            break
        if len(runs) >= 2:
            runs = None
            break
        q = charmap.find(0, p)
        if q < 0:
            # The run extends to the end of the map.
            runs.append((p, len(charmap)))
            break
        runs.append((p, q))
    if runs is not None:
        # use literal/range
        for p, q in runs:
            if q - p == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, q - 1)))
        out += tail
        # if the case was changed or new representation is more compact
        if hascased or len(out) < len(charset):
            return out, hascased
        # else original character set is good enough
        return charset, hascased

    # use bitmap
    if len(charmap) == 256:
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        out += tail
        return out, hascased

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed. Then, this bitmap is sliced into chunks of 256
    # characters, duplicate chunks are eliminated, and each chunk is
    # given a number. In the compiled expression, the charset is
    # represented by a 32-bit word sequence, consisting of one word for
    # the number of different chunks, a sequence of 256 bytes (64 words)
    # of chunk numbers indexed by their original chunk position, and a
    # sequence of 256-bit chunks (8 words each).

    # Compression is normally good: in a typical charset, large ranges of
    # Unicode will be either completely excluded (e.g. if only cyrillic
    # letters are to be matched), or completely included (e.g. if large
    # subranges of Kanji match). These ranges will be represented by
    # chunks of all one-bits or all zero-bits.

    # Matching can be also done efficiently: the more significant byte of
    # the Unicode character is an index into the chunk number, and the
    # less significant byte is a bit index in the chunk (just like the
    # CHARSET matching).

    charmap = bytes(charmap) # should be hashable
    comps = {}                  # chunk contents -> chunk number
    mapping = bytearray(256)    # chunk position -> chunk number
    block = 0                   # number of distinct chunks so far
    data = bytearray()          # distinct chunks, concatenated in order
    for i in range(0, 65536, 256):
        chunk = charmap[i: i + 256]
        if chunk in comps:
            mapping[i // 256] = comps[chunk]
        else:
            mapping[i // 256] = comps[chunk] = block
            block += 1
            data += chunk
    data = _mk_bitmap(data)
    # Prepend the chunk count and the chunk-number table
    # (_bytes_to_codes is defined elsewhere; presumably it packs the bytes
    # into code words -- confirm).
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
    return out, hascased
|  | 
 | ||||||
|  | _CODEBITS = _sre.CODESIZE * 8 | ||||||
|  | MAXCODE = (1 << _CODEBITS) - 1 | ||||||
|  | _BITS_TRANS = b'0' + b'1' * 255 | ||||||
|  | def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int): | ||||||
|  |     s = bits.translate(_BITS_TRANS)[::-1] | ||||||
|  |     return [_int(s[i - _CODEBITS: i], 2) | ||||||
|  |             for i in range(len(s), 0, -_CODEBITS)] | ||||||
|  | 
 | ||||||
|  | def _bytes_to_codes(b): | ||||||
|  |     # Convert block indices to word array | ||||||
|  |     a = memoryview(b).cast('I') | ||||||
|  |     assert a.itemsize == _sre.CODESIZE | ||||||
|  |     assert len(a) * a.itemsize == len(b) | ||||||
|  |     return a.tolist() | ||||||
|  | 
 | ||||||
def _simple(p):
    """Return True if subpattern *p* is a single "simple" operator.

    A pattern qualifies when it has exactly one item and that item is
    either one of _UNIT_CODES or an unnamed/unnumbered SUBPATTERN (its
    first argument is None) whose own body is simple.
    """
    if len(p) == 1:
        opcode, arg = p[0]
        if opcode is not SUBPATTERN:
            return opcode in _UNIT_CODES
        # look through a non-capturing group at what it contains
        return arg[0] is None and _simple(arg[-1])
    return False
|  | 
 | ||||||
|  | def _generate_overlap_table(prefix): | ||||||
|  |     """ | ||||||
|  |     Generate an overlap table for the following prefix. | ||||||
|  |     An overlap table is a table of the same size as the prefix which | ||||||
|  |     informs about the potential self-overlap for each index in the prefix: | ||||||
|  |     - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...] | ||||||
|  |     - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with | ||||||
|  |       prefix[0:k] | ||||||
|  |     """ | ||||||
|  |     table = [0] * len(prefix) | ||||||
|  |     for i in range(1, len(prefix)): | ||||||
|  |         idx = table[i - 1] | ||||||
|  |         while prefix[i] != prefix[idx]: | ||||||
|  |             if idx == 0: | ||||||
|  |                 table[i] = 0 | ||||||
|  |                 break | ||||||
|  |             idx = table[idx - 1] | ||||||
|  |         else: | ||||||
|  |             table[i] = idx + 1 | ||||||
|  |     return table | ||||||
|  | 
 | ||||||
def _get_iscased(flags):
    """Pick the "is this character cased?" predicate for *flags*.

    Returns None when SRE_FLAG_IGNORECASE is not set; otherwise the
    Unicode predicate if SRE_FLAG_UNICODE is set, else the ASCII one.
    """
    if flags & SRE_FLAG_IGNORECASE:
        if flags & SRE_FLAG_UNICODE:
            return _sre.unicode_iscased
        return _sre.ascii_iscased
    return None
|  | 
 | ||||||
def _get_literal_prefix(pattern, flags):
    """Collect the literal characters the pattern must start with.

    Returns a tuple (prefix, prefix_skip, got_all):
      prefix      -- list of literal character codes
      prefix_skip -- prefix length before the first capturing group that
                     was entered, or None if no such group was seen
      got_all     -- True when the whole pattern was consumed as literals
    """
    literals = []
    skip = None
    iscased = _get_iscased(flags)
    for op, av in pattern.data:
        if op is LITERAL:
            # a cased character under IGNORECASE cannot be matched verbatim
            if iscased and iscased(av):
                return literals, skip, False
            literals.append(av)
            continue
        if op is not SUBPATTERN:
            return literals, skip, False
        group, add_flags, del_flags, sub = av
        sub_flags = _combine_flags(flags, add_flags, del_flags)
        if sub_flags & SRE_FLAG_IGNORECASE and sub_flags & SRE_FLAG_LOCALE:
            return literals, skip, False
        sub_literals, sub_skip, complete = _get_literal_prefix(sub, sub_flags)
        if skip is None:
            if group is not None:
                skip = len(literals)
            elif sub_skip is not None:
                skip = len(literals) + sub_skip
        literals.extend(sub_literals)
        if not complete:
            return literals, skip, False
    return literals, skip, True
|  | 
 | ||||||
def _get_charset_prefix(pattern, flags):
    """Return a charset describing the possible first characters.

    The result is a list of (op, av) charset items, or None when no usable
    charset can be derived (empty pattern, IGNORECASE combined with
    LOCALE, cased characters under IGNORECASE, or an unsupported leading
    operator).
    """
    # descend through leading groups, accumulating their inline flags
    while True:
        items = pattern.data
        if not items:
            return None
        op, av = items[0]
        if op is not SUBPATTERN:
            break
        _, add_flags, del_flags, pattern = av
        flags = _combine_flags(flags, add_flags, del_flags)
        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
            return None

    iscased = _get_iscased(flags)

    if op is LITERAL:
        if iscased and iscased(av):
            return None
        return [(op, av)]

    if op is BRANCH:
        # every alternative has to begin with an acceptable literal
        charset = []
        for alternative in av[1]:
            if not alternative:
                return None
            first_op, first_av = alternative[0]
            if first_op is not LITERAL or (iscased and iscased(first_av)):
                return None
            charset.append((first_op, first_av))
        return charset

    if op is IN:
        if iscased:
            for item_op, item_av in av:
                if item_op is LITERAL:
                    if iscased(item_av):
                        return None
                elif item_op is RANGE:
                    if item_av[1] > 0xffff:
                        return None
                    if any(map(iscased, range(item_av[0], item_av[1]+1))):
                        return None
        return av

    return None
|  | 
 | ||||||
def _compile_info(code, pattern, flags):
    """Append an INFO block for *pattern* to the *code* list.

    The block holds the minimum and maximum pattern width (each capped at
    MAXCODE) and, when available, either a literal prefix with its
    overlap table or a charset for the first character.
    """
    lo, hi = pattern.getwidth()
    if hi > MAXCODE:
        hi = MAXCODE
    if lo == 0:
        # a pattern that can match the empty string gets a bare block
        code.extend([INFO, 4, 0, lo, hi])
        return

    prefix = []
    prefix_skip = 0
    charset = []  # not used
    if not flags & SRE_FLAG_IGNORECASE or not flags & SRE_FLAG_LOCALE:
        # try a literal prefix first, then fall back to a charset prefix
        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
        if not prefix:
            charset = _get_charset_prefix(pattern, flags)

    # block header; the skip word is patched once the block is complete
    code.append(INFO)
    skip_at = len(code)
    code.append(0)

    # literal flag
    if prefix:
        mask = SRE_INFO_PREFIX
        if prefix_skip is None and got_all:
            mask = mask | SRE_INFO_LITERAL
    elif charset:
        mask = 0 | SRE_INFO_CHARSET
    else:
        mask = 0
    code.append(mask)

    # pattern length
    if lo < MAXCODE:
        code.append(lo)
    else:
        code.append(MAXCODE)
        prefix = prefix[:MAXCODE]
    code.append(min(hi, MAXCODE))

    if prefix:
        # literal prefix: length, skip, the characters, the overlap table
        code.append(len(prefix))
        if prefix_skip is None:
            prefix_skip = len(prefix)
        code.append(prefix_skip)
        code.extend(prefix)
        code.extend(_generate_overlap_table(prefix))
    elif charset:
        charset, hascased = _optimize_charset(charset)
        assert not hascased
        _compile_charset(charset, flags, code)

    code[skip_at] = len(code) - skip_at
|  | 
 | ||||||
def isstring(obj):
    """Return True if *obj* is a str or a bytes instance."""
    return isinstance(obj, str) or isinstance(obj, bytes)
|  | 
 | ||||||
def _code(p, flags):
    """Compile parsed pattern *p* into a flat list of code words.

    The result is the INFO block, followed by the compiled pattern body,
    followed by a final SUCCESS opcode.
    """
    # flags recorded on the pattern state are combined with the caller's
    merged = p.state.flags | flags
    program = []
    _compile_info(program, p, merged)
    _compile(program, p.data, merged)
    program.append(SUCCESS)
    return program
|  | 
 | ||||||
|  | def _hex_code(code): | ||||||
|  |     return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code) | ||||||
|  | 
 | ||||||
def dis(code):
    """Print a human-readable listing of the compiled code list *code*.

    One line is printed per instruction: its offset, a marker (':' if an
    earlier printed instruction referenced this offset as a target, '.'
    otherwise), indentation reflecting nesting, the opcode and its
    arguments.  Nothing is returned.  Raises ValueError for an opcode
    that none of the branches below handles.
    """
    import sys

    # offsets that some already-printed instruction points to
    labels = set()
    # current nesting depth; drives indentation
    level = 0
    # column width needed to print the largest offset
    offset_width = len(str(len(code) - 1))

    def dis_(start, end):
        # Disassemble code[start:end]; recurses for nested bodies.
        def print_(*args, to=None):
            # Print one instruction line for the instruction at `start`
            # (the enclosing variable, updated by the loop below).
            if to is not None:
                labels.add(to)
                args += ('(to %d)' % (to,),)
            print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
                  end='  '*(level-1))
            print(*args)

        def print_2(*args):
            # Print a continuation line with no offset column.
            print(end=' '*(offset_width + 2*level))
            print(*args)

        nonlocal level
        level += 1
        i = start
        while i < end:
            # `start` now tracks the offset of the current instruction
            start = i
            op = code[i]
            i += 1
            # map the numeric code word to its named opcode constant
            op = OPCODES[op]
            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
                      MAX_UNTIL, MIN_UNTIL, NEGATE):
                # opcodes without arguments
                print_(op)
            elif op in (LITERAL, NOT_LITERAL,
                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
                        LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
                # one argument: a character code
                arg = code[i]
                i += 1
                print_(op, '%#02x (%r)' % (arg, chr(arg)))
            elif op is AT:
                # one argument: an index into ATCODES; shown without 'AT_'
                arg = code[i]
                i += 1
                arg = str(ATCODES[arg])
                assert arg[:3] == 'AT_'
                print_(op, arg[3:])
            elif op is CATEGORY:
                # one argument: an index into CHCODES; shown without 'CATEGORY_'
                arg = code[i]
                i += 1
                arg = str(CHCODES[arg])
                assert arg[:9] == 'CATEGORY_'
                print_(op, arg[9:])
            elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
                # skip word followed by a nested charset body
                skip = code[i]
                print_(op, skip, to=i+skip)
                dis_(i+1, i+skip)
                i += skip
            elif op in (RANGE, RANGE_UNI_IGNORE):
                # two arguments: lower and upper character codes
                lo, hi = code[i: i+2]
                i += 2
                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
            elif op is CHARSET:
                # a 256-bit bitmap stored in 256//_CODEBITS code words
                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
                i += 256//_CODEBITS
            elif op is BIGCHARSET:
                # chunk count, then a 256-byte chunk-index mapping packed
                # into code words, then `arg` bitmaps of 256 bits each
                arg = code[i]
                i += 1
                mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
                                        for x in code[i: i + 256//_sre.CODESIZE]))
                print_(op, arg, mapping)
                i += 256//_sre.CODESIZE
                level += 1
                for j in range(arg):
                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
                    i += 256//_CODEBITS
                level -= 1
            elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
                        GROUPREF_LOC_IGNORE):
                # one plain integer argument
                arg = code[i]
                i += 1
                print_(op, arg)
            elif op is JUMP:
                # one argument: relative offset of the target
                skip = code[i]
                print_(op, skip, to=i+skip)
                i += 1
            elif op is BRANCH:
                # a chain of alternatives, each preceded by a skip word;
                # a zero skip word ends the chain
                skip = code[i]
                print_(op, skip, to=i+skip)
                while skip:
                    dis_(i+1, i+skip)
                    i += skip
                    start = i
                    skip = code[i]
                    if skip:
                        print_('branch', skip, to=i+skip)
                    else:
                        print_(FAILURE)
                i += 1
            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE,
                        POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE):
                # skip, minimum and maximum counts, then the repeated body
                skip, min, max = code[i: i+3]
                if max == MAXREPEAT:
                    max = 'MAXREPEAT'
                print_(op, skip, min, max, to=i+skip)
                dis_(i+3, i+skip)
                i += skip
            elif op is GROUPREF_EXISTS:
                # group number and relative offset; no nested listing here
                arg, skip = code[i: i+2]
                print_(op, arg, skip, to=i+skip)
                i += 2
            elif op in (ASSERT, ASSERT_NOT):
                # skip word, one extra argument, then the asserted body
                skip, arg = code[i: i+2]
                print_(op, skip, arg, to=i+skip)
                dis_(i+2, i+skip)
                i += skip
            elif op is ATOMIC_GROUP:
                # skip word followed by the group body
                skip = code[i]
                print_(op, skip, to=i+skip)
                dis_(i+1, i+skip)
                i += skip
            elif op is INFO:
                # skip, flag mask, min and max width, then optional payload
                skip, flags, min, max = code[i: i+4]
                if max == MAXREPEAT:
                    max = 'MAXREPEAT'
                print_(op, skip, bin(flags), min, max, to=i+skip)
                # from here on `start` is reused as a cursor into the payload
                start = i+4
                if flags & SRE_INFO_PREFIX:
                    # prefix length, prefix skip, prefix chars, overlap table
                    prefix_len, prefix_skip = code[i+4: i+6]
                    print_2('  prefix_skip', prefix_skip)
                    start = i + 6
                    prefix = code[start: start+prefix_len]
                    print_2('  prefix',
                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
                            '(%r)' % ''.join(map(chr, prefix)))
                    start += prefix_len
                    print_2('  overlap', code[start: start+prefix_len])
                    start += prefix_len
                if flags & SRE_INFO_CHARSET:
                    # the rest of the block is a charset body
                    level += 1
                    print_2('in')
                    dis_(start, i+skip)
                    level -= 1
                i += skip
            else:
                raise ValueError(op)

        level -= 1

    dis_(0, len(code))
|  | 
 | ||||||
|  | 
 | ||||||
def compile(p, flags=0):
    """Internal: turn a pattern into a compiled pattern object.

    *p* may be a str/bytes pattern, which is parsed first, or an already
    parsed pattern.  When SRE_FLAG_DEBUG is set in *flags* the generated
    code is also printed with dis().
    """
    source = None
    if isstring(p):
        source = p
        p = _parser.parse(source, flags)

    code = _code(p, flags)

    if flags & SRE_FLAG_DEBUG:
        print()
        dis(code)

    # build the index -> name table from the name -> index mapping
    state = p.state
    name_to_index = state.groupdict
    index_to_name = [None] * state.groups
    for name, index in name_to_index.items():
        index_to_name[index] = name

    return _sre.compile(
        source, flags | state.flags, code,
        state.groups-1,
        name_to_index, tuple(index_to_name)
        )
							
								
								
									
										262
									
								
								Lib/re/_constants.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										262
									
								
								Lib/re/_constants.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,262 @@ | ||||||
|  | # | ||||||
|  | # Secret Labs' Regular Expression Engine | ||||||
|  | # | ||||||
|  | # various symbols used by the regular expression engine. | ||||||
|  | # run this script to update the _sre include files! | ||||||
|  | # | ||||||
|  | # Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved. | ||||||
|  | # | ||||||
|  | # See the __init__.py file for information on usage and redistribution. | ||||||
|  | # | ||||||
|  | 
 | ||||||
|  | """Internal support module for sre""" | ||||||
|  | 
 | ||||||
|  | # update when constants are added or removed | ||||||
|  | 
 | ||||||
|  | MAGIC = 20220318 | ||||||
|  | 
 | ||||||
|  | from _sre import MAXREPEAT, MAXGROUPS | ||||||
|  | 
 | ||||||
|  | # SRE standard exception (access as sre.error) | ||||||
|  | # should this really be here? | ||||||
|  | 
 | ||||||
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The index in the pattern where compilation failed (may be None)
        lineno: The line corresponding to pos (may be None)
        colno: The column corresponding to pos (may be None)
    """

    __module__ = 're'

    def __init__(self, msg, pattern=None, pos=None):
        self.msg = msg
        self.pattern = pattern
        self.pos = pos
        if pattern is None or pos is None:
            # no location available: report the bare message
            self.lineno = self.colno = None
            super().__init__(msg)
            return
        text = '%s at position %d' % (msg, pos)
        # use a newline of the same type as the pattern (str or bytes)
        newline = '\n' if isinstance(pattern, str) else b'\n'
        self.lineno = pattern.count(newline, 0, pos) + 1
        self.colno = pos - pattern.rfind(newline, 0, pos)
        # line/column are only shown for multi-line patterns
        if newline in pattern:
            text = '%s (line %d, column %d)' % (text, self.lineno, self.colno)
        super().__init__(text)
|  | 
 | ||||||
|  | 
 | ||||||
|  | class _NamedIntConstant(int): | ||||||
|  |     def __new__(cls, value, name): | ||||||
|  |         self = super(_NamedIntConstant, cls).__new__(cls, value) | ||||||
|  |         self.name = name | ||||||
|  |         return self | ||||||
|  | 
 | ||||||
|  |     def __repr__(self): | ||||||
|  |         return self.name | ||||||
|  | 
 | ||||||
|  | MAXREPEAT = _NamedIntConstant(MAXREPEAT, 'MAXREPEAT') | ||||||
|  | 
 | ||||||
def _makecodes(names):
    """Create numbered named constants from a whitespace-separated string.

    Each name gets its position in the string as value.  Every constant is
    also installed as a module-level global under its own name.  Returns
    the list of constants in order.
    """
    items = []
    for value, name in enumerate(names.strip().split()):
        items.append(_NamedIntConstant(value, name))
    namespace = globals()
    for item in items:
        namespace[item.name] = item
    return items
|  | 
 | ||||||
# operators
# failure=0 success=1 (just because it looks better that way :-)
# Each name becomes a module-level named constant whose value is its
# position in this list (see _makecodes), so the order is significant.
OPCODES = _makecodes("""
    FAILURE SUCCESS

    ANY ANY_ALL
    ASSERT ASSERT_NOT
    AT
    BRANCH
    CALL
    CATEGORY
    CHARSET BIGCHARSET
    GROUPREF GROUPREF_EXISTS
    IN
    INFO
    JUMP
    LITERAL
    MARK
    MAX_UNTIL
    MIN_UNTIL
    NOT_LITERAL
    NEGATE
    RANGE
    REPEAT
    REPEAT_ONE
    SUBPATTERN
    MIN_REPEAT_ONE
    ATOMIC_GROUP
    POSSESSIVE_REPEAT
    POSSESSIVE_REPEAT_ONE

    GROUPREF_IGNORE
    IN_IGNORE
    LITERAL_IGNORE
    NOT_LITERAL_IGNORE

    GROUPREF_LOC_IGNORE
    IN_LOC_IGNORE
    LITERAL_LOC_IGNORE
    NOT_LITERAL_LOC_IGNORE

    GROUPREF_UNI_IGNORE
    IN_UNI_IGNORE
    LITERAL_UNI_IGNORE
    NOT_LITERAL_UNI_IGNORE
    RANGE_UNI_IGNORE

    MIN_REPEAT MAX_REPEAT
""")
# The two names stay available as module globals (set by _makecodes);
# they are only dropped from the OPCODES list itself.
del OPCODES[-2:] # remove MIN_REPEAT and MAX_REPEAT
|  | 
 | ||||||
# positions
# Numbered in order of appearance; the LOC and UNI entries are the
# targets of the AT_LOCALE and AT_UNICODE mappings defined in this module.
ATCODES = _makecodes("""
    AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING
    AT_BOUNDARY AT_NON_BOUNDARY
    AT_END AT_END_LINE AT_END_STRING

    AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY

    AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY
""")
|  | 
 | ||||||
# categories
# Numbered in order of appearance; the LOC and UNI entries are the
# targets of the CH_LOCALE and CH_UNICODE mappings defined in this module.
CHCODES = _makecodes("""
    CATEGORY_DIGIT CATEGORY_NOT_DIGIT
    CATEGORY_SPACE CATEGORY_NOT_SPACE
    CATEGORY_WORD CATEGORY_NOT_WORD
    CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK

    CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD

    CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT
    CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE
    CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD
    CATEGORY_UNI_LINEBREAK CATEGORY_UNI_NOT_LINEBREAK
""")
|  | 
 | ||||||
|  | 
 | ||||||
# replacement operations for "ignore case" mode
OP_IGNORE = {
    LITERAL: LITERAL_IGNORE,
    NOT_LITERAL: NOT_LITERAL_IGNORE,
}

# same, for the locale-dependent ignore-case opcodes
OP_LOCALE_IGNORE = {
    LITERAL: LITERAL_LOC_IGNORE,
    NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
}

# same, for the Unicode ignore-case opcodes
OP_UNICODE_IGNORE = {
    LITERAL: LITERAL_UNI_IGNORE,
    NOT_LITERAL: NOT_LITERAL_UNI_IGNORE,
}

# beginning/end position codes mapped to their per-line counterparts
AT_MULTILINE = {
    AT_BEGINNING: AT_BEGINNING_LINE,
    AT_END: AT_END_LINE
}

# boundary position codes mapped to their locale counterparts
AT_LOCALE = {
    AT_BOUNDARY: AT_LOC_BOUNDARY,
    AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY
}

# boundary position codes mapped to their Unicode counterparts
AT_UNICODE = {
    AT_BOUNDARY: AT_UNI_BOUNDARY,
    AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY
}

# category codes for locale mode: only the WORD categories have a
# locale-specific variant, every other category maps to itself
CH_LOCALE = {
    CATEGORY_DIGIT: CATEGORY_DIGIT,
    CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,
    CATEGORY_SPACE: CATEGORY_SPACE,
    CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE,
    CATEGORY_WORD: CATEGORY_LOC_WORD,
    CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD,
    CATEGORY_LINEBREAK: CATEGORY_LINEBREAK,
    CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK
}

# category codes for Unicode mode: every category has a UNI variant
CH_UNICODE = {
    CATEGORY_DIGIT: CATEGORY_UNI_DIGIT,
    CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT,
    CATEGORY_SPACE: CATEGORY_UNI_SPACE,
    CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE,
    CATEGORY_WORD: CATEGORY_UNI_WORD,
    CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD,
    CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK,
    CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
}
|  | 
 | ||||||
# flags
# Each flag is a distinct power of two, so flags can be combined with |
# and tested with &.
SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
SRE_FLAG_IGNORECASE = 2 # case insensitive
SRE_FLAG_LOCALE = 4 # honour system locale
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
SRE_FLAG_DOTALL = 16 # treat target as a single string
SRE_FLAG_UNICODE = 32 # use unicode "locale"
SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
SRE_FLAG_DEBUG = 128 # debugging
SRE_FLAG_ASCII = 256 # use ascii "locale"

# flags for INFO primitive
# (bit mask stored in the INFO block emitted by the compiler)
SRE_INFO_PREFIX = 1 # has prefix
SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix)
SRE_INFO_CHARSET = 4 # pattern starts with character from given set
|  | 
 | ||||||
# When run as a script, regenerate the C header with the constants above.
# The header is written to "sre_constants.h" in the current directory.
if __name__ == "__main__":
    def dump(f, d, prefix):
        # Write one "#define <prefix>_<item> <value>" line per constant
        # in d, in ascending numeric order.  The first %s is expected to
        # render the constant's name rather than its number -- presumably
        # via _NamedIntConstant.__repr__; verify.
        items = sorted(d)
        for item in items:
            f.write("#define %s_%s %d\n" % (prefix, item, item))
    with open("sre_constants.h", "w") as f:
        # fixed file banner
        f.write("""\
/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * NOTE: This file is generated by Lib/re/_constants.py.  If you need
 * to change anything in here, edit Lib/re/_constants.py and run it.
 *
 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
 *
 * See the _sre.c file for information on usage and redistribution.
 */

""")

        f.write("#define SRE_MAGIC %d\n" % MAGIC)

        # opcode, position and category tables
        dump(f, OPCODES, "SRE_OP")
        dump(f, ATCODES, "SRE")
        dump(f, CHCODES, "SRE")

        # pattern flags
        f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE)
        f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE)
        f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE)
        f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE)
        f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL)
        f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE)
        f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE)
        f.write("#define SRE_FLAG_DEBUG %d\n" % SRE_FLAG_DEBUG)
        f.write("#define SRE_FLAG_ASCII %d\n" % SRE_FLAG_ASCII)

        # INFO block flags
        f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX)
        f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL)
        f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET)

    print("done")
							
								
								
									
										1079
									
								
								Lib/re/_parser.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1079
									
								
								Lib/re/_parser.py
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							|  | @ -1,800 +1,7 @@ | ||||||
| # | import warnings | ||||||
| # Secret Labs' Regular Expression Engine | warnings.warn(f"module {__name__!r} is deprecated", | ||||||
| # |               DeprecationWarning, | ||||||
| # convert template to internal format |               stacklevel=2) | ||||||
| # |  | ||||||
| # Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved. |  | ||||||
| # |  | ||||||
| # See the sre.py file for information on usage and redistribution. |  | ||||||
| # |  | ||||||
| 
 | 
 | ||||||
| """Internal support module for sre""" | from re import _compiler as _ | ||||||
| 
 | globals().update({k: v for k, v in vars(_).items() if k[:2] != '__'}) | ||||||
| import _sre |  | ||||||
| import sre_parse |  | ||||||
| from sre_constants import * |  | ||||||
| 
 |  | ||||||
| assert _sre.MAGIC == MAGIC, "SRE module mismatch" |  | ||||||
| 
 |  | ||||||
| _LITERAL_CODES = {LITERAL, NOT_LITERAL} |  | ||||||
| _SUCCESS_CODES = {SUCCESS, FAILURE} |  | ||||||
| _ASSERT_CODES = {ASSERT, ASSERT_NOT} |  | ||||||
| _UNIT_CODES = _LITERAL_CODES | {ANY, IN} |  | ||||||
| 
 |  | ||||||
| _REPEATING_CODES = { |  | ||||||
|     MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE), |  | ||||||
|     MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE), |  | ||||||
|     POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| # Sets of lowercase characters which have the same uppercase. |  | ||||||
| _equivalences = ( |  | ||||||
|     # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I |  | ||||||
|     (0x69, 0x131), # iı |  | ||||||
|     # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S |  | ||||||
|     (0x73, 0x17f), # sſ |  | ||||||
|     # MICRO SIGN, GREEK SMALL LETTER MU |  | ||||||
|     (0xb5, 0x3bc), # µμ |  | ||||||
|     # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI |  | ||||||
|     (0x345, 0x3b9, 0x1fbe), # \u0345ιι |  | ||||||
|     # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA |  | ||||||
|     (0x390, 0x1fd3), # ΐΐ |  | ||||||
|     # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA |  | ||||||
|     (0x3b0, 0x1fe3), # ΰΰ |  | ||||||
|     # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL |  | ||||||
|     (0x3b2, 0x3d0), # βϐ |  | ||||||
|     # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL |  | ||||||
|     (0x3b5, 0x3f5), # εϵ |  | ||||||
|     # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL |  | ||||||
|     (0x3b8, 0x3d1), # θϑ |  | ||||||
|     # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL |  | ||||||
|     (0x3ba, 0x3f0), # κϰ |  | ||||||
|     # GREEK SMALL LETTER PI, GREEK PI SYMBOL |  | ||||||
|     (0x3c0, 0x3d6), # πϖ |  | ||||||
|     # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL |  | ||||||
|     (0x3c1, 0x3f1), # ρϱ |  | ||||||
|     # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA |  | ||||||
|     (0x3c2, 0x3c3), # ςσ |  | ||||||
|     # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL |  | ||||||
|     (0x3c6, 0x3d5), # φϕ |  | ||||||
|     # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE |  | ||||||
|     (0x1e61, 0x1e9b), # ṡẛ |  | ||||||
|     # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST |  | ||||||
|     (0xfb05, 0xfb06), # ſtst |  | ||||||
| ) |  | ||||||
| 
 |  | ||||||
| # Maps the lowercase code to lowercase codes which have the same uppercase. |  | ||||||
| _ignorecase_fixes = {i: tuple(j for j in t if i != j) |  | ||||||
|                      for t in _equivalences for i in t} |  | ||||||
| 
 |  | ||||||
| def _combine_flags(flags, add_flags, del_flags, |  | ||||||
|                    TYPE_FLAGS=sre_parse.TYPE_FLAGS): |  | ||||||
|     if add_flags & TYPE_FLAGS: |  | ||||||
|         flags &= ~TYPE_FLAGS |  | ||||||
|     return (flags | add_flags) & ~del_flags |  | ||||||
| 
 |  | ||||||
| def _compile(code, pattern, flags): |  | ||||||
|     # internal: compile a (sub)pattern |  | ||||||
|     emit = code.append |  | ||||||
|     _len = len |  | ||||||
|     LITERAL_CODES = _LITERAL_CODES |  | ||||||
|     REPEATING_CODES = _REPEATING_CODES |  | ||||||
|     SUCCESS_CODES = _SUCCESS_CODES |  | ||||||
|     ASSERT_CODES = _ASSERT_CODES |  | ||||||
|     iscased = None |  | ||||||
|     tolower = None |  | ||||||
|     fixes = None |  | ||||||
|     if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE: |  | ||||||
|         if flags & SRE_FLAG_UNICODE: |  | ||||||
|             iscased = _sre.unicode_iscased |  | ||||||
|             tolower = _sre.unicode_tolower |  | ||||||
|             fixes = _ignorecase_fixes |  | ||||||
|         else: |  | ||||||
|             iscased = _sre.ascii_iscased |  | ||||||
|             tolower = _sre.ascii_tolower |  | ||||||
|     for op, av in pattern: |  | ||||||
|         if op in LITERAL_CODES: |  | ||||||
|             if not flags & SRE_FLAG_IGNORECASE: |  | ||||||
|                 emit(op) |  | ||||||
|                 emit(av) |  | ||||||
|             elif flags & SRE_FLAG_LOCALE: |  | ||||||
|                 emit(OP_LOCALE_IGNORE[op]) |  | ||||||
|                 emit(av) |  | ||||||
|             elif not iscased(av): |  | ||||||
|                 emit(op) |  | ||||||
|                 emit(av) |  | ||||||
|             else: |  | ||||||
|                 lo = tolower(av) |  | ||||||
|                 if not fixes:  # ascii |  | ||||||
|                     emit(OP_IGNORE[op]) |  | ||||||
|                     emit(lo) |  | ||||||
|                 elif lo not in fixes: |  | ||||||
|                     emit(OP_UNICODE_IGNORE[op]) |  | ||||||
|                     emit(lo) |  | ||||||
|                 else: |  | ||||||
|                     emit(IN_UNI_IGNORE) |  | ||||||
|                     skip = _len(code); emit(0) |  | ||||||
|                     if op is NOT_LITERAL: |  | ||||||
|                         emit(NEGATE) |  | ||||||
|                     for k in (lo,) + fixes[lo]: |  | ||||||
|                         emit(LITERAL) |  | ||||||
|                         emit(k) |  | ||||||
|                     emit(FAILURE) |  | ||||||
|                     code[skip] = _len(code) - skip |  | ||||||
|         elif op is IN: |  | ||||||
|             charset, hascased = _optimize_charset(av, iscased, tolower, fixes) |  | ||||||
|             if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: |  | ||||||
|                 emit(IN_LOC_IGNORE) |  | ||||||
|             elif not hascased: |  | ||||||
|                 emit(IN) |  | ||||||
|             elif not fixes:  # ascii |  | ||||||
|                 emit(IN_IGNORE) |  | ||||||
|             else: |  | ||||||
|                 emit(IN_UNI_IGNORE) |  | ||||||
|             skip = _len(code); emit(0) |  | ||||||
|             _compile_charset(charset, flags, code) |  | ||||||
|             code[skip] = _len(code) - skip |  | ||||||
|         elif op is ANY: |  | ||||||
|             if flags & SRE_FLAG_DOTALL: |  | ||||||
|                 emit(ANY_ALL) |  | ||||||
|             else: |  | ||||||
|                 emit(ANY) |  | ||||||
|         elif op in REPEATING_CODES: |  | ||||||
|             if flags & SRE_FLAG_TEMPLATE: |  | ||||||
|                 raise error("internal: unsupported template operator %r" % (op,)) |  | ||||||
|             if _simple(av[2]): |  | ||||||
|                 emit(REPEATING_CODES[op][2]) |  | ||||||
|                 skip = _len(code); emit(0) |  | ||||||
|                 emit(av[0]) |  | ||||||
|                 emit(av[1]) |  | ||||||
|                 _compile(code, av[2], flags) |  | ||||||
|                 emit(SUCCESS) |  | ||||||
|                 code[skip] = _len(code) - skip |  | ||||||
|             else: |  | ||||||
|                 emit(REPEATING_CODES[op][0]) |  | ||||||
|                 skip = _len(code); emit(0) |  | ||||||
|                 emit(av[0]) |  | ||||||
|                 emit(av[1]) |  | ||||||
|                 _compile(code, av[2], flags) |  | ||||||
|                 code[skip] = _len(code) - skip |  | ||||||
|                 emit(REPEATING_CODES[op][1]) |  | ||||||
|         elif op is SUBPATTERN: |  | ||||||
|             group, add_flags, del_flags, p = av |  | ||||||
|             if group: |  | ||||||
|                 emit(MARK) |  | ||||||
|                 emit((group-1)*2) |  | ||||||
|             # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags)) |  | ||||||
|             _compile(code, p, _combine_flags(flags, add_flags, del_flags)) |  | ||||||
|             if group: |  | ||||||
|                 emit(MARK) |  | ||||||
|                 emit((group-1)*2+1) |  | ||||||
|         elif op is ATOMIC_GROUP: |  | ||||||
|             # Atomic Groups are handled by starting with an Atomic |  | ||||||
|             # Group op code, then putting in the atomic group pattern |  | ||||||
|             # and finally a success op code to tell any repeat |  | ||||||
|             # operations within the Atomic Group to stop eating and |  | ||||||
|             # pop their stack if they reach it |  | ||||||
|             emit(ATOMIC_GROUP) |  | ||||||
|             skip = _len(code); emit(0) |  | ||||||
|             _compile(code, av, flags) |  | ||||||
|             emit(SUCCESS) |  | ||||||
|             code[skip] = _len(code) - skip |  | ||||||
|         elif op in SUCCESS_CODES: |  | ||||||
|             emit(op) |  | ||||||
|         elif op in ASSERT_CODES: |  | ||||||
|             emit(op) |  | ||||||
|             skip = _len(code); emit(0) |  | ||||||
|             if av[0] >= 0: |  | ||||||
|                 emit(0) # look ahead |  | ||||||
|             else: |  | ||||||
|                 lo, hi = av[1].getwidth() |  | ||||||
|                 if lo != hi: |  | ||||||
|                     raise error("look-behind requires fixed-width pattern") |  | ||||||
|                 emit(lo) # look behind |  | ||||||
|             _compile(code, av[1], flags) |  | ||||||
|             emit(SUCCESS) |  | ||||||
|             code[skip] = _len(code) - skip |  | ||||||
|         elif op is CALL: |  | ||||||
|             emit(op) |  | ||||||
|             skip = _len(code); emit(0) |  | ||||||
|             _compile(code, av, flags) |  | ||||||
|             emit(SUCCESS) |  | ||||||
|             code[skip] = _len(code) - skip |  | ||||||
|         elif op is AT: |  | ||||||
|             emit(op) |  | ||||||
|             if flags & SRE_FLAG_MULTILINE: |  | ||||||
|                 av = AT_MULTILINE.get(av, av) |  | ||||||
|             if flags & SRE_FLAG_LOCALE: |  | ||||||
|                 av = AT_LOCALE.get(av, av) |  | ||||||
|             elif flags & SRE_FLAG_UNICODE: |  | ||||||
|                 av = AT_UNICODE.get(av, av) |  | ||||||
|             emit(av) |  | ||||||
|         elif op is BRANCH: |  | ||||||
|             emit(op) |  | ||||||
|             tail = [] |  | ||||||
|             tailappend = tail.append |  | ||||||
|             for av in av[1]: |  | ||||||
|                 skip = _len(code); emit(0) |  | ||||||
|                 # _compile_info(code, av, flags) |  | ||||||
|                 _compile(code, av, flags) |  | ||||||
|                 emit(JUMP) |  | ||||||
|                 tailappend(_len(code)); emit(0) |  | ||||||
|                 code[skip] = _len(code) - skip |  | ||||||
|             emit(FAILURE) # end of branch |  | ||||||
|             for tail in tail: |  | ||||||
|                 code[tail] = _len(code) - tail |  | ||||||
|         elif op is CATEGORY: |  | ||||||
|             emit(op) |  | ||||||
|             if flags & SRE_FLAG_LOCALE: |  | ||||||
|                 av = CH_LOCALE[av] |  | ||||||
|             elif flags & SRE_FLAG_UNICODE: |  | ||||||
|                 av = CH_UNICODE[av] |  | ||||||
|             emit(av) |  | ||||||
|         elif op is GROUPREF: |  | ||||||
|             if not flags & SRE_FLAG_IGNORECASE: |  | ||||||
|                 emit(op) |  | ||||||
|             elif flags & SRE_FLAG_LOCALE: |  | ||||||
|                 emit(GROUPREF_LOC_IGNORE) |  | ||||||
|             elif not fixes:  # ascii |  | ||||||
|                 emit(GROUPREF_IGNORE) |  | ||||||
|             else: |  | ||||||
|                 emit(GROUPREF_UNI_IGNORE) |  | ||||||
|             emit(av-1) |  | ||||||
|         elif op is GROUPREF_EXISTS: |  | ||||||
|             emit(op) |  | ||||||
|             emit(av[0]-1) |  | ||||||
|             skipyes = _len(code); emit(0) |  | ||||||
|             _compile(code, av[1], flags) |  | ||||||
|             if av[2]: |  | ||||||
|                 emit(JUMP) |  | ||||||
|                 skipno = _len(code); emit(0) |  | ||||||
|                 code[skipyes] = _len(code) - skipyes + 1 |  | ||||||
|                 _compile(code, av[2], flags) |  | ||||||
|                 code[skipno] = _len(code) - skipno |  | ||||||
|             else: |  | ||||||
|                 code[skipyes] = _len(code) - skipyes + 1 |  | ||||||
|         else: |  | ||||||
|             raise error("internal: unsupported operand type %r" % (op,)) |  | ||||||
| 
 |  | ||||||
| def _compile_charset(charset, flags, code): |  | ||||||
|     # compile charset subprogram |  | ||||||
|     emit = code.append |  | ||||||
|     for op, av in charset: |  | ||||||
|         emit(op) |  | ||||||
|         if op is NEGATE: |  | ||||||
|             pass |  | ||||||
|         elif op is LITERAL: |  | ||||||
|             emit(av) |  | ||||||
|         elif op is RANGE or op is RANGE_UNI_IGNORE: |  | ||||||
|             emit(av[0]) |  | ||||||
|             emit(av[1]) |  | ||||||
|         elif op is CHARSET: |  | ||||||
|             code.extend(av) |  | ||||||
|         elif op is BIGCHARSET: |  | ||||||
|             code.extend(av) |  | ||||||
|         elif op is CATEGORY: |  | ||||||
|             if flags & SRE_FLAG_LOCALE: |  | ||||||
|                 emit(CH_LOCALE[av]) |  | ||||||
|             elif flags & SRE_FLAG_UNICODE: |  | ||||||
|                 emit(CH_UNICODE[av]) |  | ||||||
|             else: |  | ||||||
|                 emit(av) |  | ||||||
|         else: |  | ||||||
|             raise error("internal: unsupported set operator %r" % (op,)) |  | ||||||
|     emit(FAILURE) |  | ||||||
| 
 |  | ||||||
| def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): |  | ||||||
|     # internal: optimize character set |  | ||||||
|     out = [] |  | ||||||
|     tail = [] |  | ||||||
|     charmap = bytearray(256) |  | ||||||
|     hascased = False |  | ||||||
|     for op, av in charset: |  | ||||||
|         while True: |  | ||||||
|             try: |  | ||||||
|                 if op is LITERAL: |  | ||||||
|                     if fixup: |  | ||||||
|                         lo = fixup(av) |  | ||||||
|                         charmap[lo] = 1 |  | ||||||
|                         if fixes and lo in fixes: |  | ||||||
|                             for k in fixes[lo]: |  | ||||||
|                                 charmap[k] = 1 |  | ||||||
|                         if not hascased and iscased(av): |  | ||||||
|                             hascased = True |  | ||||||
|                     else: |  | ||||||
|                         charmap[av] = 1 |  | ||||||
|                 elif op is RANGE: |  | ||||||
|                     r = range(av[0], av[1]+1) |  | ||||||
|                     if fixup: |  | ||||||
|                         if fixes: |  | ||||||
|                             for i in map(fixup, r): |  | ||||||
|                                 charmap[i] = 1 |  | ||||||
|                                 if i in fixes: |  | ||||||
|                                     for k in fixes[i]: |  | ||||||
|                                         charmap[k] = 1 |  | ||||||
|                         else: |  | ||||||
|                             for i in map(fixup, r): |  | ||||||
|                                 charmap[i] = 1 |  | ||||||
|                         if not hascased: |  | ||||||
|                             hascased = any(map(iscased, r)) |  | ||||||
|                     else: |  | ||||||
|                         for i in r: |  | ||||||
|                             charmap[i] = 1 |  | ||||||
|                 elif op is NEGATE: |  | ||||||
|                     out.append((op, av)) |  | ||||||
|                 else: |  | ||||||
|                     tail.append((op, av)) |  | ||||||
|             except IndexError: |  | ||||||
|                 if len(charmap) == 256: |  | ||||||
|                     # character set contains non-UCS1 character codes |  | ||||||
|                     charmap += b'\0' * 0xff00 |  | ||||||
|                     continue |  | ||||||
|                 # Character set contains non-BMP character codes. |  | ||||||
|                 if fixup: |  | ||||||
|                     hascased = True |  | ||||||
|                     # There are only two ranges of cased non-BMP characters: |  | ||||||
|                     # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), |  | ||||||
|                     # and for both ranges RANGE_UNI_IGNORE works. |  | ||||||
|                     if op is RANGE: |  | ||||||
|                         op = RANGE_UNI_IGNORE |  | ||||||
|                 tail.append((op, av)) |  | ||||||
|             break |  | ||||||
| 
 |  | ||||||
|     # compress character map |  | ||||||
|     runs = [] |  | ||||||
|     q = 0 |  | ||||||
|     while True: |  | ||||||
|         p = charmap.find(1, q) |  | ||||||
|         if p < 0: |  | ||||||
|             break |  | ||||||
|         if len(runs) >= 2: |  | ||||||
|             runs = None |  | ||||||
|             break |  | ||||||
|         q = charmap.find(0, p) |  | ||||||
|         if q < 0: |  | ||||||
|             runs.append((p, len(charmap))) |  | ||||||
|             break |  | ||||||
|         runs.append((p, q)) |  | ||||||
|     if runs is not None: |  | ||||||
|         # use literal/range |  | ||||||
|         for p, q in runs: |  | ||||||
|             if q - p == 1: |  | ||||||
|                 out.append((LITERAL, p)) |  | ||||||
|             else: |  | ||||||
|                 out.append((RANGE, (p, q - 1))) |  | ||||||
|         out += tail |  | ||||||
|         # if the case was changed or new representation is more compact |  | ||||||
|         if hascased or len(out) < len(charset): |  | ||||||
|             return out, hascased |  | ||||||
|         # else original character set is good enough |  | ||||||
|         return charset, hascased |  | ||||||
| 
 |  | ||||||
|     # use bitmap |  | ||||||
|     if len(charmap) == 256: |  | ||||||
|         data = _mk_bitmap(charmap) |  | ||||||
|         out.append((CHARSET, data)) |  | ||||||
|         out += tail |  | ||||||
|         return out, hascased |  | ||||||
| 
 |  | ||||||
|     # To represent a big charset, first a bitmap of all characters in the |  | ||||||
|     # set is constructed. Then, this bitmap is sliced into chunks of 256 |  | ||||||
|     # characters, duplicate chunks are eliminated, and each chunk is |  | ||||||
|     # given a number. In the compiled expression, the charset is |  | ||||||
|     # represented by a 32-bit word sequence, consisting of one word for |  | ||||||
|     # the number of different chunks, a sequence of 256 bytes (64 words) |  | ||||||
|     # of chunk numbers indexed by their original chunk position, and a |  | ||||||
|     # sequence of 256-bit chunks (8 words each). |  | ||||||
| 
 |  | ||||||
|     # Compression is normally good: in a typical charset, large ranges of |  | ||||||
|     # Unicode will be either completely excluded (e.g. if only cyrillic |  | ||||||
|     # letters are to be matched), or completely included (e.g. if large |  | ||||||
|     # subranges of Kanji match). These ranges will be represented by |  | ||||||
|     # chunks of all one-bits or all zero-bits. |  | ||||||
| 
 |  | ||||||
|     # Matching can be also done efficiently: the more significant byte of |  | ||||||
|     # the Unicode character is an index into the chunk number, and the |  | ||||||
|     # less significant byte is a bit index in the chunk (just like the |  | ||||||
|     # CHARSET matching). |  | ||||||
| 
 |  | ||||||
|     charmap = bytes(charmap) # should be hashable |  | ||||||
|     comps = {} |  | ||||||
|     mapping = bytearray(256) |  | ||||||
|     block = 0 |  | ||||||
|     data = bytearray() |  | ||||||
|     for i in range(0, 65536, 256): |  | ||||||
|         chunk = charmap[i: i + 256] |  | ||||||
|         if chunk in comps: |  | ||||||
|             mapping[i // 256] = comps[chunk] |  | ||||||
|         else: |  | ||||||
|             mapping[i // 256] = comps[chunk] = block |  | ||||||
|             block += 1 |  | ||||||
|             data += chunk |  | ||||||
|     data = _mk_bitmap(data) |  | ||||||
|     data[0:0] = [block] + _bytes_to_codes(mapping) |  | ||||||
|     out.append((BIGCHARSET, data)) |  | ||||||
|     out += tail |  | ||||||
|     return out, hascased |  | ||||||
| 
 |  | ||||||
| _CODEBITS = _sre.CODESIZE * 8 |  | ||||||
| MAXCODE = (1 << _CODEBITS) - 1 |  | ||||||
| _BITS_TRANS = b'0' + b'1' * 255 |  | ||||||
| def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int): |  | ||||||
|     s = bits.translate(_BITS_TRANS)[::-1] |  | ||||||
|     return [_int(s[i - _CODEBITS: i], 2) |  | ||||||
|             for i in range(len(s), 0, -_CODEBITS)] |  | ||||||
| 
 |  | ||||||
| def _bytes_to_codes(b): |  | ||||||
|     # Convert block indices to word array |  | ||||||
|     a = memoryview(b).cast('I') |  | ||||||
|     assert a.itemsize == _sre.CODESIZE |  | ||||||
|     assert len(a) * a.itemsize == len(b) |  | ||||||
|     return a.tolist() |  | ||||||
| 
 |  | ||||||
| def _simple(p): |  | ||||||
|     # check if this subpattern is a "simple" operator |  | ||||||
|     if len(p) != 1: |  | ||||||
|         return False |  | ||||||
|     op, av = p[0] |  | ||||||
|     if op is SUBPATTERN: |  | ||||||
|         return av[0] is None and _simple(av[-1]) |  | ||||||
|     return op in _UNIT_CODES |  | ||||||
| 
 |  | ||||||
| def _generate_overlap_table(prefix): |  | ||||||
|     """ |  | ||||||
|     Generate an overlap table for the following prefix. |  | ||||||
|     An overlap table is a table of the same size as the prefix which |  | ||||||
|     informs about the potential self-overlap for each index in the prefix: |  | ||||||
|     - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...] |  | ||||||
|     - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with |  | ||||||
|       prefix[0:k] |  | ||||||
|     """ |  | ||||||
|     table = [0] * len(prefix) |  | ||||||
|     for i in range(1, len(prefix)): |  | ||||||
|         idx = table[i - 1] |  | ||||||
|         while prefix[i] != prefix[idx]: |  | ||||||
|             if idx == 0: |  | ||||||
|                 table[i] = 0 |  | ||||||
|                 break |  | ||||||
|             idx = table[idx - 1] |  | ||||||
|         else: |  | ||||||
|             table[i] = idx + 1 |  | ||||||
|     return table |  | ||||||
| 
 |  | ||||||
| def _get_iscased(flags): |  | ||||||
|     if not flags & SRE_FLAG_IGNORECASE: |  | ||||||
|         return None |  | ||||||
|     elif flags & SRE_FLAG_UNICODE: |  | ||||||
|         return _sre.unicode_iscased |  | ||||||
|     else: |  | ||||||
|         return _sre.ascii_iscased |  | ||||||
| 
 |  | ||||||
| def _get_literal_prefix(pattern, flags): |  | ||||||
|     # look for literal prefix |  | ||||||
|     prefix = [] |  | ||||||
|     prefixappend = prefix.append |  | ||||||
|     prefix_skip = None |  | ||||||
|     iscased = _get_iscased(flags) |  | ||||||
|     for op, av in pattern.data: |  | ||||||
|         if op is LITERAL: |  | ||||||
|             if iscased and iscased(av): |  | ||||||
|                 break |  | ||||||
|             prefixappend(av) |  | ||||||
|         elif op is SUBPATTERN: |  | ||||||
|             group, add_flags, del_flags, p = av |  | ||||||
|             flags1 = _combine_flags(flags, add_flags, del_flags) |  | ||||||
|             if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE: |  | ||||||
|                 break |  | ||||||
|             prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1) |  | ||||||
|             if prefix_skip is None: |  | ||||||
|                 if group is not None: |  | ||||||
|                     prefix_skip = len(prefix) |  | ||||||
|                 elif prefix_skip1 is not None: |  | ||||||
|                     prefix_skip = len(prefix) + prefix_skip1 |  | ||||||
|             prefix.extend(prefix1) |  | ||||||
|             if not got_all: |  | ||||||
|                 break |  | ||||||
|         else: |  | ||||||
|             break |  | ||||||
|     else: |  | ||||||
|         return prefix, prefix_skip, True |  | ||||||
|     return prefix, prefix_skip, False |  | ||||||
| 
 |  | ||||||
| def _get_charset_prefix(pattern, flags): |  | ||||||
|     while True: |  | ||||||
|         if not pattern.data: |  | ||||||
|             return None |  | ||||||
|         op, av = pattern.data[0] |  | ||||||
|         if op is not SUBPATTERN: |  | ||||||
|             break |  | ||||||
|         group, add_flags, del_flags, pattern = av |  | ||||||
|         flags = _combine_flags(flags, add_flags, del_flags) |  | ||||||
|         if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: |  | ||||||
|             return None |  | ||||||
| 
 |  | ||||||
|     iscased = _get_iscased(flags) |  | ||||||
|     if op is LITERAL: |  | ||||||
|         if iscased and iscased(av): |  | ||||||
|             return None |  | ||||||
|         return [(op, av)] |  | ||||||
|     elif op is BRANCH: |  | ||||||
|         charset = [] |  | ||||||
|         charsetappend = charset.append |  | ||||||
|         for p in av[1]: |  | ||||||
|             if not p: |  | ||||||
|                 return None |  | ||||||
|             op, av = p[0] |  | ||||||
|             if op is LITERAL and not (iscased and iscased(av)): |  | ||||||
|                 charsetappend((op, av)) |  | ||||||
|             else: |  | ||||||
|                 return None |  | ||||||
|         return charset |  | ||||||
|     elif op is IN: |  | ||||||
|         charset = av |  | ||||||
|         if iscased: |  | ||||||
|             for op, av in charset: |  | ||||||
|                 if op is LITERAL: |  | ||||||
|                     if iscased(av): |  | ||||||
|                         return None |  | ||||||
|                 elif op is RANGE: |  | ||||||
|                     if av[1] > 0xffff: |  | ||||||
|                         return None |  | ||||||
|                     if any(map(iscased, range(av[0], av[1]+1))): |  | ||||||
|                         return None |  | ||||||
|         return charset |  | ||||||
|     return None |  | ||||||
| 
 |  | ||||||
| def _compile_info(code, pattern, flags): |  | ||||||
|     # internal: compile an info block.  in the current version, |  | ||||||
|     # this contains min/max pattern width, and an optional literal |  | ||||||
|     # prefix or a character map |  | ||||||
|     lo, hi = pattern.getwidth() |  | ||||||
|     if hi > MAXCODE: |  | ||||||
|         hi = MAXCODE |  | ||||||
|     if lo == 0: |  | ||||||
|         code.extend([INFO, 4, 0, lo, hi]) |  | ||||||
|         return |  | ||||||
|     # look for a literal prefix |  | ||||||
|     prefix = [] |  | ||||||
|     prefix_skip = 0 |  | ||||||
|     charset = [] # not used |  | ||||||
|     if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE): |  | ||||||
|         # look for literal prefix |  | ||||||
|         prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags) |  | ||||||
|         # if no prefix, look for charset prefix |  | ||||||
|         if not prefix: |  | ||||||
|             charset = _get_charset_prefix(pattern, flags) |  | ||||||
| ##     if prefix: |  | ||||||
| ##         print("*** PREFIX", prefix, prefix_skip) |  | ||||||
| ##     if charset: |  | ||||||
| ##         print("*** CHARSET", charset) |  | ||||||
|     # add an info block |  | ||||||
|     emit = code.append |  | ||||||
|     emit(INFO) |  | ||||||
|     skip = len(code); emit(0) |  | ||||||
|     # literal flag |  | ||||||
|     mask = 0 |  | ||||||
|     if prefix: |  | ||||||
|         mask = SRE_INFO_PREFIX |  | ||||||
|         if prefix_skip is None and got_all: |  | ||||||
|             mask = mask | SRE_INFO_LITERAL |  | ||||||
|     elif charset: |  | ||||||
|         mask = mask | SRE_INFO_CHARSET |  | ||||||
|     emit(mask) |  | ||||||
|     # pattern length |  | ||||||
|     if lo < MAXCODE: |  | ||||||
|         emit(lo) |  | ||||||
|     else: |  | ||||||
|         emit(MAXCODE) |  | ||||||
|         prefix = prefix[:MAXCODE] |  | ||||||
|     emit(min(hi, MAXCODE)) |  | ||||||
|     # add literal prefix |  | ||||||
|     if prefix: |  | ||||||
|         emit(len(prefix)) # length |  | ||||||
|         if prefix_skip is None: |  | ||||||
|             prefix_skip =  len(prefix) |  | ||||||
|         emit(prefix_skip) # skip |  | ||||||
|         code.extend(prefix) |  | ||||||
|         # generate overlap table |  | ||||||
|         code.extend(_generate_overlap_table(prefix)) |  | ||||||
|     elif charset: |  | ||||||
|         charset, hascased = _optimize_charset(charset) |  | ||||||
|         assert not hascased |  | ||||||
|         _compile_charset(charset, flags, code) |  | ||||||
|     code[skip] = len(code) - skip |  | ||||||
| 
 |  | ||||||
| def isstring(obj): |  | ||||||
|     return isinstance(obj, (str, bytes)) |  | ||||||
| 
 |  | ||||||
| def _code(p, flags): |  | ||||||
| 
 |  | ||||||
|     flags = p.state.flags | flags |  | ||||||
|     code = [] |  | ||||||
| 
 |  | ||||||
|     # compile info block |  | ||||||
|     _compile_info(code, p, flags) |  | ||||||
| 
 |  | ||||||
|     # compile the pattern |  | ||||||
|     _compile(code, p.data, flags) |  | ||||||
| 
 |  | ||||||
|     code.append(SUCCESS) |  | ||||||
| 
 |  | ||||||
|     return code |  | ||||||
| 
 |  | ||||||
| def _hex_code(code): |  | ||||||
|     return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code) |  | ||||||
| 
 |  | ||||||
| def dis(code): |  | ||||||
|     import sys |  | ||||||
| 
 |  | ||||||
|     labels = set() |  | ||||||
|     level = 0 |  | ||||||
|     offset_width = len(str(len(code) - 1)) |  | ||||||
| 
 |  | ||||||
|     def dis_(start, end): |  | ||||||
|         def print_(*args, to=None): |  | ||||||
|             if to is not None: |  | ||||||
|                 labels.add(to) |  | ||||||
|                 args += ('(to %d)' % (to,),) |  | ||||||
|             print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'), |  | ||||||
|                   end='  '*(level-1)) |  | ||||||
|             print(*args) |  | ||||||
| 
 |  | ||||||
|         def print_2(*args): |  | ||||||
|             print(end=' '*(offset_width + 2*level)) |  | ||||||
|             print(*args) |  | ||||||
| 
 |  | ||||||
|         nonlocal level |  | ||||||
|         level += 1 |  | ||||||
|         i = start |  | ||||||
|         while i < end: |  | ||||||
|             start = i |  | ||||||
|             op = code[i] |  | ||||||
|             i += 1 |  | ||||||
|             op = OPCODES[op] |  | ||||||
|             if op in (SUCCESS, FAILURE, ANY, ANY_ALL, |  | ||||||
|                       MAX_UNTIL, MIN_UNTIL, NEGATE): |  | ||||||
|                 print_(op) |  | ||||||
|             elif op in (LITERAL, NOT_LITERAL, |  | ||||||
|                         LITERAL_IGNORE, NOT_LITERAL_IGNORE, |  | ||||||
|                         LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE, |  | ||||||
|                         LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE): |  | ||||||
|                 arg = code[i] |  | ||||||
|                 i += 1 |  | ||||||
|                 print_(op, '%#02x (%r)' % (arg, chr(arg))) |  | ||||||
|             elif op is AT: |  | ||||||
|                 arg = code[i] |  | ||||||
|                 i += 1 |  | ||||||
|                 arg = str(ATCODES[arg]) |  | ||||||
|                 assert arg[:3] == 'AT_' |  | ||||||
|                 print_(op, arg[3:]) |  | ||||||
|             elif op is CATEGORY: |  | ||||||
|                 arg = code[i] |  | ||||||
|                 i += 1 |  | ||||||
|                 arg = str(CHCODES[arg]) |  | ||||||
|                 assert arg[:9] == 'CATEGORY_' |  | ||||||
|                 print_(op, arg[9:]) |  | ||||||
|             elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE): |  | ||||||
|                 skip = code[i] |  | ||||||
|                 print_(op, skip, to=i+skip) |  | ||||||
|                 dis_(i+1, i+skip) |  | ||||||
|                 i += skip |  | ||||||
|             elif op in (RANGE, RANGE_UNI_IGNORE): |  | ||||||
|                 lo, hi = code[i: i+2] |  | ||||||
|                 i += 2 |  | ||||||
|                 print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi))) |  | ||||||
|             elif op is CHARSET: |  | ||||||
|                 print_(op, _hex_code(code[i: i + 256//_CODEBITS])) |  | ||||||
|                 i += 256//_CODEBITS |  | ||||||
|             elif op is BIGCHARSET: |  | ||||||
|                 arg = code[i] |  | ||||||
|                 i += 1 |  | ||||||
|                 mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder) |  | ||||||
|                                         for x in code[i: i + 256//_sre.CODESIZE])) |  | ||||||
|                 print_(op, arg, mapping) |  | ||||||
|                 i += 256//_sre.CODESIZE |  | ||||||
|                 level += 1 |  | ||||||
|                 for j in range(arg): |  | ||||||
|                     print_2(_hex_code(code[i: i + 256//_CODEBITS])) |  | ||||||
|                     i += 256//_CODEBITS |  | ||||||
|                 level -= 1 |  | ||||||
|             elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE, |  | ||||||
|                         GROUPREF_LOC_IGNORE): |  | ||||||
|                 arg = code[i] |  | ||||||
|                 i += 1 |  | ||||||
|                 print_(op, arg) |  | ||||||
|             elif op is JUMP: |  | ||||||
|                 skip = code[i] |  | ||||||
|                 print_(op, skip, to=i+skip) |  | ||||||
|                 i += 1 |  | ||||||
|             elif op is BRANCH: |  | ||||||
|                 skip = code[i] |  | ||||||
|                 print_(op, skip, to=i+skip) |  | ||||||
|                 while skip: |  | ||||||
|                     dis_(i+1, i+skip) |  | ||||||
|                     i += skip |  | ||||||
|                     start = i |  | ||||||
|                     skip = code[i] |  | ||||||
|                     if skip: |  | ||||||
|                         print_('branch', skip, to=i+skip) |  | ||||||
|                     else: |  | ||||||
|                         print_(FAILURE) |  | ||||||
|                 i += 1 |  | ||||||
|             elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE, |  | ||||||
|                         POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE): |  | ||||||
|                 skip, min, max = code[i: i+3] |  | ||||||
|                 if max == MAXREPEAT: |  | ||||||
|                     max = 'MAXREPEAT' |  | ||||||
|                 print_(op, skip, min, max, to=i+skip) |  | ||||||
|                 dis_(i+3, i+skip) |  | ||||||
|                 i += skip |  | ||||||
|             elif op is GROUPREF_EXISTS: |  | ||||||
|                 arg, skip = code[i: i+2] |  | ||||||
|                 print_(op, arg, skip, to=i+skip) |  | ||||||
|                 i += 2 |  | ||||||
|             elif op in (ASSERT, ASSERT_NOT): |  | ||||||
|                 skip, arg = code[i: i+2] |  | ||||||
|                 print_(op, skip, arg, to=i+skip) |  | ||||||
|                 dis_(i+2, i+skip) |  | ||||||
|                 i += skip |  | ||||||
|             elif op is ATOMIC_GROUP: |  | ||||||
|                 skip = code[i] |  | ||||||
|                 print_(op, skip, to=i+skip) |  | ||||||
|                 dis_(i+1, i+skip) |  | ||||||
|                 i += skip |  | ||||||
|             elif op is INFO: |  | ||||||
|                 skip, flags, min, max = code[i: i+4] |  | ||||||
|                 if max == MAXREPEAT: |  | ||||||
|                     max = 'MAXREPEAT' |  | ||||||
|                 print_(op, skip, bin(flags), min, max, to=i+skip) |  | ||||||
|                 start = i+4 |  | ||||||
|                 if flags & SRE_INFO_PREFIX: |  | ||||||
|                     prefix_len, prefix_skip = code[i+4: i+6] |  | ||||||
|                     print_2('  prefix_skip', prefix_skip) |  | ||||||
|                     start = i + 6 |  | ||||||
|                     prefix = code[start: start+prefix_len] |  | ||||||
|                     print_2('  prefix', |  | ||||||
|                             '[%s]' % ', '.join('%#02x' % x for x in prefix), |  | ||||||
|                             '(%r)' % ''.join(map(chr, prefix))) |  | ||||||
|                     start += prefix_len |  | ||||||
|                     print_2('  overlap', code[start: start+prefix_len]) |  | ||||||
|                     start += prefix_len |  | ||||||
|                 if flags & SRE_INFO_CHARSET: |  | ||||||
|                     level += 1 |  | ||||||
|                     print_2('in') |  | ||||||
|                     dis_(start, i+skip) |  | ||||||
|                     level -= 1 |  | ||||||
|                 i += skip |  | ||||||
|             else: |  | ||||||
|                 raise ValueError(op) |  | ||||||
| 
 |  | ||||||
|         level -= 1 |  | ||||||
| 
 |  | ||||||
|     dis_(0, len(code)) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def compile(p, flags=0): |  | ||||||
|     # internal: convert pattern list to internal format |  | ||||||
| 
 |  | ||||||
|     if isstring(p): |  | ||||||
|         pattern = p |  | ||||||
|         p = sre_parse.parse(p, flags) |  | ||||||
|     else: |  | ||||||
|         pattern = None |  | ||||||
| 
 |  | ||||||
|     code = _code(p, flags) |  | ||||||
| 
 |  | ||||||
|     if flags & SRE_FLAG_DEBUG: |  | ||||||
|         print() |  | ||||||
|         dis(code) |  | ||||||
| 
 |  | ||||||
|     # map in either direction |  | ||||||
|     groupindex = p.state.groupdict |  | ||||||
|     indexgroup = [None] * p.state.groups |  | ||||||
|     for k, i in groupindex.items(): |  | ||||||
|         indexgroup[i] = k |  | ||||||
| 
 |  | ||||||
|     return _sre.compile( |  | ||||||
|         pattern, flags | p.state.flags, code, |  | ||||||
|         p.state.groups-1, |  | ||||||
|         groupindex, tuple(indexgroup) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|  | @ -1,262 +1,7 @@ | ||||||
| # | import warnings | ||||||
| # Secret Labs' Regular Expression Engine | warnings.warn(f"module {__name__!r} is deprecated", | ||||||
| # |               DeprecationWarning, | ||||||
| # various symbols used by the regular expression engine. |               stacklevel=2) | ||||||
| # run this script to update the _sre include files! |  | ||||||
| # |  | ||||||
| # Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved. |  | ||||||
| # |  | ||||||
| # See the sre.py file for information on usage and redistribution. |  | ||||||
| # |  | ||||||
| 
 | 
 | ||||||
| """Internal support module for sre""" | from re import _constants as _ | ||||||
| 
 | globals().update({k: v for k, v in vars(_).items() if k[:2] != '__'}) | ||||||
| # update when constants are added or removed |  | ||||||
| 
 |  | ||||||
| MAGIC = 20220318 |  | ||||||
| 
 |  | ||||||
| from _sre import MAXREPEAT, MAXGROUPS |  | ||||||
| 
 |  | ||||||
| # SRE standard exception (access as sre.error) |  | ||||||
| # should this really be here? |  | ||||||
| 
 |  | ||||||
| class error(Exception): |  | ||||||
|     """Exception raised for invalid regular expressions. |  | ||||||
| 
 |  | ||||||
|     Attributes: |  | ||||||
| 
 |  | ||||||
|         msg: The unformatted error message |  | ||||||
|         pattern: The regular expression pattern |  | ||||||
|         pos: The index in the pattern where compilation failed (may be None) |  | ||||||
|         lineno: The line corresponding to pos (may be None) |  | ||||||
|         colno: The column corresponding to pos (may be None) |  | ||||||
|     """ |  | ||||||
| 
 |  | ||||||
|     __module__ = 're' |  | ||||||
| 
 |  | ||||||
|     def __init__(self, msg, pattern=None, pos=None): |  | ||||||
|         self.msg = msg |  | ||||||
|         self.pattern = pattern |  | ||||||
|         self.pos = pos |  | ||||||
|         if pattern is not None and pos is not None: |  | ||||||
|             msg = '%s at position %d' % (msg, pos) |  | ||||||
|             if isinstance(pattern, str): |  | ||||||
|                 newline = '\n' |  | ||||||
|             else: |  | ||||||
|                 newline = b'\n' |  | ||||||
|             self.lineno = pattern.count(newline, 0, pos) + 1 |  | ||||||
|             self.colno = pos - pattern.rfind(newline, 0, pos) |  | ||||||
|             if newline in pattern: |  | ||||||
|                 msg = '%s (line %d, column %d)' % (msg, self.lineno, self.colno) |  | ||||||
|         else: |  | ||||||
|             self.lineno = self.colno = None |  | ||||||
|         super().__init__(msg) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class _NamedIntConstant(int): |  | ||||||
|     def __new__(cls, value, name): |  | ||||||
|         self = super(_NamedIntConstant, cls).__new__(cls, value) |  | ||||||
|         self.name = name |  | ||||||
|         return self |  | ||||||
| 
 |  | ||||||
|     def __repr__(self): |  | ||||||
|         return self.name |  | ||||||
| 
 |  | ||||||
| MAXREPEAT = _NamedIntConstant(MAXREPEAT, 'MAXREPEAT') |  | ||||||
| 
 |  | ||||||
| def _makecodes(names): |  | ||||||
|     names = names.strip().split() |  | ||||||
|     items = [_NamedIntConstant(i, name) for i, name in enumerate(names)] |  | ||||||
|     globals().update({item.name: item for item in items}) |  | ||||||
|     return items |  | ||||||
| 
 |  | ||||||
| # operators |  | ||||||
| # failure=0 success=1 (just because it looks better that way :-) |  | ||||||
| OPCODES = _makecodes(""" |  | ||||||
|     FAILURE SUCCESS |  | ||||||
| 
 |  | ||||||
|     ANY ANY_ALL |  | ||||||
|     ASSERT ASSERT_NOT |  | ||||||
|     AT |  | ||||||
|     BRANCH |  | ||||||
|     CALL |  | ||||||
|     CATEGORY |  | ||||||
|     CHARSET BIGCHARSET |  | ||||||
|     GROUPREF GROUPREF_EXISTS |  | ||||||
|     IN |  | ||||||
|     INFO |  | ||||||
|     JUMP |  | ||||||
|     LITERAL |  | ||||||
|     MARK |  | ||||||
|     MAX_UNTIL |  | ||||||
|     MIN_UNTIL |  | ||||||
|     NOT_LITERAL |  | ||||||
|     NEGATE |  | ||||||
|     RANGE |  | ||||||
|     REPEAT |  | ||||||
|     REPEAT_ONE |  | ||||||
|     SUBPATTERN |  | ||||||
|     MIN_REPEAT_ONE |  | ||||||
|     ATOMIC_GROUP |  | ||||||
|     POSSESSIVE_REPEAT |  | ||||||
|     POSSESSIVE_REPEAT_ONE |  | ||||||
| 
 |  | ||||||
|     GROUPREF_IGNORE |  | ||||||
|     IN_IGNORE |  | ||||||
|     LITERAL_IGNORE |  | ||||||
|     NOT_LITERAL_IGNORE |  | ||||||
| 
 |  | ||||||
|     GROUPREF_LOC_IGNORE |  | ||||||
|     IN_LOC_IGNORE |  | ||||||
|     LITERAL_LOC_IGNORE |  | ||||||
|     NOT_LITERAL_LOC_IGNORE |  | ||||||
| 
 |  | ||||||
|     GROUPREF_UNI_IGNORE |  | ||||||
|     IN_UNI_IGNORE |  | ||||||
|     LITERAL_UNI_IGNORE |  | ||||||
|     NOT_LITERAL_UNI_IGNORE |  | ||||||
|     RANGE_UNI_IGNORE |  | ||||||
| 
 |  | ||||||
|     MIN_REPEAT MAX_REPEAT |  | ||||||
| """) |  | ||||||
| del OPCODES[-2:] # remove MIN_REPEAT and MAX_REPEAT |  | ||||||
| 
 |  | ||||||
| # positions |  | ||||||
| ATCODES = _makecodes(""" |  | ||||||
|     AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING |  | ||||||
|     AT_BOUNDARY AT_NON_BOUNDARY |  | ||||||
|     AT_END AT_END_LINE AT_END_STRING |  | ||||||
| 
 |  | ||||||
|     AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY |  | ||||||
| 
 |  | ||||||
|     AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY |  | ||||||
| """) |  | ||||||
| 
 |  | ||||||
| # categories |  | ||||||
| CHCODES = _makecodes(""" |  | ||||||
|     CATEGORY_DIGIT CATEGORY_NOT_DIGIT |  | ||||||
|     CATEGORY_SPACE CATEGORY_NOT_SPACE |  | ||||||
|     CATEGORY_WORD CATEGORY_NOT_WORD |  | ||||||
|     CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK |  | ||||||
| 
 |  | ||||||
|     CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD |  | ||||||
| 
 |  | ||||||
|     CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT |  | ||||||
|     CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE |  | ||||||
|     CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD |  | ||||||
|     CATEGORY_UNI_LINEBREAK CATEGORY_UNI_NOT_LINEBREAK |  | ||||||
| """) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| # replacement operations for "ignore case" mode |  | ||||||
| OP_IGNORE = { |  | ||||||
|     LITERAL: LITERAL_IGNORE, |  | ||||||
|     NOT_LITERAL: NOT_LITERAL_IGNORE, |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| OP_LOCALE_IGNORE = { |  | ||||||
|     LITERAL: LITERAL_LOC_IGNORE, |  | ||||||
|     NOT_LITERAL: NOT_LITERAL_LOC_IGNORE, |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| OP_UNICODE_IGNORE = { |  | ||||||
|     LITERAL: LITERAL_UNI_IGNORE, |  | ||||||
|     NOT_LITERAL: NOT_LITERAL_UNI_IGNORE, |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| AT_MULTILINE = { |  | ||||||
|     AT_BEGINNING: AT_BEGINNING_LINE, |  | ||||||
|     AT_END: AT_END_LINE |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| AT_LOCALE = { |  | ||||||
|     AT_BOUNDARY: AT_LOC_BOUNDARY, |  | ||||||
|     AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| AT_UNICODE = { |  | ||||||
|     AT_BOUNDARY: AT_UNI_BOUNDARY, |  | ||||||
|     AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| CH_LOCALE = { |  | ||||||
|     CATEGORY_DIGIT: CATEGORY_DIGIT, |  | ||||||
|     CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT, |  | ||||||
|     CATEGORY_SPACE: CATEGORY_SPACE, |  | ||||||
|     CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE, |  | ||||||
|     CATEGORY_WORD: CATEGORY_LOC_WORD, |  | ||||||
|     CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD, |  | ||||||
|     CATEGORY_LINEBREAK: CATEGORY_LINEBREAK, |  | ||||||
|     CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| CH_UNICODE = { |  | ||||||
|     CATEGORY_DIGIT: CATEGORY_UNI_DIGIT, |  | ||||||
|     CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT, |  | ||||||
|     CATEGORY_SPACE: CATEGORY_UNI_SPACE, |  | ||||||
|     CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE, |  | ||||||
|     CATEGORY_WORD: CATEGORY_UNI_WORD, |  | ||||||
|     CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD, |  | ||||||
|     CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK, |  | ||||||
|     CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| # flags |  | ||||||
| SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking) |  | ||||||
| SRE_FLAG_IGNORECASE = 2 # case insensitive |  | ||||||
| SRE_FLAG_LOCALE = 4 # honour system locale |  | ||||||
| SRE_FLAG_MULTILINE = 8 # treat target as multiline string |  | ||||||
| SRE_FLAG_DOTALL = 16 # treat target as a single string |  | ||||||
| SRE_FLAG_UNICODE = 32 # use unicode "locale" |  | ||||||
| SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments |  | ||||||
| SRE_FLAG_DEBUG = 128 # debugging |  | ||||||
| SRE_FLAG_ASCII = 256 # use ascii "locale" |  | ||||||
| 
 |  | ||||||
| # flags for INFO primitive |  | ||||||
| SRE_INFO_PREFIX = 1 # has prefix |  | ||||||
| SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix) |  | ||||||
| SRE_INFO_CHARSET = 4 # pattern starts with character from given set |  | ||||||
| 
 |  | ||||||
| if __name__ == "__main__": |  | ||||||
|     def dump(f, d, prefix): |  | ||||||
|         items = sorted(d) |  | ||||||
|         for item in items: |  | ||||||
|             f.write("#define %s_%s %d\n" % (prefix, item, item)) |  | ||||||
|     with open("sre_constants.h", "w") as f: |  | ||||||
|         f.write("""\ |  | ||||||
| /* |  | ||||||
|  * Secret Labs' Regular Expression Engine |  | ||||||
|  * |  | ||||||
|  * regular expression matching engine |  | ||||||
|  * |  | ||||||
|  * NOTE: This file is generated by sre_constants.py.  If you need |  | ||||||
|  * to change anything in here, edit sre_constants.py and run it. |  | ||||||
|  * |  | ||||||
|  * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved. |  | ||||||
|  * |  | ||||||
|  * See the _sre.c file for information on usage and redistribution. |  | ||||||
|  */ |  | ||||||
| 
 |  | ||||||
| """) |  | ||||||
| 
 |  | ||||||
|         f.write("#define SRE_MAGIC %d\n" % MAGIC) |  | ||||||
| 
 |  | ||||||
|         dump(f, OPCODES, "SRE_OP") |  | ||||||
|         dump(f, ATCODES, "SRE") |  | ||||||
|         dump(f, CHCODES, "SRE") |  | ||||||
| 
 |  | ||||||
|         f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE) |  | ||||||
|         f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE) |  | ||||||
|         f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE) |  | ||||||
|         f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE) |  | ||||||
|         f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL) |  | ||||||
|         f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE) |  | ||||||
|         f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE) |  | ||||||
|         f.write("#define SRE_FLAG_DEBUG %d\n" % SRE_FLAG_DEBUG) |  | ||||||
|         f.write("#define SRE_FLAG_ASCII %d\n" % SRE_FLAG_ASCII) |  | ||||||
| 
 |  | ||||||
|         f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX) |  | ||||||
|         f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL) |  | ||||||
|         f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET) |  | ||||||
| 
 |  | ||||||
|     print("done") |  | ||||||
|  |  | ||||||
							
								
								
									
										1084
									
								
								Lib/sre_parse.py
									
										
									
									
									
								
							
							
						
						
									
										1084
									
								
								Lib/sre_parse.py
									
										
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							|  | @ -221,7 +221,7 @@ def test_others(self): | ||||||
|         cm('cgi', ignore=('log',))      # set with = in module |         cm('cgi', ignore=('log',))      # set with = in module | ||||||
|         cm('pickle', ignore=('partial', 'PickleBuffer')) |         cm('pickle', ignore=('partial', 'PickleBuffer')) | ||||||
|         cm('aifc', ignore=('_aifc_params',))  # set with = in module |         cm('aifc', ignore=('_aifc_params',))  # set with = in module | ||||||
|         cm('sre_parse', ignore=('dump', 'groups', 'pos')) # from sre_constants import *; property |         cm('re._parser', ignore=('dump', 'groups', 'pos')) # from ._constants import *; property | ||||||
|         cm( |         cm( | ||||||
|             'pdb', |             'pdb', | ||||||
|             # pyclbr does not handle elegantly `typing` or properties |             # pyclbr does not handle elegantly `typing` or properties | ||||||
|  |  | ||||||
|  | @ -3,8 +3,8 @@ | ||||||
|                           check_disallow_instantiation, is_emscripten) |                           check_disallow_instantiation, is_emscripten) | ||||||
| import locale | import locale | ||||||
| import re | import re | ||||||
| import sre_compile |  | ||||||
| import string | import string | ||||||
|  | import sys | ||||||
| import time | import time | ||||||
| import unittest | import unittest | ||||||
| import warnings | import warnings | ||||||
|  | @ -569,7 +569,7 @@ def test_re_groupref_exists(self): | ||||||
|                                'two branches', 10) |                                'two branches', 10) | ||||||
| 
 | 
 | ||||||
|     def test_re_groupref_overflow(self): |     def test_re_groupref_overflow(self): | ||||||
|         from sre_constants import MAXGROUPS |         from re._constants import MAXGROUPS | ||||||
|         self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx', |         self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx', | ||||||
|                                 'invalid group reference %d' % MAXGROUPS, 3) |                                 'invalid group reference %d' % MAXGROUPS, 3) | ||||||
|         self.checkPatternError(r'(?P<a>)(?(%d))' % MAXGROUPS, |         self.checkPatternError(r'(?P<a>)(?(%d))' % MAXGROUPS, | ||||||
|  | @ -2433,7 +2433,7 @@ def test_immutable(self): | ||||||
|             tp.foo = 1 |             tp.foo = 1 | ||||||
| 
 | 
 | ||||||
|     def test_overlap_table(self): |     def test_overlap_table(self): | ||||||
|         f = sre_compile._generate_overlap_table |         f = re._compiler._generate_overlap_table | ||||||
|         self.assertEqual(f(""), []) |         self.assertEqual(f(""), []) | ||||||
|         self.assertEqual(f("a"), [0]) |         self.assertEqual(f("a"), [0]) | ||||||
|         self.assertEqual(f("abcd"), [0, 0, 0, 0]) |         self.assertEqual(f("abcd"), [0, 0, 0, 0]) | ||||||
|  | @ -2442,8 +2442,8 @@ def test_overlap_table(self): | ||||||
|         self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0]) |         self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0]) | ||||||
| 
 | 
 | ||||||
|     def test_signedness(self): |     def test_signedness(self): | ||||||
|         self.assertGreaterEqual(sre_compile.MAXREPEAT, 0) |         self.assertGreaterEqual(re._compiler.MAXREPEAT, 0) | ||||||
|         self.assertGreaterEqual(sre_compile.MAXGROUPS, 0) |         self.assertGreaterEqual(re._compiler.MAXGROUPS, 0) | ||||||
| 
 | 
 | ||||||
|     @cpython_only |     @cpython_only | ||||||
|     def test_disallow_instantiation(self): |     def test_disallow_instantiation(self): | ||||||
|  | @ -2453,6 +2453,32 @@ def test_disallow_instantiation(self): | ||||||
|         pat = re.compile("") |         pat = re.compile("") | ||||||
|         check_disallow_instantiation(self, type(pat.scanner(""))) |         check_disallow_instantiation(self, type(pat.scanner(""))) | ||||||
| 
 | 
 | ||||||
|  |     def test_deprecated_modules(self): | ||||||
|  |         deprecated = { | ||||||
|  |             'sre_compile': ['compile', 'error', | ||||||
|  |                             'SRE_FLAG_IGNORECASE', 'SUBPATTERN', | ||||||
|  |                             '_compile_info'], | ||||||
|  |             'sre_constants': ['error', 'SRE_FLAG_IGNORECASE', 'SUBPATTERN', | ||||||
|  |                               '_NamedIntConstant'], | ||||||
|  |             'sre_parse': ['SubPattern', 'parse', | ||||||
|  |                           'SRE_FLAG_IGNORECASE', 'SUBPATTERN', | ||||||
|  |                           '_parse_sub'], | ||||||
|  |         } | ||||||
|  |         for name in deprecated: | ||||||
|  |             with self.subTest(module=name): | ||||||
|  |                 sys.modules.pop(name, None) | ||||||
|  |                 with self.assertWarns(DeprecationWarning) as cm: | ||||||
|  |                     __import__(name) | ||||||
|  |                 self.assertEqual(str(cm.warnings[0].message), | ||||||
|  |                                  f"module {name!r} is deprecated") | ||||||
|  |                 self.assertEqual(cm.warnings[0].filename, __file__) | ||||||
|  |                 self.assertIn(name, sys.modules) | ||||||
|  |                 mod = sys.modules[name] | ||||||
|  |                 self.assertEqual(mod.__name__, name) | ||||||
|  |                 self.assertEqual(mod.__package__, '') | ||||||
|  |                 for attr in deprecated[name]: | ||||||
|  |                     self.assertTrue(hasattr(mod, attr)) | ||||||
|  |                 del sys.modules[name] | ||||||
| 
 | 
 | ||||||
| class ExternalTests(unittest.TestCase): | class ExternalTests(unittest.TestCase): | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -523,7 +523,7 @@ def test_startup_imports(self): | ||||||
|         self.assertIn('site', modules) |         self.assertIn('site', modules) | ||||||
| 
 | 
 | ||||||
|         # http://bugs.python.org/issue19205 |         # http://bugs.python.org/issue19205 | ||||||
|         re_mods = {'re', '_sre', 'sre_compile', 'sre_constants', 'sre_parse'} |         re_mods = {'re', '_sre', 're._compiler', 're._constants', 're._parser'} | ||||||
|         self.assertFalse(modules.intersection(re_mods), stderr) |         self.assertFalse(modules.intersection(re_mods), stderr) | ||||||
| 
 | 
 | ||||||
|         # http://bugs.python.org/issue9548 |         # http://bugs.python.org/issue9548 | ||||||
|  |  | ||||||
|  | @ -1862,6 +1862,7 @@ LIBSUBDIRS=	asyncio \ | ||||||
| 		logging \ | 		logging \ | ||||||
| 		multiprocessing multiprocessing/dummy \ | 		multiprocessing multiprocessing/dummy \ | ||||||
| 		pydoc_data \ | 		pydoc_data \ | ||||||
|  | 		re \ | ||||||
| 		site-packages \ | 		site-packages \ | ||||||
| 		sqlite3 \ | 		sqlite3 \ | ||||||
| 		tkinter \ | 		tkinter \ | ||||||
|  |  | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | Convert the :mod:`re` module into a package. Deprecate modules ``sre_compile``, | ||||||
|  | ``sre_constants`` and ``sre_parse``. | ||||||
|  | @ -3,8 +3,8 @@ | ||||||
|  * |  * | ||||||
|  * regular expression matching engine |  * regular expression matching engine | ||||||
|  * |  * | ||||||
|  * NOTE: This file is generated by sre_constants.py.  If you need |  * NOTE: This file is generated by Lib/re/_constants.py.  If you need | ||||||
|  * to change anything in here, edit sre_constants.py and run it. |  * to change anything in here, edit Lib/re/_constants.py and run it. | ||||||
|  * |  * | ||||||
|  * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved. |  * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved. | ||||||
|  * |  * | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Serhiy Storchaka
						Serhiy Storchaka