| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | # | 
					
						
							|  |  |  | # Secret Labs' Regular Expression Engine | 
					
						
							|  |  |  | # | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  | # convert re-style regular expression to sre pattern | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | # | 
					
						
							| 
									
										
										
										
											2001-01-14 15:06:11 +00:00
										 |  |  | # Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved. | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | # | 
					
						
							| 
									
										
										
										
											2000-08-01 18:20:07 +00:00
										 |  |  | # See the sre.py file for information on usage and redistribution. | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | # | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-09-04 19:10:20 +00:00
										 |  |  | """Internal support module for sre""" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-01-14 21:00:44 +00:00
										 |  |  | # XXX: show string offset and offending character for all errors | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-02-18 12:05:16 +00:00
										 |  |  | # this module works under 1.5.2 and later.  don't use string methods | 
					
						
							|  |  |  | import string, sys | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | from sre_constants import * | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | SPECIAL_CHARS = ".\\[{()*+?^$|" | 
					
						
							| 
									
										
										
										
											2000-09-02 11:03:34 +00:00
										 |  |  | REPEAT_CHARS = "*+?{" | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-09-02 07:44:32 +00:00
										 |  |  | DIGITS = tuple("0123456789") | 
					
						
							| 
									
										
										
										
											2000-04-10 17:10:48 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-06-29 11:34:28 +00:00
										 |  |  | OCTDIGITS = tuple("01234567") | 
					
						
							|  |  |  | HEXDIGITS = tuple("0123456789abcdefABCDEF") | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  | WHITESPACE = tuple(" \t\n\r\v\f") | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | ESCAPES = { | 
					
						
							| 
									
										
										
										
											2001-02-18 12:05:16 +00:00
										 |  |  |     r"\a": (LITERAL, ord("\a")), | 
					
						
							|  |  |  |     r"\b": (LITERAL, ord("\b")), | 
					
						
							|  |  |  |     r"\f": (LITERAL, ord("\f")), | 
					
						
							|  |  |  |     r"\n": (LITERAL, ord("\n")), | 
					
						
							|  |  |  |     r"\r": (LITERAL, ord("\r")), | 
					
						
							|  |  |  |     r"\t": (LITERAL, ord("\t")), | 
					
						
							|  |  |  |     r"\v": (LITERAL, ord("\v")), | 
					
						
							| 
									
										
										
										
											2000-06-30 13:55:15 +00:00
										 |  |  |     r"\\": (LITERAL, ord("\\")) | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | CATEGORIES = { | 
					
						
							| 
									
										
										
										
											2001-01-14 15:06:11 +00:00
										 |  |  |     r"\A": (AT, AT_BEGINNING_STRING), # start of string | 
					
						
							| 
									
										
										
										
											2000-06-30 00:27:46 +00:00
										 |  |  |     r"\b": (AT, AT_BOUNDARY), | 
					
						
							|  |  |  |     r"\B": (AT, AT_NON_BOUNDARY), | 
					
						
							|  |  |  |     r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), | 
					
						
							|  |  |  |     r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), | 
					
						
							|  |  |  |     r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), | 
					
						
							|  |  |  |     r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), | 
					
						
							|  |  |  |     r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), | 
					
						
							|  |  |  |     r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), | 
					
						
							| 
									
										
										
										
											2001-01-14 15:06:11 +00:00
										 |  |  |     r"\Z": (AT, AT_END_STRING), # end of string | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  | FLAGS = { | 
					
						
							| 
									
										
										
										
											2000-06-29 08:58:44 +00:00
										 |  |  |     # standard flags | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  |     "i": SRE_FLAG_IGNORECASE, | 
					
						
							|  |  |  |     "L": SRE_FLAG_LOCALE, | 
					
						
							|  |  |  |     "m": SRE_FLAG_MULTILINE, | 
					
						
							|  |  |  |     "s": SRE_FLAG_DOTALL, | 
					
						
							|  |  |  |     "x": SRE_FLAG_VERBOSE, | 
					
						
							| 
									
										
										
										
											2000-06-29 08:58:44 +00:00
										 |  |  |     # extensions | 
					
						
							|  |  |  |     "t": SRE_FLAG_TEMPLATE, | 
					
						
							|  |  |  |     "u": SRE_FLAG_UNICODE, | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-02-18 12:05:16 +00:00
										 |  |  | # figure out best way to convert hex/octal numbers to integers | 
					
						
							|  |  |  | try: | 
					
						
							|  |  |  |     int("10", 8) | 
					
						
							|  |  |  |     atoi = int # 2.0 and later | 
					
						
							|  |  |  | except TypeError: | 
					
						
							|  |  |  |     atoi = string.atoi # 1.5.2 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  | class Pattern: | 
					
						
							|  |  |  |     # master pattern object.  keeps track of global attributes | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     def __init__(self): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         self.flags = 0 | 
					
						
							| 
									
										
										
										
											2000-10-28 19:30:41 +00:00
										 |  |  |         self.open = [] | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         self.groups = 1 | 
					
						
							|  |  |  |         self.groupdict = {} | 
					
						
							| 
									
										
										
										
											2000-10-28 19:30:41 +00:00
										 |  |  |     def opengroup(self, name=None): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         gid = self.groups | 
					
						
							|  |  |  |         self.groups = gid + 1 | 
					
						
							| 
									
										
										
										
											2002-06-02 00:40:05 +00:00
										 |  |  |         if name is not None: | 
					
						
							| 
									
										
										
										
											2001-11-03 19:35:43 +00:00
										 |  |  |             ogid = self.groupdict.get(name, None) | 
					
						
							|  |  |  |             if ogid is not None: | 
					
						
							| 
									
										
										
										
											2001-12-09 16:13:15 +00:00
										 |  |  |                 raise error, ("redefinition of group name %s as group %d; " | 
					
						
							|  |  |  |                               "was group %d" % (repr(name), gid,  ogid)) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |             self.groupdict[name] = gid | 
					
						
							| 
									
										
										
										
											2000-10-28 19:30:41 +00:00
										 |  |  |         self.open.append(gid) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         return gid | 
					
						
							| 
									
										
										
										
											2000-10-28 19:30:41 +00:00
										 |  |  |     def closegroup(self, gid): | 
					
						
							|  |  |  |         self.open.remove(gid) | 
					
						
							|  |  |  |     def checkgroup(self, gid): | 
					
						
							|  |  |  |         return gid < self.groups and gid not in self.open | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | class SubPattern: | 
					
						
							|  |  |  |     # a subpattern, in intermediate form | 
					
						
							|  |  |  |     def __init__(self, pattern, data=None): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         self.pattern = pattern | 
					
						
							| 
									
										
										
										
											2002-06-02 00:40:05 +00:00
										 |  |  |         if data is None: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |             data = [] | 
					
						
							|  |  |  |         self.data = data | 
					
						
							|  |  |  |         self.width = None | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  |     def dump(self, level=0): | 
					
						
							|  |  |  |         nl = 1 | 
					
						
							|  |  |  |         for op, av in self.data: | 
					
						
							|  |  |  |             print level*"  " + op,; nl = 0 | 
					
						
							|  |  |  |             if op == "in": | 
					
						
							|  |  |  |                 # member sublanguage | 
					
						
							|  |  |  |                 print; nl = 1 | 
					
						
							|  |  |  |                 for op, a in av: | 
					
						
							|  |  |  |                     print (level+1)*"  " + op, a | 
					
						
							|  |  |  |             elif op == "branch": | 
					
						
							|  |  |  |                 print; nl = 1 | 
					
						
							|  |  |  |                 i = 0 | 
					
						
							|  |  |  |                 for a in av[1]: | 
					
						
							|  |  |  |                     if i > 0: | 
					
						
							|  |  |  |                         print level*"  " + "or" | 
					
						
							|  |  |  |                     a.dump(level+1); nl = 1 | 
					
						
							|  |  |  |                     i = i + 1 | 
					
						
							|  |  |  |             elif type(av) in (type(()), type([])): | 
					
						
							|  |  |  |                 for a in av: | 
					
						
							|  |  |  |                     if isinstance(a, SubPattern): | 
					
						
							|  |  |  |                         if not nl: print | 
					
						
							|  |  |  |                         a.dump(level+1); nl = 1 | 
					
						
							|  |  |  |                     else: | 
					
						
							|  |  |  |                         print a, ; nl = 0 | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 print av, ; nl = 0 | 
					
						
							|  |  |  |             if not nl: print | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     def __repr__(self): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         return repr(self.data) | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     def __len__(self): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         return len(self.data) | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     def __delitem__(self, index): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         del self.data[index] | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     def __getitem__(self, index): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         return self.data[index] | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     def __setitem__(self, index, code): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         self.data[index] = code | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     def __getslice__(self, start, stop): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         return SubPattern(self.pattern, self.data[start:stop]) | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     def insert(self, index, code): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         self.data.insert(index, code) | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     def append(self, code): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         self.data.append(code) | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     def getwidth(self): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         # determine the width (min, max) for this subpattern | 
					
						
							|  |  |  |         if self.width: | 
					
						
							|  |  |  |             return self.width | 
					
						
							|  |  |  |         lo = hi = 0L | 
					
						
							|  |  |  |         for op, av in self.data: | 
					
						
							|  |  |  |             if op is BRANCH: | 
					
						
							| 
									
										
										
										
											2000-08-01 21:05:41 +00:00
										 |  |  |                 i = sys.maxint | 
					
						
							|  |  |  |                 j = 0 | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                 for av in av[1]: | 
					
						
							| 
									
										
										
										
											2000-08-01 21:05:41 +00:00
										 |  |  |                     l, h = av.getwidth() | 
					
						
							|  |  |  |                     i = min(i, l) | 
					
						
							| 
									
										
										
										
											2000-08-01 22:47:49 +00:00
										 |  |  |                     j = max(j, h) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                 lo = lo + i | 
					
						
							|  |  |  |                 hi = hi + j | 
					
						
							|  |  |  |             elif op is CALL: | 
					
						
							|  |  |  |                 i, j = av.getwidth() | 
					
						
							|  |  |  |                 lo = lo + i | 
					
						
							|  |  |  |                 hi = hi + j | 
					
						
							|  |  |  |             elif op is SUBPATTERN: | 
					
						
							|  |  |  |                 i, j = av[1].getwidth() | 
					
						
							|  |  |  |                 lo = lo + i | 
					
						
							|  |  |  |                 hi = hi + j | 
					
						
							|  |  |  |             elif op in (MIN_REPEAT, MAX_REPEAT): | 
					
						
							|  |  |  |                 i, j = av[2].getwidth() | 
					
						
							|  |  |  |                 lo = lo + long(i) * av[0] | 
					
						
							|  |  |  |                 hi = hi + long(j) * av[1] | 
					
						
							|  |  |  |             elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY): | 
					
						
							|  |  |  |                 lo = lo + 1 | 
					
						
							|  |  |  |                 hi = hi + 1 | 
					
						
							|  |  |  |             elif op == SUCCESS: | 
					
						
							|  |  |  |                 break | 
					
						
							|  |  |  |         self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) | 
					
						
							|  |  |  |         return self.width | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | class Tokenizer: | 
					
						
							|  |  |  |     def __init__(self, string): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         self.string = string | 
					
						
							| 
									
										
										
										
											2000-07-01 23:49:14 +00:00
										 |  |  |         self.index = 0 | 
					
						
							|  |  |  |         self.__next() | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     def __next(self): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         if self.index >= len(self.string): | 
					
						
							| 
									
										
										
										
											2000-07-01 23:49:14 +00:00
										 |  |  |             self.next = None | 
					
						
							|  |  |  |             return | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         char = self.string[self.index] | 
					
						
							|  |  |  |         if char[0] == "\\": | 
					
						
							|  |  |  |             try: | 
					
						
							|  |  |  |                 c = self.string[self.index + 1] | 
					
						
							|  |  |  |             except IndexError: | 
					
						
							| 
									
										
										
										
											2001-11-02 13:59:51 +00:00
										 |  |  |                 raise error, "bogus escape (end of line)" | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |             char = char + c | 
					
						
							|  |  |  |         self.index = self.index + len(char) | 
					
						
							| 
									
										
										
										
											2000-07-01 23:49:14 +00:00
										 |  |  |         self.next = char | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |     def match(self, char, skip=1): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         if char == self.next: | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |             if skip: | 
					
						
							|  |  |  |                 self.__next() | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |             return 1 | 
					
						
							|  |  |  |         return 0 | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     def get(self): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         this = self.next | 
					
						
							| 
									
										
										
										
											2000-07-01 23:49:14 +00:00
										 |  |  |         self.__next() | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         return this | 
					
						
							| 
									
										
										
										
											2000-07-01 23:49:14 +00:00
										 |  |  |     def tell(self): | 
					
						
							|  |  |  |         return self.index, self.next | 
					
						
							|  |  |  |     def seek(self, index): | 
					
						
							|  |  |  |         self.index, self.next = index | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-06-29 12:38:45 +00:00
										 |  |  | def isident(char): | 
					
						
							|  |  |  |     return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def isdigit(char): | 
					
						
							|  |  |  |     return "0" <= char <= "9" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def isname(name): | 
					
						
							|  |  |  |     # check that group name is a valid string | 
					
						
							|  |  |  |     if not isident(name[0]): | 
					
						
							| 
									
										
										
										
											2002-04-07 06:36:23 +00:00
										 |  |  |         return False | 
					
						
							| 
									
										
										
										
											2000-06-29 12:38:45 +00:00
										 |  |  |     for char in name: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         if not isident(char) and not isdigit(char): | 
					
						
							| 
									
										
										
										
											2002-04-07 06:36:23 +00:00
										 |  |  |             return False | 
					
						
							|  |  |  |     return True | 
					
						
							| 
									
										
										
										
											2000-06-29 12:38:45 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-06-30 00:27:46 +00:00
										 |  |  | def _group(escape, groups): | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  |     # check if the escape string represents a valid group | 
					
						
							|  |  |  |     try: | 
					
						
							| 
									
										
										
										
											2001-02-18 12:05:16 +00:00
										 |  |  |         gid = atoi(escape[1:]) | 
					
						
							| 
									
										
										
										
											2000-06-30 09:13:06 +00:00
										 |  |  |         if gid and gid < groups: | 
					
						
							|  |  |  |             return gid | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  |     except ValueError: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         pass | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  |     return None # not a valid group | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def _class_escape(source, escape): | 
					
						
							|  |  |  |     # handle escape code inside character class | 
					
						
							|  |  |  |     code = ESCAPES.get(escape) | 
					
						
							|  |  |  |     if code: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         return code | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  |     code = CATEGORIES.get(escape) | 
					
						
							|  |  |  |     if code: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         return code | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         if escape[1:2] == "x": | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |             # hexadecimal escape (exactly two digits) | 
					
						
							|  |  |  |             while source.next in HEXDIGITS and len(escape) < 4: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                 escape = escape + source.get() | 
					
						
							|  |  |  |             escape = escape[2:] | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |             if len(escape) != 2: | 
					
						
							|  |  |  |                 raise error, "bogus escape: %s" % repr("\\" + escape) | 
					
						
							| 
									
										
										
										
											2001-02-18 12:05:16 +00:00
										 |  |  |             return LITERAL, atoi(escape, 16) & 0xff | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         elif str(escape[1:2]) in OCTDIGITS: | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |             # octal escape (up to three digits) | 
					
						
							|  |  |  |             while source.next in OCTDIGITS and len(escape) < 5: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                 escape = escape + source.get() | 
					
						
							|  |  |  |             escape = escape[1:] | 
					
						
							| 
									
										
										
										
											2001-02-18 12:05:16 +00:00
										 |  |  |             return LITERAL, atoi(escape, 8) & 0xff | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         if len(escape) == 2: | 
					
						
							| 
									
										
										
										
											2000-06-30 13:55:15 +00:00
										 |  |  |             return LITERAL, ord(escape[1]) | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  |     except ValueError: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         pass | 
					
						
							| 
									
										
										
										
											2000-06-29 08:58:44 +00:00
										 |  |  |     raise error, "bogus escape: %s" % repr(escape) | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | def _escape(source, escape, state): | 
					
						
							|  |  |  |     # handle escape code in expression | 
					
						
							|  |  |  |     code = CATEGORIES.get(escape) | 
					
						
							|  |  |  |     if code: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         return code | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  |     code = ESCAPES.get(escape) | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     if code: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         return code | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         if escape[1:2] == "x": | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |             # hexadecimal escape | 
					
						
							|  |  |  |             while source.next in HEXDIGITS and len(escape) < 4: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                 escape = escape + source.get() | 
					
						
							| 
									
										
										
										
											2000-09-02 11:03:34 +00:00
										 |  |  |             if len(escape) != 4: | 
					
						
							|  |  |  |                 raise ValueError | 
					
						
							| 
									
										
										
										
											2001-02-18 12:05:16 +00:00
										 |  |  |             return LITERAL, atoi(escape[2:], 16) & 0xff | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |         elif escape[1:2] == "0": | 
					
						
							|  |  |  |             # octal escape | 
					
						
							| 
									
										
										
										
											2000-09-02 11:03:34 +00:00
										 |  |  |             while source.next in OCTDIGITS and len(escape) < 4: | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |                 escape = escape + source.get() | 
					
						
							| 
									
										
										
										
											2001-02-18 12:05:16 +00:00
										 |  |  |             return LITERAL, atoi(escape[1:], 8) & 0xff | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         elif escape[1:2] in DIGITS: | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |             # octal escape *or* decimal group reference (sigh) | 
					
						
							|  |  |  |             if source.next in DIGITS: | 
					
						
							|  |  |  |                 escape = escape + source.get() | 
					
						
							| 
									
										
										
										
											2000-09-02 11:03:34 +00:00
										 |  |  |                 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and | 
					
						
							|  |  |  |                     source.next in OCTDIGITS): | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |                     # got three octal digits; this is an octal escape | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                     escape = escape + source.get() | 
					
						
							| 
									
										
										
										
											2001-02-18 12:05:16 +00:00
										 |  |  |                     return LITERAL, atoi(escape[1:], 8) & 0xff | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |             # got at least one decimal digit; this is a group reference | 
					
						
							|  |  |  |             group = _group(escape, state.groups) | 
					
						
							|  |  |  |             if group: | 
					
						
							| 
									
										
										
										
											2000-10-28 19:30:41 +00:00
										 |  |  |                 if not state.checkgroup(group): | 
					
						
							|  |  |  |                     raise error, "cannot refer to open group" | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |                 return GROUPREF, group | 
					
						
							| 
									
										
										
										
											2000-09-02 11:03:34 +00:00
										 |  |  |             raise ValueError | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         if len(escape) == 2: | 
					
						
							| 
									
										
										
										
											2000-06-30 13:55:15 +00:00
										 |  |  |             return LITERAL, ord(escape[1]) | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     except ValueError: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         pass | 
					
						
							| 
									
										
										
										
											2000-06-29 08:58:44 +00:00
										 |  |  |     raise error, "bogus escape: %s" % repr(escape) | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  | def _parse_sub(source, state, nested=1): | 
					
						
							|  |  |  |     # parse an alternation: a|b|c | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  |     items = [] | 
					
						
							|  |  |  |     while 1: | 
					
						
							|  |  |  |         items.append(_parse(source, state)) | 
					
						
							|  |  |  |         if source.match("|"): | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if not nested: | 
					
						
							|  |  |  |             break | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |         if not source.next or source.match(")", 0): | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  |             break | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise error, "pattern not properly closed" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if len(items) == 1: | 
					
						
							|  |  |  |         return items[0] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     subpattern = SubPattern(state) | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     # check if all items share a common prefix | 
					
						
							|  |  |  |     while 1: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         prefix = None | 
					
						
							|  |  |  |         for item in items: | 
					
						
							|  |  |  |             if not item: | 
					
						
							|  |  |  |                 break | 
					
						
							|  |  |  |             if prefix is None: | 
					
						
							|  |  |  |                 prefix = item[0] | 
					
						
							|  |  |  |             elif item[0] != prefix: | 
					
						
							|  |  |  |                 break | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             # all subitems start with a common "prefix". | 
					
						
							|  |  |  |             # move it out of the branch | 
					
						
							|  |  |  |             for item in items: | 
					
						
							|  |  |  |                 del item[0] | 
					
						
							|  |  |  |             subpattern.append(prefix) | 
					
						
							|  |  |  |             continue # check next one | 
					
						
							|  |  |  |         break | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # check if the branch can be replaced by a character set | 
					
						
							|  |  |  |     for item in items: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         if len(item) != 1 or item[0][0] != LITERAL: | 
					
						
							|  |  |  |             break | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         # we can store this as a character set instead of a | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  |         # branch (the compiler may optimize this even more) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         set = [] | 
					
						
							|  |  |  |         for item in items: | 
					
						
							|  |  |  |             set.append(item[0]) | 
					
						
							|  |  |  |         subpattern.append((IN, set)) | 
					
						
							|  |  |  |         return subpattern | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     subpattern.append((BRANCH, (None, items))) | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  |     return subpattern | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-06-30 22:37:31 +00:00
										 |  |  | def _parse(source, state): | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  |     # parse a simple pattern | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  |     subpattern = SubPattern(state) | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     while 1: | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         if source.next in ("|", ")"): | 
					
						
							|  |  |  |             break # end of subpattern | 
					
						
							|  |  |  |         this = source.get() | 
					
						
							|  |  |  |         if this is None: | 
					
						
							|  |  |  |             break # end of pattern | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if state.flags & SRE_FLAG_VERBOSE: | 
					
						
							|  |  |  |             # skip whitespace and comments | 
					
						
							|  |  |  |             if this in WHITESPACE: | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             if this == "#": | 
					
						
							|  |  |  |                 while 1: | 
					
						
							|  |  |  |                     this = source.get() | 
					
						
							|  |  |  |                     if this in (None, "\n"): | 
					
						
							|  |  |  |                         break | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if this and this[0] not in SPECIAL_CHARS: | 
					
						
							| 
									
										
										
										
											2000-06-30 13:55:15 +00:00
										 |  |  |             subpattern.append((LITERAL, ord(this))) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         elif this == "[": | 
					
						
							|  |  |  |             # character set | 
					
						
							|  |  |  |             set = [] | 
					
						
							|  |  |  | ##          if source.match(":"): | 
					
						
							|  |  |  | ##              pass # handle character classes | 
					
						
							|  |  |  |             if source.match("^"): | 
					
						
							|  |  |  |                 set.append((NEGATE, None)) | 
					
						
							|  |  |  |             # check remaining characters | 
					
						
							|  |  |  |             start = set[:] | 
					
						
							|  |  |  |             while 1: | 
					
						
							|  |  |  |                 this = source.get() | 
					
						
							|  |  |  |                 if this == "]" and set != start: | 
					
						
							|  |  |  |                     break | 
					
						
							|  |  |  |                 elif this and this[0] == "\\": | 
					
						
							|  |  |  |                     code1 = _class_escape(source, this) | 
					
						
							|  |  |  |                 elif this: | 
					
						
							| 
									
										
										
										
											2000-06-30 13:55:15 +00:00
										 |  |  |                     code1 = LITERAL, ord(this) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                 else: | 
					
						
							|  |  |  |                     raise error, "unexpected end of regular expression" | 
					
						
							|  |  |  |                 if source.match("-"): | 
					
						
							|  |  |  |                     # potential range | 
					
						
							|  |  |  |                     this = source.get() | 
					
						
							|  |  |  |                     if this == "]": | 
					
						
							| 
									
										
										
										
											2000-10-07 10:16:19 +00:00
										 |  |  |                         if code1[0] is IN: | 
					
						
							|  |  |  |                             code1 = code1[1][0] | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                         set.append(code1) | 
					
						
							| 
									
										
										
										
											2000-06-30 13:55:15 +00:00
										 |  |  |                         set.append((LITERAL, ord("-"))) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                         break | 
					
						
							|  |  |  |                     else: | 
					
						
							|  |  |  |                         if this[0] == "\\": | 
					
						
							|  |  |  |                             code2 = _class_escape(source, this) | 
					
						
							|  |  |  |                         else: | 
					
						
							| 
									
										
										
										
											2000-06-30 13:55:15 +00:00
										 |  |  |                             code2 = LITERAL, ord(this) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                         if code1[0] != LITERAL or code2[0] != LITERAL: | 
					
						
							| 
									
										
										
										
											2001-01-14 21:00:44 +00:00
										 |  |  |                             raise error, "bad character range" | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |                         lo = code1[1] | 
					
						
							|  |  |  |                         hi = code2[1] | 
					
						
							|  |  |  |                         if hi < lo: | 
					
						
							| 
									
										
										
										
											2001-01-14 21:00:44 +00:00
										 |  |  |                             raise error, "bad character range" | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |                         set.append((RANGE, (lo, hi))) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                 else: | 
					
						
							|  |  |  |                     if code1[0] is IN: | 
					
						
							|  |  |  |                         code1 = code1[1][0] | 
					
						
							|  |  |  |                     set.append(code1) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-01-14 15:06:11 +00:00
										 |  |  |             # XXX: <fl> should move set optimization to compiler! | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |             if len(set)==1 and set[0][0] is LITERAL: | 
					
						
							|  |  |  |                 subpattern.append(set[0]) # optimization | 
					
						
							|  |  |  |             elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: | 
					
						
							|  |  |  |                 subpattern.append((NOT_LITERAL, set[1][1])) # optimization | 
					
						
							|  |  |  |             else: | 
					
						
							| 
									
										
										
										
											2001-01-14 15:06:11 +00:00
										 |  |  |                 # XXX: <fl> should add charmap optimization here | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                 subpattern.append((IN, set)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         elif this and this[0] in REPEAT_CHARS: | 
					
						
							|  |  |  |             # repeat previous item | 
					
						
							|  |  |  |             if this == "?": | 
					
						
							|  |  |  |                 min, max = 0, 1 | 
					
						
							|  |  |  |             elif this == "*": | 
					
						
							|  |  |  |                 min, max = 0, MAXREPEAT | 
					
						
							| 
									
										
										
										
											2001-02-18 21:04:48 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |             elif this == "+": | 
					
						
							|  |  |  |                 min, max = 1, MAXREPEAT | 
					
						
							|  |  |  |             elif this == "{": | 
					
						
							| 
									
										
										
										
											2000-07-01 23:49:14 +00:00
										 |  |  |                 here = source.tell() | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                 min, max = 0, MAXREPEAT | 
					
						
							|  |  |  |                 lo = hi = "" | 
					
						
							|  |  |  |                 while source.next in DIGITS: | 
					
						
							|  |  |  |                     lo = lo + source.get() | 
					
						
							|  |  |  |                 if source.match(","): | 
					
						
							|  |  |  |                     while source.next in DIGITS: | 
					
						
							|  |  |  |                         hi = hi + source.get() | 
					
						
							|  |  |  |                 else: | 
					
						
							|  |  |  |                     hi = lo | 
					
						
							|  |  |  |                 if not source.match("}"): | 
					
						
							| 
									
										
										
										
											2000-07-01 23:49:14 +00:00
										 |  |  |                     subpattern.append((LITERAL, ord(this))) | 
					
						
							|  |  |  |                     source.seek(here) | 
					
						
							|  |  |  |                     continue | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                 if lo: | 
					
						
							| 
									
										
										
										
											2001-02-18 12:05:16 +00:00
										 |  |  |                     min = atoi(lo) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                 if hi: | 
					
						
							| 
									
										
										
										
											2001-02-18 12:05:16 +00:00
										 |  |  |                     max = atoi(hi) | 
					
						
							| 
									
										
										
										
											2001-01-14 21:00:44 +00:00
										 |  |  |                 if max < min: | 
					
						
							|  |  |  |                     raise error, "bad repeat interval" | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |             else: | 
					
						
							|  |  |  |                 raise error, "not supported" | 
					
						
							|  |  |  |             # figure out which item to repeat | 
					
						
							|  |  |  |             if subpattern: | 
					
						
							|  |  |  |                 item = subpattern[-1:] | 
					
						
							|  |  |  |             else: | 
					
						
							| 
									
										
										
										
											2001-02-18 21:04:48 +00:00
										 |  |  |                 item = None | 
					
						
							|  |  |  |             if not item or (len(item) == 1 and item[0][0] == AT): | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                 raise error, "nothing to repeat" | 
					
						
							| 
									
										
										
										
											2001-01-14 21:00:44 +00:00
										 |  |  |             if item[0][0] in (MIN_REPEAT, MAX_REPEAT): | 
					
						
							|  |  |  |                 raise error, "multiple repeat" | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |             if source.match("?"): | 
					
						
							|  |  |  |                 subpattern[-1] = (MIN_REPEAT, (min, max, item)) | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 subpattern[-1] = (MAX_REPEAT, (min, max, item)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         elif this == ".": | 
					
						
							|  |  |  |             subpattern.append((ANY, None)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         elif this == "(": | 
					
						
							|  |  |  |             group = 1 | 
					
						
							|  |  |  |             name = None | 
					
						
							|  |  |  |             if source.match("?"): | 
					
						
							|  |  |  |                 group = 0 | 
					
						
							|  |  |  |                 # options | 
					
						
							|  |  |  |                 if source.match("P"): | 
					
						
							|  |  |  |                     # python extensions | 
					
						
							|  |  |  |                     if source.match("<"): | 
					
						
							|  |  |  |                         # named group: skip forward to end of name | 
					
						
							|  |  |  |                         name = "" | 
					
						
							|  |  |  |                         while 1: | 
					
						
							|  |  |  |                             char = source.get() | 
					
						
							|  |  |  |                             if char is None: | 
					
						
							|  |  |  |                                 raise error, "unterminated name" | 
					
						
							|  |  |  |                             if char == ">": | 
					
						
							|  |  |  |                                 break | 
					
						
							|  |  |  |                             name = name + char | 
					
						
							|  |  |  |                         group = 1 | 
					
						
							|  |  |  |                         if not isname(name): | 
					
						
							| 
									
										
										
										
											2001-01-14 21:00:44 +00:00
										 |  |  |                             raise error, "bad character in group name" | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                     elif source.match("="): | 
					
						
							|  |  |  |                         # named backreference | 
					
						
							| 
									
										
										
										
											2000-06-30 09:13:06 +00:00
										 |  |  |                         name = "" | 
					
						
							|  |  |  |                         while 1: | 
					
						
							|  |  |  |                             char = source.get() | 
					
						
							|  |  |  |                             if char is None: | 
					
						
							|  |  |  |                                 raise error, "unterminated name" | 
					
						
							|  |  |  |                             if char == ")": | 
					
						
							|  |  |  |                                 break | 
					
						
							|  |  |  |                             name = name + char | 
					
						
							|  |  |  |                         if not isname(name): | 
					
						
							| 
									
										
										
										
											2001-01-14 21:00:44 +00:00
										 |  |  |                             raise error, "bad character in group name" | 
					
						
							| 
									
										
										
										
											2000-06-30 09:13:06 +00:00
										 |  |  |                         gid = state.groupdict.get(name) | 
					
						
							|  |  |  |                         if gid is None: | 
					
						
							|  |  |  |                             raise error, "unknown group name" | 
					
						
							| 
									
										
										
										
											2000-07-03 21:31:48 +00:00
										 |  |  |                         subpattern.append((GROUPREF, gid)) | 
					
						
							| 
									
										
										
										
											2000-07-02 17:33:27 +00:00
										 |  |  |                         continue | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                     else: | 
					
						
							|  |  |  |                         char = source.get() | 
					
						
							|  |  |  |                         if char is None: | 
					
						
							|  |  |  |                             raise error, "unexpected end of pattern" | 
					
						
							|  |  |  |                         raise error, "unknown specifier: ?P%s" % char | 
					
						
							|  |  |  |                 elif source.match(":"): | 
					
						
							|  |  |  |                     # non-capturing group | 
					
						
							|  |  |  |                     group = 2 | 
					
						
							|  |  |  |                 elif source.match("#"): | 
					
						
							|  |  |  |                     # comment | 
					
						
							|  |  |  |                     while 1: | 
					
						
							|  |  |  |                         if source.next is None or source.next == ")": | 
					
						
							|  |  |  |                             break | 
					
						
							|  |  |  |                         source.get() | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |                     if not source.match(")"): | 
					
						
							|  |  |  |                         raise error, "unbalanced parenthesis" | 
					
						
							|  |  |  |                     continue | 
					
						
							| 
									
										
										
										
											2000-07-03 18:44:21 +00:00
										 |  |  |                 elif source.next in ("=", "!", "<"): | 
					
						
							| 
									
										
										
										
											2000-06-30 10:41:31 +00:00
										 |  |  |                     # lookahead assertions | 
					
						
							|  |  |  |                     char = source.get() | 
					
						
							| 
									
										
										
										
											2000-07-03 18:44:21 +00:00
										 |  |  |                     dir = 1 | 
					
						
							|  |  |  |                     if char == "<": | 
					
						
							|  |  |  |                         if source.next not in ("=", "!"): | 
					
						
							|  |  |  |                             raise error, "syntax error" | 
					
						
							|  |  |  |                         dir = -1 # lookbehind | 
					
						
							|  |  |  |                         char = source.get() | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  |                     p = _parse_sub(source, state) | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |                     if not source.match(")"): | 
					
						
							|  |  |  |                         raise error, "unbalanced parenthesis" | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  |                     if char == "=": | 
					
						
							|  |  |  |                         subpattern.append((ASSERT, (dir, p))) | 
					
						
							|  |  |  |                     else: | 
					
						
							|  |  |  |                         subpattern.append((ASSERT_NOT, (dir, p))) | 
					
						
							|  |  |  |                     continue | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                 else: | 
					
						
							|  |  |  |                     # flags | 
					
						
							| 
									
										
										
										
											2002-06-01 14:18:47 +00:00
										 |  |  |                     if not source.next in FLAGS: | 
					
						
							| 
									
										
										
										
											2001-01-14 21:00:44 +00:00
										 |  |  |                         raise error, "unexpected end of pattern" | 
					
						
							| 
									
										
										
										
											2002-06-01 14:18:47 +00:00
										 |  |  |                     while source.next in FLAGS: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                         state.flags = state.flags | FLAGS[source.get()] | 
					
						
							|  |  |  |             if group: | 
					
						
							|  |  |  |                 # parse group contents | 
					
						
							|  |  |  |                 if group == 2: | 
					
						
							|  |  |  |                     # anonymous group | 
					
						
							|  |  |  |                     group = None | 
					
						
							|  |  |  |                 else: | 
					
						
							| 
									
										
										
										
											2000-10-28 19:30:41 +00:00
										 |  |  |                     group = state.opengroup(name) | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  |                 p = _parse_sub(source, state) | 
					
						
							| 
									
										
										
										
											2000-08-31 22:57:55 +00:00
										 |  |  |                 if not source.match(")"): | 
					
						
							|  |  |  |                     raise error, "unbalanced parenthesis" | 
					
						
							| 
									
										
										
										
											2000-10-28 19:30:41 +00:00
										 |  |  |                 if group is not None: | 
					
						
							|  |  |  |                     state.closegroup(group) | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  |                 subpattern.append((SUBPATTERN, (group, p))) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |             else: | 
					
						
							|  |  |  |                 while 1: | 
					
						
							|  |  |  |                     char = source.get() | 
					
						
							| 
									
										
										
										
											2001-01-14 21:00:44 +00:00
										 |  |  |                     if char is None: | 
					
						
							|  |  |  |                         raise error, "unexpected end of pattern" | 
					
						
							|  |  |  |                     if char == ")": | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                         break | 
					
						
							|  |  |  |                     raise error, "unknown extension" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         elif this == "^": | 
					
						
							|  |  |  |             subpattern.append((AT, AT_BEGINNING)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         elif this == "$": | 
					
						
							|  |  |  |             subpattern.append((AT, AT_END)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         elif this and this[0] == "\\": | 
					
						
							|  |  |  |             code = _escape(source, this, state) | 
					
						
							|  |  |  |             subpattern.append(code) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             raise error, "parser error" | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     return subpattern | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-08-07 20:59:04 +00:00
										 |  |  | def parse(str, flags=0, pattern=None): | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  |     # parse 're' pattern into list of (opcode, argument) tuples | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     source = Tokenizer(str) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-08-07 20:59:04 +00:00
										 |  |  |     if pattern is None: | 
					
						
							|  |  |  |         pattern = Pattern() | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  |     pattern.flags = flags | 
					
						
							| 
									
										
										
										
											2001-01-14 21:00:44 +00:00
										 |  |  |     pattern.str = str | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     p = _parse_sub(source, pattern, 0) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     tail = source.get() | 
					
						
							|  |  |  |     if tail == ")": | 
					
						
							|  |  |  |         raise error, "unbalanced parenthesis" | 
					
						
							|  |  |  |     elif tail: | 
					
						
							|  |  |  |         raise error, "bogus characters at end of regular expression" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-01-14 15:06:11 +00:00
										 |  |  |     if flags & SRE_FLAG_DEBUG: | 
					
						
							|  |  |  |         p.dump() | 
					
						
							| 
									
										
										
										
											2000-07-23 21:46:17 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-10-03 19:22:26 +00:00
										 |  |  |     if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: | 
					
						
							|  |  |  |         # the VERBOSE flag was switched on inside the pattern.  to be | 
					
						
							|  |  |  |         # on the safe side, we'll parse the whole thing again... | 
					
						
							|  |  |  |         return parse(str, p.pattern.flags) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-31 14:58:54 +00:00
										 |  |  |     return p | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-06-29 08:58:44 +00:00
										 |  |  | def parse_template(source, pattern): | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  |     # parse 're' replacement string into list of literals and | 
					
						
							|  |  |  |     # group references | 
					
						
							|  |  |  |     s = Tokenizer(source) | 
					
						
							|  |  |  |     p = [] | 
					
						
							|  |  |  |     a = p.append | 
					
						
							| 
									
										
										
										
											2001-03-22 15:50:10 +00:00
										 |  |  |     def literal(literal, p=p): | 
					
						
							|  |  |  |         if p and p[-1][0] is LITERAL: | 
					
						
							|  |  |  |             p[-1] = LITERAL, p[-1][1] + literal | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             p.append((LITERAL, literal)) | 
					
						
							|  |  |  |     sep = source[:0] | 
					
						
							|  |  |  |     if type(sep) is type(""): | 
					
						
							| 
									
										
										
										
											2001-09-18 20:55:24 +00:00
										 |  |  |         makechar = chr | 
					
						
							| 
									
										
										
										
											2001-03-22 15:50:10 +00:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2001-09-18 20:55:24 +00:00
										 |  |  |         makechar = unichr | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  |     while 1: | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         this = s.get() | 
					
						
							|  |  |  |         if this is None: | 
					
						
							|  |  |  |             break # end of replacement string | 
					
						
							|  |  |  |         if this and this[0] == "\\": | 
					
						
							|  |  |  |             # group | 
					
						
							|  |  |  |             if this == "\\g": | 
					
						
							|  |  |  |                 name = "" | 
					
						
							|  |  |  |                 if s.match("<"): | 
					
						
							|  |  |  |                     while 1: | 
					
						
							|  |  |  |                         char = s.get() | 
					
						
							|  |  |  |                         if char is None: | 
					
						
							|  |  |  |                             raise error, "unterminated group name" | 
					
						
							|  |  |  |                         if char == ">": | 
					
						
							|  |  |  |                             break | 
					
						
							|  |  |  |                         name = name + char | 
					
						
							|  |  |  |                 if not name: | 
					
						
							|  |  |  |                     raise error, "bad group name" | 
					
						
							|  |  |  |                 try: | 
					
						
							| 
									
										
										
										
											2001-02-18 12:05:16 +00:00
										 |  |  |                     index = atoi(name) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                 except ValueError: | 
					
						
							|  |  |  |                     if not isname(name): | 
					
						
							| 
									
										
										
										
											2001-01-14 21:00:44 +00:00
										 |  |  |                         raise error, "bad character in group name" | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                     try: | 
					
						
							|  |  |  |                         index = pattern.groupindex[name] | 
					
						
							|  |  |  |                     except KeyError: | 
					
						
							|  |  |  |                         raise IndexError, "unknown group name" | 
					
						
							|  |  |  |                 a((MARK, index)) | 
					
						
							|  |  |  |             elif len(this) > 1 and this[1] in DIGITS: | 
					
						
							|  |  |  |                 code = None | 
					
						
							|  |  |  |                 while 1: | 
					
						
							|  |  |  |                     group = _group(this, pattern.groups+1) | 
					
						
							|  |  |  |                     if group: | 
					
						
							| 
									
										
										
										
											2000-09-24 14:46:23 +00:00
										 |  |  |                         if (s.next not in DIGITS or | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                             not _group(this + s.next, pattern.groups+1)): | 
					
						
							| 
									
										
										
										
											2001-01-16 07:37:30 +00:00
										 |  |  |                             code = MARK, group | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                             break | 
					
						
							|  |  |  |                     elif s.next in OCTDIGITS: | 
					
						
							|  |  |  |                         this = this + s.get() | 
					
						
							|  |  |  |                     else: | 
					
						
							|  |  |  |                         break | 
					
						
							|  |  |  |                 if not code: | 
					
						
							|  |  |  |                     this = this[1:] | 
					
						
							| 
									
										
										
										
											2001-09-18 20:55:24 +00:00
										 |  |  |                     code = LITERAL, makechar(atoi(this[-6:], 8) & 0xff) | 
					
						
							| 
									
										
										
										
											2001-03-22 15:50:10 +00:00
										 |  |  |                 if code[0] is LITERAL: | 
					
						
							|  |  |  |                     literal(code[1]) | 
					
						
							|  |  |  |                 else: | 
					
						
							|  |  |  |                     a(code) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |             else: | 
					
						
							|  |  |  |                 try: | 
					
						
							| 
									
										
										
										
											2001-09-18 20:55:24 +00:00
										 |  |  |                     this = makechar(ESCAPES[this][1]) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |                 except KeyError: | 
					
						
							| 
									
										
										
										
											2001-03-22 15:50:10 +00:00
										 |  |  |                     pass | 
					
						
							|  |  |  |                 literal(this) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |         else: | 
					
						
							| 
									
										
										
										
											2001-03-22 15:50:10 +00:00
										 |  |  |             literal(this) | 
					
						
							|  |  |  |     # convert template to groups and literals lists | 
					
						
							|  |  |  |     i = 0 | 
					
						
							|  |  |  |     groups = [] | 
					
						
							|  |  |  |     literals = [] | 
					
						
							|  |  |  |     for c, s in p: | 
					
						
							|  |  |  |         if c is MARK: | 
					
						
							|  |  |  |             groups.append((i, s)) | 
					
						
							|  |  |  |             literals.append(None) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             literals.append(s) | 
					
						
							|  |  |  |         i = i + 1 | 
					
						
							|  |  |  |     return groups, literals | 
					
						
							| 
									
										
										
										
											2000-06-09 14:08:07 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-06-29 08:58:44 +00:00
										 |  |  | def expand_template(template, match): | 
					
						
							| 
									
										
										
										
											2001-03-22 15:50:10 +00:00
										 |  |  |     g = match.group | 
					
						
							| 
									
										
										
										
											2000-06-30 13:55:15 +00:00
										 |  |  |     sep = match.string[:0] | 
					
						
							| 
									
										
										
										
											2001-03-22 15:50:10 +00:00
										 |  |  |     groups, literals = template | 
					
						
							|  |  |  |     literals = literals[:] | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         for index, group in groups: | 
					
						
							|  |  |  |             literals[index] = s = g(group) | 
					
						
							| 
									
										
										
										
											2000-06-30 07:50:59 +00:00
										 |  |  |             if s is None: | 
					
						
							| 
									
										
										
										
											2001-03-22 15:50:10 +00:00
										 |  |  |                 raise IndexError | 
					
						
							|  |  |  |     except IndexError: | 
					
						
							|  |  |  |         raise error, "empty group" | 
					
						
							|  |  |  |     return string.join(literals, sep) |