mirror of
				https://github.com/python/cpython.git
				synced 2025-10-24 18:33:49 +00:00 
			
		
		
		
	Switch to sre for regular expression matching (the new mini-re module
is actually by Fredrik Lundh). This will break the re tests -- Fredrik will fix this before the final release.
This commit is contained in:
		
							parent
							
								
									ef82cd7234
								
							
						
					
					
						commit
						2850d18615
					
				
					 3 changed files with 665 additions and 654 deletions
				
			
		
							
								
								
									
										652
									
								
								Lib/pre.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										652
									
								
								Lib/pre.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,652 @@ | |||
| # module 're' -- A collection of regular expression operations | ||||
| 
 | ||||
| """Support for regular expressions (RE). | ||||
| 
 | ||||
| This module provides regular expression matching operations similar to | ||||
| those found in Perl. It's 8-bit clean: the strings being processed may | ||||
| contain both null bytes and characters whose high bit is set. Regular | ||||
| expression pattern strings may not contain null bytes, but can specify | ||||
| the null byte using the \\number notation. Characters with the high | ||||
| bit set may be included. | ||||
| 
 | ||||
| Regular expressions can contain both special and ordinary | ||||
| characters. Most ordinary characters, like "A", "a", or "0", are the | ||||
| simplest regular expressions; they simply match themselves. You can | ||||
| concatenate ordinary characters, so last matches the string 'last'. | ||||
| 
 | ||||
| The special characters are: | ||||
|     "."      Matches any character except a newline. | ||||
|     "^"      Matches the start of the string. | ||||
|     "$"      Matches the end of the string. | ||||
|     "*"      Matches 0 or more (greedy) repetitions of the preceding RE. | ||||
|              Greedy means that it will match as many repetitions as possible. | ||||
|     "+"      Matches 1 or more (greedy) repetitions of the preceding RE. | ||||
|     "?"      Matches 0 or 1 (greedy) of the preceding RE. | ||||
|     *?,+?,?? Non-greedy versions of the previous three special characters. | ||||
|     {m,n}    Matches from m to n repetitions of the preceding RE. | ||||
|     {m,n}?   Non-greedy version of the above. | ||||
|     "\\"      Either escapes special characters or signals a special sequence. | ||||
|     []       Indicates a set of characters. | ||||
|              A "^" as the first character indicates a complementing set. | ||||
|     "|"      A|B, creates an RE that will match either A or B. | ||||
|     (...)    Matches the RE inside the parentheses. | ||||
|              The contents can be retrieved or matched later in the string. | ||||
|     (?iLmsx) Set the I, L, M, S, or X flag for the RE. | ||||
|     (?:...)  Non-grouping version of regular parentheses. | ||||
|     (?P<name>...) The substring matched by the group is accessible by name. | ||||
|     (?P=name)     Matches the text matched earlier by the group named name. | ||||
|     (?#...)  A comment; ignored. | ||||
|     (?=...)  Matches if ... matches next, but doesn't consume the string. | ||||
|     (?!...)  Matches if ... doesn't match next. | ||||
| 
 | ||||
| The special sequences consist of "\\" and a character from the list | ||||
| below. If the ordinary character is not on the list, then the | ||||
| resulting RE will match the second character. | ||||
|     \\number  Matches the contents of the group of the same number. | ||||
|     \\A       Matches only at the start of the string. | ||||
|     \\Z       Matches only at the end of the string.  | ||||
|     \\b       Matches the empty string, but only at the start or end of a word. | ||||
|     \\B       Matches the empty string, but not at the start or end of a word. | ||||
|     \\d       Matches any decimal digit; equivalent to the set [0-9]. | ||||
|     \\D       Matches any non-digit character; equivalent to the set [^0-9]. | ||||
|     \\s       Matches any whitespace character; equivalent to [ \\t\\n\\r\\f\\v]. | ||||
|     \\S       Matches any non-whitespace character; equiv. to [^ \\t\\n\\r\\f\\v]. | ||||
|     \\w       Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]. | ||||
|              With LOCALE, it will match the set [0-9_] plus characters defined | ||||
|              as letters for the current locale. | ||||
|     \\W       Matches the complement of \\w. | ||||
|     \\\\       Matches a literal backslash.  | ||||
| 
 | ||||
| This module exports the following functions: | ||||
|     match    Match a regular expression pattern to the beginning of a string. | ||||
|     search   Search a string for the presence of a pattern. | ||||
|     sub      Substitute occurrences of a pattern found in a string. | ||||
|     subn     Same as sub, but also return the number of substitutions made. | ||||
|     split    Split a string by the occurrences of a pattern. | ||||
|     findall  Find all occurrences of a pattern in a string. | ||||
|     compile  Compile a pattern into a RegexObject. | ||||
|     escape   Backslash all non-alphanumerics in a string. | ||||
| 
 | ||||
| This module exports the following classes: | ||||
|     RegexObject    Holds a compiled regular expression pattern. | ||||
|     MatchObject    Contains information about pattern matches. | ||||
| 
 | ||||
| Some of the functions in this module take flags as optional parameters: | ||||
|     I  IGNORECASE  Perform case-insensitive matching. | ||||
|     L  LOCALE      Make \w, \W, \b, \B, dependent on the current locale. | ||||
|     M  MULTILINE   "^" matches the beginning of lines as well as the string. | ||||
|                    "$" matches the end of lines as well as the string. | ||||
|     S  DOTALL      "." matches any character at all, including the newline. | ||||
|     X  VERBOSE     Ignore whitespaces and comments for nicer looking RE's. | ||||
| 
 | ||||
| This module also defines an exception 'error'. | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| import sys | ||||
| import string | ||||
| from pcre import * | ||||
| 
 | ||||
| # | ||||
| # First, the public part of the interface: | ||||
| # | ||||
| 
 | ||||
| # pcre.error and re.error should be the same, since exceptions can be | ||||
| # raised from either module. | ||||
| 
 | ||||
| # compilation flags | ||||
| 
 | ||||
| I = IGNORECASE | ||||
| L = LOCALE | ||||
| M = MULTILINE | ||||
| S = DOTALL  | ||||
| X = VERBOSE  | ||||
| 
 | ||||
| 
 | ||||
| # | ||||
| # | ||||
| # | ||||
| 
 | ||||
| _cache = {} | ||||
| _MAXCACHE = 20 | ||||
| 
 | ||||
| def _cachecompile(pattern, flags=0): | ||||
|     key = (pattern, flags) | ||||
|     try: | ||||
|         return _cache[key] | ||||
|     except KeyError: | ||||
|         pass | ||||
|     value = compile(pattern, flags) | ||||
|     if len(_cache) >= _MAXCACHE: | ||||
|         _cache.clear() | ||||
|     _cache[key] = value | ||||
|     return value | ||||
| 
 | ||||
| def match(pattern, string, flags=0): | ||||
|     """match (pattern, string[, flags]) -> MatchObject or None | ||||
|      | ||||
|     If zero or more characters at the beginning of string match the | ||||
|     regular expression pattern, return a corresponding MatchObject | ||||
|     instance. Return None if the string does not match the pattern; | ||||
|     note that this is different from a zero-length match. | ||||
| 
 | ||||
|     Note: If you want to locate a match anywhere in string, use | ||||
|     search() instead. | ||||
| 
 | ||||
|     """ | ||||
|      | ||||
|     return _cachecompile(pattern, flags).match(string) | ||||
|    | ||||
| def search(pattern, string, flags=0): | ||||
|     """search (pattern, string[, flags]) -> MatchObject or None | ||||
|      | ||||
|     Scan through string looking for a location where the regular | ||||
|     expression pattern produces a match, and return a corresponding | ||||
|     MatchObject instance. Return None if no position in the string | ||||
|     matches the pattern; note that this is different from finding a | ||||
|     zero-length match at some point in the string. | ||||
| 
 | ||||
|     """ | ||||
|     return _cachecompile(pattern, flags).search(string) | ||||
|    | ||||
| def sub(pattern, repl, string, count=0): | ||||
|     """sub(pattern, repl, string[, count=0]) -> string | ||||
|      | ||||
|     Return the string obtained by replacing the leftmost | ||||
|     non-overlapping occurrences of pattern in string by the | ||||
|     replacement repl. If the pattern isn't found, string is returned | ||||
|     unchanged. repl can be a string or a function; if a function, it | ||||
|     is called for every non-overlapping occurrence of pattern. The | ||||
|     function takes a single match object argument, and returns the | ||||
|     replacement string. | ||||
| 
 | ||||
|     The pattern may be a string or a regex object; if you need to | ||||
|     specify regular expression flags, you must use a regex object, or | ||||
|     use embedded modifiers in a pattern; e.g. | ||||
|     sub("(?i)b+", "x", "bbbb BBBB") returns 'x x'. | ||||
| 
 | ||||
|     The optional argument count is the maximum number of pattern | ||||
|     occurrences to be replaced; count must be a non-negative integer, | ||||
|     and the default value of 0 means to replace all occurrences. | ||||
| 
 | ||||
|     """ | ||||
|     if type(pattern) == type(''): | ||||
|         pattern = _cachecompile(pattern) | ||||
|     return pattern.sub(repl, string, count) | ||||
| 
 | ||||
| def subn(pattern, repl, string, count=0): | ||||
|     """subn(pattern, repl, string[, count=0]) -> (string, num substitutions) | ||||
|      | ||||
|     Perform the same operation as sub(), but return a tuple | ||||
|     (new_string, number_of_subs_made). | ||||
| 
 | ||||
|     """ | ||||
|     if type(pattern) == type(''): | ||||
|         pattern = _cachecompile(pattern) | ||||
|     return pattern.subn(repl, string, count) | ||||
|    | ||||
| def split(pattern, string, maxsplit=0): | ||||
|     """split(pattern, string[, maxsplit=0]) -> list of strings | ||||
|      | ||||
|     Split string by the occurrences of pattern. If capturing | ||||
|     parentheses are used in pattern, then the text of all groups in | ||||
|     the pattern are also returned as part of the resulting list. If | ||||
|     maxsplit is nonzero, at most maxsplit splits occur, and the | ||||
|     remainder of the string is returned as the final element of the | ||||
|     list. | ||||
| 
 | ||||
|     """ | ||||
|     if type(pattern) == type(''): | ||||
|         pattern = _cachecompile(pattern) | ||||
|     return pattern.split(string, maxsplit) | ||||
| 
 | ||||
| def findall(pattern, string): | ||||
|     """findall(pattern, string) -> list | ||||
|      | ||||
|     Return a list of all non-overlapping matches of pattern in | ||||
|     string. If one or more groups are present in the pattern, return a | ||||
|     list of groups; this will be a list of tuples if the pattern has | ||||
|     more than one group. Empty matches are included in the result. | ||||
| 
 | ||||
|     """ | ||||
|     if type(pattern) == type(''): | ||||
|         pattern = _cachecompile(pattern) | ||||
|     return pattern.findall(string) | ||||
| 
 | ||||
| def escape(pattern): | ||||
|     """escape(string) -> string | ||||
|      | ||||
|     Return string with all non-alphanumerics backslashed; this is | ||||
|     useful if you want to match an arbitrary literal string that may | ||||
|     have regular expression metacharacters in it. | ||||
| 
 | ||||
|     """ | ||||
|     result = list(pattern) | ||||
|     alphanum=string.letters+'_'+string.digits | ||||
|     for i in range(len(pattern)): | ||||
|         char = pattern[i] | ||||
|         if char not in alphanum: | ||||
|             if char=='\000': result[i] = '\\000' | ||||
|             else: result[i] = '\\'+char | ||||
|     return string.join(result, '') | ||||
| 
 | ||||
| def compile(pattern, flags=0): | ||||
|     """compile(pattern[, flags]) -> RegexObject | ||||
| 
 | ||||
|     Compile a regular expression pattern into a regular expression | ||||
|     object, which can be used for matching using its match() and | ||||
|     search() methods. | ||||
| 
 | ||||
|     """ | ||||
|     groupindex={} | ||||
|     code=pcre_compile(pattern, flags, groupindex) | ||||
|     return RegexObject(pattern, flags, code, groupindex) | ||||
|      | ||||
| 
 | ||||
| # | ||||
| #   Class definitions | ||||
| # | ||||
| 
 | ||||
| class RegexObject: | ||||
|     """Holds a compiled regular expression pattern. | ||||
| 
 | ||||
|     Methods: | ||||
|     match    Match the pattern to the beginning of a string. | ||||
|     search   Search a string for the presence of the pattern. | ||||
|     sub      Substitute occurrences of the pattern found in a string. | ||||
|     subn     Same as sub, but also return the number of substitutions made. | ||||
|     split    Split a string by the occurrences of the pattern. | ||||
|     findall  Find all occurrences of the pattern in a string. | ||||
|      | ||||
|     """ | ||||
| 
 | ||||
|     def __init__(self, pattern, flags, code, groupindex): | ||||
|         self.code = code  | ||||
|         self.flags = flags | ||||
|         self.pattern = pattern | ||||
|         self.groupindex = groupindex | ||||
| 
 | ||||
|     def search(self, string, pos=0, endpos=None): | ||||
|         """search(string[, pos][, endpos]) -> MatchObject or None | ||||
|          | ||||
|         Scan through string looking for a location where this regular | ||||
|         expression produces a match, and return a corresponding | ||||
|         MatchObject instance. Return None if no position in the string | ||||
|         matches the pattern; note that this is different from finding | ||||
|         a zero-length match at some point in the string. The optional | ||||
|         pos and endpos parameters have the same meaning as for the | ||||
|         match() method. | ||||
|      | ||||
|         """ | ||||
|         if endpos is None or endpos>len(string):  | ||||
|             endpos=len(string) | ||||
|         if endpos<pos: endpos=pos | ||||
|         regs = self.code.match(string, pos, endpos, 0) | ||||
|         if regs is None: | ||||
|             return None | ||||
|         self._num_regs=len(regs) | ||||
|          | ||||
|         return MatchObject(self, | ||||
|                            string, | ||||
|                            pos, endpos, | ||||
|                            regs) | ||||
|      | ||||
|     def match(self, string, pos=0, endpos=None): | ||||
|         """match(string[, pos][, endpos]) -> MatchObject or None | ||||
|          | ||||
|         If zero or more characters at the beginning of string match | ||||
|         this regular expression, return a corresponding MatchObject | ||||
|         instance. Return None if the string does not match the | ||||
|         pattern; note that this is different from a zero-length match. | ||||
| 
 | ||||
|         Note: If you want to locate a match anywhere in string, use | ||||
|         search() instead. | ||||
| 
 | ||||
|         The optional second parameter pos gives an index in the string | ||||
|         where the search is to start; it defaults to 0.  This is not | ||||
|         completely equivalent to slicing the string; the '^' pattern | ||||
|         character matches at the real beginning of the string and at | ||||
|         positions just after a newline, but not necessarily at the | ||||
|         index where the search is to start. | ||||
| 
 | ||||
|         The optional parameter endpos limits how far the string will | ||||
|         be searched; it will be as if the string is endpos characters | ||||
|         long, so only the characters from pos to endpos will be | ||||
|         searched for a match. | ||||
| 
 | ||||
|         """ | ||||
|         if endpos is None or endpos>len(string):  | ||||
|             endpos=len(string) | ||||
|         if endpos<pos: endpos=pos | ||||
|         regs = self.code.match(string, pos, endpos, ANCHORED) | ||||
|         if regs is None: | ||||
|             return None | ||||
|         self._num_regs=len(regs) | ||||
|         return MatchObject(self, | ||||
|                            string, | ||||
|                            pos, endpos, | ||||
|                            regs) | ||||
|      | ||||
|     def sub(self, repl, string, count=0): | ||||
|         """sub(repl, string[, count=0]) -> string | ||||
|          | ||||
|         Return the string obtained by replacing the leftmost | ||||
|         non-overlapping occurrences of the compiled pattern in string | ||||
|         by the replacement repl. If the pattern isn't found, string is | ||||
|         returned unchanged. | ||||
| 
 | ||||
|         Identical to the sub() function, using the compiled pattern. | ||||
|          | ||||
|         """ | ||||
|         return self.subn(repl, string, count)[0] | ||||
|      | ||||
|     def subn(self, repl, source, count=0):  | ||||
|         """subn(repl, string[, count=0]) -> tuple | ||||
|          | ||||
|         Perform the same operation as sub(), but return a tuple | ||||
|         (new_string, number_of_subs_made). | ||||
| 
 | ||||
|         """ | ||||
|         if count < 0: | ||||
|             raise error, "negative substitution count" | ||||
|         if count == 0: | ||||
|             count = sys.maxint | ||||
|         n = 0           # Number of matches | ||||
|         pos = 0         # Where to start searching | ||||
|         lastmatch = -1  # End of last match | ||||
|         results = []    # Substrings making up the result | ||||
|         end = len(source) | ||||
| 
 | ||||
|         if type(repl) is type(''): | ||||
|             # See if repl contains group references | ||||
|             try: | ||||
|                 repl = pcre_expand(_Dummy, repl) | ||||
|             except: | ||||
|                 m = MatchObject(self, source, 0, end, []) | ||||
|                 repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl) | ||||
|             else: | ||||
|                 m = None | ||||
|         else: | ||||
|             m = MatchObject(self, source, 0, end, []) | ||||
| 
 | ||||
|         match = self.code.match | ||||
|         append = results.append | ||||
|         while n < count and pos <= end: | ||||
|             regs = match(source, pos, end, 0) | ||||
|             if not regs: | ||||
|                 break | ||||
|             self._num_regs = len(regs) | ||||
|             i, j = regs[0] | ||||
|             if i == j == lastmatch: | ||||
|                 # Empty match adjacent to previous match | ||||
|                 pos = pos + 1 | ||||
|                 append(source[lastmatch:pos]) | ||||
|                 continue | ||||
|             if pos < i: | ||||
|                 append(source[pos:i]) | ||||
|             if m: | ||||
|                 m.pos = pos | ||||
|                 m.regs = regs | ||||
|                 append(repl(m)) | ||||
|             else: | ||||
|                 append(repl) | ||||
|             pos = lastmatch = j | ||||
|             if i == j: | ||||
|                 # Last match was empty; don't try here again | ||||
|                 pos = pos + 1 | ||||
|                 append(source[lastmatch:pos]) | ||||
|             n = n + 1 | ||||
|         append(source[pos:]) | ||||
|         return (string.join(results, ''), n) | ||||
|                                                                              | ||||
|     def split(self, source, maxsplit=0): | ||||
|         """split(source[, maxsplit=0]) -> list of strings | ||||
|      | ||||
|         Split string by the occurrences of the compiled pattern. If | ||||
|         capturing parentheses are used in the pattern, then the text | ||||
|         of all groups in the pattern are also returned as part of the | ||||
|         resulting list. If maxsplit is nonzero, at most maxsplit | ||||
|         splits occur, and the remainder of the string is returned as | ||||
|         the final element of the list. | ||||
|          | ||||
|         """ | ||||
|         if maxsplit < 0: | ||||
|             raise error, "negative split count" | ||||
|         if maxsplit == 0: | ||||
|             maxsplit = sys.maxint | ||||
|         n = 0 | ||||
|         pos = 0 | ||||
|         lastmatch = 0 | ||||
|         results = [] | ||||
|         end = len(source) | ||||
|         match = self.code.match | ||||
|         append = results.append | ||||
|         while n < maxsplit: | ||||
|             regs = match(source, pos, end, 0) | ||||
|             if not regs: | ||||
|                 break | ||||
|             i, j = regs[0] | ||||
|             if i == j: | ||||
|                 # Empty match | ||||
|                 if pos >= end: | ||||
|                     break | ||||
|                 pos = pos+1 | ||||
|                 continue | ||||
|             append(source[lastmatch:i]) | ||||
|             rest = regs[1:] | ||||
|             if rest: | ||||
|                 for a, b in rest: | ||||
|                     if a == -1 or b == -1: | ||||
|                         group = None | ||||
|                     else: | ||||
|                         group = source[a:b] | ||||
|                     append(group) | ||||
|             pos = lastmatch = j | ||||
|             n = n + 1 | ||||
|         append(source[lastmatch:]) | ||||
|         return results | ||||
| 
 | ||||
|     def findall(self, source): | ||||
|         """findall(source) -> list | ||||
|      | ||||
|         Return a list of all non-overlapping matches of the compiled | ||||
|         pattern in string. If one or more groups are present in the | ||||
|         pattern, return a list of groups; this will be a list of | ||||
|         tuples if the pattern has more than one group. Empty matches | ||||
|         are included in the result. | ||||
| 
 | ||||
|         """ | ||||
|         pos = 0 | ||||
|         end = len(source) | ||||
|         results = [] | ||||
|         match = self.code.match | ||||
|         append = results.append | ||||
|         while pos <= end: | ||||
|             regs = match(source, pos, end, 0) | ||||
|             if not regs: | ||||
|                 break | ||||
|             i, j = regs[0] | ||||
|             rest = regs[1:] | ||||
|             if not rest: | ||||
|                 gr = source[i:j] | ||||
|             elif len(rest) == 1: | ||||
|                 a, b = rest[0] | ||||
|                 gr = source[a:b] | ||||
|             else: | ||||
|                 gr = [] | ||||
|                 for (a, b) in rest: | ||||
|                     gr.append(source[a:b]) | ||||
|                 gr = tuple(gr) | ||||
|             append(gr) | ||||
|             pos = max(j, pos+1) | ||||
|         return results | ||||
| 
 | ||||
|     # The following 3 functions were contributed by Mike Fletcher, and | ||||
|     # allow pickling and unpickling of RegexObject instances. | ||||
|     def __getinitargs__(self): | ||||
|         return (None,None,None,None) # any 4 elements, to work around | ||||
|                                      # problems with the | ||||
|                                      # pickle/cPickle modules not yet  | ||||
|                                      # ignoring the __init__ function | ||||
|     def __getstate__(self): | ||||
|         return self.pattern, self.flags, self.groupindex | ||||
|     def __setstate__(self, statetuple): | ||||
|         self.pattern = statetuple[0] | ||||
|         self.flags = statetuple[1] | ||||
|         self.groupindex = statetuple[2] | ||||
|         self.code = apply(pcre_compile, statetuple) | ||||
| 
 | ||||
| class _Dummy: | ||||
|     # Dummy class used by RegexObject.subn().  Has 'group' to avoid core dump. | ||||
|     group = None | ||||
| 
 | ||||
| class MatchObject: | ||||
|     """Contains information about a pattern match. | ||||
| 
 | ||||
|     Methods: | ||||
|     start      Return the index of the start of a matched substring. | ||||
|     end        Return the index of the end of a matched substring. | ||||
|     span       Return a tuple of (start, end) of a matched substring. | ||||
|     groups     Return a tuple of all the subgroups of the match. | ||||
|     group      Return one or more subgroups of the match. | ||||
|     groupdict  Return a dictionary of all the named subgroups of the match. | ||||
| 
 | ||||
|     """ | ||||
| 
 | ||||
|     def __init__(self, re, string, pos, endpos, regs): | ||||
|         self.re = re | ||||
|         self.string = string | ||||
|         self.pos = pos  | ||||
|         self.endpos = endpos | ||||
|         self.regs = regs | ||||
|          | ||||
|     def start(self, g = 0): | ||||
|         """start([group=0]) -> int or None | ||||
|          | ||||
|         Return the index of the start of the substring matched by | ||||
|         group; group defaults to zero (meaning the whole matched | ||||
|         substring). Return None if group exists but did not contribute | ||||
|         to the match. | ||||
| 
 | ||||
|         """ | ||||
|         if type(g) == type(''): | ||||
|             try: | ||||
|                 g = self.re.groupindex[g] | ||||
|             except (KeyError, TypeError): | ||||
|                 raise IndexError, 'group %s is undefined' % `g` | ||||
|         return self.regs[g][0] | ||||
|      | ||||
|     def end(self, g = 0): | ||||
|         """end([group=0]) -> int or None | ||||
|          | ||||
|         Return the index of the end of the substring matched by | ||||
|         group; group defaults to zero (meaning the whole matched | ||||
|         substring). Return None if group exists but did not contribute | ||||
|         to the match. | ||||
| 
 | ||||
|         """ | ||||
|         if type(g) == type(''): | ||||
|             try: | ||||
|                 g = self.re.groupindex[g] | ||||
|             except (KeyError, TypeError): | ||||
|                 raise IndexError, 'group %s is undefined' % `g` | ||||
|         return self.regs[g][1] | ||||
|      | ||||
|     def span(self, g = 0): | ||||
|         """span([group=0]) -> tuple | ||||
|          | ||||
|         Return the 2-tuple (m.start(group), m.end(group)). Note that | ||||
|         if group did not contribute to the match, this is (None, | ||||
|         None). Group defaults to zero (meaning the whole matched | ||||
|         substring). | ||||
| 
 | ||||
|         """ | ||||
|         if type(g) == type(''): | ||||
|             try: | ||||
|                 g = self.re.groupindex[g] | ||||
|             except (KeyError, TypeError): | ||||
|                 raise IndexError, 'group %s is undefined' % `g` | ||||
|         return self.regs[g] | ||||
|      | ||||
|     def groups(self, default=None): | ||||
|         """groups([default=None]) -> tuple | ||||
|          | ||||
|         Return a tuple containing all the subgroups of the match, from | ||||
|         1 up to however many groups are in the pattern. The default | ||||
|         argument is used for groups that did not participate in the | ||||
|         match. | ||||
| 
 | ||||
|         """ | ||||
|         result = [] | ||||
|         for g in range(1, self.re._num_regs): | ||||
|             a, b = self.regs[g] | ||||
|             if a == -1 or b == -1: | ||||
|                 result.append(default) | ||||
|             else: | ||||
|                 result.append(self.string[a:b]) | ||||
|         return tuple(result) | ||||
| 
 | ||||
|     def group(self, *groups): | ||||
|         """group([group1, group2, ...]) -> string or tuple | ||||
|          | ||||
|         Return one or more subgroups of the match. If there is a | ||||
|         single argument, the result is a single string; if there are | ||||
|         multiple arguments, the result is a tuple with one item per | ||||
|         argument. Without arguments, group1 defaults to zero (i.e. the | ||||
|         whole match is returned). If a groupN argument is zero, the | ||||
|         corresponding return value is the entire matching string; if | ||||
|         it is in the inclusive range [1..99], it is the string | ||||
|         matching the corresponding parenthesized group. If a group | ||||
|         number is negative or larger than the number of groups defined | ||||
|         in the pattern, an IndexError exception is raised. If a group | ||||
|         is contained in a part of the pattern that did not match, the | ||||
|         corresponding result is None. If a group is contained in a | ||||
|         part of the pattern that matched multiple times, the last | ||||
|         match is returned. | ||||
| 
 | ||||
|         If the regular expression uses the (?P<name>...) syntax, the | ||||
|         groupN arguments may also be strings identifying groups by | ||||
|         their group name. If a string argument is not used as a group | ||||
|         name in the pattern, an IndexError exception is raised. | ||||
| 
 | ||||
|         """ | ||||
|         if len(groups) == 0: | ||||
|             groups = (0,) | ||||
|         result = [] | ||||
|         for g in groups: | ||||
|             if type(g) == type(''): | ||||
|                 try: | ||||
|                     g = self.re.groupindex[g] | ||||
|                 except (KeyError, TypeError): | ||||
|                     raise IndexError, 'group %s is undefined' % `g` | ||||
|             if g >= len(self.regs): | ||||
|                 raise IndexError, 'group %s is undefined' % `g` | ||||
|             a, b = self.regs[g] | ||||
|             if a == -1 or b == -1: | ||||
|                 result.append(None) | ||||
|             else: | ||||
|                 result.append(self.string[a:b]) | ||||
|         if len(result) > 1: | ||||
|             return tuple(result) | ||||
|         elif len(result) == 1: | ||||
|             return result[0] | ||||
|         else: | ||||
|             return () | ||||
| 
 | ||||
|     def groupdict(self, default=None): | ||||
|         """groupdict([default=None]) -> dictionary | ||||
|          | ||||
|         Return a dictionary containing all the named subgroups of the | ||||
|         match, keyed by the subgroup name. The default argument is | ||||
|         used for groups that did not participate in the match. | ||||
| 
 | ||||
|         """ | ||||
|         dict = {} | ||||
|         for name, index in self.re.groupindex.items(): | ||||
|             a, b = self.regs[index] | ||||
|             if a == -1 or b == -1: | ||||
|                 dict[name] = default | ||||
|             else: | ||||
|                 dict[name] = self.string[a:b] | ||||
|         return dict | ||||
							
								
								
									
										659
									
								
								Lib/re.py
									
										
									
									
									
								
							
							
						
						
									
										659
									
								
								Lib/re.py
									
										
									
									
									
								
							|  | @ -1,652 +1,11 @@ | |||
| # module 're' -- A collection of regular expression operations | ||||
| # change this to "pre" if your regexps stopped working.  don't | ||||
| # forget to send a bug report to <some suitable address> | ||||
| 
 | ||||
| """Support for regular expressions (RE). | ||||
| engine = "sre" | ||||
| 
 | ||||
| This module provides regular expression matching operations similar to | ||||
| those found in Perl. It's 8-bit clean: the strings being processed may | ||||
| contain both null bytes and characters whose high bit is set. Regular | ||||
| expression pattern strings may not contain null bytes, but can specify | ||||
| the null byte using the \\number notation. Characters with the high | ||||
| bit set may be included. | ||||
| 
 | ||||
| Regular expressions can contain both special and ordinary | ||||
| characters. Most ordinary characters, like "A", "a", or "0", are the | ||||
| simplest regular expressions; they simply match themselves. You can | ||||
| concatenate ordinary characters, so last matches the string 'last'. | ||||
| 
 | ||||
| The special characters are: | ||||
|     "."      Matches any character except a newline. | ||||
|     "^"      Matches the start of the string. | ||||
|     "$"      Matches the end of the string. | ||||
|     "*"      Matches 0 or more (greedy) repetitions of the preceding RE. | ||||
|              Greedy means that it will match as many repetitions as possible. | ||||
|     "+"      Matches 1 or more (greedy) repetitions of the preceding RE. | ||||
|     "?"      Matches 0 or 1 (greedy) of the preceding RE. | ||||
|     *?,+?,?? Non-greedy versions of the previous three special characters. | ||||
|     {m,n}    Matches from m to n repetitions of the preceding RE. | ||||
|     {m,n}?   Non-greedy version of the above. | ||||
|     "\\"      Either escapes special characters or signals a special sequence. | ||||
|     []       Indicates a set of characters. | ||||
|              A "^" as the first character indicates a complementing set. | ||||
|     "|"      A|B, creates an RE that will match either A or B. | ||||
|     (...)    Matches the RE inside the parentheses. | ||||
|              The contents can be retrieved or matched later in the string. | ||||
|     (?iLmsx) Set the I, L, M, S, or X flag for the RE. | ||||
|     (?:...)  Non-grouping version of regular parentheses. | ||||
|     (?P<name>...) The substring matched by the group is accessible by name. | ||||
|     (?P=name)     Matches the text matched earlier by the group named name. | ||||
|     (?#...)  A comment; ignored. | ||||
|     (?=...)  Matches if ... matches next, but doesn't consume the string. | ||||
|     (?!...)  Matches if ... doesn't match next. | ||||
| 
 | ||||
| The special sequences consist of "\\" and a character from the list | ||||
| below. If the ordinary character is not on the list, then the | ||||
| resulting RE will match the second character. | ||||
|     \\number  Matches the contents of the group of the same number. | ||||
|     \\A       Matches only at the start of the string. | ||||
|     \\Z       Matches only at the end of the string.  | ||||
|     \\b       Matches the empty string, but only at the start or end of a word. | ||||
|     \\B       Matches the empty string, but not at the start or end of a word. | ||||
|     \\d       Matches any decimal digit; equivalent to the set [0-9]. | ||||
|     \\D       Matches any non-digit character; equivalent to the set [^0-9]. | ||||
|     \\s       Matches any whitespace character; equivalent to [ \\t\\n\\r\\f\\v]. | ||||
|     \\S       Matches any non-whitespace character; equiv. to [^ \\t\\n\\r\\f\\v]. | ||||
|     \\w       Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]. | ||||
|              With LOCALE, it will match the set [0-9_] plus characters defined | ||||
|              as letters for the current locale. | ||||
|     \\W       Matches the complement of \\w. | ||||
|     \\\\       Matches a literal backslash.  | ||||
| 
 | ||||
| This module exports the following functions: | ||||
|     match    Match a regular expression pattern to the beginning of a string. | ||||
|     search   Search a string for the presence of a pattern. | ||||
|     sub      Substitute occurrences of a pattern found in a string. | ||||
|     subn     Same as sub, but also return the number of substitutions made. | ||||
|     split    Split a string by the occurrences of a pattern. | ||||
|     findall  Find all occurrences of a pattern in a string. | ||||
|     compile  Compile a pattern into a RegexObject. | ||||
|     escape   Backslash all non-alphanumerics in a string. | ||||
| 
 | ||||
| This module exports the following classes: | ||||
|     RegexObject    Holds a compiled regular expression pattern. | ||||
|     MatchObject    Contains information about pattern matches. | ||||
| 
 | ||||
| Some of the functions in this module take flags as optional parameters: | ||||
|     I  IGNORECASE  Perform case-insensitive matching. | ||||
|     L  LOCALE      Make \\w, \\W, \\b, \\B dependent on the current locale. | ||||
|     M  MULTILINE   "^" matches the beginning of lines as well as the string. | ||||
|                    "$" matches the end of lines as well as the string. | ||||
|     S  DOTALL      "." matches any character at all, including the newline. | ||||
|     X  VERBOSE     Ignore whitespace and comments for nicer looking RE's. | ||||
| 
 | ||||
| This module also defines an exception 'error'. | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| import sys | ||||
| import string | ||||
| from pcre import * | ||||
| 
 | ||||
| # | ||||
| # First, the public part of the interface: | ||||
| # | ||||
| 
 | ||||
| # pcre.error and re.error should be the same, since exceptions can be | ||||
| # raised from either module. | ||||
| 
 | ||||
| # compilation flags | ||||
| 
 | ||||
| I = IGNORECASE | ||||
| L = LOCALE | ||||
| M = MULTILINE | ||||
| S = DOTALL  | ||||
| X = VERBOSE  | ||||
| 
 | ||||
| 
 | ||||
| # | ||||
| # | ||||
| # | ||||
| 
 | ||||
| _cache = {} | ||||
| _MAXCACHE = 20 | ||||
| 
 | ||||
| def _cachecompile(pattern, flags=0): | ||||
|     key = (pattern, flags) | ||||
|     try: | ||||
|         return _cache[key] | ||||
|     except KeyError: | ||||
|         pass | ||||
|     value = compile(pattern, flags) | ||||
|     if len(_cache) >= _MAXCACHE: | ||||
|         _cache.clear() | ||||
|     _cache[key] = value | ||||
|     return value | ||||
| 
 | ||||
| def match(pattern, string, flags=0): | ||||
|     """match (pattern, string[, flags]) -> MatchObject or None | ||||
|      | ||||
|     If zero or more characters at the beginning of string match the | ||||
|     regular expression pattern, return a corresponding MatchObject | ||||
|     instance. Return None if the string does not match the pattern; | ||||
|     note that this is different from a zero-length match. | ||||
| 
 | ||||
|     Note: If you want to locate a match anywhere in string, use | ||||
|     search() instead. | ||||
| 
 | ||||
|     """ | ||||
|      | ||||
|     return _cachecompile(pattern, flags).match(string) | ||||
|    | ||||
| def search(pattern, string, flags=0): | ||||
|     """search (pattern, string[, flags]) -> MatchObject or None | ||||
|      | ||||
|     Scan through string looking for a location where the regular | ||||
|     expression pattern produces a match, and return a corresponding | ||||
|     MatchObject instance. Return None if no position in the string | ||||
|     matches the pattern; note that this is different from finding a | ||||
|     zero-length match at some point in the string. | ||||
| 
 | ||||
|     """ | ||||
|     return _cachecompile(pattern, flags).search(string) | ||||
|    | ||||
| def sub(pattern, repl, string, count=0): | ||||
|     """sub(pattern, repl, string[, count=0]) -> string | ||||
|      | ||||
|     Return the string obtained by replacing the leftmost | ||||
|     non-overlapping occurrences of pattern in string by the | ||||
|     replacement repl. If the pattern isn't found, string is returned | ||||
|     unchanged. repl can be a string or a function; if a function, it | ||||
|     is called for every non-overlapping occurrence of pattern. The | ||||
|     function takes a single match object argument, and returns the | ||||
|     replacement string. | ||||
| 
 | ||||
|     The pattern may be a string or a regex object; if you need to | ||||
|     specify regular expression flags, you must use a regex object, or | ||||
|     use embedded modifiers in a pattern; e.g. | ||||
|     sub("(?i)b+", "x", "bbbb BBBB") returns 'x x'. | ||||
| 
 | ||||
|     The optional argument count is the maximum number of pattern | ||||
|     occurrences to be replaced; count must be a non-negative integer, | ||||
|     and the default value of 0 means to replace all occurrences. | ||||
| 
 | ||||
|     """ | ||||
|     if type(pattern) == type(''): | ||||
|         pattern = _cachecompile(pattern) | ||||
|     return pattern.sub(repl, string, count) | ||||
| 
 | ||||
| def subn(pattern, repl, string, count=0): | ||||
|     """subn(pattern, repl, string[, count=0]) -> (string, num substitutions) | ||||
|      | ||||
|     Perform the same operation as sub(), but return a tuple | ||||
|     (new_string, number_of_subs_made). | ||||
| 
 | ||||
|     """ | ||||
|     if type(pattern) == type(''): | ||||
|         pattern = _cachecompile(pattern) | ||||
|     return pattern.subn(repl, string, count) | ||||
|    | ||||
| def split(pattern, string, maxsplit=0): | ||||
|     """split(pattern, string[, maxsplit=0]) -> list of strings | ||||
|      | ||||
|     Split string by the occurrences of pattern. If capturing | ||||
|     parentheses are used in pattern, then the text of all groups in | ||||
|     the pattern is also returned as part of the resulting list. If | ||||
|     maxsplit is nonzero, at most maxsplit splits occur, and the | ||||
|     remainder of the string is returned as the final element of the | ||||
|     list. | ||||
| 
 | ||||
|     """ | ||||
|     if type(pattern) == type(''): | ||||
|         pattern = _cachecompile(pattern) | ||||
|     return pattern.split(string, maxsplit) | ||||
| 
 | ||||
| def findall(pattern, string): | ||||
|     """findall(pattern, string) -> list | ||||
|      | ||||
|     Return a list of all non-overlapping matches of pattern in | ||||
|     string. If one or more groups are present in the pattern, return a | ||||
|     list of groups; this will be a list of tuples if the pattern has | ||||
|     more than one group. Empty matches are included in the result. | ||||
| 
 | ||||
|     """ | ||||
|     if type(pattern) == type(''): | ||||
|         pattern = _cachecompile(pattern) | ||||
|     return pattern.findall(string) | ||||
| 
 | ||||
| def escape(pattern): | ||||
|     """escape(string) -> string | ||||
|      | ||||
|     Return string with all non-alphanumerics backslashed; this is | ||||
|     useful if you want to match an arbitrary literal string that may | ||||
|     have regular expression metacharacters in it. | ||||
| 
 | ||||
|     """ | ||||
|     result = list(pattern) | ||||
|     alphanum=string.letters+'_'+string.digits | ||||
|     for i in range(len(pattern)): | ||||
|         char = pattern[i] | ||||
|         if char not in alphanum: | ||||
|             if char=='\000': result[i] = '\\000' | ||||
|             else: result[i] = '\\'+char | ||||
|     return string.join(result, '') | ||||
| 
 | ||||
| def compile(pattern, flags=0): | ||||
|     """compile(pattern[, flags]) -> RegexObject | ||||
| 
 | ||||
|     Compile a regular expression pattern into a regular expression | ||||
|     object, which can be used for matching using its match() and | ||||
|     search() methods. | ||||
| 
 | ||||
|     """ | ||||
|     groupindex={} | ||||
|     code=pcre_compile(pattern, flags, groupindex) | ||||
|     return RegexObject(pattern, flags, code, groupindex) | ||||
|      | ||||
| 
 | ||||
| # | ||||
| #   Class definitions | ||||
| # | ||||
| 
 | ||||
| class RegexObject: | ||||
|     """Holds a compiled regular expression pattern. | ||||
| 
 | ||||
|     Methods: | ||||
|     match    Match the pattern to the beginning of a string. | ||||
|     search   Search a string for the presence of the pattern. | ||||
|     sub      Substitute occurrences of the pattern found in a string. | ||||
|     subn     Same as sub, but also return the number of substitutions made. | ||||
|     split    Split a string by the occurrences of the pattern. | ||||
|     findall  Find all occurrences of the pattern in a string. | ||||
|      | ||||
|     """ | ||||
| 
 | ||||
|     def __init__(self, pattern, flags, code, groupindex): | ||||
|         self.code = code  | ||||
|         self.flags = flags | ||||
|         self.pattern = pattern | ||||
|         self.groupindex = groupindex | ||||
| 
 | ||||
|     def search(self, string, pos=0, endpos=None): | ||||
|         """search(string[, pos][, endpos]) -> MatchObject or None | ||||
|          | ||||
|         Scan through string looking for a location where this regular | ||||
|         expression produces a match, and return a corresponding | ||||
|         MatchObject instance. Return None if no position in the string | ||||
|         matches the pattern; note that this is different from finding | ||||
|         a zero-length match at some point in the string. The optional | ||||
|         pos and endpos parameters have the same meaning as for the | ||||
|         match() method. | ||||
|      | ||||
|         """ | ||||
|         if endpos is None or endpos>len(string):  | ||||
|             endpos=len(string) | ||||
|         if endpos<pos: endpos=pos | ||||
|         regs = self.code.match(string, pos, endpos, 0) | ||||
|         if regs is None: | ||||
|             return None | ||||
|         self._num_regs=len(regs) | ||||
|          | ||||
|         return MatchObject(self, | ||||
|                            string, | ||||
|                            pos, endpos, | ||||
|                            regs) | ||||
|      | ||||
|     def match(self, string, pos=0, endpos=None): | ||||
|         """match(string[, pos][, endpos]) -> MatchObject or None | ||||
|          | ||||
|         If zero or more characters at the beginning of string match | ||||
|         this regular expression, return a corresponding MatchObject | ||||
|         instance. Return None if the string does not match the | ||||
|         pattern; note that this is different from a zero-length match. | ||||
| 
 | ||||
|         Note: If you want to locate a match anywhere in string, use | ||||
|         search() instead. | ||||
| 
 | ||||
|         The optional second parameter pos gives an index in the string | ||||
|         where the search is to start; it defaults to 0.  This is not | ||||
|         completely equivalent to slicing the string; the '^' pattern | ||||
|         character matches at the real beginning of the string and at | ||||
|         positions just after a newline, but not necessarily at the | ||||
|         index where the search is to start. | ||||
| 
 | ||||
|         The optional parameter endpos limits how far the string will | ||||
|         be searched; it will be as if the string is endpos characters | ||||
|         long, so only the characters from pos to endpos will be | ||||
|         searched for a match. | ||||
| 
 | ||||
|         """ | ||||
|         if endpos is None or endpos>len(string):  | ||||
|             endpos=len(string) | ||||
|         if endpos<pos: endpos=pos | ||||
|         regs = self.code.match(string, pos, endpos, ANCHORED) | ||||
|         if regs is None: | ||||
|             return None | ||||
|         self._num_regs=len(regs) | ||||
|         return MatchObject(self, | ||||
|                            string, | ||||
|                            pos, endpos, | ||||
|                            regs) | ||||
|      | ||||
|     def sub(self, repl, string, count=0): | ||||
|         """sub(repl, string[, count=0]) -> string | ||||
|          | ||||
|         Return the string obtained by replacing the leftmost | ||||
|         non-overlapping occurrences of the compiled pattern in string | ||||
|         by the replacement repl. If the pattern isn't found, string is | ||||
|         returned unchanged. | ||||
| 
 | ||||
|         Identical to the sub() function, using the compiled pattern. | ||||
|          | ||||
|         """ | ||||
|         return self.subn(repl, string, count)[0] | ||||
|      | ||||
|     def subn(self, repl, source, count=0):  | ||||
|         """subn(repl, string[, count=0]) -> tuple | ||||
|          | ||||
|         Perform the same operation as sub(), but return a tuple | ||||
|         (new_string, number_of_subs_made). | ||||
| 
 | ||||
|         """ | ||||
|         if count < 0: | ||||
|             raise error, "negative substitution count" | ||||
|         if count == 0: | ||||
|             count = sys.maxint | ||||
|         n = 0           # Number of matches | ||||
|         pos = 0         # Where to start searching | ||||
|         lastmatch = -1  # End of last match | ||||
|         results = []    # Substrings making up the result | ||||
|         end = len(source) | ||||
| 
 | ||||
|         if type(repl) is type(''): | ||||
|             # See if repl contains group references | ||||
|             try: | ||||
|                 repl = pcre_expand(_Dummy, repl) | ||||
|             except: | ||||
|                 m = MatchObject(self, source, 0, end, []) | ||||
|                 repl = lambda m, repl=repl, expand=pcre_expand: expand(m, repl) | ||||
|             else: | ||||
|                 m = None | ||||
|         else: | ||||
|             m = MatchObject(self, source, 0, end, []) | ||||
| 
 | ||||
|         match = self.code.match | ||||
|         append = results.append | ||||
|         while n < count and pos <= end: | ||||
|             regs = match(source, pos, end, 0) | ||||
|             if not regs: | ||||
|                 break | ||||
|             self._num_regs = len(regs) | ||||
|             i, j = regs[0] | ||||
|             if i == j == lastmatch: | ||||
|                 # Empty match adjacent to previous match | ||||
|                 pos = pos + 1 | ||||
|                 append(source[lastmatch:pos]) | ||||
|                 continue | ||||
|             if pos < i: | ||||
|                 append(source[pos:i]) | ||||
|             if m: | ||||
|                 m.pos = pos | ||||
|                 m.regs = regs | ||||
|                 append(repl(m)) | ||||
|             else: | ||||
|                 append(repl) | ||||
|             pos = lastmatch = j | ||||
|             if i == j: | ||||
|                 # Last match was empty; don't try here again | ||||
|                 pos = pos + 1 | ||||
|                 append(source[lastmatch:pos]) | ||||
|             n = n + 1 | ||||
|         append(source[pos:]) | ||||
|         return (string.join(results, ''), n) | ||||
|                                                                              | ||||
|     def split(self, source, maxsplit=0): | ||||
|         """split(source[, maxsplit=0]) -> list of strings | ||||
|      | ||||
|         Split string by the occurrences of the compiled pattern. If | ||||
|         capturing parentheses are used in the pattern, then the text | ||||
|         of all groups in the pattern is also returned as part of the | ||||
|         resulting list. If maxsplit is nonzero, at most maxsplit | ||||
|         splits occur, and the remainder of the string is returned as | ||||
|         the final element of the list. | ||||
|          | ||||
|         """ | ||||
|         if maxsplit < 0: | ||||
|             raise error, "negative split count" | ||||
|         if maxsplit == 0: | ||||
|             maxsplit = sys.maxint | ||||
|         n = 0 | ||||
|         pos = 0 | ||||
|         lastmatch = 0 | ||||
|         results = [] | ||||
|         end = len(source) | ||||
|         match = self.code.match | ||||
|         append = results.append | ||||
|         while n < maxsplit: | ||||
|             regs = match(source, pos, end, 0) | ||||
|             if not regs: | ||||
|                 break | ||||
|             i, j = regs[0] | ||||
|             if i == j: | ||||
|                 # Empty match | ||||
|                 if pos >= end: | ||||
|                     break | ||||
|                 pos = pos+1 | ||||
|                 continue | ||||
|             append(source[lastmatch:i]) | ||||
|             rest = regs[1:] | ||||
|             if rest: | ||||
|                 for a, b in rest: | ||||
|                     if a == -1 or b == -1: | ||||
|                         group = None | ||||
|                     else: | ||||
|                         group = source[a:b] | ||||
|                     append(group) | ||||
|             pos = lastmatch = j | ||||
|             n = n + 1 | ||||
|         append(source[lastmatch:]) | ||||
|         return results | ||||
| 
 | ||||
|     def findall(self, source): | ||||
|         """findall(source) -> list | ||||
|      | ||||
|         Return a list of all non-overlapping matches of the compiled | ||||
|         pattern in string. If one or more groups are present in the | ||||
|         pattern, return a list of groups; this will be a list of | ||||
|         tuples if the pattern has more than one group. Empty matches | ||||
|         are included in the result. | ||||
| 
 | ||||
|         """ | ||||
|         pos = 0 | ||||
|         end = len(source) | ||||
|         results = [] | ||||
|         match = self.code.match | ||||
|         append = results.append | ||||
|         while pos <= end: | ||||
|             regs = match(source, pos, end, 0) | ||||
|             if not regs: | ||||
|                 break | ||||
|             i, j = regs[0] | ||||
|             rest = regs[1:] | ||||
|             if not rest: | ||||
|                 gr = source[i:j] | ||||
|             elif len(rest) == 1: | ||||
|                 a, b = rest[0] | ||||
|                 gr = source[a:b] | ||||
|             else: | ||||
|                 gr = [] | ||||
|                 for (a, b) in rest: | ||||
|                     gr.append(source[a:b]) | ||||
|                 gr = tuple(gr) | ||||
|             append(gr) | ||||
|             pos = max(j, pos+1) | ||||
|         return results | ||||
| 
 | ||||
|     # The following 3 functions were contributed by Mike Fletcher, and | ||||
|     # allow pickling and unpickling of RegexObject instances. | ||||
|     def __getinitargs__(self): | ||||
|         return (None,None,None,None) # any 4 elements, to work around | ||||
|                                      # problems with the | ||||
|                                      # pickle/cPickle modules not yet  | ||||
|                                      # ignoring the __init__ function | ||||
|     def __getstate__(self): | ||||
|         return self.pattern, self.flags, self.groupindex | ||||
|     def __setstate__(self, statetuple): | ||||
|         self.pattern = statetuple[0] | ||||
|         self.flags = statetuple[1] | ||||
|         self.groupindex = statetuple[2] | ||||
|         self.code = apply(pcre_compile, statetuple) | ||||
| 
 | ||||
| class _Dummy: | ||||
|     # Dummy class passed to pcre_expand() by RegexObject.subn().  Has 'group' to avoid core dump. | ||||
|     group = None | ||||
| 
 | ||||
| class MatchObject: | ||||
|     """Contains information about pattern matches. | ||||
| 
 | ||||
|     Methods: | ||||
|     start      Return the index of the start of a matched substring. | ||||
|     end        Return the index of the end of a matched substring. | ||||
|     span       Return a tuple of (start, end) of a matched substring. | ||||
|     groups     Return a tuple of all the subgroups of the match. | ||||
|     group      Return one or more subgroups of the match. | ||||
|     groupdict  Return a dictionary of all the named subgroups of the match. | ||||
| 
 | ||||
|     """ | ||||
| 
 | ||||
|     def __init__(self, re, string, pos, endpos, regs): | ||||
|         self.re = re | ||||
|         self.string = string | ||||
|         self.pos = pos  | ||||
|         self.endpos = endpos | ||||
|         self.regs = regs | ||||
|          | ||||
|     def start(self, g = 0): | ||||
|         """start([group=0]) -> int or None | ||||
|          | ||||
|         Return the index of the start of the substring matched by | ||||
|         group; group defaults to zero (meaning the whole matched | ||||
|         substring). Return -1 if group exists but did not contribute | ||||
|         to the match. | ||||
| 
 | ||||
|         """ | ||||
|         if type(g) == type(''): | ||||
|             try: | ||||
|                 g = self.re.groupindex[g] | ||||
|             except (KeyError, TypeError): | ||||
|                 raise IndexError, 'group %s is undefined' % `g` | ||||
|         return self.regs[g][0] | ||||
|      | ||||
|     def end(self, g = 0): | ||||
|         """end([group=0]) -> int or None | ||||
|          | ||||
|         Return the index of the end of the substring matched by | ||||
|         group; group defaults to zero (meaning the whole matched | ||||
|         substring). Return -1 if group exists but did not contribute | ||||
|         to the match. | ||||
| 
 | ||||
|         """ | ||||
|         if type(g) == type(''): | ||||
|             try: | ||||
|                 g = self.re.groupindex[g] | ||||
|             except (KeyError, TypeError): | ||||
|                 raise IndexError, 'group %s is undefined' % `g` | ||||
|         return self.regs[g][1] | ||||
|      | ||||
|     def span(self, g = 0): | ||||
|         """span([group=0]) -> tuple | ||||
|          | ||||
|         Return the 2-tuple (m.start(group), m.end(group)). Note that | ||||
|         if group did not contribute to the match, this is (-1, | ||||
|         -1). Group defaults to zero (meaning the whole matched | ||||
|         substring). | ||||
| 
 | ||||
|         """ | ||||
|         if type(g) == type(''): | ||||
|             try: | ||||
|                 g = self.re.groupindex[g] | ||||
|             except (KeyError, TypeError): | ||||
|                 raise IndexError, 'group %s is undefined' % `g` | ||||
|         return self.regs[g] | ||||
|      | ||||
|     def groups(self, default=None): | ||||
|         """groups([default=None]) -> tuple | ||||
|          | ||||
|         Return a tuple containing all the subgroups of the match, from | ||||
|         1 up to however many groups are in the pattern. The default | ||||
|         argument is used for groups that did not participate in the | ||||
|         match. | ||||
| 
 | ||||
|         """ | ||||
|         result = [] | ||||
|         for g in range(1, self.re._num_regs): | ||||
|             a, b = self.regs[g] | ||||
|             if a == -1 or b == -1: | ||||
|                 result.append(default) | ||||
|             else: | ||||
|                 result.append(self.string[a:b]) | ||||
|         return tuple(result) | ||||
| 
 | ||||
|     def group(self, *groups): | ||||
|         """group([group1, group2, ...]) -> string or tuple | ||||
|          | ||||
|         Return one or more subgroups of the match. If there is a | ||||
|         single argument, the result is a single string; if there are | ||||
|         multiple arguments, the result is a tuple with one item per | ||||
|         argument. Without arguments, group1 defaults to zero (i.e. the | ||||
|         whole match is returned). If a groupN argument is zero, the | ||||
|         corresponding return value is the entire matching string; if | ||||
|         it is in the inclusive range [1..99], it is the string | ||||
|         matching the corresponding parenthesized group. If a group | ||||
|         number is negative or larger than the number of groups defined | ||||
|         in the pattern, an IndexError exception is raised. If a group | ||||
|         is contained in a part of the pattern that did not match, the | ||||
|         corresponding result is None. If a group is contained in a | ||||
|         part of the pattern that matched multiple times, the last | ||||
|         match is returned. | ||||
| 
 | ||||
|         If the regular expression uses the (?P<name>...) syntax, the | ||||
|         groupN arguments may also be strings identifying groups by | ||||
|         their group name. If a string argument is not used as a group | ||||
|         name in the pattern, an IndexError exception is raised. | ||||
| 
 | ||||
|         """ | ||||
|         if len(groups) == 0: | ||||
|             groups = (0,) | ||||
|         result = [] | ||||
|         for g in groups: | ||||
|             if type(g) == type(''): | ||||
|                 try: | ||||
|                     g = self.re.groupindex[g] | ||||
|                 except (KeyError, TypeError): | ||||
|                     raise IndexError, 'group %s is undefined' % `g` | ||||
|             if g >= len(self.regs): | ||||
|                 raise IndexError, 'group %s is undefined' % `g` | ||||
|             a, b = self.regs[g] | ||||
|             if a == -1 or b == -1: | ||||
|                 result.append(None) | ||||
|             else: | ||||
|                 result.append(self.string[a:b]) | ||||
|         if len(result) > 1: | ||||
|             return tuple(result) | ||||
|         elif len(result) == 1: | ||||
|             return result[0] | ||||
|         else: | ||||
|             return () | ||||
| 
 | ||||
|     def groupdict(self, default=None): | ||||
|         """groupdict([default=None]) -> dictionary | ||||
|          | ||||
|         Return a dictionary containing all the named subgroups of the | ||||
|         match, keyed by the subgroup name. The default argument is | ||||
|         used for groups that did not participate in the match. | ||||
| 
 | ||||
|         """ | ||||
|         dict = {} | ||||
|         for name, index in self.re.groupindex.items(): | ||||
|             a, b = self.regs[index] | ||||
|             if a == -1 or b == -1: | ||||
|                 dict[name] = default | ||||
|             else: | ||||
|                 dict[name] = self.string[a:b] | ||||
|         return dict | ||||
| if engine == "sre": | ||||
|     # new 2.0 engine | ||||
|     from sre import * | ||||
| else: | ||||
|     # old 1.5.2 engine.  will be removed in 2.0 final. | ||||
|     from pre import * | ||||
|  |  | |||
|  | @ -150,8 +150,8 @@ def bump_num(matchobj): | |||
|     assert re.split("(?::*)", ":a:b::c") == ['', 'a', 'b', 'c'] | ||||
|     assert re.split("(:)*", ":a:b::c") == ['', ':', 'a', ':', 'b', ':', 'c'] | ||||
|     assert re.split("([b:]+)", ":a:b::c") == ['', ':', 'a', ':b::', 'c'] | ||||
|     assert re.split("(b)|(:+)", ":a:b::c") == \ | ||||
|            ['', None, ':', 'a', None, ':', '', 'b', None, '', None, '::', 'c'] | ||||
| ##    assert re.split("(b)|(:+)", ":a:b::c") == \ | ||||
| ##           ['', None, ':', 'a', None, ':', '', 'b', None, '', None, '::', 'c'] | ||||
|     assert re.split("(?:b)|(?::+)", ":a:b::c") == ['', 'a', '', '', 'c'] | ||||
| except AssertionError: | ||||
|     raise TestFailed, "re.split" | ||||
|  | @ -327,9 +327,9 @@ def bump_num(matchobj): | |||
|             # break (because it won't match at the end or start of a | ||||
|             # string), so we'll ignore patterns that feature it. | ||||
|              | ||||
|             if pattern[:2]!='\\B' and pattern[-2:]!='\\B': | ||||
|             if pattern[:2]!='\\B' and pattern[-2:]!='\\B' and result!=None: | ||||
|                 obj=re.compile(pattern) | ||||
|                 result=obj.search(s, pos=result.start(0), endpos=result.end(0)+1) | ||||
|                 result=obj.search(s, result.start(0), result.end(0)+1) | ||||
|                 if result==None: | ||||
|                     print '=== Failed on range-limited match', t | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Guido van Rossum
						Guido van Rossum