mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 21:51:50 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			197 lines
		
	
	
	
		
			7.9 KiB
		
	
	
	
		
			TeX
		
	
	
	
	
	
			
		
		
	
	
			197 lines
		
	
	
	
		
			7.9 KiB
		
	
	
	
		
			TeX
		
	
	
	
	
	
| \section{Built-in Module \sectcode{regex}}
 | |
| 
 | |
| \bimodindex{regex}
 | |
| This module provides regular expression matching operations similar to
 | |
| those found in Emacs.  It is always available.
 | |
| 
 | |
| By default the patterns are Emacs-style regular expressions; there is
 | |
| a way to change the syntax to match that of several well-known
 | |
| \UNIX{} utilities.
 | |
| 
 | |
| This module is 8-bit clean: both patterns and strings may contain null
 | |
| bytes and characters whose high bit is set.
 | |
| 
 | |
| \strong{Please note:} There is a little-known fact about Python string
 | |
| literals which means that you don't usually have to worry about
 | |
| doubling backslashes, even though they are used to escape special
 | |
| characters in string literals as well as in regular expressions.  This
 | |
| is because Python doesn't remove backslashes from string literals if
 | |
| they are followed by an unrecognized escape character.
 | |
| \emph{However}, if you want to include a literal \dfn{backslash} in a
 | |
| regular expression represented as a string literal, you have to
 | |
| \emph{quadruple} it.  E.g.\  to extract \LaTeX\ \samp{\e section\{{\rm
 | |
| \ldots}\}} headers from a document, you can use this pattern:
 | |
| \code{'\e \e \e\e section\{\e (.*\e )\}'}.
 | |
| 
 | |
| The module defines these functions, and an exception:
 | |
| 
 | |
| \renewcommand{\indexsubitem}{(in module regex)}
 | |
| 
 | |
| \begin{funcdesc}{match}{pattern\, string}
 | |
|   Return how many characters at the beginning of \var{string} match
 | |
|   the regular expression \var{pattern}.  Return \code{-1} if the
 | |
|   string does not match the pattern (this is different from a
 | |
|   zero-length match!).
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{search}{pattern\, string}
 | |
|   Return the first position in \var{string} that matches the regular
 | |
|   expression \var{pattern}.  Return \code{-1} if no position in the string
 | |
|   matches the pattern (this is different from a zero-length match
 | |
|   anywhere!).
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{compile}{pattern\optional{\, translate}}
 | |
|   Compile a regular expression pattern into a regular expression
 | |
|   object, which can be used for matching using its \code{match} and
 | |
|   \code{search} methods, described below.  The optional argument
 | |
|   \var{translate}, if present, must be a 256-character string
 | |
|   indicating how characters (both of the pattern and of the strings to
 | |
|   be matched) are translated before comparing them; the \code{i}-th
 | |
|   element of the string gives the translation for the character with
 | |
|   \ASCII{} code \code{i}.  This can be used to implement
 | |
|   case-insensitive matching; see the \code{casefold} data item below.
 | |
| 
 | |
|   The sequence
 | |
| 
 | |
| \bcode\begin{verbatim}
 | |
| prog = regex.compile(pat)
 | |
| result = prog.match(str)
 | |
| \end{verbatim}\ecode
 | |
| 
 | |
| is equivalent to
 | |
| 
 | |
| \bcode\begin{verbatim}
 | |
| result = regex.match(pat, str)
 | |
| \end{verbatim}\ecode
 | |
| 
 | |
| but the version using \code{compile()} is more efficient when multiple
 | |
| regular expressions are used concurrently in a single program.  (The
 | |
| compiled version of the last pattern passed to \code{regex.match()} or
 | |
| \code{regex.search()} is cached, so programs that use only a single
 | |
| regular expression at a time needn't worry about compiling regular
 | |
| expressions.)
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{set_syntax}{flags}
 | |
|   Set the syntax to be used by future calls to \code{compile},
 | |
|   \code{match} and \code{search}.  (Already compiled expression objects
 | |
|   are not affected.)  The argument is an integer which is the OR of
 | |
|   several flag bits.  The return value is the previous value of
 | |
|   the syntax flags.  Names for the flags are defined in the standard
 | |
|   module \code{regex_syntax}; read the file \file{regex_syntax.py} for
 | |
|   more information.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{symcomp}{pattern\optional{\, translate}}
 | |
| This is like \code{compile}, but supports symbolic group names: if a
 | |
| parenthesis-enclosed group begins with a group name in angular
 | |
| brackets, e.g.\ \code{'\e(<id>[a-z][a-z0-9]*\e)'}, the group can
 | |
| be referenced by its name in arguments to the \code{group} method of
 | |
| the resulting compiled regular expression object, like this:
 | |
| \code{p.group('id')}.  Group names may contain alphanumeric characters
 | |
| and \code{'_'} only.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{excdesc}{error}
 | |
|   Exception raised when a string passed to one of the functions here
 | |
|   is not a valid regular expression (e.g., unmatched parentheses) or
 | |
|   when some other error occurs during compilation or matching.  (It is
 | |
|   never an error if a string contains no match for a pattern.)
 | |
| \end{excdesc}
 | |
| 
 | |
| \begin{datadesc}{casefold}
 | |
| A string suitable to pass as the \var{translate} argument to
 | |
| \code{compile} to map all uppercase characters to their lowercase
 | |
| equivalents.
 | |
| \end{datadesc}
 | |
| 
 | |
| \noindent
 | |
| Compiled regular expression objects support these methods:
 | |
| 
 | |
| \renewcommand{\indexsubitem}{(regex method)}
 | |
| \begin{funcdesc}{match}{string\optional{\, pos}}
 | |
|   Return how many characters at the beginning of \var{string} match
 | |
|   the compiled regular expression.  Return \code{-1} if the string
 | |
|   does not match the pattern (this is different from a zero-length
 | |
|   match!).
 | |
|   
 | |
|   The optional second parameter \var{pos} gives an index in the string
 | |
|   where the search is to start; it defaults to \code{0}.  This is not
 | |
|   completely equivalent to slicing the string; the \code{'\^{}'} pattern
 | |
|   character matches at the real beginning of the string and at positions
 | |
|   just after a newline, not necessarily at the index where the search
 | |
|   is to start.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{search}{string\optional{\, pos}}
 | |
|   Return the first position in \var{string} that matches the regular
 | |
|   expression.  Return \code{-1} if no position in the
 | |
|   string matches the pattern (this is different from a zero-length
 | |
|   match anywhere!).
 | |
|   
 | |
|   The optional second parameter has the same meaning as for the
 | |
|   \code{match} method.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \begin{funcdesc}{group}{index\, index\, \ldots}
 | |
| This method is only valid when the last call to the \code{match}
 | |
| or \code{search} method found a match.  It returns one or more
 | |
| groups of the match.  If there is a single \var{index} argument,
 | |
| the result is a single string; if there are multiple arguments, the
 | |
| result is a tuple with one item per argument.  If the \var{index} is
 | |
| zero, the corresponding return value is the entire matching string; if
 | |
| it is in the inclusive range [1..99], it is the string matching the
 | |
| corresponding parenthesized group (using the default syntax,
 | |
| groups are parenthesized using \code{\e(} and \code{\e)}).  If no
 | |
| such group exists, the corresponding result is \code{None}.
 | |
| 
 | |
| If the regular expression was compiled by \code{symcomp} instead of
 | |
| \code{compile}, the \var{index} arguments may also be strings
 | |
| identifying groups by their group name.
 | |
| \end{funcdesc}
 | |
| 
 | |
| \noindent
 | |
| Compiled regular expressions support these data attributes:
 | |
| 
 | |
| \renewcommand{\indexsubitem}{(regex attribute)}
 | |
| 
 | |
| \begin{datadesc}{regs}
 | |
| When the last call to the \code{match} or \code{search} method found a
 | |
| match, this is a tuple of pairs of indices corresponding to the
 | |
| beginning and end of all parenthesized groups in the pattern.  Indices
 | |
| are relative to the string argument passed to \code{match} or
 | |
| \code{search}.  The 0-th tuple gives the beginning and end of the
 | |
| whole pattern.  When the last match or search failed, this is
 | |
| \code{None}.
 | |
| \end{datadesc}
 | |
| 
 | |
| \begin{datadesc}{last}
 | |
| When the last call to the \code{match} or \code{search} method found a
 | |
| match, this is the string argument passed to that method.  When the
 | |
| last match or search failed, this is \code{None}.
 | |
| \end{datadesc}
 | |
| 
 | |
| \begin{datadesc}{translate}
 | |
| This is the value of the \var{translate} argument to
 | |
| \code{regex.compile} that created this regular expression object.  If
 | |
| the \var{translate} argument was omitted in the \code{regex.compile}
 | |
| call, this is \code{None}.
 | |
| \end{datadesc}
 | |
| 
 | |
| \begin{datadesc}{givenpat}
 | |
| The regular expression pattern as passed to \code{compile} or
 | |
| \code{symcomp}.
 | |
| \end{datadesc}
 | |
| 
 | |
| \begin{datadesc}{realpat}
 | |
| The regular expression after stripping the group names for regular
 | |
| expressions compiled with \code{symcomp}.  Same as \code{givenpat}
 | |
| otherwise.
 | |
| \end{datadesc}
 | |
| 
 | |
| \begin{datadesc}{groupindex}
 | |
| A dictionary giving the mapping from symbolic group names to numerical
 | |
| group indices for regular expressions compiled with \code{symcomp}.
 | |
| \code{None} otherwise.
 | |
| \end{datadesc}
 | 
