mirror of
				https://github.com/python/cpython.git
				synced 2025-10-25 10:44:55 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			186 lines
		
	
	
	
		
			5.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			186 lines
		
	
	
	
		
			5.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #! /usr/bin/env python1.5
 | |
| 
 | |
| """Convert old ("regex") regular expressions to new syntax ("re").
 | |
| 
 | |
| When imported as a module, there are two functions, with their own
 | |
| docstrings:
 | |
| 
 | |
|   convert(s, syntax=None) -- convert a regex regular expression to re syntax
 | |
| 
 | |
|   quote(s) -- return a quoted string literal
 | |
| 
 | |
| When used as a script, read a Python string literal (or any other
 | |
| expression evaluating to a string) from stdin, and write the
 | |
| translated expression to stdout as a string literal.  Unless stdout is
 | |
| a tty, no trailing \n is written to stdout.  This is done so that it
 | |
| can be used with Emacs C-U M-| (shell-command-on-region with argument
 | |
| which filters the region through the shell command).
 | |
| 
 | |
| No attempt has been made at coding for performance.
 | |
| 
 | |
| Translation table...
 | |
| 
 | |
|     \(    (     (unless RE_NO_BK_PARENS set)
 | |
|     \)    )     (unless RE_NO_BK_PARENS set)
 | |
|     \|    |     (unless RE_NO_BK_VBAR set)
 | |
|     \<    \b    (not quite the same, but alla...)
 | |
|     \>    \b    (not quite the same, but alla...)
 | |
|     \`    \A
 | |
|     \'    \Z
 | |
| 
 | |
| Not translated...
 | |
| 
 | |
|     .
 | |
|     ^
 | |
|     $
 | |
|     *
 | |
|     +           (unless RE_BK_PLUS_QM set, then to \+)
 | |
|     ?           (unless RE_BK_PLUS_QM set, then to \?)
 | |
|     \
 | |
|     \b
 | |
|     \B
 | |
|     \w
 | |
|     \W
 | |
|     \1 ... \9
 | |
| 
 | |
| Special cases...
 | |
| 
 | |
|     Non-printable characters are always replaced by their 3-digit
 | |
|     escape code (except \t, \n, \r, which use mnemonic escapes)
 | |
| 
 | |
|     Newline is turned into | when RE_NEWLINE_OR is set
 | |
| 
 | |
| XXX To be done...
 | |
| 
 | |
|     [...]     (different treatment of backslashed items?)
 | |
|     [^...]    (different treatment of backslashed items?)
 | |
|     ^ $ * + ? (in some error contexts these are probably treated differently)
 | |
|     \vDD  \DD (in the regex docs but only works when RE_ANSI_HEX set)
 | |
| 
 | |
| """
 | |
| 
 | |
| 
 | |
| import regex
 | |
| from regex_syntax import * # RE_* 
 | |
| 
 | |
# Default translation table, used when no syntax flags are set.  Keys are
# tokens as they appear in a regex pattern; values are the text emitted
# for them in the re pattern.  convert() works on a copy and adjusts it
# according to the syntax flags.
mastertable = {
    # Backslashed anchors: \< and \> are approximated by \b (not quite
    # the same), \` and \' become \A and \Z.
    r"\<": r"\b",
    r"\>": r"\b",
    r"\`": r"\A",
    r"\'": r"\Z",
    # Backslashed grouping and alternation lose their backslash ...
    r"\(": "(",
    r"\)": ")",
    r"\|": "|",
    # ... and the bare characters gain one.
    "(": r"\(",
    ")": r"\)",
    "|": r"\|",
    # Literal tab, newline and carriage return characters are written
    # as their two-character mnemonic escapes.
    "\t": r"\t",
    "\n": r"\n",
    "\r": r"\r",
}
 | |
| 
 | |
| 
 | |
def convert(s, syntax=None):
    """Convert a regex regular expression to re syntax.

    Arguments:

    s -- the regular expression, as a string object, just like it
    would be passed to regex.compile().  (I.e., pass the actual string
    object -- string quotes must already have been removed and the
    standard escape processing has already been done, e.g. by eval().)

    syntax -- optional regex syntax variant to be used.  This is an
    integer mask as passed to regex.set_syntax(); the flag bits are
    defined in regex_syntax.  When not specified, or when None is
    given, the current regex syntax mask (as retrieved by
    regex.get_syntax()) is used -- which is 0 by default.

    The return value is a regular expression, as a string object that
    could be passed to re.compile().  (I.e., no string quotes have
    been added -- use quote() below, or repr().)

    A backslash that is the very last character of s has nothing to
    escape; it is copied to the result unchanged rather than raising
    IndexError.

    The conversion is not always guaranteed to be correct.  More
    syntactical analysis should be performed to detect borderline
    cases and decide what to do with them.  For example, 'x*?' is not
    translated correctly.

    """
    # Work on a copy so the flag handling below never alters mastertable.
    table = mastertable.copy()
    if syntax is None:
        syntax = regex.get_syntax()
    if syntax & RE_NO_BK_PARENS:
        # Parentheses need no translation in either direction.
        del table[r'\('], table[r'\)']
        del table['('], table[')']
    if syntax & RE_NO_BK_VBAR:
        # Likewise for the vertical bar.
        del table[r'\|']
        del table['|']
    if syntax & RE_BK_PLUS_QM:
        # Swap the bare and backslashed forms of + and ?.
        table['+'] = r'\+'
        table['?'] = r'\?'
        table[r'\+'] = '+'
        table[r'\?'] = '?'
    if syntax & RE_NEWLINE_OR:
        # Newline is turned into | (overrides the default '\n' entry).
        table['\n'] = '|'
    res = ""

    i = 0
    end = len(s)
    while i < end:
        c = s[i]
        i = i+1
        if c != '\\':
            # Ordinary character: translate it if the table says so.
            res = res + table.get(c, c)
        elif i < end:
            # Backslash plus the following character form one token.
            key = c + s[i]
            i = i+1
            res = res + table.get(key, key)
        else:
            # Trailing backslash with nothing after it: keep it as is.
            res = res + c
    return res
 | |
| 
 | |
| 
 | |
def quote(s, quote=None):
    """Convert a string object to a quoted string literal.

    This is similar to repr() but will return a "raw" string (r'...'
    or r"...") when the quoted result contains backslashes, instead of
    doubling all backslashes.  The resulting string does *not* always
    evaluate to the same string as the original; however it will do
    just the right thing when passed into re.compile().

    Arguments:

    s -- the string object to quote.

    quote -- optional; forces the string quote.  It must be a single
    character which is a valid Python string quote (' or ").  When not
    specified, or when None is given, a single quote is used, unless s
    contains single quotes but no double quotes, in which case a
    double quote is used so that nothing needs escaping.

    The return value is the string literal, as a string object.
    Occurrences of the chosen quote character are preceded by a
    backslash, and characters outside the range ' ' through '~' are
    replaced by a backslash followed by their 3-digit octal code.

    """
    if quote is None:
        q = "'"
        # The alternative must be the *other* quote character;
        # otherwise the test below can never succeed.
        altq = '"'
        if q in s and altq not in s:
            q = altq
    else:
        assert quote in ('"', "'")
        q = quote
    res = q
    for c in s:
        if c == q:
            c = '\\' + c
        elif c < ' ' or c > '~':
            c = "\\%03o" % ord(c)
        res = res + c
    res = res + q
    # Any backslash, whether from s or added above, makes this a raw
    # literal so that the backslashes reach re.compile() undoubled.
    if '\\' in res:
        res = 'r' + res
    return res
 | |
| 
 | |
| 
 | |
def main():
    """Main program -- called when run as a script.

    Reads all of stdin, evaluates it as a Python expression, converts
    the resulting string with convert() and writes it to stdout as a
    string literal produced by quote().  A trailing newline is written
    only when stdout is a tty.

    """
    import sys
    source = sys.stdin.read()
    # NOTE: eval() executes whatever expression is given on stdin, so
    # only feed this script input that you trust.
    pattern = eval(source)
    literal = quote(convert(pattern))
    out = sys.stdout
    out.write(literal)
    if out.isatty():
        out.write("\n")
 | |
| 
 | |
| 
 | |
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
 | 
