mirror of
				https://github.com/python/cpython.git
				synced 2025-11-03 23:21:29 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			192 lines
		
	
	
	
		
			5.2 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
			
		
		
	
	
			192 lines
		
	
	
	
		
			5.2 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
#! /usr/bin/env python

r"""Convert old ("regex") regular expressions to new syntax ("re").

When imported as a module, there are two functions, with their own
docstrings:

  convert(s, syntax=None) -- convert a regex regular expression to re syntax

  quote(s) -- return a quoted string literal

When used as a script, read a Python string literal (or any other
expression evaluating to a string) from stdin, and write the
translated expression to stdout as a string literal.  Unless stdout is
a tty, no trailing \n is written to stdout.  This is done so that it
can be used with Emacs C-U M-| (shell-command-on-region with argument
which filters the region through the shell command).

No attempt has been made at coding for performance.

Translation table...

    \(    (     (unless RE_NO_BK_PARENS set)
    \)    )     (unless RE_NO_BK_PARENS set)
    \|    |     (unless RE_NO_BK_VBAR set)
    \<    \b    (not quite the same, but alla...)
    \>    \b    (not quite the same, but alla...)
    \`    \A
    \'    \Z

Not translated...

    .
    ^
    $
    *
    +           (unless RE_BK_PLUS_QM set, then to \+)
    ?           (unless RE_BK_PLUS_QM set, then to \?)
    \
    \b
    \B
    \w
    \W
    \1 ... \9

Special cases...

    Non-printable characters are always replaced by their 3-digit
    escape code (except \t, \n, \r, which use mnemonic escapes)

    Newline is turned into | when RE_NEWLINE_OR is set

XXX To be done...

    [...]     (different treatment of backslashed items?)
    [^...]    (different treatment of backslashed items?)
    ^ $ * + ? (in some error contexts these are probably treated differently)
    \vDD  \DD (in the regex docs but only works when RE_ANSI_HEX set)

"""
 | 
						|
 | 
						|
 | 
						|
import warnings
 | 
						|
warnings.filterwarnings("ignore", ".* regex .*", DeprecationWarning, __name__,
 | 
						|
                        append=1)
 | 
						|
 | 
						|
import regex
 | 
						|
from regex_syntax import * # RE_*
 | 
						|
 | 
						|
# Public API: the names exported by a star-import of this module.
__all__ = ["convert","quote"]
 | 
						|
 | 
						|
# Default translation table: maps each old-style ("regex") token to its
# new-style ("re") spelling.  convert() works on a copy of this table and
# adjusts that copy according to the syntax flags in effect.
mastertable = {
    # Word and buffer boundary anchors.
    r'\<': r'\b',
    r'\>': r'\b',
    r'\`': r'\A',
    r'\'': r'\Z',
    # Backslashed grouping/alternation operators become bare operators...
    r'\(': '(',
    r'\)': ')',
    r'\|': '|',
    # ...while the bare characters were literals, so they gain a backslash.
    '(': r'\(',
    ')': r'\)',
    '|': r'\|',
    # Whitespace control characters are written as mnemonic escapes.
    '\t': r'\t',
    '\n': r'\n',
    '\r': r'\r',
}
 | 
						|
 | 
						|
 | 
						|
def convert(s, syntax=None):
    """Convert a regex regular expression to re syntax.

    The first argument is the regular expression, as a string object,
    just like it would be passed to regex.compile().  (I.e., pass the
    actual string object -- string quotes must already have been
    removed and the standard escape processing has already been done,
    e.g. by eval().)

    The optional second argument is the regex syntax variant to be
    used.  This is an integer mask as passed to regex.set_syntax();
    the flag bits are defined in regex_syntax.  When not specified, or
    when None is given, the current regex syntax mask (as retrieved by
    regex.get_syntax()) is used -- which is 0 by default.

    The return value is a regular expression, as a string object that
    could be passed to re.compile().  (I.e., no string quotes have
    been added -- use quote() below, or repr().)

    The conversion is not always guaranteed to be correct.  More
    syntactical analysis should be performed to detect borderline
    cases and decide what to do with them.  For example, 'x*?' is not
    translated correctly.

    A lone backslash at the very end of the input has nothing to
    escape; it is copied through unchanged instead of raising
    IndexError.

    """
    # Work on a private copy so the per-syntax tweaks never leak into
    # the shared module-level table.
    table = mastertable.copy()
    if syntax is None:
        syntax = regex.get_syntax()
    if syntax & RE_NO_BK_PARENS:
        # Parentheses already mean the same thing in both syntaxes.
        del table[r'\('], table[r'\)']
        del table['('], table[')']
    if syntax & RE_NO_BK_VBAR:
        # Likewise for the alternation bar.
        del table[r'\|']
        del table['|']
    if syntax & RE_BK_PLUS_QM:
        # Bare + and ? are literals here; the backslashed forms are the
        # operators, so swap them.
        table['+'] = r'\+'
        table['?'] = r'\?'
        table[r'\+'] = '+'
        table[r'\?'] = '?'
    if syntax & RE_NEWLINE_OR:
        table['\n'] = '|'

    # Collect translated tokens and join once at the end rather than
    # growing a string one piece at a time.
    pieces = []
    i = 0
    end = len(s)
    while i < end:
        key = s[i]
        i = i+1
        if key == '\\' and i < end:
            # A backslash and the character after it form one token.
            key = key + s[i]
            i = i+1
        # Tokens without a table entry are passed through as-is.
        pieces.append(table.get(key, key))
    return "".join(pieces)
 | 
						|
 | 
						|
 | 
						|
def quote(s, quote=None):
    """Convert a string object to a quoted string literal.

    This is similar to repr() but will return a "raw" string (r'...'
    or r"...") when the string contains backslashes, instead of
    doubling all backslashes.  The resulting string does *not* always
    evaluate to the same string as the original; however it will do
    just the right thing when passed into re.compile().

    The optional second argument forces the string quote; it must be
    a single character which is a valid Python string quote.  When it
    is omitted, single quotes are used unless the string contains a
    single quote and no double quote, in which case double quotes are
    used so that nothing needs escaping.

    """
    if quote is None:
        q = "'"
        altq = '"'
        # Switch to the alternate quote only when doing so avoids
        # having to escape anything.
        if q in s and altq not in s:
            q = altq
    else:
        assert quote in ('"', "'")
        q = quote
    pieces = [q]
    for c in s:
        if c == q:
            # Backslash-escape an embedded copy of the delimiter.
            c = '\\' + c
        elif c < ' ' or c > '~':
            # Anything outside printable ASCII becomes an octal escape.
            c = "\\%03o" % ord(c)
        pieces.append(c)
    pieces.append(q)
    res = "".join(pieces)
    # Any backslash (original or added above) forces a raw literal so
    # the backslashes are not doubled.
    if '\\' in res:
        res = 'r' + res
    return res
 | 
						|
 | 
						|
 | 
						|
def main():
    """Main program -- called when run as a script.

    Reads a Python expression from stdin, evaluates it, converts the
    result with convert(), and writes it to stdout as a string literal
    produced by quote().  A trailing newline is written only when
    stdout is a tty.
    """
    import sys
    # NOTE(security): eval() executes whatever expression arrives on
    # stdin.  That is the documented interface (any expression that
    # evaluates to a string is accepted), so it is kept as-is, but this
    # script must only ever be fed trusted input.
    s = eval(sys.stdin.read())
    sys.stdout.write(quote(convert(s)))
    # Omit the newline when piped so the output can replace an editor
    # region verbatim.
    if sys.stdout.isatty():
        sys.stdout.write("\n")
 | 
						|
 | 
						|
 | 
						|
# Run the stdin-to-stdout filter only when executed as a script, not
# when the file is imported as a module.
if __name__ == '__main__':
    main()
 |