| 
									
										
										
										
											1997-10-23 22:43:50 +00:00
										 |  |  | #! /usr/bin/env python1.5 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-05-30 13:25:35 +00:00
										 |  |  | r"""Convert old ("regex") regular expressions to new syntax ("re").
 | 
					
						
							| 
									
										
										
										
											1997-10-23 22:43:50 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | When imported as a module, there are two functions, with their own | 
					
						
							|  |  |  | strings: | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   convert(s, syntax=None) -- convert a regex regular expression to re syntax | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   quote(s) -- return a quoted string literal | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | When used as a script, read a Python string literal (or any other | 
					
						
							|  |  |  | expression evaluating to a string) from stdin, and write the | 
					
						
							|  |  |  | translated expression to stdout as a string literal.  Unless stdout is | 
					
						
							|  |  |  | a tty, no trailing \n is written to stdout.  This is done so that it | 
					
						
							|  |  |  | can be used with Emacs C-U M-| (shell-command-on-region with argument | 
					
						
							|  |  |  | which filters the region through the shell command). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | No attempt has been made at coding for performance. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Translation table... | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     \(    (     (unless RE_NO_BK_PARENS set) | 
					
						
							|  |  |  |     \)    )     (unless RE_NO_BK_PARENS set) | 
					
						
							|  |  |  |     \|    |     (unless RE_NO_BK_VBAR set) | 
					
						
							|  |  |  |     \<    \b    (not quite the same, but alla...) | 
					
						
							|  |  |  |     \>    \b    (not quite the same, but alla...) | 
					
						
							|  |  |  |     \`    \A | 
					
						
							|  |  |  |     \'    \Z | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Not translated... | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     . | 
					
						
							|  |  |  |     ^ | 
					
						
							|  |  |  |     $ | 
					
						
							|  |  |  |     * | 
					
						
							|  |  |  |     +           (unless RE_BK_PLUS_QM set, then to \+) | 
					
						
							|  |  |  |     ?           (unless RE_BK_PLUS_QM set, then to \?) | 
					
						
							|  |  |  |     \ | 
					
						
							|  |  |  |     \b | 
					
						
							|  |  |  |     \B | 
					
						
							|  |  |  |     \w | 
					
						
							|  |  |  |     \W | 
					
						
							|  |  |  |     \1 ... \9 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Special cases... | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Non-printable characters are always replaced by their 3-digit | 
					
						
							|  |  |  |     escape code (except \t, \n, \r, which use mnemonic escapes) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Newline is turned into | when RE_NEWLINE_OR is set | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | XXX To be done... | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     [...]     (different treatment of backslashed items?) | 
					
						
							|  |  |  |     [^...]    (different treatment of backslashed items?) | 
					
						
							|  |  |  |     ^ $ * + ? (in some error contexts these are probably treated differently) | 
					
						
							|  |  |  |     \vDD  \DD (in the regex docs but only works when RE_ANSI_HEX set) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import regex | 
					
						
							|  |  |  | from regex_syntax import * # RE_*  | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Default translation table | 
					
						
							|  |  |  | mastertable = { | 
					
						
							|  |  |  |     r'\<': r'\b', | 
					
						
							|  |  |  |     r'\>': r'\b', | 
					
						
							|  |  |  |     r'\`': r'\A', | 
					
						
							|  |  |  |     r'\'': r'\Z', | 
					
						
							|  |  |  |     r'\(': '(', | 
					
						
							|  |  |  |     r'\)': ')', | 
					
						
							|  |  |  |     r'\|': '|', | 
					
						
							|  |  |  |     '(': r'\(', | 
					
						
							|  |  |  |     ')': r'\)', | 
					
						
							|  |  |  |     '|': r'\|', | 
					
						
							|  |  |  |     '\t': r'\t', | 
					
						
							|  |  |  |     '\n': r'\n', | 
					
						
							|  |  |  |     '\r': r'\r', | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def convert(s, syntax=None): | 
					
						
							|  |  |  |     """Convert a regex regular expression to re syntax.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     The first argument is the regular expression, as a string object, | 
					
						
							|  |  |  |     just like it would be passed to regex.compile().  (I.e., pass the | 
					
						
							|  |  |  |     actual string object -- string quotes must already have been | 
					
						
							|  |  |  |     removed and the standard escape processing has already been done, | 
					
						
							|  |  |  |     e.g. by eval().) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     The optional second argument is the regex syntax variant to be | 
					
						
							|  |  |  |     used.  This is an integer mask as passed to regex.set_syntax(); | 
					
						
							|  |  |  |     the flag bits are defined in regex_syntax.  When not specified, or | 
					
						
							|  |  |  |     when None is given, the current regex syntax mask (as retrieved by | 
					
						
							|  |  |  |     regex.get_syntax()) is used -- which is 0 by default. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     The return value is a regular expression, as a string object that | 
					
						
							|  |  |  |     could be passed to re.compile().  (I.e., no string quotes have | 
					
						
							|  |  |  |     been added -- use quote() below, or repr().) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     The conversion is not always guaranteed to be correct.  More | 
					
						
							|  |  |  |     syntactical analysis should be performed to detect borderline | 
					
						
							|  |  |  |     cases and decide what to do with them.  For example, 'x*?' is not | 
					
						
							|  |  |  |     translated correctly. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     table = mastertable.copy() | 
					
						
							|  |  |  |     if syntax is None: | 
					
						
							| 
									
										
										
										
											1998-03-26 21:13:24 +00:00
										 |  |  |         syntax = regex.get_syntax() | 
					
						
							| 
									
										
										
										
											1997-10-23 22:43:50 +00:00
										 |  |  |     if syntax & RE_NO_BK_PARENS: | 
					
						
							| 
									
										
										
										
											1998-03-26 21:13:24 +00:00
										 |  |  |         del table[r'\('], table[r'\)'] | 
					
						
							|  |  |  |         del table['('], table[')'] | 
					
						
							| 
									
										
										
										
											1997-10-23 22:43:50 +00:00
										 |  |  |     if syntax & RE_NO_BK_VBAR: | 
					
						
							| 
									
										
										
										
											1998-03-26 21:13:24 +00:00
										 |  |  |         del table[r'\|'] | 
					
						
							|  |  |  |         del table['|'] | 
					
						
							| 
									
										
										
										
											1997-10-23 22:43:50 +00:00
										 |  |  |     if syntax & RE_BK_PLUS_QM: | 
					
						
							| 
									
										
										
										
											1998-03-26 21:13:24 +00:00
										 |  |  |         table['+'] = r'\+' | 
					
						
							|  |  |  |         table['?'] = r'\?' | 
					
						
							|  |  |  |         table[r'\+'] = '+' | 
					
						
							|  |  |  |         table[r'\?'] = '?' | 
					
						
							| 
									
										
										
										
											1997-10-23 22:43:50 +00:00
										 |  |  |     if syntax & RE_NEWLINE_OR: | 
					
						
							| 
									
										
										
										
											1998-03-26 21:13:24 +00:00
										 |  |  |         table['\n'] = '|' | 
					
						
							| 
									
										
										
										
											1997-10-23 22:43:50 +00:00
										 |  |  |     res = "" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     i = 0 | 
					
						
							|  |  |  |     end = len(s) | 
					
						
							|  |  |  |     while i < end: | 
					
						
							| 
									
										
										
										
											1998-03-26 21:13:24 +00:00
										 |  |  |         c = s[i] | 
					
						
							|  |  |  |         i = i+1 | 
					
						
							|  |  |  |         if c == '\\': | 
					
						
							|  |  |  |             c = s[i] | 
					
						
							|  |  |  |             i = i+1 | 
					
						
							|  |  |  |             key = '\\' + c | 
					
						
							|  |  |  |             key = table.get(key, key) | 
					
						
							|  |  |  |             res = res + key | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             c = table.get(c, c) | 
					
						
							|  |  |  |             res = res + c | 
					
						
							| 
									
										
										
										
											1997-10-23 22:43:50 +00:00
										 |  |  |     return res | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def quote(s, quote=None): | 
					
						
							|  |  |  |     """Convert a string object to a quoted string literal.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     This is similar to repr() but will return a "raw" string (r'...' | 
					
						
							|  |  |  |     or r"...") when the string contains backslashes, instead of | 
					
						
							|  |  |  |     doubling all backslashes.  The resulting string does *not* always | 
					
						
							|  |  |  |     evaluate to the same string as the original; however it will do | 
					
						
							|  |  |  |     just the right thing when passed into re.compile(). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     The optional second argument forces the string quote; it must be | 
					
						
							|  |  |  |     a single character which is a valid Python string quote. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if quote is None: | 
					
						
							| 
									
										
										
										
											1998-03-26 21:13:24 +00:00
										 |  |  |         q = "'" | 
					
						
							|  |  |  |         altq = "'" | 
					
						
							|  |  |  |         if q in s and altq not in s: | 
					
						
							|  |  |  |             q = altq | 
					
						
							| 
									
										
										
										
											1997-10-23 22:43:50 +00:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											1998-03-26 21:13:24 +00:00
										 |  |  |         assert quote in ('"', "'") | 
					
						
							|  |  |  |         q = quote | 
					
						
							| 
									
										
										
										
											1997-10-23 22:43:50 +00:00
										 |  |  |     res = q | 
					
						
							|  |  |  |     for c in s: | 
					
						
							| 
									
										
										
										
											1998-03-26 21:13:24 +00:00
										 |  |  |         if c == q: c = '\\' + c | 
					
						
							|  |  |  |         elif c < ' ' or c > '~': c = "\\%03o" % ord(c) | 
					
						
							|  |  |  |         res = res + c | 
					
						
							| 
									
										
										
										
											1997-10-23 22:43:50 +00:00
										 |  |  |     res = res + q | 
					
						
							|  |  |  |     if '\\' in res: | 
					
						
							| 
									
										
										
										
											1998-03-26 21:13:24 +00:00
										 |  |  |         res = 'r' + res | 
					
						
							| 
									
										
										
										
											1997-10-23 22:43:50 +00:00
										 |  |  |     return res | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def main(): | 
					
						
							|  |  |  |     """Main program -- called when run as a script.""" | 
					
						
							|  |  |  |     import sys | 
					
						
							|  |  |  |     s = eval(sys.stdin.read()) | 
					
						
							|  |  |  |     sys.stdout.write(quote(convert(s))) | 
					
						
							|  |  |  |     if sys.stdout.isatty(): | 
					
						
							| 
									
										
										
										
											1998-03-26 21:13:24 +00:00
										 |  |  |         sys.stdout.write("\n") | 
					
						
							| 
									
										
										
										
											1997-10-23 22:43:50 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
# Run the stdin-to-stdout filter only when executed as a script; importing
# the module just defines convert() and quote().
if __name__ == '__main__':
    main()