mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 05:31:20 +00:00 
			
		
		
		
	Issue #3665: \u and \U escapes are now supported in unicode regular expressions.
Patch by Serhiy Storchaka.
This commit is contained in:
		
							parent
							
								
									c9aa8425c4
								
							
						
					
					
						commit
						463badf06c
					
				
					 4 changed files with 144 additions and 34 deletions
				
			
		|  | @ -414,17 +414,24 @@ Most of the standard escapes supported by Python string literals are also | ||||||
| accepted by the regular expression parser:: | accepted by the regular expression parser:: | ||||||
| 
 | 
 | ||||||
|    \a      \b      \f      \n |    \a      \b      \f      \n | ||||||
|    \r      \t      \v      \x |    \r      \t      \u      \U | ||||||
|    \\ |    \v      \x      \\ | ||||||
| 
 | 
 | ||||||
| (Note that ``\b`` is used to represent word boundaries, and means "backspace" | (Note that ``\b`` is used to represent word boundaries, and means "backspace" | ||||||
| only inside character classes.) | only inside character classes.) | ||||||
| 
 | 
 | ||||||
|  | ``'\u'`` and ``'\U'`` escape sequences are only recognized in Unicode | ||||||
|  | patterns.  In bytes patterns they are not treated specially. | ||||||
|  | 
 | ||||||
| Octal escapes are included in a limited form.  If the first digit is a 0, or if | Octal escapes are included in a limited form.  If the first digit is a 0, or if | ||||||
| there are three octal digits, it is considered an octal escape. Otherwise, it is | there are three octal digits, it is considered an octal escape. Otherwise, it is | ||||||
| a group reference.  As for string literals, octal escapes are always at most | a group reference.  As for string literals, octal escapes are always at most | ||||||
| three digits in length. | three digits in length. | ||||||
| 
 | 
 | ||||||
|  | .. versionchanged:: 3.3 | ||||||
|  |    The ``'\u'`` and ``'\U'`` escape sequences have been added. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| .. _contents-of-module-re: | .. _contents-of-module-re: | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -177,6 +177,7 @@ def getwidth(self): | ||||||
| 
 | 
 | ||||||
| class Tokenizer: | class Tokenizer: | ||||||
|     def __init__(self, string): |     def __init__(self, string): | ||||||
|  |         self.istext = isinstance(string, str) | ||||||
|         self.string = string |         self.string = string | ||||||
|         self.index = 0 |         self.index = 0 | ||||||
|         self.__next() |         self.__next() | ||||||
|  | @ -187,14 +188,14 @@ def __next(self): | ||||||
|         char = self.string[self.index:self.index+1] |         char = self.string[self.index:self.index+1] | ||||||
|         # Special case for the str8, since indexing returns an integer |         # Special case for the str8, since indexing returns an integer | ||||||
|         # XXX This is only needed for test_bug_926075 in test_re.py |         # XXX This is only needed for test_bug_926075 in test_re.py | ||||||
|         if char and isinstance(char, bytes): |         if char and not self.istext: | ||||||
|             char = chr(char[0]) |             char = chr(char[0]) | ||||||
|         if char == "\\": |         if char == "\\": | ||||||
|             try: |             try: | ||||||
|                 c = self.string[self.index + 1] |                 c = self.string[self.index + 1] | ||||||
|             except IndexError: |             except IndexError: | ||||||
|                 raise error("bogus escape (end of line)") |                 raise error("bogus escape (end of line)") | ||||||
|             if isinstance(self.string, bytes): |             if not self.istext: | ||||||
|                 c = chr(c) |                 c = chr(c) | ||||||
|             char = char + c |             char = char + c | ||||||
|         self.index = self.index + len(char) |         self.index = self.index + len(char) | ||||||
|  | @ -209,6 +210,15 @@ def get(self): | ||||||
|         this = self.next |         this = self.next | ||||||
|         self.__next() |         self.__next() | ||||||
|         return this |         return this | ||||||
|  |     def getwhile(self, n, charset): | ||||||
|  |         result = '' | ||||||
|  |         for _ in range(n): | ||||||
|  |             c = self.next | ||||||
|  |             if c not in charset: | ||||||
|  |                 break | ||||||
|  |             result += c | ||||||
|  |             self.__next() | ||||||
|  |         return result | ||||||
|     def tell(self): |     def tell(self): | ||||||
|         return self.index, self.next |         return self.index, self.next | ||||||
|     def seek(self, index): |     def seek(self, index): | ||||||
|  | @ -241,20 +251,30 @@ def _class_escape(source, escape): | ||||||
|         c = escape[1:2] |         c = escape[1:2] | ||||||
|         if c == "x": |         if c == "x": | ||||||
|             # hexadecimal escape (exactly two digits) |             # hexadecimal escape (exactly two digits) | ||||||
|             while source.next in HEXDIGITS and len(escape) < 4: |             escape += source.getwhile(2, HEXDIGITS) | ||||||
|                 escape = escape + source.get() |             if len(escape) != 4: | ||||||
|             escape = escape[2:] |                 raise ValueError | ||||||
|             if len(escape) != 2: |             return LITERAL, int(escape[2:], 16) & 0xff | ||||||
|                 raise error("bogus escape: %s" % repr("\\" + escape)) |         elif c == "u" and source.istext: | ||||||
|             return LITERAL, int(escape, 16) & 0xff |             # unicode escape (exactly four digits) | ||||||
|  |             escape += source.getwhile(4, HEXDIGITS) | ||||||
|  |             if len(escape) != 6: | ||||||
|  |                 raise ValueError | ||||||
|  |             return LITERAL, int(escape[2:], 16) | ||||||
|  |         elif c == "U" and source.istext: | ||||||
|  |             # unicode escape (exactly eight digits) | ||||||
|  |             escape += source.getwhile(8, HEXDIGITS) | ||||||
|  |             if len(escape) != 10: | ||||||
|  |                 raise ValueError | ||||||
|  |             c = int(escape[2:], 16) | ||||||
|  |             chr(c) # raise ValueError for invalid code | ||||||
|  |             return LITERAL, c | ||||||
|         elif c in OCTDIGITS: |         elif c in OCTDIGITS: | ||||||
|             # octal escape (up to three digits) |             # octal escape (up to three digits) | ||||||
|             while source.next in OCTDIGITS and len(escape) < 4: |             escape += source.getwhile(2, OCTDIGITS) | ||||||
|                 escape = escape + source.get() |             return LITERAL, int(escape[1:], 8) & 0xff | ||||||
|             escape = escape[1:] |  | ||||||
|             return LITERAL, int(escape, 8) & 0xff |  | ||||||
|         elif c in DIGITS: |         elif c in DIGITS: | ||||||
|             raise error("bogus escape: %s" % repr(escape)) |             raise ValueError | ||||||
|         if len(escape) == 2: |         if len(escape) == 2: | ||||||
|             return LITERAL, ord(escape[1]) |             return LITERAL, ord(escape[1]) | ||||||
|     except ValueError: |     except ValueError: | ||||||
|  | @ -273,15 +293,27 @@ def _escape(source, escape, state): | ||||||
|         c = escape[1:2] |         c = escape[1:2] | ||||||
|         if c == "x": |         if c == "x": | ||||||
|             # hexadecimal escape |             # hexadecimal escape | ||||||
|             while source.next in HEXDIGITS and len(escape) < 4: |             escape += source.getwhile(2, HEXDIGITS) | ||||||
|                 escape = escape + source.get() |  | ||||||
|             if len(escape) != 4: |             if len(escape) != 4: | ||||||
|                 raise ValueError |                 raise ValueError | ||||||
|             return LITERAL, int(escape[2:], 16) & 0xff |             return LITERAL, int(escape[2:], 16) & 0xff | ||||||
|  |         elif c == "u" and source.istext: | ||||||
|  |             # unicode escape (exactly four digits) | ||||||
|  |             escape += source.getwhile(4, HEXDIGITS) | ||||||
|  |             if len(escape) != 6: | ||||||
|  |                 raise ValueError | ||||||
|  |             return LITERAL, int(escape[2:], 16) | ||||||
|  |         elif c == "U" and source.istext: | ||||||
|  |             # unicode escape (exactly eight digits) | ||||||
|  |             escape += source.getwhile(8, HEXDIGITS) | ||||||
|  |             if len(escape) != 10: | ||||||
|  |                 raise ValueError | ||||||
|  |             c = int(escape[2:], 16) | ||||||
|  |             chr(c) # raise ValueError for invalid code | ||||||
|  |             return LITERAL, c | ||||||
|         elif c == "0": |         elif c == "0": | ||||||
|             # octal escape |             # octal escape | ||||||
|             while source.next in OCTDIGITS and len(escape) < 4: |             escape += source.getwhile(2, OCTDIGITS) | ||||||
|                 escape = escape + source.get() |  | ||||||
|             return LITERAL, int(escape[1:], 8) & 0xff |             return LITERAL, int(escape[1:], 8) & 0xff | ||||||
|         elif c in DIGITS: |         elif c in DIGITS: | ||||||
|             # octal escape *or* decimal group reference (sigh) |             # octal escape *or* decimal group reference (sigh) | ||||||
|  |  | ||||||
|  | @ -526,24 +526,92 @@ def test_flags(self): | ||||||
|             self.assertNotEqual(re.compile('^pattern$', flag), None) |             self.assertNotEqual(re.compile('^pattern$', flag), None) | ||||||
| 
 | 
 | ||||||
|     def test_sre_character_literals(self): |     def test_sre_character_literals(self): | ||||||
|         for i in [0, 8, 16, 32, 64, 127, 128, 255]: |         for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]: | ||||||
|             self.assertNotEqual(re.match(r"\%03o" % i, chr(i)), None) |             if i < 256: | ||||||
|             self.assertNotEqual(re.match(r"\%03o0" % i, chr(i)+"0"), None) |                 self.assertIsNotNone(re.match(r"\%03o" % i, chr(i))) | ||||||
|             self.assertNotEqual(re.match(r"\%03o8" % i, chr(i)+"8"), None) |                 self.assertIsNotNone(re.match(r"\%03o0" % i, chr(i)+"0")) | ||||||
|             self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None) |                 self.assertIsNotNone(re.match(r"\%03o8" % i, chr(i)+"8")) | ||||||
|             self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None) |                 self.assertIsNotNone(re.match(r"\x%02x" % i, chr(i))) | ||||||
|             self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None) |                 self.assertIsNotNone(re.match(r"\x%02x0" % i, chr(i)+"0")) | ||||||
|         self.assertRaises(re.error, re.match, "\911", "") |                 self.assertIsNotNone(re.match(r"\x%02xz" % i, chr(i)+"z")) | ||||||
|  |             if i < 0x10000: | ||||||
|  |                 self.assertIsNotNone(re.match(r"\u%04x" % i, chr(i))) | ||||||
|  |                 self.assertIsNotNone(re.match(r"\u%04x0" % i, chr(i)+"0")) | ||||||
|  |                 self.assertIsNotNone(re.match(r"\u%04xz" % i, chr(i)+"z")) | ||||||
|  |             self.assertIsNotNone(re.match(r"\U%08x" % i, chr(i))) | ||||||
|  |             self.assertIsNotNone(re.match(r"\U%08x0" % i, chr(i)+"0")) | ||||||
|  |             self.assertIsNotNone(re.match(r"\U%08xz" % i, chr(i)+"z")) | ||||||
|  |         self.assertIsNotNone(re.match(r"\0", "\000")) | ||||||
|  |         self.assertIsNotNone(re.match(r"\08", "\0008")) | ||||||
|  |         self.assertIsNotNone(re.match(r"\01", "\001")) | ||||||
|  |         self.assertIsNotNone(re.match(r"\018", "\0018")) | ||||||
|  |         self.assertIsNotNone(re.match(r"\567", chr(0o167))) | ||||||
|  |         self.assertRaises(re.error, re.match, r"\911", "") | ||||||
|  |         self.assertRaises(re.error, re.match, r"\x1", "") | ||||||
|  |         self.assertRaises(re.error, re.match, r"\x1z", "") | ||||||
|  |         self.assertRaises(re.error, re.match, r"\u123", "") | ||||||
|  |         self.assertRaises(re.error, re.match, r"\u123z", "") | ||||||
|  |         self.assertRaises(re.error, re.match, r"\U0001234", "") | ||||||
|  |         self.assertRaises(re.error, re.match, r"\U0001234z", "") | ||||||
|  |         self.assertRaises(re.error, re.match, r"\U00110000", "") | ||||||
| 
 | 
 | ||||||
|     def test_sre_character_class_literals(self): |     def test_sre_character_class_literals(self): | ||||||
|  |         for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]: | ||||||
|  |             if i < 256: | ||||||
|  |                 self.assertIsNotNone(re.match(r"[\%o]" % i, chr(i))) | ||||||
|  |                 self.assertIsNotNone(re.match(r"[\%o8]" % i, chr(i))) | ||||||
|  |                 self.assertIsNotNone(re.match(r"[\%03o]" % i, chr(i))) | ||||||
|  |                 self.assertIsNotNone(re.match(r"[\%03o0]" % i, chr(i))) | ||||||
|  |                 self.assertIsNotNone(re.match(r"[\%03o8]" % i, chr(i))) | ||||||
|  |                 self.assertIsNotNone(re.match(r"[\x%02x]" % i, chr(i))) | ||||||
|  |                 self.assertIsNotNone(re.match(r"[\x%02x0]" % i, chr(i))) | ||||||
|  |                 self.assertIsNotNone(re.match(r"[\x%02xz]" % i, chr(i))) | ||||||
|  |             if i < 0x10000: | ||||||
|  |                 self.assertIsNotNone(re.match(r"[\u%04x]" % i, chr(i))) | ||||||
|  |                 self.assertIsNotNone(re.match(r"[\u%04x0]" % i, chr(i))) | ||||||
|  |                 self.assertIsNotNone(re.match(r"[\u%04xz]" % i, chr(i))) | ||||||
|  |             self.assertIsNotNone(re.match(r"[\U%08x]" % i, chr(i))) | ||||||
|  |             self.assertIsNotNone(re.match(r"[\U%08x0]" % i, chr(i)+"0")) | ||||||
|  |             self.assertIsNotNone(re.match(r"[\U%08xz]" % i, chr(i)+"z")) | ||||||
|  |         self.assertRaises(re.error, re.match, r"[\911]", "") | ||||||
|  |         self.assertRaises(re.error, re.match, r"[\x1z]", "") | ||||||
|  |         self.assertRaises(re.error, re.match, r"[\u123z]", "") | ||||||
|  |         self.assertRaises(re.error, re.match, r"[\U0001234z]", "") | ||||||
|  |         self.assertRaises(re.error, re.match, r"[\U00110000]", "") | ||||||
|  | 
 | ||||||
|  |     def test_sre_byte_literals(self): | ||||||
|         for i in [0, 8, 16, 32, 64, 127, 128, 255]: |         for i in [0, 8, 16, 32, 64, 127, 128, 255]: | ||||||
|             self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None) |             self.assertIsNotNone(re.match((r"\%03o" % i).encode(), bytes([i]))) | ||||||
|             self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None) |             self.assertIsNotNone(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0")) | ||||||
|             self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None) |             self.assertIsNotNone(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8")) | ||||||
|             self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None) |             self.assertIsNotNone(re.match((r"\x%02x" % i).encode(), bytes([i]))) | ||||||
|             self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None) |             self.assertIsNotNone(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0")) | ||||||
|             self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None) |             self.assertIsNotNone(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z")) | ||||||
|         self.assertRaises(re.error, re.match, "[\911]", "") |         self.assertIsNotNone(re.match(br"\u", b'u')) | ||||||
|  |         self.assertIsNotNone(re.match(br"\U", b'U')) | ||||||
|  |         self.assertIsNotNone(re.match(br"\0", b"\000")) | ||||||
|  |         self.assertIsNotNone(re.match(br"\08", b"\0008")) | ||||||
|  |         self.assertIsNotNone(re.match(br"\01", b"\001")) | ||||||
|  |         self.assertIsNotNone(re.match(br"\018", b"\0018")) | ||||||
|  |         self.assertIsNotNone(re.match(br"\567", bytes([0o167]))) | ||||||
|  |         self.assertRaises(re.error, re.match, br"\911", b"") | ||||||
|  |         self.assertRaises(re.error, re.match, br"\x1", b"") | ||||||
|  |         self.assertRaises(re.error, re.match, br"\x1z", b"") | ||||||
|  | 
 | ||||||
|  |     def test_sre_byte_class_literals(self): | ||||||
|  |         for i in [0, 8, 16, 32, 64, 127, 128, 255]: | ||||||
|  |             self.assertIsNotNone(re.match((r"[\%o]" % i).encode(), bytes([i]))) | ||||||
|  |             self.assertIsNotNone(re.match((r"[\%o8]" % i).encode(), bytes([i]))) | ||||||
|  |             self.assertIsNotNone(re.match((r"[\%03o]" % i).encode(), bytes([i]))) | ||||||
|  |             self.assertIsNotNone(re.match((r"[\%03o0]" % i).encode(), bytes([i]))) | ||||||
|  |             self.assertIsNotNone(re.match((r"[\%03o8]" % i).encode(), bytes([i]))) | ||||||
|  |             self.assertIsNotNone(re.match((r"[\x%02x]" % i).encode(), bytes([i]))) | ||||||
|  |             self.assertIsNotNone(re.match((r"[\x%02x0]" % i).encode(), bytes([i]))) | ||||||
|  |             self.assertIsNotNone(re.match((r"[\x%02xz]" % i).encode(), bytes([i]))) | ||||||
|  |         self.assertIsNotNone(re.match(br"[\u]", b'u')) | ||||||
|  |         self.assertIsNotNone(re.match(br"[\U]", b'U')) | ||||||
|  |         self.assertRaises(re.error, re.match, br"[\911]", "") | ||||||
|  |         self.assertRaises(re.error, re.match, br"[\x1z]", "") | ||||||
| 
 | 
 | ||||||
|     def test_bug_113254(self): |     def test_bug_113254(self): | ||||||
|         self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1) |         self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1) | ||||||
|  |  | ||||||
|  | @ -40,6 +40,9 @@ Core and Builtins | ||||||
| Library | Library | ||||||
| ------- | ------- | ||||||
| 
 | 
 | ||||||
|  | - Issue #3665: \u and \U escapes are now supported in unicode regular | ||||||
|  |   expressions.  Patch by Serhiy Storchaka. | ||||||
|  | 
 | ||||||
| - Issue #15153: Added inspect.getgeneratorlocals to simplify white box | - Issue #15153: Added inspect.getgeneratorlocals to simplify white box | ||||||
|   testing of generator state updates |   testing of generator state updates | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Antoine Pitrou
						Antoine Pitrou