mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 05:31:20 +00:00 
			
		
		
		
	gh-111259: Optimize complementary character sets in RE (GH-120742)
Patterns like "[\s\S]" or "\s|\S" which match any character are now compiled
to the same effective code as a dot with the DOTALL modifier ("(?s:.)").
			
			
This commit is contained in:
		
							parent
							
								
									3846fcfb92
								
							
						
					
					
						commit
						8bc76ae45f
					
				
					 4 changed files with 50 additions and 13 deletions
				
			
		|  | @ -28,6 +28,8 @@ | ||||||
|     POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), |     POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | _CHARSET_ALL = [(NEGATE, None)] | ||||||
|  | 
 | ||||||
| def _combine_flags(flags, add_flags, del_flags, | def _combine_flags(flags, add_flags, del_flags, | ||||||
|                    TYPE_FLAGS=_parser.TYPE_FLAGS): |                    TYPE_FLAGS=_parser.TYPE_FLAGS): | ||||||
|     if add_flags & TYPE_FLAGS: |     if add_flags & TYPE_FLAGS: | ||||||
|  | @ -84,17 +86,22 @@ def _compile(code, pattern, flags): | ||||||
|                     code[skip] = _len(code) - skip |                     code[skip] = _len(code) - skip | ||||||
|         elif op is IN: |         elif op is IN: | ||||||
|             charset, hascased = _optimize_charset(av, iscased, tolower, fixes) |             charset, hascased = _optimize_charset(av, iscased, tolower, fixes) | ||||||
|             if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: |             if not charset: | ||||||
|                 emit(IN_LOC_IGNORE) |                 emit(FAILURE) | ||||||
|             elif not hascased: |             elif charset == _CHARSET_ALL: | ||||||
|                 emit(IN) |                 emit(ANY_ALL) | ||||||
|             elif not fixes:  # ascii |  | ||||||
|                 emit(IN_IGNORE) |  | ||||||
|             else: |             else: | ||||||
|                 emit(IN_UNI_IGNORE) |                 if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: | ||||||
|             skip = _len(code); emit(0) |                     emit(IN_LOC_IGNORE) | ||||||
|             _compile_charset(charset, flags, code) |                 elif not hascased: | ||||||
|             code[skip] = _len(code) - skip |                     emit(IN) | ||||||
|  |                 elif not fixes:  # ascii | ||||||
|  |                     emit(IN_IGNORE) | ||||||
|  |                 else: | ||||||
|  |                     emit(IN_UNI_IGNORE) | ||||||
|  |                 skip = _len(code); emit(0) | ||||||
|  |                 _compile_charset(charset, flags, code) | ||||||
|  |                 code[skip] = _len(code) - skip | ||||||
|         elif op is ANY: |         elif op is ANY: | ||||||
|             if flags & SRE_FLAG_DOTALL: |             if flags & SRE_FLAG_DOTALL: | ||||||
|                 emit(ANY_ALL) |                 emit(ANY_ALL) | ||||||
|  | @ -277,6 +284,10 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): | ||||||
|                             charmap[i] = 1 |                             charmap[i] = 1 | ||||||
|                 elif op is NEGATE: |                 elif op is NEGATE: | ||||||
|                     out.append((op, av)) |                     out.append((op, av)) | ||||||
|  |                 elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail: | ||||||
|  |                     # Optimize [\s\S] etc. | ||||||
|  |                     out = [] if out else _CHARSET_ALL | ||||||
|  |                     return out, False | ||||||
|                 else: |                 else: | ||||||
|                     tail.append((op, av)) |                     tail.append((op, av)) | ||||||
|             except IndexError: |             except IndexError: | ||||||
|  | @ -519,13 +530,18 @@ def _compile_info(code, pattern, flags): | ||||||
|     # look for a literal prefix |     # look for a literal prefix | ||||||
|     prefix = [] |     prefix = [] | ||||||
|     prefix_skip = 0 |     prefix_skip = 0 | ||||||
|     charset = [] # not used |     charset = None # not used | ||||||
|     if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE): |     if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE): | ||||||
|         # look for literal prefix |         # look for literal prefix | ||||||
|         prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags) |         prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags) | ||||||
|         # if no prefix, look for charset prefix |         # if no prefix, look for charset prefix | ||||||
|         if not prefix: |         if not prefix: | ||||||
|             charset = _get_charset_prefix(pattern, flags) |             charset = _get_charset_prefix(pattern, flags) | ||||||
|  |             if charset: | ||||||
|  |                 charset, hascased = _optimize_charset(charset) | ||||||
|  |                 assert not hascased | ||||||
|  |                 if charset == _CHARSET_ALL: | ||||||
|  |                     charset = None | ||||||
| ##     if prefix: | ##     if prefix: | ||||||
| ##         print("*** PREFIX", prefix, prefix_skip) | ##         print("*** PREFIX", prefix, prefix_skip) | ||||||
| ##     if charset: | ##     if charset: | ||||||
|  | @ -560,8 +576,6 @@ def _compile_info(code, pattern, flags): | ||||||
|         # generate overlap table |         # generate overlap table | ||||||
|         code.extend(_generate_overlap_table(prefix)) |         code.extend(_generate_overlap_table(prefix)) | ||||||
|     elif charset: |     elif charset: | ||||||
|         charset, hascased = _optimize_charset(charset) |  | ||||||
|         assert not hascased |  | ||||||
|         _compile_charset(charset, flags, code) |         _compile_charset(charset, flags, code) | ||||||
|     code[skip] = len(code) - skip |     code[skip] = len(code) - skip | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -206,6 +206,8 @@ def _makecodes(*names): | ||||||
|     CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK |     CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | CH_NEGATE = dict(zip(CHCODES[::2] + CHCODES[1::2], CHCODES[1::2] + CHCODES[::2])) | ||||||
|  | 
 | ||||||
| # flags | # flags | ||||||
| SRE_FLAG_IGNORECASE = 2 # case insensitive | SRE_FLAG_IGNORECASE = 2 # case insensitive | ||||||
| SRE_FLAG_LOCALE = 4 # honour system locale | SRE_FLAG_LOCALE = 4 # honour system locale | ||||||
|  |  | ||||||
|  | @ -2473,6 +2473,24 @@ def test_regression_gh94675(self): | ||||||
|     def test_fail(self): |     def test_fail(self): | ||||||
|         self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3') |         self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3') | ||||||
| 
 | 
 | ||||||
|  |     def test_character_set_any(self): | ||||||
|  |         # The union of complementary character sets matches any character | ||||||
|  |         # and is equivalent to "(?s:.)". | ||||||
|  |         s = '1x\n' | ||||||
|  |         for p in r'[\s\S]', r'[\d\D]', r'[\w\W]', r'[\S\s]', r'\s|\S': | ||||||
|  |             with self.subTest(pattern=p): | ||||||
|  |                 self.assertEqual(re.findall(p, s), list(s)) | ||||||
|  |                 self.assertEqual(re.fullmatch('(?:' + p + ')+', s).group(), s) | ||||||
|  | 
 | ||||||
|  |     def test_character_set_none(self): | ||||||
|  |         # Negation of the union of complementary character sets does not match | ||||||
|  |         # any character. | ||||||
|  |         s = '1x\n' | ||||||
|  |         for p in r'[^\s\S]', r'[^\d\D]', r'[^\w\W]', r'[^\S\s]': | ||||||
|  |             with self.subTest(pattern=p): | ||||||
|  |                 self.assertIsNone(re.search(p, s)) | ||||||
|  |                 self.assertIsNone(re.search('(?s:.)' + p, s)) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def get_debug_out(pat): | def get_debug_out(pat): | ||||||
|     with captured_stdout() as out: |     with captured_stdout() as out: | ||||||
|  |  | ||||||
|  | @ -0,0 +1,3 @@ | ||||||
|  | :mod:`re` now handles patterns like ``"[\s\S]"`` or ``"\s|\S"`` which match | ||||||
|  | any character as effectively as a dot with the ``DOTALL`` modifier | ||||||
|  | (``"(?s:.)"``). | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Serhiy Storchaka
						Serhiy Storchaka