gh-111259: Optimize complementary character sets in RE (GH-120742)

Patterns like "[\s\S]" or "\s|\S" which match any character are now compiled to the same effective code as a dot with the DOTALL modifier ("(?s:.)").
2025-10-31 05:31:20 +00:00 · 2024-06-20 10:19:32 +03:00 · 2024-06-20 10:19:32 +03:00 · 8bc76ae45f
commit 8bc76ae45f
parent 3846fcfb92
4 changed files with 50 additions and 13 deletions
--- a/Lib/re/_compiler.py
+++ b/Lib/re/_compiler.py
@ -28,6 +28,8 @@
    POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
 }
 _CHARSET_ALL = [(NEGATE, None)]
 def _combine_flags(flags, add_flags, del_flags,
                   TYPE_FLAGS=_parser.TYPE_FLAGS):
    if add_flags & TYPE_FLAGS:
@ -84,17 +86,22 @@ def _compile(code, pattern, flags):
                    code[skip] = _len(code) - skip
        elif op is IN:
            charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
-            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
+            if not charset:
-                emit(IN_LOC_IGNORE)
+                emit(FAILURE)
-            elif not hascased:
+            elif charset == _CHARSET_ALL:
-                emit(IN)
+                emit(ANY_ALL)
            elif not fixes:  # ascii
                emit(IN_IGNORE)
            else:
-                emit(IN_UNI_IGNORE)
+                if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
-            skip = _len(code); emit(0)
+                    emit(IN_LOC_IGNORE)
-            _compile_charset(charset, flags, code)
+                elif not hascased:
-            code[skip] = _len(code) - skip
+                    emit(IN)
                elif not fixes:  # ascii
                    emit(IN_IGNORE)
                else:
                    emit(IN_UNI_IGNORE)
                skip = _len(code); emit(0)
                _compile_charset(charset, flags, code)
                code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(ANY_ALL)
@ -277,6 +284,10 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
                            charmap[i] = 1
                elif op is NEGATE:
                    out.append((op, av))
                elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail:
                    # Optimize [\s\S] etc.
                    out = [] if out else _CHARSET_ALL
                    return out, False
                else:
                    tail.append((op, av))
            except IndexError:
@ -519,13 +530,18 @@ def _compile_info(code, pattern, flags):
    # look for a literal prefix
    prefix = []
    prefix_skip = 0
-    charset = [] # not used
+    charset = None # not used
    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
        # look for literal prefix
        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
        # if no prefix, look for charset prefix
        if not prefix:
            charset = _get_charset_prefix(pattern, flags)
            if charset:
                charset, hascased = _optimize_charset(charset)
                assert not hascased
                if charset == _CHARSET_ALL:
                    charset = None
 ##     if prefix:
 ##         print("*** PREFIX", prefix, prefix_skip)
 ##     if charset:
@ -560,8 +576,6 @@ def _compile_info(code, pattern, flags):
        # generate overlap table
        code.extend(_generate_overlap_table(prefix))
    elif charset:
        charset, hascased = _optimize_charset(charset)
        assert not hascased
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip
--- a/Lib/re/_constants.py
+++ b/Lib/re/_constants.py
@ -206,6 +206,8 @@ def _makecodes(*names):
    CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
 }
 CH_NEGATE = dict(zip(CHCODES[::2] + CHCODES[1::2], CHCODES[1::2] + CHCODES[::2]))
 # flags
 SRE_FLAG_IGNORECASE = 2 # case insensitive
 SRE_FLAG_LOCALE = 4 # honour system locale
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@ -2473,6 +2473,24 @@ def test_regression_gh94675(self):
    def test_fail(self):
        self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3')
    def test_character_set_any(self):
        # The union of complementary character sets mathes any character
        # and is equivalent to "(?s:.)".
        s = '1x\n'
        for p in r'[\s\S]', r'[\d\D]', r'[\w\W]', r'[\S\s]', r'\s|\S':
            with self.subTest(pattern=p):
                self.assertEqual(re.findall(p, s), list(s))
                self.assertEqual(re.fullmatch('(?:' + p + ')+', s).group(), s)
    def test_character_set_none(self):
        # Negation of the union of complementary character sets does not match
        # any character.
        s = '1x\n'
        for p in r'[^\s\S]', r'[^\d\D]', r'[^\w\W]', r'[^\S\s]':
            with self.subTest(pattern=p):
                self.assertIsNone(re.search(p, s))
                self.assertIsNone(re.search('(?s:.)' + p, s))
 def get_debug_out(pat):
    with captured_stdout() as out:
--- a/Misc/NEWS.d/next/Library/2024-06-19-13-20-01.gh-issue-111259.Wki5PV.rst
+++ b/Misc/NEWS.d/next/Library/2024-06-19-13-20-01.gh-issue-111259.Wki5PV.rst
@ -0,0 +1,3 @@
 :mod:`re` now handles patterns like ``"[\s\S]"`` or ``"\s|\S"`` which match
 any character as effectively as a dot with the ``DOTALL`` modifier
 (``"(?s:.)"``).