gh-152033: Optimize category escapes outside character sets (GH-152035)

Character class escapes (``\d``, ``\D``, ``\s``, ``\S``, ``\w`` and
``\W``) that occur outside a character set are now compiled directly to a
single CATEGORY opcode instead of being wrapped in an IN block.  This
removes the IN wrapper (three code words) and an indirect charset() call,
and makes such an escape a simple repeatable unit so that, for example,
``\d+`` uses the REPEAT_ONE fast path; a CATEGORY case is added to
SRE(count).

The transformation preserves behaviour exactly.  For category-heavy
patterns the compiled byte code is about 20% smaller and matching is up
to ~2x faster, with no effect on patterns that do not use bare category
escapes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Serhiy Storchaka 2026-06-24 08:49:14 +03:00 committed by GitHub
parent fcda96fbf3
commit fde4cf862c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 70 additions and 34 deletions

View file

@ -265,6 +265,15 @@ zipfile
Optimizations
=============
re
--
* Character class escapes (``\d``, ``\D``, ``\s``, ``\S``, ``\w`` and ``\W``)
outside a character set are now compiled to a single ``CATEGORY`` opcode
instead of being wrapped in an ``IN`` block. This speeds up matching of
patterns such as ``\d+`` and reduces the size of the compiled byte code.
(Contributed by Serhiy Storchaka in :gh:`152033`.)
module_name
-----------

View file

@ -20,7 +20,7 @@
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
_UNIT_CODES = _LITERAL_CODES | {ANY, IN, CATEGORY}
_REPEATING_CODES = {
MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
@ -495,6 +495,8 @@ def _get_charset_prefix(pattern, flags):
if iscased and iscased(av):
return None
return [(op, av)]
elif op is CATEGORY:
return [(op, av)]
elif op is BRANCH:
charset = []
charsetappend = charset.append

View file

@ -27,6 +27,7 @@
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT})
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
_SETITEMCODES = frozenset({LITERAL, CATEGORY})
ESCAPES = {
r"\a": (LITERAL, ord("\a")),
@ -43,12 +44,12 @@
r"\A": (AT, AT_BEGINNING_STRING), # start of string
r"\b": (AT, AT_BOUNDARY),
r"\B": (AT, AT_NON_BOUNDARY),
r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
r"\d": (CATEGORY, CATEGORY_DIGIT),
r"\D": (CATEGORY, CATEGORY_NOT_DIGIT),
r"\s": (CATEGORY, CATEGORY_SPACE),
r"\S": (CATEGORY, CATEGORY_NOT_SPACE),
r"\w": (CATEGORY, CATEGORY_WORD),
r"\W": (CATEGORY, CATEGORY_NOT_WORD),
r"\z": (AT, AT_END_STRING), # end of string
r"\Z": (AT, AT_END_STRING), # end of string (obsolete)
}
@ -315,7 +316,7 @@ def _class_escape(source, escape):
if code:
return code
code = CATEGORIES.get(escape)
if code and code[0] is IN:
if code and code[0] is CATEGORY:
return code
try:
c = escape[1:2]
@ -493,7 +494,7 @@ def _parse_sub(source, state, verbose, nested):
if len(item) != 1:
break
op, av = item[0]
if op is LITERAL:
if op in _SETITEMCODES:
set.append((op, av))
elif op is IN and av[0][0] is not NEGATE:
set.extend(av)
@ -590,8 +591,6 @@ def _parse(source, state, verbose, nested, first=False):
raise source.error("unterminated character set",
source.tell() - here)
if that == "]":
if code1[0] is IN:
code1 = code1[1][0]
setappend(code1)
setappend((LITERAL, _ord("-")))
break
@ -616,8 +615,6 @@ def _parse(source, state, verbose, nested, first=False):
raise source.error(msg, len(this) + 1 + len(that))
setappend((RANGE, (lo, hi)))
else:
if code1[0] is IN:
code1 = code1[1][0]
setappend(code1)
set = _uniq(set)

View file

@ -0,0 +1,5 @@
Optimize matching of character class escapes (``\d``, ``\D``, ``\s``,
``\S``, ``\w`` and ``\W``) that occur outside a character set: they are now
compiled to a single ``CATEGORY`` opcode instead of being wrapped in an
``IN`` block. This speeds up patterns such as ``\d+`` and reduces the size
of the compiled byte code.

View file

@ -1842,6 +1842,34 @@ _sre_template_impl(PyObject *module, PyObject *pattern, PyObject *template)
} while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
static int
_validate_category(SRE_CODE arg)
{
switch (arg) {
case SRE_CATEGORY_DIGIT:
case SRE_CATEGORY_NOT_DIGIT:
case SRE_CATEGORY_SPACE:
case SRE_CATEGORY_NOT_SPACE:
case SRE_CATEGORY_WORD:
case SRE_CATEGORY_NOT_WORD:
case SRE_CATEGORY_LINEBREAK:
case SRE_CATEGORY_NOT_LINEBREAK:
case SRE_CATEGORY_LOC_WORD:
case SRE_CATEGORY_LOC_NOT_WORD:
case SRE_CATEGORY_UNI_DIGIT:
case SRE_CATEGORY_UNI_NOT_DIGIT:
case SRE_CATEGORY_UNI_SPACE:
case SRE_CATEGORY_UNI_NOT_SPACE:
case SRE_CATEGORY_UNI_WORD:
case SRE_CATEGORY_UNI_NOT_WORD:
case SRE_CATEGORY_UNI_LINEBREAK:
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
return 1;
default:
return 0;
}
}
static int
_validate_charset(SRE_CODE *code, SRE_CODE *end)
{
@ -1894,27 +1922,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end)
case SRE_OP_CATEGORY:
GET_ARG;
switch (arg) {
case SRE_CATEGORY_DIGIT:
case SRE_CATEGORY_NOT_DIGIT:
case SRE_CATEGORY_SPACE:
case SRE_CATEGORY_NOT_SPACE:
case SRE_CATEGORY_WORD:
case SRE_CATEGORY_NOT_WORD:
case SRE_CATEGORY_LINEBREAK:
case SRE_CATEGORY_NOT_LINEBREAK:
case SRE_CATEGORY_LOC_WORD:
case SRE_CATEGORY_LOC_NOT_WORD:
case SRE_CATEGORY_UNI_DIGIT:
case SRE_CATEGORY_UNI_NOT_DIGIT:
case SRE_CATEGORY_UNI_SPACE:
case SRE_CATEGORY_UNI_NOT_SPACE:
case SRE_CATEGORY_UNI_WORD:
case SRE_CATEGORY_UNI_NOT_WORD:
case SRE_CATEGORY_UNI_LINEBREAK:
case SRE_CATEGORY_UNI_NOT_LINEBREAK:
break;
default:
if (!_validate_category(arg)) {
FAIL;
}
break;
@ -1995,6 +2003,13 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
}
break;
case SRE_OP_CATEGORY:
GET_ARG;
if (!_validate_category(arg)) {
FAIL;
}
break;
case SRE_OP_ANY:
case SRE_OP_ANY_ALL:
/* These have no operands */

View file

@ -193,6 +193,7 @@ LOCAL(Py_ssize_t)
SRE(count)(SRE_STATE* state, const SRE_CODE* pattern, Py_ssize_t maxcount)
{
SRE_CODE chr;
SRE_CODE arg;
SRE_CHAR c;
const SRE_CHAR* ptr = (const SRE_CHAR *)state->ptr;
const SRE_CHAR* end = (const SRE_CHAR *)state->end;
@ -302,6 +303,13 @@ SRE(count)(SRE_STATE* state, const SRE_CODE* pattern, Py_ssize_t maxcount)
ptr++;
break;
case SRE_OP_CATEGORY:
arg = pattern[1];
TRACE(("|%p|%p|COUNT CATEGORY %d\n", pattern, ptr, arg));
while (ptr < end && sre_category(arg, *ptr))
ptr++;
break;
default:
/* repeated single character pattern */
TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));