mirror of
https://github.com/python/cpython.git
synced 2026-06-10 03:42:08 +00:00
gh-150889: Improve performance of unicodedata.normalize() (GH-150890)
Scan the nfc_first/nfc_last reindex tables comparing only .start, range-check the candidate once, and terminate on a sentinel above every codepoint, so each entry costs a single comparison. ~2x faster on non-Latin and combining-heavy NFC/NFKC input; no new data tables. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
2452449b32
commit
97dea30914
4 changed files with 24 additions and 13 deletions
|
|
@ -0,0 +1 @@
|
|||
Speed up :func:`unicodedata.normalize` for the NFC and NFKC forms of non-ASCII text by up to a factor of 2.
|
||||
|
|
@ -785,15 +785,19 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
|
|||
static int
|
||||
find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
|
||||
{
|
||||
unsigned int index;
|
||||
for (index = 0; nfc[index].start; index++) {
|
||||
unsigned int start = nfc[index].start;
|
||||
if (code < start)
|
||||
return -1;
|
||||
if (code <= start + nfc[index].count) {
|
||||
unsigned int delta = code - start;
|
||||
return nfc[index].index + delta;
|
||||
}
|
||||
/* The table is sorted by .start ascending with disjoint [start, start+count]
|
||||
ranges and ends with a sentinel whose .start exceeds every codepoint, so
|
||||
a single .start <= code test per entry also stops at the sentinel. Find
|
||||
the first entry past code, then range-check the candidate (entry i - 1). */
|
||||
unsigned int i;
|
||||
for (i = 0; (Py_UCS4)nfc[i].start <= code; i++) {
|
||||
}
|
||||
if (i == 0) {
|
||||
return -1;
|
||||
}
|
||||
unsigned int start = nfc[i - 1].start;
|
||||
if (code <= start + nfc[i - 1].count) {
|
||||
return nfc[i - 1].index + (code - start);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
|
|
|||
4
Modules/unicodedata_db.h
generated
4
Modules/unicodedata_db.h
generated
|
|
@ -629,7 +629,7 @@ static struct reindex nfc_first[] = {
|
|||
{ 93539, 0, 388},
|
||||
{ 93543, 0, 389},
|
||||
{ 93545, 0, 390},
|
||||
{0,0,0}
|
||||
{0x7fffffff, 0, 0}
|
||||
};
|
||||
|
||||
static struct reindex nfc_last[] = {
|
||||
|
|
@ -680,7 +680,7 @@ static struct reindex nfc_last[] = {
|
|||
{ 90398, 2, 67},
|
||||
{ 90409, 0, 70},
|
||||
{ 93543, 0, 71},
|
||||
{0,0,0}
|
||||
{0x7fffffff, 0, 0}
|
||||
};
|
||||
|
||||
/* string literals */
|
||||
|
|
|
|||
|
|
@ -342,15 +342,21 @@ def makeunicodedata(unicode, trace):
|
|||
fprint("#define TOTAL_FIRST",total_first)
|
||||
fprint("#define TOTAL_LAST",total_last)
|
||||
fprint("struct reindex{int start;short count,index;};")
|
||||
# The reindex tables are read only by find_nfc_index(), which scans
|
||||
# forward while .start <= code. The trailing sentinel's .start must
|
||||
# exceed every codepoint (so the scan stops with a single comparison)
|
||||
# and fit the signed int .start field.
|
||||
nfc_sentinel = 0x7fffffff
|
||||
assert sys.maxunicode < nfc_sentinel <= 0x7fffffff
|
||||
fprint("static struct reindex nfc_first[] = {")
|
||||
for start,end in comp_first_ranges:
|
||||
fprint(" { %d, %d, %d}," % (start,end-start,comp_first[start]))
|
||||
fprint(" {0,0,0}")
|
||||
fprint(" {0x%x, 0, 0}" % nfc_sentinel)
|
||||
fprint("};\n")
|
||||
fprint("static struct reindex nfc_last[] = {")
|
||||
for start,end in comp_last_ranges:
|
||||
fprint(" { %d, %d, %d}," % (start,end-start,comp_last[start]))
|
||||
fprint(" {0,0,0}")
|
||||
fprint(" {0x%x, 0, 0}" % nfc_sentinel)
|
||||
fprint("};\n")
|
||||
|
||||
# FIXME: <fl> the following tables could be made static, and
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue