[3.14] gh-80667: Fix lookup for Tangut ideographs in unicodedata (GH-144789) (GH-144871)

(cherry picked from commit 8b7b5a9946)

Co-authored-by: Pierre Le Marre <dev@wismill.eu>
This commit is contained in:
Serhiy Storchaka 2026-02-16 14:25:43 +02:00 committed by GitHub
parent 46e7189d09
commit bcabbd02f6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 209 additions and 60 deletions

View file

@ -111,6 +111,30 @@ def test_cjk_unified_ideographs(self):
self.checkletter("cjK UniFIeD idEogRAph-2aBcD", "\U0002abcd")
self.checkletter("CJk uNIfiEd IDeOGraPH-2AbCd", "\U0002abcd")
def test_tangut_ideographs(self):
self.checkletter("TANGUT IDEOGRAPH-17000", "\U00017000")
self.checkletter("TANGUT IDEOGRAPH-187F7", "\U000187f7")
self.checkletter("TANGUT IDEOGRAPH-18D00", "\U00018D00")
self.checkletter("TANGUT IDEOGRAPH-18D08", "\U00018d08")
self.checkletter("tangut ideograph-18d08", "\U00018d08")
def test_egyptian_hieroglyphs(self):
self.checkletter("EGYPTIAN HIEROGLYPH-13460", "\U00013460")
self.checkletter("EGYPTIAN HIEROGLYPH-143FA", "\U000143fa")
self.checkletter("egyptian hieroglyph-143fa", "\U000143fa")
def test_khitan_small_script_characters(self):
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18B00", "\U00018b00")
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CD5", "\U00018cd5")
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CFF", "\U00018cff")
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CFF", "\U00018cff")
self.checkletter("khitan small script character-18cff", "\U00018cff")
def test_nushu_characters(self):
self.checkletter("NUSHU CHARACTER-1B170", "\U0001b170")
self.checkletter("NUSHU CHARACTER-1B2FB", "\U0001b2fb")
self.checkletter("nushu character-1b2fb", "\U0001b2fb")
def test_bmp_characters(self):
for code in range(0x10000):
char = chr(code)

View file

@ -116,6 +116,56 @@ def test_function_checksum(self):
result = h.hexdigest()
self.assertEqual(result, self.expectedchecksum)
def test_name(self):
name = self.db.name
self.assertRaises(ValueError, name, '\0')
self.assertRaises(ValueError, name, '\n')
self.assertRaises(ValueError, name, '\x1F')
self.assertRaises(ValueError, name, '\x7F')
self.assertRaises(ValueError, name, '\x9F')
self.assertRaises(ValueError, name, '\uFFFE')
self.assertRaises(ValueError, name, '\uFFFF')
self.assertRaises(ValueError, name, '\U0010FFFF')
self.assertEqual(name('\U0010FFFF', 42), 42)
self.assertEqual(name(' '), 'SPACE')
self.assertEqual(name('1'), 'DIGIT ONE')
self.assertEqual(name('A'), 'LATIN CAPITAL LETTER A')
self.assertEqual(name('\xA0'), 'NO-BREAK SPACE')
self.assertEqual(name('\u0221', None), None if self.old else
'LATIN SMALL LETTER D WITH CURL')
self.assertEqual(name('\u3400'), 'CJK UNIFIED IDEOGRAPH-3400')
self.assertEqual(name('\u9FA5'), 'CJK UNIFIED IDEOGRAPH-9FA5')
self.assertEqual(name('\uAC00'), 'HANGUL SYLLABLE GA')
self.assertEqual(name('\uD7A3'), 'HANGUL SYLLABLE HIH')
self.assertEqual(name('\uF900'), 'CJK COMPATIBILITY IDEOGRAPH-F900')
self.assertEqual(name('\uFA6A'), 'CJK COMPATIBILITY IDEOGRAPH-FA6A')
self.assertEqual(name('\uFBF9'),
'ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA '
'ABOVE WITH ALEF MAKSURA ISOLATED FORM')
self.assertEqual(name('\U00013460', None), None if self.old else
'EGYPTIAN HIEROGLYPH-13460')
self.assertEqual(name('\U000143FA', None), None if self.old else
'EGYPTIAN HIEROGLYPH-143FA')
self.assertEqual(name('\U00018B00', None), None if self.old else
'KHITAN SMALL SCRIPT CHARACTER-18B00')
self.assertEqual(name('\U00018CD5', None), None if self.old else
'KHITAN SMALL SCRIPT CHARACTER-18CD5')
self.assertEqual(name('\U00018CFF', None), None if self.old else
'KHITAN SMALL SCRIPT CHARACTER-18CFF')
self.assertEqual(name('\U0001B170', None), None if self.old else
'NUSHU CHARACTER-1B170')
self.assertEqual(name('\U0001B2FB', None), None if self.old else
'NUSHU CHARACTER-1B2FB')
self.assertEqual(name('\U0001FBA8', None), None if self.old else
'BOX DRAWINGS LIGHT DIAGONAL UPPER CENTRE TO '
'MIDDLE LEFT AND MIDDLE RIGHT TO LOWER CENTRE')
self.assertEqual(name('\U0002A6D6'), 'CJK UNIFIED IDEOGRAPH-2A6D6')
self.assertEqual(name('\U0002FA1D'), 'CJK COMPATIBILITY IDEOGRAPH-2FA1D')
self.assertEqual(name('\U000323AF', None), None if self.old else
'CJK UNIFIED IDEOGRAPH-323AF')
@requires_resource('cpu')
def test_name_inverse_lookup(self):
for char in iterallchars():
looked_name = self.db.name(char, None)
@ -139,6 +189,17 @@ def test_lookup_nonexistant(self):
"HANDBUG",
"MODIFIER LETTER CYRILLIC SMALL QUESTION MARK",
"???",
"CJK UNIFIED IDEOGRAPH-03400",
"CJK UNIFIED IDEOGRAPH-020000",
"CJK UNIFIED IDEOGRAPH-33FF",
"CJK UNIFIED IDEOGRAPH-F900",
"CJK UNIFIED IDEOGRAPH-13460",
"CJK UNIFIED IDEOGRAPH-17000",
"CJK UNIFIED IDEOGRAPH-18B00",
"CJK UNIFIED IDEOGRAPH-1B170",
"CJK COMPATIBILITY IDEOGRAPH-3400",
"TANGUT IDEOGRAPH-3400",
"HANGUL SYLLABLE AC00",
]:
self.assertRaises(KeyError, self.db.lookup, nonexistent)

View file

@ -0,0 +1 @@
Support lookup for Tangut Ideographs in :mod:`unicodedata`.

View file

@ -1010,21 +1010,18 @@ static const char * const hangul_syllables[][3] = {
{ 0, 0, "H" }
};
/* These ranges need to match makeunicodedata.py:cjk_ranges. */
static int
is_unified_ideograph(Py_UCS4 code)
find_prefix_id(Py_UCS4 code)
{
return
(0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
(0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */
(0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
(0x2A700 <= code && code <= 0x2B739) || /* CJK Ideograph Extension C */
(0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
(0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
(0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
(0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
(0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
(0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
for (int i = 0; i < (int)Py_ARRAY_LENGTH(derived_name_ranges); i++) {
if (code < derived_name_ranges[i].first) {
return -1;
}
if (code <= derived_name_ranges[i].last) {
return derived_name_ranges[i].prefixid;
}
}
return -1;
}
/* macros used to determine if the given code point is in the PUA range that
@ -1302,7 +1299,9 @@ _getucname(PyObject *self,
}
}
if (SBase <= code && code < SBase+SCount) {
int prefixid = find_prefix_id(code);
if (prefixid == 0) {
assert(SBase <= code && code < SBase+SCount);
/* Hangul syllable. */
int SIndex = code - SBase;
int L = SIndex / NCount;
@ -1324,11 +1323,13 @@ _getucname(PyObject *self,
return 1;
}
if (is_unified_ideograph(code)) {
if (buflen < 28)
/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
/* Only support CJK unified ideographs.
* Support for Tangut ideographs is a new feature in 3.15. */
if (prefixid == 1) {
const char *prefix = derived_name_prefixes[prefixid];
if (snprintf(buffer, buflen, "%s%04X", prefix, code) >= buflen) {
return 0;
sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
}
return 1;
}
@ -1385,6 +1386,35 @@ _check_alias_and_seq(Py_UCS4* code, int with_named_seq)
return 1;
}
static Py_UCS4
parse_hex_code(const char *name, int namelen)
{
if (namelen < 4 || namelen > 6) {
return (Py_UCS4)-1;
}
if (*name == '0') {
return (Py_UCS4)-1;
}
int v = 0;
while (namelen--) {
v *= 16;
Py_UCS1 c = Py_TOUPPER(*name);
if (c >= '0' && c <= '9') {
v += c - '0';
}
else if (c >= 'A' && c <= 'F') {
v += c - 'A' + 10;
}
else {
return (Py_UCS4)-1;
}
name++;
}
if (v > 0x10ffff) {
return (Py_UCS4)-1;
}
return v;
}
static int
_getcode(const char* name, int namelen, Py_UCS4* code)
@ -1393,8 +1423,19 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
* Named aliases are not resolved, they are returned as a code point in the
* PUA */
/* Check for hangul syllables. */
if (PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0) {
int i = 0;
size_t prefixlen;
for (; i < (int)Py_ARRAY_LENGTH(derived_name_prefixes); i++) {
const char *prefix = derived_name_prefixes[i];
prefixlen = strlen(derived_name_prefixes[i]);
if (PyOS_strnicmp(name, prefix, prefixlen) == 0) {
break;
}
}
if (i == 0) {
/* Hangul syllables. */
assert(PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0);
int len, L = -1, V = -1, T = -1;
const char *pos = name + 16;
find_syllable(pos, &len, &L, LCount, 0);
@ -1411,28 +1452,11 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
return 0;
}
/* Check for unified ideographs. */
if (PyOS_strnicmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
/* Four or five hexdigits must follow. */
unsigned int v;
v = 0;
name += 22;
namelen -= 22;
if (namelen != 4 && namelen != 5)
if (i < (int)Py_ARRAY_LENGTH(derived_name_prefixes)) {
Py_UCS4 v = parse_hex_code(name + prefixlen, namelen - prefixlen);
if (find_prefix_id(v) != i) {
return 0;
while (namelen--) {
v *= 16;
Py_UCS1 c = Py_TOUPPER(*name);
if (c >= '0' && c <= '9')
v += c - '0';
else if (c >= 'A' && c <= 'F')
v += c - 'A' + 10;
else
return 0;
name++;
}
if (!is_unified_ideograph(v))
return 0;
*code = v;
return 1;
}

View file

@ -19473,3 +19473,30 @@ static const named_sequence named_sequences[] = {
{2, {0x02E5, 0x02E9}},
{2, {0x02E9, 0x02E5}},
};
/* A contiguous block of code points associated with one derived-name
 * prefix.  Both bounds are inclusive. */
typedef struct {
/* First code point of the range. */
Py_UCS4 first;
/* Last code point of the range. */
Py_UCS4 last;
/* Index into derived_name_prefixes[]. */
int prefixid;
} derived_name_range;
/* Ranges are listed in ascending code point order; find_prefix_id() scans
 * them linearly and gives up as soon as it reaches an entry whose `first`
 * is greater than the code point being looked up, so this order must be
 * preserved. */
static const derived_name_range derived_name_ranges[] = {
{0x3400, 0x4DBF, 1},
{0x4E00, 0x9FFF, 1},
{0xAC00, 0xD7A3, 0},
{0x17000, 0x187F7, 2},
{0x18D00, 0x18D08, 2},
{0x20000, 0x2A6DF, 1},
{0x2A700, 0x2B739, 1},
{0x2B740, 0x2B81D, 1},
{0x2B820, 0x2CEA1, 1},
{0x2CEB0, 0x2EBE0, 1},
{0x2EBF0, 0x2EE5D, 1},
{0x30000, 0x3134A, 1},
{0x31350, 0x323AF, 1},
};
/* Name prefixes, indexed by derived_name_range.prefixid.  Index 0 is the
 * Hangul syllable prefix, which callers handle with dedicated syllable
 * composition code; for the other entries the code point in hexadecimal
 * follows the prefix. */
static const char * const derived_name_prefixes[] = {
"HANGUL SYLLABLE ",
"CJK UNIFIED IDEOGRAPH-",
"TANGUT IDEOGRAPH-",
};

View file

@ -99,18 +99,13 @@
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
('3400', '4DBF'), # CJK Ideograph Extension A CJK
('4E00', '9FFF'), # CJK Ideograph
('20000', '2A6DF'), # CJK Ideograph Extension B
('2A700', '2B739'), # CJK Ideograph Extension C
('2B740', '2B81D'), # CJK Ideograph Extension D
('2B820', '2CEA1'), # CJK Ideograph Extension E
('2CEB0', '2EBE0'), # CJK Ideograph Extension F
('2EBF0', '2EE5D'), # CJK Ideograph Extension I
('30000', '3134A'), # CJK Ideograph Extension G
('31350', '323AF'), # CJK Ideograph Extension H
# Maps the range names in UnicodeData.txt to prefixes for
# derived names specified by rule NR2.
# Hangul should always be at index 0, since it uses special format.
derived_name_range_names = [
("Hangul Syllable", "HANGUL SYLLABLE "),
("CJK Ideograph", "CJK UNIFIED IDEOGRAPH-"),
("Tangut Ideograph", "TANGUT IDEOGRAPH-"),
]
@ -124,7 +119,7 @@ def maketables(trace=0):
for version in old_versions:
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
old_unicode = UnicodeData(version, cjk_check=False)
old_unicode = UnicodeData(version, ideograph_check=False)
print(len(list(filter(None, old_unicode.table))), "characters")
merge_old_version(version, unicode, old_unicode)
@ -698,6 +693,23 @@ def makeunicodename(unicode, trace):
fprint(' {%d, {%s}},' % (len(sequence), seq_str))
fprint('};')
fprint(dedent("""
typedef struct {
Py_UCS4 first;
Py_UCS4 last;
int prefixid;
} derived_name_range;
"""))
fprint('static const derived_name_range derived_name_ranges[] = {')
for name_range in unicode.derived_name_ranges:
fprint(' {0x%s, 0x%s, %d},' % name_range)
fprint('};')
fprint('static const char * const derived_name_prefixes[] = {')
for _, prefix in derived_name_range_names:
fprint(' "%s",' % prefix)
fprint('};')
def merge_old_version(version, new, old):
# Changes to exclusion file not implemented yet
@ -905,14 +917,14 @@ def from_row(row: List[str]) -> UcdRecord:
class UnicodeData:
# table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned
def __init__(self, version, cjk_check=True):
def __init__(self, version, ideograph_check=True):
self.changed = []
table = [None] * 0x110000
for s in UcdFile(UNICODE_DATA, version):
char = int(s[0], 16)
table[char] = from_row(s)
cjk_ranges_found = []
self.derived_name_ranges = []
# expand first-last ranges
field = None
@ -926,15 +938,15 @@ def __init__(self, version, cjk_check=True):
s.name = ""
field = dataclasses.astuple(s)[:15]
elif s.name[-5:] == "Last>":
if s.name.startswith("<CJK Ideograph"):
cjk_ranges_found.append((field[0],
s.codepoint))
for j, (rangename, _) in enumerate(derived_name_range_names):
if s.name.startswith("<" + rangename):
self.derived_name_ranges.append(
(field[0], s.codepoint, j))
break
s.name = ""
field = None
elif field:
table[i] = from_row(('%X' % i,) + field[1:])
if cjk_check and cjk_ranges != cjk_ranges_found:
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
# public attributes
self.filename = UNICODE_DATA % ''