mirror of
https://github.com/python/cpython.git
synced 2026-04-15 16:21:24 +00:00
[3.14] gh-80667: Fix lookup for Tangut ideographs in unicodedata (GH-144789) (GH-144871)
(cherry picked from commit 8b7b5a9946)
Co-authored-by: Pierre Le Marre <dev@wismill.eu>
This commit is contained in:
parent
46e7189d09
commit
bcabbd02f6
6 changed files with 209 additions and 60 deletions
|
|
@ -1010,21 +1010,18 @@ static const char * const hangul_syllables[][3] = {
|
|||
{ 0, 0, "H" }
|
||||
};
|
||||
|
||||
/* These ranges need to match makeunicodedata.py:cjk_ranges. */
|
||||
static int
|
||||
is_unified_ideograph(Py_UCS4 code)
|
||||
find_prefix_id(Py_UCS4 code)
|
||||
{
|
||||
return
|
||||
(0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
|
||||
(0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */
|
||||
(0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
|
||||
(0x2A700 <= code && code <= 0x2B739) || /* CJK Ideograph Extension C */
|
||||
(0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
|
||||
(0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
|
||||
(0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
|
||||
(0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
|
||||
(0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
|
||||
(0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
|
||||
for (int i = 0; i < (int)Py_ARRAY_LENGTH(derived_name_ranges); i++) {
|
||||
if (code < derived_name_ranges[i].first) {
|
||||
return -1;
|
||||
}
|
||||
if (code <= derived_name_ranges[i].last) {
|
||||
return derived_name_ranges[i].prefixid;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* macros used to determine if the given code point is in the PUA range that
|
||||
|
|
@ -1302,7 +1299,9 @@ _getucname(PyObject *self,
|
|||
}
|
||||
}
|
||||
|
||||
if (SBase <= code && code < SBase+SCount) {
|
||||
int prefixid = find_prefix_id(code);
|
||||
if (prefixid == 0) {
|
||||
assert(SBase <= code && code < SBase+SCount);
|
||||
/* Hangul syllable. */
|
||||
int SIndex = code - SBase;
|
||||
int L = SIndex / NCount;
|
||||
|
|
@ -1324,11 +1323,13 @@ _getucname(PyObject *self,
|
|||
return 1;
|
||||
}
|
||||
|
||||
if (is_unified_ideograph(code)) {
|
||||
if (buflen < 28)
|
||||
/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
|
||||
/* Only support CJK unified ideographs.
|
||||
* Support for Tangut ideographs is a new feature in 3.15. */
|
||||
if (prefixid == 1) {
|
||||
const char *prefix = derived_name_prefixes[prefixid];
|
||||
if (snprintf(buffer, buflen, "%s%04X", prefix, code) >= buflen) {
|
||||
return 0;
|
||||
sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
@ -1385,6 +1386,35 @@ _check_alias_and_seq(Py_UCS4* code, int with_named_seq)
|
|||
return 1;
|
||||
}
|
||||
|
||||
static Py_UCS4
|
||||
parse_hex_code(const char *name, int namelen)
|
||||
{
|
||||
if (namelen < 4 || namelen > 6) {
|
||||
return (Py_UCS4)-1;
|
||||
}
|
||||
if (*name == '0') {
|
||||
return (Py_UCS4)-1;
|
||||
}
|
||||
int v = 0;
|
||||
while (namelen--) {
|
||||
v *= 16;
|
||||
Py_UCS1 c = Py_TOUPPER(*name);
|
||||
if (c >= '0' && c <= '9') {
|
||||
v += c - '0';
|
||||
}
|
||||
else if (c >= 'A' && c <= 'F') {
|
||||
v += c - 'A' + 10;
|
||||
}
|
||||
else {
|
||||
return (Py_UCS4)-1;
|
||||
}
|
||||
name++;
|
||||
}
|
||||
if (v > 0x10ffff) {
|
||||
return (Py_UCS4)-1;
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
static int
|
||||
_getcode(const char* name, int namelen, Py_UCS4* code)
|
||||
|
|
@ -1393,8 +1423,19 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
|
|||
* Named aliases are not resolved; they are returned as a code point in the
|
||||
* PUA */
|
||||
|
||||
/* Check for hangul syllables. */
|
||||
if (PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0) {
|
||||
int i = 0;
|
||||
size_t prefixlen;
|
||||
for (; i < (int)Py_ARRAY_LENGTH(derived_name_prefixes); i++) {
|
||||
const char *prefix = derived_name_prefixes[i];
|
||||
prefixlen = strlen(derived_name_prefixes[i]);
|
||||
if (PyOS_strnicmp(name, prefix, prefixlen) == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (i == 0) {
|
||||
/* Hangul syllables. */
|
||||
assert(PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0);
|
||||
int len, L = -1, V = -1, T = -1;
|
||||
const char *pos = name + 16;
|
||||
find_syllable(pos, &len, &L, LCount, 0);
|
||||
|
|
@ -1411,28 +1452,11 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* Check for unified ideographs. */
|
||||
if (PyOS_strnicmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
|
||||
/* Four or five hexdigits must follow. */
|
||||
unsigned int v;
|
||||
v = 0;
|
||||
name += 22;
|
||||
namelen -= 22;
|
||||
if (namelen != 4 && namelen != 5)
|
||||
if (i < (int)Py_ARRAY_LENGTH(derived_name_prefixes)) {
|
||||
Py_UCS4 v = parse_hex_code(name + prefixlen, namelen - prefixlen);
|
||||
if (find_prefix_id(v) != i) {
|
||||
return 0;
|
||||
while (namelen--) {
|
||||
v *= 16;
|
||||
Py_UCS1 c = Py_TOUPPER(*name);
|
||||
if (c >= '0' && c <= '9')
|
||||
v += c - '0';
|
||||
else if (c >= 'A' && c <= 'F')
|
||||
v += c - 'A' + 10;
|
||||
else
|
||||
return 0;
|
||||
name++;
|
||||
}
|
||||
if (!is_unified_ideograph(v))
|
||||
return 0;
|
||||
*code = v;
|
||||
return 1;
|
||||
}
|
||||
|
|
|
|||
27
Modules/unicodename_db.h
generated
27
Modules/unicodename_db.h
generated
|
|
@ -19473,3 +19473,30 @@ static const named_sequence named_sequences[] = {
|
|||
{2, {0x02E5, 0x02E9}},
|
||||
{2, {0x02E9, 0x02E5}},
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
Py_UCS4 first;
|
||||
Py_UCS4 last;
|
||||
int prefixid;
|
||||
} derived_name_range;
|
||||
|
||||
static const derived_name_range derived_name_ranges[] = {
|
||||
{0x3400, 0x4DBF, 1},
|
||||
{0x4E00, 0x9FFF, 1},
|
||||
{0xAC00, 0xD7A3, 0},
|
||||
{0x17000, 0x187F7, 2},
|
||||
{0x18D00, 0x18D08, 2},
|
||||
{0x20000, 0x2A6DF, 1},
|
||||
{0x2A700, 0x2B739, 1},
|
||||
{0x2B740, 0x2B81D, 1},
|
||||
{0x2B820, 0x2CEA1, 1},
|
||||
{0x2CEB0, 0x2EBE0, 1},
|
||||
{0x2EBF0, 0x2EE5D, 1},
|
||||
{0x30000, 0x3134A, 1},
|
||||
{0x31350, 0x323AF, 1},
|
||||
};
|
||||
/* Name prefixes for derived character names, indexed by the prefixid
 * field of derived_name_ranges[].  The positions are significant: the
 * lookup code tests explicitly for index 0 (Hangul syllables, whose names
 * are composed from jamo rather than hex digits) and index 1 (CJK unified
 * ideographs). */
static const char * const derived_name_prefixes[] = {
    [0] = "HANGUL SYLLABLE ",
    [1] = "CJK UNIFIED IDEOGRAPH-",
    [2] = "TANGUT IDEOGRAPH-",
};
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue