mirror of
https://github.com/python/cpython.git
synced 2026-04-14 15:50:50 +00:00
[3.14] gh-80667: Fix lookup for Tangut ideographs in unicodedata (GH-144789) (GH-144871)
(cherry picked from commit 8b7b5a9946)
Co-authored-by: Pierre Le Marre <dev@wismill.eu>
This commit is contained in:
parent
46e7189d09
commit
bcabbd02f6
6 changed files with 209 additions and 60 deletions
|
|
@ -99,18 +99,13 @@
|
|||
CASED_MASK = 0x2000
|
||||
EXTENDED_CASE_MASK = 0x4000
|
||||
|
||||
# these ranges need to match unicodedata.c:is_unified_ideograph
|
||||
cjk_ranges = [
|
||||
('3400', '4DBF'), # CJK Ideograph Extension A CJK
|
||||
('4E00', '9FFF'), # CJK Ideograph
|
||||
('20000', '2A6DF'), # CJK Ideograph Extension B
|
||||
('2A700', '2B739'), # CJK Ideograph Extension C
|
||||
('2B740', '2B81D'), # CJK Ideograph Extension D
|
||||
('2B820', '2CEA1'), # CJK Ideograph Extension E
|
||||
('2CEB0', '2EBE0'), # CJK Ideograph Extension F
|
||||
('2EBF0', '2EE5D'), # CJK Ideograph Extension I
|
||||
('30000', '3134A'), # CJK Ideograph Extension G
|
||||
('31350', '323AF'), # CJK Ideograph Extension H
|
||||
# Maps the range names in UnicodeData.txt to prefixes for
|
||||
# derived names specified by rule NR2.
|
||||
# Hangul should always be at index 0, since it uses special format.
|
||||
derived_name_range_names = [
|
||||
("Hangul Syllable", "HANGUL SYLLABLE "),
|
||||
("CJK Ideograph", "CJK UNIFIED IDEOGRAPH-"),
|
||||
("Tangut Ideograph", "TANGUT IDEOGRAPH-"),
|
||||
]
|
||||
|
||||
|
||||
|
|
@ -124,7 +119,7 @@ def maketables(trace=0):
|
|||
|
||||
for version in old_versions:
|
||||
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
|
||||
old_unicode = UnicodeData(version, cjk_check=False)
|
||||
old_unicode = UnicodeData(version, ideograph_check=False)
|
||||
print(len(list(filter(None, old_unicode.table))), "characters")
|
||||
merge_old_version(version, unicode, old_unicode)
|
||||
|
||||
|
|
@ -698,6 +693,23 @@ def makeunicodename(unicode, trace):
|
|||
fprint(' {%d, {%s}},' % (len(sequence), seq_str))
|
||||
fprint('};')
|
||||
|
||||
fprint(dedent("""
|
||||
typedef struct {
|
||||
Py_UCS4 first;
|
||||
Py_UCS4 last;
|
||||
int prefixid;
|
||||
} derived_name_range;
|
||||
"""))
|
||||
|
||||
fprint('static const derived_name_range derived_name_ranges[] = {')
|
||||
for name_range in unicode.derived_name_ranges:
|
||||
fprint(' {0x%s, 0x%s, %d},' % name_range)
|
||||
fprint('};')
|
||||
|
||||
fprint('static const char * const derived_name_prefixes[] = {')
|
||||
for _, prefix in derived_name_range_names:
|
||||
fprint(' "%s",' % prefix)
|
||||
fprint('};')
|
||||
|
||||
def merge_old_version(version, new, old):
|
||||
# Changes to exclusion file not implemented yet
|
||||
|
|
@ -905,14 +917,14 @@ def from_row(row: List[str]) -> UcdRecord:
|
|||
class UnicodeData:
|
||||
# table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned
|
||||
|
||||
def __init__(self, version, cjk_check=True):
|
||||
def __init__(self, version, ideograph_check=True):
|
||||
self.changed = []
|
||||
table = [None] * 0x110000
|
||||
for s in UcdFile(UNICODE_DATA, version):
|
||||
char = int(s[0], 16)
|
||||
table[char] = from_row(s)
|
||||
|
||||
cjk_ranges_found = []
|
||||
self.derived_name_ranges = []
|
||||
|
||||
# expand first-last ranges
|
||||
field = None
|
||||
|
|
@ -926,15 +938,15 @@ def __init__(self, version, cjk_check=True):
|
|||
s.name = ""
|
||||
field = dataclasses.astuple(s)[:15]
|
||||
elif s.name[-5:] == "Last>":
|
||||
if s.name.startswith("<CJK Ideograph"):
|
||||
cjk_ranges_found.append((field[0],
|
||||
s.codepoint))
|
||||
for j, (rangename, _) in enumerate(derived_name_range_names):
|
||||
if s.name.startswith("<" + rangename):
|
||||
self.derived_name_ranges.append(
|
||||
(field[0], s.codepoint, j))
|
||||
break
|
||||
s.name = ""
|
||||
field = None
|
||||
elif field:
|
||||
table[i] = from_row(('%X' % i,) + field[1:])
|
||||
if cjk_check and cjk_ranges != cjk_ranges_found:
|
||||
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
|
||||
|
||||
# public attributes
|
||||
self.filename = UNICODE_DATA % ''
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue