gh-96954: use a directed acyclic word graph for storing the unicodedata codepoint names (#97906)

Co-authored-by: Łukasz Langa <lukasz@langa.pl>
Co-authored-by: Pieter Eendebak <pieter.eendebak@gmail.com>
Co-authored-by: Dennis Sweeney <36520290+sweeneyde@users.noreply.github.com>
This commit is contained in:
CF Bolz-Tereick 2023-11-04 15:56:58 +01:00 committed by GitHub
parent 0e9c364f4a
commit 9573d14215
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 18134 additions and 30444 deletions

View file

@ -623,120 +623,12 @@ def makeunicodetype(unicode, trace):
# unicode name database
def makeunicodename(unicode, trace):
from dawg import build_compression_dawg
FILE = "Modules/unicodename_db.h"
print("--- Preparing", FILE, "...")
# collect names
names = [None] * len(unicode.chars)
for char in unicode.chars:
record = unicode.table[char]
if record:
name = record.name.strip()
if name and name[0] != "<":
names[char] = name + chr(0)
print(len([n for n in names if n is not None]), "distinct names")
# collect unique words from names (note that we distinguish between
# words inside a sentence and words ending a sentence; the latter
# include the trailing null byte)
words = {}
n = b = 0
for char in unicode.chars:
name = names[char]
if name:
w = name.split()
b = b + len(name)
n = n + len(w)
for w in w:
l = words.get(w)
if l:
l.append(None)
else:
words[w] = [len(words)]
print(n, "words in text;", b, "bytes")
wordlist = list(words.items())
# sort on falling frequency, then by name
def word_key(a):
aword, alist = a
return -len(alist), aword
wordlist.sort(key=word_key)
# figure out how many phrasebook escapes we need
escapes = 0
while escapes * 256 < len(wordlist):
escapes = escapes + 1
print(escapes, "escapes")
short = 256 - escapes
assert short > 0
print(short, "short indexes in lexicon")
# statistics
n = 0
for i in range(short):
n = n + len(wordlist[i][1])
print(n, "short indexes in phrasebook")
# pick the most commonly used words, and sort the rest in descending
# order of the word text (intended to maximize overlap when string
# tails are reused while building the lexicon)
wordlist, wordtail = wordlist[:short], wordlist[short:]
wordtail.sort(key=lambda a: a[0], reverse=True)
wordlist.extend(wordtail)
# generate lexicon from words
lexicon_offset = [0]
lexicon = ""
words = {}
# build a lexicon string
offset = 0
for w, x in wordlist:
# encoding: bit 7 indicates last character in word (chr(128)
# indicates the last character in an entire string)
ww = w[:-1] + chr(ord(w[-1])+128)
# reuse string tails, when possible
o = lexicon.find(ww)
if o < 0:
o = offset
lexicon = lexicon + ww
offset = offset + len(w)
words[w] = len(lexicon_offset)
lexicon_offset.append(o)
lexicon = list(map(ord, lexicon))
# generate phrasebook from names and lexicon
phrasebook = [0]
phrasebook_offset = [0] * len(unicode.chars)
for char in unicode.chars:
name = names[char]
if name:
w = name.split()
phrasebook_offset[char] = len(phrasebook)
for w in w:
i = words[w]
if i < short:
phrasebook.append(i)
else:
# store as two bytes
phrasebook.append((i>>8) + short)
phrasebook.append(i&255)
assert getsize(phrasebook) == 1
#
# unicode name hash table
# extract names
@ -748,12 +640,6 @@ def word_key(a):
if name and name[0] != "<":
data.append((name, char))
# the magic number 47 was chosen to minimize the number of
# collisions on the current data set. if you like, change it
# and see what happens...
codehash = Hash("code", data, 47)
print("--- Writing", FILE, "...")
with open(FILE, "w") as fp:
@ -762,24 +648,22 @@ def word_key(a):
fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
fprint()
fprint("#define NAME_MAXLEN", 256)
assert max(len(x) for x in data) < 256
fprint()
fprint("/* lexicon */")
Array("lexicon", lexicon).dump(fp, trace)
Array("lexicon_offset", lexicon_offset).dump(fp, trace)
# split decomposition index table
offset1, offset2, shift = splitbins(phrasebook_offset, trace)
fprint("/* code->name phrasebook */")
fprint("#define phrasebook_shift", shift)
fprint("#define phrasebook_short", short)
Array("phrasebook", phrasebook).dump(fp, trace)
Array("phrasebook_offset1", offset1).dump(fp, trace)
Array("phrasebook_offset2", offset2).dump(fp, trace)
fprint("/* name->code dictionary */")
codehash.dump(fp, trace)
packed_dawg, pos_to_codepoint = build_compression_dawg(data)
notfound = len(pos_to_codepoint)
inverse_list = [notfound] * len(unicode.chars)
for pos, codepoint in enumerate(pos_to_codepoint):
inverse_list[codepoint] = pos
Array("packed_name_dawg", list(packed_dawg)).dump(fp, trace)
Array("dawg_pos_to_codepoint", pos_to_codepoint).dump(fp, trace)
index1, index2, shift = splitbins(inverse_list, trace)
fprint("#define DAWG_CODEPOINT_TO_POS_SHIFT", shift)
fprint("#define DAWG_CODEPOINT_TO_POS_NOTFOUND", notfound)
Array("dawg_codepoint_to_pos_index1", index1).dump(fp, trace)
Array("dawg_codepoint_to_pos_index2", index2).dump(fp, trace)
fprint()
fprint('static const unsigned int aliases_start = %#x;' %
@ -1188,94 +1072,6 @@ def uselatin1(self):
self.chars = list(range(256))
# hash table tools
# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.
def myhash(s, magic):
    """Return a case-insensitive hash of the string *s*.

    The string is upper-cased first, so names differing only in case
    hash identically.  For every character the running value is
    multiplied by *magic* and the character's code point is added.
    Whenever any of bits 24-31 end up set, those eight bits are XORed
    into the low byte and the value is truncated to 24 bits.
    """
    acc = 0
    for ch in s.upper():
        acc = acc * magic + ord(ch)
        overflow = acc & 0xff000000
        if overflow != 0:
            # fold bits 24-31 back into the low byte, keep 24 bits
            acc = (acc ^ ((overflow >> 24) & 0xff)) & 0x00ffffff
    return acc
# Table of (size, poly) pairs consumed by Hash.__init__: the first entry
# whose size is strictly greater than the number of items is selected,
# and the value actually used as the polynomial is size + poly.
SIZES = [
(4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
(1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
(65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
(2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
class Hash:
    """Static hash table built from a list of (key, value) pairs.

    The filled table is wrapped in an Array named ``<name>_hash`` and can
    be written out, together with its parameters, by dump().
    """

    def __init__(self, name, data, magic):
        """Build the table.

        name  -- prefix used for the Array name and the #define names
        data  -- list of (key, value) pairs; keys are hashed with myhash
        magic -- multiplier passed through to myhash

        Raises AssertionError when no entry of SIZES is larger than
        len(data).
        """
        # pick the first table size strictly larger than the item count
        for slots, poly_bits in SIZES:
            if slots > len(data):
                break
        else:
            raise AssertionError("ran out of polynomials")
        poly = slots + poly_bits

        print(slots, "slots in hash table")

        table = [None] * slots
        mask = slots - 1
        collisions = 0

        # place every value in the first free slot of its probe sequence
        for key, value in data:
            digest = myhash(key, magic)
            slot = (~digest) & mask
            if table[slot] is not None:
                # initial slot is taken: derive a non-zero step and probe
                step = ((digest ^ (digest >> 3)) & mask) or mask
                while True:
                    collisions += 1
                    slot = (slot + step) & mask
                    if table[slot] is None:
                        break
                    step <<= 1
                    if step > mask:
                        step ^= poly
            table[slot] = value

        print(collisions, "collisions")
        self.collisions = collisions

        # unused slots are emitted as 0
        table = [0 if entry is None else entry for entry in table]

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = slots
        self.poly = poly

    def dump(self, file, trace):
        """Write the table to *file*, followed by #define lines for the
        magic, size and poly parameters."""
        self.data.dump(file, trace)
        defines = (
            ("#define %s_magic %d\n", self.magic),
            ("#define %s_size %d\n", self.size),
            ("#define %s_poly %d\n", self.poly),
        )
        for template, number in defines:
            file.write(template % (self.name, number))
# stuff to deal with arrays of unsigned integers