mirror of https://github.com/python/cpython.git (synced 2025-10-31 13:41:24 +00:00)
commit fad27aee11
    ranges) -- but thanks to the 2.0 compression scheme, this doesn't add
    a single byte to the resulting binaries (!) Closes bug #117524
389 lines · 12 KiB · Python

#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields
# 2000-11-03 fl   expand first/last ranges
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#

import sys

SCRIPT = sys.argv[0]
VERSION = "1.1"

UNICODE_DATA = "UnicodeData-Latest.txt"

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
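
# for example, a titlecased letter such as U+01C5 (category "Lt") ends up
# with flags == ALPHA_MASK | TITLE_MASK == 0x41 in the type loop below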

def maketables(trace=0):

    unicode = UnicodeData(UNICODE_DATA)

    print "--- Processing", UNICODE_DATA, "..."
    print len(filter(None, unicode.table)), "characters"

    # extract unicode properties
    dummy = (0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    # 1) database properties
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            item = (
                category, combining, bidirectional, mirrored
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

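    # the cache above collapses identical records: 'A' (U+0041) and
    # 'B' (U+0042) both reduce to (Lu, 0, L, not mirrored), so they share
    # a single slot in table and differ only in their index entries
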
    # 2) decomposition data

    # FIXME: <fl> using the encoding stuff from unidb would save
    # another 50k or so, but I'll leave that for 2.1...

    decomp_data = [""]
    decomp_index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                try:
                    i = decomp_data.index(record[5])
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.append(record[5])
            else:
                i = 0
            decomp_index[char] = i
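    # record[5] is the raw decomposition field from UnicodeData.txt, e.g.
    # "0041 0300" for U+00C0 (A followed by a combining grave); identical
    # strings are stored once and shared through decomp_index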

    FILE = "Modules/unicodedata_db.h"

    print "--- Writing", FILE, "..."

    print len(table), "unique properties"
    print len(decomp_data), "unique decomposition entries"

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # FIXME: the following tables should be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_data[] = {"
    for name in decomp_data:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp)
    Array("index2", index2).dump(fp)
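
    # sketch of how the generated pair of tables is consumed (assuming the
    # standard two-level lookup that matches splitbins below; the actual
    # consumer lives in Modules/unicodedatabase.c):
    #
    #     index = index1[(code >> SHIFT)];
    #     index = index2[(index << SHIFT) + (code & ((1 << SHIFT) - 1))];
    #     record = &_PyUnicode_Database_Records[index];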

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp)
    Array("decomp_index2", index2).dump(fp)

    #
    # 3) unicode type data

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
            if record[12]:
                upper = (int(record[12], 16) - char) & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = (int(record[13], 16) - char) & 0xffff
            else:
                lower = 0
            if record[14]:
                title = (int(record[14], 16) - char) & 0xffff
            else:
                title = 0
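            # worked example of the delta predictor: for 'a' (U+0061) the
            # uppercase field record[12] is "0041", so the stored delta is
            # (0x41 - 0x61) & 0xffff == 0xffe0, and a consumer recovers the
            # mapping as (0x61 + 0xffe0) & 0xffff == 0x41; storing deltas
            # lets long alphabet runs share a single type record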
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                flags, upper, lower, title, decimal, digit
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    FILE = "Objects/unicodetype_db.h"

    fp = open(FILE, "w")

    print "--- Writing", FILE, "..."

    print len(table), "unique character type entries"

    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split type record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp)
    Array("index2", index2).dump(fp)

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import string, sys

class UnicodeData:

    def __init__(self, filename, expand=1):
        file = open(filename)
        table = [None] * 65536
        while 1:
            s = file.readline()
            if not s:
                break
            s = string.split(string.strip(s), ";")
            char = string.atoi(s[0], 16)
            table[char] = s

        # expand first-last ranges (ignore surrogates and private use)
        if expand:
            field = None
            for i in range(0, 0xD800):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s[:]
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    field[0] = hex(i)
                    table[i] = field
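
        # for illustration (not part of this file), a First/Last pair in
        # UnicodeData.txt looks like:
        #     4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
        #     9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
        # the loop above copies the First record onto every code point
        # strictly between the two sentinels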

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file):
        # write data to file, as a C array
        size = getsize(self.data)
        # print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if string.strip(s):
                file.write(s + "\n")
        file.write("};\n\n")

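# for instance, Array("index1", [0, 0, 1]).dump(fp) emits:
#
#     static unsigned char index1[] = {
#         0, 0, 1,
#     };
#
# with the element type chosen by getsize() below
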
def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
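
# e.g. getsize([0, 255]) == 1 (unsigned char), getsize([256]) == 2
# (unsigned short), and values of 65536 or more take 4 (unsigned int)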

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
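
    # worked example (illustrative only): for t = [7]*8 + [9]*8, splitting
    # at shift=2 dedupes the two repeated 4-element bins, giving
    #     t1 = [0, 0, 1, 1];  t2 = [7, 7, 7, 7, 9, 9, 9, 9]
    # so that e.g. t[10] == t2[(t1[10 >> 2] << 2) + (10 & 3)] == t2[6] == 9;
    # the loop below tries every shift and keeps the cheapest split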

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)