#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import dataclasses
import os
import sys
import zipfile

from functools import partial
from textwrap import dedent
from typing import Iterator, List, Optional, Set, Tuple

SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst,
#   * Doc/library/unicodedata.rst, and
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "15.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"
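
# The "%s" slot takes an optional version suffix, as used in maketables();
# illustrative sanity checks:
assert UNICODE_DATA % "" == "UnicodeData.txt"
assert UNICODE_DATA % "-3.2.0" == "UnicodeData-3.2.0.txt"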

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200
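
# Illustrative check: both blocks start inside PUA_15, per the comment above.
assert NAME_ALIASES_START in PUA_15 and NAMED_SEQUENCES_START in PUA_15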

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

										 |  |  | # "N" needs to be the first entry, see the comment in makeunicodedata | 
					
						
							|  |  |  | EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ] | 
					
						
							| 
									
										
										
										
											2004-08-04 07:38:35 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-03-30 19:34:18 +00:00
										 |  |  | MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-09-25 21:01:56 +00:00
										 |  |  | # note: should match definitions in Objects/unicodectype.c | 
					
						
							| 
									
										
										
										
											2000-09-25 17:59:57 +00:00
										 |  |  | ALPHA_MASK = 0x01 | 
					
						
							|  |  |  | DECIMAL_MASK = 0x02 | 
					
						
							|  |  |  | DIGIT_MASK = 0x04 | 
					
						
							|  |  |  | LOWER_MASK = 0x08 | 
					
						
							| 
									
										
										
										
											2000-09-25 21:01:56 +00:00
										 |  |  | LINEBREAK_MASK = 0x10 | 
					
						
							| 
									
										
										
										
											2000-09-25 17:59:57 +00:00
										 |  |  | SPACE_MASK = 0x20 | 
					
						
							|  |  |  | TITLE_MASK = 0x40 | 
					
						
							|  |  |  | UPPER_MASK = 0x80 | 
					
						
							| 
									
										
										
										
											2007-08-14 22:37:03 +00:00
										 |  |  | XID_START_MASK = 0x100 | 
					
						
							|  |  |  | XID_CONTINUE_MASK = 0x200 | 
					
						
							| 
									
										
										
										
											2008-07-04 15:55:02 +00:00
										 |  |  | PRINTABLE_MASK = 0x400 | 
					
						
							| 
									
										
										
										
											2012-01-11 18:17:06 -05:00
										 |  |  | NUMERIC_MASK = 0x800 | 
					
						
							|  |  |  | CASE_IGNORABLE_MASK = 0x1000 | 
					
						
							|  |  |  | CASED_MASK = 0x2000 | 
					
						
							|  |  |  | EXTENDED_CASE_MASK = 0x4000 | 
					
						
							| 
									
										
										
										
											2000-09-25 17:59:57 +00:00
										 |  |  | 
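
# Minimal illustration (not used by the generator): the masks are disjoint
# bits, so a single type record can carry several properties at once.
def _example_flags():
    flags = ALPHA_MASK | UPPER_MASK | CASED_MASK  # e.g. a typical "Lu" letter
    assert flags & UPPER_MASK and not flags & LOWER_MASK
    return flags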

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DBF'),    # CJK Ideograph Extension A
    ('4E00', '9FFF'),    # CJK Ideograph
    ('20000', '2A6DF'),  # CJK Ideograph Extension B
    ('2A700', '2B739'),  # CJK Ideograph Extension C
    ('2B740', '2B81D'),  # CJK Ideograph Extension D
    ('2B820', '2CEA1'),  # CJK Ideograph Extension E
    ('2CEB0', '2EBE0'),  # CJK Ideograph Extension F
    ('2EBF0', '2EE5D'),  # CJK Ideograph Extension I
    ('30000', '3134A'),  # CJK Ideograph Extension G
    ('31350', '323AF'),  # CJK Ideograph Extension H
]
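
# Minimal sketch of the membership test these ranges back on the C side
# (unicodedata.c:is_unified_ideograph); the hex bounds are inclusive.
def _example_is_unified_ideograph(code):
    return any(int(first, 16) <= code <= int(last, 16)
               for first, last in cjk_ranges)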


def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)
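
    # merge_old_version() records the per-version differences in
    # unicode.changed, which makeunicodedata() below emits as delta tables.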

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)


# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    # The default value of east_asian_width is "N", for unassigned code
    # points not mentioned in EastAsianWidth.txt.  In addition, some
    # reserved but unassigned code points in CJK ranges are classified as
    # "W", and code points in the private use areas have a width of "A";
    # both of these do have entries in EastAsianWidth.txt.
    # See https://unicode.org/reports/tr11/#Unassigned
    assert EASTASIANWIDTH_NAMES[0] == "N"
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}   # record tuple -> index (seeded so dummy is reused)
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record.general_category)
            combining = int(record.canonical_combining_class)
            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
            mirrored = record.bidi_mirrored == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
            normalizationquickcheck = record.quick_check
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
        elif unicode.widths[char] is not None:
            # an unassigned but reserved character, with a known
            # east_asian_width
            eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
            item = (0, 0, 0, 0, eastasianwidth, 0)
        else:
            continue

        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i
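
    # `table` now holds one row per distinct property tuple, and `index` maps
    # every code point to its row; splitbins() later compresses `index` into
    # the two-level index1/index2 tables written to the header.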

    # 2) decomposition data

    decomp_data_cache = {}
    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record.decomposition_type:
                decomp = record.decomposition_type.split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]].canonical_combining_class == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                key = tuple(decomp)
                i = decomp_data_cache.get(key, -1)
                if i == -1:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
                    decomp_data_cache[key] = i
                else:
                    assert decomp_data[i:i+len(decomp)] == decomp
            else:
                i = 0
            decomp_index[char] = i
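
    # Layout note: each decomp_data entry is a header word (low byte: index
    # into decomp_prefix; bits 8 and up: number of code points) followed by
    # the decomposed code points themselves.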

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char
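
    # comp_data is a dense (total_first x total_last) matrix: the composition
    # of a pair is comp_data[comp_first[f]*total_last + comp_last[l]], with 0
    # meaning "no composition"; splitbins() shrinks it before it is emitted.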

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("  { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("  {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)
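
        # A record lookup in C then takes two loads, the same pattern as in
        # the generated get_change_* helpers below:
        #   i = index1[c >> SHIFT]
        #   record = index2[(i << SHIFT) + (c & ((1 << SHIFT) - 1))]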

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)
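
        # unicode.changed (built by merge_old_version() in maketables) holds
        # one entry per release in old_versions: the property records as they
        # were in that release plus the code points whose normalization
        # differed, so the C side can emulate e.g. UCD 3.2.0.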
        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")


# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}   # seed with dummy so it is not emitted twice (bpo-47243)
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record.general_category
            bidirectional = record.bidi_class
            properties = record.binary_properties
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record.simple_uppercase_mapping:
                upper = int(record.simple_uppercase_mapping, 16)
            else:
                upper = char
            if record.simple_lowercase_mapping:
                lower = int(record.simple_lowercase_mapping, 16)
            else:
                lower = char
            if record.simple_titlecase_mapping:
                title = int(record.simple_titlecase_mapping, 16)
            else:
                title = upper
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than
                # one character in uppercase, lowercase, or titlecase, or when
                # the casefolded version of the character differs from the
                # lowercase.  The extra characters are stored in a separate
                # array, referenced here by a packed offset (bits 0-19) and
                # length (bits 24 and up); for `lower`, bits 20-23 hold the
                # length of the case folding when it differs from sc[0].
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])

            # decimal digit, integer digit
            # (the record field names track UnicodeData.txt column order, so
            # `decomposition_mapping` holds the decimal-digit column here and
            # `numeric_type` the digit column)
            decimal = 0
            if record.decomposition_mapping:
                flags |= DECIMAL_MASK
                decimal = int(record.decomposition_mapping)
            digit = 0
            if record.numeric_type:
                flags |= DIGIT_MASK
                digit = int(record.numeric_type)
            if record.numeric_value:
                flags |= NUMERIC_MASK
                numeric.setdefault(record.numeric_value, []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i
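
    # Each type record is (upper, lower, title, decimal, digit, flags); for
    # simple one-to-one mappings, upper/lower/title are signed deltas from
    # the code point itself, while records with EXTENDED_CASE_MASK store
    # packed references into _PyUnicode_ExtendedCase instead.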

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split the type record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)
 | 
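
        # How the two index arrays are meant to be consulted at runtime,
        # following the splitbins() contract documented below (the reader-side
        # names here are hypothetical, not the actual C macros):
        #
        #     mask = (1 << SHIFT) - 1
        #     record = _PyUnicode_TypeRecords[
        #         index2[(index1[ch >> SHIFT] << SHIFT) + (ch & mask)]]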
					
						
        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()
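
        # Worked example: a numeric_value of "1/4" (e.g. U+00BC VULGAR
        # FRACTION ONE QUARTER) is rewritten to the float literals
        # "1.0/4.0", so the generated C contains roughly:
        #
        #     case 0x00BC:
        #         return (double) 1.0/4.0;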
					
						
        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()
					
						
							|  |  |  | # -------------------------------------------------------------------- | 
					
						
							|  |  |  | # unicode name database | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def makeunicodename(unicode, trace): | 
					
						
							| 
									
										
										
										
											2023-11-04 15:56:58 +01:00
										 |  |  |     from dawg import build_compression_dawg | 
					
						
							| 
									
										
										
										
											2001-01-21 17:01:31 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     FILE = "Modules/unicodename_db.h" | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2007-08-03 17:06:41 +00:00
										 |  |  |     print("--- Preparing", FILE, "...") | 
					
						
							| 
									
										
										
										
											2001-01-21 17:01:31 +00:00
										 |  |  | 
 | 
					
						
    # unicode name database, stored as a DAWG (directed acyclic word graph)

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record.name.strip()
            if name and name[0] != "<":
                data.append((name, char))
					
						
    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("#define NAME_MAXLEN", 256)
        # NAME_MAXLEN bounds the name length, so measure the name in each
        # (name, codepoint) pair rather than the 2-tuple itself
        assert max(len(x[0]) for x in data) < 256
        fprint()
					
						
        fprint("/* name->code dictionary */")
        packed_dawg, pos_to_codepoint = build_compression_dawg(data)
        notfound = len(pos_to_codepoint)
        inverse_list = [notfound] * len(unicode.chars)
        for pos, codepoint in enumerate(pos_to_codepoint):
            inverse_list[codepoint] = pos
        Array("packed_name_dawg", list(packed_dawg)).dump(fp, trace)
        Array("dawg_pos_to_codepoint", pos_to_codepoint).dump(fp, trace)
        index1, index2, shift = splitbins(inverse_list, trace)
        fprint("#define DAWG_CODEPOINT_TO_POS_SHIFT", shift)
        fprint("#define DAWG_CODEPOINT_TO_POS_NOTFOUND", notfound)
        Array("dawg_codepoint_to_pos_index1", index1).dump(fp, trace)
        Array("dawg_codepoint_to_pos_index2", index2).dump(fp, trace)
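
        # Reader-side sketch of the code->name direction (pseudocode only;
        # the real consumer is the C code that includes this header):
        #
        #     shift = DAWG_CODEPOINT_TO_POS_SHIFT
        #     mask = (1 << shift) - 1
        #     pos = dawg_codepoint_to_pos_index2[
        #         (dawg_codepoint_to_pos_index1[cp >> shift] << shift)
        #         + (cp & mask)]
        #     if pos == DAWG_CODEPOINT_TO_POS_NOTFOUND: cp has no name
        #     else: decode the pos-th accepted word of packed_name_dawg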
					
						
        fprint()
        fprint('static const unsigned int aliases_start = %#x;' %
               NAME_ALIASES_START)
        fprint('static const unsigned int aliases_end = %#x;' %
               (NAME_ALIASES_START + len(unicode.aliases)))

        fprint('static const unsigned int name_aliases[] = {')
        for name, codepoint in unicode.aliases:
            fprint('    0x%04X,' % codepoint)
        fprint('};')
					
						
        # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
        # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
        # sequences or sequences with non-BMP chars are added.
        # unicodedata_lookup should be adapted too.
        fprint(dedent("""
            typedef struct NamedSequence {
                int seqlen;
                Py_UCS2 seq[4];
            } named_sequence;
            """))

        fprint('static const unsigned int named_sequences_start = %#x;' %
               NAMED_SEQUENCES_START)
        fprint('static const unsigned int named_sequences_end = %#x;' %
               (NAMED_SEQUENCES_START + len(unicode.named_sequences)))

        fprint('static const named_sequence named_sequences[] = {')
        for name, sequence in unicode.named_sequences:
            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
            fprint('    {%d, {%s}},' % (len(sequence), seq_str))
        fprint('};')
					
						
def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")
					
						
    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change" and
    # -1 means "did not have a numeric value".
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k, field in enumerate(dataclasses.fields(UcdRecord)):
                value = getattr(old.table[i], field.name)
                new_value = getattr(new.table[i], field.name)
                if value != new_value:
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # Since 0 encodes "no change", the old value had better not be 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment; ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet supported
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):
                            pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                        normalization_changes))
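
    # Each zipped change record above covers one code point, in the order
    #     (bidir, category, decimal, mirrored, east_asian_width, numeric).
    # For example, a code point whose only delta is its category carries
    # (0xFF, <new category index>, 0xFF, 0xFF, 0xFF, 0), all other slots
    # keeping their "no change" sentinels.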
					
						
DATA_DIR = os.path.join('Tools', 'unicode', 'data')
					
						
def open_data(template, version):
    local = os.path.join(DATA_DIR, template % ('-'+version,))
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
        else:
            url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        os.makedirs(DATA_DIR, exist_ok=True)
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    else:
        # Unihan.zip
        return open(local, 'rb')
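
# For example (assuming the UNICODE_DATA template defined near the top of
# this script is 'UnicodeData%s.txt'):
#
#     open_data('UnicodeData%s.txt', '15.0.0')
#
# reads Tools/unicode/data/UnicodeData-15.0.0.txt, first downloading it from
# https://www.unicode.org/Public/15.0.0/ucd/UnicodeData.txt if it is missing.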
					
						
def expand_range(char_range: str) -> Iterator[int]:
    '''
    Parses ranges of code points, as described in UAX #44:
      https://www.unicode.org/reports/tr44/#Code_Point_Ranges
    '''
    if '..' in char_range:
        first, last = [int(c, 16) for c in char_range.split('..')]
    else:
        first = last = int(char_range, 16)
    for char in range(first, last+1):
        yield char
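
# A quick illustration of the two accepted forms:
#
#     >>> list(expand_range('0041..0043'))
#     [65, 66, 67]
#     >>> list(expand_range('00E9'))
#     [233]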
					
						
class UcdFile:
    '''
    A file in the standard format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    Note that, as described there, the Unihan data files have their
    own separate format.
    '''

    def __init__(self, template: str, version: str) -> None:
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        with open_data(self.template, self.version) as file:
            for line in file:
                line = line.split('#', 1)[0].strip()
                if not line:
                    continue
                yield [field.strip() for field in line.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()

    def expanded(self) -> Iterator[Tuple[int, List[str]]]:
        for record in self.records():
            char_range, rest = record[0], record[1:]
            for char in expand_range(char_range):
                yield char, rest
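
# Typical use, as elsewhere in this script: iterate a UcdFile directly for
# raw semicolon-separated records, or via .expanded() for one
# (codepoint, fields) pair per code point, e.g.:
#
#     for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
#         widths[char] = width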
					
						
@dataclasses.dataclass
class UcdRecord:
    # 15 fields from UnicodeData.txt.  See:
    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
    # (UnicodeData.txt packs Decomposition_Type and Decomposition_Mapping
    # into a single field, so the attribute names below drift one slot from
    # the UAX #44 field names after that point: decomposition_mapping
    # actually holds the decimal-digit field and numeric_type the digit
    # field, which is what merge_old_version() relies on.)
    codepoint: str
    name: str
    general_category: str
    canonical_combining_class: str
    bidi_class: str
    decomposition_type: str
    decomposition_mapping: str
    numeric_type: str
    numeric_value: str
    bidi_mirrored: str
    unicode_1_name: str  # obsolete
    iso_comment: str  # obsolete
    simple_uppercase_mapping: str
    simple_lowercase_mapping: str
    simple_titlecase_mapping: str

    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
    east_asian_width: Optional[str]

    # Binary properties, as a set of those that are true.
    # Taken from multiple files:
    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
    binary_properties: Set[str]

    # The Quick_Check properties related to normalization:
    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
    # We store them as a bitmask.
    quick_check: int


def from_row(row: List[str]) -> UcdRecord:
    return UcdRecord(*row, None, set(), 0)
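
# For example, the UnicodeData.txt row for U+0061 (quoted from memory):
#
#     0061;LATIN SMALL LETTER A;Ll;0;L;;;;;N;;;0041;;0041
#
# becomes a UcdRecord with general_category='Ll', bidi_class='L' and
# simple_uppercase_mapping='0041'; east_asian_width, binary_properties and
# quick_check stay at their defaults until later passes fill them in.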
					
						
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk
					
						
class UnicodeData:
    # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned

    def __init__(self, version, cjk_check=True):
        self.changed = []
        table = [None] * 0x110000
        for s in UcdFile(UNICODE_DATA, version):
            char = int(s[0], 16)
            table[char] = from_row(s)
					
						
        cjk_ranges_found = []

        # expand first-last ranges
        field = None
        for i in range(0, 0x110000):
            # The file UnicodeData.txt has its own distinct way of
            # expressing ranges.  See:
            #   https://www.unicode.org/reports/tr44/#Code_Point_Ranges
            s = table[i]
            if s:
                if s.name[-6:] == "First>":
                    s.name = ""
                    field = dataclasses.astuple(s)[:15]
                elif s.name[-5:] == "Last>":
                    if s.name.startswith("<CJK Ideograph"):
                        cjk_ranges_found.append((field[0],
                                                 s.codepoint))
                    s.name = ""
                    field = None
            elif field:
                table[i] = from_row(('%X' % i,) + field[1:])
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
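
        # e.g. UnicodeData.txt encodes the main CJK block as just two rows,
        # roughly (the exact Last code point varies by Unicode version):
        #     4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
        #     9FFF;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
        # and the loop above synthesizes one record per code point between.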
					
						
        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2
					
						
        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            for char, name, abbrev in UcdFile(NAME_ALIASES, version):
                char = int(char, 16)
                self.aliases.append((name, char))
                # also store the name in the PUA 1
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            for name, chars in UcdFile(NAMED_SEQUENCES, version):
                chars = tuple(int(char, 16) for char in chars.split())
                # check that the structure defined in makeunicodename is OK
                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                    "the NamedSequence struct and in unicodedata_lookup")
                self.named_sequences.append((name, chars))
                # also store these in the PUA 1
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
					
						
        self.exclusions = {}
        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
            char = int(char, 16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
            widths[char] = width
					
						
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].east_asian_width = widths[i]
        self.widths = widths

        for char, (propname, *propinfo) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
            if propinfo:
                # this is not a binary property, ignore it
                continue

            if table[char]:
                # Some properties (e.g. Default_Ignorable_Code_Point)
                # apply to unassigned code points; ignore them
                table[char].binary_properties.add(propname)

        for char_range, value in UcdFile(LINE_BREAK, version):
            if value not in MANDATORY_LINE_BREAKS:
                continue
            for char in expand_range(char_range):
                table[char].binary_properties.add('Line_Break')
					
						
        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
            if len(s) < 2 or s[1] not in qc_order:
                continue
            quickcheck = 'MN'.index(s[2]) + 1  # Maybe or No
            quickcheck_shift = qc_order.index(s[1])*2
            quickcheck <<= quickcheck_shift
            for char in expand_range(s[0]):
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].quick_check = quickchecks[i]
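
        # Bitmask layout: two bits per property, in qc_order.  Worked
        # example: NFD_QC=No gives 2 in bits 0-1 and NFC_QC=Maybe gives 1
        # in bits 4-5, so such a code point gets 2 | (1 << 4) == 18.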
					
						
        with open_data(UNIHAN, version) as file:
            zf = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zf.open('Unihan-3.2.0.txt').read()
            else:
                data = zf.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i].numeric_value = value
					
						
        sc = self.special_casing = {}
        for data in UcdFile(SPECIAL_CASING, version):
            if data[4]:
                # We ignore all conditionals (since they depend on
                # languages) except for one, which is hardcoded. See
                # handle_capital_sigma in unicodeobject.c.
                continue
            c = int(data[0], 16)
            lower = [int(char, 16) for char in data[1].split()]
            title = [int(char, 16) for char in data[2].split()]
            upper = [int(char, 16) for char in data[3].split()]
            sc[c] = (lower, title, upper)
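
        # e.g. the SpecialCasing.txt row for U+00DF (quoted from memory)
        #     00DF; 00DF; 0053 0073; 0053 0053;
        # yields sc[0xDF] == ([0xDF], [0x53, 0x73], [0x53, 0x53]), i.e.
        # (lower, title, upper) as lists of code points.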
					
						
        cf = self.case_folding = {}
        if version != '3.2.0':
            for data in UcdFile(CASE_FOLDING, version):
                if data[1] in "CF":
                    c = int(data[0], 16)
                    cf[c] = [int(char, 16) for char in data[2].split()]
					
						
    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))
					
						
# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data
					
						
    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static const ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s.rstrip() + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s.rstrip() + "\n")
        file.write("};\n\n")
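
# Array("index1", [0, 1, 1]).dump(fp) emits roughly:
#
#     static const unsigned char index1[] = {
#         0, 1, 1,
#     };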
					
						
def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
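
# e.g. getsize([0, 200]) == 1, getsize([0, 300]) == 2 and
# getsize([70000]) == 4: the bytes per element of the C array
# that Array.dump() will declare.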
					
						
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
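
    # A round-trip check of the contract above (illustrative only):
    #
    #     t = [7] * 100 + [9] * 28
    #     t1, t2, shift = splitbins(t)
    #     mask = (1 << shift) - 1
    #     assert all(t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    #                for i in range(len(t)))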
					
						
    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
					
						
							|  |  |  |     if __debug__: | 
					
						
							|  |  |  |         # exhaustively verify that the decomposition is correct | 
					
						
							|  |  |  |         mask = ~((~0) << shift) # i.e., low-bit mask of shift bits | 
					
						
							| 
									
										
										
										
											2007-05-07 22:24:25 +00:00
										 |  |  |         for i in range(len(t)): | 
					
						
							| 
									
										
										
										
											2000-09-25 07:13:41 +00:00
										 |  |  |             assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] | 
					
						
							|  |  |  |     return best | 
					
						
							| 
									
										
										
										
											2000-09-24 23:18:31 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2019-06-01 21:49:03 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-09-24 23:18:31 +00:00
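
# Illustrative sketch (not executed by the build): how a caller consumes the
# (t1, t2, shift) triple, using the lookup formula from the docstring.  The
# sample table below is hypothetical; any highly repetitive sequence works.
#
#   >>> t = list(range(10)) * 1000          # lots of identical bins
#   >>> t1, t2, shift = splitbins(t)
#   >>> mask = (1 << shift) - 1             # low-bit mask of 'shift' bits
#   >>> all(t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
#   ...     for i in range(len(t)))
#   True

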
										 |  |  | if __name__ == "__main__": | 
					
						
							| 
									
										
										
										
											2000-11-03 20:24:15 +00:00
										 |  |  |     maketables(1) |