mirror of
				https://github.com/python/cpython.git
				synced 2025-10-23 01:43:53 +00:00 
			
		
		
		
	bpo-37760: Factor out the basic UCD parsing logic of makeunicodedata. (GH-15130)
There were 10 copies of this, and almost as many distinct versions of exactly how it was written. They're all implementing the same standard. Pull them out to the top, so the more interesting logic that remains becomes easier to read.
This commit is contained in:
		
							parent
							
								
									66a34d35e4
								
							
						
					
					
						commit
						ef2af1ad44
					
				
					 1 changed file with 108 additions and 132 deletions
				
			
		|  | @ -30,8 +30,9 @@ | ||||||
| import sys | import sys | ||||||
| import zipfile | import zipfile | ||||||
| 
 | 
 | ||||||
| from textwrap import dedent |  | ||||||
| from functools import partial | from functools import partial | ||||||
|  | from textwrap import dedent | ||||||
|  | from typing import * | ||||||
| 
 | 
 | ||||||
| SCRIPT = sys.argv[0] | SCRIPT = sys.argv[0] | ||||||
| VERSION = "3.3" | VERSION = "3.3" | ||||||
|  | @ -903,6 +904,32 @@ def open_data(template, version): | ||||||
|         return open(local, 'rb') |         return open(local, 'rb') | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | class UcdFile: | ||||||
|  |     ''' | ||||||
|  |     A file in the standard format of the UCD. | ||||||
|  | 
 | ||||||
|  |     See: https://www.unicode.org/reports/tr44/#Format_Conventions | ||||||
|  | 
 | ||||||
|  |     Note that, as described there, the Unihan data files have their | ||||||
|  |     own separate format. | ||||||
|  |     ''' | ||||||
|  | 
 | ||||||
|  |     def __init__(self, template: str, version: str) -> None: | ||||||
|  |         self.template = template | ||||||
|  |         self.version = version | ||||||
|  | 
 | ||||||
|  |     def records(self) -> Iterator[List[str]]: | ||||||
|  |         with open_data(self.template, self.version) as file: | ||||||
|  |             for line in file: | ||||||
|  |                 line = line.split('#', 1)[0].strip() | ||||||
|  |                 if not line: | ||||||
|  |                     continue | ||||||
|  |                 yield [field.strip() for field in line.split(';')] | ||||||
|  | 
 | ||||||
|  |     def __iter__(self) -> Iterator[List[str]]: | ||||||
|  |         return self.records() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| # -------------------------------------------------------------------- | # -------------------------------------------------------------------- | ||||||
| # the following support code is taken from the unidb utilities | # the following support code is taken from the unidb utilities | ||||||
| # Copyright (c) 1999-2000 by Secret Labs AB | # Copyright (c) 1999-2000 by Secret Labs AB | ||||||
|  | @ -922,14 +949,9 @@ def __init__(self, version, | ||||||
|                  cjk_check=True): |                  cjk_check=True): | ||||||
|         self.changed = [] |         self.changed = [] | ||||||
|         table = [None] * 0x110000 |         table = [None] * 0x110000 | ||||||
|         with open_data(UNICODE_DATA, version) as file: |         for s in UcdFile(UNICODE_DATA, version): | ||||||
|             while 1: |             char = int(s[0], 16) | ||||||
|                 s = file.readline() |             table[char] = s | ||||||
|                 if not s: |  | ||||||
|                     break |  | ||||||
|                 s = s.strip().split(";") |  | ||||||
|                 char = int(s[0], 16) |  | ||||||
|                 table[char] = s |  | ||||||
| 
 | 
 | ||||||
|         cjk_ranges_found = [] |         cjk_ranges_found = [] | ||||||
| 
 | 
 | ||||||
|  | @ -968,17 +990,12 @@ def __init__(self, version, | ||||||
|             # in order to take advantage of the compression and lookup |             # in order to take advantage of the compression and lookup | ||||||
|             # algorithms used for the other characters |             # algorithms used for the other characters | ||||||
|             pua_index = NAME_ALIASES_START |             pua_index = NAME_ALIASES_START | ||||||
|             with open_data(NAME_ALIASES, version) as file: |             for char, name, abbrev in UcdFile(NAME_ALIASES, version): | ||||||
|                 for s in file: |                 char = int(char, 16) | ||||||
|                     s = s.strip() |                 self.aliases.append((name, char)) | ||||||
|                     if not s or s.startswith('#'): |                 # also store the name in the PUA 1 | ||||||
|                         continue |                 self.table[pua_index][1] = name | ||||||
|                     char, name, abbrev = s.split(';') |                 pua_index += 1 | ||||||
|                     char = int(char, 16) |  | ||||||
|                     self.aliases.append((name, char)) |  | ||||||
|                     # also store the name in the PUA 1 |  | ||||||
|                     self.table[pua_index][1] = name |  | ||||||
|                     pua_index += 1 |  | ||||||
|             assert pua_index - NAME_ALIASES_START == len(self.aliases) |             assert pua_index - NAME_ALIASES_START == len(self.aliases) | ||||||
| 
 | 
 | ||||||
|             self.named_sequences = [] |             self.named_sequences = [] | ||||||
|  | @ -988,50 +1005,32 @@ def __init__(self, version, | ||||||
| 
 | 
 | ||||||
|             assert pua_index < NAMED_SEQUENCES_START |             assert pua_index < NAMED_SEQUENCES_START | ||||||
|             pua_index = NAMED_SEQUENCES_START |             pua_index = NAMED_SEQUENCES_START | ||||||
|             with open_data(NAMED_SEQUENCES, version) as file: |             for name, chars in UcdFile(NAMED_SEQUENCES, version): | ||||||
|                 for s in file: |                 chars = tuple(int(char, 16) for char in chars.split()) | ||||||
|                     s = s.strip() |                 # check that the structure defined in makeunicodename is OK | ||||||
|                     if not s or s.startswith('#'): |                 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size" | ||||||
|                         continue |                 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in " | ||||||
|                     name, chars = s.split(';') |                     "the NamedSequence struct and in unicodedata_lookup") | ||||||
|                     chars = tuple(int(char, 16) for char in chars.split()) |                 self.named_sequences.append((name, chars)) | ||||||
|                     # check that the structure defined in makeunicodename is OK |                 # also store these in the PUA 1 | ||||||
|                     assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size" |                 self.table[pua_index][1] = name | ||||||
|                     assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in " |                 pua_index += 1 | ||||||
|                         "the NamedSequence struct and in unicodedata_lookup") |  | ||||||
|                     self.named_sequences.append((name, chars)) |  | ||||||
|                     # also store these in the PUA 1 |  | ||||||
|                     self.table[pua_index][1] = name |  | ||||||
|                     pua_index += 1 |  | ||||||
|             assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences) |             assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences) | ||||||
| 
 | 
 | ||||||
|         self.exclusions = {} |         self.exclusions = {} | ||||||
|         with open_data(COMPOSITION_EXCLUSIONS, version) as file: |         for char, in UcdFile(COMPOSITION_EXCLUSIONS, version): | ||||||
|             for s in file: |             char = int(char, 16) | ||||||
|                 s = s.strip() |             self.exclusions[char] = 1 | ||||||
|                 if not s: |  | ||||||
|                     continue |  | ||||||
|                 if s[0] == '#': |  | ||||||
|                     continue |  | ||||||
|                 char = int(s.split()[0],16) |  | ||||||
|                 self.exclusions[char] = 1 |  | ||||||
| 
 | 
 | ||||||
|         widths = [None] * 0x110000 |         widths = [None] * 0x110000 | ||||||
|         with open_data(EASTASIAN_WIDTH, version) as file: |         for s in UcdFile(EASTASIAN_WIDTH, version): | ||||||
|             for s in file: |             if '..' in s[0]: | ||||||
|                 s = s.strip() |                 first, last = [int(c, 16) for c in s[0].split('..')] | ||||||
|                 if not s: |                 chars = list(range(first, last+1)) | ||||||
|                     continue |             else: | ||||||
|                 if s[0] == '#': |                 chars = [int(s[0], 16)] | ||||||
|                     continue |             for char in chars: | ||||||
|                 s = s.split()[0].split(';') |                 widths[char] = s[1] | ||||||
|                 if '..' in s[0]: |  | ||||||
|                     first, last = [int(c, 16) for c in s[0].split('..')] |  | ||||||
|                     chars = list(range(first, last+1)) |  | ||||||
|                 else: |  | ||||||
|                     chars = [int(s[0], 16)] |  | ||||||
|                 for char in chars: |  | ||||||
|                     widths[char] = s[1] |  | ||||||
| 
 | 
 | ||||||
|         for i in range(0, 0x110000): |         for i in range(0, 0x110000): | ||||||
|             if table[i] is not None: |             if table[i] is not None: | ||||||
|  | @ -1041,38 +1040,27 @@ def __init__(self, version, | ||||||
|             if table[i] is not None: |             if table[i] is not None: | ||||||
|                 table[i].append(set()) |                 table[i].append(set()) | ||||||
| 
 | 
 | ||||||
|         with open_data(DERIVED_CORE_PROPERTIES, version) as file: |         for r, p in UcdFile(DERIVED_CORE_PROPERTIES, version): | ||||||
|             for s in file: |             if ".." in r: | ||||||
|                 s = s.split('#', 1)[0].strip() |                 first, last = [int(c, 16) for c in r.split('..')] | ||||||
|                 if not s: |                 chars = list(range(first, last+1)) | ||||||
|                     continue |             else: | ||||||
|  |                 chars = [int(r, 16)] | ||||||
|  |             for char in chars: | ||||||
|  |                 if table[char]: | ||||||
|  |                     # Some properties (e.g. Default_Ignorable_Code_Point) | ||||||
|  |                     # apply to unassigned code points; ignore them | ||||||
|  |                     table[char][-1].add(p) | ||||||
| 
 | 
 | ||||||
|                 r, p = s.split(";") |         for s in UcdFile(LINE_BREAK, version): | ||||||
|                 r = r.strip() |             if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS: | ||||||
|                 p = p.strip() |                 continue | ||||||
|                 if ".." in r: |             if '..' not in s[0]: | ||||||
|                     first, last = [int(c, 16) for c in r.split('..')] |                 first = last = int(s[0], 16) | ||||||
|                     chars = list(range(first, last+1)) |             else: | ||||||
|                 else: |                 first, last = [int(c, 16) for c in s[0].split('..')] | ||||||
|                     chars = [int(r, 16)] |             for char in range(first, last+1): | ||||||
|                 for char in chars: |                 table[char][-1].add('Line_Break') | ||||||
|                     if table[char]: |  | ||||||
|                         # Some properties (e.g. Default_Ignorable_Code_Point) |  | ||||||
|                         # apply to unassigned code points; ignore them |  | ||||||
|                         table[char][-1].add(p) |  | ||||||
| 
 |  | ||||||
|         with open_data(LINE_BREAK, version) as file: |  | ||||||
|             for s in file: |  | ||||||
|                 s = s.partition('#')[0] |  | ||||||
|                 s = [i.strip() for i in s.split(';')] |  | ||||||
|                 if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS: |  | ||||||
|                     continue |  | ||||||
|                 if '..' not in s[0]: |  | ||||||
|                     first = last = int(s[0], 16) |  | ||||||
|                 else: |  | ||||||
|                     first, last = [int(c, 16) for c in s[0].split('..')] |  | ||||||
|                 for char in range(first, last+1): |  | ||||||
|                     table[char][-1].add('Line_Break') |  | ||||||
| 
 | 
 | ||||||
|         # We only want the quickcheck properties |         # We only want the quickcheck properties | ||||||
|         # Format: NF?_QC; Y(es)/N(o)/M(aybe) |         # Format: NF?_QC; Y(es)/N(o)/M(aybe) | ||||||
|  | @ -1083,23 +1071,19 @@ def __init__(self, version, | ||||||
|         # for older versions, and no delta records will be created. |         # for older versions, and no delta records will be created. | ||||||
|         quickchecks = [0] * 0x110000 |         quickchecks = [0] * 0x110000 | ||||||
|         qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split() |         qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split() | ||||||
|         with open_data(DERIVEDNORMALIZATION_PROPS, version) as file: |         for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version): | ||||||
|             for s in file: |             if len(s) < 2 or s[1] not in qc_order: | ||||||
|                 if '#' in s: |                 continue | ||||||
|                     s = s[:s.index('#')] |             quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No | ||||||
|                 s = [i.strip() for i in s.split(';')] |             quickcheck_shift = qc_order.index(s[1])*2 | ||||||
|                 if len(s) < 2 or s[1] not in qc_order: |             quickcheck <<= quickcheck_shift | ||||||
|                     continue |             if '..' not in s[0]: | ||||||
|                 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No |                 first = last = int(s[0], 16) | ||||||
|                 quickcheck_shift = qc_order.index(s[1])*2 |             else: | ||||||
|                 quickcheck <<= quickcheck_shift |                 first, last = [int(c, 16) for c in s[0].split('..')] | ||||||
|                 if '..' not in s[0]: |             for char in range(first, last+1): | ||||||
|                     first = last = int(s[0], 16) |                 assert not (quickchecks[char]>>quickcheck_shift)&3 | ||||||
|                 else: |                 quickchecks[char] |= quickcheck | ||||||
|                     first, last = [int(c, 16) for c in s[0].split('..')] |  | ||||||
|                 for char in range(first, last+1): |  | ||||||
|                     assert not (quickchecks[char]>>quickcheck_shift)&3 |  | ||||||
|                     quickchecks[char] |= quickcheck |  | ||||||
|         for i in range(0, 0x110000): |         for i in range(0, 0x110000): | ||||||
|             if table[i] is not None: |             if table[i] is not None: | ||||||
|                 table[i].append(quickchecks[i]) |                 table[i].append(quickchecks[i]) | ||||||
|  | @ -1122,34 +1106,26 @@ def __init__(self, version, | ||||||
|             # Patch the numeric field |             # Patch the numeric field | ||||||
|             if table[i] is not None: |             if table[i] is not None: | ||||||
|                 table[i][8] = value |                 table[i][8] = value | ||||||
|  | 
 | ||||||
|         sc = self.special_casing = {} |         sc = self.special_casing = {} | ||||||
|         with open_data(SPECIAL_CASING, version) as file: |         for data in UcdFile(SPECIAL_CASING, version): | ||||||
|             for s in file: |             if data[4]: | ||||||
|                 s = s[:-1].split('#', 1)[0] |                 # We ignore all conditionals (since they depend on | ||||||
|                 if not s: |                 # languages) except for one, which is hardcoded. See | ||||||
|                     continue |                 # handle_capital_sigma in unicodeobject.c. | ||||||
|                 data = s.split("; ") |                 continue | ||||||
|                 if data[4]: |             c = int(data[0], 16) | ||||||
|                     # We ignore all conditionals (since they depend on |             lower = [int(char, 16) for char in data[1].split()] | ||||||
|                     # languages) except for one, which is hardcoded. See |             title = [int(char, 16) for char in data[2].split()] | ||||||
|                     # handle_capital_sigma in unicodeobject.c. |             upper = [int(char, 16) for char in data[3].split()] | ||||||
|                     continue |             sc[c] = (lower, title, upper) | ||||||
|                 c = int(data[0], 16) | 
 | ||||||
|                 lower = [int(char, 16) for char in data[1].split()] |  | ||||||
|                 title = [int(char, 16) for char in data[2].split()] |  | ||||||
|                 upper = [int(char, 16) for char in data[3].split()] |  | ||||||
|                 sc[c] = (lower, title, upper) |  | ||||||
|         cf = self.case_folding = {} |         cf = self.case_folding = {} | ||||||
|         if version != '3.2.0': |         if version != '3.2.0': | ||||||
|             with open_data(CASE_FOLDING, version) as file: |             for data in UcdFile(CASE_FOLDING, version): | ||||||
|                 for s in file: |                 if data[1] in "CF": | ||||||
|                     s = s[:-1].split('#', 1)[0] |                     c = int(data[0], 16) | ||||||
|                     if not s: |                     cf[c] = [int(char, 16) for char in data[2].split()] | ||||||
|                         continue |  | ||||||
|                     data = s.split("; ") |  | ||||||
|                     if data[1] in "CF": |  | ||||||
|                         c = int(data[0], 16) |  | ||||||
|                         cf[c] = [int(char, 16) for char in data[2].split()] |  | ||||||
| 
 | 
 | ||||||
|     def uselatin1(self): |     def uselatin1(self): | ||||||
|         # restrict character range to ISO Latin 1 |         # restrict character range to ISO Latin 1 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Greg Price
						Greg Price