mirror of
				https://github.com/python/cpython.git
				synced 2025-10-24 18:33:49 +00:00 
			
		
		
		
	GH-96172 fix unicodedata.east_asian_width being wrong on unassigned code points (#96207)
This commit is contained in:
		
							parent
							
								
									c1581a928c
								
							
						
					
					
						commit
						9c197bc8bf
					
				
					 4 changed files with 686 additions and 620 deletions
				
			
		|  | @ -77,7 +77,8 @@ | |||
|     "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS", | ||||
|     "ON", "LRI", "RLI", "FSI", "PDI" ] | ||||
| 
 | ||||
| EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ] | ||||
| # "N" needs to be the first entry, see the comment in makeunicodedata | ||||
| EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ] | ||||
| 
 | ||||
| MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ] | ||||
| 
 | ||||
|  | @ -135,6 +136,14 @@ def maketables(trace=0): | |||
| 
 | ||||
| def makeunicodedata(unicode, trace): | ||||
| 
 | ||||
|     # the default value of east_asian_width is "N", for unassigned code points | ||||
|     # not mentioned in EastAsianWidth.txt | ||||
|     # in addition there are some reserved but unassigned code points in CJK | ||||
|     # ranges that are classified as "W". code points in private use areas | ||||
|     # have a width of "A". both of these have entries in | ||||
|     # EastAsianWidth.txt | ||||
|     # see https://unicode.org/reports/tr11/#Unassigned | ||||
|     assert EASTASIANWIDTH_NAMES[0] == "N" | ||||
|     dummy = (0, 0, 0, 0, 0, 0) | ||||
|     table = [dummy] | ||||
|     cache = {0: dummy} | ||||
|  | @ -160,12 +169,20 @@ def makeunicodedata(unicode, trace): | |||
|                 category, combining, bidirectional, mirrored, eastasianwidth, | ||||
|                 normalizationquickcheck | ||||
|                 ) | ||||
|             # add entry to index and item tables | ||||
|             i = cache.get(item) | ||||
|             if i is None: | ||||
|                 cache[item] = i = len(table) | ||||
|                 table.append(item) | ||||
|             index[char] = i | ||||
|         elif unicode.widths[char] is not None: | ||||
|             # an unassigned but reserved character, with a known | ||||
|             # east_asian_width | ||||
|             eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char]) | ||||
|             item = (0, 0, 0, 0, eastasianwidth, 0) | ||||
|         else: | ||||
|             continue | ||||
| 
 | ||||
|         # add entry to index and item tables | ||||
|         i = cache.get(item) | ||||
|         if i is None: | ||||
|             cache[item] = i = len(table) | ||||
|             table.append(item) | ||||
|         index[char] = i | ||||
| 
 | ||||
|     # 2) decomposition data | ||||
| 
 | ||||
|  | @ -1085,6 +1102,7 @@ def __init__(self, version, cjk_check=True): | |||
|         for i in range(0, 0x110000): | ||||
|             if table[i] is not None: | ||||
|                 table[i].east_asian_width = widths[i] | ||||
|         self.widths = widths | ||||
| 
 | ||||
|         for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded(): | ||||
|             if table[char]: | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Carl Friedrich Bolz-Tereick
						Carl Friedrich Bolz-Tereick