mirror of
				https://github.com/python/cpython.git
				synced 2025-11-03 23:21:29 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			96 lines
		
	
	
	
		
			2.7 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
			
		
		
	
	
			96 lines
		
	
	
	
		
			2.7 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
#! /usr/bin/env python3
 | 
						|
# This script generates Lib/re/_casefix.py.
 | 
						|
 | 
						|
import collections
 | 
						|
import sys
 | 
						|
import unicodedata
 | 
						|
 | 
						|
# Path of this script; it is interpolated into the "Auto-generated by"
# header line of the generated file.
SCRIPT_NAME = 'Tools/build/generate_re_casefix.py'
 | 
						|
 | 
						|
def update_file(file, content):
    """Write *content* to *file* unless the file already holds that exact text.

    Returns False when the existing file already matches *content* (nothing
    is written), and True after the file has been (re)written.
    """
    try:
        with open(file, encoding='utf-8') as existing:
            unchanged = existing.read() == content
    except (OSError, ValueError):
        # A missing, unreadable or undecodable file is simply rewritten.
        unchanged = False

    if unchanged:
        return False

    with open(file, 'w', encoding='utf-8') as target:
        target.write(content)
    return True
 | 
						|
 | 
						|
re_casefix_template = f"""\
 | 
						|
# Auto-generated by {SCRIPT_NAME}.
 | 
						|
 | 
						|
# Maps the code of lowercased character to codes of different lowercased
 | 
						|
# characters which have the same uppercase.
 | 
						|
_EXTRA_CASES = {{
 | 
						|
%s
 | 
						|
}}
 | 
						|
"""
 | 
						|
 | 
						|
def uname(i):
    """Return the Unicode name of code point *i*.

    Code points without a name in the Unicode database are rendered as
    'U+XXXX' (uppercase hex, at least four digits) instead.
    """
    fallback = r'U+%04X' % i
    return unicodedata.name(chr(i), fallback)
 | 
						|
 | 
						|
class hexint(int):
    """An int whose repr is lowercase hexadecimal with a '0x' prefix."""

    def __repr__(self):
        # '#06x' zero-pads to six characters in total, prefix included,
        # so values up to 0xffff render as 0xNNNN.
        return format(int(self), '#06x')
 | 
						|
 | 
						|
def alpha(i):
    """Return a printable form of code point *i* for use in comments.

    Alphabetic characters are returned as-is; anything else is returned as
    the body of its ascii() representation (quotes stripped).
    """
    ch = chr(i)
    if ch.isalpha():
        return ch
    return ascii(ch)[1:-1]
 | 
						|
 | 
						|
 | 
						|
def main(outfile='Lib/re/_casefix.py'):
    """Generate the extra case-equivalence table and write it to *outfile*.

    Characters are grouped by their uppercase form; for every group whose
    members lowercase to more than one distinct code point, each lowercase
    code is mapped to the other lowercase codes of its group.  If any such
    group contains a code above 0xffff, the affected characters are listed
    on stderr and the script exits with status 1 without writing anything.
    """
    # Group every code point by its uppercase form.
    by_upper = collections.defaultdict(str)
    for code in range(sys.maxunicode + 1):
        ch = chr(code)
        by_upper[ch.upper()] += ch

    # Sorted lists of lowercase codes that share the same uppercase.
    equivalent_lower_codes = []
    for group in by_upper.values():
        if len(group) <= 1:
            continue
        lower_codes = {ord(ch.lower()) for ch in group}
        if len(lower_codes) > 1:
            equivalent_lower_codes.append(sorted(lower_codes))

    # Collect groups that involve a code outside the 16-bit range, together
    # with the uppercase of the first such code when it is a single character.
    bad_codes = []
    for codes in equivalent_lower_codes:
        wide = next((code for code in codes if code > 0xffff), None)
        if wide is None:
            continue
        bad_codes.extend(codes)
        try:
            bad_codes.append(ord(chr(wide).upper()))
        except (ValueError, TypeError):
            pass

    if bad_codes:
        print('Case-insensitive matching may not work correctly for character:',
              file=sys.stderr)
        for code in sorted(bad_codes):
            print("  '%s' (U+%04x, %s)" % (alpha(code), code, uname(code)),
                  file=sys.stderr)
        sys.exit(1)

    # Map each lowercase code to the other codes of its group.
    mapping = {}
    for codes in equivalent_lower_codes:
        for code in codes:
            mapping[code] = tuple(other for other in codes if other != code)

    # Two output lines per entry: a comment with the character names, then
    # the dictionary item itself with the characters shown in a trailing
    # comment.
    lines = []
    for code, others in sorted(mapping.items()):
        name = uname(code)
        other_names = ', '.join(uname(other) for other in others)
        lines.append('    # %s: %s' % (name, other_names))

        key = hexint(code)
        values = tuple(hexint(other) for other in others)
        shown = alpha(code)
        shown_others = ''.join(alpha(other) for other in others)
        lines.append("    %r: %r, # '%s': '%s'" % (
            key, values, shown, shown_others))

    update_file(outfile, re_casefix_template % '\n'.join(lines))
 | 
						|
 | 
						|
 | 
						|
# Script entry point: an optional first command-line argument overrides the
# default output path of main().  `sys` is already imported at module level,
# so it is not re-imported here.
if __name__ == '__main__':
    main(*sys.argv[1:])
 |