mirror of
				https://github.com/python/cpython.git
				synced 2025-11-04 07:31:38 +00:00 
			
		
		
		
	
		
			
	
	
		
			150 lines
		
	
	
	
		
			5.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			150 lines
		
	
	
	
		
			5.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# genmap_schinese.py: Simplified Chinese Codecs Map Generator
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								# Original Author:  Hye-Shik Chang <perky@FreeBSD.org>
							 | 
						||
| 
								 | 
							
								# Modified Author:  Dong-hee Na <donghee.na92@gmail.com>
							 | 
						||
| 
								 | 
							
								#
							 | 
						||
| 
								 | 
							
								import os
							 | 
						||
| 
								 | 
							
								import re
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								from genmap_support import *
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								GB2312_C1   = (0x21, 0x7e)
							 | 
						||
| 
								 | 
							
								GB2312_C2   = (0x21, 0x7e)
							 | 
						||
| 
								 | 
							
								GBKL1_C1    = (0x81, 0xa8)
							 | 
						||
| 
								 | 
							
								GBKL1_C2    = (0x40, 0xfe)
							 | 
						||
| 
								 | 
							
								GBKL2_C1    = (0xa9, 0xfe)
							 | 
						||
| 
								 | 
							
								GBKL2_C2    = (0x40, 0xa0)
							 | 
						||
| 
								 | 
							
								GB18030EXTP1_C1 = (0xa1, 0xa9)
							 | 
						||
| 
								 | 
							
								GB18030EXTP1_C2 = (0x40, 0xfe)
							 | 
						||
| 
								 | 
							
								GB18030EXTP2_C1 = (0xaa, 0xaf)
							 | 
						||
| 
								 | 
							
								GB18030EXTP2_C2 = (0xa1, 0xfe)
							 | 
						||
| 
								 | 
							
								GB18030EXTP3_C1 = (0xd7, 0xd7)
							 | 
						||
| 
								 | 
							
								GB18030EXTP3_C2 = (0xfa, 0xfe)
							 | 
						||
| 
								 | 
							
								GB18030EXTP4_C1 = (0xf8, 0xfd)
							 | 
						||
| 
								 | 
							
								GB18030EXTP4_C2 = (0xa1, 0xfe)
							 | 
						||
| 
								 | 
							
								GB18030EXTP5_C1 = (0xfe, 0xfe)
							 | 
						||
| 
								 | 
							
								GB18030EXTP5_C2 = (0x50, 0xfe)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								MAPPINGS_GB2312 = 'http://people.freebsd.org/~perky/i18n/GB2312.TXT'
							 | 
						||
| 
								 | 
							
								MAPPINGS_CP936 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT'
							 | 
						||
| 
								 | 
							
								MAPPINGS_GB18030 = 'http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/gb-18030-2000.xml'
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								re_gb18030ass = re.compile('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>')
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								def parse_gb18030map(fo):
							 | 
						||
| 
								 | 
							
								    m, gbuni = {}, {}
							 | 
						||
| 
								 | 
							
								    for i in range(65536):
							 | 
						||
| 
								 | 
							
								        if i < 0xd800 or i > 0xdfff: # exclude unicode surrogate area
							 | 
						||
| 
								 | 
							
								            gbuni[i] = None
							 | 
						||
| 
								 | 
							
								    for uni, native in re_gb18030ass.findall(fo.read()):
							 | 
						||
| 
								 | 
							
								        uni = eval('0x'+uni)
							 | 
						||
| 
								 | 
							
								        native = [eval('0x'+u) for u in native.split()]
							 | 
						||
| 
								 | 
							
								        if len(native) <= 2:
							 | 
						||
| 
								 | 
							
								            del gbuni[uni]
							 | 
						||
| 
								 | 
							
								        if len(native) == 2: # we can decode algorithmically for 1 or 4 bytes
							 | 
						||
| 
								 | 
							
								            m.setdefault(native[0], {})
							 | 
						||
| 
								 | 
							
								            m[native[0]][native[1]] = uni
							 | 
						||
| 
								 | 
							
								    gbuni = [k for k in gbuni.keys()]
							 | 
						||
| 
								 | 
							
								    gbuni.sort()
							 | 
						||
| 
								 | 
							
								    return m, gbuni
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								def main():
							 | 
						||
| 
								 | 
							
								    print("Loading Mapping File...")
							 | 
						||
| 
								 | 
							
								    gb2312map = open_mapping_file('python-mappings/GB2312.TXT', MAPPINGS_GB2312)
							 | 
						||
| 
								 | 
							
								    cp936map = open_mapping_file('python-mappings/CP936.TXT', MAPPINGS_CP936)
							 | 
						||
| 
								 | 
							
								    gb18030map = open_mapping_file('python-mappings/gb-18030-2000.xml', MAPPINGS_GB18030)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    gb18030decmap, gb18030unilinear = parse_gb18030map(gb18030map)
							 | 
						||
| 
								 | 
							
								    gbkdecmap = loadmap(cp936map)
							 | 
						||
| 
								 | 
							
								    gb2312decmap = loadmap(gb2312map)
							 | 
						||
| 
								 | 
							
								    difmap = {}
							 | 
						||
| 
								 | 
							
								    for c1, m in gbkdecmap.items():
							 | 
						||
| 
								 | 
							
								        for c2, code in m.items():
							 | 
						||
| 
								 | 
							
								            del gb18030decmap[c1][c2]
							 | 
						||
| 
								 | 
							
								            if not gb18030decmap[c1]:
							 | 
						||
| 
								 | 
							
								                del gb18030decmap[c1]
							 | 
						||
| 
								 | 
							
								    for c1, m in gb2312decmap.items():
							 | 
						||
| 
								 | 
							
								        for c2, code in m.items():
							 | 
						||
| 
								 | 
							
								            gbkc1, gbkc2 = c1 | 0x80, c2 | 0x80
							 | 
						||
| 
								 | 
							
								            if gbkdecmap[gbkc1][gbkc2] == code:
							 | 
						||
| 
								 | 
							
								                del gbkdecmap[gbkc1][gbkc2]
							 | 
						||
| 
								 | 
							
								                if not gbkdecmap[gbkc1]:
							 | 
						||
| 
								 | 
							
								                    del gbkdecmap[gbkc1]
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    gb2312_gbkencmap, gb18030encmap = {}, {}
							 | 
						||
| 
								 | 
							
								    for c1, m in gbkdecmap.items():
							 | 
						||
| 
								 | 
							
								        for c2, code in m.items():
							 | 
						||
| 
								 | 
							
								            gb2312_gbkencmap.setdefault(code >> 8, {})
							 | 
						||
| 
								 | 
							
								            gb2312_gbkencmap[code >> 8][code & 0xff] = c1 << 8 | c2 # MSB set
							 | 
						||
| 
								 | 
							
								    for c1, m in gb2312decmap.items():
							 | 
						||
| 
								 | 
							
								        for c2, code in m.items():
							 | 
						||
| 
								 | 
							
								            gb2312_gbkencmap.setdefault(code >> 8, {})
							 | 
						||
| 
								 | 
							
								            gb2312_gbkencmap[code >> 8][code & 0xff] = c1 << 8 | c2 # MSB unset
							 | 
						||
| 
								 | 
							
								    for c1, m in gb18030decmap.items():
							 | 
						||
| 
								 | 
							
								        for c2, code in m.items():
							 | 
						||
| 
								 | 
							
								            gb18030encmap.setdefault(code >> 8, {})
							 | 
						||
| 
								 | 
							
								            gb18030encmap[code >> 8][code & 0xff] = c1 << 8 | c2
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    with open('mappings_cn.h', 'w') as fp:
							 | 
						||
| 
								 | 
							
								        print_autogen(fp, os.path.basename(__file__))
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        print("Generating GB2312 decode map...")
							 | 
						||
| 
								 | 
							
								        writer = DecodeMapWriter(fp, "gb2312", gb2312decmap)
							 | 
						||
| 
								 | 
							
								        writer.update_decode_map(GB2312_C1, GB2312_C2)
							 | 
						||
| 
								 | 
							
								        writer.generate()
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        print("Generating GBK decode map...")
							 | 
						||
| 
								 | 
							
								        writer = DecodeMapWriter(fp, "gbkext", gbkdecmap)
							 | 
						||
| 
								 | 
							
								        writer.update_decode_map(GBKL1_C1, GBKL1_C2)
							 | 
						||
| 
								 | 
							
								        writer.update_decode_map(GBKL2_C1, GBKL2_C2)
							 | 
						||
| 
								 | 
							
								        writer.generate()
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        print("Generating GB2312 && GBK encode map...")
							 | 
						||
| 
								 | 
							
								        writer = EncodeMapWriter(fp, "gbcommon", gb2312_gbkencmap)
							 | 
						||
| 
								 | 
							
								        writer.generate()
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        print("Generating GB18030 extension decode map...")
							 | 
						||
| 
								 | 
							
								        writer = DecodeMapWriter(fp, "gb18030ext", gb18030decmap)
							 | 
						||
| 
								 | 
							
								        for i in range(1, 6):
							 | 
						||
| 
								 | 
							
								            writer.update_decode_map(eval("GB18030EXTP%d_C1" % i), eval("GB18030EXTP%d_C2" % i))
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        writer.generate()
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        print("Generating GB18030 extension encode map...")
							 | 
						||
| 
								 | 
							
								        writer = EncodeMapWriter(fp, "gb18030ext", gb18030encmap)
							 | 
						||
| 
								 | 
							
								        writer.generate()
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        print("Generating GB18030 Unicode BMP Mapping Ranges...")
							 | 
						||
| 
								 | 
							
								        ranges = [[-1, -1, -1]]
							 | 
						||
| 
								 | 
							
								        gblinnum = 0
							 | 
						||
| 
								 | 
							
								        fp.write("""
							 | 
						||
| 
								 | 
							
								static const struct _gb18030_to_unibmp_ranges {
							 | 
						||
| 
								 | 
							
								    Py_UCS4   first, last;
							 | 
						||
| 
								 | 
							
								    DBCHAR       base;
							 | 
						||
| 
								 | 
							
								} gb18030_to_unibmp_ranges[] = {
							 | 
						||
| 
								 | 
							
								""")
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        for uni in gb18030unilinear:
							 | 
						||
| 
								 | 
							
								            if uni == ranges[-1][1] + 1:
							 | 
						||
| 
								 | 
							
								                ranges[-1][1] = uni
							 | 
						||
| 
								 | 
							
								            else:
							 | 
						||
| 
								 | 
							
								                ranges.append([uni, uni, gblinnum])
							 | 
						||
| 
								 | 
							
								            gblinnum += 1
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        filler = BufferedFiller()
							 | 
						||
| 
								 | 
							
								        for first, last, base in ranges[1:]:
							 | 
						||
| 
								 | 
							
								            filler.write('{', str(first), ',', str(last), ',', str(base), '},')
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        filler.write('{', '0,', '0,', str(
							 | 
						||
| 
								 | 
							
								            ranges[-1][2] + ranges[-1][1] - ranges[-1][0] + 1), '}', '};')
							 | 
						||
| 
								 | 
							
								        filler.printout(fp)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    print("Done!")
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								if __name__ == '__main__':
							 | 
						||
| 
								 | 
							
								    main()
							 |