| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | """ Unicode Mapping Parser and Codec Generator.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This script parses Unicode mapping files as available from the Unicode | 
					
						
							| 
									
										
										
										
											2001-01-03 21:29:14 +00:00
										 |  |  | site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec | 
					
						
							|  |  |  | modules from them. The codecs use the standard character mapping codec | 
					
						
							|  |  |  | to actually apply the mapping. | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | Synopsis: gencodec.py dir codec_prefix | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | All files in dir are scanned and those producing non-empty mappings | 
					
						
							|  |  |  | will be written to <codec_prefix><mapname>.py with <mapname> being the | 
					
						
							|  |  |  | first part of the map's filename ('a' in a.b.c.txt) converted to | 
					
						
							|  |  |  | lowercase with hyphens replaced by underscores. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-17 16:56:23 +00:00
										 |  |  | The tool also writes marshalled versions of the mapping tables to the | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | same location (with .mapping extension). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Written by Marc-Andre Lemburg (mal@lemburg.com). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. | 
					
						
							| 
									
										
										
										
											2001-01-03 21:29:14 +00:00
										 |  |  | (c) Copyright Guido van Rossum, 2000. | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | """#"
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 |  |  | import re,os,time,marshal | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | # Create numeric tables or character based ones ? | 
					
						
							|  |  |  | numeric = 1 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)' | 
					
						
							|  |  |  |                    '\s+' | 
					
						
							|  |  |  |                    '((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)' | 
					
						
							|  |  |  |                    '\s*' | 
					
						
							|  |  |  |                    '(#.+)?') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def parsecodes(codes, | 
					
						
							| 
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 |  |  |                len=len, filter=filter,range=range): | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     """ Converts code combinations to either a single code integer
 | 
					
						
							|  |  |  |         or a tuple of integers. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         meta-codes (in angular brackets, e.g. <LR> and <RL>) are | 
					
						
							|  |  |  |         ignored. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Empty codes or illegal ones are returned as None. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if not codes: | 
					
						
							|  |  |  |         return None | 
					
						
							| 
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 |  |  |     l = codes.split('+') | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  |     if len(l) == 1: | 
					
						
							| 
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 |  |  |         return int(l[0],16) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  |     for i in range(len(l)): | 
					
						
							|  |  |  |         try: | 
					
						
							| 
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 |  |  |             l[i] = int(l[i],16) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  |         except ValueError: | 
					
						
							|  |  |  |             l[i] = None | 
					
						
							|  |  |  |     l = filter(lambda x: x is not None, l) | 
					
						
							|  |  |  |     if len(l) == 1: | 
					
						
							|  |  |  |         return l[0] | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         return tuple(l) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 |  |  | def readmap(filename): | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     f = open(filename,'r') | 
					
						
							|  |  |  |     lines = f.readlines() | 
					
						
							|  |  |  |     f.close() | 
					
						
							|  |  |  |     enc2uni = {} | 
					
						
							| 
									
										
										
										
											2001-01-03 21:29:14 +00:00
										 |  |  |     identity = [] | 
					
						
							|  |  |  |     unmapped = range(256) | 
					
						
							|  |  |  |     for i in range(256): | 
					
						
							|  |  |  |         unmapped[i] = i | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  |     for line in lines: | 
					
						
							| 
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 |  |  |         line = line.strip() | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  |         if not line or line[0] == '#': | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         m = mapRE.match(line) | 
					
						
							|  |  |  |         if not m: | 
					
						
							|  |  |  |             #print '* not matched: %s' % repr(line) | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         enc,uni,comment = m.groups() | 
					
						
							|  |  |  |         enc = parsecodes(enc) | 
					
						
							|  |  |  |         uni = parsecodes(uni) | 
					
						
							|  |  |  |         if not comment: | 
					
						
							|  |  |  |             comment = '' | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             comment = comment[1:] | 
					
						
							| 
									
										
										
										
											2001-01-03 21:29:14 +00:00
										 |  |  |         if enc < 256: | 
					
						
							|  |  |  |             unmapped.remove(enc) | 
					
						
							|  |  |  |             if enc == uni: | 
					
						
							|  |  |  |                 identity.append(enc) | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 enc2uni[enc] = (uni,comment) | 
					
						
							|  |  |  |         else: | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  |             enc2uni[enc] = (uni,comment) | 
					
						
							| 
									
										
										
										
											2001-01-03 21:29:14 +00:00
										 |  |  |     # If there are more identity-mapped entries than unmapped entries, | 
					
						
							|  |  |  |     # it pays to generate an identity dictionary first, add add explicit | 
					
						
							|  |  |  |     # mappings to None for the rest | 
					
						
							|  |  |  |     if len(identity)>=len(unmapped): | 
					
						
							|  |  |  |         for enc in unmapped: | 
					
						
							|  |  |  |             enc2uni[enc] = (None, "") | 
					
						
							|  |  |  |         enc2uni['IDENTITY'] = 256 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  |     return enc2uni | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 |  |  | def hexrepr(t): | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if t is None: | 
					
						
							|  |  |  |         return 'None' | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         len(t) | 
					
						
							|  |  |  |     except: | 
					
						
							|  |  |  |         return '0x%04x' % t | 
					
						
							| 
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 |  |  |     return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')' | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 |  |  | def unicoderepr(t): | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if t is None: | 
					
						
							|  |  |  |         return 'None' | 
					
						
							|  |  |  |     if numeric: | 
					
						
							|  |  |  |         return hexrepr(t) | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             len(t) | 
					
						
							|  |  |  |         except: | 
					
						
							|  |  |  |             return repr(unichr(t)) | 
					
						
							| 
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 |  |  |         return repr(''.join(map(unichr, t))) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 |  |  | def keyrepr(t): | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if t is None: | 
					
						
							|  |  |  |         return 'None' | 
					
						
							|  |  |  |     if numeric: | 
					
						
							|  |  |  |         return hexrepr(t) | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             len(t) | 
					
						
							|  |  |  |         except: | 
					
						
							|  |  |  |             if t < 256: | 
					
						
							|  |  |  |                 return repr(chr(t)) | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 return repr(unichr(t)) | 
					
						
							| 
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 |  |  |         return repr(''.join(map(chr, t))) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | def codegen(name,map,comments=1): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """ Returns Python source for the given map.
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         Comments are included in the source, if comments is true (default). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     l = [ | 
					
						
							|  |  |  |         '''\
 | 
					
						
							| 
									
										
										
										
											2001-01-03 21:29:14 +00:00
										 |  |  | """ Python Character Mapping Codec generated from '%s' with gencodec.py.
 | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | Written by Marc-Andre Lemburg (mal@lemburg.com). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. | 
					
						
							| 
									
										
										
										
											2001-01-03 21:29:14 +00:00
										 |  |  | (c) Copyright 2000 Guido van Rossum. | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | """#"
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import codecs | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ### Codec APIs | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class Codec(codecs.Codec): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def encode(self,input,errors='strict'): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return codecs.charmap_encode(input,errors,encoding_map) | 
					
						
							| 
									
										
										
										
											2001-01-17 08:48:39 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  |     def decode(self,input,errors='strict'): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         return codecs.charmap_decode(input,errors,decoding_map) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class StreamWriter(Codec,codecs.StreamWriter): | 
					
						
							|  |  |  |     pass | 
					
						
							| 
									
										
										
										
											2001-01-17 08:48:39 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | class StreamReader(Codec,codecs.StreamReader): | 
					
						
							|  |  |  |     pass | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ### encodings module API | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def getregentry(): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return (Codec().encode,Codec().decode,StreamReader,StreamWriter) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | ### Decoding Map | 
					
						
							|  |  |  | ''' % name,
 | 
					
						
							|  |  |  |         ] | 
					
						
							| 
									
										
										
										
											2001-01-03 21:29:14 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if map.has_key("IDENTITY"): | 
					
						
							|  |  |  |         l.append("decoding_map = codecs.make_identity_dict(range(%d))" | 
					
						
							|  |  |  |                  % map["IDENTITY"]) | 
					
						
							|  |  |  |         l.append("decoding_map.update({") | 
					
						
							|  |  |  |         splits = 1 | 
					
						
							|  |  |  |         del map["IDENTITY"] | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         l.append("decoding_map = {") | 
					
						
							|  |  |  |         splits = 0 | 
					
						
							| 
									
										
										
										
											2001-01-17 08:48:39 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  |     mappings = map.items() | 
					
						
							|  |  |  |     mappings.sort() | 
					
						
							|  |  |  |     append = l.append | 
					
						
							|  |  |  |     i = 0 | 
					
						
							|  |  |  |     for e,value in mappings: | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             (u,c) = value | 
					
						
							|  |  |  |         except TypeError: | 
					
						
							|  |  |  |             u = value | 
					
						
							|  |  |  |             c = '' | 
					
						
							|  |  |  |         key = keyrepr(e) | 
					
						
							|  |  |  |         if c and comments: | 
					
						
							|  |  |  |             append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c)) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             append('\t%s: %s,' % (key,unicoderepr(u))) | 
					
						
							| 
									
										
										
										
											2001-01-03 21:29:14 +00:00
										 |  |  |         i += 1 | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  |         if i == 4096: | 
					
						
							|  |  |  |             # Split the definition into parts to that the Python | 
					
						
							|  |  |  |             # parser doesn't dump core | 
					
						
							|  |  |  |             if splits == 0: | 
					
						
							|  |  |  |                 append('}') | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 append('})') | 
					
						
							| 
									
										
										
										
											2001-01-03 21:29:14 +00:00
										 |  |  |             append('decoding_map.update({') | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  |             i = 0 | 
					
						
							|  |  |  |             splits = splits + 1 | 
					
						
							|  |  |  |     if splits == 0: | 
					
						
							|  |  |  |         append('}') | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         append('})') | 
					
						
							|  |  |  |     append('''
 | 
					
						
							|  |  |  | ### Encoding Map | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-05-16 09:41:45 +00:00
										 |  |  | encoding_map = codecs.make_encoding_map(decoding_map) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | ''')
 | 
					
						
							| 
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 |  |  |     return '\n'.join(l) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | def pymap(name,map,pyfile,comments=1): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     code = codegen(name,map,comments) | 
					
						
							|  |  |  |     f = open(pyfile,'w') | 
					
						
							|  |  |  |     f.write(code) | 
					
						
							|  |  |  |     f.close() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def marshalmap(name,map,marshalfile): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     d = {} | 
					
						
							|  |  |  |     for e,(u,c) in map.items(): | 
					
						
							|  |  |  |         d[e] = (u,c) | 
					
						
							|  |  |  |     f = open(marshalfile,'wb') | 
					
						
							|  |  |  |     marshal.dump(d,f) | 
					
						
							|  |  |  |     f.close() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def convertdir(dir,prefix='',comments=1): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     mapnames = os.listdir(dir) | 
					
						
							|  |  |  |     for mapname in mapnames: | 
					
						
							|  |  |  |         name = os.path.split(mapname)[1] | 
					
						
							| 
									
										
										
										
											2002-09-11 20:36:02 +00:00
										 |  |  |         name = name.replace('-','_') | 
					
						
							|  |  |  |         name = name.split('.')[0] | 
					
						
							|  |  |  |         name = name.lower() | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  |         codefile = name + '.py' | 
					
						
							|  |  |  |         marshalfile = name + '.mapping' | 
					
						
							|  |  |  |         print 'converting %s to %s and %s' % (mapname, | 
					
						
							|  |  |  |                                               prefix + codefile, | 
					
						
							|  |  |  |                                               prefix + marshalfile) | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             map = readmap(os.path.join(dir,mapname)) | 
					
						
							|  |  |  |             if not map: | 
					
						
							|  |  |  |                 print '* map is empty; skipping' | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 pymap(mapname, map, prefix + codefile,comments) | 
					
						
							|  |  |  |                 marshalmap(mapname, map, prefix + marshalfile) | 
					
						
							|  |  |  |         except ValueError: | 
					
						
							|  |  |  |             print '* conversion failed' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def rewritepythondir(dir,prefix='',comments=1): | 
					
						
							| 
									
										
										
										
											2001-01-17 08:48:39 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  |     mapnames = os.listdir(dir) | 
					
						
							|  |  |  |     for mapname in mapnames: | 
					
						
							| 
									
										
										
										
											2001-01-03 21:29:14 +00:00
										 |  |  |         if not mapname.endswith('.mapping'): | 
					
						
							| 
									
										
										
										
											2000-03-10 22:36:57 +00:00
										 |  |  |             continue | 
					
						
							|  |  |  |         codefile = mapname[:-len('.mapping')] + '.py' | 
					
						
							|  |  |  |         print 'converting %s to %s' % (mapname, | 
					
						
							|  |  |  |                                        prefix + codefile) | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             map = marshal.load(open(os.path.join(dir,mapname), | 
					
						
							|  |  |  |                                'rb')) | 
					
						
							|  |  |  |             if not map: | 
					
						
							|  |  |  |                 print '* map is empty; skipping' | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 pymap(mapname, map, prefix + codefile,comments) | 
					
						
							|  |  |  |         except ValueError, why: | 
					
						
							|  |  |  |             print '* conversion failed: %s' % why | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | if __name__ == '__main__': | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     import sys | 
					
						
							|  |  |  |     if 1: | 
					
						
							|  |  |  |         apply(convertdir,tuple(sys.argv[1:])) | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         apply(rewritepythondir,tuple(sys.argv[1:])) |