| 
									
										
										
										
											2010-03-11 22:53:45 +00:00
										 |  |  | #!/usr/bin/env python3 | 
					
						
							| 
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 |  |  | """ Utility for parsing HTML entity definitions available from:
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |       http://www.w3.org/ as e.g. | 
					
						
							|  |  |  |       http://www.w3.org/TR/REC-html40/HTMLlat1.ent | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Input is read from stdin, output is written to stdout in form of a | 
					
						
							|  |  |  |     Python snippet defining a dictionary "entitydefs" mapping literal | 
					
						
							|  |  |  |     entity name to character or numeric entity. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-01-17 08:48:39 +00:00
										 |  |  |     Marc-Andre Lemburg, mal@lemburg.com, 1999. | 
					
						
							| 
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 |  |  |     Use as you like. NO WARRANTIES. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | import re,sys | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | entityRE = re.compile('<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def parse(text,pos=0,endpos=None): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     pos = 0 | 
					
						
							|  |  |  |     if endpos is None: | 
					
						
							| 
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 |  |  |         endpos = len(text) | 
					
						
							| 
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 |  |  |     d = {} | 
					
						
							|  |  |  |     while 1: | 
					
						
							| 
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 |  |  |         m = entityRE.search(text,pos,endpos) | 
					
						
							|  |  |  |         if not m: | 
					
						
							|  |  |  |             break | 
					
						
							|  |  |  |         name,charcode,comment = m.groups() | 
					
						
							|  |  |  |         d[name] = charcode,comment | 
					
						
							|  |  |  |         pos = m.end() | 
					
						
							| 
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 |  |  |     return d | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def writefile(f,defs): | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     f.write("entitydefs = {\n") | 
					
						
							| 
									
										
										
										
											2008-05-16 15:23:30 +00:00
										 |  |  |     items = sorted(defs.items()) | 
					
						
							|  |  |  |     for name, (charcode,comment) in items: | 
					
						
							| 
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 |  |  |         if charcode[:2] == '&#': | 
					
						
							|  |  |  |             code = int(charcode[2:-1]) | 
					
						
							|  |  |  |             if code < 256: | 
					
						
							|  |  |  |                 charcode = "'\%o'" % code | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 charcode = repr(charcode) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             charcode = repr(charcode) | 
					
						
							| 
									
										
										
										
											2012-04-04 21:28:14 -04:00
										 |  |  |         comment = ' '.join(comment.split()) | 
					
						
							| 
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 |  |  |         f.write("    '%s':\t%s,  \t# %s\n" % (name,charcode,comment)) | 
					
						
							| 
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 |  |  |     f.write('\n}\n') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | if __name__ == '__main__': | 
					
						
							|  |  |  |     if len(sys.argv) > 1: | 
					
						
							| 
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 |  |  |         infile = open(sys.argv[1]) | 
					
						
							| 
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 |  |  |         infile = sys.stdin | 
					
						
							| 
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 |  |  |     if len(sys.argv) > 2: | 
					
						
							| 
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 |  |  |         outfile = open(sys.argv[2],'w') | 
					
						
							| 
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 |  |  |         outfile = sys.stdout | 
					
						
							| 
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 |  |  |     text = infile.read() | 
					
						
							|  |  |  |     defs = parse(text) | 
					
						
							|  |  |  |     writefile(outfile,defs) |