2010-03-11 22:53:45 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								#!/usr/bin/env python3
							 | 
						
					
						
							
								
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								""" Utility for parsing HTML entity definitions available from:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								      http://www.w3.org/ as e.g.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								      http://www.w3.org/TR/REC-html40/HTMLlat1.ent
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    Input is read from stdin, output is written to stdout in form of a
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    Python snippet defining a dictionary "entitydefs" mapping literal
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    entity name to character or numeric entity.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2001-01-17 08:48:39 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    Marc-Andre Lemburg, mal@lemburg.com, 1999.
							 | 
						
					
						
							
								
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    Use as you like. NO WARRANTIES.
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								"""
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import re,sys
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2016-09-08 13:59:53 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')
							 | 
						
					
						
							
								
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def parse(text,pos=0,endpos=None):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    pos = 0
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if endpos is None:
							 | 
						
					
						
							
								
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        endpos = len(text)
							 | 
						
					
						
							
								
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    d = {}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    while 1:
							 | 
						
					
						
							
								
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        m = entityRE.search(text,pos,endpos)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        if not m:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            break
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        name,charcode,comment = m.groups()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        d[name] = charcode,comment
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        pos = m.end()
							 | 
						
					
						
							
								
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    return d
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def writefile(f,defs):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    f.write("entitydefs = {\n")
							 | 
						
					
						
							
								
									
										
										
										
											2008-05-16 15:23:30 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    items = sorted(defs.items())
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    for name, (charcode,comment) in items:
							 | 
						
					
						
							
								
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        if charcode[:2] == '&#':
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            code = int(charcode[2:-1])
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            if code < 256:
							 | 
						
					
						
							
								
									
										
										
										
											2016-09-08 13:59:53 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								                charcode = r"'\%o'" % code
							 | 
						
					
						
							
								
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            else:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								                charcode = repr(charcode)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        else:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								            charcode = repr(charcode)
							 | 
						
					
						
							
								
									
										
										
										
											2012-04-04 21:28:14 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        comment = ' '.join(comment.split())
							 | 
						
					
						
							
								
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        f.write("    '%s':\t%s,  \t# %s\n" % (name,charcode,comment))
							 | 
						
					
						
							
								
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    f.write('\n}\n')
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								if __name__ == '__main__':
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if len(sys.argv) > 1:
							 | 
						
					
						
							
								
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        infile = open(sys.argv[1])
							 | 
						
					
						
							
								
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    else:
							 | 
						
					
						
							
								
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        infile = sys.stdin
							 | 
						
					
						
							
								
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if len(sys.argv) > 2:
							 | 
						
					
						
							
								
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        outfile = open(sys.argv[2],'w')
							 | 
						
					
						
							
								
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    else:
							 | 
						
					
						
							
								
									
										
										
										
											2000-09-18 01:46:01 +00:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        outfile = sys.stdout
							 | 
						
					
						
							
								
									
										
										
										
											1999-08-19 16:00:41 +00:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    text = infile.read()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    defs = parse(text)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    writefile(outfile,defs)
							 |