mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 13:41:24 +00:00 
			
		
		
		
	gh-82927: Update files related to HTML entities. (GH-92504)
This commit is contained in:
		
							parent
							
								
									4e08fbcfdf
								
							
						
					
					
						commit
						f28ec34c5c
					
				
					 6 changed files with 29 additions and 78 deletions
				
			
		
							
								
								
									
										1
									
								
								.github/CODEOWNERS
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.github/CODEOWNERS
									
										
									
									
										vendored
									
									
								
							|  | @ -53,6 +53,7 @@ Python/pythonrun.c            @iritkatriel | ||||||
| /Lib/html/                    @ezio-melotti | /Lib/html/                    @ezio-melotti | ||||||
| /Lib/_markupbase.py           @ezio-melotti | /Lib/_markupbase.py           @ezio-melotti | ||||||
| /Lib/test/test_html*.py       @ezio-melotti | /Lib/test/test_html*.py       @ezio-melotti | ||||||
|  | /Tools/scripts/*html5*        @ezio-melotti | ||||||
| 
 | 
 | ||||||
| # Import (including importlib). | # Import (including importlib). | ||||||
| # Ignoring importlib.h so as to not get flagged on | # Ignoring importlib.h so as to not get flagged on | ||||||
|  |  | ||||||
|  | @ -34,12 +34,12 @@ This module defines four dictionaries, :data:`html5`, | ||||||
| 
 | 
 | ||||||
| .. data:: name2codepoint | .. data:: name2codepoint | ||||||
| 
 | 
 | ||||||
|    A dictionary that maps HTML entity names to the Unicode code points. |    A dictionary that maps HTML4 entity names to the Unicode code points. | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| .. data:: codepoint2name | .. data:: codepoint2name | ||||||
| 
 | 
 | ||||||
|    A dictionary that maps Unicode code points to HTML entity names. |    A dictionary that maps Unicode code points to HTML4 entity names. | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| .. rubric:: Footnotes | .. rubric:: Footnotes | ||||||
|  |  | ||||||
|  | @ -3,8 +3,7 @@ | ||||||
| __all__ = ['html5', 'name2codepoint', 'codepoint2name', 'entitydefs'] | __all__ = ['html5', 'name2codepoint', 'codepoint2name', 'entitydefs'] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # maps the HTML entity name to the Unicode code point | # maps HTML4 entity name to the Unicode code point | ||||||
| # from https://html.spec.whatwg.org/multipage/named-characters.html |  | ||||||
| name2codepoint = { | name2codepoint = { | ||||||
|     'AElig':    0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 |     'AElig':    0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1 | ||||||
|     'Aacute':   0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1 |     'Aacute':   0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1 | ||||||
|  | @ -261,7 +260,11 @@ | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # maps the HTML5 named character references to the equivalent Unicode character(s) | # HTML5 named character references | ||||||
|  | # Generated by 'Tools/scripts/parse_html5_entities.py' | ||||||
|  | # from https://html.spec.whatwg.org/entities.json and | ||||||
|  | # https://html.spec.whatwg.org/multipage/named-characters.html. | ||||||
|  | # Map HTML5 named character references to the equivalent Unicode character(s). | ||||||
| html5 = { | html5 = { | ||||||
|     'Aacute': '\xc1', |     'Aacute': '\xc1', | ||||||
|     'aacute': '\xe1', |     'aacute': '\xe1', | ||||||
|  |  | ||||||
|  | @ -0,0 +1,2 @@ | ||||||
|  | The ``Tools/scripts/parseentities.py`` script used to parse HTML4 entities | ||||||
|  | has been removed. | ||||||
|  | @ -2,10 +2,14 @@ | ||||||
| """ | """ | ||||||
| Utility for parsing HTML5 entity definitions available from: | Utility for parsing HTML5 entity definitions available from: | ||||||
| 
 | 
 | ||||||
|     http://dev.w3.org/html5/spec/entities.json |     https://html.spec.whatwg.org/entities.json | ||||||
|  |     https://html.spec.whatwg.org/multipage/named-characters.html | ||||||
|  | 
 | ||||||
|  | The page now contains the following note: | ||||||
|  | 
 | ||||||
|  |     "This list is static and will not be expanded or changed in the future." | ||||||
| 
 | 
 | ||||||
| Written by Ezio Melotti and Iuliia Proskurnia. | Written by Ezio Melotti and Iuliia Proskurnia. | ||||||
| 
 |  | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| import os | import os | ||||||
|  | @ -14,7 +18,9 @@ | ||||||
| from urllib.request import urlopen | from urllib.request import urlopen | ||||||
| from html.entities import html5 | from html.entities import html5 | ||||||
| 
 | 
 | ||||||
| entities_url = 'http://dev.w3.org/html5/spec/entities.json' | PAGE_URL = 'https://html.spec.whatwg.org/multipage/named-characters.html' | ||||||
|  | ENTITIES_URL = 'https://html.spec.whatwg.org/entities.json' | ||||||
|  | HTML5_SECTION_START = '# HTML5 named character references' | ||||||
| 
 | 
 | ||||||
| def get_json(url): | def get_json(url): | ||||||
|     """Download the json file from the url and returns a decoded object.""" |     """Download the json file from the url and returns a decoded object.""" | ||||||
|  | @ -62,9 +68,15 @@ def write_items(entities, file=sys.stdout): | ||||||
|     # be before their equivalent lowercase version. |     # be before their equivalent lowercase version. | ||||||
|     keys = sorted(entities.keys()) |     keys = sorted(entities.keys()) | ||||||
|     keys = sorted(keys, key=str.lower) |     keys = sorted(keys, key=str.lower) | ||||||
|  |     print(HTML5_SECTION_START, file=file) | ||||||
|  |     print(f'# Generated by {sys.argv[0]!r}\n' | ||||||
|  |           f'# from {ENTITIES_URL} and\n' | ||||||
|  |           f'# {PAGE_URL}.\n' | ||||||
|  |           f'# Map HTML5 named character references to the ' | ||||||
|  |           f'equivalent Unicode character(s).', file=file) | ||||||
|     print('html5 = {', file=file) |     print('html5 = {', file=file) | ||||||
|     for name in keys: |     for name in keys: | ||||||
|         print('    {!r}: {!a},'.format(name, entities[name]), file=file) |         print(f'    {name!r}: {entities[name]!a},', file=file) | ||||||
|     print('}', file=file) |     print('}', file=file) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -72,11 +84,8 @@ def write_items(entities, file=sys.stdout): | ||||||
|     # without args print a diff between html.entities.html5 and new_html5 |     # without args print a diff between html.entities.html5 and new_html5 | ||||||
|     # with --create print the new html5 dict |     # with --create print the new html5 dict | ||||||
|     # with --patch patch the Lib/html/entities.py file |     # with --patch patch the Lib/html/entities.py file | ||||||
|     new_html5 = create_dict(get_json(entities_url)) |     new_html5 = create_dict(get_json(ENTITIES_URL)) | ||||||
|     if '--create' in sys.argv: |     if '--create' in sys.argv: | ||||||
|         print('# map the HTML5 named character references to the ' |  | ||||||
|               'equivalent Unicode character(s)') |  | ||||||
|         print('# Generated by {}.  Do not edit manually.'.format(__file__)) |  | ||||||
|         write_items(new_html5) |         write_items(new_html5) | ||||||
|     elif '--patch' in sys.argv: |     elif '--patch' in sys.argv: | ||||||
|         fname = 'Lib/html/entities.py' |         fname = 'Lib/html/entities.py' | ||||||
|  | @ -84,7 +93,7 @@ def write_items(entities, file=sys.stdout): | ||||||
|         with open(fname) as f1, open(temp_fname, 'w') as f2: |         with open(fname) as f1, open(temp_fname, 'w') as f2: | ||||||
|             skip = False |             skip = False | ||||||
|             for line in f1: |             for line in f1: | ||||||
|                 if line.startswith('html5 = {'): |                 if line.startswith(HTML5_SECTION_START): | ||||||
|                     write_items(new_html5, file=f2) |                     write_items(new_html5, file=f2) | ||||||
|                     skip = True |                     skip = True | ||||||
|                     continue |                     continue | ||||||
|  |  | ||||||
|  | @ -1,64 +0,0 @@ | ||||||
| #!/usr/bin/env python3 |  | ||||||
| """ Utility for parsing HTML entity definitions available from: |  | ||||||
| 
 |  | ||||||
|       http://www.w3.org/ as e.g. |  | ||||||
|       http://www.w3.org/TR/REC-html40/HTMLlat1.ent |  | ||||||
| 
 |  | ||||||
|     Input is read from stdin, output is written to stdout in form of a |  | ||||||
|     Python snippet defining a dictionary "entitydefs" mapping literal |  | ||||||
|     entity name to character or numeric entity. |  | ||||||
| 
 |  | ||||||
|     Marc-Andre Lemburg, mal@lemburg.com, 1999. |  | ||||||
|     Use as you like. NO WARRANTIES. |  | ||||||
| 
 |  | ||||||
| """ |  | ||||||
| import re,sys |  | ||||||
| 
 |  | ||||||
| entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->') |  | ||||||
| 
 |  | ||||||
| def parse(text,pos=0,endpos=None): |  | ||||||
| 
 |  | ||||||
|     pos = 0 |  | ||||||
|     if endpos is None: |  | ||||||
|         endpos = len(text) |  | ||||||
|     d = {} |  | ||||||
|     while 1: |  | ||||||
|         m = entityRE.search(text,pos,endpos) |  | ||||||
|         if not m: |  | ||||||
|             break |  | ||||||
|         name,charcode,comment = m.groups() |  | ||||||
|         d[name] = charcode,comment |  | ||||||
|         pos = m.end() |  | ||||||
|     return d |  | ||||||
| 
 |  | ||||||
| def writefile(f,defs): |  | ||||||
| 
 |  | ||||||
|     f.write("entitydefs = {\n") |  | ||||||
|     items = sorted(defs.items()) |  | ||||||
|     for name, (charcode,comment) in items: |  | ||||||
|         if charcode[:2] == '&#': |  | ||||||
|             code = int(charcode[2:-1]) |  | ||||||
|             if code < 256: |  | ||||||
|                 charcode = r"'\%o'" % code |  | ||||||
|             else: |  | ||||||
|                 charcode = repr(charcode) |  | ||||||
|         else: |  | ||||||
|             charcode = repr(charcode) |  | ||||||
|         comment = ' '.join(comment.split()) |  | ||||||
|         f.write("    '%s':\t%s,  \t# %s\n" % (name,charcode,comment)) |  | ||||||
|     f.write('\n}\n') |  | ||||||
| 
 |  | ||||||
| if __name__ == '__main__': |  | ||||||
|     if len(sys.argv) > 1: |  | ||||||
|         with open(sys.argv[1]) as infile: |  | ||||||
|             text = infile.read() |  | ||||||
|     else: |  | ||||||
|         text = sys.stdin.read() |  | ||||||
| 
 |  | ||||||
|     defs = parse(text) |  | ||||||
| 
 |  | ||||||
|     if len(sys.argv) > 2: |  | ||||||
|         with open(sys.argv[2],'w') as outfile: |  | ||||||
|             writefile(outfile, defs) |  | ||||||
|     else: |  | ||||||
|         writefile(sys.stdout, defs) |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Ezio Melotti
						Ezio Melotti