mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 13:41:24 +00:00 
			
		
		
		
	Patch #912410: Replace HTML entity references for attribute values
in HTMLParser.
This commit is contained in:
		
							parent
							
								
									ff432e6f4a
								
							
						
					
					
						commit
						ab8a6bba25
					
				
					 4 changed files with 43 additions and 13 deletions
				
			
		|  | @ -75,14 +75,18 @@ This method is called to handle the start of a tag.  It is intended to | ||||||
| be overridden by a derived class; the base class implementation does | be overridden by a derived class; the base class implementation does | ||||||
| nothing.   | nothing.   | ||||||
| 
 | 
 | ||||||
| The \var{tag} argument is the name of the tag converted to | The \var{tag} argument is the name of the tag converted to lower case. | ||||||
| lower case.  The \var{attrs} argument is a list of \code{(\var{name}, | The \var{attrs} argument is a list of \code{(\var{name}, \var{value})} | ||||||
| \var{value})} pairs containing the attributes found inside the tag's | pairs containing the attributes found inside the tag's \code{<>} | ||||||
| \code{<>} brackets.  The \var{name} will be translated to lower case | brackets.  The \var{name} will be translated to lower case, and quotes | ||||||
| and double quotes and backslashes in the \var{value} have been | in the \var{value} have been removed, and character and entity | ||||||
| interpreted.  For instance, for the tag \code{<A | references have been replaced.  For instance, for the tag \code{<A | ||||||
|   HREF="http://www.cwi.nl/">}, this method would be called as |   HREF="http://www.cwi.nl/">}, this method would be called as | ||||||
| \samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}. | \samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}. | ||||||
|  | 
 | ||||||
|  | \versionchanged[All entity references from htmlentitydefs are now | ||||||
|  | replaced in the attribute values]{2.6} | ||||||
|  | 
 | ||||||
| \end{methoddesc} | \end{methoddesc} | ||||||
| 
 | 
 | ||||||
| \begin{methoddesc}{handle_startendtag}{tag, attrs} | \begin{methoddesc}{handle_startendtag}{tag, attrs} | ||||||
|  |  | ||||||
|  | @ -358,12 +358,30 @@ def unknown_decl(self, data): | ||||||
|         self.error("unknown declaration: %r" % (data,)) |         self.error("unknown declaration: %r" % (data,)) | ||||||
| 
 | 
 | ||||||
|     # Internal -- helper to remove special character quoting |     # Internal -- helper to remove special character quoting | ||||||
|  |     entitydefs = None | ||||||
|     def unescape(self, s): |     def unescape(self, s): | ||||||
|         if '&' not in s: |         if '&' not in s: | ||||||
|             return s |             return s | ||||||
|         s = s.replace("&lt;", "<") |         def replaceEntities(s): | ||||||
|         s = s.replace("&gt;", ">") |             s = s.groups()[0] | ||||||
|         s = s.replace("&apos;", "'") |             if s[0] == "#": | ||||||
|         s = s.replace("&quot;", '"') |                 s = s[1:] | ||||||
|         s = s.replace("&amp;", "&") # Must be last |                 if s[0] in ['x','X']: | ||||||
|         return s |                     c = int(s[1:], 16) | ||||||
|  |                 else: | ||||||
|  |                     c = int(s) | ||||||
|  |                 return unichr(c) | ||||||
|  |             else: | ||||||
|  |                 # Cannot use name2codepoint directly, because HTMLParser supports apos, | ||||||
|  |                 # which is not part of HTML 4 | ||||||
|  |                 import htmlentitydefs | ||||||
|  |                 if HTMLParser.entitydefs is None: | ||||||
|  |                     entitydefs = HTMLParser.entitydefs = {'apos':u"'"} | ||||||
|  |                     for k, v in htmlentitydefs.name2codepoint.iteritems(): | ||||||
|  |                         entitydefs[k] = unichr(v) | ||||||
|  |                 try: | ||||||
|  |                     return self.entitydefs[s] | ||||||
|  |                 except KeyError: | ||||||
|  |                     return '&'+s+';' | ||||||
|  | 
 | ||||||
|  |         return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s) | ||||||
|  |  | ||||||
|  | @ -309,6 +309,11 @@ def test_cdata_content(self): | ||||||
|             ("endtag", "script"), |             ("endtag", "script"), | ||||||
|             ]) |             ]) | ||||||
| 
 | 
 | ||||||
|  |     def test_entityrefs_in_attributes(self): | ||||||
|  |         self._run_check("<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>", [ | ||||||
|  |                 ("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")]) | ||||||
|  |                 ]) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def test_main(): | def test_main(): | ||||||
|     test_support.run_unittest(HTMLParserTestCase) |     test_support.run_unittest(HTMLParserTestCase) | ||||||
|  |  | ||||||
|  | @ -141,6 +141,9 @@ Core and builtins | ||||||
| Library | Library | ||||||
| ------- | ------- | ||||||
| 
 | 
 | ||||||
|  | - Patch #912410: Replace HTML entity references for attribute values  | ||||||
|  |   in HTMLParser. | ||||||
|  | 
 | ||||||
| - Patch #1663234: you can now run doctest on test files and modules | - Patch #1663234: you can now run doctest on test files and modules | ||||||
|   using "python -m doctest [-v] filename ...". |   using "python -m doctest [-v] filename ...". | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Martin v. Löwis
						Martin v. Löwis