mirror of
				https://github.com/python/cpython.git
				synced 2025-10-30 21:21:22 +00:00 
			
		
		
		
	Patch #912410: Replace HTML entity references for attribute values
in HTMLParser.
This commit is contained in:
		
							parent
							
								
									ff432e6f4a
								
							
						
					
					
						commit
						ab8a6bba25
					
				
					 4 changed files with 43 additions and 13 deletions
				
			
		|  | @ -75,14 +75,18 @@ This method is called to handle the start of a tag.  It is intended to | |||
| be overridden by a derived class; the base class implementation does | ||||
| nothing.   | ||||
| 
 | ||||
| The \var{tag} argument is the name of the tag converted to | ||||
| lower case.  The \var{attrs} argument is a list of \code{(\var{name}, | ||||
| \var{value})} pairs containing the attributes found inside the tag's | ||||
| \code{<>} brackets.  The \var{name} will be translated to lower case | ||||
| and double quotes and backslashes in the \var{value} have been | ||||
| interpreted.  For instance, for the tag \code{<A | ||||
| HREF="http://www.cwi.nl/">}, this method would be called as | ||||
| The \var{tag} argument is the name of the tag converted to lower case. | ||||
| The \var{attrs} argument is a list of \code{(\var{name}, \var{value})} | ||||
| pairs containing the attributes found inside the tag's \code{<>} | ||||
| brackets.  The \var{name} will be translated to lower case; quotes | ||||
| in the \var{value} have been removed, and character and entity | ||||
| references have been replaced.  For instance, for the tag \code{<A | ||||
|   HREF="http://www.cwi.nl/">}, this method would be called as | ||||
| \samp{handle_starttag('a', [('href', 'http://www.cwi.nl/')])}. | ||||
| 
 | ||||
| \versionchanged[All entity references from htmlentitydefs are now | ||||
| replaced in the attribute values]{2.6} | ||||
| 
 | ||||
| \end{methoddesc} | ||||
| 
 | ||||
| \begin{methoddesc}{handle_startendtag}{tag, attrs} | ||||
|  |  | |||
|  | @ -358,12 +358,30 @@ def unknown_decl(self, data): | |||
|         self.error("unknown declaration: %r" % (data,)) | ||||
| 
 | ||||
|     # Internal -- helper to remove special character quoting | ||||
|     entitydefs = None | ||||
|     def unescape(self, s): | ||||
|         if '&' not in s: | ||||
|             return s | ||||
|         s = s.replace("&lt;", "<") | ||||
|         s = s.replace("&gt;", ">") | ||||
|         s = s.replace("&apos;", "'") | ||||
|         s = s.replace("&quot;", '"') | ||||
|         s = s.replace("&amp;", "&") # Must be last | ||||
|         return s | ||||
|         def replaceEntities(s): | ||||
|             s = s.groups()[0] | ||||
|             if s[0] == "#": | ||||
|                 s = s[1:] | ||||
|                 if s[0] in ['x','X']: | ||||
|                     c = int(s[1:], 16) | ||||
|                 else: | ||||
|                     c = int(s) | ||||
|                 return unichr(c) | ||||
|             else: | ||||
|                 # Cannot use name2codepoint directly, because HTMLParser supports apos, | ||||
|                 # which is not part of HTML 4 | ||||
|                 import htmlentitydefs | ||||
|                 if HTMLParser.entitydefs is None: | ||||
|                     entitydefs = HTMLParser.entitydefs = {'apos':u"'"} | ||||
|                     for k, v in htmlentitydefs.name2codepoint.iteritems(): | ||||
|                         entitydefs[k] = unichr(v) | ||||
|                 try: | ||||
|                     return self.entitydefs[s] | ||||
|                 except KeyError: | ||||
|                     return '&'+s+';' | ||||
| 
 | ||||
|         return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s) | ||||
|  |  | |||
|  | @ -309,6 +309,11 @@ def test_cdata_content(self): | |||
|             ("endtag", "script"), | ||||
|             ]) | ||||
| 
 | ||||
|     def test_entityrefs_in_attributes(self): | ||||
|         self._run_check("<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>", [ | ||||
|                 ("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")]) | ||||
|                 ]) | ||||
| 
 | ||||
| 
 | ||||
| def test_main(): | ||||
|     test_support.run_unittest(HTMLParserTestCase) | ||||
|  |  | |||
|  | @ -141,6 +141,9 @@ Core and builtins | |||
| Library | ||||
| ------- | ||||
| 
 | ||||
| - Patch #912410: Replace HTML entity references for attribute values  | ||||
|   in HTMLParser. | ||||
| 
 | ||||
| - Patch #1663234: you can now run doctest on test files and modules | ||||
|   using "python -m doctest [-v] filename ...". | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Martin v. Löwis
						Martin v. Löwis