mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 21:51:50 +00:00 
			
		
		
		
	gh-77057: Fix handling of invalid markup declarations in HTMLParser (GH-9295)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
		
							parent
							
								
									e7741dd773
								
							
						
					
					
						commit
						76c0b01bc4
					
				
					 3 changed files with 68 additions and 19 deletions
				
			
		|  | @@ -566,12 +566,33 @@ def test_EOF_in_charref(self): | |||
|         for html, expected in data: | ||||
|             self._run_check(html, expected) | ||||
| 
 | ||||
|     def test_broken_comments(self): | ||||
|     def test_EOF_in_comments_or_decls(self): | ||||
|         data = [ | ||||
|             ('<!', [('data', '<!')]), | ||||
|             ('<!-', [('data', '<!-')]), | ||||
|             ('<!--', [('data', '<!--')]), | ||||
|             ('<![', [('data', '<![')]), | ||||
|             ('<![CDATA[', [('data', '<![CDATA[')]), | ||||
|             ('<![CDATA[x', [('data', '<![CDATA[x')]), | ||||
|             ('<!DOCTYPE', [('data', '<!DOCTYPE')]), | ||||
|             ('<!DOCTYPE HTML', [('data', '<!DOCTYPE HTML')]), | ||||
|         ] | ||||
|         for html, expected in data: | ||||
|             self._run_check(html, expected) | ||||
|     def test_bogus_comments(self): | ||||
|         html = ('<! not really a comment >' | ||||
|                 '<! not a comment either -->' | ||||
|                 '<! -- close enough -->' | ||||
|                 '<!><!<-- this was an empty comment>' | ||||
|                 '<!!! another bogus comment !!!>') | ||||
|                 '<!!! another bogus comment !!!>' | ||||
|                 # see #32876 | ||||
|                 '<![with square brackets]!>' | ||||
|                 '<![\nmultiline\nbogusness\n]!>' | ||||
|                 '<![more brackets]-[and a hyphen]!>' | ||||
|                 '<![cdata[should be uppercase]]>' | ||||
|                 '<![CDATA [whitespaces are not ignored]]>' | ||||
|                 '<![CDATA]]>'  # required '[' after CDATA | ||||
|         ) | ||||
|         expected = [ | ||||
|             ('comment', ' not really a comment '), | ||||
|             ('comment', ' not a comment either --'), | ||||
|  | @@ -579,39 +600,65 @@ def test_broken_comments(self): | |||
|             ('comment', ''), | ||||
|             ('comment', '<-- this was an empty comment'), | ||||
|             ('comment', '!! another bogus comment !!!'), | ||||
|             ('comment', '[with square brackets]!'), | ||||
|             ('comment', '[\nmultiline\nbogusness\n]!'), | ||||
|             ('comment', '[more brackets]-[and a hyphen]!'), | ||||
|             ('comment', '[cdata[should be uppercase]]'), | ||||
|             ('comment', '[CDATA [whitespaces are not ignored]]'), | ||||
|             ('comment', '[CDATA]]'), | ||||
|         ] | ||||
|         self._run_check(html, expected) | ||||
| 
 | ||||
|     def test_broken_condcoms(self): | ||||
|         # these condcoms are missing the '--' after '<!' and before the '>' | ||||
|         # and they are considered bogus comments according to | ||||
|         # "8.2.4.42. Markup declaration open state" | ||||
|         html = ('<![if !(IE)]>broken condcom<![endif]>' | ||||
|                 '<![if ! IE]><link href="favicon.tiff"/><![endif]>' | ||||
|                 '<![if !IE 6]><img src="firefox.png" /><![endif]>' | ||||
|                 '<![if !ie 6]><b>foo</b><![endif]>' | ||||
|                 '<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>') | ||||
|         # According to the HTML5 specs sections "8.2.4.44 Bogus comment state" | ||||
|         # and "8.2.4.45 Markup declaration open state", comment tokens should | ||||
|         # be emitted instead of 'unknown decl', but calling unknown_decl | ||||
|         # provides more flexibility. | ||||
|         # See also Lib/_markupbase.py:parse_declaration | ||||
|         expected = [ | ||||
|             ('unknown decl', 'if !(IE)'), | ||||
|             ('comment', '[if !(IE)]'), | ||||
|             ('data', 'broken condcom'), | ||||
|             ('unknown decl', 'endif'), | ||||
|             ('unknown decl', 'if ! IE'), | ||||
|             ('comment', '[endif]'), | ||||
|             ('comment', '[if ! IE]'), | ||||
|             ('startendtag', 'link', [('href', 'favicon.tiff')]), | ||||
|             ('unknown decl', 'endif'), | ||||
|             ('unknown decl', 'if !IE 6'), | ||||
|             ('comment', '[endif]'), | ||||
|             ('comment', '[if !IE 6]'), | ||||
|             ('startendtag', 'img', [('src', 'firefox.png')]), | ||||
|             ('unknown decl', 'endif'), | ||||
|             ('unknown decl', 'if !ie 6'), | ||||
|             ('comment', '[endif]'), | ||||
|             ('comment', '[if !ie 6]'), | ||||
|             ('starttag', 'b', []), | ||||
|             ('data', 'foo'), | ||||
|             ('endtag', 'b'), | ||||
|             ('unknown decl', 'endif'), | ||||
|             ('unknown decl', 'if (!IE)|(lt IE 9)'), | ||||
|             ('comment', '[endif]'), | ||||
|             ('comment', '[if (!IE)|(lt IE 9)]'), | ||||
|             ('startendtag', 'img', [('src', 'mammoth.bmp')]), | ||||
|             ('unknown decl', 'endif') | ||||
|             ('comment', '[endif]') | ||||
|         ] | ||||
|         self._run_check(html, expected) | ||||
| 
 | ||||
|     def test_cdata_declarations(self): | ||||
|         # More tests should be added. See also "8.2.4.42. Markup | ||||
|         # declaration open state", "8.2.4.69. CDATA section state", | ||||
|         # and issue 32876 | ||||
|         html = ('<![CDATA[just some plain text]]>') | ||||
|         expected = [('unknown decl', 'CDATA[just some plain text')] | ||||
|         self._run_check(html, expected) | ||||
| 
 | ||||
|     def test_cdata_declarations_multiline(self): | ||||
|         html = ('<code><![CDATA[' | ||||
|                 '    if (a < b && a > b) {' | ||||
|                 '        printf("[<marquee>How?</marquee>]");' | ||||
|                 '    }' | ||||
|                 ']]></code>') | ||||
|         expected = [ | ||||
|             ('starttag', 'code', []), | ||||
|             ('unknown decl', | ||||
|              'CDATA[    if (a < b && a > b) {        ' | ||||
|              'printf("[<marquee>How?</marquee>]");    }'), | ||||
|             ('endtag', 'code') | ||||
|         ] | ||||
|         self._run_check(html, expected) | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Ezio Melotti
						Ezio Melotti