diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 0a1dd3b7d3b..1b8b6ea0e5a 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -278,7 +278,7 @@ def parse_html_declaration(self, i):
if rawdata[i:i+4] == ''
''
''
- '')
+ ''
+ # see #32876
+ ''
+ ''
+ ''
+ ''
+ ''
+ '' # required '[' after CDATA
+ )
expected = [
('comment', ' not really a comment '),
('comment', ' not a comment either --'),
@@ -579,39 +600,65 @@ def test_broken_comments(self):
('comment', ''),
('comment', '<-- this was an empty comment'),
('comment', '!! another bogus comment !!!'),
+ ('comment', '[with square brackets]!'),
+ ('comment', '[\nmultiline\nbogusness\n]!'),
+ ('comment', '[more brackets]-[and a hyphen]!'),
+ ('comment', '[cdata[should be uppercase]]'),
+ ('comment', '[CDATA [whitespaces are not ignored]]'),
+ ('comment', '[CDATA]]'),
]
self._run_check(html, expected)
def test_broken_condcoms(self):
# these condcoms are missing the '--' after '<!' and before the '>'
+ # and they are considered bogus comments according to
+ # "8.2.4.42. Markup declaration open state"
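html = ('<![if !(IE)]>broken condcom<![endif]>'
'<![if ! IE]><link href="favicon.tiff"/><![endif]>'
'<![if !IE 6]><img src="firefox.png" /><![endif]>'
'<![if !ie 6]><b>foo</b><![endif]>'
'<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')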
- # According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
- # and "8.2.4.45 Markup declaration open state", comment tokens should
- # be emitted instead of 'unknown decl', but calling unknown_decl
- # provides more flexibility.
- # See also Lib/_markupbase.py:parse_declaration
expected = [
- ('unknown decl', 'if !(IE)'),
+ ('comment', '[if !(IE)]'),
('data', 'broken condcom'),
- ('unknown decl', 'endif'),
- ('unknown decl', 'if ! IE'),
+ ('comment', '[endif]'),
+ ('comment', '[if ! IE]'),
('startendtag', 'link', [('href', 'favicon.tiff')]),
- ('unknown decl', 'endif'),
- ('unknown decl', 'if !IE 6'),
+ ('comment', '[endif]'),
+ ('comment', '[if !IE 6]'),
('startendtag', 'img', [('src', 'firefox.png')]),
- ('unknown decl', 'endif'),
- ('unknown decl', 'if !ie 6'),
+ ('comment', '[endif]'),
+ ('comment', '[if !ie 6]'),
('starttag', 'b', []),
('data', 'foo'),
('endtag', 'b'),
- ('unknown decl', 'endif'),
- ('unknown decl', 'if (!IE)|(lt IE 9)'),
+ ('comment', '[endif]'),
+ ('comment', '[if (!IE)|(lt IE 9)]'),
('startendtag', 'img', [('src', 'mammoth.bmp')]),
- ('unknown decl', 'endif')
+ ('comment', '[endif]')
+ ]
+ self._run_check(html, expected)
+
+ def test_cdata_declarations(self):
+ # More tests should be added. See also "8.2.4.42. Markup
+ # declaration open state", "8.2.4.69. CDATA section state",
+ # and issue 32876
+ html = ('<![CDATA[just some plain text]]>')
+ expected = [('unknown decl', 'CDATA[just some plain text')]
+ self._run_check(html, expected)
+
+ def test_cdata_declarations_multiline(self):
+ html = (' b) {'
+ ' printf("[]");'
+ ' }'
+ ']]>
')
+ expected = [
+ ('starttag', 'code', []),
+ ('unknown decl',
+ 'CDATA[ if (a < b && a > b) { '
+ 'printf("[]"); }'),
+ ('endtag', 'code')
]
self._run_check(html, expected)
diff --git a/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst b/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst
new file mode 100644
index 00000000000..42107de75c7
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst
@@ -0,0 +1,2 @@
+Fix handling of invalid markup declarations in
+:class:`html.parser.HTMLParser`.