[3.9] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665) (GH-137774) (GH-139661)

"] ]>" and "]] >" no longer end the CDATA section.

Make CDATA section parsing  context depending.
Add private method HTMLParser._set_support_cdata() to change the context.
If called with True, "<[CDATA[" starts a CDATA section which ends with "]]>".
If called with False, "<[CDATA[" starts a bogus comments which ends with ">".
(cherry picked from commit 0cbbfc4621)
(cherry picked from commit dcf24768c9)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Łukasz Langa <lukasz@langa.pl>
This commit is contained in:
Miss Islington (bot) 2025-10-07 21:15:04 +02:00 committed by GitHub
parent f3d8338cd5
commit ed904d5bbf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 104 additions and 29 deletions

View file

@ -9,10 +9,13 @@
class EventCollector(html.parser.HTMLParser):
def __init__(self, *args, **kw):
def __init__(self, *args, autocdata=False, **kw):
self.autocdata = autocdata
self.events = []
self.append = self.events.append
html.parser.HTMLParser.__init__(self, *args, **kw)
if autocdata:
self._set_support_cdata(False)
def get_events(self):
# Normalize the list of events so that buffer artefacts don't
@ -33,12 +36,16 @@ def get_events(self):
def handle_starttag(self, tag, attrs):
self.append(("starttag", tag, attrs))
if self.autocdata and tag == 'svg':
self._set_support_cdata(True)
def handle_startendtag(self, tag, attrs):
self.append(("startendtag", tag, attrs))
def handle_endtag(self, tag):
self.append(("endtag", tag))
if self.autocdata and tag == 'svg':
self._set_support_cdata(False)
# all other markup
@ -739,10 +746,6 @@ def test_eof_in_declarations(self):
('<!', [('comment', '')]),
('<!-', [('comment', '-')]),
('<![', [('comment', '[')]),
('<![CDATA[', [('unknown decl', 'CDATA[')]),
('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
('<!DOCTYPE', [('decl', 'DOCTYPE')]),
('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
@ -755,6 +758,18 @@ def test_eof_in_declarations(self):
for html, expected in data:
self._run_check(html, expected)
@support.subTests('content', ['', 'x', 'x]', 'x]]'])
def test_eof_in_cdata(self, content):
self._run_check('<![CDATA[' + content,
[('unknown decl', 'CDATA[' + content)])
self._run_check('<![CDATA[' + content,
[('comment', '[CDATA[' + content)],
collector=EventCollector(autocdata=True))
self._run_check('<svg><text y="100"><![CDATA[' + content,
[('starttag', 'svg', []),
('starttag', 'text', [('y', '100')]),
('unknown decl', 'CDATA[' + content)])
def test_bogus_comments(self):
html = ('<!ELEMENT br EMPTY>'
'<! not really a comment >'
@ -804,8 +819,57 @@ def test_broken_condcoms(self):
('startendtag', 'img', [('src', 'mammoth.bmp')]),
('unknown decl', 'endif')
]
self._run_check(html, expected)
@support.subTests('content', [
'just some plain text',
'<!-- not a comment -->',
'&not-an-entity-ref;',
"<not a='start tag'>",
'',
'[[I have many brackets]]',
'I have a > in the middle',
'I have a ]] in the middle',
'] ]>',
']] >',
('\n'
' if (a < b && a > b) {\n'
' printf("[<marquee>How?</marquee>]");\n'
' }\n'),
])
def test_cdata_section_content(self, content):
# See "13.2.5.42 Markup declaration open state",
# "13.2.5.69 CDATA section state", and issue bpo-32876.
html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>'
expected = [
('starttag', 'svg', []),
('starttag', 'text', [('y', '100')]),
('unknown decl', 'CDATA[' + content),
('endtag', 'text'),
('endtag', 'svg'),
]
self._run_check(html, expected)
self._run_check(html, expected, collector=EventCollector(autocdata=True))
def test_cdata_section(self):
# See "13.2.5.42 Markup declaration open state".
html = ('<![CDATA[foo<br>bar]]>'
'<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>'
'<![CDATA[foo<br>bar]]>')
expected = [
('comment', '[CDATA[foo<br'),
('data', 'bar]]>'),
('starttag', 'svg', []),
('starttag', 'text', [('y', '100')]),
('unknown decl', 'CDATA[foo<br>bar'),
('endtag', 'text'),
('endtag', 'svg'),
('comment', '[CDATA[foo<br'),
('data', 'bar]]>'),
]
self._run_check(html, expected, collector=EventCollector(autocdata=True))
def test_convert_charrefs_dropped_text(self):
# #23144: make sure that all the events are triggered when
# convert_charrefs is True, even if we don't call .close()
@ -1041,27 +1105,6 @@ def test_weird_chars_in_unquoted_attribute_values(self):
('starttag', 'form',
[('action', 'bogus|&#()value')])])
def test_invalid_keyword_error_exception(self):
# bpo-34480: check that subclasses that define an
# error method that raises an exception work
class InvalidMarkupException(Exception):
pass
class MyHTMLParser(html.parser.HTMLParser):
def error(self, message):
raise InvalidMarkupException(message)
parser = MyHTMLParser()
with self.assertRaises(InvalidMarkupException):
parser.feed('<![invalid>')
def test_invalid_keyword_error_pass(self):
# bpo-34480: check that subclasses that define an
# error method that doesn't raise an exception work
class MyHTMLParser(html.parser.HTMLParser):
def error(self, message):
pass
parser = MyHTMLParser()
self.assertEqual(parser.feed('<![invalid>'), None)
if __name__ == "__main__":
unittest.main()