[3.14] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665) (#137772)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Miss Islington (bot) 2025-09-08 17:31:41 +02:00 committed by GitHub
parent 75c2d9f7c4
commit 61f7156965
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 90 additions and 27 deletions

View file

@ -146,6 +146,7 @@ def reset(self):
self.lasttag = '???' self.lasttag = '???'
self.interesting = interesting_normal self.interesting = interesting_normal
self.cdata_elem = None self.cdata_elem = None
self._support_cdata = True
self._escapable = True self._escapable = True
super().reset() super().reset()
@ -183,6 +184,19 @@ def clear_cdata_mode(self):
self.cdata_elem = None self.cdata_elem = None
self._escapable = True self._escapable = True
def _set_support_cdata(self, flag=True):
"""Enable or disable support of the CDATA sections.
If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
If disabled, "<![CDATA[" starts a bogus comment which ends with ">".
This method is not called by default. Its purpose is to be called
in custom handle_starttag() and handle_endtag() methods, with
value that depends on the adjusted current node.
The flag is stored in self._support_cdata, which reset() sets to True.
See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
for details.
"""
self._support_cdata = flag
# Internal -- handle data as far as reasonable. May leave state # Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is # and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker. # true, force handling all data as if followed by EOF marker.
@ -257,7 +271,7 @@ def goahead(self, end):
j -= len(suffix) j -= len(suffix)
break break
self.handle_comment(rawdata[i+4:j]) self.handle_comment(rawdata[i+4:j])
elif startswith("<![CDATA[", i): elif startswith("<![CDATA[", i) and self._support_cdata:
self.unknown_decl(rawdata[i+3:]) self.unknown_decl(rawdata[i+3:])
elif rawdata[i:i+9].lower() == '<!doctype': elif rawdata[i:i+9].lower() == '<!doctype':
self.handle_decl(rawdata[i+2:]) self.handle_decl(rawdata[i+2:])
@ -333,8 +347,12 @@ def parse_html_declaration(self, i):
if rawdata[i:i+4] == '<!--': if rawdata[i:i+4] == '<!--':
# this case is actually already handled in goahead() # this case is actually already handled in goahead()
return self.parse_comment(i) return self.parse_comment(i)
elif rawdata[i:i+9] == '<![CDATA[': elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
return self.parse_marked_section(i) j = rawdata.find(']]>', i+9)
if j < 0:
return -1
self.unknown_decl(rawdata[i+3: j])
return j + 3
elif rawdata[i:i+9].lower() == '<!doctype': elif rawdata[i:i+9].lower() == '<!doctype':
# find the closing > # find the closing >
gtpos = rawdata.find('>', i+9) gtpos = rawdata.find('>', i+9)

View file

@ -10,10 +10,13 @@
class EventCollector(html.parser.HTMLParser): class EventCollector(html.parser.HTMLParser):
def __init__(self, *args, **kw): def __init__(self, *args, autocdata=False, **kw):
self.autocdata = autocdata
self.events = [] self.events = []
self.append = self.events.append self.append = self.events.append
html.parser.HTMLParser.__init__(self, *args, **kw) html.parser.HTMLParser.__init__(self, *args, **kw)
if autocdata:
self._set_support_cdata(False)
def get_events(self): def get_events(self):
# Normalize the list of events so that buffer artefacts don't # Normalize the list of events so that buffer artefacts don't
@ -34,12 +37,16 @@ def get_events(self):
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
self.append(("starttag", tag, attrs)) self.append(("starttag", tag, attrs))
if self.autocdata and tag == 'svg':
self._set_support_cdata(True)
def handle_startendtag(self, tag, attrs): def handle_startendtag(self, tag, attrs):
self.append(("startendtag", tag, attrs)) self.append(("startendtag", tag, attrs))
def handle_endtag(self, tag): def handle_endtag(self, tag):
self.append(("endtag", tag)) self.append(("endtag", tag))
if self.autocdata and tag == 'svg':
self._set_support_cdata(False)
# all other markup # all other markup
@ -767,10 +774,6 @@ def test_eof_in_declarations(self):
('<!', [('comment', '')]), ('<!', [('comment', '')]),
('<!-', [('comment', '-')]), ('<!-', [('comment', '-')]),
('<![', [('comment', '[')]), ('<![', [('comment', '[')]),
('<![CDATA[', [('unknown decl', 'CDATA[')]),
('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
('<!DOCTYPE', [('decl', 'DOCTYPE')]), ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
('<!DOCTYPE ', [('decl', 'DOCTYPE ')]), ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
('<!DOCTYPE html', [('decl', 'DOCTYPE html')]), ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
@ -783,6 +786,18 @@ def test_eof_in_declarations(self):
for html, expected in data: for html, expected in data:
self._run_check(html, expected) self._run_check(html, expected)
# Check how an unterminated "<![CDATA[" (input ends before "]]>") is reported.
@support.subTests('content', ['', 'x', 'x]', 'x]]'])
def test_eof_in_cdata(self, content):
# Default collector: the rest of the input is expected as an
# 'unknown decl' event.
self._run_check('<![CDATA[' + content,
[('unknown decl', 'CDATA[' + content)])
# autocdata=True makes EventCollector call _set_support_cdata(False)
# at construction, so outside <svg> the same input is expected as a
# 'comment' event instead.
self._run_check('<![CDATA[' + content,
[('comment', '[CDATA[' + content)],
collector=EventCollector(autocdata=True))
# Inside <svg><text>, with the default collector, the unterminated
# section is again expected as an 'unknown decl' event.
self._run_check('<svg><text y="100"><![CDATA[' + content,
[('starttag', 'svg', []),
('starttag', 'text', [('y', '100')]),
('unknown decl', 'CDATA[' + content)])
def test_bogus_comments(self): def test_bogus_comments(self):
html = ('<!ELEMENT br EMPTY>' html = ('<!ELEMENT br EMPTY>'
'<! not really a comment >' '<! not really a comment >'
@ -845,28 +860,53 @@ def test_broken_condcoms(self):
] ]
self._run_check(html, expected) self._run_check(html, expected)
def test_cdata_declarations(self): @support.subTests('content', [
# More tests should be added. See also "8.2.4.42. Markup 'just some plain text',
# declaration open state", "8.2.4.69. CDATA section state", '<!-- not a comment -->',
# and issue 32876 '&not-an-entity-ref;',
html = ('<![CDATA[just some plain text]]>') "<not a='start tag'>",
expected = [('unknown decl', 'CDATA[just some plain text')] '',
self._run_check(html, expected) '[[I have many brackets]]',
'I have a > in the middle',
def test_cdata_declarations_multiline(self): 'I have a ]] in the middle',
html = ('<code><![CDATA[' '] ]>',
' if (a < b && a > b) {' ']] >',
' printf("[<marquee>How?</marquee>]");' ('\n'
' }' ' if (a < b && a > b) {\n'
']]></code>') ' printf("[<marquee>How?</marquee>]");\n'
' }\n'),
])
def test_cdata_section_content(self, content):
# See "13.2.5.42 Markup declaration open state",
# "13.2.5.69 CDATA section state", and issue bpo-32876.
html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>'
expected = [ expected = [
('starttag', 'code', []), ('starttag', 'svg', []),
('unknown decl', ('starttag', 'text', [('y', '100')]),
'CDATA[ if (a < b && a > b) { ' ('unknown decl', 'CDATA[' + content),
'printf("[<marquee>How?</marquee>]"); }'), ('endtag', 'text'),
('endtag', 'code') ('endtag', 'svg'),
] ]
self._run_check(html, expected) self._run_check(html, expected)
self._run_check(html, expected, collector=EventCollector(autocdata=True))
def test_cdata_section(self):
# See "13.2.5.42 Markup declaration open state".
# The same "<![CDATA[foo<br>bar]]>" markup appears before, inside and
# after an <svg> element. The autocdata collector enables CDATA support
# on the 'svg' start tag and disables it again on the 'svg' end tag.
html = ('<![CDATA[foo<br>bar]]>'
'<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>'
'<![CDATA[foo<br>bar]]>')
# Outside <svg> the markup is expected as a comment that ends at the
# first ">", followed by the remaining text as data; inside <svg> it is
# expected as a single 'unknown decl' event that runs up to "]]>".
expected = [
('comment', '[CDATA[foo<br'),
('data', 'bar]]>'),
('starttag', 'svg', []),
('starttag', 'text', [('y', '100')]),
('unknown decl', 'CDATA[foo<br>bar'),
('endtag', 'text'),
('endtag', 'svg'),
('comment', '[CDATA[foo<br'),
('data', 'bar]]>'),
]
self._run_check(html, expected, collector=EventCollector(autocdata=True))
def test_convert_charrefs_dropped_text(self): def test_convert_charrefs_dropped_text(self):
# #23144: make sure that all the events are triggered when # #23144: make sure that all the events are triggered when

View file

@ -0,0 +1,5 @@
Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to
the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section.
Add private method ``_set_support_cdata()`` which can be used to specify
how to parse ``<![CDATA[`` --- as a CDATA section in foreign content
(SVG or MathML) or as a bogus comment in the HTML namespace.