mirror of
https://github.com/python/cpython.git
synced 2025-11-01 14:11:41 +00:00
[3.14] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665) (#137772)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
parent
75c2d9f7c4
commit
61f7156965
3 changed files with 90 additions and 27 deletions
|
|
@ -146,6 +146,7 @@ def reset(self):
|
||||||
self.lasttag = '???'
|
self.lasttag = '???'
|
||||||
self.interesting = interesting_normal
|
self.interesting = interesting_normal
|
||||||
self.cdata_elem = None
|
self.cdata_elem = None
|
||||||
|
self._support_cdata = True
|
||||||
self._escapable = True
|
self._escapable = True
|
||||||
super().reset()
|
super().reset()
|
||||||
|
|
||||||
|
|
@ -183,6 +184,19 @@ def clear_cdata_mode(self):
|
||||||
self.cdata_elem = None
|
self.cdata_elem = None
|
||||||
self._escapable = True
|
self._escapable = True
|
||||||
|
|
||||||
|
def _set_support_cdata(self, flag=True):
|
||||||
|
"""Enable or disable support of the CDATA sections.
|
||||||
|
If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
|
||||||
|
If disabled, "<![CDATA[" starts a bogus comment which ends with ">".
|
||||||
|
|
||||||
|
This method is not called by default. Its purpose is to be called
|
||||||
|
in custom handle_starttag() and handle_endtag() methods, with
|
||||||
|
a value that depends on the adjusted current node.
|
||||||
|
See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
|
||||||
|
for details.
|
||||||
|
"""
|
||||||
|
self._support_cdata = flag
|
||||||
|
|
||||||
# Internal -- handle data as far as reasonable. May leave state
|
# Internal -- handle data as far as reasonable. May leave state
|
||||||
# and data to be processed by a subsequent call. If 'end' is
|
# and data to be processed by a subsequent call. If 'end' is
|
||||||
# true, force handling all data as if followed by EOF marker.
|
# true, force handling all data as if followed by EOF marker.
|
||||||
|
|
@ -257,7 +271,7 @@ def goahead(self, end):
|
||||||
j -= len(suffix)
|
j -= len(suffix)
|
||||||
break
|
break
|
||||||
self.handle_comment(rawdata[i+4:j])
|
self.handle_comment(rawdata[i+4:j])
|
||||||
elif startswith("<![CDATA[", i):
|
elif startswith("<![CDATA[", i) and self._support_cdata:
|
||||||
self.unknown_decl(rawdata[i+3:])
|
self.unknown_decl(rawdata[i+3:])
|
||||||
elif rawdata[i:i+9].lower() == '<!doctype':
|
elif rawdata[i:i+9].lower() == '<!doctype':
|
||||||
self.handle_decl(rawdata[i+2:])
|
self.handle_decl(rawdata[i+2:])
|
||||||
|
|
@ -333,8 +347,12 @@ def parse_html_declaration(self, i):
|
||||||
if rawdata[i:i+4] == '<!--':
|
if rawdata[i:i+4] == '<!--':
|
||||||
# this case is actually already handled in goahead()
|
# this case is actually already handled in goahead()
|
||||||
return self.parse_comment(i)
|
return self.parse_comment(i)
|
||||||
elif rawdata[i:i+9] == '<![CDATA[':
|
elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
|
||||||
return self.parse_marked_section(i)
|
j = rawdata.find(']]>', i+9)
|
||||||
|
if j < 0:
|
||||||
|
return -1
|
||||||
|
self.unknown_decl(rawdata[i+3: j])
|
||||||
|
return j + 3
|
||||||
elif rawdata[i:i+9].lower() == '<!doctype':
|
elif rawdata[i:i+9].lower() == '<!doctype':
|
||||||
# find the closing >
|
# find the closing >
|
||||||
gtpos = rawdata.find('>', i+9)
|
gtpos = rawdata.find('>', i+9)
|
||||||
|
|
|
||||||
|
|
@ -10,10 +10,13 @@
|
||||||
|
|
||||||
class EventCollector(html.parser.HTMLParser):
|
class EventCollector(html.parser.HTMLParser):
|
||||||
|
|
||||||
def __init__(self, *args, **kw):
|
def __init__(self, *args, autocdata=False, **kw):
|
||||||
|
self.autocdata = autocdata
|
||||||
self.events = []
|
self.events = []
|
||||||
self.append = self.events.append
|
self.append = self.events.append
|
||||||
html.parser.HTMLParser.__init__(self, *args, **kw)
|
html.parser.HTMLParser.__init__(self, *args, **kw)
|
||||||
|
if autocdata:
|
||||||
|
self._set_support_cdata(False)
|
||||||
|
|
||||||
def get_events(self):
|
def get_events(self):
|
||||||
# Normalize the list of events so that buffer artefacts don't
|
# Normalize the list of events so that buffer artefacts don't
|
||||||
|
|
@ -34,12 +37,16 @@ def get_events(self):
|
||||||
|
|
||||||
def handle_starttag(self, tag, attrs):
|
def handle_starttag(self, tag, attrs):
|
||||||
self.append(("starttag", tag, attrs))
|
self.append(("starttag", tag, attrs))
|
||||||
|
if self.autocdata and tag == 'svg':
|
||||||
|
self._set_support_cdata(True)
|
||||||
|
|
||||||
def handle_startendtag(self, tag, attrs):
|
def handle_startendtag(self, tag, attrs):
|
||||||
self.append(("startendtag", tag, attrs))
|
self.append(("startendtag", tag, attrs))
|
||||||
|
|
||||||
def handle_endtag(self, tag):
|
def handle_endtag(self, tag):
|
||||||
self.append(("endtag", tag))
|
self.append(("endtag", tag))
|
||||||
|
if self.autocdata and tag == 'svg':
|
||||||
|
self._set_support_cdata(False)
|
||||||
|
|
||||||
# all other markup
|
# all other markup
|
||||||
|
|
||||||
|
|
@ -767,10 +774,6 @@ def test_eof_in_declarations(self):
|
||||||
('<!', [('comment', '')]),
|
('<!', [('comment', '')]),
|
||||||
('<!-', [('comment', '-')]),
|
('<!-', [('comment', '-')]),
|
||||||
('<![', [('comment', '[')]),
|
('<![', [('comment', '[')]),
|
||||||
('<![CDATA[', [('unknown decl', 'CDATA[')]),
|
|
||||||
('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
|
|
||||||
('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
|
|
||||||
('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
|
|
||||||
('<!DOCTYPE', [('decl', 'DOCTYPE')]),
|
('<!DOCTYPE', [('decl', 'DOCTYPE')]),
|
||||||
('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
|
('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
|
||||||
('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
|
('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
|
||||||
|
|
@ -783,6 +786,18 @@ def test_eof_in_declarations(self):
|
||||||
for html, expected in data:
|
for html, expected in data:
|
||||||
self._run_check(html, expected)
|
self._run_check(html, expected)
|
||||||
|
|
||||||
|
@support.subTests('content', ['', 'x', 'x]', 'x]]'])
|
||||||
|
def test_eof_in_cdata(self, content):
|
||||||
|
self._run_check('<![CDATA[' + content,
|
||||||
|
[('unknown decl', 'CDATA[' + content)])
|
||||||
|
self._run_check('<![CDATA[' + content,
|
||||||
|
[('comment', '[CDATA[' + content)],
|
||||||
|
collector=EventCollector(autocdata=True))
|
||||||
|
self._run_check('<svg><text y="100"><![CDATA[' + content,
|
||||||
|
[('starttag', 'svg', []),
|
||||||
|
('starttag', 'text', [('y', '100')]),
|
||||||
|
('unknown decl', 'CDATA[' + content)])
|
||||||
|
|
||||||
def test_bogus_comments(self):
|
def test_bogus_comments(self):
|
||||||
html = ('<!ELEMENT br EMPTY>'
|
html = ('<!ELEMENT br EMPTY>'
|
||||||
'<! not really a comment >'
|
'<! not really a comment >'
|
||||||
|
|
@ -845,28 +860,53 @@ def test_broken_condcoms(self):
|
||||||
]
|
]
|
||||||
self._run_check(html, expected)
|
self._run_check(html, expected)
|
||||||
|
|
||||||
def test_cdata_declarations(self):
|
@support.subTests('content', [
|
||||||
# More tests should be added. See also "8.2.4.42. Markup
|
'just some plain text',
|
||||||
# declaration open state", "8.2.4.69. CDATA section state",
|
'<!-- not a comment -->',
|
||||||
# and issue 32876
|
'&not-an-entity-ref;',
|
||||||
html = ('<![CDATA[just some plain text]]>')
|
"<not a='start tag'>",
|
||||||
expected = [('unknown decl', 'CDATA[just some plain text')]
|
'',
|
||||||
self._run_check(html, expected)
|
'[[I have many brackets]]',
|
||||||
|
'I have a > in the middle',
|
||||||
def test_cdata_declarations_multiline(self):
|
'I have a ]] in the middle',
|
||||||
html = ('<code><![CDATA['
|
'] ]>',
|
||||||
' if (a < b && a > b) {'
|
']] >',
|
||||||
' printf("[<marquee>How?</marquee>]");'
|
('\n'
|
||||||
' }'
|
' if (a < b && a > b) {\n'
|
||||||
']]></code>')
|
' printf("[<marquee>How?</marquee>]");\n'
|
||||||
|
' }\n'),
|
||||||
|
])
|
||||||
|
def test_cdata_section_content(self, content):
|
||||||
|
# See "13.2.5.42 Markup declaration open state",
|
||||||
|
# "13.2.5.69 CDATA section state", and issue bpo-32876.
|
||||||
|
html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>'
|
||||||
expected = [
|
expected = [
|
||||||
('starttag', 'code', []),
|
('starttag', 'svg', []),
|
||||||
('unknown decl',
|
('starttag', 'text', [('y', '100')]),
|
||||||
'CDATA[ if (a < b && a > b) { '
|
('unknown decl', 'CDATA[' + content),
|
||||||
'printf("[<marquee>How?</marquee>]"); }'),
|
('endtag', 'text'),
|
||||||
('endtag', 'code')
|
('endtag', 'svg'),
|
||||||
]
|
]
|
||||||
self._run_check(html, expected)
|
self._run_check(html, expected)
|
||||||
|
self._run_check(html, expected, collector=EventCollector(autocdata=True))
|
||||||
|
|
||||||
|
def test_cdata_section(self):
|
||||||
|
# See "13.2.5.42 Markup declaration open state".
|
||||||
|
html = ('<![CDATA[foo<br>bar]]>'
|
||||||
|
'<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>'
|
||||||
|
'<![CDATA[foo<br>bar]]>')
|
||||||
|
expected = [
|
||||||
|
('comment', '[CDATA[foo<br'),
|
||||||
|
('data', 'bar]]>'),
|
||||||
|
('starttag', 'svg', []),
|
||||||
|
('starttag', 'text', [('y', '100')]),
|
||||||
|
('unknown decl', 'CDATA[foo<br>bar'),
|
||||||
|
('endtag', 'text'),
|
||||||
|
('endtag', 'svg'),
|
||||||
|
('comment', '[CDATA[foo<br'),
|
||||||
|
('data', 'bar]]>'),
|
||||||
|
]
|
||||||
|
self._run_check(html, expected, collector=EventCollector(autocdata=True))
|
||||||
|
|
||||||
def test_convert_charrefs_dropped_text(self):
|
def test_convert_charrefs_dropped_text(self):
|
||||||
# #23144: make sure that all the events are triggered when
|
# #23144: make sure that all the events are triggered when
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to
|
||||||
|
the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section.
|
||||||
|
Add private method ``_set_support_cdata()`` which can be used to specify
|
||||||
|
how to parse ``<![CDATA[`` --- as a CDATA section in foreign content
|
||||||
|
(SVG or MathML) or as a bogus comment in the HTML namespace.
|
||||||
Loading…
Add table
Add a link
Reference in a new issue