mirror of
https://github.com/python/cpython.git
synced 2025-10-20 16:33:53 +00:00
gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665)
"] ]>" and "]] >" no longer end the CDATA section. Make CDATA section parsing context-dependent. Add private method HTMLParser._set_support_cdata() to change the context. If called with True, "<![CDATA[" starts a CDATA section which ends with "]]>". If called with False, "<![CDATA[" starts a bogus comment which ends with ">".
This commit is contained in:
parent
04f8ef663b
commit
0cbbfc4621
3 changed files with 95 additions and 26 deletions
|
@ -146,6 +146,7 @@ def reset(self):
|
|||
self.lasttag = '???'
|
||||
self.interesting = interesting_normal
|
||||
self.cdata_elem = None
|
||||
self._support_cdata = True
|
||||
self._escapable = True
|
||||
super().reset()
|
||||
|
||||
|
@ -183,6 +184,19 @@ def clear_cdata_mode(self):
|
|||
self.cdata_elem = None
|
||||
self._escapable = True
|
||||
|
||||
def _set_support_cdata(self, flag=True):
|
||||
"""Enable or disable support of the CDATA sections.
|
||||
If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
|
||||
If disabled, "<![CDATA[" starts a bogus comment which ends with ">".
|
||||
|
||||
This method is not called by default. Its purpose is to be called
|
||||
in custom handle_starttag() and handle_endtag() methods, with
|
||||
value that depends on the adjusted current node.
|
||||
See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
|
||||
for details.
|
||||
"""
|
||||
self._support_cdata = flag
|
||||
|
||||
# Internal -- handle data as far as reasonable. May leave state
|
||||
# and data to be processed by a subsequent call. If 'end' is
|
||||
# true, force handling all data as if followed by EOF marker.
|
||||
|
@ -258,7 +272,10 @@ def goahead(self, end):
|
|||
break
|
||||
self.handle_comment(rawdata[i+4:j])
|
||||
elif startswith("<![CDATA[", i):
|
||||
self.unknown_decl(rawdata[i+3:])
|
||||
if self._support_cdata:
|
||||
self.unknown_decl(rawdata[i+3:])
|
||||
else:
|
||||
self.handle_comment(rawdata[i+1:])
|
||||
elif rawdata[i:i+9].lower() == '<!doctype':
|
||||
self.handle_decl(rawdata[i+2:])
|
||||
elif startswith("<!", i):
|
||||
|
@ -334,7 +351,14 @@ def parse_html_declaration(self, i):
|
|||
# this case is actually already handled in goahead()
|
||||
return self.parse_comment(i)
|
||||
elif rawdata[i:i+9] == '<![CDATA[':
|
||||
return self.parse_marked_section(i)
|
||||
if self._support_cdata:
|
||||
j = rawdata.find(']]>', i+9)
|
||||
if j < 0:
|
||||
return -1
|
||||
self.unknown_decl(rawdata[i+3: j])
|
||||
return j + 3
|
||||
else:
|
||||
return self.parse_bogus_comment(i)
|
||||
elif rawdata[i:i+9].lower() == '<!doctype':
|
||||
# find the closing >
|
||||
gtpos = rawdata.find('>', i+9)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue