gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665)

"] ]>" and "]] >" no longer end the CDATA section.

Make CDATA section parsing  context depending.
Add private method HTMLParser._set_support_cdata() to change the context.
If called with True, "<[CDATA[" starts a CDATA section which ends with "]]>".
If called with False, "<[CDATA[" starts a bogus comments which ends with ">".
This commit is contained in:
Serhiy Storchaka 2025-08-14 21:13:22 +03:00 committed by GitHub
parent 04f8ef663b
commit 0cbbfc4621
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 95 additions and 26 deletions

View file

@ -146,6 +146,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
self._support_cdata = True
self._escapable = True
super().reset()
@ -183,6 +184,19 @@ def clear_cdata_mode(self):
self.cdata_elem = None
self._escapable = True
def _set_support_cdata(self, flag=True):
"""Enable or disable support of the CDATA sections.
If enabled, "<[CDATA[" starts a CDATA section which ends with "]]>".
If disabled, "<[CDATA[" starts a bogus comments which ends with ">".
This method is not called by default. Its purpose is to be called
in custom handle_starttag() and handle_endtag() methods, with
value that depends on the adjusted current node.
See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
for details.
"""
self._support_cdata = flag
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
@ -258,7 +272,10 @@ def goahead(self, end):
break
self.handle_comment(rawdata[i+4:j])
elif startswith("<![CDATA[", i):
self.unknown_decl(rawdata[i+3:])
if self._support_cdata:
self.unknown_decl(rawdata[i+3:])
else:
self.handle_comment(rawdata[i+1:])
elif rawdata[i:i+9].lower() == '<!doctype':
self.handle_decl(rawdata[i+2:])
elif startswith("<!", i):
@ -334,7 +351,14 @@ def parse_html_declaration(self, i):
# this case is actually already handled in goahead()
return self.parse_comment(i)
elif rawdata[i:i+9] == '<![CDATA[':
return self.parse_marked_section(i)
if self._support_cdata:
j = rawdata.find(']]>', i+9)
if j < 0:
return -1
self.unknown_decl(rawdata[i+3: j])
return j + 3
else:
return self.parse_bogus_comment(i)
elif rawdata[i:i+9].lower() == '<!doctype':
# find the closing >
gtpos = rawdata.find('>', i+9)