[3.9] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665) (GH-137774) (GH-139661)

"] ]>" and "]] >" no longer end the CDATA section.

Make CDATA section parsing context-dependent.
Add private method HTMLParser._set_support_cdata() to change the context.
If called with True, "<![CDATA[" starts a CDATA section which ends with "]]>".
If called with False, "<![CDATA[" starts a bogus comment which ends with ">".
(cherry picked from commit 0cbbfc4621)
(cherry picked from commit dcf24768c9)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Łukasz Langa <lukasz@langa.pl>
This commit is contained in:
Miss Islington (bot) 2025-10-07 21:15:04 +02:00 committed by GitHub
parent f3d8338cd5
commit ed904d5bbf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 104 additions and 29 deletions

View file

@ -127,6 +127,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
self._support_cdata = True
self._escapable = True
_markupbase.ParserBase.reset(self)
@ -164,6 +165,19 @@ def clear_cdata_mode(self):
self.cdata_elem = None
self._escapable = True
def _set_support_cdata(self, flag=True):
"""Enable or disable support of the CDATA sections.
If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
If disabled, "<![CDATA[" starts a bogus comment which ends with ">".
This method is not called by default. Its purpose is to be called
in custom handle_starttag() and handle_endtag() methods, with a
value that depends on the adjusted current node.
See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
for details.
"""
# Only records the choice; the parsing code checks this attribute when
# it encounters a "<![CDATA[" prefix.
self._support_cdata = flag
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
@ -238,7 +252,7 @@ def goahead(self, end):
j -= len(suffix)
break
self.handle_comment(rawdata[i+4:j])
elif startswith("<![CDATA[", i):
elif startswith("<![CDATA[", i) and self._support_cdata:
self.unknown_decl(rawdata[i+3:])
elif rawdata[i:i+9].lower() == '<!doctype':
self.handle_decl(rawdata[i+2:])
@ -314,8 +328,12 @@ def parse_html_declaration(self, i):
if rawdata[i:i+4] == '<!--':
# this case is actually already handled in goahead()
return self.parse_comment(i)
elif rawdata[i:i+3] == '<![':
return self.parse_marked_section(i)
elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
j = rawdata.find(']]>', i+9)
if j < 0:
return -1
self.unknown_decl(rawdata[i+3: j])
return j + 3
elif rawdata[i:i+9].lower() == '<!doctype':
# find the closing >
gtpos = rawdata.find('>', i+9)
@ -323,6 +341,15 @@ def parse_html_declaration(self, i):
return -1
self.handle_decl(rawdata[i+2:gtpos])
return gtpos+1
elif rawdata[i:i+3] == '<![':
j = rawdata.find('>', i+3)
if j < 0:
return -1
if rawdata[j-1] == ']':
self.unknown_decl(rawdata[i+3: j-1])
else:
self.handle_comment(rawdata[i+2: j])
return j + 1
else:
return self.parse_bogus_comment(i)