gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665)

"] ]>" and "]] >" no longer end the CDATA section. Make CDATA section parsing context-dependent. Add private method HTMLParser._set_support_cdata() to change the context. If called with True, "<![CDATA[" starts a CDATA section which ends with "]]>". If called with False, "<![CDATA[" starts a bogus comment which ends with ">".
2025-12-08 06:10:17 +00:00 · 2025-08-14 21:13:22 +03:00 · 2025-08-14 21:13:22 +03:00 · 0cbbfc4621
commit 0cbbfc4621
parent 04f8ef663b
3 changed files with 95 additions and 26 deletions
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@ -146,6 +146,7 @@ def reset(self):
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
+        self._support_cdata = True
        self._escapable = True
        super().reset()

@ -183,6 +184,19 @@ def clear_cdata_mode(self):
        self.cdata_elem = None
        self._escapable = True

+    def _set_support_cdata(self, flag=True):
+        """Enable or disable support of the CDATA sections.
+        If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
+        If disabled, "<![CDATA[" starts a bogus comment which ends with ">".
+
+        This method is not called by default. Its purpose is to be called
+        in custom handle_starttag() and handle_endtag() methods, with
+        value that depends on the adjusted current node.
+        See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+        for details.
+        """
+        self._support_cdata = flag
+
    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
@ -258,7 +272,10 @@ def goahead(self, end):
                                break
                        self.handle_comment(rawdata[i+4:j])
                    elif startswith("<![CDATA[", i):
-                        self.unknown_decl(rawdata[i+3:])
+                        if self._support_cdata:
+                            self.unknown_decl(rawdata[i+3:])
+                        else:
+                            self.handle_comment(rawdata[i+1:])
                    elif rawdata[i:i+9].lower() == '<!doctype':
                        self.handle_decl(rawdata[i+2:])
                    elif startswith("<!", i):
@ -334,7 +351,14 @@ def parse_html_declaration(self, i):
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+9] == '<![CDATA[':
-            return self.parse_marked_section(i)
+            if self._support_cdata:
+                j = rawdata.find(']]>', i+9)
+                if j < 0:
+                    return -1
+                self.unknown_decl(rawdata[i+3: j])
+                return j + 3
+            else:
+                return self.parse_bogus_comment(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)