[3.14] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665) (#137772)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
2025-11-01 14:11:41 +00:00 · 2025-09-08 17:31:41 +02:00 · 2025-09-08 17:31:41 +02:00 · 61f7156965
commit 61f7156965
parent 75c2d9f7c4
3 changed files with 90 additions and 27 deletions
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@ -146,6 +146,7 @@ def reset(self):
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        self._support_cdata = True
        self._escapable = True
        super().reset()
@ -183,6 +184,19 @@ def clear_cdata_mode(self):
        self.cdata_elem = None
        self._escapable = True
    def _set_support_cdata(self, flag=True):
        """Enable or disable support of the CDATA sections.

        If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
        If disabled, "<![CDATA[" starts a bogus comment which ends with ">".

        *flag* is stored as-is and consulted whenever "<![CDATA[" is
        encountered: a true value enables CDATA sections, a false value
        disables them.

        This method is not called by default. Its purpose is to be called
        in custom handle_starttag() and handle_endtag() methods, with a
        value that depends on the adjusted current node.
        See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
        for details.
        """
        self._support_cdata = flag
    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
@ -257,7 +271,7 @@ def goahead(self, end):
                                j -= len(suffix)
                                break
                        self.handle_comment(rawdata[i+4:j])
-                    elif startswith("<![CDATA[", i):
+                    elif startswith("<![CDATA[", i) and self._support_cdata:
                        self.unknown_decl(rawdata[i+3:])
                    elif rawdata[i:i+9].lower() == '<!doctype':
                        self.handle_decl(rawdata[i+2:])
@ -333,8 +347,12 @@ def parse_html_declaration(self, i):
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
-        elif rawdata[i:i+9] == '<![CDATA[':
+        elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
-            return self.parse_marked_section(i)
+            j = rawdata.find(']]>', i+9)
            if j < 0:
                return -1
            self.unknown_decl(rawdata[i+3: j])
            return j + 3
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@ -10,10 +10,13 @@
 class EventCollector(html.parser.HTMLParser):
-    def __init__(self, *args, **kw):
+    def __init__(self, *args, autocdata=False, **kw):
        self.autocdata = autocdata
        self.events = []
        self.append = self.events.append
        html.parser.HTMLParser.__init__(self, *args, **kw)
        if autocdata:
            self._set_support_cdata(False)
    def get_events(self):
        # Normalize the list of events so that buffer artefacts don't
@ -34,12 +37,16 @@ def get_events(self):
    def handle_starttag(self, tag, attrs):
        self.append(("starttag", tag, attrs))
        # In autocdata mode, entering an <svg> element switches CDATA
        # section support on for the markup that follows.
        if self.autocdata and tag == 'svg':
            self._set_support_cdata(True)
    def handle_startendtag(self, tag, attrs):
        self.append(("startendtag", tag, attrs))
    def handle_endtag(self, tag):
        self.append(("endtag", tag))
        # In autocdata mode, leaving an <svg> element switches CDATA
        # section support off again, so a later "<![CDATA[" is parsed
        # as a bogus comment.
        if self.autocdata and tag == 'svg':
            self._set_support_cdata(False)
    # all other markup
@ -767,10 +774,6 @@ def test_eof_in_declarations(self):
            ('<!', [('comment', '')]),
            ('<!-', [('comment', '-')]),
            ('<![', [('comment', '[')]),
            ('<![CDATA[', [('unknown decl', 'CDATA[')]),
            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
            ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
            ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
            ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
@ -783,6 +786,18 @@ def test_eof_in_declarations(self):
        for html, expected in data:
            self._run_check(html, expected)
    @support.subTests('content', ['', 'x', 'x]', 'x]]'])
    def test_eof_in_cdata(self, content):
        # None of the inputs below contains the "]]>" terminator, so the
        # input ends while the "<![CDATA[" construct is still open.

        # Default collector: reported as an unknown declaration.
        self._run_check('<![CDATA[' + content,
                        [('unknown decl', 'CDATA[' + content)])
        # autocdata collector with no enclosing <svg>: CDATA support is
        # disabled, so the same input is reported as a comment instead.
        self._run_check('<![CDATA[' + content,
                        [('comment', '[CDATA[' + content)],
                        collector=EventCollector(autocdata=True))
        # Default collector again, this time after <svg><text ...> start
        # tags: reported as an unknown declaration.
        self._run_check('<svg><text y="100"><![CDATA[' + content,
                        [('starttag', 'svg', []),
                         ('starttag', 'text', [('y', '100')]),
                         ('unknown decl', 'CDATA[' + content)])
    def test_bogus_comments(self):
        html = ('<!ELEMENT br EMPTY>'
                '<! not really a comment >'
@ -845,28 +860,53 @@ def test_broken_condcoms(self):
        ]
        self._run_check(html, expected)
-    def test_cdata_declarations(self):
+    @support.subTests('content', [
-        # More tests should be added. See also "8.2.4.42. Markup
+        'just some plain text',
-        # declaration open state", "8.2.4.69. CDATA section state",
+        '<!-- not a comment -->',
-        # and issue 32876
+        '&not-an-entity-ref;',
-        html = ('<![CDATA[just some plain text]]>')
+        "<not a='start tag'>",
-        expected = [('unknown decl', 'CDATA[just some plain text')]
+        '',
-        self._run_check(html, expected)
+        '[[I have many brackets]]',
-
+        'I have a > in the middle',
-    def test_cdata_declarations_multiline(self):
+        'I have a ]] in the middle',
-        html = ('<code><![CDATA['
+        '] ]>',
-                '    if (a < b && a > b) {'
+        ']] >',
-                '        printf("[<marquee>How?</marquee>]");'
+        ('\n'
-                '    }'
+         '    if (a < b && a > b) {\n'
-                ']]></code>')
+         '        printf("[<marquee>How?</marquee>]");\n'
         '    }\n'),
    ])
    def test_cdata_section_content(self, content):
        # See "13.2.5.42 Markup declaration open state",
        # "13.2.5.69 CDATA section state", and issue bpo-32876.
        html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>'
        expected = [
-            ('starttag', 'code', []),
+            ('starttag', 'svg', []),
-            ('unknown decl',
+            ('starttag', 'text', [('y', '100')]),
-             'CDATA[    if (a < b && a > b) {        '
+            ('unknown decl', 'CDATA[' + content),
-             'printf("[<marquee>How?</marquee>]");    }'),
+            ('endtag', 'text'),
-            ('endtag', 'code')
+            ('endtag', 'svg'),
        ]
        self._run_check(html, expected)
        self._run_check(html, expected, collector=EventCollector(autocdata=True))
    def test_cdata_section(self):
        # See "13.2.5.42 Markup declaration open state".
        # The same "<![CDATA[foo<br>bar]]>" text appears before, inside and
        # after an <svg> element.  The autocdata collector enables CDATA
        # support only between the <svg> start and end tags.
        html = ('<![CDATA[foo<br>bar]]>'
                '<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>'
                '<![CDATA[foo<br>bar]]>')
        expected = [
            # Outside <svg>: a bogus comment that stops at the first ">",
            # leaving the remainder to be reported as plain data.
            ('comment', '[CDATA[foo<br'),
            ('data', 'bar]]>'),
            ('starttag', 'svg', []),
            ('starttag', 'text', [('y', '100')]),
            # Inside <svg>: a CDATA section that runs up to "]]>".
            ('unknown decl', 'CDATA[foo<br>bar'),
            ('endtag', 'text'),
            ('endtag', 'svg'),
            # After </svg>: CDATA support is off again.
            ('comment', '[CDATA[foo<br'),
            ('data', 'bar]]>'),
        ]
        self._run_check(html, expected, collector=EventCollector(autocdata=True))
    def test_convert_charrefs_dropped_text(self):
        # #23144: make sure that all the events are triggered when
--- a/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst
+++ b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst
@ -0,0 +1,5 @@
 Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to
 the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section.
 Add private method ``_set_support_cdata()`` which can be used to specify
 how to parse ``<![CDATA[`` --- as a CDATA section in foreign content
 (SVG or MathML) or as a bogus comment in the HTML namespace.