[3.9] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665) (GH-137774) (GH-139661)

"] ]>" and "]] >" no longer end the CDATA section. Make CDATA section parsing context-dependent. Add private method HTMLParser._set_support_cdata() to change the context. If called with True, "<![CDATA[" starts a CDATA section which ends with "]]>". If called with False, "<![CDATA[" starts a bogus comment which ends with ">". (cherry picked from commit 0cbbfc4621) (cherry picked from commit dcf24768c9) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> Co-authored-by: Łukasz Langa <lukasz@langa.pl>
2025-12-08 06:10:17 +00:00 · 2025-10-07 21:15:04 +02:00 · 2025-10-07 21:15:04 +02:00 · ed904d5bbf
commit ed904d5bbf
parent f3d8338cd5
3 changed files with 104 additions and 29 deletions
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@ -9,10 +9,13 @@

 class EventCollector(html.parser.HTMLParser):

-    def __init__(self, *args, **kw):
+    def __init__(self, *args, autocdata=False, **kw):
+        self.autocdata = autocdata
        self.events = []
        self.append = self.events.append
        html.parser.HTMLParser.__init__(self, *args, **kw)
+        if autocdata:
+            self._set_support_cdata(False)

    def get_events(self):
        # Normalize the list of events so that buffer artefacts don't
@ -33,12 +36,16 @@ def get_events(self):

    def handle_starttag(self, tag, attrs):
        self.append(("starttag", tag, attrs))
+        if self.autocdata and tag == 'svg':
+            self._set_support_cdata(True)

    def handle_startendtag(self, tag, attrs):
        self.append(("startendtag", tag, attrs))

    def handle_endtag(self, tag):
        self.append(("endtag", tag))
+        if self.autocdata and tag == 'svg':
+            self._set_support_cdata(False)

    # all other markup

@ -739,10 +746,6 @@ def test_eof_in_declarations(self):
            ('<!', [('comment', '')]),
            ('<!-', [('comment', '-')]),
            ('<![', [('comment', '[')]),
-            ('<![CDATA[', [('unknown decl', 'CDATA[')]),
-            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
-            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
-            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
            ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
            ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
            ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
@ -755,6 +758,18 @@ def test_eof_in_declarations(self):
        for html, expected in data:
            self._run_check(html, expected)

+    @support.subTests('content', ['', 'x', 'x]', 'x]]'])
+    def test_eof_in_cdata(self, content):
+        self._run_check('<![CDATA[' + content,
+                        [('unknown decl', 'CDATA[' + content)])
+        self._run_check('<![CDATA[' + content,
+                        [('comment', '[CDATA[' + content)],
+                        collector=EventCollector(autocdata=True))
+        self._run_check('<svg><text y="100"><![CDATA[' + content,
+                        [('starttag', 'svg', []),
+                         ('starttag', 'text', [('y', '100')]),
+                         ('unknown decl', 'CDATA[' + content)])
+
    def test_bogus_comments(self):
        html = ('<!ELEMENT br EMPTY>'
                '<! not really a comment >'
@ -804,8 +819,57 @@ def test_broken_condcoms(self):
            ('startendtag', 'img', [('src', 'mammoth.bmp')]),
            ('unknown decl', 'endif')
        ]
+
        self._run_check(html, expected)

+    @support.subTests('content', [
+        'just some plain text',
+        '<!-- not a comment -->',
+        '&not-an-entity-ref;',
+        "<not a='start tag'>",
+        '',
+        '[[I have many brackets]]',
+        'I have a > in the middle',
+        'I have a ]] in the middle',
+        '] ]>',
+        ']] >',
+        ('\n'
+         '    if (a < b && a > b) {\n'
+         '        printf("[<marquee>How?</marquee>]");\n'
+         '    }\n'),
+    ])
+    def test_cdata_section_content(self, content):
+        # See "13.2.5.42 Markup declaration open state",
+        # "13.2.5.69 CDATA section state", and issue bpo-32876.
+        html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>'
+        expected = [
+            ('starttag', 'svg', []),
+            ('starttag', 'text', [('y', '100')]),
+            ('unknown decl', 'CDATA[' + content),
+            ('endtag', 'text'),
+            ('endtag', 'svg'),
+        ]
+        self._run_check(html, expected)
+        self._run_check(html, expected, collector=EventCollector(autocdata=True))
+
+    def test_cdata_section(self):
+        # See "13.2.5.42 Markup declaration open state".
+        html = ('<![CDATA[foo<br>bar]]>'
+                '<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>'
+                '<![CDATA[foo<br>bar]]>')
+        expected = [
+            ('comment', '[CDATA[foo<br'),
+            ('data', 'bar]]>'),
+            ('starttag', 'svg', []),
+            ('starttag', 'text', [('y', '100')]),
+            ('unknown decl', 'CDATA[foo<br>bar'),
+            ('endtag', 'text'),
+            ('endtag', 'svg'),
+            ('comment', '[CDATA[foo<br'),
+            ('data', 'bar]]>'),
+        ]
+        self._run_check(html, expected, collector=EventCollector(autocdata=True))
+
    def test_convert_charrefs_dropped_text(self):
        # #23144: make sure that all the events are triggered when
        # convert_charrefs is True, even if we don't call .close()
@ -1041,27 +1105,6 @@ def test_weird_chars_in_unquoted_attribute_values(self):
                            ('starttag', 'form',
                                [('action', 'bogus|&#()value')])])

-    def test_invalid_keyword_error_exception(self):
-        # bpo-34480: check that subclasses that define an
-        # error method that raises an exception work
-        class InvalidMarkupException(Exception):
-            pass
-        class MyHTMLParser(html.parser.HTMLParser):
-            def error(self, message):
-                raise InvalidMarkupException(message)
-        parser = MyHTMLParser()
-        with self.assertRaises(InvalidMarkupException):
-            parser.feed('<![invalid>')
-
-    def test_invalid_keyword_error_pass(self):
-        # bpo-34480: check that subclasses that define an
-        # error method that doesn't raise an exception work
-        class MyHTMLParser(html.parser.HTMLParser):
-            def error(self, message):
-                pass
-        parser = MyHTMLParser()
-        self.assertEqual(parser.feed('<![invalid>'), None)
-

 if __name__ == "__main__":
    unittest.main()