[3.13] gh-135661: Fix parsing unterminated bogus comments in HTMLParser (GH-137873) (GH-137875)

Bogus comments that start with "<![CDATA[" should not include the starting "!" in their values. (cherry picked from commit 7636a66635) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
2025-12-08 06:10:17 +00:00 · 2025-08-17 12:59:24 +02:00 · 2025-08-17 12:59:24 +02:00 · f2b7954ce0
commit f2b7954ce0
parent 133f7bd22b
2 changed files with 9 additions and 15 deletions
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@ -271,11 +271,8 @@ def goahead(self, end):
                                j -= len(suffix)
                                break
                        self.handle_comment(rawdata[i+4:j])
-                    elif startswith("<![CDATA[", i):
+                    elif startswith("<![CDATA[", i) and self._support_cdata:
-                        if self._support_cdata:
+                        self.unknown_decl(rawdata[i+3:])
                            self.unknown_decl(rawdata[i+3:])
                        else:
                            self.handle_comment(rawdata[i+1:])
                    elif rawdata[i:i+9].lower() == '<!doctype':
                        self.handle_decl(rawdata[i+2:])
                    elif startswith("<!", i):
@ -350,15 +347,12 @@ def parse_html_declaration(self, i):
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
-        elif rawdata[i:i+9] == '<![CDATA[':
+        elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
-            if self._support_cdata:
+            j = rawdata.find(']]>', i+9)
-                j = rawdata.find(']]>', i+9)
+            if j < 0:
-                if j < 0:
+                return -1
-                    return -1
+            self.unknown_decl(rawdata[i+3: j])
-                self.unknown_decl(rawdata[i+3: j])
+            return j + 3
                return j + 3
            else:
                return self.parse_bogus_comment(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@ -791,7 +791,7 @@ def test_eof_in_cdata(self, content):
        self._run_check('<![CDATA[' + content,
                        [('unknown decl', 'CDATA[' + content)])
        self._run_check('<![CDATA[' + content,
-                        [('comment', '![CDATA[' + content)],
+                        [('comment', '[CDATA[' + content)],
                        collector=EventCollector(autocdata=True))
        self._run_check('<svg><text y="100"><![CDATA[' + content,
                        [('starttag', 'svg', []),