[3.9] gh-137836: Support more RAWTEXT and PLAINTEXT elements in HTMLParser (GH-137837) (GH-140842) (GH-140857)

(cherry picked from commit a17c57eee5) (cherry picked from commit 0329bd11c7) Co-authored-by: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
2025-12-08 06:10:17 +00:00 · 2025-10-31 19:02:38 +02:00 · 2025-10-31 19:02:38 +02:00 · 941811fc9d
commit 941811fc9d
parent fde6ac1290
4 changed files with 163 additions and 114 deletions
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@ -109,16 +109,24 @@ class HTMLParser(_markupbase.ParserBase):
    argument.
    """

-    CDATA_CONTENT_ELEMENTS = ("script", "style")
+    # See the HTML5 specs section "13.4 Parsing HTML fragments".
+    # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
+    # CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode
+    CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
    RCDATA_CONTENT_ELEMENTS = ("textarea", "title")

-    def __init__(self, *, convert_charrefs=True):
+    def __init__(self, *, convert_charrefs=True, scripting=False):
        """Initialize and reset this instance.

-        If convert_charrefs is True (the default), all character references
+        If convert_charrefs is true (the default), all character references
        are automatically converted to the corresponding Unicode characters.
+
+        If *scripting* is false (the default), the content of the
+        ``noscript`` element is parsed normally; if it's true,
+        it's returned as is without being parsed.
        """
        self.convert_charrefs = convert_charrefs
+        self.scripting = scripting
        self.reset()

    def reset(self):
@ -153,7 +161,9 @@ def get_starttag_text(self):
    def set_cdata_mode(self, elem, *, escapable=False):
        self.cdata_elem = elem.lower()
        self._escapable = escapable
-        if escapable and not self.convert_charrefs:
+        if self.cdata_elem == 'plaintext':
+            self.interesting = re.compile(r'\Z')
+        elif escapable and not self.convert_charrefs:
            self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
                                          re.IGNORECASE|re.ASCII)
        else:
@ -441,8 +451,10 @@ def parse_starttag(self, i):
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
-            if tag in self.CDATA_CONTENT_ELEMENTS:
-                self.set_cdata_mode(tag)
+            if (tag in self.CDATA_CONTENT_ELEMENTS or
+                (self.scripting and tag == "noscript") or
+                tag == "plaintext"):
+                self.set_cdata_mode(tag, escapable=False)
            elif tag in self.RCDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag, escapable=True)
        return endpos