[3.9] gh-118350: Fix support of elements "textarea" and "title" in HTMLParser (GH-135310) (GH-137784)

(cherry picked from commit 4d02f31cdd)

Co-authored-by: Timon Viola <44016238+timonviola@users.noreply.github.com>
Co-authored-by: Łukasz Langa <lukasz@langa.pl>
This commit is contained in:
Serhiy Storchaka 2025-09-13 23:35:13 +03:00 committed by GitHub
parent 220e67748a
commit 4dea0fb67b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 113 additions and 5 deletions

View file

@ -110,6 +110,7 @@ class HTMLParser(_markupbase.ParserBase):
"""
CDATA_CONTENT_ELEMENTS = ("script", "style")
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
def __init__(self, *, convert_charrefs=True):
"""Initialize and reset this instance.
@ -126,6 +127,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
self._escapable = True
_markupbase.ParserBase.reset(self)
def feed(self, data):
@ -147,14 +149,20 @@ def get_starttag_text(self):
"""Return full source of start tag: '<...>'."""
return self.__starttag_text
def set_cdata_mode(self, elem):
def set_cdata_mode(self, elem, *, escapable=False):
    """Enter raw-text mode for the element named *elem*.

    While this mode is active, ``self.interesting`` matches only the
    end tag of *elem* (compared case-insensitively, and followed by
    whitespace, ``/`` or ``>``), so other markup-looking text does not
    match.

    *escapable* selects "escapable raw text" (used for the elements in
    RCDATA_CONTENT_ELEMENTS).  When it is true and ``convert_charrefs``
    is false, ``&`` is also made "interesting" so that scanning stops at
    a possible character reference as well as at the end tag.
    """
    self.cdata_elem = elem.lower()
    self._escapable = escapable
    # Escape the name: it is inserted into a regular expression, and a
    # name containing a metacharacter ('.', '(', '+', ...) would otherwise
    # match the wrong text or make re.compile() raise.
    end_tag = r'</%s(?=[\t\n\r\f />])' % re.escape(self.cdata_elem)
    if escapable and not self.convert_charrefs:
        pattern = '&|' + end_tag
    else:
        pattern = end_tag
    self.interesting = re.compile(pattern, re.IGNORECASE|re.ASCII)
def clear_cdata_mode(self):
    """Leave raw-text mode and restore the default scanning state."""
    # No raw-text element is open any more ...
    self.cdata_elem = None
    # ... so ordinary data may have its character references converted ...
    self._escapable = True
    # ... and scanning goes back to the default "interesting" pattern.
    self.interesting = interesting_normal
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
@ -187,7 +195,7 @@ def goahead(self, end):
break
j = n
if i < j:
if self.convert_charrefs and not self.cdata_elem:
if self.convert_charrefs and self._escapable:
self.handle_data(unescape(rawdata[i:j]))
else:
self.handle_data(rawdata[i:j])
@ -289,7 +297,7 @@ def goahead(self, end):
assert 0, "interesting.search() lied"
# end while
if end and i < n:
if self.convert_charrefs and not self.cdata_elem:
if self.convert_charrefs and self._escapable:
self.handle_data(unescape(rawdata[i:n]))
else:
self.handle_data(rawdata[i:n])
@ -408,6 +416,8 @@ def parse_starttag(self, i):
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
elif tag in self.RCDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag, escapable=True)
return endpos
# Internal -- check to see if we have a complete starttag; return end

View file

@ -316,6 +316,49 @@ def test_style_content(self, content):
("data", content),
("endtag", "style")])
@support.subTests('content', [
    '<!-- not a comment -->',
    "<not a='start tag'>",
    '<![CDATA[not a cdata]]>',
    '<!not a bogus comment>',
    '</not a bogus comment>',
    '\u2603',
    '< /title>',
    '</ title>',
    '</titled>',
    '</title\v>',
    '</title\xa0>',
    '</tıtle>',
])
def test_title_content(self, content):
    # Each sample looks like markup, or like a near miss of the end tag.
    # Inside <title> it is expected to be reported as a single data event
    # equal to the input, followed by the real end tag.
    expected_events = [
        ("starttag", "title", []),
        ("data", content),
        ("endtag", "title"),
    ]
    self._run_check("<title>" + content + "</title>", expected_events)
@support.subTests('content', [
    '<!-- not a comment -->',
    "<not a='start tag'>",
    '<![CDATA[not a cdata]]>',
    '<!not a bogus comment>',
    '</not a bogus comment>',
    '\u2603',
    '< /textarea>',
    '</ textarea>',
    '</textareable>',
    '</textarea\v>',
    '</textarea\xa0>',
])
def test_textarea_content(self, content):
    # Each sample looks like markup, or like a near miss of the end tag.
    # Inside <textarea> it is expected to be reported as a single data
    # event equal to the input, followed by the real end tag.
    expected_events = [
        ("starttag", "textarea", []),
        ("data", content),
        ("endtag", "textarea"),
    ]
    self._run_check("<textarea>" + content + "</textarea>", expected_events)
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
'script/', 'script foo=bar', 'script foo=">"'])
def test_script_closing_tag(self, endtag):
@ -345,6 +388,38 @@ def test_style_closing_tag(self, endtag):
("endtag", "style")],
collector=EventCollectorNoNormalize(convert_charrefs=False))
@support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n',
                             'title/', 'title foo=bar', 'title foo=">"'])
def test_title_closing_tag(self, endtag):
    # Every end-tag spelling above is expected to close the element.
    # The same markup is checked in both convert_charrefs modes.
    body = "<!-- not a comment --><i>Egg &amp; Spam</i>"
    markup = '<TitLe>' + body + '</' + endtag + '>'

    # convert_charrefs=True: the reference is decoded inside the data.
    decoded_events = [
        ("starttag", "title", []),
        ('data', '<!-- not a comment --><i>Egg & Spam</i>'),
        ("endtag", "title"),
    ]
    self._run_check(
        markup, decoded_events,
        collector=EventCollectorNoNormalize(convert_charrefs=True))

    # convert_charrefs=False: the reference is reported as its own event.
    split_events = [
        ("starttag", "title", []),
        ('data', '<!-- not a comment --><i>Egg '),
        ('entityref', 'amp'),
        ('data', ' Spam</i>'),
        ("endtag", "title"),
    ]
    self._run_check(
        markup, split_events,
        collector=EventCollectorNoNormalize(convert_charrefs=False))
@support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n',
                             'textarea/', 'textarea foo=bar', 'textarea foo=">"'])
def test_textarea_closing_tag(self, endtag):
    # Every end-tag spelling above is expected to close the element.
    # The same markup is checked in both convert_charrefs modes.
    body = "<!-- not a comment --><i>Egg &amp; Spam</i>"
    markup = '<TexTarEa>' + body + '</' + endtag + '>'

    # convert_charrefs=True: the reference is decoded inside the data.
    decoded_events = [
        ("starttag", "textarea", []),
        ('data', '<!-- not a comment --><i>Egg & Spam</i>'),
        ("endtag", "textarea"),
    ]
    self._run_check(
        markup, decoded_events,
        collector=EventCollectorNoNormalize(convert_charrefs=True))

    # convert_charrefs=False: the reference is reported as its own event.
    split_events = [
        ("starttag", "textarea", []),
        ('data', '<!-- not a comment --><i>Egg '),
        ('entityref', 'amp'),
        ('data', ' Spam</i>'),
        ("endtag", "textarea"),
    ]
    self._run_check(
        markup, split_events,
        collector=EventCollectorNoNormalize(convert_charrefs=False))
@support.subTests('tail,end', [
('', False),
('<', False),
@ -362,6 +437,27 @@ def test_eof_in_script(self, tail, end):
("data", content if end else content + tail)],
collector=EventCollectorNoNormalize(convert_charrefs=False))
@support.subTests('tail,end', [
    ('', False),
    ('<', False),
    ('</', False),
    ('</t', False),
    ('</title', False),
    ('</title ', True),
    ('</title foo=bar', True),
    ('</title foo=">', True),
])
def test_eof_in_title(self, tail, end):
    # The input stops inside <title>, with *tail* as an unfinished end tag.
    # When *end* is true the tail is expected to be left out of the data;
    # otherwise it is expected to be reported as trailing data.  No endtag
    # event is expected in either case.
    markup = '<TitLe>Egg &amp; Spam' + tail
    leftover = '' if end else tail

    # convert_charrefs=True: the reference is decoded inside the data.
    decoded_events = [
        ("starttag", "title", []),
        ("data", "Egg & Spam" + leftover),
    ]
    self._run_check(
        markup, decoded_events,
        collector=EventCollectorNoNormalize(convert_charrefs=True))

    # convert_charrefs=False: the reference is reported as its own event.
    split_events = [
        ("starttag", "title", []),
        ('data', 'Egg '),
        ('entityref', 'amp'),
        ('data', ' Spam' + leftover),
    ]
    self._run_check(
        markup, split_events,
        collector=EventCollectorNoNormalize(convert_charrefs=False))
def test_comments(self):
html = ("<!-- I'm a valid comment -->"
'<!--me too!-->'

View file

@ -0,0 +1,2 @@
Fix support of escapable raw text mode (elements "textarea" and "title")
in :class:`html.parser.HTMLParser`.