[3.13] gh-140875: Fix handling of unclosed charrefs before EOF in HTMLParser (GH-140904) (GH-141746)

(cherry picked from commit 95296a9d40)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Miss Islington (bot) 2025-11-19 13:17:54 +01:00 committed by GitHub
parent 26d831327d
commit c7064e7d6b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 109 additions and 33 deletions

View file

@ -109,12 +109,13 @@ def get_events(self):
class TestCaseBase(unittest.TestCase):
def get_collector(self):
return EventCollector(convert_charrefs=False)
def get_collector(self, convert_charrefs=False):
return EventCollector(convert_charrefs=convert_charrefs)
def _run_check(self, source, expected_events, collector=None):
def _run_check(self, source, expected_events,
*, collector=None, convert_charrefs=False):
if collector is None:
collector = self.get_collector()
collector = self.get_collector(convert_charrefs=convert_charrefs)
parser = collector
for s in source:
parser.feed(s)
@ -128,7 +129,7 @@ def _run_check(self, source, expected_events, collector=None):
def _run_check_extra(self, source, events):
self._run_check(source, events,
EventCollectorExtra(convert_charrefs=False))
collector=EventCollectorExtra(convert_charrefs=False))
class HTMLParserTestCase(TestCaseBase):
@ -187,10 +188,87 @@ def test_malformatted_charref(self):
])
def test_unclosed_entityref(self):
self._run_check("&entityref foo", [
("entityref", "entityref"),
("data", " foo"),
])
self._run_check('&gt &lt;', [('entityref', 'gt'), ('data', ' '), ('entityref', 'lt')],
convert_charrefs=False)
self._run_check('&gt &lt;', [('data', '> <')], convert_charrefs=True)
self._run_check('&undefined &lt;',
[('entityref', 'undefined'), ('data', ' '), ('entityref', 'lt')],
convert_charrefs=False)
self._run_check('&undefined &lt;', [('data', '&undefined <')],
convert_charrefs=True)
self._run_check('&gtundefined &lt;',
[('entityref', 'gtundefined'), ('data', ' '), ('entityref', 'lt')],
convert_charrefs=False)
self._run_check('&gtundefined &lt;', [('data', '>undefined <')],
convert_charrefs=True)
self._run_check('& &lt;', [('data', '& '), ('entityref', 'lt')],
convert_charrefs=False)
self._run_check('& &lt;', [('data', '& <')], convert_charrefs=True)
def test_eof_in_entityref(self):
self._run_check('&gt', [('entityref', 'gt')], convert_charrefs=False)
self._run_check('&gt', [('data', '>')], convert_charrefs=True)
self._run_check('&g', [('entityref', 'g')], convert_charrefs=False)
self._run_check('&g', [('data', '&g')], convert_charrefs=True)
self._run_check('&undefined', [('entityref', 'undefined')],
convert_charrefs=False)
self._run_check('&undefined', [('data', '&undefined')],
convert_charrefs=True)
self._run_check('&gtundefined', [('entityref', 'gtundefined')],
convert_charrefs=False)
self._run_check('&gtundefined', [('data', '>undefined')],
convert_charrefs=True)
self._run_check('&', [('data', '&')], convert_charrefs=False)
self._run_check('&', [('data', '&')], convert_charrefs=True)
def test_unclosed_charref(self):
self._run_check('&#123 &lt;', [('charref', '123'), ('data', ' '), ('entityref', 'lt')],
convert_charrefs=False)
self._run_check('&#123 &lt;', [('data', '{ <')], convert_charrefs=True)
self._run_check('&#xab &lt;', [('charref', 'xab'), ('data', ' '), ('entityref', 'lt')],
convert_charrefs=False)
self._run_check('&#xab &lt;', [('data', '\xab <')], convert_charrefs=True)
self._run_check('&#123456789 &lt;',
[('charref', '123456789'), ('data', ' '), ('entityref', 'lt')],
convert_charrefs=False)
self._run_check('&#123456789 &lt;', [('data', '\ufffd <')],
convert_charrefs=True)
self._run_check('&#x123456789 &lt;',
[('charref', 'x123456789'), ('data', ' '), ('entityref', 'lt')],
convert_charrefs=False)
self._run_check('&#x123456789 &lt;', [('data', '\ufffd <')],
convert_charrefs=True)
self._run_check('&# &lt;', [('data', '&# '), ('entityref', 'lt')], convert_charrefs=False)
self._run_check('&# &lt;', [('data', '&# <')], convert_charrefs=True)
self._run_check('&#x &lt;', [('data', '&#x '), ('entityref', 'lt')], convert_charrefs=False)
self._run_check('&#x &lt;', [('data', '&#x <')], convert_charrefs=True)
def test_eof_in_charref(self):
self._run_check('&#123', [('charref', '123')], convert_charrefs=False)
self._run_check('&#123', [('data', '{')], convert_charrefs=True)
self._run_check('&#xab', [('charref', 'xab')], convert_charrefs=False)
self._run_check('&#xab', [('data', '\xab')], convert_charrefs=True)
self._run_check('&#123456789', [('charref', '123456789')],
convert_charrefs=False)
self._run_check('&#123456789', [('data', '\ufffd')], convert_charrefs=True)
self._run_check('&#x123456789', [('charref', 'x123456789')],
convert_charrefs=False)
self._run_check('&#x123456789', [('data', '\ufffd')], convert_charrefs=True)
self._run_check('&#', [('data', '&#')], convert_charrefs=False)
self._run_check('&#', [('data', '&#')], convert_charrefs=True)
self._run_check('&#x', [('data', '&#x')], convert_charrefs=False)
self._run_check('&#x', [('data', '&#x')], convert_charrefs=True)
def test_bad_nesting(self):
# Strangely, this *is* supposed to test that overlapping
@ -762,20 +840,6 @@ def test_correct_detection_of_start_tags(self):
]
self._run_check(html, expected)
def test_EOF_in_charref(self):
# see #17802
# This test checks that the UnboundLocalError reported in the issue
# is not raised, however I'm not sure the returned values are correct.
# Maybe HTMLParser should use self.unescape for these
data = [
('a&', [('data', 'a&')]),
('a&b', [('data', 'ab')]),
('a&b ', [('data', 'a'), ('entityref', 'b'), ('data', ' ')]),
('a&b;', [('data', 'a'), ('entityref', 'b')]),
]
for html, expected in data:
self._run_check(html, expected)
def test_eof_in_comments(self):
data = [
('<!--', [('comment', '')]),