"""Tests for HTMLParser.py."""
import html.parser
import pprint
import unittest
from test import support
SAMPLE_RCDATA = (
''
"sample
text
“
""", [
("data", "\n"),
("decl", "DOCTYPE html PUBLIC 'foo'"),
("data", "\n"),
("starttag", "html", []),
("entityref", "entity"),
("charref", "32"),
("data", "\n"),
("comment", "comment1a\n->
',
'foo = "";',
'foo = "";',
'foo = <\n/script> ',
'',
('\n//<\\/s\'+\'cript>\');\n//]]>'),
'\n\n',
'',
])
def test_script_content(self, content):
s = f''
self._run_check(s, [
("starttag", "script", []),
("data", content),
("endtag", "script"),
])
@support.subTests('content', [
'a::before { content: ""; }',
'a::before { content: "¬-an-entity-ref;"; }',
'a::before { content: "
''"""
s = f'', True),
])
def test_eof_in_script(self, tail, end):
content = "a = 123"
s = f'{1}'
'{1}'.format(text, charref),
expected, collector=collector())
# check truncated charrefs at the end of the file
html = '&quo '
for x in range(1, len(html)):
self._run_check(html[:x], [('data', html[:x])],
collector=collector())
# check a string with no charrefs
self._run_check('no charrefs here', [('data', 'no charrefs here')],
collector=collector())
# the remaining tests were for the "tolerant" parser (which is now
# the default), and check various kinds of broken markup
def test_tolerant_parsing(self):
self._run_check('te>>xt&a<", [('data', '<>')])
self._run_check("< >", [('data', '< >')])
self._run_check("< ", [('data', '< ')])
self._run_check(">", [])
self._run_check("<$>", [('data', '<$>')])
self._run_check("$>", [('comment', '$')])
self._run_check("", [('data', '')])
self._run_check("
'
'foo'
'
')
# According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
# and "8.2.4.45 Markup declaration open state", comment tokens should
# be emitted instead of 'unknown decl', but calling unknown_decl
# provides more flexibility.
# See also Lib/_markupbase.py:parse_declaration
expected = [
('unknown decl', 'if !(IE)'),
('data', 'broken condcom'),
('unknown decl', 'endif'),
('unknown decl', 'if ! IE'),
('startendtag', 'link', [('href', 'favicon.tiff')]),
('unknown decl', 'endif'),
('unknown decl', 'if !IE 6'),
('startendtag', 'img', [('src', 'firefox.png')]),
('unknown decl', 'endif'),
('unknown decl', 'if !ie 6'),
('starttag', 'b', []),
('data', 'foo'),
('endtag', 'b'),
('unknown decl', 'endif'),
('unknown decl', 'if (!IE)|(lt IE 9)'),
('startendtag', 'img', [('src', 'mammoth.bmp')]),
('unknown decl', 'endif')
]
self._run_check(html, expected)
@support.subTests('content', [
'just some plain text',
'',
'¬-an-entity-ref;',
"
",
[("starttag", "img", [("src", "/foo/bar.png"),
("alt", "\u4e2d\u6587")])])
self._run_check(
"",
[("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
("href", "\u30c6\u30b9\u30c8.html")])])
self._run_check(
'',
[("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
("href", "\u30c6\u30b9\u30c8.html")])])
def test_attr_entity_replacement(self):
self._run_check(
"",
[("starttag", "a", [("b", "&><\"'")])])
def test_attr_funky_names(self):
self._run_check(
"",
[("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])])
def test_entityrefs_in_attributes(self):
self._run_check(
"",
[("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])])
def test_attr_funky_names2(self):
self._run_check(
r"| " "- software-and-i" "- library |