mirror of
https://github.com/openzim/warc2zim.git
synced 2025-10-19 06:23:16 +00:00

provided a favicon to use. Instead of preferring to use WARC items (or preferring to download, as it was before #202), we prefer to use the best-suited favicon. Potential favicons are sourced from the main HTML page. All favicons are retrieved either from the WARC or downloaded to inspect their sizes. We use the best-suited one (i.e. 48x48 or bigger if possible, otherwise the biggest one). We still fall back to the default ZIM illustration if no favicon is found, to avoid losing all the time spent crawling the website.
161 lines
6.4 KiB
Python
161 lines
6.4 KiB
Python
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from warc2zim.icon_finder import Icon, get_sorted_icons, icons_in_html
|
|
|
|
|
|
@pytest.mark.parametrize(
    "html, expected",
    [
        # A <link> whose rel is not an icon relation must be ignored.
        pytest.param(
            """<link rel="foo" href="https://somewhere/favicon.ico">""",
            set(),
            id="other_rel",
        ),
        pytest.param(
            """<link rel="icon" href="https://somewhere/favicon.ico">""",
            {"https://somewhere/favicon.ico"},
            id="simple_icon",
        ),
        # An icon <link> without href contributes nothing.
        pytest.param(
            """<link rel="icon">""",
            set(),
            id="icon_href_missing",
        ),
        pytest.param(
            """<link rel="shortcut icon" href="https://somewhere/favicon.ico">""",
            {"https://somewhere/favicon.ico"},
            id="simple_shortcut_icon",
        ),
        # The same URL declared with two different sizes is reported once.
        pytest.param(
            (
                '<link rel="icon" sizes="48x48" href="https://somewhere/favicon.ico">\n'
                '<link rel="icon" sizes="96x96" href="https://somewhere/favicon.ico">'
            ),
            {"https://somewhere/favicon.ico"},
            id="no_duplicates",
        ),
        # Expected value is a set, so only membership (not order) is asserted.
        pytest.param(
            (
                '<link rel="icon" sizes="96x96" href="https://somewhere/favicon1.ico">\n'
                '<link rel="icon" sizes="48x48" href="https://somewhere/favicon2.ico">'
            ),
            {"https://somewhere/favicon2.ico", "https://somewhere/favicon1.ico"},
            id="sort_by_size",
        ),
        # NOTE(review): the fixture is read at collection time through a path
        # relative to the current working directory, so pytest presumably has
        # to be started from the repository root - confirm.
        pytest.param(
            Path("tests/data-special/icons.html").read_text(),
            {
                "https://womenshistory.si.edu//sites/default/themes/si_sawhm/favicons/android-chrome-192x192.png",
                "https://womenshistory.si.edu//sites/default/themes/si_sawhm/favicons/favicon-96x96.png",
                "https://womenshistory.si.edu//sites/default/themes/si_sawhm/favicons/favicon-32x32.png",
                "https://womenshistory.si.edu//sites/default/themes/si_sawhm/favicons/favicon.ico",
                "https://womenshistory.si.edu//sites/default/themes/si_sawhm/favicons/favicon-16x16.png",
            },
            id="real_life",
        ),
        # Malformed "sizes" attributes must not prevent the URL from being found.
        pytest.param(
            """<link rel="shortcut icon" sizes="aaxbb" href="https://somewhere/favicon.ico">""",
            {"https://somewhere/favicon.ico"},
            id="bad_sizes_1",
        ),
        pytest.param(
            """<link rel="shortcut icon" sizes="12x12x12" href="https://somewhere/favicon.ico">""",
            {"https://somewhere/favicon.ico"},
            id="bad_sizes_2",
        ),
    ],
)
def test_icons_in_html(html, expected):
    """Check that ``icons_in_html`` returns exactly the expected icon URLs.

    Args:
        html: HTML snippet (or whole document) to scan for icon ``<link>`` tags.
        expected: set of icon URLs the call must be equal to.
    """
    assert icons_in_html(html) == expected
|
|
|
|
|
|
@pytest.mark.parametrize(
    "unsorted, expected",
    [
        pytest.param([], [], id="empty"),
        pytest.param(
            [Icon("url1", 12, 12, b"", None)],
            [Icon("url1", 12, 12, b"", None)],
            id="one_item",
        ),
        pytest.param(
            [Icon("url3", 12, 12, b"", None), Icon("url2", 96, 96, b"", None)],
            [Icon("url2", 96, 96, b"", None), Icon("url3", 12, 12, b"", None)],
            id="two_items_with_size1",
        ),
        pytest.param(
            [Icon("url3", 128, 128, b"", None), Icon("url2", 96, 96, b"", None)],
            [Icon("url3", 128, 128, b"", None), Icon("url2", 96, 96, b"", None)],
            id="two_items_with_size2",
        ),
        pytest.param(
            [Icon("url2", 96, 96, b"", None), Icon("url3", 128, 128, b"", None)],
            [Icon("url3", 128, 128, b"", None), Icon("url2", 96, 96, b"", None)],
            id="two_items_with_size3",
        ),
        pytest.param(
            [Icon("url3", 12, 12, b"", None), Icon("url2", 26, 26, b"", None)],
            [Icon("url2", 26, 26, b"", None), Icon("url3", 12, 12, b"", None)],
            id="two_items_with_size4",
        ),
        pytest.param(
            [Icon("url2", 26, 26, b"", None), Icon("url3", 12, 12, b"", None)],
            [Icon("url2", 26, 26, b"", None), Icon("url3", 12, 12, b"", None)],
            id="two_items_with_size5",
        ),
        pytest.param(
            [Icon("url2", 48, 48, b"", None), Icon("url3", 12, 12, b"", None)],
            [Icon("url2", 48, 48, b"", None), Icon("url3", 12, 12, b"", None)],
            id="two_items_with_size6",
        ),
        pytest.param(
            [Icon("url2", 48, 48, b"", None), Icon("url3", 96, 96, b"", None)],
            [Icon("url2", 48, 48, b"", None), Icon("url3", 96, 96, b"", None)],
            id="two_items_with_size7",
        ),
        pytest.param(
            [Icon("url3", 12, 12, b"", None), Icon("url2", 48, 48, b"", None)],
            [Icon("url2", 48, 48, b"", None), Icon("url3", 12, 12, b"", None)],
            id="two_items_with_size8",
        ),
        pytest.param(
            [Icon("url3", 96, 96, b"", None), Icon("url2", 48, 48, b"", None)],
            [Icon("url2", 48, 48, b"", None), Icon("url3", 96, 96, b"", None)],
            id="two_items_with_size9",
        ),
        pytest.param(
            [Icon("url2", 48, 48, b"", None), Icon("url3", 48, 48, b"", None)],
            [Icon("url2", 48, 48, b"", None), Icon("url3", 48, 48, b"", None)],
            id="two_items_with_size10",
        ),
        pytest.param(
            [Icon("url2", 96, 96, b"", None), Icon("url3", 96, 96, b"", None)],
            [Icon("url2", 96, 96, b"", None), Icon("url3", 96, 96, b"", None)],
            id="two_items_with_size11",
        ),
        pytest.param(
            [Icon("url3", 32, 32, b"", None), Icon("url2", 96, 96, b"", None)],
            [Icon("url2", 96, 96, b"", None), Icon("url3", 32, 32, b"", None)],
            id="two_items_with_size12",
        ),
        pytest.param(
            [Icon("url2", 96, 96, b"", None), Icon("url3", 32, 32, b"", None)],
            [Icon("url2", 96, 96, b"", None), Icon("url3", 32, 32, b"", None)],
            id="two_items_with_size13",
        ),
        pytest.param(
            [Icon("url2", 26, 26, b"", None), Icon("url3", 26, 26, b"", None)],
            [Icon("url2", 26, 26, b"", None), Icon("url3", 26, 26, b"", None)],
            id="two_items_with_size14",
        ),
    ],
)
def test_get_sorted_icons(unsorted, expected):
    """Check the order produced by ``get_sorted_icons``.

    For two-item inputs whose first icon ends up last in ``expected``, the
    ``<`` and ``>`` operators of the icons are additionally checked to agree
    with that swap.

    Args:
        unsorted: icons handed to ``get_sorted_icons``.
        expected: list the call must be equal to.
    """
    assert get_sorted_icons(unsorted) == expected

    # The comparison-operator checks only apply to pairs.
    if len(unsorted) != 2:
        return

    first, second = unsorted
    if first == expected[1]:
        # The pair was swapped by sorting: both operators must reflect it.
        assert first < second
        assert second > first
|