2020-08-11 03:41:33 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# vim: ai ts=4 sts=4 et sw=4 nu
|
|
|
|
|
2024-03-04 13:28:47 +00:00
|
|
|
import io
|
2020-12-09 10:55:59 +00:00
|
|
|
import json
|
2020-07-22 13:22:36 -07:00
|
|
|
import os
|
2023-11-14 16:12:17 +01:00
|
|
|
import re
|
2020-08-18 22:59:22 -07:00
|
|
|
import time
|
2024-03-18 09:56:32 +00:00
|
|
|
from urllib.parse import unquote
|
2020-07-22 21:11:18 -07:00
|
|
|
|
2020-07-27 01:51:55 +00:00
|
|
|
import pytest
|
2021-01-12 16:17:46 +00:00
|
|
|
import requests
|
2024-03-04 13:28:47 +00:00
|
|
|
from zimscraperlib.image.convertion import convert_image, resize_image
|
2022-01-07 13:52:57 +00:00
|
|
|
from zimscraperlib.zim import Archive
|
2020-07-22 13:22:36 -07:00
|
|
|
|
2024-01-31 11:47:01 +01:00
|
|
|
from warc2zim.__about__ import __version__
|
2023-11-14 14:56:23 +01:00
|
|
|
from warc2zim.converter import iter_warc_records
|
2023-11-28 15:27:01 +01:00
|
|
|
from warc2zim.main import main
|
2024-03-18 09:56:32 +00:00
|
|
|
from warc2zim.url_rewriting import HttpUrl, ZimPath, normalize
|
2024-01-18 17:15:16 +01:00
|
|
|
from warc2zim.utils import get_record_url
|
2023-11-14 14:56:23 +01:00
|
|
|
|
2021-10-29 16:52:36 +00:00
|
|
|
TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
|
|
|
|
|
2024-01-31 11:47:01 +01:00
|
|
|
SCRAPER_SUFFIX = " + zimit x.y.z-devw"
|
2021-10-29 16:52:36 +00:00
|
|
|
|
|
|
|
# ============================================================================
|
2020-08-02 22:59:41 +00:00
|
|
|
CMDLINES = [
|
|
|
|
["example-response.warc"],
|
2020-12-09 10:55:59 +00:00
|
|
|
["example-response.warc", "--progress-file", "progress.json"],
|
2024-01-31 11:47:01 +01:00
|
|
|
["example-response.warc", "--scraper-suffix", SCRAPER_SUFFIX],
|
2020-10-06 04:40:03 +00:00
|
|
|
["example-revisit.warc.gz"],
|
2020-08-03 17:22:29 -07:00
|
|
|
[
|
|
|
|
"example-revisit.warc.gz",
|
|
|
|
"-u",
|
|
|
|
"http://example.iana.org/",
|
|
|
|
"--lang",
|
|
|
|
"eng",
|
|
|
|
],
|
2020-08-02 23:04:32 +00:00
|
|
|
[
|
|
|
|
"example-utf8.warc",
|
|
|
|
"-u",
|
|
|
|
"https://httpbin.org/anything/utf8=%E2%9C%93?query=test&a=b&1=%E2%9C%93",
|
|
|
|
],
|
2020-10-06 04:40:03 +00:00
|
|
|
["single-page-test.warc"],
|
2020-07-27 08:26:57 -07:00
|
|
|
]
|
|
|
|
|
2020-07-22 13:22:36 -07:00
|
|
|
|
2020-08-03 17:22:29 -07:00
|
|
|
@pytest.fixture(params=CMDLINES, ids=[" ".join(cmds) for cmds in CMDLINES])
|
2020-08-02 22:59:41 +00:00
|
|
|
def cmdline(request):
|
2020-07-22 13:22:36 -07:00
|
|
|
return request.param
|
|
|
|
|
|
|
|
|
2021-10-29 16:52:36 +00:00
|
|
|
# ============================================================================
|
|
|
|
FUZZYCHECKS = [
|
|
|
|
{
|
|
|
|
"filename": "video-yt.warc.gz",
|
|
|
|
"entries": [
|
2023-11-14 16:12:17 +01:00
|
|
|
"youtube.fuzzy.replayweb.page/get_video_info?video_id=aT-Up5Y4uRI",
|
|
|
|
"youtube.fuzzy.replayweb.page/videoplayback?id=o-AE3bg3qVNY-gAWwYgL52vgpHKJe9ijdbu2eciNi5Uo_w",
|
2021-10-29 16:55:45 +00:00
|
|
|
],
|
2021-10-29 16:52:36 +00:00
|
|
|
},
|
|
|
|
{
|
|
|
|
"filename": "video-yt-2.warc.gz",
|
|
|
|
"entries": [
|
2023-11-14 16:12:17 +01:00
|
|
|
"youtube.fuzzy.replayweb.page/youtubei/v1/player?videoId=aT-Up5Y4uRI",
|
|
|
|
"youtube.fuzzy.replayweb.page/videoplayback?id=o-AGDtIqpFRmvgVVZk96wgGyFxL_SFSdpBxs0iBHatQpRD",
|
2021-10-29 16:55:45 +00:00
|
|
|
],
|
2021-10-29 16:52:36 +00:00
|
|
|
},
|
|
|
|
{
|
|
|
|
"filename": "video-vimeo.warc.gz",
|
|
|
|
"entries": [
|
2023-11-14 16:12:17 +01:00
|
|
|
"vimeo.fuzzy.replayweb.page/video/347119375",
|
|
|
|
"vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4",
|
2021-10-29 16:55:45 +00:00
|
|
|
],
|
|
|
|
},
|
2021-10-29 16:52:36 +00:00
|
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(params=FUZZYCHECKS, ids=[fuzzy["filename"] for fuzzy in FUZZYCHECKS])
|
|
|
|
def fuzzycheck(request):
|
|
|
|
return request.param
|
|
|
|
|
|
|
|
|
2020-07-22 13:22:36 -07:00
|
|
|
# ============================================================================
|
2024-01-18 17:15:16 +01:00
|
|
|
class TestWarc2Zim:
|
2020-08-02 00:12:50 -07:00
|
|
|
def list_articles(self, zimfile):
|
2022-01-07 13:52:57 +00:00
|
|
|
zim_fh = Archive(zimfile)
|
|
|
|
for x in range(zim_fh.entry_count):
|
|
|
|
yield zim_fh.get_entry_by_id(x)
|
|
|
|
|
2024-03-04 13:28:47 +00:00
|
|
|
def get_main_entry_with_redirect(self, zimfile):
|
|
|
|
zim_fh = Archive(zimfile)
|
|
|
|
if zim_fh.main_entry.is_redirect:
|
|
|
|
return zim_fh.main_entry.get_redirect_entry()
|
|
|
|
return zim_fh.main_entry
|
|
|
|
|
2022-01-07 13:52:57 +00:00
|
|
|
def get_metadata(self, zimfile, name):
|
|
|
|
zim_fh = Archive(zimfile)
|
|
|
|
return zim_fh.get_metadata(name)
|
2020-08-02 00:12:50 -07:00
|
|
|
|
|
|
|
def get_article(self, zimfile, path):
|
2022-01-07 13:52:57 +00:00
|
|
|
zim_fh = Archive(zimfile)
|
|
|
|
return zim_fh.get_content(path)
|
2020-08-02 00:12:50 -07:00
|
|
|
|
2020-08-03 09:26:48 -07:00
|
|
|
def get_article_raw(self, zimfile, path):
|
2022-01-07 13:52:57 +00:00
|
|
|
zim_fh = Archive(zimfile)
|
|
|
|
return zim_fh.get_item(path)
|
2020-08-03 09:26:48 -07:00
|
|
|
|
2024-04-30 14:24:58 +00:00
|
|
|
def assert_item_exist(self, zimfile, path):
|
|
|
|
zim_fh = Archive(zimfile)
|
|
|
|
assert zim_fh.get_item(path)
|
|
|
|
|
|
|
|
def assert_item_does_not_exist(self, zimfile, path):
|
|
|
|
zim_fh = Archive(zimfile)
|
|
|
|
try:
|
|
|
|
payload = zim_fh.get_item(path)
|
|
|
|
except KeyError:
|
|
|
|
payload = None
|
|
|
|
assert payload is None
|
|
|
|
|
2024-01-31 11:47:01 +01:00
|
|
|
def verify_warc_and_zim(self, warcfile, zimfile, verify_scraper_suffix):
|
2020-07-22 13:22:36 -07:00
|
|
|
assert os.path.isfile(warcfile)
|
|
|
|
assert os.path.isfile(zimfile)
|
|
|
|
|
2023-05-26 15:39:38 +03:00
|
|
|
# [TOFIX]
|
|
|
|
head_insert = b""
|
2020-08-10 18:48:01 -07:00
|
|
|
|
2020-07-22 13:22:36 -07:00
|
|
|
# track to avoid checking duplicates, which are not written to ZIM
|
|
|
|
warc_urls = set()
|
|
|
|
|
2022-01-07 13:52:57 +00:00
|
|
|
zim_fh = Archive(zimfile)
|
2024-01-31 11:47:01 +01:00
|
|
|
|
|
|
|
if verify_scraper_suffix:
|
|
|
|
assert (
|
|
|
|
f"warc2zim {__version__}{SCRAPER_SUFFIX}"
|
|
|
|
== zim_fh.get_text_metadata("Scraper")
|
|
|
|
)
|
2024-02-10 17:44:55 +01:00
|
|
|
else:
|
|
|
|
assert f"warc2zim {__version__}" == zim_fh.get_text_metadata("Scraper")
|
2024-01-31 11:47:01 +01:00
|
|
|
|
2021-10-29 16:52:36 +00:00
|
|
|
for record in iter_warc_records([warcfile]):
|
|
|
|
url = get_record_url(record)
|
|
|
|
if not url:
|
|
|
|
continue
|
|
|
|
|
|
|
|
if url in warc_urls:
|
|
|
|
continue
|
|
|
|
|
|
|
|
if record.rec_type not in (("response", "resource", "revisit")):
|
|
|
|
continue
|
|
|
|
|
|
|
|
# ignore revisit records that are to the same url
|
|
|
|
if (
|
|
|
|
record.rec_type == "revisit"
|
|
|
|
and record.rec_headers["WARC-Refers-To-Target-URI"] == url
|
|
|
|
):
|
|
|
|
continue
|
|
|
|
|
|
|
|
# parse headers as record, ensure headers match
|
|
|
|
url_no_scheme = url.split("//", 2)[1]
|
|
|
|
|
2023-11-14 16:12:17 +01:00
|
|
|
if "www.youtube.com/embed" in url_no_scheme:
|
|
|
|
# We know that those url are rewritten in zim. Don't check for them.
|
|
|
|
break
|
|
|
|
|
|
|
|
url_no_scheme = re.sub(r"\?\d+$", "?", url_no_scheme)
|
2021-10-29 16:52:36 +00:00
|
|
|
|
2024-03-18 09:56:32 +00:00
|
|
|
# remove user/password
|
|
|
|
if "@" in url_no_scheme:
|
|
|
|
at_index = url_no_scheme.index("@")
|
|
|
|
if at_index >= 0:
|
|
|
|
if "/" in url_no_scheme:
|
|
|
|
slash_index = url_no_scheme.index("/")
|
|
|
|
if at_index < slash_index:
|
|
|
|
url_no_scheme = url_no_scheme[at_index + 1 :]
|
|
|
|
else:
|
|
|
|
url_no_scheme = url_no_scheme[at_index + 1 :]
|
|
|
|
|
|
|
|
# remove trailing ?
|
|
|
|
if url_no_scheme.endswith("?"):
|
|
|
|
url_no_scheme = url_no_scheme[:-1]
|
|
|
|
|
|
|
|
# unquote url since everything is not encoded in ZIM
|
|
|
|
url_no_scheme = unquote(url_no_scheme)
|
2021-10-29 16:52:36 +00:00
|
|
|
|
|
|
|
# ensure payloads match
|
|
|
|
try:
|
2023-11-17 10:59:48 +01:00
|
|
|
payload = zim_fh.get_item(url_no_scheme)
|
2021-10-29 16:52:36 +00:00
|
|
|
except KeyError:
|
|
|
|
payload = None
|
|
|
|
|
2023-11-14 16:37:40 +01:00
|
|
|
if record.http_headers and record.http_headers.get("Content-Length") == "0":
|
2024-04-30 14:24:58 +00:00
|
|
|
if record.http_headers.get("Location"):
|
|
|
|
assert payload # this is a redirect, it must be handled
|
|
|
|
else:
|
|
|
|
assert not payload
|
2023-11-14 16:37:40 +01:00
|
|
|
elif record.rec_type == "revisit":
|
|
|
|
# We must have a payload
|
|
|
|
# We should check with the content of the targeted record...
|
|
|
|
# But difficult to test as we don't have it
|
|
|
|
assert payload
|
2024-02-29 18:15:20 +01:00
|
|
|
elif record.rec_type == "response":
|
2024-01-25 17:43:44 +01:00
|
|
|
# We must have a payload
|
|
|
|
assert payload
|
2021-10-29 16:52:36 +00:00
|
|
|
payload_content = payload.content.tobytes()
|
|
|
|
|
2024-01-18 17:15:16 +01:00
|
|
|
# if HTML, still need to account for the head insert, otherwise should
|
|
|
|
# have exact match
|
2023-02-02 15:32:10 +00:00
|
|
|
if payload.mimetype.startswith("text/html"):
|
2021-10-29 16:52:36 +00:00
|
|
|
assert head_insert in payload_content
|
2024-02-29 18:15:20 +01:00
|
|
|
elif record.rec_type == "resource":
|
|
|
|
# we do not want to embed resources "as-is"
|
|
|
|
assert not payload
|
2020-07-22 21:31:18 -07:00
|
|
|
|
2021-10-29 16:52:36 +00:00
|
|
|
warc_urls.add(url)
|
2020-07-22 21:31:18 -07:00
|
|
|
|
2024-03-04 13:28:47 +00:00
|
|
|
def rebuild_favicon_bytes(self, zim, favicon_path) -> bytes:
|
|
|
|
favicon_bytes = self.get_article(
|
|
|
|
zim,
|
|
|
|
favicon_path,
|
2020-10-21 20:52:05 +00:00
|
|
|
)
|
2024-03-04 13:28:47 +00:00
|
|
|
assert favicon_bytes
|
|
|
|
dst = io.BytesIO()
|
|
|
|
convert_image(
|
|
|
|
io.BytesIO(
|
|
|
|
favicon_bytes
|
|
|
|
), # pyright: ignore[reportGeneralTypeIssues, reportArgumentType]
|
|
|
|
dst, # pyright: ignore[reportGeneralTypeIssues, reportArgumentType]
|
|
|
|
fmt="PNG", # pyright: ignore[reportGeneralTypeIssues, reportArgumentType]
|
2020-10-21 20:52:05 +00:00
|
|
|
)
|
2024-03-04 13:28:47 +00:00
|
|
|
resize_image(dst, width=48, height=48, method="cover")
|
|
|
|
return dst.getvalue()
|
|
|
|
|
2024-03-18 09:56:32 +00:00
|
|
|
@pytest.mark.parametrize(
|
|
|
|
"url,zim_path",
|
|
|
|
[
|
|
|
|
("https://exemple.com", "exemple.com/"),
|
|
|
|
("https://exemple.com/", "exemple.com/"),
|
|
|
|
("http://example.com/resource", "example.com/resource"),
|
|
|
|
("http://example.com/resource/", "example.com/resource/"),
|
|
|
|
(
|
|
|
|
"http://example.com/resource/folder/sub.txt",
|
|
|
|
"example.com/resource/folder/sub.txt",
|
|
|
|
),
|
|
|
|
(
|
|
|
|
"http://example.com/resource/folder/sub",
|
|
|
|
"example.com/resource/folder/sub",
|
|
|
|
),
|
|
|
|
(
|
|
|
|
"http://example.com/resource/folder/sub?foo=bar",
|
|
|
|
"example.com/resource/folder/sub?foo=bar",
|
|
|
|
),
|
|
|
|
(
|
|
|
|
"http://example.com/resource/folder/sub?foo=bar#anchor1",
|
|
|
|
"example.com/resource/folder/sub?foo=bar",
|
|
|
|
),
|
|
|
|
("http://example.com/resource/#anchor1", "example.com/resource/"),
|
|
|
|
("http://example.com/resource/?foo=bar", "example.com/resource/?foo=bar"),
|
|
|
|
("http://example.com#anchor1", "example.com/"),
|
|
|
|
("http://example.com?foo=bar#anchor1", "example.com/?foo=bar"),
|
|
|
|
("http://example.com/?foo=bar", "example.com/?foo=bar"),
|
|
|
|
("http://example.com/?foo=ba+r", "example.com/?foo=ba r"),
|
|
|
|
(
|
|
|
|
"http://example.com/?foo=ba r",
|
|
|
|
"example.com/?foo=ba r",
|
|
|
|
), # situation where the ` ` has not been properly escaped in document
|
|
|
|
("http://example.com/?foo=ba%2Br", "example.com/?foo=ba+r"),
|
|
|
|
("http://example.com/?foo=ba+%2B+r", "example.com/?foo=ba + r"),
|
|
|
|
("http://example.com/#anchor1", "example.com/"),
|
|
|
|
(
|
|
|
|
"http://example.com/some/path/http://example.com//some/path",
|
|
|
|
"example.com/some/path/http://example.com//some/path",
|
|
|
|
),
|
|
|
|
(
|
|
|
|
"http://example.com/some/pa?th/http://example.com//some/path",
|
|
|
|
"example.com/some/pa?th/http://example.com//some/path",
|
|
|
|
),
|
|
|
|
(
|
|
|
|
"http://example.com/so?me/pa?th/http://example.com//some/path",
|
|
|
|
"example.com/so?me/pa?th/http://example.com//some/path",
|
|
|
|
),
|
|
|
|
("http://example.com/resource?", "example.com/resource"),
|
|
|
|
("http://example.com/resource#", "example.com/resource"),
|
|
|
|
("http://user@example.com/resource", "example.com/resource"),
|
|
|
|
("http://user:password@example.com/resource", "example.com/resource"),
|
|
|
|
("http://example.com:8080/resource", "example.com/resource"),
|
|
|
|
(
|
|
|
|
"http://foobargooglevideo.com/videoplayback?id=1576&key=value",
|
|
|
|
"youtube.fuzzy.replayweb.page/videoplayback?id=1576",
|
|
|
|
), # Fuzzy rule is applied in addition to path transformations
|
|
|
|
("https://xn--exmple-cva.com", "exémple.com/"),
|
|
|
|
("https://xn--exmple-cva.com/", "exémple.com/"),
|
|
|
|
("https://xn--exmple-cva.com/resource", "exémple.com/resource"),
|
|
|
|
("https://exémple.com/", "exémple.com/"),
|
|
|
|
("https://exémple.com/resource", "exémple.com/resource"),
|
2024-05-13 14:51:54 +00:00
|
|
|
# host_ip is an invalid hostname according to spec
|
2024-03-18 09:56:32 +00:00
|
|
|
("https://host_ip/", "host_ip/"),
|
|
|
|
("https://host_ip/resource", "host_ip/resource"),
|
2024-05-13 14:51:54 +00:00
|
|
|
("https://192.168.1.1/", "192.168.1.1/"),
|
|
|
|
("https://192.168.1.1/resource", "192.168.1.1/resource"),
|
2024-03-18 09:56:32 +00:00
|
|
|
("http://example.com/res%24urce", "example.com/res$urce"),
|
|
|
|
(
|
|
|
|
"http://example.com/resource?foo=b%24r",
|
|
|
|
"example.com/resource?foo=b$r",
|
|
|
|
),
|
|
|
|
("http://example.com/resource@300x", "example.com/resource@300x"),
|
|
|
|
("http://example.com:8080/resource", "example.com/resource"),
|
|
|
|
("http://user@example.com:8080/resource", "example.com/resource"),
|
|
|
|
("http://user:password@example.com:8080/resource", "example.com/resource"),
|
|
|
|
# the two URI below are an illustration of a potential collision (two
|
|
|
|
# differents URI leading to the same ZIM path)
|
|
|
|
(
|
|
|
|
"http://tmp.kiwix.org/ci/test-website/images/urlencoding1_ico%CC%82ne-"
|
|
|
|
"de%CC%81buter-Solidarite%CC%81-Nume%CC%81rique_1%40300x.png",
|
|
|
|
"tmp.kiwix.org/ci/test-website/images/urlencoding1_icône-débuter-"
|
|
|
|
"Solidarité-Numérique_1@300x.png",
|
|
|
|
),
|
|
|
|
(
|
|
|
|
"https://tmp.kiwix.org/ci/test-website/images/urlencoding1_ico%CC%82ne-"
|
|
|
|
"de%CC%81buter-Solidarite%CC%81-Nume%CC%81rique_1@300x.png",
|
|
|
|
"tmp.kiwix.org/ci/test-website/images/urlencoding1_icône-débuter-"
|
|
|
|
"Solidarité-Numérique_1@300x.png",
|
|
|
|
),
|
|
|
|
],
|
|
|
|
)
|
|
|
|
def test_normalize(self, url, zim_path):
|
|
|
|
assert normalize(HttpUrl(url)) == ZimPath(zim_path)
|
2020-10-21 20:52:05 +00:00
|
|
|
|
2020-08-18 22:59:22 -07:00
|
|
|
def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
|
2020-08-03 17:22:29 -07:00
|
|
|
zim_output = "zim-out-filename.zim"
|
2023-11-28 15:27:01 +01:00
|
|
|
main(
|
2020-07-27 08:26:57 -07:00
|
|
|
[
|
|
|
|
"-v",
|
2020-08-18 22:59:22 -07:00
|
|
|
os.path.join(TEST_DATA_DIR, "example-response.warc"),
|
|
|
|
"--name",
|
|
|
|
"example-response",
|
|
|
|
"--output",
|
|
|
|
str(tmp_path),
|
|
|
|
"--zim-file",
|
2020-07-27 08:26:57 -07:00
|
|
|
zim_output,
|
2020-08-02 00:12:50 -07:00
|
|
|
"--tags",
|
2024-05-21 12:35:09 +00:00
|
|
|
" foo ;bar; ; some;_foo:bar;_foo_,_bar_",
|
2020-08-02 00:12:50 -07:00
|
|
|
"--desc",
|
|
|
|
"test zim",
|
2020-08-03 17:22:29 -07:00
|
|
|
"--title",
|
|
|
|
"Some Title",
|
2020-07-27 08:26:57 -07:00
|
|
|
]
|
|
|
|
)
|
2020-07-22 21:31:18 -07:00
|
|
|
|
2020-08-18 22:59:22 -07:00
|
|
|
zim_output = tmp_path / zim_output
|
|
|
|
|
2020-07-22 21:31:18 -07:00
|
|
|
assert os.path.isfile(zim_output)
|
|
|
|
|
2020-08-02 00:12:50 -07:00
|
|
|
all_articles = {
|
2022-01-07 13:52:57 +00:00
|
|
|
article.path: article.title for article in self.list_articles(zim_output)
|
2020-08-02 00:12:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
assert all_articles == {
|
|
|
|
# entries from WARC
|
2023-12-19 14:32:28 +01:00
|
|
|
"example.com/": "Example Domain",
|
|
|
|
"_zim_static/__wb_module_decl.js": "_zim_static/__wb_module_decl.js",
|
2023-12-12 11:32:13 +01:00
|
|
|
"_zim_static/wombat.js": "_zim_static/wombat.js",
|
2024-03-28 15:52:49 +00:00
|
|
|
"_zim_static/wombatSetup.js": "_zim_static/wombatSetup.js",
|
2020-08-02 00:12:50 -07:00
|
|
|
}
|
|
|
|
|
2022-01-07 13:52:57 +00:00
|
|
|
zim_fh = Archive(zim_output)
|
|
|
|
|
|
|
|
# ZIM metadata
|
|
|
|
assert list(zim_fh.metadata.keys()) == [
|
|
|
|
"Counter",
|
|
|
|
"Creator",
|
|
|
|
"Date",
|
|
|
|
"Description",
|
|
|
|
"Language",
|
|
|
|
"Name",
|
|
|
|
"Publisher",
|
|
|
|
"Scraper",
|
|
|
|
"Tags",
|
|
|
|
"Title",
|
|
|
|
]
|
|
|
|
|
|
|
|
assert zim_fh.has_fulltext_index
|
|
|
|
assert zim_fh.has_title_index
|
|
|
|
|
|
|
|
assert self.get_metadata(zim_output, "Description") == b"test zim"
|
2024-05-21 12:35:09 +00:00
|
|
|
# we compare sets of tags since tags ordering has no meaning
|
|
|
|
assert set(
|
|
|
|
self.get_metadata(zim_output, "Tags").decode("utf-8").split(";")
|
|
|
|
) == {
|
|
|
|
"_ftindex:yes",
|
|
|
|
"_category:other",
|
|
|
|
"some",
|
|
|
|
"foo",
|
|
|
|
"bar",
|
|
|
|
"_foo:bar",
|
|
|
|
"_foo_,_bar_",
|
|
|
|
}
|
2022-01-07 13:52:57 +00:00
|
|
|
assert self.get_metadata(zim_output, "Title") == b"Some Title"
|
2020-08-02 00:12:50 -07:00
|
|
|
|
2024-03-18 09:56:32 +00:00
|
|
|
def test_warc_to_zim_main(self, cmdline, tmp_path):
|
2020-08-18 23:15:07 -07:00
|
|
|
# intput filename
|
2020-08-02 22:59:41 +00:00
|
|
|
filename = cmdline[0]
|
2020-08-03 17:22:29 -07:00
|
|
|
|
2020-08-18 23:15:07 -07:00
|
|
|
# set intput filename (first arg) to absolute path from test dir
|
2020-08-18 22:59:22 -07:00
|
|
|
warcfile = os.path.join(TEST_DATA_DIR, filename)
|
2020-08-18 23:15:07 -07:00
|
|
|
cmdline[0] = warcfile
|
2020-07-22 21:31:18 -07:00
|
|
|
|
2020-08-18 23:15:07 -07:00
|
|
|
cmdline.extend(["--output", str(tmp_path), "--name", filename])
|
2020-08-18 22:59:22 -07:00
|
|
|
|
2023-11-28 15:27:01 +01:00
|
|
|
main(cmdline)
|
2020-07-22 21:31:18 -07:00
|
|
|
|
2020-08-18 23:15:07 -07:00
|
|
|
zimfile = filename + "_" + time.strftime("%Y-%m") + ".zim"
|
2020-07-22 21:31:18 -07:00
|
|
|
|
2020-12-09 10:55:59 +00:00
|
|
|
if "--progress-file" in cmdline:
|
2024-01-18 17:15:16 +01:00
|
|
|
with open(tmp_path / "progress.json") as fh:
|
2020-12-09 10:55:59 +00:00
|
|
|
progress = json.load(fh)
|
|
|
|
assert (
|
|
|
|
progress["written"] > 0
|
|
|
|
and progress["total"] > 0
|
|
|
|
and progress["written"] <= progress["total"]
|
|
|
|
)
|
|
|
|
|
2024-01-31 11:47:01 +01:00
|
|
|
self.verify_warc_and_zim(
|
|
|
|
warcfile, tmp_path / zimfile, "--scraper-suffix" in cmdline
|
|
|
|
)
|
2020-07-22 21:31:18 -07:00
|
|
|
|
2020-08-18 22:59:22 -07:00
|
|
|
def test_same_domain_only(self, tmp_path):
|
2020-08-03 17:22:29 -07:00
|
|
|
zim_output = "same-domain.zim"
|
2023-11-28 15:27:01 +01:00
|
|
|
main(
|
2020-08-03 17:22:29 -07:00
|
|
|
[
|
2020-08-18 22:59:22 -07:00
|
|
|
os.path.join(TEST_DATA_DIR, "example-revisit.warc.gz"),
|
2020-08-03 17:22:29 -07:00
|
|
|
"--favicon",
|
|
|
|
"http://example.com/favicon.ico",
|
2020-10-06 04:40:03 +00:00
|
|
|
"--include-domains",
|
|
|
|
"example.com/",
|
2020-08-03 17:22:29 -07:00
|
|
|
"--lang",
|
|
|
|
"eng",
|
2020-08-18 22:59:22 -07:00
|
|
|
"--zim-file",
|
2020-08-03 17:22:29 -07:00
|
|
|
zim_output,
|
2020-08-18 22:59:22 -07:00
|
|
|
"--name",
|
|
|
|
"same-domain",
|
|
|
|
"--output",
|
|
|
|
str(tmp_path),
|
2020-08-03 17:22:29 -07:00
|
|
|
]
|
|
|
|
)
|
|
|
|
|
2020-08-18 22:59:22 -07:00
|
|
|
zim_output = tmp_path / zim_output
|
|
|
|
|
2020-08-03 17:22:29 -07:00
|
|
|
for article in self.list_articles(zim_output):
|
2022-01-07 13:52:57 +00:00
|
|
|
url = article.path
|
2020-08-03 17:22:29 -07:00
|
|
|
# ignore the replay files, which have only one path segment
|
2023-11-17 10:59:48 +01:00
|
|
|
if not url.startswith("_zim_static/"):
|
|
|
|
assert url.startswith("example.com/")
|
2020-08-03 17:22:29 -07:00
|
|
|
|
2020-09-25 02:47:16 +00:00
|
|
|
def test_skip_self_redirect(self, tmp_path):
|
|
|
|
zim_output = "self-redir.zim"
|
2023-11-28 15:27:01 +01:00
|
|
|
main(
|
2020-09-25 02:47:16 +00:00
|
|
|
[
|
|
|
|
os.path.join(TEST_DATA_DIR, "self-redirect.warc"),
|
|
|
|
"--output",
|
|
|
|
str(tmp_path),
|
|
|
|
"--zim-file",
|
|
|
|
zim_output,
|
|
|
|
"--name",
|
|
|
|
"self-redir",
|
|
|
|
]
|
|
|
|
)
|
|
|
|
|
|
|
|
zim_output = tmp_path / zim_output
|
|
|
|
|
2020-08-18 22:59:22 -07:00
|
|
|
def test_include_domains_favicon_and_language(self, tmp_path):
|
2020-08-03 17:22:29 -07:00
|
|
|
zim_output = "spt.zim"
|
2023-11-28 15:27:01 +01:00
|
|
|
main(
|
2020-08-03 10:26:24 -07:00
|
|
|
[
|
2020-08-18 22:59:22 -07:00
|
|
|
os.path.join(TEST_DATA_DIR, "single-page-test.warc"),
|
2020-08-03 10:26:24 -07:00
|
|
|
"-i",
|
|
|
|
"reseau-canope.fr",
|
2020-08-18 22:59:22 -07:00
|
|
|
"--output",
|
|
|
|
str(tmp_path),
|
|
|
|
"--zim-file",
|
2020-08-03 10:26:24 -07:00
|
|
|
zim_output,
|
2020-08-18 22:59:22 -07:00
|
|
|
"--name",
|
|
|
|
"spt",
|
2020-08-03 10:26:24 -07:00
|
|
|
]
|
|
|
|
)
|
2020-08-03 09:26:48 -07:00
|
|
|
|
2020-08-18 22:59:22 -07:00
|
|
|
zim_output = tmp_path / zim_output
|
|
|
|
|
2020-08-03 09:26:48 -07:00
|
|
|
for article in self.list_articles(zim_output):
|
2022-01-07 13:52:57 +00:00
|
|
|
url = article.path
|
2020-08-03 17:22:29 -07:00
|
|
|
# ignore the replay files, which have only one path segment
|
2023-11-17 10:59:48 +01:00
|
|
|
if not url.startswith("_zim_static/"):
|
2020-08-03 17:22:29 -07:00
|
|
|
assert "reseau-canope.fr/" in url
|
2020-08-03 09:26:48 -07:00
|
|
|
|
|
|
|
# test detected language
|
2022-01-07 13:52:57 +00:00
|
|
|
assert self.get_metadata(zim_output, "Language") == b"fra"
|
2020-08-03 09:26:48 -07:00
|
|
|
|
|
|
|
# test detected favicon
|
2024-03-04 13:28:47 +00:00
|
|
|
zim_favicon = self.get_metadata(zim_output, "Illustration_48x48@1")
|
|
|
|
assert zim_favicon
|
|
|
|
|
|
|
|
assert (
|
|
|
|
self.rebuild_favicon_bytes(
|
|
|
|
zim_output,
|
|
|
|
"lesfondamentaux.reseau-canope.fr/fileadmin/template/img/favicon.ico",
|
|
|
|
)
|
|
|
|
== zim_favicon
|
2020-08-03 10:26:24 -07:00
|
|
|
)
|
2020-08-03 09:26:48 -07:00
|
|
|
|
2024-05-21 12:35:09 +00:00
|
|
|
# test default tags added ; we compare sets of tags since tags ordering has no
|
|
|
|
# meaning
|
|
|
|
assert set(
|
|
|
|
self.get_metadata(zim_output, "Tags").decode("utf-8").split(";")
|
|
|
|
) == {
|
|
|
|
"_ftindex:yes",
|
|
|
|
"_category:other",
|
|
|
|
}
|
2020-08-19 18:36:04 +00:00
|
|
|
|
2024-03-04 13:28:47 +00:00
|
|
|
def test_website_with_redirect(self, tmp_path):
|
|
|
|
zim_output = "kiwix.zim"
|
|
|
|
main(
|
|
|
|
[
|
|
|
|
os.path.join(TEST_DATA_DIR, "kiwix-with-redirects.warc.gz"),
|
|
|
|
"-u",
|
|
|
|
"http://www.kiwix.org",
|
|
|
|
"--output",
|
|
|
|
str(tmp_path),
|
|
|
|
"--zim-file",
|
|
|
|
zim_output,
|
|
|
|
"--name",
|
|
|
|
"kiwix",
|
|
|
|
]
|
2020-08-03 10:26:24 -07:00
|
|
|
)
|
2020-08-03 09:26:48 -07:00
|
|
|
|
2024-03-04 13:28:47 +00:00
|
|
|
zim_output = tmp_path / zim_output
|
|
|
|
|
|
|
|
# check that redirections have been followed
|
|
|
|
assert self.get_main_entry_with_redirect(zim_output).path == "kiwix.org/en/"
|
|
|
|
|
|
|
|
# test detected language
|
|
|
|
assert self.get_metadata(zim_output, "Language") == b"eng"
|
|
|
|
|
|
|
|
# test detected favicon
|
|
|
|
zim_favicon = self.get_metadata(zim_output, "Illustration_48x48@1")
|
|
|
|
assert zim_favicon
|
|
|
|
|
2020-08-19 18:36:04 +00:00
|
|
|
assert (
|
2024-03-04 13:28:47 +00:00
|
|
|
self.rebuild_favicon_bytes(
|
|
|
|
zim_output,
|
|
|
|
"kiwix.org/favicon.ico",
|
|
|
|
)
|
|
|
|
== zim_favicon
|
2020-08-19 18:36:04 +00:00
|
|
|
)
|
|
|
|
|
2020-10-19 18:00:10 +00:00
|
|
|
def test_all_warcs_root_dir(self, tmp_path):
|
|
|
|
zim_output = "test-all.zim"
|
2023-11-28 15:27:01 +01:00
|
|
|
main(
|
2020-10-19 18:00:10 +00:00
|
|
|
[
|
|
|
|
os.path.join(TEST_DATA_DIR),
|
|
|
|
"--output",
|
|
|
|
str(tmp_path),
|
|
|
|
"--zim-file",
|
|
|
|
zim_output,
|
|
|
|
"--name",
|
|
|
|
"test-all",
|
|
|
|
"--url",
|
|
|
|
"http://example.com",
|
|
|
|
]
|
|
|
|
)
|
|
|
|
zim_output = tmp_path / zim_output
|
2020-10-22 01:05:57 +00:00
|
|
|
|
|
|
|
# check articles from different warc records in tests/data dir
|
|
|
|
|
|
|
|
# from example.warc.gz
|
2023-05-26 15:39:38 +03:00
|
|
|
assert self.get_article(zim_output, "example.com/") != b""
|
2020-10-22 01:05:57 +00:00
|
|
|
|
|
|
|
# from single-page-test.warc
|
2020-10-19 18:00:10 +00:00
|
|
|
assert (
|
|
|
|
self.get_article(
|
2023-05-26 15:39:38 +03:00
|
|
|
zim_output, "lesfondamentaux.reseau-canope.fr/accueil.html"
|
2020-10-19 18:00:10 +00:00
|
|
|
)
|
|
|
|
!= b""
|
|
|
|
)
|
|
|
|
|
2020-10-22 01:05:57 +00:00
|
|
|
# timestamp fuzzy match from example-with-timestamp.warc
|
2024-03-18 09:56:32 +00:00
|
|
|
assert self.get_article(zim_output, "example.com/path.txt") != b""
|
2020-10-22 01:05:57 +00:00
|
|
|
|
2021-10-29 16:52:36 +00:00
|
|
|
def test_fuzzy_urls(self, tmp_path, fuzzycheck):
|
|
|
|
zim_output = fuzzycheck["filename"] + ".zim"
|
2023-11-28 15:27:01 +01:00
|
|
|
main(
|
2020-10-21 04:23:36 +00:00
|
|
|
[
|
2021-10-29 16:52:36 +00:00
|
|
|
os.path.join(TEST_DATA_DIR, fuzzycheck["filename"]),
|
2020-10-21 04:23:36 +00:00
|
|
|
"--output",
|
|
|
|
str(tmp_path),
|
|
|
|
"--zim-file",
|
|
|
|
zim_output,
|
|
|
|
"--name",
|
|
|
|
"test-fuzzy",
|
|
|
|
]
|
|
|
|
)
|
|
|
|
zim_output = tmp_path / zim_output
|
2021-10-29 02:20:01 +00:00
|
|
|
|
2021-10-29 16:52:36 +00:00
|
|
|
for entry in fuzzycheck["entries"]:
|
2024-01-18 17:15:16 +01:00
|
|
|
# This should be item and get_article_raw is eq to getItem and it will fail
|
|
|
|
# if it is not a item
|
2023-11-14 16:12:17 +01:00
|
|
|
self.get_article_raw(zim_output, entry)
|
2020-08-03 09:26:48 -07:00
|
|
|
|
2020-08-18 22:59:22 -07:00
|
|
|
def test_error_bad_main_page(self, tmp_path):
|
2020-08-03 17:22:29 -07:00
|
|
|
zim_output_not_created = "zim-out-not-created.zim"
|
2024-03-18 09:56:32 +00:00
|
|
|
with pytest.raises(KeyError, match="Unable to find WARC record for main page:"):
|
2023-11-28 15:27:01 +01:00
|
|
|
main(
|
2020-08-03 09:26:48 -07:00
|
|
|
[
|
|
|
|
"-v",
|
2020-08-18 22:59:22 -07:00
|
|
|
os.path.join(TEST_DATA_DIR, "example-response.warc"),
|
2020-08-03 09:26:48 -07:00
|
|
|
"-u",
|
|
|
|
"https://no-such-url.example.com",
|
2020-08-18 22:59:22 -07:00
|
|
|
"--output",
|
|
|
|
str(tmp_path),
|
|
|
|
"--name",
|
|
|
|
"bad",
|
|
|
|
"--zim-file",
|
2020-08-03 09:26:48 -07:00
|
|
|
zim_output_not_created,
|
|
|
|
]
|
|
|
|
)
|
2020-10-06 04:26:34 +00:00
|
|
|
|
|
|
|
def test_args_only(self):
|
|
|
|
# error, name required
|
|
|
|
with pytest.raises(SystemExit) as e:
|
2023-11-28 15:27:01 +01:00
|
|
|
main([])
|
2024-01-25 16:27:42 +01:00
|
|
|
assert e.value.code == 2
|
2020-10-06 04:26:34 +00:00
|
|
|
|
|
|
|
# error, no such output directory
|
2024-04-11 21:49:14 +05:30
|
|
|
with pytest.raises(SystemExit) as e:
|
2023-11-28 15:27:01 +01:00
|
|
|
main(["--name", "test", "--output", "/no-such-dir"])
|
2024-05-24 13:53:12 +00:00
|
|
|
assert e.value.code == 1
|
2020-10-06 04:26:34 +00:00
|
|
|
|
2024-04-18 10:38:16 +05:30
|
|
|
# error, name has invalid characters for Linux filesystem
|
|
|
|
with pytest.raises(SystemExit) as e:
|
|
|
|
main(["--name", "te/st", "--output", "./"])
|
2024-05-24 13:53:12 +00:00
|
|
|
assert e.value.code == 3
|
2024-04-18 10:38:16 +05:30
|
|
|
|
2024-01-31 15:13:32 +01:00
|
|
|
# success, special return code for no output files
|
2023-11-28 15:27:01 +01:00
|
|
|
assert main(["--name", "test", "--output", "./"]) == 100
|
2021-01-12 16:17:46 +00:00
|
|
|
|
|
|
|
def test_custom_css(self, tmp_path):
|
|
|
|
custom_css = b"* { background-color: red; }"
|
|
|
|
custom_css_path = tmp_path / "custom.css"
|
|
|
|
with open(custom_css_path, "wb") as fh:
|
|
|
|
fh.write(custom_css)
|
|
|
|
|
|
|
|
zim_output = "test-css.zim"
|
|
|
|
|
2023-11-28 15:27:01 +01:00
|
|
|
main(
|
2021-01-12 16:17:46 +00:00
|
|
|
[
|
|
|
|
os.path.join(TEST_DATA_DIR, "example-response.warc"),
|
|
|
|
"--output",
|
|
|
|
str(tmp_path),
|
|
|
|
"--zim-file",
|
|
|
|
zim_output,
|
|
|
|
"--name",
|
|
|
|
"test-css",
|
|
|
|
"--custom-css",
|
|
|
|
str(custom_css_path),
|
|
|
|
]
|
|
|
|
)
|
|
|
|
zim_output = tmp_path / zim_output
|
|
|
|
|
2023-11-17 10:59:48 +01:00
|
|
|
res = self.get_article(zim_output, "example.com/")
|
2024-05-21 09:14:15 +00:00
|
|
|
assert b"static_prefix" not in res
|
|
|
|
assert b"../_zim_static/custom.css" in res
|
2021-01-12 16:17:46 +00:00
|
|
|
|
2024-05-21 09:14:15 +00:00
|
|
|
res = self.get_article(zim_output, "_zim_static/custom.css")
|
2021-01-12 16:17:46 +00:00
|
|
|
assert custom_css == res
|
|
|
|
|
|
|
|
def test_custom_css_remote(self, tmp_path):
|
|
|
|
zim_output = "test-css.zim"
|
|
|
|
url = (
|
|
|
|
"https://cdn.jsdelivr.net/npm/bootstrap@4.5.3/dist/css/bootstrap-reboot.css"
|
|
|
|
)
|
|
|
|
|
2023-11-28 15:27:01 +01:00
|
|
|
main(
|
2021-01-12 16:17:46 +00:00
|
|
|
[
|
|
|
|
os.path.join(TEST_DATA_DIR, "example-response.warc"),
|
|
|
|
"--output",
|
|
|
|
str(tmp_path),
|
|
|
|
"--zim-file",
|
|
|
|
zim_output,
|
|
|
|
"--name",
|
|
|
|
"test-css",
|
|
|
|
"--custom-css",
|
|
|
|
url,
|
|
|
|
]
|
|
|
|
)
|
|
|
|
zim_output = tmp_path / zim_output
|
|
|
|
|
2023-11-17 10:59:48 +01:00
|
|
|
res = self.get_article(zim_output, "example.com/")
|
2024-05-21 09:14:15 +00:00
|
|
|
assert b"static_prefix" not in res
|
|
|
|
assert b"../_zim_static/custom.css" in res
|
2021-01-12 16:17:46 +00:00
|
|
|
|
2024-05-21 09:14:15 +00:00
|
|
|
res = self.get_article(zim_output, "_zim_static/custom.css")
|
2024-01-18 17:15:16 +01:00
|
|
|
assert res == requests.get(url, timeout=10).content
|
2024-04-30 14:24:58 +00:00
|
|
|
|
|
|
|
def test_http_return_codes(self, tmp_path):
|
|
|
|
zim_output = "test-http-return-codes.zim"
|
2021-01-12 16:17:46 +00:00
|
|
|
|
2024-04-30 14:24:58 +00:00
|
|
|
main(
|
|
|
|
[
|
|
|
|
os.path.join(TEST_DATA_DIR, "http-return-codes.warc.gz"),
|
|
|
|
"--output",
|
|
|
|
str(tmp_path),
|
|
|
|
"--zim-file",
|
|
|
|
zim_output,
|
|
|
|
"--name",
|
|
|
|
"test-http-return-codes",
|
|
|
|
]
|
|
|
|
)
|
|
|
|
zim_output = tmp_path / zim_output
|
|
|
|
|
|
|
|
for exising_website_items in [
|
|
|
|
"200-response",
|
|
|
|
"201-response",
|
|
|
|
"202-response",
|
|
|
|
"301-internal-redirect-ok",
|
|
|
|
"301-external-redirect-ok",
|
|
|
|
"302-internal-redirect-ok",
|
|
|
|
"302-external-redirect-ok",
|
|
|
|
"307-internal-redirect-ok",
|
|
|
|
"307-external-redirect-ok",
|
|
|
|
"308-internal-redirect-ok",
|
|
|
|
"308-external-redirect-ok",
|
|
|
|
]:
|
|
|
|
self.assert_item_exist(
|
|
|
|
zim_output, f"website.test.openzim.org/{exising_website_items}"
|
|
|
|
)
|
|
|
|
|
|
|
|
self.assert_item_exist(zim_output, "www.example.com/")
|
|
|
|
|
|
|
|
for ignored_website_items in [
|
|
|
|
"204-response",
|
|
|
|
"206-response",
|
|
|
|
"300-response",
|
|
|
|
"303-response",
|
|
|
|
"304-response",
|
|
|
|
"305-response",
|
|
|
|
"306-response",
|
|
|
|
"400-response",
|
|
|
|
"401-response",
|
|
|
|
"402-response",
|
|
|
|
"403-response",
|
|
|
|
"404-response",
|
|
|
|
"500-response",
|
|
|
|
"501-response",
|
|
|
|
"502-response",
|
|
|
|
"301-internal-redirect-ko",
|
|
|
|
"301-external-redirect-ko",
|
|
|
|
"302-internal-redirect-ko",
|
|
|
|
"302-external-redirect-ko",
|
|
|
|
"307-internal-redirect-ko",
|
|
|
|
"307-external-redirect-ko",
|
|
|
|
"308-internal-redirect-ko",
|
|
|
|
"308-external-redirect-ko",
|
|
|
|
]:
|
|
|
|
self.assert_item_does_not_exist(
|
|
|
|
zim_output, f"website.test.openzim.org/{ignored_website_items}"
|
|
|
|
)
|
2024-05-27 13:06:32 +00:00
|
|
|
|
|
|
|
def test_redirection_loops(self, tmp_path):
|
|
|
|
zim_output = "test-redir-loops.zim"
|
|
|
|
|
|
|
|
main(
|
|
|
|
[
|
|
|
|
os.path.join(TEST_DATA_DIR, "redir-loops.warc.gz"),
|
|
|
|
"--output",
|
|
|
|
str(tmp_path),
|
|
|
|
"--zim-file",
|
|
|
|
zim_output,
|
|
|
|
"--name",
|
|
|
|
"test-redir-loops",
|
|
|
|
]
|
|
|
|
)
|
|
|
|
zim_output = tmp_path / zim_output
|
|
|
|
|
|
|
|
for exising_website_items in [
|
|
|
|
"redirection-loops.html",
|
|
|
|
]:
|
|
|
|
self.assert_item_exist(
|
|
|
|
zim_output, f"website.test.openzim.org/{exising_website_items}"
|
|
|
|
)
|
|
|
|
|
|
|
|
for ignored_website_items in [
|
|
|
|
"/bad-redir-loop-A",
|
|
|
|
"/bad-redir-loop-B",
|
|
|
|
"/bad-redir-loop-C",
|
|
|
|
"/bad-redir-loop-D",
|
|
|
|
]:
|
|
|
|
self.assert_item_does_not_exist(
|
|
|
|
zim_output, f"website.test.openzim.org/{ignored_website_items}"
|
|
|
|
)
|
2024-06-13 07:40:05 +00:00
|
|
|
|
|
|
|
def test_content_resource_types(self, tmp_path):
|
|
|
|
zim_output = "tests_en_content-resource-types.zim"
|
|
|
|
|
|
|
|
main(
|
|
|
|
[
|
|
|
|
os.path.join(TEST_DATA_DIR, "content-resource-types.warc.gz"),
|
|
|
|
"--output",
|
|
|
|
str(tmp_path),
|
|
|
|
"--zim-file",
|
|
|
|
zim_output,
|
|
|
|
"--name",
|
|
|
|
"tests_en_content-resource-types",
|
|
|
|
]
|
|
|
|
)
|
|
|
|
zim_output = tmp_path / zim_output
|
|
|
|
|
|
|
|
res = self.get_article(
|
|
|
|
zim_output, "website.test.openzim.org/content-types/index.html"
|
|
|
|
)
|
|
|
|
assert b"<!-- WB Insert -->" in res # simple check that rewriting has been done
|
|
|
|
|
|
|
|
for js_file in [
|
|
|
|
"website.test.openzim.org/content-types/script1.js",
|
|
|
|
"website.test.openzim.org/content-types/script2.js",
|
|
|
|
]:
|
|
|
|
res = self.get_article(zim_output, js_file)
|
|
|
|
assert b"wombat" in res # simple check that rewriting has been done
|