2020-08-11 03:41:33 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# vim: ai ts=4 sts=4 et sw=4 nu
|
|
|
|
|
2020-07-22 13:22:36 -07:00
|
|
|
import os
|
2020-08-18 22:59:22 -07:00
|
|
|
import time
|
2020-07-22 21:11:18 -07:00
|
|
|
from io import BytesIO
|
|
|
|
|
2020-07-27 01:51:55 +00:00
|
|
|
import pytest
|
|
|
|
|
2020-07-22 13:22:36 -07:00
|
|
|
import libzim.reader
|
|
|
|
from warcio import ArchiveIterator
|
2020-08-10 18:48:01 -07:00
|
|
|
from jinja2 import Environment, PackageLoader
|
2020-07-22 13:22:36 -07:00
|
|
|
|
2020-08-10 18:48:01 -07:00
|
|
|
from warc2zim.main import warc2zim, HTML_RAW
|
2020-07-27 01:51:55 +00:00
|
|
|
|
2020-07-22 13:22:36 -07:00
|
|
|
|
2020-08-02 22:59:41 +00:00
|
|
|
# Argument lists fed to the parametrized ``cmdline`` fixture.  The first item
# of every entry is a WARC file name that the conversion test joins onto
# TEST_DATA_DIR; the remaining items are warc2zim command-line options.
CMDLINES = [
    # plain conversion, no options
    ["example-response.warc"],
    # explicit favicon URL
    [
        "example-resource.warc.gz",
        "--favicon", "https://example.com/some/favicon.ico",
    ],
    # "-a" flag only
    ["example-revisit.warc.gz", "-a"],
    # "-a" flag plus explicit "-u" URL and language
    [
        "example-revisit.warc.gz",
        "-a",
        "-u", "http://example.iana.org/",
        "--lang", "eng",
    ],
    # "-u" URL containing percent-encoded UTF-8 and a query string
    [
        "example-utf8.warc",
        "-u", "https://httpbin.org/anything/utf8=%E2%9C%93?query=test&a=b&1=%E2%9C%93",
    ],
    ["single-page-test.warc", "-a"],
]

# Absolute path of the "data" directory that sits next to this (symlink-resolved)
# test module.
TEST_DATA_DIR = os.path.join(os.path.split(os.path.realpath(__file__))[0], "data")
|
|
|
|
|
|
|
|
|
2020-08-03 17:22:29 -07:00
|
|
|
@pytest.fixture(params=CMDLINES, ids=[" ".join(cmds) for cmds in CMDLINES])
def cmdline(request):
    """Provide one ``CMDLINES`` entry per parametrized run, as a fresh list.

    The test id of each run is the entry's items joined with spaces.

    A shallow copy is returned rather than the entry itself, so a test that
    extends or edits its argument list cannot alter the module-level
    ``CMDLINES`` data that later (or repeated) runs are parametrized from.
    """
    return list(request.param)
|
|
|
|
|
|
|
|
|
|
|
|
# ============================================================================
|
|
|
|
class TestWarc2Zim(object):
    """Conversion tests: run ``warc2zim`` on sample WARCs and inspect the ZIM."""

    def list_articles(self, zimfile):
        """Yield every article of ``zimfile``, fetched by id 0..article_count-1."""
        zim_fh = libzim.reader.File(zimfile)
        for x in range(zim_fh.article_count):
            yield zim_fh.get_article_by_id(x)

    def get_article(self, zimfile, path):
        """Return the content of the article at ``path`` in ``zimfile`` as bytes."""
        zim_fh = libzim.reader.File(zimfile)
        return zim_fh.get_article(path).content.tobytes()

    def get_article_raw(self, zimfile, path):
        """Return the article object at ``path`` in ``zimfile`` (not its content)."""
        zim_fh = libzim.reader.File(zimfile)
        return zim_fh.get_article(path)

    def verify_warc_and_zim(self, warcfile, zimfile):
        """Assert that the records of ``warcfile`` are reflected in ``zimfile``.

        For each response/resource/revisit record with a target URI that has
        not been seen yet (and is not a revisit of its own URL):

        * the ``H/<url without scheme>`` entry must parse as a WARC record
          whose WARC and HTTP headers equal the original's;
        * revisit records and records with ``Content-Length: 0`` must have no
          ``A/<url without scheme>`` entry;
        * every other record must have an ``A/`` entry whose content equals
          the record payload, after removing the rendered ``sw_check.html``
          head insert when the entry's mimetype is ``HTML_RAW``.
        """
        assert os.path.isfile(warcfile)
        assert os.path.isfile(zimfile)

        # autoescape=False to allow injecting html entities from translated text
        env = Environment(
            loader=PackageLoader("warc2zim", "templates"),
            extensions=["jinja2.ext.i18n"],
            autoescape=False,
        )

        head_insert = env.get_template("sw_check.html").render().encode("utf-8")

        # track to avoid checking duplicates, which are not written to ZIM
        warc_urls = set()

        zim_fh = libzim.reader.File(zimfile)
        with open(warcfile, "rb") as warc_fh:
            for record in ArchiveIterator(warc_fh):
                url = record.rec_headers["WARC-Target-URI"]
                if not url:
                    continue

                if url in warc_urls:
                    continue

                if record.rec_type not in ("response", "resource", "revisit"):
                    continue

                # ignore revisit records that are to the same url
                if (
                    record.rec_type == "revisit"
                    and record.rec_headers["WARC-Refers-To-Target-URI"] == url
                ):
                    continue

                # parse headers as record, ensure headers match
                # NOTE(review): with maxsplit=2 a URL containing a second "//"
                # is cut at that point; kept unchanged because it presumably
                # mirrors how the converter derives ZIM paths -- confirm.
                url_no_scheme = url.split("//", 2)[1]
                headers = zim_fh.get_article("H/" + url_no_scheme)
                parsed_record = next(
                    ArchiveIterator(BytesIO(headers.content.tobytes()))
                )

                assert record.rec_headers == parsed_record.rec_headers
                assert record.http_headers == parsed_record.http_headers

                # ensure payloads match
                try:
                    payload = zim_fh.get_article("A/" + url_no_scheme)
                except KeyError:
                    payload = None

                if record.rec_type == "revisit" or (
                    record.http_headers
                    and record.http_headers.get("Content-Length") == "0"
                ):
                    assert payload is None
                else:
                    # fail with a readable message instead of an
                    # AttributeError on None further down
                    assert payload is not None, "missing A/ entry for " + url

                    payload_content = payload.content.tobytes()

                    # if HTML_RAW, still need to account for the head insert,
                    # otherwise should have exact match
                    if payload.mimetype == HTML_RAW:
                        assert head_insert in payload_content
                        assert (
                            payload_content.replace(head_insert, b"")
                            == record.content_stream().read()
                        )
                    else:
                        assert payload_content == record.content_stream().read()

                warc_urls.add(url)

    def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
        """Convert with explicit options and check the article list and metadata."""
        zim_output = "zim-out-filename.zim"
        warc2zim(
            [
                "-v",
                os.path.join(TEST_DATA_DIR, "example-response.warc"),
                "--name",
                "example-response",
                "--output",
                str(tmp_path),
                "--zim-file",
                zim_output,
                "-r",
                "https://cdn.jsdelivr.net/npm/@webrecorder/wabac@2.1.0-dev.3/dist/",
                "--tags",
                "some",
                "--tags",
                "foo",
                "--desc",
                "test zim",
                "--tags",
                "bar",
                "--title",
                "Some Title",
            ]
        )

        zim_output = tmp_path / zim_output

        assert os.path.isfile(zim_output)

        all_articles = {
            article.longurl: article.title for article in self.list_articles(zim_output)
        }

        assert all_articles == {
            # entries from WARC
            "A/example.com/": "Example Domain",
            "H/example.com/": "example.com/",
            # replay system files
            "A/index.html": "index.html",
            "A/load.js": "load.js",
            "A/404.html": "404.html",
            "A/sw.js": "sw.js",
            "A/topFrame.html": "topFrame.html",
            # ZIM metadata
            "M/Compression": "Compression",
            "M/Counter": "Counter",
            "M/Creator": "Creator",
            "M/Date": "Date",
            "M/Description": "Description",
            "M/Language": "Language",
            "M/Name": "Name",
            "M/Publisher": "Publisher",
            "M/Scraper": "Scraper",
            "M/Source": "Source",
            "M/Tags": "Tags",
            "M/Title": "Title",
            # Xapian
            "X/fulltext/xapian": "Xapian Fulltext Index",
            "X/title/xapian": "Xapian Title Index",
        }

        assert self.get_article(zim_output, "M/Description") == b"test zim"
        assert self.get_article(zim_output, "M/Tags") == b"some;foo;bar"
        assert self.get_article(zim_output, "M/Title") == b"Some Title"

    def test_warc_to_zim(self, cmdline, tmp_path):
        """Convert one parametrized command line and verify the ZIM against the WARC.

        The expected output name is ``<warc file name>_<YYYY-MM>.zim`` inside
        ``tmp_path``.
        """
        filename = cmdline[0]

        # hand the converter the same absolute path that is verified below,
        # so the test does not depend on the current working directory
        warcfile = os.path.join(TEST_DATA_DIR, filename)

        # build a new argument list instead of extending the fixture value,
        # which may be the shared module-level CMDLINES entry
        args = (
            [warcfile]
            + cmdline[1:]
            + ["--output", str(tmp_path), "--name", filename]
        )

        warc2zim(args)

        zimfile = filename + "_" + time.strftime("%Y-%m") + ".zim"

        self.verify_warc_and_zim(warcfile, tmp_path / zimfile)

    def test_same_domain_only(self, tmp_path):
        """Without "-a"/"-i", multi-segment A/ entries must be under example.com."""
        zim_output = "same-domain.zim"
        warc2zim(
            [
                os.path.join(TEST_DATA_DIR, "example-revisit.warc.gz"),
                "--favicon",
                "http://example.com/favicon.ico",
                "--lang",
                "eng",
                "--zim-file",
                zim_output,
                "--name",
                "same-domain",
                "--output",
                str(tmp_path),
            ]
        )

        zim_output = tmp_path / zim_output

        for article in self.list_articles(zim_output):
            url = article.longurl
            # ignore the replay files, which have only one path segment
            if url.startswith("A/") and len(url.split("/")) > 2:
                assert url.startswith("A/example.com/")

    def test_include_domains_favicon_and_language(self, tmp_path):
        """Check "-i" domain filtering plus the detected language and favicon."""
        zim_output = "spt.zim"
        warc2zim(
            [
                os.path.join(TEST_DATA_DIR, "single-page-test.warc"),
                "-i",
                "reseau-canope.fr",
                "--output",
                str(tmp_path),
                "--zim-file",
                zim_output,
                "--name",
                "spt",
            ]
        )

        zim_output = tmp_path / zim_output

        for article in self.list_articles(zim_output):
            url = article.longurl
            # ignore the replay files, which have only one path segment
            if url.startswith("A/") and len(url.split("/")) > 2:
                assert "reseau-canope.fr/" in url

        # test detected language
        assert self.get_article(zim_output, "M/Language") == b"fra"

        # test detected favicon
        favicon = self.get_article_raw(zim_output, "-/favicon")
        assert favicon.is_redirect
        assert (
            favicon.get_redirect_article().longurl
            == "A/lesfondamentaux.reseau-canope.fr/fileadmin/template/img/favicon.ico"
        )

    def test_error_bad_replay_viewer_url(self, tmp_path):
        """An invalid "-r" value must raise and leave no ZIM file behind."""
        zim_output_not_created = "zim-out-not-created.zim"
        with pytest.raises(Exception):
            warc2zim(
                [
                    "-v",
                    os.path.join(TEST_DATA_DIR, "example-response.warc"),
                    "-r",
                    "x-invalid-x",
                    "--output",
                    str(tmp_path),
                    "--name",
                    "bad",
                    "--zim-file",
                    zim_output_not_created,
                ]
            )

        # zim file should not have been created since replay viewer could not
        # be loaded; look in the --output directory, where it would be written
        assert not os.path.isfile(tmp_path / zim_output_not_created)

    def test_error_bad_main_page(self, tmp_path):
        """A "-u" URL that is not in the WARC must make the conversion raise."""
        zim_output_not_created = "zim-out-not-created.zim"
        with pytest.raises(Exception):
            warc2zim(
                [
                    "-v",
                    os.path.join(TEST_DATA_DIR, "example-response.warc"),
                    "-u",
                    "https://no-such-url.example.com",
                    "--output",
                    str(tmp_path),
                    "--name",
                    "bad",
                    "--zim-file",
                    zim_output_not_created,
                ]
            )
|