2020-07-22 13:22:36 -07:00
|
|
|
import tempfile
|
|
|
|
import shutil
|
|
|
|
import os
|
2020-07-22 21:11:18 -07:00
|
|
|
from io import BytesIO
|
|
|
|
|
2020-07-27 01:51:55 +00:00
|
|
|
import pytest
|
|
|
|
|
2020-07-22 13:22:36 -07:00
|
|
|
import libzim.reader
|
|
|
|
from warcio import ArchiveIterator
|
|
|
|
|
2020-07-27 01:51:55 +00:00
|
|
|
from warc2zim.main import warc2zim
|
|
|
|
|
2020-07-22 13:22:36 -07:00
|
|
|
|
2020-07-27 08:26:57 -07:00
|
|
|
# WARC files fed, one at a time, to the parametrized ``filename`` fixture.
# Names are relative to the tests' ``data`` directory.
WARCS = [
    "example-response.warc",
    "example-resource.warc.gz",
    "example-revisit.warc.gz",
    "example-utf8.warc",
    "netpreserve-twitter.warc",
]
|
|
|
|
|
2020-07-22 13:22:36 -07:00
|
|
|
|
|
|
|
@pytest.fixture(params=WARCS)
def filename(request):
    """Provide one entry of ``WARCS`` per parametrized test invocation."""
    warc_name = request.param
    return warc_name
|
|
|
|
|
|
|
|
|
|
|
|
# ============================================================================
|
|
|
|
class TestWarc2Zim(object):
    """Conversion tests that run ``warc2zim`` inside a throw-away directory."""

    @classmethod
    def setup_class(cls):
        """Create a temp working directory, enter it, and locate the test data.

        Sets ``cls.root_dir`` (the resolved temp directory), ``cls.orig_cwd``
        (the working directory to restore afterwards) and ``cls.test_data_dir``
        (the ``data`` directory next to this file).
        """
        cls.root_dir = os.path.realpath(tempfile.mkdtemp())
        cls.orig_cwd = os.getcwd()
        os.chdir(cls.root_dir)

        cls.test_data_dir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "data"
        )

    @classmethod
    def teardown_class(cls):
        """Restore the original working directory and delete the temp tree."""
        # Leave the temp directory before removing it.
        os.chdir(cls.orig_cwd)
        shutil.rmtree(cls.root_dir)

    def verify_warc_and_zim(self, warcfile, zimfile):
        """Assert that every relevant WARC record is mirrored in the ZIM file.

        For each checked record, the ZIM article ``H/<url without scheme>`` is
        parsed back as a WARC record and its WARC and HTTP headers must equal
        the source record's. The ``A/<url without scheme>`` article must hold
        the record payload, or be absent for revisit records and for records
        whose HTTP ``Content-Length`` header is ``"0"``.

        Args:
            warcfile: Path of the source WARC file; must exist.
            zimfile: Path of the generated ZIM file; must exist.

        Raises:
            AssertionError: If either file is missing or any comparison fails.
        """
        assert os.path.isfile(warcfile)
        assert os.path.isfile(zimfile)

        # track to avoid checking duplicates, which are not written to ZIM
        warc_urls = set()

        zim_fh = libzim.reader.File(zimfile)

        with open(warcfile, "rb") as warc_fh:
            for record in ArchiveIterator(warc_fh):
                url = record.rec_headers["WARC-Target-URI"]
                if not url:
                    continue

                if url in warc_urls:
                    continue

                if record.rec_type not in ("response", "resource", "revisit"):
                    continue

                # ignore revisit records that are to the same url
                if (
                    record.rec_type == "revisit"
                    and record.rec_headers["WARC-Refers-To-Target-URI"] == url
                ):
                    continue

                # parse headers as record, ensure headers match
                # NOTE(review): with maxsplit=2 a later "//" in the URL cuts
                # the key short; kept as-is on the assumption that the
                # converter derives its keys the same way -- confirm.
                url_no_scheme = url.split("//", 2)[1]
                headers = zim_fh.get_article("H/" + url_no_scheme)
                parsed_record = next(
                    ArchiveIterator(BytesIO(headers.content.tobytes()))
                )

                assert record.rec_headers == parsed_record.rec_headers
                assert record.http_headers == parsed_record.http_headers

                # ensure payloads match
                try:
                    payload = zim_fh.get_article("A/" + url_no_scheme)
                except KeyError:
                    # No payload article was stored for this URL.
                    payload = None

                if record.rec_type == "revisit" or (
                    record.http_headers
                    and record.http_headers.get("Content-Length") == "0"
                ):
                    # Identity check: the article type may define its own
                    # equality, so "== None" is not a reliable test.
                    assert payload is None
                else:
                    assert payload.content.tobytes() == record.content_stream().read()

                warc_urls.add(url)

    def test_warc_to_zim_specify_output_and_viewer(self):
        """Convert one WARC, passing an output path (-n) and a viewer URL (-r)."""
        zim_output = os.path.join(self.root_dir, "zim-out-filename.zim")
        warc2zim(
            [
                "-v",
                os.path.join(self.test_data_dir, "example-response.warc"),
                "-n",
                zim_output,
                "-r",
                "https://cdn.jsdelivr.net/npm/@webrecorder/wabac@2.1.0-dev.3/dist/",
            ]
        )

        assert os.path.isfile(zim_output)

    def test_warc_to_zim(self, filename):
        """Convert each fixture WARC with default options and verify the ZIM."""
        warcfile = os.path.join(self.root_dir, filename)

        # copy test WARCs to test dir to test different output scenarios
        shutil.copy(os.path.join(self.test_data_dir, filename), warcfile)

        warc2zim([warcfile])

        # Only the last extension is replaced, so "x.warc.gz" -> "x.warc.zim".
        zimfile, _ = os.path.splitext(warcfile)
        zimfile += ".zim"

        self.verify_warc_and_zim(warcfile, zimfile)

    def test_error_bad_replay_viewer_url(self):
        """An invalid replay viewer URL must raise and leave no ZIM behind."""
        zim_output_not_created = os.path.join(self.root_dir, "zim-out-not-created.zim")
        with pytest.raises(Exception):
            warc2zim(
                [
                    "-v",
                    os.path.join(self.test_data_dir, "example-response.warc"),
                    "-r",
                    "x-invalid-x",
                    "-n",
                    zim_output_not_created,
                ]
            )

        # zim file should not have been created since replay viewer could not be loaded
        assert not os.path.isfile(zim_output_not_created)
|