mirror of
https://github.com/openzim/zimit.git
synced 2025-12-31 04:23:15 +00:00
- verify output of crawl, warc2zim and zimit file - using a simpler tag for CI test image as to not confuse it with public image
63 lines
1.7 KiB
Python
63 lines
1.7 KiB
Python
import os
|
|
import glob
|
|
import json
|
|
|
|
import libzim.reader
|
|
from warcio import ArchiveIterator
|
|
|
|
|
|
def get_zim_article(zimfile, path):
    """Return the content of one article from a ZIM file as bytes.

    Opens ``zimfile`` with ``libzim.reader.File``, fetches the article
    stored at ``path`` and returns the result of calling ``tobytes()`` on
    its ``content``.
    """
    archive = libzim.reader.File(zimfile)
    article = archive.get_article(path)
    return article.content.tobytes()
|
|
|
|
|
|
def test_is_file():
    """Check that the expected ZIM file was written to the output folder."""
    zim_path = "/output/isago.zim"
    assert os.path.isfile(zim_path)
|
|
|
|
|
|
def test_zim_main_page():
    """Check that the ZIM main page is the redirected page.

    The main page that was specified, http://isago.ml/, redirected to
    https, so the stored index page must contain the https URL.
    """
    index_page = get_zim_article("/output/isago.zim", "A/index.html")
    assert b'"https://isago.ml/"' in index_page
|
|
|
|
|
|
def test_user_agent():
    """Check the User-Agent sent in the WARC request records.

    Every request record carrying a User-Agent header must contain the
    mobile ("iPhone") marker and end with the custom Zimit + email suffix.
    At least one such record has to exist across the captured WARC files.
    """
    warc_pattern = "/output/.tmp*/collections/capture/archive/*.warc.gz"
    matched = False

    for warc_path in glob.glob(warc_pattern):
        with open(warc_path, "rb") as stream:
            for rec in ArchiveIterator(stream):
                # only request records carry the crawler's outgoing headers
                if rec.rec_type != "request":
                    continue

                headers = rec.http_headers
                print(headers)

                user_agent = headers.get_header("User-Agent")
                if not user_agent:
                    continue

                assert "iPhone" in user_agent
                assert user_agent.endswith(" +Zimit test@example.com")
                matched = True

    # an empty capture must not pass silently: require at least one match
    assert matched
|
|
|
|
|
|
def test_stats_output():
    """Check the JSON stats files written by the crawl, warc2zim and zimit.

    Each file is opened in turn, parsed as JSON and compared for exact
    equality with the expected counters.
    """
    expected_by_file = (
        (
            "/output/crawl.json",
            {"numCrawled": 5, "workersRunning": 0, "total": 5},
        ),
        (
            "/output/warc2zim.json",
            {"written": 7, "total": 7},
        ),
        (
            "/output/stats.json",
            {"done": 7, "total": 7},
        ),
    )

    for stats_path, expected in expected_by_file:
        with open(stats_path) as stats_file:
            assert json.load(stats_file) == expected
|