zimit/test/integration.py
renaud gaudin 85fad62b61 Updated test to new stats files
- verify output of crawl, warc2zim and zimit file
- using a simpler tag for CI test image as to not confuse it with public image
2020-12-10 10:44:49 +00:00

63 lines
1.7 KiB
Python

import os
import glob
import json
import libzim.reader
from warcio import ArchiveIterator
def get_zim_article(zimfile, path):
zim_fh = libzim.reader.File(zimfile)
return zim_fh.get_article(path).content.tobytes()
def test_is_file():
""" Ensure ZIM file exists"""
assert os.path.isfile("/output/isago.zim")
def test_zim_main_page():
"""Main page specified, http://isago.ml/, was a redirect to https
Ensure main page is the redirected page"""
assert b'"https://isago.ml/"' in get_zim_article(
"/output/isago.zim", "A/index.html"
)
def test_user_agent():
""" Test that mobile user agent was used in WARC request records with custom Zimit and email suffix"""
found = False
for warc in glob.glob("/output/.tmp*/collections/capture/archive/*.warc.gz"):
with open(warc, "rb") as fh:
for record in ArchiveIterator(fh):
if record.rec_type == "request":
print(record.http_headers)
ua = record.http_headers.get_header("User-Agent")
if ua:
assert "iPhone" in ua
assert ua.endswith(" +Zimit test@example.com")
found = True
# should find at least one
assert found
def test_stats_output():
with open("/output/crawl.json") as fh:
assert json.loads(fh.read()) == {
"numCrawled": 5,
"workersRunning": 0,
"total": 5,
}
with open("/output/warc2zim.json") as fh:
assert json.loads(fh.read()) == {
"written": 7,
"total": 7,
}
with open("/output/stats.json") as fh:
assert json.loads(fh.read()) == {
"done": 7,
"total": 7,
}