mirror of
https://github.com/openzim/zimit.git
synced 2025-12-31 04:23:15 +00:00
174 lines
4.3 KiB
Python
Executable file
174 lines
4.3 KiB
Python
Executable file
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
# vim: ai ts=4 sts=4 et sw=4 nu
|
|
|
|
"""
|
|
Main zimit run script
|
|
This script validates arguments with warc2zim, checks permissions
|
|
and then calls the Node based driver
|
|
"""
|
|
|
|
from argparse import ArgumentParser
|
|
import os
|
|
import tempfile
|
|
import subprocess
|
|
import atexit
|
|
import shutil
|
|
import glob
|
|
import signal
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from warc2zim.main import warc2zim
|
|
|
|
|
|
def zimit(args=None):
    """Run a browser-based crawl and convert the captured WARCs to a ZIM.

    Options declared here configure the crawl; any option this parser does
    not recognise is collected separately and handed to warc2zim, together
    with ``--output`` and ``--url`` when those are set.

    warc2zim is first called with those options alone; any return value
    other than 100 is treated as invalid parameters.  The crawler is then
    run inside a fresh temporary directory created under the output
    directory, and warc2zim is called a second time with the crawl's
    archive directory appended to its arguments.

    :param args: list of command-line arguments, or None to let argparse
        read them from ``sys.argv``.
    :returns: 2 when the warc2zim parameter check fails, otherwise whatever
        the final warc2zim call returns.
    :raises subprocess.CalledProcessError: if the crawler exits non-zero.
    """
    cli = ArgumentParser(
        description="Run a browser-based crawl on the specified URL and convert to ZIM"
    )

    # Declaration order is kept stable because it drives the --help layout.
    cli.add_argument("-u", "--url", help="The URL to start crawling from")
    cli.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
    cli.add_argument(
        "--newContext",
        default="page",
        choices=["page", "session", "browser"],
        help="The context for each new capture, can be a new: page, session or browser.",
    )
    cli.add_argument(
        "--waitUntil",
        default="load",
        choices=["load", "domcontentloaded", "networkidle0", "networkidle2"],
        help="Puppeteer page.goto() condition to wait for before continuing",
    )
    cli.add_argument(
        "--limit", type=int, default=0, help="Limit crawl to this number of pages"
    )
    cli.add_argument(
        "--timeout",
        type=int,
        default=90,
        help="Timeout for each page to load (in seconds)",
    )
    cli.add_argument(
        "--scope",
        help="Regex of page URLs that should be included in the crawl (defaults to the immediate directory of the URL)",
    )
    cli.add_argument(
        "--exclude", help="Regex of page URLs that should be excluded from the crawl."
    )
    cli.add_argument(
        "--scroll",
        action="store_true",
        default=False,
        help="If set, will autoscroll to bottom of the page",
    )
    cli.add_argument(
        "--keep",
        action="store_true",
        help="If set, keep WARC files after crawl, don't delete",
    )
    cli.add_argument(
        "--output", default="/output", help="Output directory for ZIM and WARC files"
    )

    # Unrecognised options end up in `converter_args` and go to warc2zim.
    crawl_opts, converter_args = cli.parse_known_args(args)

    # warc2zim needs the output location and the start URL as well.
    if crawl_opts.output:
        converter_args.extend(["--output", crawl_opts.output])
    if crawl_opts.url:
        converter_args.extend(["--url", crawl_opts.url])

    print("----------")
    print("Testing warc2zim args")
    print("Running: warc2zim " + " ".join(converter_args))
    # No input has been appended yet; 100 is the only accepted answer here.
    if warc2zim(converter_args) != 100:
        print("Exiting, invalid warc2zim params")
        return 2

    # Per-crawl scratch directory, created inside the output directory.
    work_dir = Path(tempfile.mkdtemp(dir=crawl_opts.output, prefix=".tmp"))

    if not crawl_opts.keep:

        def remove_work_dir():
            print("")
            print("----------")
            print("Cleanup, removing temp dir: " + str(work_dir))
            shutil.rmtree(work_dir)

        # Deferred to interpreter exit so it also runs after sys.exit().
        atexit.register(remove_work_dir)

    crawl_cmd = get_node_cmd_line(crawl_opts) + ["--cwd", str(work_dir)]

    print("")
    print("----------")
    print("running browsertrix-crawler crawl: " + " ".join(crawl_cmd))
    subprocess.run(crawl_cmd, check=True)

    archive_dir = work_dir / "collections" / "capture" / "archive"
    converter_args.append(str(archive_dir))

    # Counts every entry of the archive directory, whatever its type.
    entry_count = len(list(archive_dir.iterdir()))

    print("")
    print("----------")
    print("Processing {0} WARC files to ZIM".format(entry_count))

    return warc2zim(converter_args)
|
|
|
|
def get_node_cmd_line(args):
    """Build the ``crawl`` command line from the parsed zimit options.

    Each forwarded option whose value is truthy is emitted as ``--<name>``.
    Non-boolean values are followed by their string form; booleans act as
    bare flags.  Falsy values (None, 0, False, empty string) are skipped.

    :param args: object exposing the forwarded option names as attributes
        (the namespace produced by the zimit argument parser).
    :returns: list of strings starting with ``"crawl"``.
    """
    forwarded = (
        "url",
        "workers",
        "newContext",
        "waitUntil",
        "limit",
        "timeout",
        "scope",
        "exclude",
        "scroll",
    )

    command = ["crawl"]
    for name in forwarded:
        setting = getattr(args, name)
        if not setting:
            continue
        command.append("--" + name)
        if isinstance(setting, bool):
            # store_true style switch: the flag alone carries the meaning
            continue
        command.append(str(setting))

    return command
|
|
|
|
|
|
def sigint_handler(*args):
    """Announce the interruption on stdout and exit with status 3.

    Accepts and ignores whatever positional arguments it is called with.
    """
    # Two blank lines on each side make the notice stand out in the output.
    notice = ("", "", "SIGINT/SIGTERM received, stopping zimit", "", "")
    for line in notice:
        print(line)
    sys.exit(3)
|
|
|
|
|
|
# Route both SIGINT and SIGTERM through sigint_handler.  This happens at
# import time, so it applies whether the module is run or imported.
signal.signal(signal.SIGINT, sigint_handler)
signal.signal(signal.SIGTERM, sigint_handler)


if __name__ == "__main__":
    # Use zimit()'s return value as the process exit status.  It was
    # previously discarded, so the script exited 0 even when zimit()
    # returned 2 for invalid warc2zim params or passed on a non-zero
    # result from the final warc2zim call.  A None return still exits 0.
    sys.exit(zimit())
|