#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu

"""
Main zimit run script

This script validates arguments with warc2zim, checks permissions
and then calls the Node-based driver
"""
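
# Typical invocation (a sketch only; the URL and stats path are placeholders, and
# warc2zim may require additional metadata flags, which are simply forwarded to it):
#
#   zimit --url https://www.example.com --statsFilename stats.json
#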
import atexit
import itertools
import json
import shutil
import signal
import subprocess
import sys
import tempfile
import urllib.parse
from argparse import ArgumentParser
from multiprocessing import Process
from pathlib import Path

import inotify
import inotify.adapters
import inotify.constants  # explicit import; IN_MODIFY is used below
import requests
from tld import get_fld
from warc2zim.main import main as warc2zim
from zimscraperlib.uri import rebuild_uri

DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"


class ProgressFileWatcher:
    def __init__(self, output_dir, stats_path):
        self.crawl_path = output_dir / "crawl.json"
        self.warc2zim_path = output_dir / "warc2zim.json"
        self.stats_path = Path(stats_path)

        if not self.stats_path.is_absolute():
            self.stats_path = output_dir / self.stats_path

        # touch them all so inotify is not unhappy on add_watch
        self.crawl_path.touch()
        self.warc2zim_path.touch()
        self.process = None

    def stop(self):
        self.process.join(0.1)
        self.process.terminate()

    def watch(self):
        self.process = Process(
            target=self.inotify_watcher,
            args=(str(self.crawl_path), str(self.warc2zim_path), str(self.stats_path)),
        )
        self.process.daemon = True
        self.process.start()

    @staticmethod
    def inotify_watcher(crawl_fpath, warc2zim_fpath, output_fpath):
        ino = inotify.adapters.Inotify()
        ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)
        ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)

        class Limit:
            def __init__(self):
                self.max = self.hit = None

            @property
            def as_dict(self):
                return {"max": self.max, "hit": self.hit}

        # limit is only reported by crawl but needs to be reported up
        limit = Limit()

        def crawl_conv(data, limit):
            # we consider crawl to be 90% of the workload,
            # so the overall total = crawl-reported total / 0.9
            # limit = {"max": data["limit"]["max"], "hit": data["limit"]["hit"]}
            limit.max = data["limit"]["max"]
            limit.hit = data["limit"]["hit"]
            return {
                "done": data["crawled"],
                "total": int(data["total"] / 0.9),
                "limit": limit.as_dict,
            }

        def warc2zim_conv(data, limit):
            # we consider warc2zim to be the last 10% of the workload,
            # so done = total * (0.9 + (written / total) * 0.1)
            return {
                "done": int(
                    data["total"]
                    * (0.9 + (float(data["written"]) / data["total"]) / 10)
                ),
                "total": data["total"],
                "limit": limit.as_dict,
            }
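
        # Worked example of the weighting above (illustrative figures only):
        #   crawl.json    {"crawled": 450, "total": 450, ...} -> {"done": 450, "total": 500, ...}
        #   warc2zim.json {"written": 250, "total": 500, ...} -> {"done": 475, "total": 500, ...}
        # i.e. the crawl advances combined progress up to 90%, warc2zim covers the last 10%.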

        for _, _, fpath, _ in ino.event_gen(yield_nones=False):
            func = {crawl_fpath: crawl_conv, warc2zim_fpath: warc2zim_conv}.get(fpath)
            if not func:
                continue
            # open input and output separately so as not to clear output on error
            with open(fpath, "r") as ifh:
                try:
                    out = func(json.load(ifh), limit)
                except Exception:  # nosec
                    # simply ignore progress update should an error arise
                    # might be malformed input for instance
                    continue
                if not out:
                    continue
                with open(output_fpath, "w") as ofh:
                    json.dump(out, ofh)


def zimit(args=None):
    wait_until_options = ["load", "domcontentloaded", "networkidle"]
    wait_until_all = wait_until_options + [
        f"{a},{b}" for a, b in itertools.combinations(wait_until_options, 2)
    ]
    parser = ArgumentParser(
        description="Run a browser-based crawl on the specified URL and convert to ZIM"
    )

    parser.add_argument("-u", "--url", help="The URL to start crawling from")
    parser.add_argument("--title", help="ZIM title")
    parser.add_argument("--description", help="ZIM description")
    parser.add_argument("--long-description", help="ZIM long description metadata")

    parser.add_argument(
        "--urlFile",
        help="If set, read a list of seed urls, one per line, from the specified file",
    )

    parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")

    parser.add_argument(
        "--waitUntil",
        help="Puppeteer page.goto() condition to wait for before continuing. One of "
        f"{wait_until_options} or a comma-separated combination of those.",
        choices=wait_until_all,
        default="load",
    )

    parser.add_argument(
        "--depth", help="The depth of the crawl for all seeds", type=int, default=-1
    )

    parser.add_argument(
        "--extraHops",
        help="Number of extra 'hops' to follow, beyond the current scope",
        type=int,
    )

    parser.add_argument("--limit", help="Limit crawl to this number of pages", type=int)

    parser.add_argument(
        "--maxPageLimit",
        help="Maximum pages to crawl, overriding pageLimit if both are set",
        type=int,
    )

    parser.add_argument(
        "--timeout",
        help="Timeout for each page to load (in seconds)",
        type=int,
        default=90,
    )

    parser.add_argument(
        "--scopeType",
        help="A predefined scope of the crawl. For more customization, "
        "use 'custom' and set scopeIncludeRx regexes",
        choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"],
    )

    parser.add_argument(
        "--include",
        help="Regex of page URLs that should be "
        "included in the crawl (defaults to "
        "the immediate directory of the URL)",
    )

    parser.add_argument(
        "--exclude",
        help="Regex of page URLs that should be excluded from the crawl",
    )

    parser.add_argument(
        "--collection",
        help="Collection name to crawl to (replay will be accessible "
        "under this name in pywb preview) instead of crawl-@ts",
    )

    parser.add_argument(
        "--allowHashUrls",
        help="Allow hashtag URLs, useful for "
        "single-page-application crawling or "
        "when different hashtags load dynamic "
        "content",
        action="store_true",
    )

    parser.add_argument(
        "--lang",
        help="If set, sets the language used by the browser; "
        "should be an ISO 639 language[-country] code",
    )

    parser.add_argument(
        "--zim-lang",
        help="Language metadata of ZIM "
        "(warc2zim --lang param). ISO-639-3 code. "
        "Retrieved from homepage if found, fallback to `eng`",
    )

    parser.add_argument(
        "--mobileDevice",
        help="Emulate mobile device by name from "
        "https://github.com/puppeteer/puppeteer/blob/"
        "main/packages/puppeteer-core/src/common/Device.ts",
    )

    parser.add_argument(
        "--userAgent",
        help="Override default user-agent with specified value; "
        "--userAgentSuffix is still applied",
        default=DEFAULT_USER_AGENT,
    )

    parser.add_argument(
        "--userAgentSuffix",
        help="Append suffix to existing browser user-agent "
        "(ex: +MyCrawler, info@example.com)",
        default="+Zimit",
    )

    parser.add_argument(
        "--useSitemap",
        help="If set, use the URL as sitemap to get additional URLs for the crawl "
        "(usually /sitemap.xml)",
    )

    parser.add_argument(
        "--behaviors",
        help="Which background behaviors to enable on each page",
        default="autoplay,autofetch,siteSpecific",
    )

    parser.add_argument(
        "--behaviorTimeout",
        help="If >0, timeout (in seconds) for in-page behaviors run on each page. "
        "If 0, a behavior can run until it finishes",
        type=int,
        default=90,
    )

    parser.add_argument(
        "--delay",
        help="If >0, amount of time to sleep (in seconds) after behaviors "
        "before moving on to next page",
        type=int,
    )

    parser.add_argument(
        "--profile",
        help="Path to tar.gz file which will be extracted "
        "and used as the browser profile",
    )

    parser.add_argument(
        "--sizeLimit",
        help="If set, save state and exit if size limit exceeds this value",
        type=int,
    )

    parser.add_argument(
        "--diskUtilization",
        help="If set, save state and exit if disk utilization "
        "exceeds this percentage value",
        type=int,
        default=90,
    )

    parser.add_argument(
        "--timeLimit",
        help="If set, save state and exit after time limit, in seconds",
        type=int,
    )

    parser.add_argument(
        "--healthCheckPort",
        help="Port to run healthcheck on",
        type=int,
    )

    parser.add_argument(
        "--overwrite",
        help="Overwrite current crawl data: if set, existing collection directory "
        "will be deleted before crawl is started",
        action="store_true",
        default=False,
    )

    parser.add_argument(
        "--keep",
        help="If set, keep WARC files after crawl, don't delete",
        action="store_true",
    )

    parser.add_argument("--output", help="Output directory for ZIM", default="/output")

    parser.add_argument(
        "--build",
        help="Build directory for WARC files (if not set, output directory is used)",
    )

    parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")

    parser.add_argument(
        "--custom-css",
        help="[warc2zim] Custom CSS file URL/path to inject into all articles",
    )

    parser.add_argument(
        "--statsFilename",
        help="If set, output stats as JSON to this file",
    )

    parser.add_argument(
        "--config",
        help="Path to YAML config file. If set, browsertrix-crawler will use this file "
        "to configure the crawling behaviour if not set via argument.",
    )

    zimit_args, warc2zim_args = parser.parse_known_args(args)
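
    # Any option not declared above stays in warc2zim_args and is handed to warc2zim
    # untouched. For instance (hypothetical flags, assuming they are valid options of
    # your warc2zim version):
    #   zimit --url https://www.example.com --name example
    # leaves ["--name", "example"] in warc2zim_args for the validation call below.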

    # pass url and output to warc2zim also
    if zimit_args.output:
        warc2zim_args.append("--output")
        warc2zim_args.append(zimit_args.output)

    url = zimit_args.url

    user_agent = zimit_args.userAgent
    if zimit_args.userAgentSuffix:
        user_agent += f" {zimit_args.userAgentSuffix}"
    if zimit_args.adminEmail:
        user_agent += f" {zimit_args.adminEmail}"
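
    # With the default suffix and an --adminEmail set, the resulting user-agent looks
    # like (illustrative, placeholder address):
    #   "<DEFAULT_USER_AGENT> +Zimit admin@example.com"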

    if url:
        url = check_url(url, user_agent, zimit_args.scopeType)
        warc2zim_args.append("--url")
        warc2zim_args.append(url)

    if zimit_args.custom_css:
        warc2zim_args += ["--custom-css", zimit_args.custom_css]

    if zimit_args.title:
        warc2zim_args.append("--title")
        warc2zim_args.append(zimit_args.title)

    if zimit_args.description:
        warc2zim_args.append("--description")
        warc2zim_args.append(zimit_args.description)

    if zimit_args.long_description:
        warc2zim_args.append("--long-description")
        warc2zim_args.append(zimit_args.long_description)

    if zimit_args.zim_lang:
        warc2zim_args.append("--lang")
        warc2zim_args.append(zimit_args.zim_lang)

    print("----------")
    print("Testing warc2zim args")
    print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
    res = warc2zim(warc2zim_args)
    if res != 100:
        print("Exiting, invalid warc2zim params")
        return 2

    # make temp dir for this crawl
    if zimit_args.build:
        temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp"))
    else:
        temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))

    if not zimit_args.keep:

        def cleanup():
            print("")
            print("----------")
            print(f"Cleanup, removing temp dir: {temp_root_dir}", flush=True)
            shutil.rmtree(temp_root_dir)

        atexit.register(cleanup)

    cmd_args = get_node_cmd_line(zimit_args)
    if url:
        cmd_args.append("--url")
        cmd_args.append(url)

    cmd_args.append("--userAgent")
    cmd_args.append(user_agent)

    cmd_args.append("--cwd")
    cmd_args.append(str(temp_root_dir))

    # setup inotify crawler progress watcher
    if zimit_args.statsFilename:
        watcher = ProgressFileWatcher(
            Path(zimit_args.output), Path(zimit_args.statsFilename)
        )
        print(f"Writing progress to {watcher.stats_path}")
        # update crawler command
        cmd_args.append("--statsFilename")
        cmd_args.append(str(watcher.crawl_path))
        # update warc2zim command
        warc2zim_args.append("-v")
        warc2zim_args.append("--progress-file")
        warc2zim_args.append(str(watcher.warc2zim_path))
        watcher.watch()
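
    # The watcher merges both progress files into the single stats file as JSON of
    # the form {"done": <int>, "total": <int>, "limit": {"max": ..., "hit": ...}}
    # (shape taken from crawl_conv/warc2zim_conv above).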

    cmd_line = " ".join(cmd_args)

    print("")
    print("----------")
    print(
        f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
    )
    print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
    crawl = subprocess.run(cmd_args)
    if crawl.returncode == 11:
        print("crawl interrupted by a limit")
    elif crawl.returncode != 0:
        raise subprocess.CalledProcessError(crawl.returncode, cmd_args)

    if zimit_args.collection:
        warc_directory = temp_root_dir.joinpath(
            f"collections/{zimit_args.collection}/archive/"
        )
    else:
        warc_dirs = list(temp_root_dir.rglob("collections/crawl-*/archive/"))
        if len(warc_dirs) == 0:
            raise RuntimeError(
                "Failed to find directory where WARC files have been created"
            )
        elif len(warc_dirs) > 1:
            print("Found multiple WARC directories; only the last one will be used")
            for directory in warc_dirs:
                print(f"- {directory}")
        warc_directory = warc_dirs[-1]

    print("")
    print("----------")
    print(f"Processing WARC files in {warc_directory}")
    warc2zim_args.append(str(warc_directory))

    num_files = sum(1 for _ in warc_directory.iterdir())
    print(f"{num_files} WARC files found", flush=True)
    print(f"Calling warc2zim with these args: {warc2zim_args}", flush=True)

    return warc2zim(warc2zim_args)


def check_url(url, user_agent, scope=None):
    url = urllib.parse.urlparse(url)
    try:
        with requests.get(
            url.geturl(),
            stream=True,
            allow_redirects=True,
            timeout=(12.2, 27),
            headers={"User-Agent": user_agent},
        ) as resp:
            resp.raise_for_status()
    except requests.exceptions.RequestException as exc:
        print(f"failed to connect to {url.geturl()}: {exc}", flush=True)
        raise SystemExit(1)
    actual_url = urllib.parse.urlparse(resp.url)

    # remove explicit default-for-scheme port from the URI, as browsers do
    if actual_url.scheme == "https" and actual_url.port == 443:
        actual_url = rebuild_uri(actual_url, port="")
    if actual_url.scheme == "http" and actual_url.port == 80:
        actual_url = rebuild_uri(actual_url, port="")

    if actual_url.geturl() != url.geturl():
        if scope in (None, "any"):
            return actual_url.geturl()

        print(
            "[WARN] Your URL ({0}) redirects to {1} which {2} on the same "
            "first-level domain. Depending on your scopeType ({3}), "
            "your homepage might be out-of-scope. Please check!".format(
                url.geturl(),
                actual_url.geturl(),
                "is"
                if get_fld(url.geturl()) == get_fld(actual_url.geturl())
                else "is not",
                scope,
            )
        )

        return actual_url.geturl()

    return url.geturl()
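

# Note on check_url() above (illustrative scenario, assuming such a redirect exists):
# a seed like "http://example.com/" redirecting to "https://www.example.com:443/" is
# followed and normalized to "https://www.example.com/" (default port dropped), and
# that final URL is what seeds both browsertrix-crawler and warc2zim; the warning is
# only printed when a scopeType other than "any" is set and the URL changed.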


def get_node_cmd_line(args):
    node_cmd = ["crawl", "--failOnFailedSeed"]
    for arg in [
        "workers",
        "waitUntil",
        "urlFile",
        "title",
        "description",
        "depth",
        "extraHops",
        "limit",
        "maxPageLimit",
        "timeout",
        "scopeType",
        "include",
        "exclude",
        "collection",
        "allowHashUrls",
        "lang",
        "mobileDevice",
        "useSitemap",
        "behaviors",
        "behaviorTimeout",
        "delay",
        "profile",
        "sizeLimit",
        "diskUtilization",
        "timeLimit",
        "healthCheckPort",
        "overwrite",
        "config",
    ]:
        value = getattr(args, arg)
        if value is None or (isinstance(value, bool) and not value):
            continue
        node_cmd.append("--" + arg)
        if not isinstance(value, bool):
            node_cmd.append(str(value))

    return node_cmd
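
# For example (illustrative, assuming all other attributes are None or False), a parsed
# Namespace with workers=2, allowHashUrls=True and behaviors="autoplay" maps to:
#   ["crawl", "--failOnFailedSeed", "--workers", "2", "--allowHashUrls",
#    "--behaviors", "autoplay"]
# boolean flags are emitted bare, None/False values are skipped, and --url/--userAgent
# are appended separately by zimit() rather than forwarded here.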


def sigint_handler(*args):
    print("")
    print("")
    print("SIGINT/SIGTERM received, stopping zimit")
    print("")
    print("", flush=True)
    sys.exit(3)


signal.signal(signal.SIGINT, sigint_handler)
signal.signal(signal.SIGTERM, sigint_handler)

if __name__ == "__main__":
    zimit()