From 101fb71a0bdd0cc308de060f59c83a32ad5cdbaf Mon Sep 17 00:00:00 2001
From: benoit74 <benoit74@users.noreply.github.com>
Date: Tue, 11 Feb 2025 16:57:19 +0000
Subject: [PATCH] Better processing of crawler exit codes with soft/hard limits

---
 .github/workflows/Tests.yaml     |  22 ++++--
 CHANGELOG.md                     |   8 +++
 src/zimit/constants.py           |   3 +-
 src/zimit/zimit.py               | 115 +++++++++++++++++++++----------
 tests-integration/integration.py |  46 +++++++++++--
 5 files changed, 149 insertions(+), 45 deletions(-)

diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml
index 592a5aa..601f9ba 100644
--- a/.github/workflows/Tests.yaml
+++ b/.github/workflows/Tests.yaml
@@ -57,13 +57,25 @@ jobs:
         uses: actions/checkout@v4
 
       - name: build image
-        run: docker build -t zimit .
+        run: docker build -t local-zimit .
 
       - name: ensure help display without issue
-        run: docker run -v $PWD/output:/output zimit zimit --help
+        run: docker run -v $PWD/output:/output local-zimit zimit --help
 
-      - name: run crawl
-        run: docker run -v $PWD/output:/output zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
+      - name: run crawl with soft size limit
+        run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizesoftlimit.json
+
+      - name: run crawl with hard size limit
+        run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizehardlimit.json || true
+
+      - name: run crawl with soft time limit
+        run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timesoftlimit.json
+
+      - name: run crawl with hard time limit
+        run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timehardlimit.json || true
+
+      - name: run standard crawl
+        run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
 
       - name: run integration test suite
-        run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
+        run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6387cbd..f6d7044 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+- Change solution to report partial ZIM to the Zimfarm and other clients (#304)
+
+### Fixed
+
+- Do not create the ZIM when crawl is incomplete (#444)
+
 ## [2.1.8] - 2024-02-07
 
 ### Changed
diff --git a/src/zimit/constants.py b/src/zimit/constants.py
index f81905a..35baeb9 100644
--- a/src/zimit/constants.py
+++ b/src/zimit/constants.py
@@ -3,7 +3,8 @@ import logging
 from zimscraperlib.logging import getLogger
 
 EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
-EXIT_CODE_CRAWLER_LIMIT_HIT = 11
+EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT = 14
+EXIT_CODE_CRAWLER_TIME_LIMIT_HIT = 15
 NORMAL_WARC2ZIM_EXIT_CODE = 100
 REQUESTS_TIMEOUT = 10
 
diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index ec989f1..416bec9 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -25,7 +25,8 @@ from zimscraperlib.uri import rebuild_uri
 
 from zimit.__about__ import __version__
 from zimit.constants import (
-    EXIT_CODE_CRAWLER_LIMIT_HIT,
+    EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT,
+    EXIT_CODE_CRAWLER_TIME_LIMIT_HIT,
     EXIT_CODE_WARC2ZIM_CHECK_FAILED,
     NORMAL_WARC2ZIM_EXIT_CODE,
     logger,
@@ -61,35 +62,19 @@ class ProgressFileWatcher:
         self.process.daemon = True
         self.process.start()
 
-    @staticmethod
-    def inotify_watcher(crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
+    def inotify_watcher(self, crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
         ino = inotify.adapters.Inotify()
         ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)  # pyright: ignore
         ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)  # pyright: ignore
 
-        class Limit:
-            def __init__(self):
-                self.max = self.hit = None
-
-            @property
-            def as_dict(self):
-                return {"max": self.max, "hit": self.hit}
-
-        # limit is only reported by crawl but needs to be reported up
-        limit = Limit()
-
-        def crawl_conv(data, limit):
+        def crawl_conv(data):
             # we consider crawl to be 90% of the workload so total = craw_total * 90%
-            # limit = {"max": data["limit"]["max"], "hit": data["limit"]["hit"]}
-            limit.max = data["limit"]["max"]
-            limit.hit = data["limit"]["hit"]
             return {
                 "done": data["crawled"],
                 "total": int(data["total"] / 0.9),
-                "limit": limit.as_dict,
             }
 
-        def warc2zim_conv(data, limit):
+        def warc2zim_conv(data):
             # we consider warc2zim to be 10% of the workload so
             # warc2zim_total = 10% and  total = 90 + warc2zim_total * 10%
             return {
@@ -98,7 +83,6 @@ class ProgressFileWatcher:
                     * (0.9 + (float(data["written"]) / data["total"]) / 10)
                 ),
                 "total": data["total"],
-                "limit": limit.as_dict,
             }
 
         for _, _, fpath, _ in ino.event_gen(yield_nones=False):  # pyright: ignore
@@ -108,7 +92,7 @@ class ProgressFileWatcher:
             # open input and output separatly as to not clear output on error
             with open(fpath) as ifh:
                 try:
-                    out = func(json.load(ifh), limit)
+                    out = func(json.load(ifh))
                 except Exception:  # nosec # noqa: S112
                     # simply ignore progress update should an error arise
                     # might be malformed input for instance
@@ -278,9 +262,17 @@ def run(raw_args):
         "directory",
     )
 
-    parser.add_argument(
-        "--sizeLimit",
-        help="If set, save state and exit if size limit exceeds this value",
+    size_group = parser.add_mutually_exclusive_group()
+    size_group.add_argument(
+        "--sizeSoftLimit",
+        help="If set, save crawl state and stop crawl if WARC size exceeds this value. "
+        "ZIM will still be created.",
+        type=int,
+    )
+    size_group.add_argument(
+        "--sizeHardLimit",
+        help="If set, exit crawler and fail the scraper immediately if WARC size "
+        "exceeds this value",
         type=int,
     )
 
@@ -292,9 +284,17 @@ def run(raw_args):
         default=90,
     )
 
-    parser.add_argument(
-        "--timeLimit",
-        help="If set, save state and exit after time limit, in seconds",
+    time_group = parser.add_mutually_exclusive_group()
+    time_group.add_argument(
+        "--timeSoftLimit",
+        help="If set, save crawl state and stop crawl if WARC WARC(s) creation takes "
+        "longer than this value, in seconds. ZIM will still be created.",
+        type=int,
+    )
+    time_group.add_argument(
+        "--timeHardLimit",
+        help="If set, exit crawler and fail the scraper immediately if WARC(s) creation"
+        " takes longer than this value, in seconds",
         type=int,
     )
 
@@ -369,6 +369,13 @@ def run(raw_args):
         "path/URLs separated by comma",
     )
 
+    parser.add_argument(
+        "--acceptable-crawler-exit-codes",
+        help="Non-zero crawler exit codes to consider as acceptable to continue with "
+        " conversion of WARC to ZIM. Flag partialZim will be set in statsFilename (if "
+        " used). Single value with individual error codes separated by comma",
+    )
+
     zimit_args, warc2zim_args = parser.parse_known_args(raw_args)
 
     # pass a scraper suffix to warc2zim so that both zimit and warc2zim versions are
@@ -504,6 +511,8 @@ def run(raw_args):
         f"{'will keep' if zimit_args.keep else 'will delete'}"
     )
 
+    partial_zim = False
+
     # if warc files are passed, do not run browsertrix crawler but fetch the files if
     # they are provided as an HTTP URL + extract the archive if it is a tar.gz
     warc_files: list[Path] = []
@@ -568,10 +577,29 @@ def run(raw_args):
 
         logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
         crawl = subprocess.run(cmd_args, check=False)
-        if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT:
-            logger.info("crawl interupted by a limit")
+        if (
+            crawl.returncode == EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT
+            and zimit_args.sizeSoftLimit
+        ):
+            logger.info(
+                "Crawl size soft limit hit. Continuing with warc2zim conversion."
+            )
+            if zimit_args.statsFilename:
+                partial_zim = True
+        elif (
+            crawl.returncode == EXIT_CODE_CRAWLER_TIME_LIMIT_HIT
+            and zimit_args.timeSoftLimit
+        ):
+            logger.info(
+                "Crawl time soft limit hit. Continuing with warc2zim conversion."
+            )
+            if zimit_args.statsFilename:
+                partial_zim = True
         elif crawl.returncode != 0:
-            raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
+            logger.error(
+                f"Crawl returned an error: {crawl.returncode}, scraper exiting"
+            )
+            return crawl.returncode
 
         if zimit_args.collection:
             warc_files = [
@@ -606,7 +634,15 @@ def run(raw_args):
 
     logger.info(f"Calling warc2zim with these args: {warc2zim_args}")
 
-    return warc2zim(warc2zim_args)
+    warc2zim_exit_code = warc2zim(warc2zim_args)
+
+    if zimit_args.statsFilename:
+        stats = Path(zimit_args.statsFilename)
+        stats_content = json.loads(stats.read_bytes())
+        stats_content["partialZim"] = partial_zim
+        stats.write_text(json.dumps(stats_content))
+
+    return warc2zim_exit_code
 
 
 def get_cleaned_url(url: str):
@@ -646,9 +682,11 @@ def get_node_cmd_line(args):
         "behaviorTimeout",
         "delay",
         "profile",
-        "sizeLimit",
+        "sizeSoftLimit",
+        "sizeHardLimit",
         "diskUtilization",
-        "timeLimit",
+        "timeSoftLimit",
+        "timeHardLimit",
         "healthCheckPort",
         "overwrite",
         "config",
@@ -668,7 +706,14 @@ def get_node_cmd_line(args):
                 continue
         if value is None or (isinstance(value, bool) and value is False):
             continue
-        node_cmd.append("--" + arg)
+        node_cmd.append(
+            "--"
+            + (
+                "sizeLimit"
+                if arg in ["sizeSoftLimit", "sizeHardLimit"]
+                else "timeLimit" if arg in ["timeSoftLimit", "timeHardLimit"] else arg
+            )
+        )
         if not isinstance(value, bool):
             node_cmd.append(str(value))
 
diff --git a/tests-integration/integration.py b/tests-integration/integration.py
index d9bfc94..7e79f52 100644
--- a/tests-integration/integration.py
+++ b/tests-integration/integration.py
@@ -3,13 +3,34 @@ import json
 import os
 from pathlib import Path
 
+import pytest
 from warcio import ArchiveIterator
 from zimscraperlib.zim import Archive
 
 
-def test_is_file():
+@pytest.mark.parametrize(
+    "filename",
+    [
+        pytest.param("/output/tests_en_onepage.zim", id="onepage"),
+        pytest.param("/output/tests_en_sizesoftlimit.zim", id="sizesoftlimit"),
+        pytest.param("/output/tests_en_timesoftlimit.zim", id="timesoftlimit"),
+    ],
+)
+def test_zim_created(filename):
     """Ensure ZIM file exists"""
-    assert os.path.isfile("/output/tests_en_onepage.zim")
+    assert os.path.isfile(filename)
+
+
+@pytest.mark.parametrize(
+    "filename",
+    [
+        pytest.param("/output/tests_en_sizehardlimit.zim", id="sizehardlimit"),
+        pytest.param("/output/tests_en_timehardlimit.zim", id="timehardlimit"),
+    ],
+)
+def test_zim_not_created(filename):
+    """Ensure ZIM file does not exists"""
+    assert not os.path.exists(filename)
 
 
 def test_zim_main_page():
@@ -85,7 +106,7 @@ def test_user_agent():
     assert found
 
 
-def test_stats_output():
+def test_stats_output_standard():
     assert json.loads(Path("/output/crawl.json").read_bytes()) == {
         "crawled": 17,
         "pending": 0,
@@ -103,5 +124,22 @@ def test_stats_output():
     assert json.loads(Path("/output/stats.json").read_bytes()) == {
         "done": 8,
         "total": 8,
-        "limit": {"max": 0, "hit": False},
+        "partialZim": False,
     }
+
+
+@pytest.mark.parametrize(
+    "filename",
+    [
+        pytest.param("/output/stats_sizesoftlimit.json", id="sizesoftlimit"),
+        pytest.param("/output/stats_timesoftlimit.json", id="timesoftlimit"),
+    ],
+)
+def test_stats_output_softlimit(filename):
+    file = Path(filename)
+    assert file.exists
+    content = json.loads(file.read_bytes())
+    assert "done" in content
+    assert "total" in content
+    assert "partialZim" in content
+    assert content["partialZim"]