From 15b72022cebf42143dda2eb32b6508b4576bcc5f Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 7 Nov 2024 10:03:03 +0000 Subject: [PATCH 01/65] Prepare for 2.1.7 --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- src/zimit/__about__.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6beb584..cdd3fd2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). +## [Unreleased] + ## [2.1.6] - 2024-11-07 ### Changed diff --git a/pyproject.toml b/pyproject.toml index b213161..ffedf8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim==2.1.3", + "warc2zim @ git+https://github.com/openzim/warc2zim@main", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index edc60b3..63e60b5 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "2.1.6" +__version__ = "2.1.7-dev0" From bfa226bf81da7c5c7c1a624f22f3063abd5058c9 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 8 Nov 2024 14:22:35 +0000 Subject: [PATCH 02/65] Properly exit with code --- src/zimit/zimit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 44c6d4f..ec989f1 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -685,7 +685,7 @@ def sigint_handler(*args): # noqa: ARG001 def zimit(): - run(sys.argv[1:]) + sys.exit(run(sys.argv[1:])) signal.signal(signal.SIGINT, sigint_handler) From 16a4f8d4d830932632999109a9965c9c6f108277 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 15 Nov 2024 15:46:40 +0000 Subject: [PATCH 03/65] Upgrade to browsertrix crawler 1.4.0-beta.0 --- CHANGELOG.md | 4 ++++ Dockerfile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cdd3fd2..20dce9f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Upgrade to browsertrix crawler 1.4.0-beta.0 (#434) + ## [2.1.6] - 2024-11-07 ### Changed diff --git a/Dockerfile b/Dockerfile index bac1b30..67e0e18 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.3.5 +FROM webrecorder/browsertrix-crawler:1.4.0-beta.0 LABEL org.opencontainers.image.source https://github.com/openzim/zimit RUN apt-get update \ From 00d2433383c32e539a84ac36a19746d6aaf9c6a0 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 9 Jan 2025 09:06:08 +0000 Subject: [PATCH 04/65] Upgrade to browsertrix crawler 1.4.2 --- CHANGELOG.md | 2 +- Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 20dce9f..0711aea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Upgrade to browsertrix crawler 1.4.0-beta.0 (#434) +- Upgrade to browsertrix crawler 1.4.2 (#450) ## [2.1.6] - 2024-11-07 diff --git a/Dockerfile b/Dockerfile index 67e0e18..9b304d0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.4.0-beta.0 
+FROM webrecorder/browsertrix-crawler:1.4.2 LABEL org.opencontainers.image.source https://github.com/openzim/zimit RUN apt-get update \ From 8d42a8dd93ba12aeb75d6294ff9d55ed51e6de5a Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 9 Jan 2025 10:41:05 +0000 Subject: [PATCH 05/65] Move integration tests to test website --- .github/workflows/Tests.yaml | 2 +- tests-integration/integration.py | 83 ++++++++++++++++++-------------- 2 files changed, 49 insertions(+), 36 deletions(-) diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml index 9e21fa7..592a5aa 100644 --- a/.github/workflows/Tests.yaml +++ b/.github/workflows/Tests.yaml @@ -63,7 +63,7 @@ jobs: run: docker run -v $PWD/output:/output zimit zimit --help - name: run crawl - run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep + run: docker run -v $PWD/output:/output zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep - name: run integration test suite run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 16ab337..9d37b0f 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -1,6 +1,7 @@ import glob import json import os +from pathlib import Path from warcio import ArchiveIterator from zimscraperlib.zim import Archive @@ -8,23 +9,26 @@ from zimscraperlib.zim import Archive def test_is_file(): """Ensure ZIM file exists""" - assert os.path.isfile("/output/isago.zim") + assert os.path.isfile("/output/tests_en_onepage.zim") def test_zim_main_page(): - """Main page specified, http://isago.rskg.org/, was a redirect to https + """Main page specified, http://website.test.openzim.org/http-return-codes.html, + was a redirect to https Ensure main page is the redirected page""" - main_entry = Archive("/output/isago.zim").main_entry + main_entry = Archive("/output/tests_en_onepage.zim").main_entry assert main_entry.is_redirect - assert main_entry.get_redirect_entry().path == "isago.rskg.org/" + assert ( + main_entry.get_redirect_entry().path + == "website.test.openzim.org/http-return-codes.html" + ) def test_zim_scraper(): - """Main page specified, http://isago.rskg.org/, was a redirect to https - Ensure main page is the redirected page""" + """Check content of scraper metadata""" - zim_fh = Archive("/output/isago.zim") + zim_fh = Archive("/output/tests_en_onepage.zim") scraper = zim_fh.get_text_metadata("Scraper") assert "zimit " in scraper assert "warc2zim " in scraper @@ -33,18 +37,28 @@ def test_zim_scraper(): def test_files_list(): """Check that expected files are present in the ZIM at proper path""" - zim_fh = Archive("/output/isago.zim") + zim_fh = Archive("/output/tests_en_onepage.zim") for expected_entry in [ "_zim_static/__wb_module_decl.js", "_zim_static/wombat.js", "_zim_static/wombatSetup.js", - "isago.rskg.org/", - "isago.rskg.org/a-propos", - "isago.rskg.org/conseils", - "isago.rskg.org/faq", - "isago.rskg.org/static/favicon256.png", - "isago.rskg.org/static/tarifs-isago.pdf", - 
"maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css", + "website.test.openzim.org/http-return-codes.html", + "website.test.openzim.org/200-response", + "website.test.openzim.org/201-response", + "website.test.openzim.org/202-response", + "website.test.openzim.org/301-external-redirect-ok", + "website.test.openzim.org/301-internal-redirect-ok", + "website.test.openzim.org/302-external-redirect-ok", + "website.test.openzim.org/302-internal-redirect-ok", + "website.test.openzim.org/307-external-redirect-ok", + "website.test.openzim.org/307-internal-redirect-ok", + "website.test.openzim.org/308-external-redirect-ok", + "website.test.openzim.org/308-internal-redirect-ok", + "website.test.openzim.org/http-return-codes.html", + "website.test.openzim.org/icons/favicon.ico", + "website.test.openzim.org/icons/site.webmanifest", + "website.test.openzim.org/internal_redirect_target.html", + "www.example.com/", ]: assert zim_fh.get_content(expected_entry) @@ -72,23 +86,22 @@ def test_user_agent(): def test_stats_output(): - with open("/output/crawl.json") as fh: - assert json.loads(fh.read()) == { - "crawled": 5, - "pending": 0, - "pendingPages": [], - "total": 5, - "failed": 0, - "limit": {"max": 0, "hit": False}, - } - with open("/output/warc2zim.json") as fh: - assert json.loads(fh.read()) == { - "written": 7, - "total": 7, - } - with open("/output/stats.json") as fh: - assert json.loads(fh.read()) == { - "done": 7, - "total": 7, - "limit": {"max": 0, "hit": False}, - } + assert json.loads(Path("/output/crawl.json").read_bytes()) == { + "crawled": 35, + "pending": 0, + "pendingPages": [], + "total": 35, + "failed": 18, + "limit": {"max": 0, "hit": False}, + } + + assert json.loads(Path("/output/warc2zim.json").read_bytes()) == { + "written": 8, + "total": 8, + } + + assert json.loads(Path("/output/stats.json").read_bytes()) == { + "done": 8, + "total": 8, + "limit": {"max": 0, "hit": False}, + } From 97ea6dfd7b8d74026e859a49c1681af88e1cbcb8 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 9 Jan 2025 10:41:22 +0000 Subject: [PATCH 06/65] Fix Docker label to follow new convention --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 9b304d0..9d88f45 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ FROM webrecorder/browsertrix-crawler:1.4.2 -LABEL org.opencontainers.image.source https://github.com/openzim/zimit +LABEL org.opencontainers.image.source=https://github.com/openzim/zimit RUN apt-get update \ && apt-get install -qqy --no-install-recommends \ From 14670d4c6959f2c55851c2f9df4df0505c930d0b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 10 Jan 2025 10:24:47 +0000 Subject: [PATCH 07/65] Release 2.1.7 --- CHANGELOG.md | 3 ++- pyproject.toml | 5 +---- src/zimit/__about__.py | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0711aea..d0e9044 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,12 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). 
-## [Unreleased] +## [2.1.7] - 2024-01-10 ### Changed - Upgrade to browsertrix crawler 1.4.2 (#450) +- Upgrade to warc2zim 2.2.0 ## [2.1.6] - 2024-11-07 diff --git a/pyproject.toml b/pyproject.toml index ffedf8b..e522b95 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,16 +11,13 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim @ git+https://github.com/openzim/warc2zim@main", + "warc2zim==2.2.0", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] [tool.hatch.metadata.hooks.openzim-metadata] kind = "scraper" -[tool.hatch.metadata] -allow-direct-references = true # to be removed once we use a released warc2zim version - [project.optional-dependencies] scripts = [ "invoke==2.2.0", diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 63e60b5..b6d2a0c 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "2.1.7-dev0" +__version__ = "2.1.7" From 4835adbdd7c7d43225a62f86d0128285af2ea3d4 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 10 Jan 2025 12:41:01 +0000 Subject: [PATCH 08/65] Prepare for 2.1.8 --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- src/zimit/__about__.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0e9044..c437da8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). +## [Unreleased] + ## [2.1.7] - 2024-01-10 ### Changed diff --git a/pyproject.toml b/pyproject.toml index e522b95..1bc23a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim==2.2.0", + "warc2zim @ git+https://github.com/openzim/warc2zim@main", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index b6d2a0c..72f5b42 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "2.1.7" +__version__ = "2.1.8-dev0" From 0cb84f212677e1794913489cb6aefce7715d283e Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 10 Jan 2025 12:41:01 +0000 Subject: [PATCH 09/65] Prepare for 2.1.8 --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 1bc23a7..ffedf8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,9 @@ dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] [tool.hatch.metadata.hooks.openzim-metadata] kind = "scraper" +[tool.hatch.metadata] +allow-direct-references = true # to be removed once we use a released warc2zim version + [project.optional-dependencies] scripts = [ "invoke==2.2.0", From 0f136d2f2f49d583d0768b0e46dc763afb5c5169 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 4 Feb 2025 15:12:49 +0000 Subject: [PATCH 10/65] Upgrade Python 3.13, Crawler 1.5.0 and others --- .pre-commit-config.yaml | 8 ++++---- CHANGELOG.md | 4 ++++ Dockerfile | 9 ++++++--- pyproject.toml | 26 +++++++++++++------------- tests-daily/Dockerfile | 8 ++++---- tests-integration/integration.py | 6 +++--- 6 files changed, 34 insertions(+), 27 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4f91d0b..b362d62 100644 --- a/.pre-commit-config.yaml +++ 
b/.pre-commit-config.yaml @@ -2,20 +2,20 @@ # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - repo: https://github.com/psf/black - rev: "24.10.0" + rev: "25.1.0" hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.9 + rev: v0.9.4 hooks: - id: ruff - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.383 + rev: v1.1.393 hooks: - id: pyright name: pyright (system) diff --git a/CHANGELOG.md b/CHANGELOG.md index c437da8..4033a33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Upgrade to browsertrix crawler 1.5.0, Python 3.13 and others (#462) + ## [2.1.7] - 2024-01-10 ### Changed diff --git a/Dockerfile b/Dockerfile index 9d88f45..d2854dc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,16 @@ -FROM webrecorder/browsertrix-crawler:1.4.2 +FROM webrecorder/browsertrix-crawler:1.5.0 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit +# add deadsnakes ppa for latest Python on Ubuntu +RUN add-apt-repository ppa:deadsnakes/ppa -y + RUN apt-get update \ && apt-get install -qqy --no-install-recommends \ libmagic1 \ - python3.12-venv \ + python3.13-venv \ && rm -rf /var/lib/apt/lists/* \ # python setup (in venv not to conflict with browsertrix) - && python3.12 -m venv /app/zimit \ + && python3.13 -m venv /app/zimit \ # placeholder (default output location) && mkdir -p /output \ # disable chrome upgrade diff --git a/pyproject.toml b/pyproject.toml index ffedf8b..e4e7696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,10 @@ [build-system] -requires = ["hatchling", "hatch-openzim==0.2.0"] +requires = ["hatchling", "hatch-openzim"] build-backend = "hatchling.build" [project] name = "zimit" -requires-python = ">=3.12,<3.13" +requires-python = ">=3.13,<3.14" description = "Make ZIM file from any website through crawling" readme = "README.md" dependencies = [ @@ -26,20 +26,20 @@ scripts = [ "invoke==2.2.0", ] lint = [ - "black==24.10.0", - "ruff==0.6.9", + "black==25.1.0", + "ruff==0.9.4", ] check = [ - "pyright==1.1.383", + "pyright==1.1.393", ] test = [ - "pytest==8.3.3", - "coverage==7.6.1", + "pytest==8.3.4", + "coverage==7.6.10", ] dev = [ - "pre-commit==4.0.0", - "debugpy==1.8.6", - "selenium==4.25.0", # used in daily tests, convenient for dev purpose (autocompletion) + "pre-commit==4.1.0", + "debugpy==1.8.12", + "selenium==4.28.1", # used in daily tests, convenient for dev purpose (autocompletion) "zimit[scripts]", "zimit[lint]", "zimit[test]", @@ -95,10 +95,10 @@ all = "inv checkall --args '{args}'" [tool.black] line-length = 88 -target-version = ['py312'] +target-version = ['py313'] [tool.ruff] -target-version = "py312" +target-version = "py313" line-length = 88 src = ["src"] @@ -221,5 +221,5 @@ exclude_lines = [ include = ["src", "tests", "tasks.py"] exclude = [".env/**", ".venv/**"] extraPaths = ["src"] -pythonVersion = "3.12" +pythonVersion = "3.13" typeCheckingMode="basic" diff --git a/tests-daily/Dockerfile b/tests-daily/Dockerfile index f6118fe..22d45ef 100644 --- a/tests-daily/Dockerfile +++ b/tests-daily/Dockerfile @@ -1,5 +1,5 @@ # Let's extract kiwix-tools as usual on alpine temporary build container -FROM alpine:3.18 as kiwix-serve +FROM alpine:3.21 as kiwix-serve LABEL org.opencontainers.image.source 
https://github.com/openzim/kiwix-tools # TARGETPLATFORM is injected by docker build @@ -30,7 +30,7 @@ RUN set -e && \ curl -k -L $url | tar -xz -C /kiwix-serve --strip-components 1 # Build real "workload" container -FROM python:3.12-slim-bookworm +FROM python:3.13-slim-bookworm # Add kiwix-serve COPY --from=kiwix-serve /kiwix-serve /usr/local/bin @@ -70,6 +70,6 @@ RUN rm /tmp/chrome-linux64.zip /tmp/chromedriver-linux64.zip /tmp/versions.json RUN \ python -m pip install --no-cache-dir -U \ pip \ - selenium==4.23.0 \ - pytest==8.2.2 \ + selenium==4.28.1 \ + pytest==8.3.4 \ && mkdir -p /work diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 9d37b0f..55fcecb 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -17,7 +17,7 @@ def test_zim_main_page(): was a redirect to https Ensure main page is the redirected page""" - main_entry = Archive("/output/tests_en_onepage.zim").main_entry + main_entry = Archive(Path("/output/tests_en_onepage.zim")).main_entry assert main_entry.is_redirect assert ( main_entry.get_redirect_entry().path @@ -28,7 +28,7 @@ def test_zim_main_page(): def test_zim_scraper(): """Check content of scraper metadata""" - zim_fh = Archive("/output/tests_en_onepage.zim") + zim_fh = Archive(Path("/output/tests_en_onepage.zim")) scraper = zim_fh.get_text_metadata("Scraper") assert "zimit " in scraper assert "warc2zim " in scraper @@ -37,7 +37,7 @@ def test_zim_scraper(): def test_files_list(): """Check that expected files are present in the ZIM at proper path""" - zim_fh = Archive("/output/tests_en_onepage.zim") + zim_fh = Archive(Path("/output/tests_en_onepage.zim")) for expected_entry in [ "_zim_static/__wb_module_decl.js", "_zim_static/wombat.js", From 9396cf1ca05ab73a1c6ee001328a6420c4fd9385 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 6 Feb 2025 13:38:10 +0000 Subject: [PATCH 11/65] Alter crawl statistics following 1.5.0 release --- tests-integration/integration.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 55fcecb..b757e3d 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -87,11 +87,12 @@ def test_user_agent(): def test_stats_output(): assert json.loads(Path("/output/crawl.json").read_bytes()) == { - "crawled": 35, + "crawled": 17, "pending": 0, "pendingPages": [], - "total": 35, - "failed": 18, + "total": 17, + "failed": 1, + "failedWillRetry": 17, "limit": {"max": 0, "hit": False}, } From 4ef9a0d380ae992448151baeafa888befeb2f446 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 6 Feb 2025 21:11:40 +0000 Subject: [PATCH 12/65] Remove support for ARM64, this is not working anymore and was painfully slow --- .github/workflows/Publish.yml | 1 - .github/workflows/PublishDockerDevImage.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/.github/workflows/Publish.yml b/.github/workflows/Publish.yml index b6660d0..6fabc32 100644 --- a/.github/workflows/Publish.yml +++ b/.github/workflows/Publish.yml @@ -26,4 +26,3 @@ jobs: repo_overview: auto platforms: | linux/amd64 - linux/arm64 diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml index 5e2431e..d893882 100644 --- a/.github/workflows/PublishDockerDevImage.yaml +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -28,4 +28,3 @@ jobs: repo_overview: auto platforms: | linux/amd64 - linux/arm64 From cea10bd3b5cb3866f1c18680fbe0ce307391e367 Mon Sep 17 00:00:00 
2001 From: benoit74 Date: Thu, 6 Feb 2025 21:17:46 +0000 Subject: [PATCH 13/65] Add second build job on native arch for ARM64 --- .github/workflows/Publish.yml | 28 ++++++++++++++++++-- .github/workflows/PublishDockerDevImage.yaml | 28 ++++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/.github/workflows/Publish.yml b/.github/workflows/Publish.yml index 6fabc32..0b22e1c 100644 --- a/.github/workflows/Publish.yml +++ b/.github/workflows/Publish.yml @@ -5,8 +5,9 @@ on: types: [published] jobs: - publish: - runs-on: ubuntu-22.04 + publish-amd64: + runs-on: ubuntu-24.04 + name: "Publish for AMD64" steps: - uses: actions/checkout@v4 @@ -26,3 +27,26 @@ jobs: repo_overview: auto platforms: | linux/amd64 + + publish-arm64: + runs-on: ubuntu-24.04-arm + name: "Publish for ARM64" + + steps: + - uses: actions/checkout@v4 + + - name: Build and push Docker image + uses: openzim/docker-publish-action@v10 + with: + image-name: openzim/zimit + tag-pattern: /^v([0-9.]+)$/ + latest-on-tag: true + restrict-to: openzim/zimit + registries: ghcr.io + credentials: + GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + repo_description: auto + repo_overview: auto + platforms: | + linux/arm64 diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml index d893882..61c9140 100644 --- a/.github/workflows/PublishDockerDevImage.yaml +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -7,8 +7,9 @@ on: workflow_dispatch: jobs: - publish: - runs-on: ubuntu-22.04 + publish-amd64: + runs-on: ubuntu-24.04 + name: "Publish for AMD64" steps: - uses: actions/checkout@v4 @@ -28,3 +29,26 @@ jobs: repo_overview: auto platforms: | linux/amd64 + + publish-arm64: + runs-on: ubuntu-24.04-arm64 + name: "Publish for ARM64" + + steps: + - uses: actions/checkout@v4 + + - name: Build and push Docker image + uses: openzim/docker-publish-action@v10 + with: + image-name: openzim/zimit + manual-tag: dev + latest-on-tag: false + restrict-to: openzim/zimit + registries: ghcr.io + credentials: + GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + repo_description: auto + repo_overview: auto + platforms: | + linux/arm64 From b4c0495f48a325544a59c03fd239a83c14c8c02c Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 6 Feb 2025 21:19:08 +0000 Subject: [PATCH 14/65] Fix arm runner selector --- .github/workflows/Publish.yml | 2 +- .github/workflows/PublishDockerDevImage.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/Publish.yml b/.github/workflows/Publish.yml index 0b22e1c..7121cb7 100644 --- a/.github/workflows/Publish.yml +++ b/.github/workflows/Publish.yml @@ -29,7 +29,7 @@ jobs: linux/amd64 publish-arm64: - runs-on: ubuntu-24.04-arm + runs-on: ubuntu-24.04 name: "Publish for ARM64" steps: diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml index 61c9140..05a8ae1 100644 --- a/.github/workflows/PublishDockerDevImage.yaml +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -31,7 +31,7 @@ jobs: linux/amd64 publish-arm64: - runs-on: ubuntu-24.04-arm64 + runs-on: ubuntu-24.04-arm name: "Publish for ARM64" steps: From 5af981c01c6f7f65b43fc13204884ebfc58fdaa8 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 7 Feb 2025 08:07:23 +0000 Subject: [PATCH 15/65] Remove ARM64 job temporarily, still not working --- .github/workflows/Publish.yml | 49 ++++++++++---------- .github/workflows/PublishDockerDevImage.yaml 
| 47 ++++++++++--------- 2 files changed, 49 insertions(+), 47 deletions(-) diff --git a/.github/workflows/Publish.yml b/.github/workflows/Publish.yml index 7121cb7..1ddb343 100644 --- a/.github/workflows/Publish.yml +++ b/.github/workflows/Publish.yml @@ -5,7 +5,7 @@ on: types: [published] jobs: - publish-amd64: + publish-amd64: runs-on: ubuntu-24.04 name: "Publish for AMD64" @@ -20,7 +20,7 @@ jobs: latest-on-tag: true restrict-to: openzim/zimit registries: ghcr.io - credentials: + credentials: | GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} repo_description: auto @@ -28,25 +28,26 @@ jobs: platforms: | linux/amd64 - publish-arm64: - runs-on: ubuntu-24.04 - name: "Publish for ARM64" - - steps: - - uses: actions/checkout@v4 - - - name: Build and push Docker image - uses: openzim/docker-publish-action@v10 - with: - image-name: openzim/zimit - tag-pattern: /^v([0-9.]+)$/ - latest-on-tag: true - restrict-to: openzim/zimit - registries: ghcr.io - credentials: - GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} - GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} - repo_description: auto - repo_overview: auto - platforms: | - linux/arm64 + # Disabled for now, see https://github.com/openzim/zimit/issues/463 + # publish-arm64: + # runs-on: ubuntu-24.04 + # name: "Publish for ARM64" + # + # steps: + # - uses: actions/checkout@v4 + # + # - name: Build and push Docker image + # uses: openzim/docker-publish-action@v10 + # with: + # image-name: openzim/zimit + # tag-pattern: /^v([0-9.]+)$/ + # latest-on-tag: true + # restrict-to: openzim/zimit + # registries: ghcr.io + # credentials: | + # GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + # GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + # repo_description: auto + # repo_overview: auto + # platforms: | + # linux/arm64 diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml index 05a8ae1..1cbecea 100644 --- a/.github/workflows/PublishDockerDevImage.yaml +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -22,7 +22,7 @@ jobs: latest-on-tag: false restrict-to: openzim/zimit registries: ghcr.io - credentials: + credentials: | GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} repo_description: auto @@ -30,25 +30,26 @@ jobs: platforms: | linux/amd64 - publish-arm64: - runs-on: ubuntu-24.04-arm - name: "Publish for ARM64" - - steps: - - uses: actions/checkout@v4 - - - name: Build and push Docker image - uses: openzim/docker-publish-action@v10 - with: - image-name: openzim/zimit - manual-tag: dev - latest-on-tag: false - restrict-to: openzim/zimit - registries: ghcr.io - credentials: - GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} - GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} - repo_description: auto - repo_overview: auto - platforms: | - linux/arm64 + # Disabled for now, see https://github.com/openzim/zimit/issues/463 + # publish-arm64: + # runs-on: ubuntu-24.04-arm + # name: "Publish for ARM64" + # + # steps: + # - uses: actions/checkout@v4 + # + # - name: Build and push Docker image + # uses: openzim/docker-publish-action@v10 + # with: + # image-name: openzim/zimit + # manual-tag: dev + # latest-on-tag: false + # restrict-to: openzim/zimit + # registries: ghcr.io + # credentials: | + # GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + # GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + # repo_description: auto + # repo_overview: auto + # platforms: | + # linux/arm64 From 6ec53f774f009e436beab98fadfbe7620a1f61fc Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 7 Feb 2025 08:24:02 
+0000 Subject: [PATCH 16/65] Upgrade to Browsertrix Crawler 1.5.1 --- CHANGELOG.md | 2 +- Dockerfile | 2 +- tests-integration/integration.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4033a33..3ac740a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Upgrade to browsertrix crawler 1.5.0, Python 3.13 and others (#462) +- Upgrade to browsertrix crawler 1.5.1, Python 3.13 and others (#462) ## [2.1.7] - 2024-01-10 diff --git a/Dockerfile b/Dockerfile index d2854dc..6a14b59 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.5.0 +FROM webrecorder/browsertrix-crawler:1.5.1 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit # add deadsnakes ppa for latest Python on Ubuntu diff --git a/tests-integration/integration.py b/tests-integration/integration.py index b757e3d..50cfa00 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -91,8 +91,7 @@ def test_stats_output(): "pending": 0, "pendingPages": [], "total": 17, - "failed": 1, - "failedWillRetry": 17, + "failed": 18, "limit": {"max": 0, "hit": False}, } From a7e1026b2ec17069056aaacf048a5b11a52ce6f1 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 7 Feb 2025 08:38:20 +0000 Subject: [PATCH 17/65] Pin warc2zim for release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e4e7696..50298f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim @ git+https://github.com/openzim/warc2zim@main", + "warc2zim==2.2.1", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] From d228e9f346625bc87ac1c9e48d5a4f5616d60ad7 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 7 Feb 2025 08:57:09 +0000 Subject: [PATCH 18/65] Release 2.1.8 --- CHANGELOG.md | 4 ++-- src/zimit/__about__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ac740a..2c1d1f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,11 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). -## [Unreleased] +## [2.1.8] - 2024-02-07 ### Changed -- Upgrade to browsertrix crawler 1.5.1, Python 3.13 and others (#462) +- Upgrade to browsertrix crawler 1.5.1, Python 3.13 and others (#462 + #464) ## [2.1.7] - 2024-01-10 diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 72f5b42..c377db3 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "2.1.8-dev0" +__version__ = "2.1.8" From 8b4b18bfb79002fc4b4bc475d0b2683868e4a285 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 7 Feb 2025 08:59:54 +0000 Subject: [PATCH 19/65] Prepare for 2.1.9 --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- src/zimit/__about__.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c1d1f5..6387cbd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project are documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). +## [Unreleased] + ## [2.1.8] - 2024-02-07 ### Changed diff --git a/pyproject.toml b/pyproject.toml index 50298f0..e4e7696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim==2.2.1", + "warc2zim @ git+https://github.com/openzim/warc2zim@main", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index c377db3..4b55b6b 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "2.1.8" +__version__ = "2.1.9-dev0" From 3a7f583a96ed70e79f85c53c24567edcb6e37190 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 7 Feb 2025 08:59:54 +0000 Subject: [PATCH 20/65] Upgrade to Browsertrix Crawler 1.5.3 Include restore of total number of pages, following upstream fix. --- Dockerfile | 2 +- tests-integration/integration.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6a14b59..c37dfcc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.5.1 +FROM webrecorder/browsertrix-crawler:1.5.3 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit # add deadsnakes ppa for latest Python on Ubuntu diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 50cfa00..d9bfc94 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -90,7 +90,7 @@ def test_stats_output(): "crawled": 17, "pending": 0, "pendingPages": [], - "total": 17, + "total": 35, "failed": 18, "limit": {"max": 0, "hit": False}, } From 101fb71a0bdd0cc308de060f59c83a32ad5cdbaf Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 11 Feb 2025 16:57:19 +0000 Subject: [PATCH 21/65] Better processing of crawler exit codes with soft/hard limits --- .github/workflows/Tests.yaml | 22 ++++-- CHANGELOG.md | 8 +++ src/zimit/constants.py | 3 +- src/zimit/zimit.py | 115 +++++++++++++++++++++---------- tests-integration/integration.py | 46 +++++++++++-- 5 files changed, 149 insertions(+), 45 deletions(-) diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml index 592a5aa..601f9ba 100644 --- a/.github/workflows/Tests.yaml +++ b/.github/workflows/Tests.yaml @@ -57,13 +57,25 @@ jobs: uses: actions/checkout@v4 - name: build image - run: docker build -t zimit . + run: docker build -t local-zimit . 
- name: ensure help display without issue - run: docker run -v $PWD/output:/output zimit zimit --help + run: docker run -v $PWD/output:/output local-zimit zimit --help - - name: run crawl - run: docker run -v $PWD/output:/output zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep + - name: run crawl with soft size limit + run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizesoftlimit.json + + - name: run crawl with hard size limit + run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizehardlimit.json || true + + - name: run crawl with soft time limit + run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timesoftlimit.json + + - name: run crawl with hard time limit + run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timehardlimit.json || true + + - name: run standard crawl + run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep - name: run integration test suite - run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" + run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" diff --git a/CHANGELOG.md b/CHANGELOG.md index 6387cbd..f6d7044 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Change solution to report partial ZIM to the Zimfarm and other clients (#304) + +### Fixed + +- Do not create the ZIM when crawl is incomplete (#444) + ## [2.1.8] - 2024-02-07 ### Changed diff --git a/src/zimit/constants.py b/src/zimit/constants.py index f81905a..35baeb9 100644 --- a/src/zimit/constants.py +++ b/src/zimit/constants.py @@ -3,7 +3,8 @@ import logging from zimscraperlib.logging import getLogger EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2 -EXIT_CODE_CRAWLER_LIMIT_HIT = 11 +EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT = 14 +EXIT_CODE_CRAWLER_TIME_LIMIT_HIT = 15 NORMAL_WARC2ZIM_EXIT_CODE = 100 REQUESTS_TIMEOUT = 10 diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index ec989f1..416bec9 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -25,7 +25,8 @@ from 
zimscraperlib.uri import rebuild_uri
 
 from zimit.__about__ import __version__
 from zimit.constants import (
-    EXIT_CODE_CRAWLER_LIMIT_HIT,
+    EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT,
+    EXIT_CODE_CRAWLER_TIME_LIMIT_HIT,
     EXIT_CODE_WARC2ZIM_CHECK_FAILED,
     NORMAL_WARC2ZIM_EXIT_CODE,
     logger,
@@ -61,35 +62,19 @@ class ProgressFileWatcher:
         self.process.daemon = True
         self.process.start()
 
-    @staticmethod
-    def inotify_watcher(crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
+    def inotify_watcher(self, crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
         ino = inotify.adapters.Inotify()
         ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)  # pyright: ignore
         ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)  # pyright: ignore
 
-        class Limit:
-            def __init__(self):
-                self.max = self.hit = None
-
-            @property
-            def as_dict(self):
-                return {"max": self.max, "hit": self.hit}
-
-        # limit is only reported by crawl but needs to be reported up
-        limit = Limit()
-
-        def crawl_conv(data, limit):
+        def crawl_conv(data):
             # we consider crawl to be 90% of the workload so total = crawl_total * 90%
-            # limit = {"max": data["limit"]["max"], "hit": data["limit"]["hit"]}
-            limit.max = data["limit"]["max"]
-            limit.hit = data["limit"]["hit"]
             return {
                 "done": data["crawled"],
                 "total": int(data["total"] / 0.9),
-                "limit": limit.as_dict,
             }
 
-        def warc2zim_conv(data, limit):
+        def warc2zim_conv(data):
             # we consider warc2zim to be 10% of the workload so
             # warc2zim_total = 10% and total = 90 + warc2zim_total * 10%
             return {
@@ -98,7 +83,6 @@ class ProgressFileWatcher:
                     * (0.9 + (float(data["written"]) / data["total"]) / 10)
                 ),
                 "total": data["total"],
-                "limit": limit.as_dict,
             }
 
         for _, _, fpath, _ in ino.event_gen(yield_nones=False):  # pyright: ignore
@@ -108,7 +92,7 @@
             # open input and output separately as to not clear output on error
             with open(fpath) as ifh:
                 try:
-                    out = func(json.load(ifh), limit)
+                    out = func(json.load(ifh))
                 except Exception:  # nosec # noqa: S112
                     # simply ignore progress update should an error arise
                     # might be malformed input for instance
@@ -278,9 +262,17 @@ def run(raw_args):
         "directory",
     )
 
-    parser.add_argument(
-        "--sizeLimit",
-        help="If set, save state and exit if size limit exceeds this value",
+    size_group = parser.add_mutually_exclusive_group()
+    size_group.add_argument(
+        "--sizeSoftLimit",
+        help="If set, save crawl state and stop crawl if WARC size exceeds this value. "
+        "ZIM will still be created.",
+        type=int,
+    )
+    size_group.add_argument(
+        "--sizeHardLimit",
+        help="If set, exit crawler and fail the scraper immediately if WARC size "
+        "exceeds this value",
         type=int,
     )
 
@@ -292,9 +284,17 @@ def run(raw_args):
         default=90,
     )
 
-    parser.add_argument(
-        "--timeLimit",
-        help="If set, save state and exit after time limit, in seconds",
+    time_group = parser.add_mutually_exclusive_group()
+    time_group.add_argument(
+        "--timeSoftLimit",
+        help="If set, save crawl state and stop crawl if WARC(s) creation takes "
+        "longer than this value, in seconds. ZIM will still be created.",
+        type=int,
+    )
+    time_group.add_argument(
+        "--timeHardLimit",
+        help="If set, exit crawler and fail the scraper immediately if WARC(s) creation"
+        " takes longer than this value, in seconds",
         type=int,
     )
 
@@ -369,6 +369,13 @@ def run(raw_args):
         "path/URLs separated by comma",
     )
 
+    parser.add_argument(
+        "--acceptable-crawler-exit-codes",
+        help="Non-zero crawler exit codes to consider as acceptable to continue with "
+        "conversion of WARC to ZIM. 
Flag partialZim will be set in statsFilename (if " + " used). Single value with individual error codes separated by comma", + ) + zimit_args, warc2zim_args = parser.parse_known_args(raw_args) # pass a scraper suffix to warc2zim so that both zimit and warc2zim versions are @@ -504,6 +511,8 @@ def run(raw_args): f"{'will keep' if zimit_args.keep else 'will delete'}" ) + partial_zim = False + # if warc files are passed, do not run browsertrix crawler but fetch the files if # they are provided as an HTTP URL + extract the archive if it is a tar.gz warc_files: list[Path] = [] @@ -568,10 +577,29 @@ def run(raw_args): logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") crawl = subprocess.run(cmd_args, check=False) - if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT: - logger.info("crawl interupted by a limit") + if ( + crawl.returncode == EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT + and zimit_args.sizeSoftLimit + ): + logger.info( + "Crawl size soft limit hit. Continuing with warc2zim conversion." + ) + if zimit_args.statsFilename: + partial_zim = True + elif ( + crawl.returncode == EXIT_CODE_CRAWLER_TIME_LIMIT_HIT + and zimit_args.timeSoftLimit + ): + logger.info( + "Crawl time soft limit hit. Continuing with warc2zim conversion." + ) + if zimit_args.statsFilename: + partial_zim = True elif crawl.returncode != 0: - raise subprocess.CalledProcessError(crawl.returncode, cmd_args) + logger.error( + f"Crawl returned an error: {crawl.returncode}, scraper exiting" + ) + return crawl.returncode if zimit_args.collection: warc_files = [ @@ -606,7 +634,15 @@ def run(raw_args): logger.info(f"Calling warc2zim with these args: {warc2zim_args}") - return warc2zim(warc2zim_args) + warc2zim_exit_code = warc2zim(warc2zim_args) + + if zimit_args.statsFilename: + stats = Path(zimit_args.statsFilename) + stats_content = json.loads(stats.read_bytes()) + stats_content["partialZim"] = partial_zim + stats.write_text(json.dumps(stats_content)) + + return warc2zim_exit_code def get_cleaned_url(url: str): @@ -646,9 +682,11 @@ def get_node_cmd_line(args): "behaviorTimeout", "delay", "profile", - "sizeLimit", + "sizeSoftLimit", + "sizeHardLimit", "diskUtilization", - "timeLimit", + "timeSoftLimit", + "timeHardLimit", "healthCheckPort", "overwrite", "config", @@ -668,7 +706,14 @@ def get_node_cmd_line(args): continue if value is None or (isinstance(value, bool) and value is False): continue - node_cmd.append("--" + arg) + node_cmd.append( + "--" + + ( + "sizeLimit" + if arg in ["sizeSoftLimit", "sizeHardLimit"] + else "timeLimit" if arg in ["timeSoftLimit", "timeHardLimit"] else arg + ) + ) if not isinstance(value, bool): node_cmd.append(str(value)) diff --git a/tests-integration/integration.py b/tests-integration/integration.py index d9bfc94..7e79f52 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -3,13 +3,34 @@ import json import os from pathlib import Path +import pytest from warcio import ArchiveIterator from zimscraperlib.zim import Archive -def test_is_file(): +@pytest.mark.parametrize( + "filename", + [ + pytest.param("/output/tests_en_onepage.zim", id="onepage"), + pytest.param("/output/tests_en_sizesoftlimit.zim", id="sizesoftlimit"), + pytest.param("/output/tests_en_timesoftlimit.zim", id="timesoftlimit"), + ], +) +def test_zim_created(filename): """Ensure ZIM file exists""" - assert os.path.isfile("/output/tests_en_onepage.zim") + assert os.path.isfile(filename) + + +@pytest.mark.parametrize( + "filename", + [ + pytest.param("/output/tests_en_sizehardlimit.zim", 
id="sizehardlimit"), + pytest.param("/output/tests_en_timehardlimit.zim", id="timehardlimit"), + ], +) +def test_zim_not_created(filename): + """Ensure ZIM file does not exists""" + assert not os.path.exists(filename) def test_zim_main_page(): @@ -85,7 +106,7 @@ def test_user_agent(): assert found -def test_stats_output(): +def test_stats_output_standard(): assert json.loads(Path("/output/crawl.json").read_bytes()) == { "crawled": 17, "pending": 0, @@ -103,5 +124,22 @@ def test_stats_output(): assert json.loads(Path("/output/stats.json").read_bytes()) == { "done": 8, "total": 8, - "limit": {"max": 0, "hit": False}, + "partialZim": False, } + + +@pytest.mark.parametrize( + "filename", + [ + pytest.param("/output/stats_sizesoftlimit.json", id="sizesoftlimit"), + pytest.param("/output/stats_timesoftlimit.json", id="timesoftlimit"), + ], +) +def test_stats_output_softlimit(filename): + file = Path(filename) + assert file.exists + content = json.loads(file.read_bytes()) + assert "done" in content + assert "total" in content + assert "partialZim" in content + assert content["partialZim"] From ee82837aaa12de19c6bcc3fdd1ea641f8bdb559b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 13 Feb 2025 13:18:06 +0000 Subject: [PATCH 22/65] Keep temporary folder when crawler or warc2zim fails, even if not asked for --- CHANGELOG.md | 1 + src/zimit/zimit.py | 34 +++++++++++++++++++++++++++------- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f6d7044..308058b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Change solution to report partial ZIM to the Zimfarm and other clients (#304) +- Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468) ### Fixed diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 416bec9..70dfdbd 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -33,6 +33,8 @@ from zimit.constants import ( ) from zimit.utils import download_file +temp_root_dir: Path | None = None + class ProgressFileWatcher: def __init__(self, output_dir: Path, stats_path: Path): @@ -103,6 +105,24 @@ class ProgressFileWatcher: json.dump(out, ofh) +def cleanup(): + if not temp_root_dir: + logger.warning("Temporary root dir not already set, cannot clean this up") + return + logger.info("") + logger.info("----------") + logger.info(f"Cleanup, removing temp dir: {temp_root_dir}") + shutil.rmtree(temp_root_dir) + + +def cancel_cleanup(): + logger.info( + f"Temporary files have been kept in {temp_root_dir}, please clean them" + " up manually once you don't need them anymore" + ) + atexit.unregister(cleanup) + + def run(raw_args): parser = ArgumentParser( description="Run a browser-based crawl on the specified URL and convert to ZIM" @@ -427,19 +447,13 @@ def run(raw_args): return EXIT_CODE_WARC2ZIM_CHECK_FAILED # make temp dir for this crawl + global temp_root_dir # noqa: PLW0603 if zimit_args.build: temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp")) else: temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) if not zimit_args.keep: - - def cleanup(): - logger.info("") - logger.info("----------") - logger.info(f"Cleanup, removing temp dir: {temp_root_dir}") - shutil.rmtree(temp_root_dir) - atexit.register(cleanup) # copy / download custom behaviors to one single folder and configure crawler @@ -599,6 +613,7 @@ def run(raw_args): logger.error( f"Crawl returned an error: 
{crawl.returncode}, scraper exiting" ) + cancel_cleanup() return crawl.returncode if zimit_args.collection: @@ -642,6 +657,11 @@ def run(raw_args): stats_content["partialZim"] = partial_zim stats.write_text(json.dumps(stats_content)) + # also call cancel_cleanup when --keep, even if it is not supposed to be registered, + # so that we will display temporary files location just like in other situations + if warc2zim_exit_code or zimit_args.keep: + cancel_cleanup() + return warc2zim_exit_code From b4ec60f31663e97ad3c4dc508e8632d6ce0a1472 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 13 Feb 2025 15:31:51 +0000 Subject: [PATCH 23/65] fixup! Keep temporary folder when crawler or warc2zim fails, even if not asked for --- README.md | 2 +- src/zimit/zimit.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9bfba9b..1598ead 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ The image accepts the following parameters, **as well as any of the [warc2zim](h - `--exclude ` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded. - `--workers N` - number of crawl workers to be run in parallel - `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example). -- `--keep` - if set, keep the WARC files in a temp directory inside the output directory +- `--keep` - in case of failure, WARC files and other temporary files (which are stored as a subfolder of output directory) are always kept, otherwise they are automatically deleted. Use this flag to always keep WARC files, even in case of success. Example command: diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 70dfdbd..49ead05 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -334,7 +334,10 @@ def run(raw_args): parser.add_argument( "--keep", - help="If set, keep WARC files after crawl, don't delete", + help="In case of failure, WARC files and other temporary files (which are " + "stored as a subfolder of output directory) are always kept, otherwise " + "they are automatically deleted. 
Use this flag to always keep WARC files, "
+        "even in case of success.",
         action="store_true",
     )

From dc6b5aafb70f61771bf209746bce9679e985743f Mon Sep 17 00:00:00 2001
From: benoit74
Date: Thu, 13 Feb 2025 15:14:53 +0000
Subject: [PATCH 24/65] Enhance support of Browsertrix Crawler arguments

---
 CHANGELOG.md       |   2 +
 src/zimit/zimit.py | 522 +++++++++++++++++++++++++++++++++++++++------
 2 files changed, 462 insertions(+), 62 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 308058b..3d20f59 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Change solution to report partial ZIM to the Zimfarm and other clients (#304)
 - Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
+- Add many missing Browsertrix Crawler arguments; drop default overrides by zimit; drop `--noMobileDevice` setting (not needed anymore) (#433)
+- Document all Browsertrix Crawler default argument values (#416)
 
 ### Fixed
 
diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index 49ead05..8634b71 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -129,6 +129,7 @@ def run(raw_args):
     )
 
     parser.add_argument("-u", "--url", help="The URL to start crawling from")
+
     parser.add_argument("--title", help="ZIM title")
     parser.add_argument("--description", help="ZIM description")
     parser.add_argument("--long-description", help="ZIM long description metadata")
@@ -138,52 +139,66 @@ def run(raw_args):
         help="If set, read a list of seed urls, one per line, from the specified",
     )
 
-    parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
+    parser.add_argument(
+        "-w", "--workers", type=int, help="Number of parallel workers. Default is 1."
+    )
+
+    parser.add_argument(
+        "--crawlId",
+        help="A user provided ID for this crawl or crawl configuration (can also be "
+        "set via CRAWL_ID env var, defaults to hostname)",
+    )
 
     parser.add_argument(
         "--waitUntil",
         help="Puppeteer page.goto() condition to wait for before continuing. One of "
         "load, domcontentloaded, networkidle0 or networkidle2, or a "
-        "comma-separated combination of those.",
-        default="load",
+        "comma-separated combination of those. Default is load,networkidle2",
     )
 
     parser.add_argument(
-        "--depth", help="The depth of the crawl for all seeds", type=int, default=-1
+        "--depth",
+        help="The depth of the crawl for all seeds. Default is -1.",
+        type=int,
     )
 
     parser.add_argument(
         "--extraHops",
-        help="Number of extra 'hops' to follow, beyond the current scope",
+        help="Number of extra 'hops' to follow, beyond the current scope. "
+        "Default is 0.",
         type=int,
     )
 
-    parser.add_argument("--limit", help="Limit crawl to this number of pages", type=int)
+    parser.add_argument(
+        "--limit",
+        help="Limit crawl to this number of pages. Default is 0 (no limit).",
+        type=int,
+    )
 
     parser.add_argument(
         "--maxPageLimit",
-        help="Maximum pages to crawl, overriding pageLimit if both are set",
+        help="Maximum pages to crawl, overriding pageLimit if both are set. Default is "
+        "0 (no limit)",
         type=int,
     )
 
     parser.add_argument(
         "--timeout",
-        help="Timeout for each page to load (in seconds)",
+        help="Timeout for each page to load (in seconds). Default is 90 secs.",
         type=int,
-        default=90,
     )
 
     parser.add_argument(
         "--scopeType",
         help="A predefined scope of the crawl. For more customization, "
-        "use 'custom' and set scopeIncludeRx regexes",
+        "use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom 
Default is custom" + "if scopeIncludeRx is set, prefix otherwise.", choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"], ) parser.add_argument( "--include", - help="Regex of page URLs that should be " - "included in the crawl (defaults to " + help="Regex of page URLs that should be included in the crawl (defaults to " "the immediate directory of URL)", ) @@ -192,48 +207,185 @@ def run(raw_args): help="Regex of page URLs that should be excluded from the crawl", ) - parser.add_argument( - "--collection", - help="Collection name to crawl to (replay will be accessible " - "under this name in pywb preview) instead of crawl-@ts", - ) - parser.add_argument( "--allowHashUrls", - help="Allow Hashtag URLs, useful for " - "single-page-application crawling or " - "when different hashtags load dynamic " - "content", + help="Allow Hashtag URLs, useful for single-page-application crawling or " + "when different hashtags load dynamic content", action="store_true", ) parser.add_argument( - "--lang", - help="if set, sets the language used by the browser, should be ISO 639 " - "language[-country] code", + "--selectLinks", + help="One or more selectors for extracting links, in the format " + "[css selector]->[property to use],[css selector]->@[attribute to use]", ) parser.add_argument( - "--zim-lang", - help="Language metadata of ZIM " - "(warc2zim --lang param). ISO-639-3 code. " - "Retrieved from homepage if found, fallback to `eng`", + "--clickSelector", + help="Selector for elements to click when using the autoclick behavior. Default" + " is 'a'", ) + parser.add_argument( + "--blockRules", + help="Additional rules for blocking certain URLs from being loaded, by URL " + "regex and optionally via text match in an iframe", + ) + + parser.add_argument( + "--blockMessage", + help="If specified, when a URL is blocked, a record with this error message is" + " added instead", + ) + + parser.add_argument( + "--blockAds", + help="If set, block advertisements from being loaded (based on Stephen Black's" + " blocklist). Note that some bad domains are also blocked by zimit" + " configuration even if this option is not set.", + ) + + parser.add_argument( + "--adBlockMessage", + help="If specified, when an ad is blocked, a record with this error message is" + " added instead", + ) + + parser.add_argument( + "--collection", + help="Collection name to crawl to (replay will be accessible " + "under this name in pywb preview). Default is crawl-@ts.", + ) + + parser.add_argument( + "--headless", + help="Run in headless mode, otherwise start xvfb", + action="store_true", + ) + + parser.add_argument( + "--driver", + help="Custom driver for the crawler, if any", + ) + + parser.add_argument( + "--generateCDX", + help="If set, generate index (CDXJ) for use with pywb after crawl is done", + action="store_true", + ) + + parser.add_argument( + "--combineWARC", + help="If set, combine the warcs", + action="store_true", + ) + + parser.add_argument( + "--rolloverSize", + help="If set, declare the rollover size. 
Default is 1000000000.", + type=int, + ) + + parser.add_argument( + "--generateWACZ", + help="If set, generate WACZ on disk", + action="store_true", + ) + + parser.add_argument( + "--logging", + help="Crawler logging configuration", + ) + + parser.add_argument( + "--logLevel", + help="Comma-separated list of log levels to include in logs", + ) + + parser.add_argument( + "--logContext", + help="Comma-separated list of contexts to include in logs", + choices=[ + "general", + "worker", + "recorder", + "recorderNetwork", + "writer", + "state", + "redis", + "storage", + "text", + "exclusion", + "screenshots", + "screencast", + "originOverride", + "healthcheck", + "browser", + "blocking", + "behavior", + "behaviorScript", + "jsError", + "fetch", + "pageStatus", + "memoryStatus", + "crawlStatus", + "links", + "sitemap", + "wacz", + "replay", + "proxy", + ], + ) + + parser.add_argument( + "--logExcludeContext", + help="Comma-separated list of contexts to NOT include in logs. Default is " + "recorderNetwork,jsError,screencast", + choices=[ + "general", + "worker", + "recorder", + "recorderNetwork", + "writer", + "state", + "redis", + "storage", + "text", + "exclusion", + "screenshots", + "screencast", + "originOverride", + "healthcheck", + "browser", + "blocking", + "behavior", + "behaviorScript", + "jsError", + "fetch", + "pageStatus", + "memoryStatus", + "crawlStatus", + "links", + "sitemap", + "wacz", + "replay", + "proxy", + ], + ) + + parser.add_argument( + "--text", + help="Extract initial (default) or final text to pages.jsonl or WARC resource" + " record(s)", + ) + + # cwd is not manipulable + parser.add_argument( "--mobileDevice", help="Emulate mobile device by name from " "https://github.com/puppeteer/puppeteer/blob/" "main/packages/puppeteer-core/src/common/Device.ts", - default="Pixel 2", - ) - - parser.add_argument( - "--noMobileDevice", - help="Do not emulate a mobile device (use at your own risk, behavior is" - "uncertain)", - action="store_true", - default=False, ) parser.add_argument( @@ -255,33 +407,108 @@ def run(raw_args): "(usually /sitemap.xml)", ) + parser.add_argument( + "--sitemapFromDate", + help="If set, filter URLs from sitemaps to those greater than or equal to (>=)" + " provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)", + ) + + parser.add_argument( + "--sitemapToDate", + help="If set, filter URLs from sitemaps to those less than or equal to (<=) " + "provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)", + ) + + parser.add_argument( + "--statsFilename", + help="If set, output stats as JSON to this file. (Relative filename resolves " + "to crawl working directory)", + ) + parser.add_argument( "--behaviors", - help="Which background behaviors to enable on each page", - default="autoplay,autofetch,siteSpecific", + help="Which background behaviors to enable on each page. Default is autoplay," + "autofetch,autoscroll,siteSpecific", ) parser.add_argument( "--behaviorTimeout", help="If >0, timeout (in seconds) for in-page behavior will run on each page. " - "If 0, a behavior can run until finish", + "If 0, a behavior can run until finish. Default is 90.", + type=int, + ) + + parser.add_argument( + "--postLoadDelay", + help="If >0, amount of time to sleep (in seconds) after page has loaded, before" + " taking screenshots / getting text / running behaviors. 
Default is 0.", type=int, - default=90, ) parser.add_argument( "--delay", help="If >0, amount of time to sleep (in seconds) after behaviors " - "before moving on to next page", + "before moving on to next page. Default is 0.", type=int, ) + parser.add_argument( + "--dedupPolicy", + help="Deduplication policy. Default is skip", + choices=["skip", "revisit", "keep"], + ) + parser.add_argument( "--profile", help="Path or HTTP(S) URL to tar.gz file which contains the browser profile " "directory", ) + parser.add_argument( + "--screenshot", + help="Screenshot options for crawler. One of view, thumbnail, fullPage, " + "fullPageFinal or a comma-separated combination of those.", + ) + + parser.add_argument( + "--screencastPort", + help="If set to a non-zero value, starts an HTTP server with screencast " + "accessible on this port.", + type=int, + ) + + parser.add_argument( + "--screencastRedis", + help="If set, will use the state store redis pubsub for screencasting", + action="store_true", + ) + + parser.add_argument( + "--warcInfo", + help="Optional fields added to the warcinfo record in combined WARCs", + ) + + parser.add_argument( + "--saveState", + help="If the crawl state should be serialized to the crawls/ directory. " + "Defaults to 'partial', only saved when crawl is interrupted", + choices=["never", "partial", "always"], + ) + + parser.add_argument( + "--saveStateInterval", + help="If save state is set to 'always', also save state during the crawl at " + "this interval (in seconds). Default to 300.", + type=int, + ) + + parser.add_argument( + "--saveStateHistory", + help="Number of save states to keep during the duration of a crawl. " + "Default to 5.", + type=int, + ) + size_group = parser.add_mutually_exclusive_group() size_group.add_argument( "--sizeSoftLimit", @@ -329,7 +556,134 @@ def run(raw_args): help="overwrite current crawl data: if set, existing collection directory " "will be deleted before crawl is started", action="store_true", - default=False, + ) + + parser.add_argument( + "--waitOnDone", + help="if set, wait for interrupt signal when finished instead of exiting", + action="store_true", + ) + + parser.add_argument( + "--restartsOnError", + help="if set, assume will be restarted if interrupted, don't run post-crawl " + "processes on interrupt", + action="store_true", + ) + + parser.add_argument( + "--netIdleWait", + help="If set, wait for network idle after page load and after behaviors are " + "done (in seconds). if -1 (default), determine based on scope.", + type=int, + ) + + parser.add_argument( + "--lang", + help="if set, sets the language used by the browser, should be ISO 639 " + "language[-country] code", + ) + + parser.add_argument( + "--originOverride", + help="if set, will redirect requests from each origin in key to origin in the " + "value, eg. --originOverride https://host:port=http://alt-host:alt-port", + ) + + parser.add_argument( + "--logErrorsToRedis", + help="If set, write error messages to redis", + action="store_true", + ) + + parser.add_argument( + "--writePagesToRedis", + help="If set, write page objects to redis", + action="store_true", + ) + + parser.add_argument( + "--maxPageRetries", + help="If set, number of times to retry a page that failed to load before page" + " is considered to have failed. Default is 2.", + type=int, + ) + + parser.add_argument( + "--failOnFailedSeed", + help="If set, crawler will fail with exit code 1 if any seed fails. 
When " + "combined with --failOnInvalidStatus, will result in crawl failing with exit " + "code 1 if any seed has a 4xx/5xx response", + action="store_true", + ) + + parser.add_argument( + "--failOnFailedLimit", + help="If set, save state and exit if number of failed pages exceeds this value", + action="store_true", + ) + + parser.add_argument( + "--failOnInvalidStatus", + help="If set, will treat pages with 4xx or 5xx response as failures. When " + "combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl " + "failing due to non-200 responses", + action="store_true", + ) + + # customBehaviors not included because it has special handling + # debugAccessRedis not included due to custom redis engine in zimit + + parser.add_argument( + "--debugAccessBrowser", + help="if set, allow debugging browser on port 9222 via CDP", + action="store_true", + ) + + parser.add_argument( + "--warcPrefix", + help="prefix for WARC files generated, including WARCs added to WACZ", + ) + + parser.add_argument( + "--serviceWorker", + help="service worker handling: disabled, enabled or disabled-if-profile. " + "Default: disabled.", + ) + + parser.add_argument( + "--proxyServer", + help="if set, will use specified proxy server. Takes precedence over any env " + "var proxy settings", + ) + + parser.add_argument( + "--dryRun", + help="If true, no archive data is written to disk, only pages and logs (and " + "optionally saved state).", + action="store_true", + ) + + parser.add_argument( + "--qaSource", + help="Required for QA mode. Source (WACZ or multi WACZ) for QA", + ) + + parser.add_argument( + "--qaDebugImageDiff", + help="if specified, will write crawl.png, replay.png and diff.png for each " + "page where they're different", + action="store_true", + ) + + parser.add_argument( + "--sshProxyPrivateKeyFile", + help="path to SSH private key for SOCKS5 over SSH proxy connection", + ) + + parser.add_argument( + "--sshProxyKnownHostsFile", + help="path to SSH known hosts file for SOCKS5 over SSH proxy connection", ) parser.add_argument( @@ -355,11 +709,6 @@ def run(raw_args): help="[warc2zim] Custom CSS file URL/path to inject into all articles", ) - parser.add_argument( - "--statsFilename", - help="If set, output stats as JSON to this file", - ) - parser.add_argument( "--config", help="Path to YAML config file. If set, browsertrix-crawler will use this file" @@ -374,8 +723,10 @@ def run(raw_args): ) parser.add_argument( - "--logging", - help="Crawler logging configuration", + "--zim-lang", + help="Language metadata of ZIM " + "(warc2zim --lang param). ISO-639-3 code. 
" + "Retrieved from homepage if found, fallback to `eng`", ) parser.add_argument( @@ -497,10 +848,6 @@ def run(raw_args): cmd_args.append("--userAgentSuffix") cmd_args.append(user_agent_suffix) - if not zimit_args.noMobileDevice: - cmd_args.append("--mobileDevice") - cmd_args.append(zimit_args.mobileDevice) - cmd_args.append("--cwd") cmd_args.append(str(temp_root_dir)) @@ -681,13 +1028,14 @@ def get_cleaned_url(url: str): def get_node_cmd_line(args): - node_cmd = ["crawl", "--failOnFailedSeed"] + node_cmd = ["crawl"] for arg in [ - "workers", - "waitUntil", - "urlFile", "title", "description", + "urlFile", + "workers", + "crawlId", + "waitUntil", "depth", "extraHops", "limit", @@ -698,13 +1046,44 @@ def get_node_cmd_line(args): "exclude", "collection", "allowHashUrls", - "lang", + "selectLinks", + "clickSelector", + "blockRules", + "blockMessage", + "blockAds", + "adBlockMessage", + "collection", + "headless", + "driver", + "generateCDX", + "combineWARC", + "rolloverSize", + "generateWACZ", + "logging", + "logLevel", + "logContext", + "logExcludeContext", + "text", + "mobileDevice", "userAgent", + # userAgentSuffix (manipulated), "useSitemap", + "sitemapFromDate", + "sitemapToDate", + # statsFilename (manipulated), "behaviors", "behaviorTimeout", + "postLoadDelay", "delay", + "dedupPolicy", "profile", + "screenshot", + "screencastPort", + "screencastRedis", + "warcInfo", + "saveState", + "saveStateInterval", + "saveStateHistory", "sizeSoftLimit", "sizeHardLimit", "diskUtilization", @@ -712,9 +1091,28 @@ def get_node_cmd_line(args): "timeHardLimit", "healthCheckPort", "overwrite", - "config", - "logging", + "waitOnDone", + "restartsOnError", + "netIdleWait", + "lang", + "originOverride", + "logErrorsToRedis", + "writePagesToRedis", + "maxPageRetries", + "failOnFailedSeed", + "failOnFailedLimit", + "failOnInvalidStatus", + "debugAccessBrowser", + "warcPrefix", + "serviceWorker", + "proxyServer", + "dryRun", + "qaSource", + "qaDebugImageDiff", + "sshProxyPrivateKeyFile", + "sshProxyKnownHostsFile", "customBehaviors", + "config", ]: value = getattr(args, arg) if arg == "userAgent": From ed1a8a0aa9a1ef28b0d81ab20ed3d4c61d59a01b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 13 Feb 2025 15:30:30 +0000 Subject: [PATCH 25/65] Use preferred Browsertrix Crawler arguments and fix multiple/file seeds support --- .github/workflows/DailyTests.yaml | 2 +- .github/workflows/Tests.yaml | 10 ++-- CHANGELOG.md | 8 +++ README.md | 13 +++-- src/zimit/__about__.py | 2 +- src/zimit/zimit.py | 86 +++++++++++++++++++------------ 6 files changed, 75 insertions(+), 46 deletions(-) diff --git a/.github/workflows/DailyTests.yaml b/.github/workflows/DailyTests.yaml index 0585721..2bc9bc5 100644 --- a/.github/workflows/DailyTests.yaml +++ b/.github/workflows/DailyTests.yaml @@ -18,7 +18,7 @@ jobs: run: docker build -t local-zimit . 
- name: run crawl of test website - run: docker run -v $PWD/output:/output local-zimit zimit --url https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim + run: docker run -v $PWD/output:/output local-zimit zimit --seeds https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim - name: archive ZIM uses: actions/upload-artifact@v4 diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml index 601f9ba..afdc18b 100644 --- a/.github/workflows/Tests.yaml +++ b/.github/workflows/Tests.yaml @@ -63,19 +63,19 @@ jobs: run: docker run -v $PWD/output:/output local-zimit zimit --help - name: run crawl with soft size limit - run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizesoftlimit.json + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizesoftlimit.json - name: run crawl with hard size limit - run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizehardlimit.json || true + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizehardlimit.json || true - name: run crawl with soft time limit - run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timesoftlimit.json + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timesoftlimit.json - name: run crawl with hard time limit - run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timehardlimit.json || true + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timehardlimit.json || true - name: run standard crawl - run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep + run: docker run -v $PWD/output:/output local-zimit 
zimit --seeds http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep - name: run integration test suite run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d20f59..ae0d548 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468) - Add many missing Browsertrix Crawler arguments ; drop default overrides by zimit ; drop `--noMobileDevice` setting (not needed anymore) (#433) - Document all Browsertrix Crawler default arguments values (#416) +- Use preferred Browsertrix Crawler arguments names: + - `--seeds` instead of `--url` + - `--seedFile` instead of `--urlFile` + - `--pageLimit` instead of `--limit` + - `--pageLoadTimeout` instead of `--timeout` + - `--scopeIncludeRx` instead of `--include` + - `--scopeExcludeRx` instead of `--exclude` + - `--pageExtraDelay` instead of `--delay` ### Fixed diff --git a/README.md b/README.md index 1598ead..bc18dc6 100644 --- a/README.md +++ b/README.md @@ -38,16 +38,15 @@ Usage `zimit` is intended to be run in Docker. Docker image is published at https://github.com/orgs/openzim/packages/container/package/zimit. -The image accepts the following parameters, **as well as any of the [warc2zim](https://github.com/openzim/warc2zim) ones**; useful for setting metadata, for instance: +The image accepts the following parameters, **as well as any of the [Browsertrix crawler](https://crawler.docs.browsertrix.com/user-guide/cli-options/) and [warc2zim](https://github.com/openzim/warc2zim) ones**: -- Required: `--url URL` - the url to be crawled +- Required: `--seeds URL` - the url to start crawling from ; multiple URLs can be separated by a comma (even if **usually not needed**, these are just the **seeds** of the crawl) ; first seed URL is used as ZIM homepage - Required: `--name` - Name of ZIM file - `--output` - output directory (defaults to `/output`) -- `--limit U` - Limit capture to at most U URLs -- `--behaviors` - Control which browsertrix behaviors are ran (defaults to `autoplay,autofetch,siteSpecific`, adding `autoscroll` to the list is possible to automatically scroll the pages and fetch resources which are lazy loaded) -- `--exclude ` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded. +- `--pageLimit U` - Limit capture to at most U URLs +- `--scopeExcludeRx ` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--scopeExcludeRx="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded. - `--workers N` - number of crawl workers to be run in parallel -- `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). 
The default is `load`, but for static sites, `--waitUntil domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
- `--keep` - in case of failure, WARC files and other temporary files (which are stored as a subfolder of output directory) are always kept, otherwise they are automatically deleted. Use this flag to always keep WARC files, even in case of success.

Example command:

```bash
docker run ghcr.io/openzim/zimit zimit --help
docker run ghcr.io/openzim/zimit warc2zim --help
-docker run -v /output:/output ghcr.io/openzim/zimit zimit --url URL --name myzimfile
+docker run -v /output:/output ghcr.io/openzim/zimit zimit --seeds URL --name myzimfile
```

**Note**: Image automatically filters out a large number of ads by using the 3 blocklists from [anudeepND](https://github.com/anudeepND/blacklist). If you don't want this filtering, disable the image's entrypoint in your container (`docker run --entrypoint="" ghcr.io/openzim/zimit ...`).
diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index 4b55b6b..d733cff 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "2.1.9-dev0"
+__version__ = "3.0.0-dev0"
diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index 8634b71..2337c00 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -128,15 +128,21 @@ def run(raw_args):
         description="Run a browser-based crawl on the specified URL and convert to ZIM"
     )
 
-    parser.add_argument("-u", "--url", help="The URL to start crawling from")
+    parser.add_argument(
+        "--seeds",
+        help="The seed URL(s) to start crawling from. Multiple seed URLs must be "
+        "separated by a comma (usually not needed, these are just the crawl seeds). "
+        "First seed URL is used as ZIM homepage",
+    )
 
-    parser.add_argument("--title", help="ZIM title")
-    parser.add_argument("--description", help="ZIM description")
+    parser.add_argument("--title", help="WARC and ZIM title")
+    parser.add_argument("--description", help="WARC and ZIM description")
     parser.add_argument("--long-description", help="ZIM long description metadata")
 
     parser.add_argument(
-        "--urlFile",
-        help="If set, read a list of seed urls, one per line, from the specified",
+        "--seedFile",
+        help="If set, read a list of seed urls, one per line. Can be a local file or "
+        "the HTTP(s) URL to an online file.",
     )
 
     parser.add_argument(
@@ -170,7 +176,7 @@ def run(raw_args):
     )
 
     parser.add_argument(
-        "--limit",
+        "--pageLimit",
         help="Limit crawl to this number of pages. Default is 0 (no limit).",
         type=int,
     )
@@ -183,7 +189,7 @@ def run(raw_args):
     )
 
     parser.add_argument(
-        "--timeout",
+        "--pageLoadTimeout",
        help="Timeout for each page to load (in seconds). 
Default is 90 secs.", type=int, ) @@ -197,13 +203,13 @@ def run(raw_args): ) parser.add_argument( - "--include", + "--scopeIncludeRx", help="Regex of page URLs that should be included in the crawl (defaults to " "the immediate directory of URL)", ) parser.add_argument( - "--exclude", + "--scopeExcludeRx", help="Regex of page URLs that should be excluded from the crawl", ) @@ -446,7 +452,7 @@ def run(raw_args): ) parser.add_argument( - "--delay", + "--pageExtraDelay", help="If >0, amount of time to sleep (in seconds) after behaviors " "before moving on to next page. Default is 0.", type=int, @@ -762,16 +768,40 @@ def run(raw_args): warc2zim_args.append("--output") warc2zim_args.append(zimit_args.output) - url = zimit_args.url - user_agent_suffix = zimit_args.userAgentSuffix if zimit_args.adminEmail: user_agent_suffix += f" {zimit_args.adminEmail}" - if url: - url = get_cleaned_url(url) - warc2zim_args.append("--url") - warc2zim_args.append(url) + # make temp dir for this crawl + global temp_root_dir # noqa: PLW0603 + if zimit_args.build: + temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp")) + else: + temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) + + seeds = [] + if zimit_args.seeds: + seeds += [get_cleaned_url(url) for url in zimit_args.seeds.split(",")] + if zimit_args.seedFile: + if re.match(r"^https?\://", zimit_args.seedFile): + with tempfile.NamedTemporaryFile( + dir=temp_root_dir, + prefix="seeds_", + suffix=".txt", + delete_on_close=True, + ) as filename: + seed_file = Path(filename.name) + download_file(zimit_args.seedFile, seed_file) + seeds += [ + get_cleaned_url(url) for url in seed_file.read_text().splitlines() + ] + else: + seeds += [ + get_cleaned_url(url) + for url in Path(zimit_args.seedFile).read_text().splitlines() + ] + warc2zim_args.append("--url") + warc2zim_args.append(seeds[0]) if zimit_args.custom_css: warc2zim_args += ["--custom-css", zimit_args.custom_css] @@ -800,13 +830,6 @@ def run(raw_args): logger.info("Exiting, invalid warc2zim params") return EXIT_CODE_WARC2ZIM_CHECK_FAILED - # make temp dir for this crawl - global temp_root_dir # noqa: PLW0603 - if zimit_args.build: - temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp")) - else: - temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) - if not zimit_args.keep: atexit.register(cleanup) @@ -841,9 +864,9 @@ def run(raw_args): zimit_args.customBehaviors = None cmd_args = get_node_cmd_line(zimit_args) - if url: - cmd_args.append("--url") - cmd_args.append(url) + for seed in seeds: + cmd_args.append("--seeds") + cmd_args.append(seed) cmd_args.append("--userAgentSuffix") cmd_args.append(user_agent_suffix) @@ -1032,18 +1055,17 @@ def get_node_cmd_line(args): for arg in [ "title", "description", - "urlFile", "workers", "crawlId", "waitUntil", "depth", "extraHops", - "limit", + "pageLimit", "maxPageLimit", - "timeout", + "pageLoadTimeout", "scopeType", - "include", - "exclude", + "scopeIncludeRx", + "scopeExcludeRx", "collection", "allowHashUrls", "selectLinks", @@ -1074,7 +1096,7 @@ def get_node_cmd_line(args): "behaviors", "behaviorTimeout", "postLoadDelay", - "delay", + "pageExtraDelay", "dedupPolicy", "profile", "screenshot", From 7bfb4b25f0e390e191002345268013f243b0d53b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 13 Feb 2025 17:08:22 +0000 Subject: [PATCH 26/65] Remove confusion between zimit, warc2zim and crawler stats filenames --- .github/workflows/Tests.yaml | 10 +-- CHANGELOG.md | 7 +- 
src/zimit/zimit.py | 121 +++++++++++++++++++++++++++-------- 3 files changed, 105 insertions(+), 33 deletions(-) diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml index afdc18b..8c74b21 100644 --- a/.github/workflows/Tests.yaml +++ b/.github/workflows/Tests.yaml @@ -63,19 +63,19 @@ jobs: run: docker run -v $PWD/output:/output local-zimit zimit --help - name: run crawl with soft size limit - run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizesoftlimit.json + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizesoftlimit.json - name: run crawl with hard size limit - run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizehardlimit.json || true + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizehardlimit.json || true - name: run crawl with soft time limit - run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timesoftlimit.json + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timesoftlimit.json - name: run crawl with hard time limit - run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timehardlimit.json || true + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timehardlimit.json || true - name: run standard crawl - run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats.json --statsFilename /output/crawl.json 
--warc2zim-progress-file /output/warc2zim.json --keep - name: run integration test suite run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" diff --git a/CHANGELOG.md b/CHANGELOG.md index ae0d548..de71c25 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468) - Add many missing Browsertrix Crawler arguments ; drop default overrides by zimit ; drop `--noMobileDevice` setting (not needed anymore) (#433) - Document all Browsertrix Crawler default arguments values (#416) -- Use preferred Browsertrix Crawler arguments names: +- Use preferred Browsertrix Crawler arguments names: (part of #471) - `--seeds` instead of `--url` - `--seedFile` instead of `--urlFile` - `--pageLimit` instead of `--limit` @@ -21,6 +21,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `--scopeIncludeRx` instead of `--include` - `--scopeExcludeRx` instead of `--exclude` - `--pageExtraDelay` instead of `--delay` +- Remove confusion between zimit, warc2zim and crawler stats filenames (part of #471) + - `--statsFilename` is now the crawler stats file (since it is the same name, just like other arguments) + - `--zimit-progress-file` is now the zimit stats location + - `--warc2zim-progress-file` is the warc2zim stats location + - all are optional values, if not set and needed temporary files are used ### Fixed diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 2337c00..abd2978 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -37,17 +37,16 @@ temp_root_dir: Path | None = None class ProgressFileWatcher: - def __init__(self, output_dir: Path, stats_path: Path): - self.crawl_path = output_dir / "crawl.json" - self.warc2zim_path = output_dir / "warc2zim.json" - self.stats_path = stats_path - - if not self.stats_path.is_absolute(): - self.stats_path = output_dir / self.stats_path + def __init__( + self, crawl_stats_path: Path, warc2zim_stats_path, zimit_stats_path: Path + ): + self.crawl_stats_path = crawl_stats_path + self.warc2zim_stats_path = warc2zim_stats_path + self.zimit_stats_path = zimit_stats_path # touch them all so inotify is not unhappy on add_watch - self.crawl_path.touch() - self.warc2zim_path.touch() + self.crawl_stats_path.touch() + self.warc2zim_stats_path.touch() self.process = None def stop(self): @@ -59,12 +58,16 @@ class ProgressFileWatcher: def watch(self): self.process = Process( target=self.inotify_watcher, - args=(str(self.crawl_path), str(self.warc2zim_path), str(self.stats_path)), + args=( + str(self.crawl_stats_path), + str(self.warc2zim_stats_path), + str(self.zimit_stats_path), + ), ) self.process.daemon = True self.process.start() - def inotify_watcher(self, crawl_fpath: str, warc2zim_fpath: str, output_fpath: str): + def inotify_watcher(self, crawl_fpath: str, warc2zim_fpath: str, zimit_fpath: str): ino = inotify.adapters.Inotify() ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) # pyright: ignore ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) # pyright: ignore @@ -101,7 +104,7 @@ class ProgressFileWatcher: continue if not out: continue - with open(output_fpath, "w") as ofh: + with open(zimit_fpath, "w") as ofh: json.dump(out, ofh) @@ -427,8 +430,22 @@ def run(raw_args): parser.add_argument( 
"--statsFilename", - help="If set, output stats as JSON to this file. (Relative filename resolves " - "to crawl working directory)", + help="If set, output crawl stats as JSON to this file. Relative filename " + "resolves to output directory, see --output.", + ) + + parser.add_argument( + "--zimit-progress-file", + help="If set, output zimit stats as JSON to this file. Forces the creation of" + "crawler and warc2zim stats as well. If --statsFilename and/or " + "--warc2zim-progress-file are not set, default temporary files will be used. " + "Relative filename resolves to output directory, see --output.", + ) + + parser.add_argument( + "--warc2zim-progress-file", + help="If set, output warc2zim stats as JSON to this file. Relative filename " + "resolves to output directory, see --output.", ) parser.add_argument( @@ -701,7 +718,11 @@ def run(raw_args): action="store_true", ) - parser.add_argument("--output", help="Output directory for ZIM", default="/output") + parser.add_argument( + "--output", + help="Output directory for ZIM. Default to /output.", + default="/output", + ) parser.add_argument( "--build", @@ -874,20 +895,67 @@ def run(raw_args): cmd_args.append("--cwd") cmd_args.append(str(temp_root_dir)) - # setup inotify crawler progress watcher - if zimit_args.statsFilename: + output_dir = Path(zimit_args.output) + warc2zim_stats_file = ( + Path(zimit_args.warc2zim_progress_file) + if zimit_args.warc2zim_progress_file + else temp_root_dir / "warc2zim.json" + ) + if not warc2zim_stats_file.is_absolute(): + warc2zim_stats_file = output_dir / warc2zim_stats_file + warc2zim_stats_file.parent.mkdir(parents=True, exist_ok=True) + warc2zim_stats_file.unlink(missing_ok=True) + + crawler_stats_file = ( + Path(zimit_args.statsFilename) + if zimit_args.statsFilename + else temp_root_dir / "crawl.json" + ) + if not crawler_stats_file.is_absolute(): + crawler_stats_file = output_dir / crawler_stats_file + crawler_stats_file.parent.mkdir(parents=True, exist_ok=True) + crawler_stats_file.unlink(missing_ok=True) + + zimit_stats_file = ( + Path(zimit_args.zimit_progress_file) + if zimit_args.zimit_progress_file + else temp_root_dir / "stats.json" + ) + if not zimit_stats_file.is_absolute(): + zimit_stats_file = output_dir / zimit_stats_file + zimit_stats_file.parent.mkdir(parents=True, exist_ok=True) + zimit_stats_file.unlink(missing_ok=True) + + if zimit_args.zimit_progress_file: + # setup inotify crawler progress watcher watcher = ProgressFileWatcher( - Path(zimit_args.output), Path(zimit_args.statsFilename) + zimit_stats_path=zimit_stats_file, + crawl_stats_path=crawler_stats_file, + warc2zim_stats_path=warc2zim_stats_file, + ) + logger.info( + f"Writing zimit progress to {watcher.zimit_stats_path}, crawler progress to" + f" {watcher.crawl_stats_path} and warc2zim progress to " + f"{watcher.warc2zim_stats_path}" ) - logger.info(f"Writing progress to {watcher.stats_path}") # update crawler command cmd_args.append("--statsFilename") - cmd_args.append(str(watcher.crawl_path)) + cmd_args.append(str(crawler_stats_file)) # update warc2zim command warc2zim_args.append("-v") warc2zim_args.append("--progress-file") - warc2zim_args.append(str(watcher.warc2zim_path)) + warc2zim_args.append(str(warc2zim_stats_file)) watcher.watch() + else: + if zimit_args.statsFilename: + logger.info(f"Writing crawler progress to {crawler_stats_file}") + cmd_args.append("--statsFilename") + cmd_args.append(str(crawler_stats_file)) + if zimit_args.warc2zim_progress_file: + logger.info(f"Writing warc2zim progress to 
{warc2zim_stats_file}") + warc2zim_args.append("-v") + warc2zim_args.append("--progress-file") + warc2zim_args.append(str(warc2zim_stats_file)) cmd_line = " ".join(cmd_args) @@ -971,7 +1039,7 @@ def run(raw_args): logger.info( "Crawl size soft limit hit. Continuing with warc2zim conversion." ) - if zimit_args.statsFilename: + if zimit_args.zimit_progress_file: partial_zim = True elif ( crawl.returncode == EXIT_CODE_CRAWLER_TIME_LIMIT_HIT @@ -980,7 +1048,7 @@ def run(raw_args): logger.info( "Crawl time soft limit hit. Continuing with warc2zim conversion." ) - if zimit_args.statsFilename: + if zimit_args.zimit_progress_file: partial_zim = True elif crawl.returncode != 0: logger.error( @@ -1024,11 +1092,10 @@ def run(raw_args): warc2zim_exit_code = warc2zim(warc2zim_args) - if zimit_args.statsFilename: - stats = Path(zimit_args.statsFilename) - stats_content = json.loads(stats.read_bytes()) + if zimit_args.zimit_progress_file: + stats_content = json.loads(zimit_stats_file.read_bytes()) stats_content["partialZim"] = partial_zim - stats.write_text(json.dumps(stats_content)) + zimit_stats_file.write_text(json.dumps(stats_content)) # also call cancel_cleanup when --keep, even if it is not supposed to be registered, # so that we will display temporary files location just like in other situations From 96c4c3bdfd5af5a62cf458effe9b84dfb7caa7bd Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 13 Feb 2025 17:11:32 +0000 Subject: [PATCH 27/65] Clarify args variables/functions names --- src/zimit/zimit.py | 133 +++++++++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 65 deletions(-) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index abd2978..d298909 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -777,7 +777,9 @@ def run(raw_args): " used). 
Single value with individual error codes separated by comma", ) - zimit_args, warc2zim_args = parser.parse_known_args(raw_args) + # by design, all unknown args are for warc2zim ; known one are either for crawler + # or shared + known_args, warc2zim_args = parser.parse_known_args(raw_args) # pass a scraper suffix to warc2zim so that both zimit and warc2zim versions are # associated with the ZIM ; make it a CSV for easier parsing @@ -785,26 +787,26 @@ def run(raw_args): warc2zim_args.append(f"zimit {__version__}") # pass url and output to warc2zim also - if zimit_args.output: + if known_args.output: warc2zim_args.append("--output") - warc2zim_args.append(zimit_args.output) + warc2zim_args.append(known_args.output) - user_agent_suffix = zimit_args.userAgentSuffix - if zimit_args.adminEmail: - user_agent_suffix += f" {zimit_args.adminEmail}" + user_agent_suffix = known_args.userAgentSuffix + if known_args.adminEmail: + user_agent_suffix += f" {known_args.adminEmail}" # make temp dir for this crawl global temp_root_dir # noqa: PLW0603 - if zimit_args.build: - temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp")) + if known_args.build: + temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.build, prefix=".tmp")) else: - temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) + temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp")) seeds = [] - if zimit_args.seeds: - seeds += [get_cleaned_url(url) for url in zimit_args.seeds.split(",")] - if zimit_args.seedFile: - if re.match(r"^https?\://", zimit_args.seedFile): + if known_args.seeds: + seeds += [get_cleaned_url(url) for url in known_args.seeds.split(",")] + if known_args.seedFile: + if re.match(r"^https?\://", known_args.seedFile): with tempfile.NamedTemporaryFile( dir=temp_root_dir, prefix="seeds_", @@ -812,36 +814,36 @@ def run(raw_args): delete_on_close=True, ) as filename: seed_file = Path(filename.name) - download_file(zimit_args.seedFile, seed_file) + download_file(known_args.seedFile, seed_file) seeds += [ get_cleaned_url(url) for url in seed_file.read_text().splitlines() ] else: seeds += [ get_cleaned_url(url) - for url in Path(zimit_args.seedFile).read_text().splitlines() + for url in Path(known_args.seedFile).read_text().splitlines() ] warc2zim_args.append("--url") warc2zim_args.append(seeds[0]) - if zimit_args.custom_css: - warc2zim_args += ["--custom-css", zimit_args.custom_css] + if known_args.custom_css: + warc2zim_args += ["--custom-css", known_args.custom_css] - if zimit_args.title: + if known_args.title: warc2zim_args.append("--title") - warc2zim_args.append(zimit_args.title) + warc2zim_args.append(known_args.title) - if zimit_args.description: + if known_args.description: warc2zim_args.append("--description") - warc2zim_args.append(zimit_args.description) + warc2zim_args.append(known_args.description) - if zimit_args.long_description: + if known_args.long_description: warc2zim_args.append("--long-description") - warc2zim_args.append(zimit_args.long_description) + warc2zim_args.append(known_args.long_description) - if zimit_args.zim_lang: + if known_args.zim_lang: warc2zim_args.append("--lang") - warc2zim_args.append(zimit_args.zim_lang) + warc2zim_args.append(known_args.zim_lang) logger.info("----------") logger.info("Testing warc2zim args") @@ -851,16 +853,16 @@ def run(raw_args): logger.info("Exiting, invalid warc2zim params") return EXIT_CODE_WARC2ZIM_CHECK_FAILED - if not zimit_args.keep: + if not known_args.keep: atexit.register(cleanup) # copy / download 
custom behaviors to one single folder and configure crawler - if zimit_args.custom_behaviors: + if known_args.custom_behaviors: behaviors_dir = temp_root_dir / "custom-behaviors" behaviors_dir.mkdir() for custom_behavior in [ custom_behavior.strip() - for custom_behavior in zimit_args.custom_behaviors.split(",") + for custom_behavior in known_args.custom_behaviors.split(",") ]: behaviors_file = tempfile.NamedTemporaryFile( dir=behaviors_dir, @@ -880,25 +882,25 @@ def run(raw_args): f"to {behaviors_file.name}" ) shutil.copy(custom_behavior, behaviors_file.name) - zimit_args.customBehaviors = str(behaviors_dir) + known_args.customBehaviors = str(behaviors_dir) else: - zimit_args.customBehaviors = None + known_args.customBehaviors = None - cmd_args = get_node_cmd_line(zimit_args) + crawler_args = get_crawler_cmd_line(known_args) for seed in seeds: - cmd_args.append("--seeds") - cmd_args.append(seed) + crawler_args.append("--seeds") + crawler_args.append(seed) - cmd_args.append("--userAgentSuffix") - cmd_args.append(user_agent_suffix) + crawler_args.append("--userAgentSuffix") + crawler_args.append(user_agent_suffix) - cmd_args.append("--cwd") - cmd_args.append(str(temp_root_dir)) + crawler_args.append("--cwd") + crawler_args.append(str(temp_root_dir)) - output_dir = Path(zimit_args.output) + output_dir = Path(known_args.output) warc2zim_stats_file = ( - Path(zimit_args.warc2zim_progress_file) - if zimit_args.warc2zim_progress_file + Path(known_args.warc2zim_progress_file) + if known_args.warc2zim_progress_file else temp_root_dir / "warc2zim.json" ) if not warc2zim_stats_file.is_absolute(): @@ -907,8 +909,8 @@ def run(raw_args): warc2zim_stats_file.unlink(missing_ok=True) crawler_stats_file = ( - Path(zimit_args.statsFilename) - if zimit_args.statsFilename + Path(known_args.statsFilename) + if known_args.statsFilename else temp_root_dir / "crawl.json" ) if not crawler_stats_file.is_absolute(): @@ -917,8 +919,8 @@ def run(raw_args): crawler_stats_file.unlink(missing_ok=True) zimit_stats_file = ( - Path(zimit_args.zimit_progress_file) - if zimit_args.zimit_progress_file + Path(known_args.zimit_progress_file) + if known_args.zimit_progress_file else temp_root_dir / "stats.json" ) if not zimit_stats_file.is_absolute(): @@ -926,7 +928,7 @@ def run(raw_args): zimit_stats_file.parent.mkdir(parents=True, exist_ok=True) zimit_stats_file.unlink(missing_ok=True) - if zimit_args.zimit_progress_file: + if known_args.zimit_progress_file: # setup inotify crawler progress watcher watcher = ProgressFileWatcher( zimit_stats_path=zimit_stats_file, @@ -939,31 +941,31 @@ def run(raw_args): f"{watcher.warc2zim_stats_path}" ) # update crawler command - cmd_args.append("--statsFilename") - cmd_args.append(str(crawler_stats_file)) + crawler_args.append("--statsFilename") + crawler_args.append(str(crawler_stats_file)) # update warc2zim command warc2zim_args.append("-v") warc2zim_args.append("--progress-file") warc2zim_args.append(str(warc2zim_stats_file)) watcher.watch() else: - if zimit_args.statsFilename: + if known_args.statsFilename: logger.info(f"Writing crawler progress to {crawler_stats_file}") - cmd_args.append("--statsFilename") - cmd_args.append(str(crawler_stats_file)) - if zimit_args.warc2zim_progress_file: + crawler_args.append("--statsFilename") + crawler_args.append(str(crawler_stats_file)) + if known_args.warc2zim_progress_file: logger.info(f"Writing warc2zim progress to {warc2zim_stats_file}") warc2zim_args.append("-v") warc2zim_args.append("--progress-file") 
warc2zim_args.append(str(warc2zim_stats_file)) - cmd_line = " ".join(cmd_args) + cmd_line = " ".join(crawler_args) logger.info("") logger.info("----------") logger.info( f"Output to tempdir: {temp_root_dir} - " - f"{'will keep' if zimit_args.keep else 'will delete'}" + f"{'will keep' if known_args.keep else 'will delete'}" ) partial_zim = False @@ -971,9 +973,9 @@ def run(raw_args): # if warc files are passed, do not run browsertrix crawler but fetch the files if # they are provided as an HTTP URL + extract the archive if it is a tar.gz warc_files: list[Path] = [] - if zimit_args.warcs: + if known_args.warcs: for warc_location in [ - warc_location.strip() for warc_location in zimit_args.warcs.split(",") + warc_location.strip() for warc_location in known_args.warcs.split(",") ]: suffix = "".join(Path(urllib.parse.urlparse(warc_location).path).suffixes) if suffix not in {".tar", ".tar.gz", ".warc", ".warc.gz"}: @@ -1031,24 +1033,24 @@ def run(raw_args): else: logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") - crawl = subprocess.run(cmd_args, check=False) + crawl = subprocess.run(crawler_args, check=False) if ( crawl.returncode == EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT - and zimit_args.sizeSoftLimit + and known_args.sizeSoftLimit ): logger.info( "Crawl size soft limit hit. Continuing with warc2zim conversion." ) - if zimit_args.zimit_progress_file: + if known_args.zimit_progress_file: partial_zim = True elif ( crawl.returncode == EXIT_CODE_CRAWLER_TIME_LIMIT_HIT - and zimit_args.timeSoftLimit + and known_args.timeSoftLimit ): logger.info( "Crawl time soft limit hit. Continuing with warc2zim conversion." ) - if zimit_args.zimit_progress_file: + if known_args.zimit_progress_file: partial_zim = True elif crawl.returncode != 0: logger.error( @@ -1057,9 +1059,9 @@ def run(raw_args): cancel_cleanup() return crawl.returncode - if zimit_args.collection: + if known_args.collection: warc_files = [ - temp_root_dir.joinpath(f"collections/{zimit_args.collection}/archive/") + temp_root_dir.joinpath(f"collections/{known_args.collection}/archive/") ] else: @@ -1092,14 +1094,14 @@ def run(raw_args): warc2zim_exit_code = warc2zim(warc2zim_args) - if zimit_args.zimit_progress_file: + if known_args.zimit_progress_file: stats_content = json.loads(zimit_stats_file.read_bytes()) stats_content["partialZim"] = partial_zim zimit_stats_file.write_text(json.dumps(stats_content)) # also call cancel_cleanup when --keep, even if it is not supposed to be registered, # so that we will display temporary files location just like in other situations - if warc2zim_exit_code or zimit_args.keep: + if warc2zim_exit_code or known_args.keep: cancel_cleanup() return warc2zim_exit_code @@ -1117,7 +1119,8 @@ def get_cleaned_url(url: str): return parsed_url.geturl() -def get_node_cmd_line(args): +def get_crawler_cmd_line(args): + """Build the command line for Browsertrix crawler""" node_cmd = ["crawl"] for arg in [ "title", From 2f7a83e1872c3c81859d8e5157ffe5ff200cf0c9 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 14 Feb 2025 14:22:30 +0000 Subject: [PATCH 28/65] Fixes following review --- src/zimit/zimit.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index d298909..02b167d 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -155,7 +155,7 @@ def run(raw_args): parser.add_argument( "--crawlId", help="A user provided ID for this crawl or crawl configuration (can also be " - "set via CRAWL_ID env var, defaults to hostname)", + "set via CRAWL_ID env 
var, defaults to machine hostname)", ) parser.add_argument( @@ -167,7 +167,7 @@ def run(raw_args): parser.add_argument( "--depth", - help="The depth of the crawl for all seeds. Default is -1.", + help="The depth of the crawl for all seeds. Default is -1 (infinite).", type=int, ) @@ -388,7 +388,8 @@ def run(raw_args): " record(s)", ) - # cwd is not manipulable + # cwd is manipulated directly by zimit, based on --output / --build, we do not want + # to expose this setting parser.add_argument( "--mobileDevice", @@ -689,7 +690,7 @@ def run(raw_args): parser.add_argument( "--qaSource", - help="Required for QA mode. Source (WACZ or multi WACZ) for QA", + help="Required for QA mode. Path to the source WACZ or multi WACZ file for QA", ) parser.add_argument( From 3eb6c090465f38cbbcf078df81ce44717b54efa5 Mon Sep 17 00:00:00 2001 From: clach04 Date: Fri, 14 Feb 2025 22:02:17 -0800 Subject: [PATCH 29/65] Correct link in README.md Signed-off-by: clach04 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bc18dc6..894f523 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Zimit is a scraper allowing to create ZIM file from any Web site. Zimit adheres to openZIM's [Contribution Guidelines](https://github.com/openzim/overview/wiki/Contributing). -Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/docs/Policy.md) **v1.0.1**. +Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/blob/main/docs/Policy.md) **v1.0.1**. Capabilities and known limitations -------------------- From ee0f4c6cec3007760365ab619916624499596a03 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 17 Feb 2025 09:52:55 +0000 Subject: [PATCH 30/65] Use released warc2zim 2.2.2 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e4e7696..9aa830a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim @ git+https://github.com/openzim/warc2zim@main", + "warc2zim==2.2.2", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] From e3cd12b0d1ba19853f527f1edff5acc045c8fc40 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 17 Feb 2025 10:02:43 +0000 Subject: [PATCH 31/65] Release 3.0.0 --- CHANGELOG.md | 2 +- src/zimit/__about__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de71c25..0105b54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). 
-## [Unreleased] +## [3.0.0] - 2024-02-17 ### Changed diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index d733cff..528787c 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "3.0.0-dev0" +__version__ = "3.0.0" From bce22ceac11804144e223ed3fb63b5a4415838c6 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 17 Feb 2025 10:08:49 +0000 Subject: [PATCH 32/65] Prepare for 3.0.1 --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- src/zimit/__about__.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0105b54..dd9d4b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). +## [Unreleased] + ## [3.0.0] - 2024-02-17 ### Changed diff --git a/pyproject.toml b/pyproject.toml index 9aa830a..e4e7696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim==2.2.2", + "warc2zim @ git+https://github.com/openzim/warc2zim@main", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 528787c..038a5da 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "3.0.0" +__version__ = "3.0.1-dev0" From 1b5b9bb80b6a11aa26ae11f8d3fe5863e49a26b9 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Feb 2025 09:21:19 +0000 Subject: [PATCH 33/65] Upgrade to browsertrix crawler 1.5.4 --- CHANGELOG.md | 4 ++++ Dockerfile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd9d4b9..0335b39 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Upgrade to browsertrix crawler 1.5.4 (#476) + ## [3.0.0] - 2024-02-17 ### Changed diff --git a/Dockerfile b/Dockerfile index c37dfcc..672fbd8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.5.3 +FROM webrecorder/browsertrix-crawler:1.5.4 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit # add deadsnakes ppa for latest Python on Ubuntu From 5e53be6fa4cfa827acd05bcbfee376dd74d173c7 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Feb 2025 09:22:40 +0000 Subject: [PATCH 34/65] Pin warc2zim version in preparation for 3.0.1 release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e4e7696..9aa830a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim @ git+https://github.com/openzim/warc2zim@main", + "warc2zim==2.2.2", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] From dd659025562e4ce976c593e18254861f43f4bbfe Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Feb 2025 09:37:40 +0000 Subject: [PATCH 35/65] Release 3.0.1 --- CHANGELOG.md | 2 +- src/zimit/__about__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0335b39..c0c7d3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this 
project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). -## [Unreleased] +## [3.0.1] - 2024-02-24 ### Changed diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 038a5da..0552768 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "3.0.1-dev0" +__version__ = "3.0.1" From 363ff4076711e9b507698736599fed4b2bd79761 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Feb 2025 09:40:04 +0000 Subject: [PATCH 36/65] Prepare for 3.0.2 --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- src/zimit/__about__.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c0c7d3d..399ef49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). +## [Unreleased] + ## [3.0.1] - 2024-02-24 ### Changed diff --git a/pyproject.toml b/pyproject.toml index 9aa830a..e4e7696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim==2.2.2", + "warc2zim @ git+https://github.com/openzim/warc2zim@main", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 0552768..f50125d 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "3.0.1" +__version__ = "3.0.2-dev0" From 00f0e475ae903076d20788190262be2180c64c70 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 27 Feb 2025 07:31:28 +0000 Subject: [PATCH 37/65] Upgrade to browsertrix crawler 1.5.5 --- CHANGELOG.md | 4 ++++ Dockerfile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 399ef49..347200f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Upgrade to browsertrix crawler 1.5.5 (#480) + ## [3.0.1] - 2024-02-24 ### Changed diff --git a/Dockerfile b/Dockerfile index 672fbd8..dc09fe8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.5.4 +FROM webrecorder/browsertrix-crawler:1.5.5 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit # add deadsnakes ppa for latest Python on Ubuntu From eebc75f868af9f8d156442548b19630e42c0977f Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 27 Feb 2025 07:32:25 +0000 Subject: [PATCH 38/65] Pin warc2zim version in preparation for 3.0.2 release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e4e7696..9aa830a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim @ git+https://github.com/openzim/warc2zim@main", + "warc2zim==2.2.2", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] From 1287351c1d587b35b8810ae5edcd6fb7cb0e3309 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 27 Feb 2025 19:36:11 +0000 Subject: [PATCH 39/65] Upgrade to browsertrix crawler 1.5.6 --- 
CHANGELOG.md | 2 +-
 Dockerfile   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 347200f..1734ccc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Changed

-- Upgrade to browsertrix crawler 1.5.5 (#480)
+- Upgrade to browsertrix crawler 1.5.6 (#482)

 ## [3.0.1] - 2025-02-24

diff --git a/Dockerfile b/Dockerfile
index dc09fe8..923d499 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.5.5
+FROM webrecorder/browsertrix-crawler:1.5.6
 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit

 # add deadsnakes ppa for latest Python on Ubuntu
From 6ee053af5f6cfa1998647106ff72f6905337cb84 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Thu, 27 Feb 2025 19:58:51 +0000
Subject: [PATCH 40/65] Release 3.0.2

---
 CHANGELOG.md           | 2 +-
 src/zimit/__about__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1734ccc..d7f6c9b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).

-## [Unreleased]
+## [3.0.2] - 2025-02-27

 ### Changed

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index f50125d..131942e 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.2-dev0"
+__version__ = "3.0.2"

From 9fc66a95b7c905a1f5467a8d16b4add95ca83716 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Thu, 27 Feb 2025 20:03:37 +0000
Subject: [PATCH 41/65] Prepare for 3.0.3

---
 CHANGELOG.md           | 2 ++
 pyproject.toml         | 2 +-
 src/zimit/__about__.py | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d7f6c9b..0b36f38 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
+## [Unreleased]
+
 ## [3.0.2] - 2025-02-27

 ### Changed
diff --git a/pyproject.toml b/pyproject.toml
index 9aa830a..e4e7696 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
   "requests==2.32.3",
   "inotify==0.2.10",
   "tld==0.13",
-  "warc2zim==2.2.2",
+  "warc2zim @ git+https://github.com/openzim/warc2zim@main",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index 131942e..a1e7aaa 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.2"
+__version__ = "3.0.3-dev0"

From 88b85311e0be0c2180d927139ad3e6100e5fe96f Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 28 Feb 2025 06:13:45 +0000
Subject: [PATCH 42/65] Upgrade to browsertrix crawler 1.5.7

---
 CHANGELOG.md   | 4 ++++
 Dockerfile     | 2 +-
 pyproject.toml | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0b36f38..143c9d3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+### Changed
+
+- Upgrade to browsertrix crawler 1.5.7 (#483)
+
 ## [3.0.2] - 2025-02-27

 ### Changed
diff --git a/Dockerfile b/Dockerfile
index 923d499..c6e9bbb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.5.6
+FROM webrecorder/browsertrix-crawler:1.5.7
 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit

 # add deadsnakes ppa for latest Python on Ubuntu
diff --git a/pyproject.toml b/pyproject.toml
index e4e7696..9aa830a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
   "requests==2.32.3",
   "inotify==0.2.10",
   "tld==0.13",
-  "warc2zim @ git+https://github.com/openzim/warc2zim@main",
+  "warc2zim==2.2.2",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

From 1e6748ab69e1716953b95ae166570974079278ab Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 28 Feb 2025 06:21:27 +0000
Subject: [PATCH 43/65] Release 3.0.3

---
 CHANGELOG.md           | 2 +-
 src/zimit/__about__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 143c9d3..abd27a2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).

-## [Unreleased]
+## [3.0.3] - 2025-02-28

 ### Changed

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index a1e7aaa..8d1c862 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.3-dev0"
+__version__ = "3.0.3"

From 4e0174836d36aa2b1061fff179bdf032ee587d54 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 28 Feb 2025 06:38:52 +0000
Subject: [PATCH 44/65] Prepare for 3.0.4

---
 CHANGELOG.md           | 2 ++
 pyproject.toml         | 2 +-
 src/zimit/__about__.py | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index abd27a2..109319d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
+## [Unreleased]
+
 ## [3.0.3] - 2025-02-28

 ### Changed
diff --git a/pyproject.toml b/pyproject.toml
index 9aa830a..e4e7696 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
   "requests==2.32.3",
   "inotify==0.2.10",
   "tld==0.13",
-  "warc2zim==2.2.2",
+  "warc2zim @ git+https://github.com/openzim/warc2zim@main",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index 8d1c862..31a915c 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.3"
+__version__ = "3.0.4-dev0"

From 146af5de0ad22d1aa479a80786bc075b124122f2 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Wed, 2 Apr 2025 08:04:32 +0000
Subject: [PATCH 45/65] Upgrade to browsertrix crawler 1.5.10

---
 CHANGELOG.md | 4 ++++
 Dockerfile   | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 109319d..94e47e0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+### Changed
+
+- Upgrade to browsertrix crawler 1.5.10 (#491)
+
 ## [3.0.3] - 2025-02-28

 ### Changed
diff --git a/Dockerfile b/Dockerfile
index c6e9bbb..9860ccb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.5.7
+FROM webrecorder/browsertrix-crawler:1.5.9
 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit

 # add deadsnakes ppa for latest Python on Ubuntu
From 12fde3af9810d09425441df8f870965e16513034 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 4 Apr 2025 11:00:29 +0000
Subject: [PATCH 46/65] Release 3.0.4

---
 CHANGELOG.md           | 2 +-
 pyproject.toml         | 2 +-
 src/zimit/__about__.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 94e47e0..e188a13 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).

-## [Unreleased]
+## [3.0.4] - 2025-04-04

 ### Changed

diff --git a/pyproject.toml b/pyproject.toml
index e4e7696..9aa830a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
   "requests==2.32.3",
   "inotify==0.2.10",
   "tld==0.13",
-  "warc2zim @ git+https://github.com/openzim/warc2zim@main",
+  "warc2zim==2.2.2",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index 31a915c..8e10cb4 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.4-dev0"
+__version__ = "3.0.4"

From 3421ca02127aa6a09ff0ff5d4fc874e027a3a910 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 4 Apr 2025 11:09:50 +0000
Subject: [PATCH 47/65] Prepare for 3.0.5

---
 CHANGELOG.md           | 2 ++
 pyproject.toml         | 2 +-
 src/zimit/__about__.py | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e188a13..2464bbc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
+## [Unreleased]
+
 ## [3.0.4] - 2025-04-04

 ### Changed
diff --git a/pyproject.toml b/pyproject.toml
index 9aa830a..e4e7696 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
   "requests==2.32.3",
   "inotify==0.2.10",
   "tld==0.13",
-  "warc2zim==2.2.2",
+  "warc2zim @ git+https://github.com/openzim/warc2zim@main",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index 8e10cb4..dd14b28 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.4"
+__version__ = "3.0.5-dev0"

From 511c3a5021823e588408b280940f51f9f9ba6b4e Mon Sep 17 00:00:00 2001
From: orangetin
Date: Thu, 10 Apr 2025 17:52:19 -0700
Subject: [PATCH 48/65] Upgrade browsertrix-crawler to version 1.6.0 in Dockerfile

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 9860ccb..9666c0b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.5.9
+FROM webrecorder/browsertrix-crawler:1.6.0
 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit

 # add deadsnakes ppa for latest Python on Ubuntu
From b5d87198d85746988a23fdf3def66ca96982eb0c Mon Sep 17 00:00:00 2001
From: orangetin
Date: Thu, 10 Apr 2025 17:54:34 -0700
Subject: [PATCH 49/65] update changelog

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2464bbc..2512f06 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+### Changed
+
+- Upgrade to browsertrix crawler 1.6.0 (#493)
+
 ## [3.0.4] - 2025-04-04

 ### Changed
From 009b8b4bd66e9520e0797c01fd0406a8259eb66d Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 11 Apr 2025 07:18:18 +0000
Subject: [PATCH 50/65] Release 3.0.5

---
 CHANGELOG.md           | 2 +-
 pyproject.toml         | 2 +-
 src/zimit/__about__.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2512f06..bc99b8f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
-## [Unreleased]
+## [3.0.5] - 2025-04-11

 ### Changed

diff --git a/pyproject.toml b/pyproject.toml
index e4e7696..9aa830a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
   "requests==2.32.3",
   "inotify==0.2.10",
   "tld==0.13",
-  "warc2zim @ git+https://github.com/openzim/warc2zim@main",
+  "warc2zim==2.2.2",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index dd14b28..e94f36f 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.5-dev0"
+__version__ = "3.0.5"

From 8c471d9ee2269f22d63cfef383b4fba02241319b Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 11 Apr 2025 07:46:42 +0000
Subject: [PATCH 51/65] Prepare for 3.0.6

---
 CHANGELOG.md           | 2 ++
 pyproject.toml         | 2 +-
 src/zimit/__about__.py | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bc99b8f..5e06e20 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).

+## [Unreleased]
+
 ## [3.0.5] - 2025-04-11

 ### Changed
diff --git a/pyproject.toml b/pyproject.toml
index 9aa830a..e4e7696 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
   "requests==2.32.3",
   "inotify==0.2.10",
   "tld==0.13",
-  "warc2zim==2.2.2",
+  "warc2zim @ git+https://github.com/openzim/warc2zim@main",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index e94f36f..281b1bb 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.5"
+__version__ = "3.0.6-dev0"

From 5624cbf08142b321996bfd85ac9c12f1e52d2dae Mon Sep 17 00:00:00 2001
From: Uchechukwu Orji
Date: Tue, 7 Oct 2025 04:08:14 +0100
Subject: [PATCH 52/65] set up offliner definitions

---
 .../update-zim-offliner-definition.yaml       |  38 +
 offliner-definition.json                      | 973 ++++++++++++++++++
 2 files changed, 1011 insertions(+)
 create mode 100644 .github/workflows/update-zim-offliner-definition.yaml
 create mode 100644 offliner-definition.json

diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml
new file mode 100644
index 0000000..4662e62
--- /dev/null
+++ b/.github/workflows/update-zim-offliner-definition.yaml
@@ -0,0 +1,38 @@
+name: Update ZIMFarm Definitions
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - "offliner-definition.json"
+  release:
+    types: [published]
+
+jobs:
+  prepare-json:
+    runs-on: ubuntu-24.04
+    outputs:
+      offliner_definition: ${{ steps.read-json.outputs.offliner_definition }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - id: read-json
+        run: |
+          if [ ! -f "offliner-definition.json" ]; then
+            echo "File not found!" >&2
+            exit 1
+          fi
+          json=$(jq -c . 
offliner-definition.json)
          echo "offliner_definition=$json" >> $GITHUB_OUTPUT
  call-workflow:
    needs: prepare-json
    uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main
    with:
      version: ${{ github.event_name == 'release' && github.event.release.tag_name || 'dev' }}
      offliner: zimit
      offliner_definition: ${{ needs.prepare-json.outputs.offliner_definition }}
    secrets:
      zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }}
diff --git a/offliner-definition.json b/offliner-definition.json
new file mode 100644
index 0000000..c7fed57
--- /dev/null
+++ b/offliner-definition.json
@@ -0,0 +1,973 @@
+{
+  "offliner_id": "zimit",
+  "stdOutput": true,
+  "stdStats": "zimit-progress-file",
+  "flags": {
+    "seeds": {
+      "type": "string",
+      "required": false,
+      "title": "Seeds",
+      "description": "The seed URL(s) to start crawling from. Multiple seed URLs must be separated by a comma (usually not needed, these are just the crawl seeds). First seed URL is used as ZIM homepage"
+    },
+    "seed_file": {
+      "type": "string",
+      "required": false,
+      "title": "Seed File",
+      "description": "If set, read a list of seed urls, one per line. HTTPS URL to an online file."
+    },
+    "lang": {
+      "type": "string",
+      "required": false,
+      "title": "Browser Language",
+      "description": "If set, sets the language used by the browser, should be ISO 639 language[-country] code, e.g. `en` or `en-GB`"
+    },
+    "title": {
+      "type": "string",
+      "required": false,
+      "title": "Title",
+      "description": "Custom title for your ZIM. Defaults to title of main page",
+      "minLength": 1,
+      "maxLength": 30
+    },
+    "description": {
+      "type": "string",
+      "required": false,
+      "title": "Description",
+      "description": "Description for ZIM",
+      "minLength": 1,
+      "maxLength": 80
+    },
+    "favicon": {
+      "type": "url",
+      "required": false,
+      "title": "Illustration",
+      "description": "URL for Illustration. "
+    },
+    "tags": {
+      "type": "string",
+      "required": false,
+      "title": "ZIM Tags",
+      "description": "Single string with individual tags separated by a semicolon."
+    },
+    "creator": {
+      "type": "string",
+      "required": false,
+      "title": "Creator",
+      "description": "Name of content creator"
+    },
+    "publisher": {
+      "type": "string",
+      "required": false,
+      "title": "Publisher",
+      "isPublisher": true,
+      "description": "Custom publisher name (ZIM metadata). openZIM otherwise"
+    },
+    "source": {
+      "type": "string",
+      "required": false,
+      "title": "Source",
+      "description": "Source name/URL of content"
+    },
+    "workers": {
+      "type": "integer",
+      "required": false,
+      "title": "Workers",
+      "description": "The number of workers to run in parallel. Defaults to 1",
+      "min": 1
+    },
+    "wait_until": {
+      "type": "string",
+      "required": false,
+      "title": "WaitUntil",
+      "description": "Puppeteer page.goto() condition to wait for before continuing. One of load, domcontentloaded, networkidle0 or networkidle2, or a comma-separated combination of those. Default is load,networkidle2"
+    },
+    "extra_hops": {
+      "type": "integer",
+      "required": false,
+      "title": "Extra Hops",
+      "description": "Number of extra 'hops' to follow, beyond the current scope. Default is 0",
+      "min": 0
+    },
+    "page_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Page Limit",
+      "description": "Limit crawl to this number of pages. Default is 0 (no-limit).",
+      "min": 0
+    },
+    "max_page_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Max Page Limit",
+      "description": "Maximum pages to crawl, overriding pageLimit if both are set. 
Default is 0 (no-limit)",
+      "min": 0
+    },
+    "page_load_timeout": {
+      "type": "integer",
+      "required": false,
+      "title": "Page Load Timeout",
+      "description": "Timeout for each page to load (in seconds). Default is 90",
+      "min": 0
+    },
+    "scope_type": {
+      "type": "string-enum",
+      "required": false,
+      "title": "Scope Type",
+      "description": "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom if scopeIncludeRx is set, prefix otherwise.",
+      "choices": [
+        {
+          "title": "Page",
+          "value": "page"
+        },
+        {
+          "title": "Page SPA",
+          "value": "page-spa"
+        },
+        {
+          "title": "Prefix",
+          "value": "prefix"
+        },
+        {
+          "title": "Host",
+          "value": "host"
+        },
+        {
+          "title": "Domain",
+          "value": "domain"
+        },
+        {
+          "title": "Any",
+          "value": "any"
+        },
+        {
+          "title": "Custom",
+          "value": "custom"
+        }
+      ]
+    },
+    "scope_include_rx": {
+      "type": "string",
+      "required": false,
+      "title": "Scope Include Regex",
+      "description": "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of seed)"
+    },
+    "scope_exclude_rx": {
+      "type": "string",
+      "required": false,
+      "title": "Scope Exclude Regex",
+      "description": "Regex of page URLs that should be excluded from the crawl"
+    },
+    "allow_hash_urls": {
+      "type": "boolean",
+      "required": false,
+      "title": "Allow Hashtag URLs",
+      "description": "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content"
+    },
+    "mobile_device": {
+      "type": "string-enum",
+      "required": false,
+      "title": "As device",
+      "description": "Device to crawl as. See Puppeteer's Device.ts for a list",
+      "choices": [
+        {
+          "title": "Blackberry Playbook",
+          "value": "Blackberry PlayBook"
+        },
+        {
+          "title": "Blackberry Playbook Landscape",
+          "value": "Blackberry PlayBook landscape"
+        },
+        {
+          "title": "Blackberry Z30",
+          "value": "BlackBerry Z30"
+        },
+        {
+          "title": "Blackberry Z30 Landscape",
+          "value": "BlackBerry Z30 landscape"
+        },
+        {
+          "title": "Galaxy Note 3",
+          "value": "Galaxy Note 3"
+        },
+        {
+          "title": "Galaxy Note 3 Landscape",
+          "value": "Galaxy Note 3 landscape"
+        },
+        {
+          "title": "Galaxy Note II",
+          "value": "Galaxy Note II"
+        },
+        {
+          "title": "Galaxy Note II Landscape",
+          "value": "Galaxy Note II landscape"
+        },
+        {
+          "title": "Galaxy S III",
+          "value": "Galaxy S III"
+        },
+        {
+          "title": "Galaxy S III Landscape",
+          "value": "Galaxy S III landscape"
+        },
+        {
+          "title": "Galaxy S5",
+          "value": "Galaxy S5"
+        },
+        {
+          "title": "Galaxy S5 Landscape",
+          "value": "Galaxy S5 landscape"
+        },
+        {
+          "title": "Galaxy S8",
+          "value": "Galaxy S8"
+        },
+        {
+          "title": "Galaxy S8 Landscape",
+          "value": "Galaxy S8 landscape"
+        },
+        {
+          "title": "Galaxy S9 Plus",
+          "value": "Galaxy S9+"
+        },
+        {
+          "title": "Galaxy S9 Plus Landscape",
+          "value": "Galaxy S9+ landscape"
+        },
+        {
+          "title": "Galaxy Tab S4",
+          "value": "Galaxy Tab S4"
+        },
+        {
+          "title": "Galaxy Tab S4 Landscape",
+          "value": "Galaxy Tab S4 landscape"
+        },
+        {
+          "title": "iPad",
+          "value": "iPad"
+        },
+        {
+          "title": "iPad Landscape",
+          "value": "iPad landscape"
+        },
+        {
+          "title": "iPad Gen 6",
+          "value": "iPad (gen 6)"
+        },
+        {
+          "title": "iPad Gen 6 Landscape",
+          "value": "iPad (gen 6) landscape"
+        },
+        {
+          "title": "iPad Gen 7",
+          "value": "iPad (gen 7)"
+        },
+        {
+          "title": "iPad Gen 7 Landscape",
+          "value": "iPad (gen 7) landscape"
+        },
+        {
+          "title": "iPad Mini",
+          "value": "iPad Mini"
+        },
+        {
+          "title": "iPad Mini Landscape",
+          "value": "iPad Mini landscape"
+        },
+ { + "title": "iPad Pro", + "value": "iPad Pro" + }, + { + "title": "iPad Pro Landscape", + "value": "iPad Pro landscape" + }, + { + "title": "iPad Pro 11", + "value": "iPad Pro 11" + }, + { + "title": "iPad Pro 11 Landscape", + "value": "iPad Pro 11 landscape" + }, + { + "title": "iPhone 4", + "value": "iPhone 4" + }, + { + "title": "iPhone 4 Landscape", + "value": "iPhone 4 landscape" + }, + { + "title": "iPhone 5", + "value": "iPhone 5" + }, + { + "title": "iPhone 5 Landscape", + "value": "iPhone 5 landscape" + }, + { + "title": "iPhone 6", + "value": "iPhone 6" + }, + { + "title": "iPhone 6 Landscape", + "value": "iPhone 6 landscape" + }, + { + "title": "iPhone 6 Plus", + "value": "iPhone 6 Plus" + }, + { + "title": "iPhone 6 Plus Landscape", + "value": "iPhone 6 Plus landscape" + }, + { + "title": "iPhone 7", + "value": "iPhone 7" + }, + { + "title": "iPhone 7 Landscape", + "value": "iPhone 7 landscape" + }, + { + "title": "iPhone 7 Plus", + "value": "iPhone 7 Plus" + }, + { + "title": "iPhone 7 Plus Landscape", + "value": "iPhone 7 Plus landscape" + }, + { + "title": "iPhone 8", + "value": "iPhone 8" + }, + { + "title": "iPhone 8 Landscape", + "value": "iPhone 8 landscape" + }, + { + "title": "iPhone 8 Plus", + "value": "iPhone 8 Plus" + }, + { + "title": "iPhone 8 Plus Landscape", + "value": "iPhone 8 Plus landscape" + }, + { + "title": "iPhone SE", + "value": "iPhone SE" + }, + { + "title": "iPhone SE Landscape", + "value": "iPhone SE landscape" + }, + { + "title": "iPhone X", + "value": "iPhone X" + }, + { + "title": "iPhone X Landscape", + "value": "iPhone X landscape" + }, + { + "title": "iPhone XR", + "value": "iPhone XR" + }, + { + "title": "iPhone XR Landscape", + "value": "iPhone XR landscape" + }, + { + "title": "iPhone 11", + "value": "iPhone 11" + }, + { + "title": "iPhone 11 Landscape", + "value": "iPhone 11 landscape" + }, + { + "title": "iPhone 11 Pro", + "value": "iPhone 11 Pro" + }, + { + "title": "iPhone 11 Pro Landscape", + "value": "iPhone 11 Pro landscape" + }, + { + "title": "iPhone 11 Pro Max", + "value": "iPhone 11 Pro Max" + }, + { + "title": "iPhone 11 Pro Max Landscape", + "value": "iPhone 11 Pro Max landscape" + }, + { + "title": "iPhone 12", + "value": "iPhone 12" + }, + { + "title": "iPhone 12 Landscape", + "value": "iPhone 12 landscape" + }, + { + "title": "iPhone 12 Pro", + "value": "iPhone 12 Pro" + }, + { + "title": "iPhone 12 Pro Landscape", + "value": "iPhone 12 Pro landscape" + }, + { + "title": "iPhone 12 Pro Max", + "value": "iPhone 12 Pro Max" + }, + { + "title": "iPhone 12 Pro Max Landscape", + "value": "iPhone 12 Pro Max landscape" + }, + { + "title": "iPhone 12 Mini", + "value": "iPhone 12 Mini" + }, + { + "title": "iPhone 12 Mini Landscape", + "value": "iPhone 12 Mini landscape" + }, + { + "title": "iPhone 13", + "value": "iPhone 13" + }, + { + "title": "iPhone 13 Landscape", + "value": "iPhone 13 landscape" + }, + { + "title": "iPhone 13 Pro", + "value": "iPhone 13 Pro" + }, + { + "title": "iPhone 13 Pro Landscape", + "value": "iPhone 13 Pro landscape" + }, + { + "title": "iPhone 13 Pro Max", + "value": "iPhone 13 Pro Max" + }, + { + "title": "iPhone 13 Pro Max Landscape", + "value": "iPhone 13 Pro Max landscape" + }, + { + "title": "iPhone 13 Mini", + "value": "iPhone 13 Mini" + }, + { + "title": "iPhone 13 Mini Landscape", + "value": "iPhone 13 Mini landscape" + }, + { + "title": "Jio Phone 2", + "value": "JioPhone 2" + }, + { + "title": "Jio Phone 2 Landscape", + "value": "JioPhone 2 landscape" + }, + { + "title": "Kindle Fire HDX", + 
"value": "Kindle Fire HDX" + }, + { + "title": "Kindle Fire HDX Landscape", + "value": "Kindle Fire HDX landscape" + }, + { + "title": "LG Optimus L70", + "value": "LG Optimus L70" + }, + { + "title": "LG Optimus L70 Landscape", + "value": "LG Optimus L70 landscape" + }, + { + "title": "Microsoft Lumia 550", + "value": "Microsoft Lumia 550" + }, + { + "title": "Microsoft Lumia 950", + "value": "Microsoft Lumia 950" + }, + { + "title": "Microsoft Lumia 950 Landscape", + "value": "Microsoft Lumia 950 landscape" + }, + { + "title": "Nexus 10", + "value": "Nexus 10" + }, + { + "title": "Nexus 10 Landscape", + "value": "Nexus 10 landscape" + }, + { + "title": "Nexus 4", + "value": "Nexus 4" + }, + { + "title": "Nexus 4 Landscape", + "value": "Nexus 4 landscape" + }, + { + "title": "Nexus 5", + "value": "Nexus 5" + }, + { + "title": "Nexus 5 Landscape", + "value": "Nexus 5 landscape" + }, + { + "title": "Nexus 5X", + "value": "Nexus 5X" + }, + { + "title": "Nexus 5X Landscape", + "value": "Nexus 5X landscape" + }, + { + "title": "Nexus 6", + "value": "Nexus 6" + }, + { + "title": "Nexus 6 Landscape", + "value": "Nexus 6 landscape" + }, + { + "title": "Nexus 6P", + "value": "Nexus 6P" + }, + { + "title": "Nexus 6P Landscape", + "value": "Nexus 6P landscape" + }, + { + "title": "Nexus 7", + "value": "Nexus 7" + }, + { + "title": "Nexus 7 Landscape", + "value": "Nexus 7 landscape" + }, + { + "title": "Nokia Lumia 520", + "value": "Nokia Lumia 520" + }, + { + "title": "Nokia Lumia 520 Landscape", + "value": "Nokia Lumia 520 landscape" + }, + { + "title": "Nokia N9", + "value": "Nokia N9" + }, + { + "title": "Nokia N9 Landscape", + "value": "Nokia N9 landscape" + }, + { + "title": "Pixel 2", + "value": "Pixel 2" + }, + { + "title": "Pixel 2 Landscape", + "value": "Pixel 2 landscape" + }, + { + "title": "Pixel 2 XL", + "value": "Pixel 2 XL" + }, + { + "title": "Pixel 2 XL Landscape", + "value": "Pixel 2 XL landscape" + }, + { + "title": "Pixel 3", + "value": "Pixel 3" + }, + { + "title": "Pixel 3 Landscape", + "value": "Pixel 3 landscape" + }, + { + "title": "Pixel 4", + "value": "Pixel 4" + }, + { + "title": "Pixel 4 Landscape", + "value": "Pixel 4 landscape" + }, + { + "title": "Pixel 4A 5G", + "value": "Pixel 4a (5G)" + }, + { + "title": "Pixel 4A 5G Landscape", + "value": "Pixel 4a (5G) landscape" + }, + { + "title": "Pixel 5", + "value": "Pixel 5" + }, + { + "title": "Pixel 5 Landscape", + "value": "Pixel 5 landscape" + }, + { + "title": "Moto G4", + "value": "Moto G4" + }, + { + "title": "Moto G4 Landscape", + "value": "Moto G4 landscape" + } + ] + }, + "select_links": { + "type": "string", + "required": false, + "title": "Select Links", + "description": "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]" + }, + "click_selector": { + "type": "string", + "required": false, + "title": "Click Selector", + "description": "Selector for elements to click when using the autoclick behavior. 
Default is 'a'" + }, + "block_rules": { + "type": "string", + "required": false, + "title": "Block Rules", + "description": "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe" + }, + "block_message": { + "type": "string", + "required": false, + "title": "Block Message", + "description": "If specified, when a URL is blocked, a record with this error message is added instead" + }, + "block_ads": { + "type": "boolean", + "required": false, + "title": "Block Ads", + "description": "If set, block advertisements from being loaded (based on Stephen Black's blocklist). Note that some bad domains are also blocked by zimit configuration even if this option is not set." + }, + "ad_block_message": { + "type": "string", + "required": false, + "title": "Ads Block Message", + "description": "If specified, when an ad is blocked, a record with this error message is added instead" + }, + "user_agent": { + "type": "string", + "required": false, + "title": "User Agent", + "description": "Override user-agent with specified" + }, + "user_agent_suffix": { + "type": "string", + "required": false, + "title": "User Agent Suffix", + "description": "Append suffix to existing browser user-agent. Defaults to +Zimit" + }, + "use_sitemap": { + "type": "string", + "required": false, + "title": "Sitemap URL", + "description": "Use as sitemap to get additional URLs for the crawl (usually at /sitemap.xml)" + }, + "sitemap_from_date": { + "type": "string", + "required": false, + "title": "Sitemap From Date", + "description": "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)" + }, + "sitemap_to_date": { + "type": "string", + "required": false, + "title": "Sitemap To Date", + "description": "If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)" + }, + "behavior_timeout": { + "type": "integer", + "required": false, + "title": "Behavior Timeout", + "description": "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish. Default is 90.", + "min": 0 + }, + "post_load_delay": { + "type": "integer", + "required": false, + "title": "Post Load Delay", + "description": "If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors. Default is 0.", + "min": 0 + }, + "page_extra_delay": { + "type": "integer", + "required": false, + "title": "Page Extra Delay", + "description": "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page. Default is 0.", + "min": 0 + }, + "dedup_policy": { + "type": "string-enum", + "required": false, + "title": "Dedup Policy", + "description": "Deduplication policy. One of skip, revisit or keep. Default is skip", + "choices": [ + { + "title": "Skip", + "value": "skip" + }, + { + "title": "Revisit", + "value": "revisit" + }, + { + "title": "Keep", + "value": "keep" + } + ] + }, + "screenshot": { + "type": "string", + "required": false, + "title": "Screenshot", + "description": "Screenshot options for crawler. One of view, thumbnail, fullPage, fullPageFinal or a comma-separated combination of those." + }, + "size_soft_limit": { + "type": "integer", + "required": false, + "title": "Size Soft Limit", + "description": "If set, save crawl state and stop crawl if WARC size exceeds this value. 
ZIM will still be created.",
+      "min": 0
+    },
+    "size_hard_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Size Hard Limit",
+      "description": "If set, exit crawler and fail the scraper immediately if WARC size exceeds this value",
+      "min": 0
+    },
+    "disk_utilization": {
+      "type": "integer",
+      "required": false,
+      "title": "Disk Utilization",
+      "description": "Save state and exit if disk utilization exceeds this percentage value. Default (if not set) is 90%. Set to 0 to disable disk utilization check.",
+      "min": 0
+    },
+    "time_soft_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Time Soft Limit",
+      "description": "If set, save crawl state and stop crawl if WARC(s) creation takes longer than this value, in seconds. ZIM will still be created.",
+      "min": 0
+    },
+    "time_hard_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Time Hard Limit",
+      "description": "If set, exit crawler and fail the scraper immediately if WARC(s) creation takes longer than this value, in seconds",
+      "min": 0
+    },
+    "net_idle_wait": {
+      "type": "integer",
+      "required": false,
+      "title": "Net Idle Wait",
+      "description": "If set, wait for network idle after page load and after behaviors are done (in seconds). If -1 (default), determine based on scope."
+    },
+    "origin_override": {
+      "type": "string",
+      "required": false,
+      "title": "Origin Override",
+      "description": "If set, will redirect requests from each origin in key to origin in the value, eg. https://host:port=http://alt-host:alt-port."
+    },
+    "max_page_retries": {
+      "type": "integer",
+      "required": false,
+      "title": "Max Page Retries",
+      "description": "If set, number of times to retry a page that failed to load before page is considered to have failed. Default is 2.",
+      "min": 0
+    },
+    "fail_on_failed_seed": {
+      "type": "boolean",
+      "required": false,
+      "title": "Fail on failed seed",
+      "description": "If set, mark the crawl as failed if any seed fails to load"
+    },
+    "fail_on_invalid_status": {
+      "type": "boolean",
+      "required": false,
+      "title": "Fail on invalid status",
+      "description": "If set, will treat pages with 4xx or 5xx response as failures. When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses"
+    },
+    "fail_on_failed_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Fail on failed - Limit",
+      "description": "If set, save state and exit if number of failed pages exceeds this value.",
+      "min": 0
+    },
+    "warcs": {
+      "type": "string",
+      "required": false,
+      "title": "WARC files",
+      "description": "Comma-separated list of WARC files to use as input."
+    },
+    "verbose": {
+      "type": "boolean",
+      "required": false,
+      "title": "Verbose mode",
+      "description": "Whether to display additional logs"
+    },
+    "keep": {
+      "type": "boolean",
+      "required": false,
+      "title": "Keep",
+      "description": "Should be True. Developer option: must be True if we want to keep the WARC files for artifacts archiving.",
+      "default": true
+    },
+    "output": {
+      "type": "string",
+      "required": false,
+      "title": "Output folder",
+      "description": "Output folder for ZIM file(s). 
Leave it as `/output`", + "pattern": "^/output$" + }, + "admin_email": { + "type": "email", + "required": false, + "title": "Admin Email", + "description": "Admin Email for crawler: used in UserAgent so website admin can contact us", + "default": "contact+zimfarm@kiwix.org" + }, + "profile": { + "type": "string", + "required": false, + "title": "Browser profile", + "description": "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory for Browsertrix crawler." + }, + "behaviors": { + "type": "string", + "required": false, + "title": "Behaviors", + "description": "Which background behaviors to enable on each page. Defaults to autoplay,autofetch,siteSpecific." + }, + "depth": { + "type": "integer", + "required": false, + "title": "Depth", + "description": "The depth of the crawl for all seeds. Default is -1 (infinite).", + "min": -1 + }, + "zim_lang": { + "type": "string", + "required": false, + "title": "ZIM Language", + "description": "Language metadata of ZIM (warc2zim --lang param). ISO-639-3 code. Retrieved from homepage if found, fallback to `eng`", + "alias": "zim-lang", + "customValidator": "language_code" + }, + "long_description": { + "type": "string", + "required": false, + "title": "Long description", + "description": "Optional long description for your ZIM", + "minLength": 1, + "maxLength": 4000, + "alias": "long-description" + }, + "custom_css": { + "type": "url", + "required": false, + "title": "Custom CSS", + "description": "URL to a CSS file to inject into pages", + "alias": "custom-css" + }, + "charsets_to_try": { + "type": "string", + "required": false, + "title": "Charsets to try", + "description": "List of charsets to try decode content when charset is not found", + "alias": "charsets-to-try" + }, + "ignore_content_header_charsets": { + "type": "boolean", + "required": false, + "title": "Ignore Content Header Charsets", + "description": "Ignore the charsets specified in content headers - first bytes - typically because they are wrong.", + "alias": "ignore-content-header-charsets" + }, + "content_header_bytes_length": { + "type": "integer", + "required": false, + "title": "Content Header Bytes Length", + "description": "How many bytes to consider when searching for content charsets in header (default is 1024).", + "alias": "content-header-bytes-length", + "min": 0 + }, + "ignore_http_header_charsets": { + "type": "boolean", + "required": false, + "title": "Ignore HTTP Header Charsets", + "description": "Ignore the charsets specified in HTTP `Content-Type` headers, typically because they are wrong.", + "alias": "ignore-http-header-charsets" + }, + "encoding_aliases": { + "type": "string", + "required": false, + "title": "Encoding Aliases", + "description": "List of encoding/charset aliases to decode WARC content. Aliases are used when the encoding specified in upstream server exists in Python under a different name. This parameter is single string, multiple values are separated by a comma, like in alias1=encoding1,alias2=encoding2.", + "alias": "encoding-aliases" + }, + "custom_behaviors": { + "type": "string", + "required": false, + "title": "Custom Behaviors", + "description": "JS code for custom behaviors to customize crawler. Single string with individual JS files URL/path separated by a comma.", + "alias": "custom-behaviours" + }, + "zimit_progress_file": { + "type": "string", + "required": false, + "title": "Zimit Progress File", + "description": "Scraping progress file. 
Leave it as `/output/task_progress.json`", + "alias": "zimit-progress-file", + "pattern": "^/output/task_progress\\.json$" + }, + "replay_viewer_source": { + "type": "url", + "required": false, + "title": "Replay Viewer Source", + "description": "URL from which to load the ReplayWeb.page replay viewer from", + "alias": "replay-viewer-source" + }, + "zim_file": { + "type": "string", + "required": false, + "title": "ZIM filename", + "description": "ZIM file name (based on --name if not provided). Include {period} to insert date period dynamically", + "alias": "zim-file", + "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+_)([a-z0-9\\-\\.]+_|)([\\d]{4}-[\\d]{2}|\\{period\\}).zim$", + "relaxedPattern": "^[A-Za-z0-9._-]+$" + }, + "name": { + "type": "string", + "required": true, + "title": "ZIM name", + "description": "Name of the ZIM.", + "alias": "name", + "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$", + "relaxedPattern": "^[A-Za-z0-9._-]+$" + } + } +} From 4ec47cd6dd7c8c69fa390f69c488fbc5ff9d1966 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Wed, 8 Oct 2025 04:25:12 +0100 Subject: [PATCH 53/65] use base64 string as argument to workflow call --- .github/workflows/update-zim-offliner-definition.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml index 4662e62..ee26474 100644 --- a/.github/workflows/update-zim-offliner-definition.yaml +++ b/.github/workflows/update-zim-offliner-definition.yaml @@ -25,14 +25,14 @@ jobs: echo "File not found!" >&2 exit 1 fi - json=$(jq -c . offliner-definition.json) - echo "offliner_definition=$json" >> $GITHUB_OUTPUT + json_b64=$(base64 -w0 <<< "$(jq -c . 
offliner-definition.json)") + echo "offliner_definition_b64=$json_b64" >> $GITHUB_OUTPUT call-workflow: needs: prepare-json uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main with: version: ${{ github.event_name == 'release' && github.event.release.tag_name || 'dev' }} offliner: zimit - offliner_definition: ${{ needs.prepare-json.outputs.offliner_definition }} + offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }} secrets: zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }} From ad09665c4a93a503b0394f50a3835f69e6b6c6e5 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Fri, 10 Oct 2025 10:22:29 +0100 Subject: [PATCH 54/65] add workflow dispatch to update-offliner ci --- .github/workflows/update-zim-offliner-definition.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml index ee26474..982fe03 100644 --- a/.github/workflows/update-zim-offliner-definition.yaml +++ b/.github/workflows/update-zim-offliner-definition.yaml @@ -8,6 +8,13 @@ on: release: types: [published] + workflow_dispatch: + inputs: + version: + description: "Version to publish (leave blank to use 'dev')" + required: false + default: "dev" + jobs: prepare-json: runs-on: ubuntu-24.04 @@ -31,7 +38,7 @@ jobs: needs: prepare-json uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main with: - version: ${{ github.event_name == 'release' && github.event.release.tag_name || 'dev' }} + version: ${{ github.event_name == 'release' && github.event.release.tag_name || (github.event.inputs.version || 'dev') }} offliner: zimit offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }} secrets: From a9805c84c284fc23f0e6497b79cb42e53e4adb28 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Fri, 10 Oct 2025 10:34:26 +0100 Subject: [PATCH 55/65] set proper outputs name --- .github/workflows/update-zim-offliner-definition.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml index 982fe03..f481354 100644 --- a/.github/workflows/update-zim-offliner-definition.yaml +++ b/.github/workflows/update-zim-offliner-definition.yaml @@ -11,7 +11,7 @@ on: workflow_dispatch: inputs: version: - description: "Version to publish (leave blank to use 'dev')" + description: "Version to publish" required: false default: "dev" @@ -19,7 +19,7 @@ jobs: prepare-json: runs-on: ubuntu-24.04 outputs: - offliner_definition: ${{ steps.read-json.outputs.offliner_definition }} + offliner_definition_b64: ${{ steps.read-json.outputs.offliner_definition_b64 }} steps: - name: Checkout repository uses: actions/checkout@v4 From 44cf4218cb1940b4fd0cfa45032da1b8d3fdf130 Mon Sep 17 00:00:00 2001 From: Vitaly Zdanevich Date: Mon, 20 Oct 2025 01:22:31 +0400 Subject: [PATCH 56/65] README.md: add link to https://en.wikipedia.org/wiki/ZIM_(file_format) Signed-off-by: Vitaly Zdanevich --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 894f523..188615f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ Zimit ===== -Zimit is a scraper allowing to create ZIM file from any Web site. +Zimit is a scraper allowing to create [ZIM file](https://en.wikipedia.org/wiki/ZIM_(file_format)) from any Web site. 
[![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit)
 [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)

From 611d2033f7500a117aed9069fcad0abd7384b14c Mon Sep 17 00:00:00 2001
From: Chris Routh
Date: Thu, 6 Nov 2025 09:29:15 -0800
Subject: [PATCH 57/65] Issue #499 - Use build dir rather than random tmp dir
 when passed.

---
 src/zimit/zimit.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index 02b167d..30c5de0 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -796,11 +796,13 @@ def run(raw_args):
     if known_args.adminEmail:
         user_agent_suffix += f" {known_args.adminEmail}"

-    # make temp dir for this crawl
+    # set temp dir to use for this crawl
     global temp_root_dir  # noqa: PLW0603
     if known_args.build:
-        temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.build, prefix=".tmp"))
+        # use build dir argument if passed
+        temp_root_dir = Path(known_args.build)
     else:
+        # make new randomized temp dir
         temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp"))

     seeds = []
From b5d87198d85746988a23fdf3def66ca96982eb0c Mon Sep 17 00:00:00 2001
From: Chris Routh
Date: Thu, 6 Nov 2025 09:36:47 -0800
Subject: [PATCH 58/65] Issue #499 - Only register cleanup if neither build or
 keep arguments have been passed.

---
 src/zimit/zimit.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index 30c5de0..9ed8a20 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -856,7 +856,8 @@ def run(raw_args):
         logger.info("Exiting, invalid warc2zim params")
         return EXIT_CODE_WARC2ZIM_CHECK_FAILED

-    if not known_args.keep:
+    # only trigger cleanup when neither the build nor the keep argument is passed.
+    if not known_args.build and not known_args.keep:
         atexit.register(cleanup)

     # copy / download custom behaviors to one single folder and configure crawler
From 57a88434e22517f6cffb63070d7852b11ad2d7b8 Mon Sep 17 00:00:00 2001
From: Chris Routh
Date: Thu, 6 Nov 2025 11:49:58 -0800
Subject: [PATCH 59/65] Issue #499 - Use all warc_directories found when no
 specific collection has been passed.

---
 src/zimit/zimit.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index 9ed8a20..a91c4e4 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -1079,12 +1079,11 @@ def run(raw_args):
             )
         elif len(warc_dirs) > 1:
             logger.info(
-                "Found many WARC files directories, only most recently modified one"
-                " will be used"
+                "Found many WARC files directories, combining pages from all of them"
             )
             for directory in warc_dirs:
                 logger.info(f"- {directory}")
-            warc_files = [warc_dirs[-1]]
+            warc_files = warc_dirs

     logger.info("")
     logger.info("----------")
From 6db73a0a83f6d3b028175ba3a918a75493340f70 Mon Sep 17 00:00:00 2001
From: Chris Routh
Date: Thu, 6 Nov 2025 12:19:28 -0800
Subject: [PATCH 60/65] Issue #499 - Ensure build directory exists when passed.
 
---
 src/zimit/zimit.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index a91c4e4..fb070a0 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -801,6 +801,7 @@ def run(raw_args):
     if known_args.build:
         # use build dir argument if passed
         temp_root_dir = Path(known_args.build)
+        temp_root_dir.mkdir(parents=True, exist_ok=True)
     else:
         # make new randomized temp dir
         temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp"))

From ef004f38635a2b1db1d1385212f913f04ee659e4 Mon Sep 17 00:00:00 2001
From: Chris Routh
Date: Fri, 7 Nov 2025 11:33:01 -0800
Subject: [PATCH 61/65] Issue #499 Record changes in CHANGELOG

---
 CHANGELOG.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5e06e20..58fb40a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+### Changed
+- Fix issues preventing interrupted crawls from being resumed. (#499)
+  - Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist.
+  - Use all warc_dirs found instead of just the latest so interrupted crawls use all collected pages across runs when an explicit collections directory is not passed.
+  - Don't cleanup an explicitly passed build directory.
+
 ## [3.0.5] - 2025-04-11

 ### Changed
From e30a82a91c4e75de290e04b6d9b56aa9d5832799 Mon Sep 17 00:00:00 2001
From: Chris Routh
Date: Fri, 7 Nov 2025 12:59:25 -0800
Subject: [PATCH 62/65] PR #524 Fix line length.

---
 src/zimit/zimit.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index fb070a0..e982cbd 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -1080,7 +1080,8 @@ def run(raw_args):
             )
         elif len(warc_dirs) > 1:
             logger.info(
-                "Found many WARC files directories, combining pages from all of them"
+                "Found many WARC files directories, combining pages from all "
+                "of them"
             )
             for directory in warc_dirs:
                 logger.info(f"- {directory}")
From aec19d95d2257f72445746f92759e9b88574a31a Mon Sep 17 00:00:00 2001
From: Uchechukwu Orji
Date: Mon, 15 Dec 2025 14:25:24 +0100
Subject: [PATCH 63/65] migrate custom_css and favicon flags to blob types

---
 offliner-definition.json | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/offliner-definition.json b/offliner-definition.json
index c7fed57..89bdd51 100644
--- a/offliner-definition.json
+++ b/offliner-definition.json
@@ -38,7 +38,8 @@
       "maxLength": 80
     },
     "favicon": {
-      "type": "url",
+      "type": "blob",
+      "kind": "image",
       "required": false,
       "title": "Illustration",
       "description": "URL for Illustration. 
" @@ -887,7 +888,8 @@ "alias": "long-description" }, "custom_css": { - "type": "url", + "type": "blob", + "kind": "image", "required": false, "title": "Custom CSS", "description": "URL to a CSS file to inject into pages", From 34ce7eb98dc7a35dac7f3824a856e5e1e23587bf Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 16 Dec 2025 16:33:53 +0000 Subject: [PATCH 64/65] Fix offliner definition --- offliner-definition.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/offliner-definition.json b/offliner-definition.json index 89bdd51..4000466 100644 --- a/offliner-definition.json +++ b/offliner-definition.json @@ -889,7 +889,7 @@ }, "custom_css": { "type": "blob", - "kind": "image", + "kind": "css", "required": false, "title": "Custom CSS", "description": "URL to a CSS file to inject into pages", From 81018f06fa15517917c4c6e52d0212ca669b35dc Mon Sep 17 00:00:00 2001 From: Aaryan Kumar Sinha Date: Sat, 13 Dec 2025 01:30:33 +0530 Subject: [PATCH 65/65] Added --overwrite flag to zimit --- CHANGELOG.md | 3 ++ offliner-definition.json | 6 +++ src/zimit/zimit.py | 6 ++- tests/conftest.py | 14 ++++++ tests/data/example-response.warc | Bin 0 -> 2272 bytes tests/test_overwrite.py | 83 +++++++++++++++++++++++++++++++ 6 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 tests/conftest.py create mode 100644 tests/data/example-response.warc create mode 100644 tests/test_overwrite.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 58fb40a..2a99b30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Added `--overwrite` flag to overwrite existing ZIM file if it exists (#399) + ### Changed - Fix issues preventing interrupted crawls from being resumed. (#499) - Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist. 
diff --git a/offliner-definition.json b/offliner-definition.json index 4000466..4bb68b5 100644 --- a/offliner-definition.json +++ b/offliner-definition.json @@ -970,6 +970,12 @@ "alias": "name", "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$", "relaxedPattern": "^[A-Za-z0-9._-]+$" + }, + "overwrite": { + "type": "boolean", + "required": false, + "title": "Overwrite", + "description": "Whether to overwrite existing ZIM file if it exists" } } } diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index e982cbd..b205007 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -849,6 +849,9 @@ def run(raw_args): warc2zim_args.append("--lang") warc2zim_args.append(known_args.zim_lang) + if known_args.overwrite: + warc2zim_args.append("--overwrite") + logger.info("----------") logger.info("Testing warc2zim args") logger.info("Running: warc2zim " + " ".join(warc2zim_args)) @@ -1036,7 +1039,6 @@ def run(raw_args): warc_files.append(Path(extract_path)) else: - logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") crawl = subprocess.run(crawler_args, check=False) if ( @@ -1091,7 +1093,7 @@ def run(raw_args): logger.info("----------") logger.info( f"Processing WARC files in/at " - f'{" ".join(str(warc_file) for warc_file in warc_files)}' + f"{' '.join(str(warc_file) for warc_file in warc_files)}" ) warc2zim_args.extend(str(warc_file) for warc_file in warc_files) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..d51650d --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,14 @@ +import pytest + +from zimit import zimit as app + +""" + cleanup disabled because atexit hooks run at the very end of the Python process + shutdown. By the time cleanup() is called, the logging module has already closed its + file streams. 
+""" + + +@pytest.fixture(autouse=True) +def disable_zimit_cleanup(monkeypatch): + monkeypatch.setattr(app, "cleanup", lambda: None) diff --git a/tests/data/example-response.warc b/tests/data/example-response.warc new file mode 100644 index 0000000000000000000000000000000000000000..143b947d121e61b479cf9cae596b3853a9a60633 GIT binary patch literal 2272 zcmb`I`9Bkk1IJ13mB}8VF(i@kY%^vP#gc21YjZ|(tKqG9zOV21Klpw>KYxDuyxy<(2SO1MX-n+EA32QhCSDKm^z*;tt%-oa zbzumE4h)IVMxfD1Jx$oZqp70_gTvu^a6R;OZ!|(H;-5YrE|U`=pae?&C9rW^Ya*p1 zbjT~mR!ua~n4kol@A;&6KjUFCyxAaDN7gUAWNv-;u#?O2K?`&&1yHTSX2yZ~do0BF z1v%l+=Di-I;tyLt)05F<6`6pGKI@`J(9hMq1{(0}2gk8{zb>Pdn ztf2h@%$|g4W_!Wub6PgVBlKrY&?+_I1R=Rwg_rSHm)Ep2@w(m?{$*hGDov_TiqXL} z0tcpoDm71%@65h^YyJA;--K40^~0$xbQB6a&<_M!xeX7W*;F6p>U2!VxOy1^y$a5Q z-pjO~%c9tE?}km)g1?1MA5sn`*w z$lKUUV=a7qe20eud{_YdRm1V!X&#}%_2<; zXY$Dd(Rrsz$A=p4{*@O+`xQl->mGC&Kt>|U@w4i`4>?aKf|ZRw&fGDFxi((>swq|x zEJ-enG&wo9g*lq8WCo_E$T*v2AT1<;SSM-eXF!jHexp|6`4&aoOOY-yjl?Oy14%U_ z7x;tIX6YVO19|aFr*itiluYb0eR&EgH!r|eIE(wiWj}%a%kT15Qt`cPy}M_!gXME^ z-*5=?N(cWyD!?to@xoeU!EuGI<4jAb=T8aHF|S$QzZ%#@x?aXqtJfMaNU0)v-@=d4 zVhUJS?IT!ujK6MK%=`r3gMMQ{Q`-za=tu1&>2F7R1@bzSQU+6QK5he86a*^%nVmee zZlYH*D|*?IPYec9Bf?!h)a3;k$h*MgqrHm>&Sm&Ams+1QAlFv_EKas}NY1s%CFspf zFi|5sG6(Z*1$gY>LEu?3NB|;sGsrO61_RhkBY$gXypsfgb@9Hoh>$adRqN~4Qg zl_)0UBp=DsOvUiy`T2mO49{tc%+KMV`7?C|dMl|0gd$8u(Qm&XJ+++rn_k~PGgMojC`zD!i@ z6PGKwA7;NuQ2=@C$oRj}X)uO2txn)@yLoe1YAGD@@j;}z) zb<{ILaFg#+BRTs7{n=Ng7}lyrUF>w~f#n2oN$klv((};%fk97HV_I}_!G=+MwVqac zXinZrw95%0>+%D}#`Mwxwx#h`z5a+FBG%+0A{ff1h zKR>m03#_$;o-QX4m(;D+HryntdXKkob^StIv-Z|`I^t2mx5lfc79i2WC#rOFYs{dI zg|<)8ih=r}UOoqvsUxlv)w0U;xHvmDIzI6k{LKK0bd4?1y_;Q9i%t|4)6CQtB6gU?})Q-i!JMs0gDfL|e|fCnnTq)tcbTzjc4<%Y+EE z#ni*By(?Y{UA8f=x(Iv?)b53Q5t8Yr?andDyRP*7EgFyQH42e zuacpvQqzv4Y={{0&aenFiM`WqwXlHV`sOAP(2mztg{DvB`LyC!M@VAyM7 zS14cud+eaoBLn0xaRwvtMs`zsRAdHj3)HX^$@~-ZAOq9ancd24=pmnCrrvma*d4A#_O6AbcZTEp5`+$D98#-XBzbn{k>%2>;U*#I@X_s6KHew>eu=87 RH3|9e>Fz|Z3l0B4?{AQ;Bs>5B literal 0 HcmV?d00001 diff --git a/tests/test_overwrite.py b/tests/test_overwrite.py new file mode 100644 index 0000000..e41baca --- /dev/null +++ b/tests/test_overwrite.py @@ -0,0 +1,83 @@ +import pathlib + +import pytest + +from zimit.zimit import run + +TEST_DATA_DIR = pathlib.Path(__file__).parent / "data" + + +def test_overwrite_flag_behaviour(tmp_path): + zim_output = "overwrite-test.zim" + output_path = tmp_path / zim_output + + # 1st run → creates file + result = run( + [ + "--seeds", + "https://example.com", + "--warcs", + str(TEST_DATA_DIR / "example-response.warc"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + ] + ) + assert result in (None, 100) + assert output_path.exists() + + # 2nd run, no overwrite → should fail + with pytest.raises(SystemExit) as exc: + run( + [ + "--seeds", + "https://example.com", + "--warcs", + str(TEST_DATA_DIR / "example-response.warc"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + ] + ) + assert exc.value.code == 2 + + # 2nd run, no overwrite → should fail + with pytest.raises(SystemExit) as exc: + run( + [ + "--seeds", + "https://example.com", + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + ] + ) + assert exc.value.code == 2 + + # 3rd run, with overwrite → should succeed + result = run( + [ + "--seeds", + "https://example.com", + "--warcs", + str(TEST_DATA_DIR / "example-response.warc"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + 
"--overwrite", + ] + ) + assert result in (None, 100) + assert output_path.exists()