Merge pull request #528 from aaryansinhaa/feature/overwrite

Added --overwrite flag to zimit
2025-12-31 12:33:15 +00:00 · 2025-12-22 11:56:16 +01:00 · 2025-12-22 16:08:02 +05:30 · 2025-12-16 16:33:53 +00:00 · 2025-12-16 17:32:44 +01:00 · 2025-12-15 14:25:24 +01:00
11 changed files with 1158 additions and 12 deletions
--- a/.github/workflows/update-zim-offliner-definition.yaml
+++ b/.github/workflows/update-zim-offliner-definition.yaml
@ -0,0 +1,45 @@
+name: Update ZIMFarm Definitions
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - "offliner-definition.json"
+  release:
+    types: [published]
+
+  workflow_dispatch:
+    inputs:
+      version:
+        description: "Version to publish"
+        required: false
+        default: "dev"
+
+jobs:
+  prepare-json:
+    runs-on: ubuntu-24.04
+    outputs:
+      offliner_definition_b64: ${{ steps.read-json.outputs.offliner_definition_b64 }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - id: read-json  # compacts offliner-definition.json with jq and exposes it base64-encoded as a step output
+        run: |
+          if [ ! -f "offliner-definition.json" ]; then
+            echo "File not found!" >&2
+            exit 1
+          fi
+          json=$(jq -c . offliner-definition.json); json_b64=$(base64 -w0 <<< "$json")  # separate assignments: a jq parse error now aborts the step under `bash -e`
+          echo "offliner_definition_b64=$json_b64" >> "$GITHUB_OUTPUT"
+  call-workflow:
+    needs: prepare-json
+    uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main
+    with:
+      version: ${{ github.event_name == 'release' && github.event.release.tag_name || (github.event.inputs.version || 'dev') }}
+      offliner: zimit
+      offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }}
+    secrets:
+      zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }}
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,6 +5,23 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).

+## [Unreleased]
+
+### Added
+- Added `--overwrite` flag to overwrite existing ZIM file if it exists (#399)
+
+### Changed
+- Fix issues preventing interrupted crawls from being resumed. (#499)
+  - Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist.
+  - Use all warc_dirs found instead of just the latest so interrupted crawls use all collected pages across runs when an explicit collections directory is not passed.
+  - Don't clean up an explicitly passed build directory.
+
+## [3.0.5] - 2024-04-11
+
+### Changed
+
+- Upgrade to browsertrix crawler 1.6.0 (#493)
+
 ## [3.0.4] - 2024-04-04

 ### Changed
--- a/2
+++ b/2
@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.5.9
+FROM webrecorder/browsertrix-crawler:1.6.0
 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit

 # add deadsnakes ppa for latest Python on Ubuntu
--- a/README.md
+++ b/README.md
@ -1,7 +1,7 @@
 Zimit
 =====

-Zimit is a scraper allowing to create ZIM file from any Web site.
+Zimit is a scraper that creates a [ZIM file](https://en.wikipedia.org/wiki/ZIM_(file_format)) from any website.

 [![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit)
 [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
--- a/offliner-definition.json
+++ b/offliner-definition.json
@ -0,0 +1,981 @@
+{
+  "offliner_id": "zimit",
+  "stdOutput": true,
+  "stdStats": "zimit-progress-file",
+  "flags": {
+    "seeds": {
+      "type": "string",
+      "required": false,
+      "title": "Seeds",
+      "description": "The seed URL(s) to start crawling from. Multiple seed URLs must be separated by a comma (usually not needed, these are just the crawl seeds). First seed URL is used as ZIM homepage"
+    },
+    "seed_file": {
+      "type": "string",
+      "required": false,
+      "title": "Seed File",
+      "description": "If set, read a list of seed urls, one per line. HTTPS URL to an online file."
+    },
+    "lang": {
+      "type": "string",
+      "required": false,
+      "title": "Browser Language",
+      "description": "If set, sets the language used by the browser, should be ISO 639 language[-country] code, e.g. `en` or `en-GB`"
+    },
+    "title": {
+      "type": "string",
+      "required": false,
+      "title": "Title",
+      "description": "Custom title for your ZIM. Defaults to title of main page",
+      "minLength": 1,
+      "maxLength": 30
+    },
+    "description": {
+      "type": "string",
+      "required": false,
+      "title": "Description",
+      "description": "Description for ZIM",
+      "minLength": 1,
+      "maxLength": 80
+    },
+    "favicon": {
+      "type": "blob",
+      "kind": "image",
+      "required": false,
+      "title": "Illustration",
+      "description": "URL for Illustration. "
+    },
+    "tags": {
+      "type": "string",
+      "required": false,
+      "title": "ZIM Tags",
+      "description": "Single string with individual tags separated by a semicolon."
+    },
+    "creator": {
+      "type": "string",
+      "required": false,
+      "title": "Creator",
+      "description": "Name of content creator"
+    },
+    "publisher": {
+      "type": "string",
+      "required": false,
+      "title": "Publisher",
+      "isPublisher": true,
+      "description": "Custom publisher name (ZIM metadata). openZIM otherwise"
+    },
+    "source": {
+      "type": "string",
+      "required": false,
+      "title": "Source",
+      "description": "Source name/URL of content"
+    },
+    "workers": {
+      "type": "integer",
+      "required": false,
+      "title": "Workers",
+      "description": "The number of workers to run in parallel. Defaults to 1",
+      "min": 1
+    },
+    "wait_until": {
+      "type": "string",
+      "required": false,
+      "title": "WaitUntil",
+      "description": "Puppeteer page.goto() condition to wait for before continuing. One of load, domcontentloaded, networkidle0 or networkidle2, or a comma-separated combination of those. Default is load,networkidle2"
+    },
+    "extra_hops": {
+      "type": "integer",
+      "required": false,
+      "title": "Extra Hops",
+      "description": "Number of extra 'hops' to follow, beyond the current scope. Default is 0",
+      "min": 0
+    },
+    "page_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Page Limit",
+      "description": "Limit crawl to this number of pages. Default is 0 (no-limit).",
+      "min": 0
+    },
+    "max_page_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Max Page Limit",
+      "description": "Maximum pages to crawl, overriding pageLimit if both are set. Default is 0 (no-limit)",
+      "min": 0
+    },
+    "page_load_timeout": {
+      "type": "integer",
+      "required": false,
+      "title": "Page Load Timeout",
+      "description": "Timeout for each page to load (in seconds). Default is 90",
+      "min": 0
+    },
+    "scope_type": {
+      "type": "string-enum",
+      "required": false,
+      "title": "Scope Type",
+      "description": "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom if scopeIncludeRx is set, prefix otherwise.",
+      "choices": [
+        {
+          "title": "Page",
+          "value": "page"
+        },
+        {
+          "title": "Page SPA",
+          "value": "page-spa"
+        },
+        {
+          "title": "Prefix",
+          "value": "prefix"
+        },
+        {
+          "title": "Host",
+          "value": "host"
+        },
+        {
+          "title": "Domain",
+          "value": "domain"
+        },
+        {
+          "title": "Any",
+          "value": "any"
+        },
+        {
+          "title": "Custom",
+          "value": "custom"
+        }
+      ]
+    },
+    "scope_include_rx": {
+      "type": "string",
+      "required": false,
+      "title": "Scope Include Regex",
+      "description": "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of seed)"
+    },
+    "scope_exclude_rx": {
+      "type": "string",
+      "required": false,
+      "title": "Scope Exclude Regex",
+      "description": "Regex of page URLs that should be excluded from the crawl"
+    },
+    "allow_hash_urls": {
+      "type": "boolean",
+      "required": false,
+      "title": "Allow Hashtag URLs",
+      "description": "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content"
+    },
+    "mobile_device": {
+      "type": "string-enum",
+      "required": false,
+      "title": "As device",
+      "description": "Device to crawl as. See Puppeteer's Device.ts for a list",
+      "choices": [
+        {
+          "title": "Blackberry Playbook",
+          "value": "Blackberry PlayBook"
+        },
+        {
+          "title": "Blackberry Playbook Landscape",
+          "value": "Blackberry PlayBook landscape"
+        },
+        {
+          "title": "Blackberry Z30",
+          "value": "BlackBerry Z30"
+        },
+        {
+          "title": "Blackberry Z30 Landscape",
+          "value": "BlackBerry Z30 landscape"
+        },
+        {
+          "title": "Galaxy Note 3",
+          "value": "Galaxy Note 3"
+        },
+        {
+          "title": "Galaxy Note 3 Landscape",
+          "value": "Galaxy Note 3 landscape"
+        },
+        {
+          "title": "Galaxy Note II",
+          "value": "Galaxy Note II"
+        },
+        {
+          "title": "Galaxy Note II Landscape",
+          "value": "Galaxy Note II landscape"
+        },
+        {
+          "title": "Galaxy S III",
+          "value": "Galaxy S III"
+        },
+        {
+          "title": "Galaxy S III Landscape",
+          "value": "Galaxy S III landscape"
+        },
+        {
+          "title": "Galaxy S5",
+          "value": "Galaxy S5"
+        },
+        {
+          "title": "Galaxy S5 Landscape",
+          "value": "Galaxy S5 landscape"
+        },
+        {
+          "title": "Galaxy S8",
+          "value": "Galaxy S8"
+        },
+        {
+          "title": "Galaxy S8 Landscape",
+          "value": "Galaxy S8 landscape"
+        },
+        {
+          "title": "Galaxy S9 Plus",
+          "value": "Galaxy S9+"
+        },
+        {
+          "title": "Galaxy S9 Plus Landscape",
+          "value": "Galaxy S9+ landscape"
+        },
+        {
+          "title": "Galaxy Tab S4",
+          "value": "Galaxy Tab S4"
+        },
+        {
+          "title": "Galaxy Tab S4 Landscape",
+          "value": "Galaxy Tab S4 landscape"
+        },
+        {
+          "title": "iPad",
+          "value": "iPad"
+        },
+        {
+          "title": "iPad Landscape",
+          "value": "iPad landscape"
+        },
+        {
+          "title": "iPad Gen 6",
+          "value": "iPad (gen 6)"
+        },
+        {
+          "title": "iPad Gen 6 Landscape",
+          "value": "iPad (gen 6) landscape"
+        },
+        {
+          "title": "iPad Gen 7",
+          "value": "iPad (gen 7)"
+        },
+        {
+          "title": "iPad Gen 7 Landscape",
+          "value": "iPad (gen 7) landscape"
+        },
+        {
+          "title": "iPad Mini",
+          "value": "iPad Mini"
+        },
+        {
+          "title": "iPad Mini Landscape",
+          "value": "iPad Mini landscape"
+        },
+        {
+          "title": "iPad Pro",
+          "value": "iPad Pro"
+        },
+        {
+          "title": "iPad Pro Landscape",
+          "value": "iPad Pro landscape"
+        },
+        {
+          "title": "iPad Pro 11",
+          "value": "iPad Pro 11"
+        },
+        {
+          "title": "iPad Pro 11 Landscape",
+          "value": "iPad Pro 11 landscape"
+        },
+        {
+          "title": "iPhone 4",
+          "value": "iPhone 4"
+        },
+        {
+          "title": "iPhone 4 Landscape",
+          "value": "iPhone 4 landscape"
+        },
+        {
+          "title": "iPhone 5",
+          "value": "iPhone 5"
+        },
+        {
+          "title": "iPhone 5 Landscape",
+          "value": "iPhone 5 landscape"
+        },
+        {
+          "title": "iPhone 6",
+          "value": "iPhone 6"
+        },
+        {
+          "title": "iPhone 6 Landscape",
+          "value": "iPhone 6 landscape"
+        },
+        {
+          "title": "iPhone 6 Plus",
+          "value": "iPhone 6 Plus"
+        },
+        {
+          "title": "iPhone 6 Plus Landscape",
+          "value": "iPhone 6 Plus landscape"
+        },
+        {
+          "title": "iPhone 7",
+          "value": "iPhone 7"
+        },
+        {
+          "title": "iPhone 7 Landscape",
+          "value": "iPhone 7 landscape"
+        },
+        {
+          "title": "iPhone 7 Plus",
+          "value": "iPhone 7 Plus"
+        },
+        {
+          "title": "iPhone 7 Plus Landscape",
+          "value": "iPhone 7 Plus landscape"
+        },
+        {
+          "title": "iPhone 8",
+          "value": "iPhone 8"
+        },
+        {
+          "title": "iPhone 8 Landscape",
+          "value": "iPhone 8 landscape"
+        },
+        {
+          "title": "iPhone 8 Plus",
+          "value": "iPhone 8 Plus"
+        },
+        {
+          "title": "iPhone 8 Plus Landscape",
+          "value": "iPhone 8 Plus landscape"
+        },
+        {
+          "title": "iPhone SE",
+          "value": "iPhone SE"
+        },
+        {
+          "title": "iPhone SE Landscape",
+          "value": "iPhone SE landscape"
+        },
+        {
+          "title": "iPhone X",
+          "value": "iPhone X"
+        },
+        {
+          "title": "iPhone X Landscape",
+          "value": "iPhone X landscape"
+        },
+        {
+          "title": "iPhone XR",
+          "value": "iPhone XR"
+        },
+        {
+          "title": "iPhone XR Landscape",
+          "value": "iPhone XR landscape"
+        },
+        {
+          "title": "iPhone 11",
+          "value": "iPhone 11"
+        },
+        {
+          "title": "iPhone 11 Landscape",
+          "value": "iPhone 11 landscape"
+        },
+        {
+          "title": "iPhone 11 Pro",
+          "value": "iPhone 11 Pro"
+        },
+        {
+          "title": "iPhone 11 Pro Landscape",
+          "value": "iPhone 11 Pro landscape"
+        },
+        {
+          "title": "iPhone 11 Pro Max",
+          "value": "iPhone 11 Pro Max"
+        },
+        {
+          "title": "iPhone 11 Pro Max Landscape",
+          "value": "iPhone 11 Pro Max landscape"
+        },
+        {
+          "title": "iPhone 12",
+          "value": "iPhone 12"
+        },
+        {
+          "title": "iPhone 12 Landscape",
+          "value": "iPhone 12 landscape"
+        },
+        {
+          "title": "iPhone 12 Pro",
+          "value": "iPhone 12 Pro"
+        },
+        {
+          "title": "iPhone 12 Pro Landscape",
+          "value": "iPhone 12 Pro landscape"
+        },
+        {
+          "title": "iPhone 12 Pro Max",
+          "value": "iPhone 12 Pro Max"
+        },
+        {
+          "title": "iPhone 12 Pro Max Landscape",
+          "value": "iPhone 12 Pro Max landscape"
+        },
+        {
+          "title": "iPhone 12 Mini",
+          "value": "iPhone 12 Mini"
+        },
+        {
+          "title": "iPhone 12 Mini Landscape",
+          "value": "iPhone 12 Mini landscape"
+        },
+        {
+          "title": "iPhone 13",
+          "value": "iPhone 13"
+        },
+        {
+          "title": "iPhone 13 Landscape",
+          "value": "iPhone 13 landscape"
+        },
+        {
+          "title": "iPhone 13 Pro",
+          "value": "iPhone 13 Pro"
+        },
+        {
+          "title": "iPhone 13 Pro Landscape",
+          "value": "iPhone 13 Pro landscape"
+        },
+        {
+          "title": "iPhone 13 Pro Max",
+          "value": "iPhone 13 Pro Max"
+        },
+        {
+          "title": "iPhone 13 Pro Max Landscape",
+          "value": "iPhone 13 Pro Max landscape"
+        },
+        {
+          "title": "iPhone 13 Mini",
+          "value": "iPhone 13 Mini"
+        },
+        {
+          "title": "iPhone 13 Mini Landscape",
+          "value": "iPhone 13 Mini landscape"
+        },
+        {
+          "title": "Jio Phone 2",
+          "value": "JioPhone 2"
+        },
+        {
+          "title": "Jio Phone 2 Landscape",
+          "value": "JioPhone 2 landscape"
+        },
+        {
+          "title": "Kindle Fire HDX",
+          "value": "Kindle Fire HDX"
+        },
+        {
+          "title": "Kindle Fire HDX Landscape",
+          "value": "Kindle Fire HDX landscape"
+        },
+        {
+          "title": "LG Optimus L70",
+          "value": "LG Optimus L70"
+        },
+        {
+          "title": "LG Optimus L70 Landscape",
+          "value": "LG Optimus L70 landscape"
+        },
+        {
+          "title": "Microsoft Lumia 550",
+          "value": "Microsoft Lumia 550"
+        },
+        {
+          "title": "Microsoft Lumia 950",
+          "value": "Microsoft Lumia 950"
+        },
+        {
+          "title": "Microsoft Lumia 950 Landscape",
+          "value": "Microsoft Lumia 950 landscape"
+        },
+        {
+          "title": "Nexus 10",
+          "value": "Nexus 10"
+        },
+        {
+          "title": "Nexus 10 Landscape",
+          "value": "Nexus 10 landscape"
+        },
+        {
+          "title": "Nexus 4",
+          "value": "Nexus 4"
+        },
+        {
+          "title": "Nexus 4 Landscape",
+          "value": "Nexus 4 landscape"
+        },
+        {
+          "title": "Nexus 5",
+          "value": "Nexus 5"
+        },
+        {
+          "title": "Nexus 5 Landscape",
+          "value": "Nexus 5 landscape"
+        },
+        {
+          "title": "Nexus 5X",
+          "value": "Nexus 5X"
+        },
+        {
+          "title": "Nexus 5X Landscape",
+          "value": "Nexus 5X landscape"
+        },
+        {
+          "title": "Nexus 6",
+          "value": "Nexus 6"
+        },
+        {
+          "title": "Nexus 6 Landscape",
+          "value": "Nexus 6 landscape"
+        },
+        {
+          "title": "Nexus 6P",
+          "value": "Nexus 6P"
+        },
+        {
+          "title": "Nexus 6P Landscape",
+          "value": "Nexus 6P landscape"
+        },
+        {
+          "title": "Nexus 7",
+          "value": "Nexus 7"
+        },
+        {
+          "title": "Nexus 7 Landscape",
+          "value": "Nexus 7 landscape"
+        },
+        {
+          "title": "Nokia Lumia 520",
+          "value": "Nokia Lumia 520"
+        },
+        {
+          "title": "Nokia Lumia 520 Landscape",
+          "value": "Nokia Lumia 520 landscape"
+        },
+        {
+          "title": "Nokia N9",
+          "value": "Nokia N9"
+        },
+        {
+          "title": "Nokia N9 Landscape",
+          "value": "Nokia N9 landscape"
+        },
+        {
+          "title": "Pixel 2",
+          "value": "Pixel 2"
+        },
+        {
+          "title": "Pixel 2 Landscape",
+          "value": "Pixel 2 landscape"
+        },
+        {
+          "title": "Pixel 2 XL",
+          "value": "Pixel 2 XL"
+        },
+        {
+          "title": "Pixel 2 XL Landscape",
+          "value": "Pixel 2 XL landscape"
+        },
+        {
+          "title": "Pixel 3",
+          "value": "Pixel 3"
+        },
+        {
+          "title": "Pixel 3 Landscape",
+          "value": "Pixel 3 landscape"
+        },
+        {
+          "title": "Pixel 4",
+          "value": "Pixel 4"
+        },
+        {
+          "title": "Pixel 4 Landscape",
+          "value": "Pixel 4 landscape"
+        },
+        {
+          "title": "Pixel 4A 5G",
+          "value": "Pixel 4a (5G)"
+        },
+        {
+          "title": "Pixel 4A 5G Landscape",
+          "value": "Pixel 4a (5G) landscape"
+        },
+        {
+          "title": "Pixel 5",
+          "value": "Pixel 5"
+        },
+        {
+          "title": "Pixel 5 Landscape",
+          "value": "Pixel 5 landscape"
+        },
+        {
+          "title": "Moto G4",
+          "value": "Moto G4"
+        },
+        {
+          "title": "Moto G4 Landscape",
+          "value": "Moto G4 landscape"
+        }
+      ]
+    },
+    "select_links": {
+      "type": "string",
+      "required": false,
+      "title": "Select Links",
+      "description": "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]"
+    },
+    "click_selector": {
+      "type": "string",
+      "required": false,
+      "title": "Click Selector",
+      "description": "Selector for elements to click when using the autoclick behavior. Default is 'a'"
+    },
+    "block_rules": {
+      "type": "string",
+      "required": false,
+      "title": "Block Rules",
+      "description": "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe"
+    },
+    "block_message": {
+      "type": "string",
+      "required": false,
+      "title": "Block Message",
+      "description": "If specified, when a URL is blocked, a record with this error message is added instead"
+    },
+    "block_ads": {
+      "type": "boolean",
+      "required": false,
+      "title": "Block Ads",
+      "description": "If set, block advertisements from being loaded (based on Steven Black's blocklist). Note that some bad domains are also blocked by zimit configuration even if this option is not set."
+    },
+    "ad_block_message": {
+      "type": "string",
+      "required": false,
+      "title": "Ads Block Message",
+      "description": "If specified, when an ad is blocked, a record with this error message is added instead"
+    },
+    "user_agent": {
+      "type": "string",
+      "required": false,
+      "title": "User Agent",
+      "description": "Override user-agent with specified"
+    },
+    "user_agent_suffix": {
+      "type": "string",
+      "required": false,
+      "title": "User Agent Suffix",
+      "description": "Append suffix to existing browser user-agent. Defaults to +Zimit"
+    },
+    "use_sitemap": {
+      "type": "string",
+      "required": false,
+      "title": "Sitemap URL",
+      "description": "Use as sitemap to get additional URLs for the crawl (usually at /sitemap.xml)"
+    },
+    "sitemap_from_date": {
+      "type": "string",
+      "required": false,
+      "title": "Sitemap From Date",
+      "description": "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)"
+    },
+    "sitemap_to_date": {
+      "type": "string",
+      "required": false,
+      "title": "Sitemap To Date",
+      "description": "If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)"
+    },
+    "behavior_timeout": {
+      "type": "integer",
+      "required": false,
+      "title": "Behavior Timeout",
+      "description": "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish. Default is 90.",
+      "min": 0
+    },
+    "post_load_delay": {
+      "type": "integer",
+      "required": false,
+      "title": "Post Load Delay",
+      "description": "If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors. Default is 0.",
+      "min": 0
+    },
+    "page_extra_delay": {
+      "type": "integer",
+      "required": false,
+      "title": "Page Extra Delay",
+      "description": "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page. Default is 0.",
+      "min": 0
+    },
+    "dedup_policy": {
+      "type": "string-enum",
+      "required": false,
+      "title": "Dedup Policy",
+      "description": "Deduplication policy. One of skip, revisit or keep. Default is skip",
+      "choices": [
+        {
+          "title": "Skip",
+          "value": "skip"
+        },
+        {
+          "title": "Revisit",
+          "value": "revisit"
+        },
+        {
+          "title": "Keep",
+          "value": "keep"
+        }
+      ]
+    },
+    "screenshot": {
+      "type": "string",
+      "required": false,
+      "title": "Screenshot",
+      "description": "Screenshot options for crawler. One of view, thumbnail, fullPage, fullPageFinal or a comma-separated combination of those."
+    },
+    "size_soft_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Size Soft Limit",
+      "description": "If set, save crawl state and stop crawl if WARC size exceeds this value. ZIM will still be created.",
+      "min": 0
+    },
+    "size_hard_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Size Hard Limit",
+      "description": "If set, exit crawler and fail the scraper immediately if WARC size exceeds this value",
+      "min": 0
+    },
+    "disk_utilization": {
+      "type": "integer",
+      "required": false,
+      "title": "Disk Utilization",
+      "description": "Save state and exit if disk utilization exceeds this percentage value. Default (if not set) is 90%. Set to 0 to disable disk utilization check.",
+      "min": 0
+    },
+    "time_soft_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Time Soft Limit",
+      "description": "If set, save crawl state and stop crawl if WARC(s) creation takes longer than this value, in seconds. ZIM will still be created.",
+      "min": 0
+    },
+    "time_hard_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Time Hard Limit",
+      "description": "If set, exit crawler and fail the scraper immediately if WARC(s) creation takes longer than this value, in seconds",
+      "min": 0
+    },
+    "net_idle_wait": {
+      "type": "integer",
+      "required": false,
+      "title": "Net Idle Wait",
+      "description": "If set, wait for network idle after page load and after behaviors are done (in seconds). If -1 (default), determine based on scope."
+    },
+    "origin_override": {
+      "type": "string",
+      "required": false,
+      "title": "Origin Override",
+      "description": "If set, will redirect requests from each origin in key to origin in the value, eg. https://host:port=http://alt-host:alt-port."
+    },
+    "max_page_retries": {
+      "type": "integer",
+      "required": false,
+      "title": "Max Page Retries",
+      "description": "If set, number of times to retry a page that failed to load before page is considered to have failed. Default is 2.",
+      "min": 0
+    },
+    "fail_on_failed_seed": {
+      "type": "boolean",
+      "required": false,
+      "title": "Fail on failed seed",
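+      "description": "If set, crawler will fail with exit code 1 if any seed fails. When combined with --failOnInvalidStatus, will result in crawl failing with exit code 1 if any seed has a 4xx/5xx response"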
+    },
+    "fail_on_invalid_status": {
+      "type": "boolean",
+      "required": false,
+      "title": "Fail on invalid status",
+      "description": "If set, will treat pages with 4xx or 5xx response as failures. When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses"
+    },
+    "fail_on_failed_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Fail on failed - Limit",
+      "description": "If set, save state and exit if number of failed pages exceeds this value.",
+      "min": 0
+    },
+    "warcs": {
+      "type": "string",
+      "required": false,
+      "title": "WARC files",
+      "description": "Comma-separated list of WARC files to use as input."
+    },
+    "verbose": {
+      "type": "boolean",
+      "required": false,
+      "title": "Verbose mode",
+      "description": "Whether to display additional logs"
+    },
+    "keep": {
+      "type": "boolean",
+      "required": false,
+      "title": "Keep",
+      "description": "Should be True. Developer option: must be True if we want to keep the WARC files for artifacts archiving.",
+      "default": true
+    },
+    "output": {
+      "type": "string",
+      "required": false,
+      "title": "Output folder",
+      "description": "Output folder for ZIM file(s). Leave it as `/output`",
+      "pattern": "^/output$"
+    },
+    "admin_email": {
+      "type": "email",
+      "required": false,
+      "title": "Admin Email",
+      "description": "Admin Email for crawler: used in UserAgent so website admin can contact us",
+      "default": "contact+zimfarm@kiwix.org"
+    },
+    "profile": {
+      "type": "string",
+      "required": false,
+      "title": "Browser profile",
+      "description": "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory for Browsertrix crawler."
+    },
+    "behaviors": {
+      "type": "string",
+      "required": false,
+      "title": "Behaviors",
+      "description": "Which background behaviors to enable on each page. Defaults to autoplay,autofetch,siteSpecific."
+    },
+    "depth": {
+      "type": "integer",
+      "required": false,
+      "title": "Depth",
+      "description": "The depth of the crawl for all seeds. Default is -1 (infinite).",
+      "min": -1
+    },
+    "zim_lang": {
+      "type": "string",
+      "required": false,
+      "title": "ZIM Language",
+      "description": "Language metadata of ZIM (warc2zim --lang param). ISO-639-3 code. Retrieved from homepage if found, fallback to `eng`",
+      "alias": "zim-lang",
+      "customValidator": "language_code"
+    },
+    "long_description": {
+      "type": "string",
+      "required": false,
+      "title": "Long description",
+      "description": "Optional long description for your ZIM",
+      "minLength": 1,
+      "maxLength": 4000,
+      "alias": "long-description"
+    },
+    "custom_css": {
+      "type": "blob",
+      "kind": "css",
+      "required": false,
+      "title": "Custom CSS",
+      "description": "URL to a CSS file to inject into pages",
+      "alias": "custom-css"
+    },
+    "charsets_to_try": {
+      "type": "string",
+      "required": false,
+      "title": "Charsets to try",
+      "description": "List of charsets to try to decode content when charset is not found",
+      "alias": "charsets-to-try"
+    },
+    "ignore_content_header_charsets": {
+      "type": "boolean",
+      "required": false,
+      "title": "Ignore Content Header Charsets",
+      "description": "Ignore the charsets specified in content headers - first bytes - typically because they are wrong.",
+      "alias": "ignore-content-header-charsets"
+    },
+    "content_header_bytes_length": {
+      "type": "integer",
+      "required": false,
+      "title": "Content Header Bytes Length",
+      "description": "How many bytes to consider when searching for content charsets in header (default is 1024).",
+      "alias": "content-header-bytes-length",
+      "min": 0
+    },
+    "ignore_http_header_charsets": {
+      "type": "boolean",
+      "required": false,
+      "title": "Ignore HTTP Header Charsets",
+      "description": "Ignore the charsets specified in HTTP `Content-Type` headers, typically because they are wrong.",
+      "alias": "ignore-http-header-charsets"
+    },
+    "encoding_aliases": {
+      "type": "string",
+      "required": false,
+      "title": "Encoding Aliases",
+      "description": "List of encoding/charset aliases to decode WARC content. Aliases are used when the encoding specified in upstream server exists in Python under a different name. This parameter is single string, multiple values are separated by a comma, like in alias1=encoding1,alias2=encoding2.",
+      "alias": "encoding-aliases"
+    },
+    "custom_behaviors": {
+      "type": "string",
+      "required": false,
+      "title": "Custom Behaviors",
+      "description": "JS code for custom behaviors to customize crawler. Single string with individual JS files URL/path separated by a comma.",
+      "alias": "custom-behaviours"
+    },
+    "zimit_progress_file": {
+      "type": "string",
+      "required": false,
+      "title": "Zimit Progress File",
+      "description": "Scraping progress file. Leave it as `/output/task_progress.json`",
+      "alias": "zimit-progress-file",
+      "pattern": "^/output/task_progress\\.json$"
+    },
+    "replay_viewer_source": {
+      "type": "url",
+      "required": false,
+      "title": "Replay Viewer Source",
+      "description": "URL from which to load the ReplayWeb.page replay viewer",
+      "alias": "replay-viewer-source"
+    },
+    "zim_file": {
+      "type": "string",
+      "required": false,
+      "title": "ZIM filename",
+      "description": "ZIM file name (based on --name if not provided). Include {period} to insert date period dynamically",
+      "alias": "zim-file",
+      "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+_)([a-z0-9\\-\\.]+_|)([\\d]{4}-[\\d]{2}|\\{period\\}).zim$",
+      "relaxedPattern": "^[A-Za-z0-9._-]+$"
+    },
+    "name": {
+      "type": "string",
+      "required": true,
+      "title": "ZIM name",
+      "description": "Name of the ZIM.",
+      "alias": "name",
+      "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$",
+      "relaxedPattern": "^[A-Za-z0-9._-]+$"
+    },
+    "overwrite": {
+      "type": "boolean",
+      "required": false,
+      "title": "Overwrite",
+      "description": "Whether to overwrite existing ZIM file if it exists"
+    }
+  }
+}
--- a/pyproject.toml
+++ b/pyproject.toml
@ -11,7 +11,7 @@ dependencies = [
  "requests==2.32.3",
  "inotify==0.2.10",
  "tld==0.13",
-  "warc2zim==2.2.2",
+  "warc2zim @ git+https://github.com/openzim/warc2zim@main",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

--- a/src/zimit/about.py
+++ b/src/zimit/about.py
@ -1 +1 @@
-__version__ = "3.0.4"
+__version__ = "3.0.6-dev0"
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@ -796,11 +796,14 @@ def run(raw_args):
    if known_args.adminEmail:
        user_agent_suffix += f" {known_args.adminEmail}"

-    # make temp dir for this crawl
+    # set temp dir to use for this crawl
    global temp_root_dir  # noqa: PLW0603
    if known_args.build:
-        temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.build, prefix=".tmp"))
+        # use build dir argument if passed
+        temp_root_dir = Path(known_args.build)
+        temp_root_dir.mkdir(parents=True, exist_ok=True)
    else:
+        # make new randomized temp dir
        temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp"))

    seeds = []
@ -846,6 +849,9 @@ def run(raw_args):
        warc2zim_args.append("--lang")
        warc2zim_args.append(known_args.zim_lang)

+    if known_args.overwrite:
+        warc2zim_args.append("--overwrite")
+
    logger.info("----------")
    logger.info("Testing warc2zim args")
    logger.info("Running: warc2zim " + " ".join(warc2zim_args))
@ -854,7 +860,8 @@ def run(raw_args):
        logger.info("Exiting, invalid warc2zim params")
        return EXIT_CODE_WARC2ZIM_CHECK_FAILED

-    if not known_args.keep:
+    # only trigger cleanup when neither keep nor a custom build dir is passed.
+    if not known_args.build and not known_args.keep:
        atexit.register(cleanup)

    # copy / download custom behaviors to one single folder and configure crawler
@ -1032,7 +1039,6 @@ def run(raw_args):
            warc_files.append(Path(extract_path))

    else:
-
        logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
        crawl = subprocess.run(crawler_args, check=False)
        if (
@ -1076,18 +1082,18 @@ def run(raw_args):
                )
            elif len(warc_dirs) > 1:
                logger.info(
-                    "Found many WARC files directories, only most recently modified one"
-                    " will be used"
+                    "Found many WARC files directories, combining pages from all "
+                    "of them"
                )
                for directory in warc_dirs:
                    logger.info(f"- {directory}")
-            warc_files = [warc_dirs[-1]]
+            warc_files = warc_dirs

    logger.info("")
    logger.info("----------")
    logger.info(
        f"Processing WARC files in/at "
-        f'{" ".join(str(warc_file) for warc_file in warc_files)}'
+        f"{' '.join(str(warc_file) for warc_file in warc_files)}"
    )
    warc2zim_args.extend(str(warc_file) for warc_file in warc_files)

--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -0,0 +1,14 @@
+import pytest
+
+from zimit import zimit as app
+
+"""
+ cleanup disabled because atexit hooks run at the very end of the Python process
+ shutdown. By the time cleanup() is called, the logging module has already closed its
+ file streams.
+"""
+
+
+@pytest.fixture(autouse=True)
+def disable_zimit_cleanup(monkeypatch):
+    monkeypatch.setattr(app, "cleanup", lambda: None)
--- a/tests/data/example-response.warc
+++ b/tests/data/example-response.warc
--- a/tests/test_overwrite.py
+++ b/tests/test_overwrite.py
@ -0,0 +1,83 @@
+import pathlib
+
+import pytest
+
+from zimit.zimit import run
+
+TEST_DATA_DIR = pathlib.Path(__file__).parent / "data"
+
+
+def test_overwrite_flag_behaviour(tmp_path):
+    zim_output = "overwrite-test.zim"
+    output_path = tmp_path / zim_output
+
+    # 1st run → creates file
+    result = run(
+        [
+            "--seeds",
+            "https://example.com",
+            "--warcs",
+            str(TEST_DATA_DIR / "example-response.warc"),
+            "--output",
+            str(tmp_path),
+            "--zim-file",
+            zim_output,
+            "--name",
+            "overwrite-test",
+        ]
+    )
+    assert result in (None, 100)
+    assert output_path.exists()
+
+    # 2nd run, no overwrite → should fail
+    with pytest.raises(SystemExit) as exc:
+        run(
+            [
+                "--seeds",
+                "https://example.com",
+                "--warcs",
+                str(TEST_DATA_DIR / "example-response.warc"),
+                "--output",
+                str(tmp_path),
+                "--zim-file",
+                zim_output,
+                "--name",
+                "overwrite-test",
+            ]
+        )
+    assert exc.value.code == 2
+
+    # same without --warcs, still no overwrite → should fail
+    with pytest.raises(SystemExit) as exc:
+        run(
+            [
+                "--seeds",
+                "https://example.com",
+                "--output",
+                str(tmp_path),
+                "--zim-file",
+                zim_output,
+                "--name",
+                "overwrite-test",
+            ]
+        )
+    assert exc.value.code == 2
+
+    # 3rd run, with overwrite → should succeed
+    result = run(
+        [
+            "--seeds",
+            "https://example.com",
+            "--warcs",
+            str(TEST_DATA_DIR / "example-response.warc"),
+            "--output",
+            str(tmp_path),
+            "--zim-file",
+            zim_output,
+            "--name",
+            "overwrite-test",
+            "--overwrite",
+        ]
+    )
+    assert result in (None, 100)
+    assert output_path.exists()
Author	SHA1	Message	Date
benoit74	a7e236f0d7	Merge pull request #528 from aaryansinhaa/feature/overwrite Added --overwrite flag to zimit	2025-12-22 11:56:16 +01:00
Aaryan Kumar Sinha	81018f06fa	Added --overwrite flag to zimit	2025-12-22 16:08:02 +05:30
benoit74	34ce7eb98d	Fix offliner definition	2025-12-16 16:33:53 +00:00
benoit74	5bb068ffea	Merge pull request #529 from openzim/blob-types migrate custom_css and favicon flags to blob types	2025-12-16 17:32:44 +01:00
Uchechukwu Orji	aec19d95d2	migrate custom_css and favicon flags to blob types	2025-12-15 14:25:24 +01:00
benoit74	277473884e	Merge pull request #524 from Routhinator/issue-490-resume-crawl-from-interrupt Fixes: #499 - Resolve issues preventing graceful crawl resumption after interrupt	2025-11-08 10:55:07 +01:00
Chris Routh	e30a82a91c	PR #524 Fix line length.	2025-11-07 12:59:25 -08:00
Chris Routh	ef004f3863	Issue #499 Record changes in CHANGELOG	2025-11-07 11:33:01 -08:00
Chris Routh	6db73a0a83	Issue #499 - Ensure build directory exists when passed.	2025-11-06 13:44:26 -08:00
Chris Routh	57a88434e2	Issue #499 - Use all warc_directories found when no specific collection has been passed.	2025-11-06 13:44:18 -08:00
Chris Routh	4595d2a302	Issue #499 - Only register cleanup if neither build or keep arguments have been passed.	2025-11-06 13:44:10 -08:00
Chris Routh	611d2033f7	Issue #499 - Use build dir rather than random tmp dir when passed.	2025-11-06 13:43:52 -08:00
benoit74	00845293d6	Merge pull request #522 from vitaly-zdanevich/patch-3 README.md: add link to https://en.wikipedia.org/wiki/ZIM_(file_format)	2025-10-20 07:53:34 +02:00
Vitaly Zdanevich	44cf4218cb	README.md: add link to https://en.wikipedia.org/wiki/ZIM_(file_format) Signed-off-by: Vitaly Zdanevich <zdanevich.vitaly@ya.ru>	2025-10-20 01:22:31 +04:00
benoit74	6b520318a2	Merge pull request #521 from openzim/prepare-json-output set proper outputs name	2025-10-10 11:52:08 +02:00
Uchechukwu Orji	a9805c84c2	set proper outputs name	2025-10-10 10:38:38 +01:00
benoit74	8630b87a1f	Merge pull request #520 from openzim/offliner-definition-workflow-dispatch add workflow dispatch to update-offliner ci	2025-10-10 11:25:51 +02:00
Uchechukwu Orji	ad09665c4a	add workflow dispatch to update-offliner ci	2025-10-10 10:22:29 +01:00
benoit74	1d2069a66b	Merge pull request #519 from openzim/offliner-definitions use base64 string as argument to workflow call	2025-10-09 10:21:30 +02:00
Uchechukwu Orji	4ec47cd6dd	use base64 string as argument to workflow call	2025-10-08 04:25:12 +01:00
benoit74	b60dd388e7	Merge pull request #518 from openzim/offliner-definitions set up offliner definitions	2025-10-07 21:50:57 +02:00
Uchechukwu Orji	5624cbf081	set up offliner definitions	2025-10-07 04:08:14 +01:00
benoit74	8c471d9ee2	Prepare for 3.0.6	2025-04-11 07:46:42 +00:00
benoit74	009b8b4bd6	Release 3.0.5	2025-04-11 07:18:18 +00:00
benoit74	0c795b0051	Merge pull request #493 from orangetin/update-browsertrix Upgrade browsertrix-crawler to version 1.6.0 in Dockerfile	2025-04-11 09:14:34 +02:00
orangetin	b5d87198d8	update changelog	2025-04-10 17:54:34 -07:00
orangetin	511c3a5021	Upgrade browsertrix-crawler to version 1.6.0 in Dockerfile	2025-04-10 17:52:19 -07:00
benoit74	3421ca0212	Prepare for 3.0.5	2025-04-04 11:09:50 +00:00