From 8c471d9ee2269f22d63cfef383b4fba02241319b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 11 Apr 2025 07:46:42 +0000 Subject: [PATCH 01/15] Prepare for 3.0.6 --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- src/zimit/__about__.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc99b8f..5e06e20 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). +## [Unreleased] + ## [3.0.5] - 2024-04-11 ### Changed diff --git a/pyproject.toml b/pyproject.toml index 9aa830a..e4e7696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim==2.2.2", + "warc2zim @ git+https://github.com/openzim/warc2zim@main", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index e94f36f..281b1bb 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "3.0.5" +__version__ = "3.0.6-dev0" From 5624cbf08142b321996bfd85ac9c12f1e52d2dae Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Tue, 7 Oct 2025 04:08:14 +0100 Subject: [PATCH 02/15] set up offliner definitions --- .../update-zim-offliner-definition.yaml | 38 + offliner-definition.json | 973 ++++++++++++++++++ 2 files changed, 1011 insertions(+) create mode 100644 .github/workflows/update-zim-offliner-definition.yaml create mode 100644 offliner-definition.json diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml new file mode 100644 index 0000000..4662e62 --- /dev/null +++ b/.github/workflows/update-zim-offliner-definition.yaml @@ -0,0 +1,38 @@ 
+name: Update ZIMFarm Definitions + +on: + push: + branches: [main] + paths: + - "offliner-definition.json" + release: + types: [published] + +jobs: + prepare-json: + runs-on: ubuntu-24.04 + outputs: + offliner_definition: ${{ steps.read-json.outputs.offliner_definition }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - id: read-json + run: | + if [ ! -f "offliner-definition.json" ]; then + echo "File not found!" >&2 + exit 1 + fi + json=$(jq -c . offliner-definition.json) + echo "offliner_definition=$json" >> $GITHUB_OUTPUT + call-workflow: + needs: prepare-json + uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main + with: + version: ${{ github.event_name == 'release' && github.event.release.tag_name || 'dev' }} + offliner: zimit + offliner_definition: ${{ needs.prepare-json.outputs.offliner_definition }} + secrets: + zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }} diff --git a/offliner-definition.json b/offliner-definition.json new file mode 100644 index 0000000..c7fed57 --- /dev/null +++ b/offliner-definition.json @@ -0,0 +1,973 @@ +{ + "offliner_id": "zimit", + "stdOutput": true, + "stdStats": "zimit-progress-file", + "flags": { + "seeds": { + "type": "string", + "required": false, + "title": "Seeds", + "description": "The seed URL(s) to start crawling from. Multiple seed URLs must be separated by a comma (usually not needed, these are just the crawl seeds). First seed URL is used as ZIM homepage" + }, + "seed_file": { + "type": "string", + "required": false, + "title": "Seed File", + "description": "If set, read a list of seed urls, one per line. HTTPS URL to an online file." + }, + "lang": { + "type": "string", + "required": false, + "title": "Browser Language", + "description": "If set, sets the language used by the browser, should be ISO 639 language[-country] code, e.g. 
`en` or `en-GB`" + }, + "title": { + "type": "string", + "required": false, + "title": "Title", + "description": "Custom title for your ZIM. Defaults to title of main page", + "minLength": 1, + "maxLength": 30 + }, + "description": { + "type": "string", + "required": false, + "title": "Description", + "description": "Description for ZIM", + "minLength": 1, + "maxLength": 80 + }, + "favicon": { + "type": "url", + "required": false, + "title": "Illustration", + "description": "URL for Illustration. " + }, + "tags": { + "type": "string", + "required": false, + "title": "ZIM Tags", + "description": "Single string with individual tags separated by a semicolon." + }, + "creator": { + "type": "string", + "required": false, + "title": "Creator", + "description": "Name of content creator" + }, + "publisher": { + "type": "string", + "required": false, + "title": "Publisher", + "isPublisher": true, + "description": "Custom publisher name (ZIM metadata). openZIM otherwise" + }, + "source": { + "type": "string", + "required": false, + "title": "Source", + "description": "Source name/URL of content" + }, + "workers": { + "type": "integer", + "required": false, + "title": "Workers", + "description": "The number of workers to run in parallel. Defaults to 1", + "min": 1 + }, + "wait_until": { + "type": "string", + "required": false, + "title": "WaitUntil", + "description": "Puppeteer page.goto() condition to wait for before continuing. One of load, domcontentloaded, networkidle0 or networkidle2, or a comma-separated combination of those. Default is load,networkidle2" + }, + "extra_hops": { + "type": "integer", + "required": false, + "title": "Extra Hops", + "description": "Number of extra 'hops' to follow, beyond the current scope. Default is 0", + "min": 0 + }, + "page_limit": { + "type": "integer", + "required": false, + "title": "Page Limit", + "description": "Limit crawl to this number of pages. 
Default is 0 (no-limit).", + "min": 0 + }, + "max_page_limit": { + "type": "integer", + "required": false, + "title": "Max Page Limit", + "description": "Maximum pages to crawl, overriding pageLimit if both are set. Default is 0 (no-limit)", + "min": 0 + }, + "page_load_timeout": { + "type": "integer", + "required": false, + "title": "Page Load Timeout", + "description": "Timeout for each page to load (in seconds). Default is 90", + "min": 0 + }, + "scope_type": { + "type": "string-enum", + "required": false, + "title": "Scope Type", + "description": "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom if scopeIncludeRx is set, prefix otherwise.", + "choices": [ + { + "title": "Page", + "value": "page" + }, + { + "title": "Page SPA", + "value": "page-spa" + }, + { + "title": "Prefix", + "value": "prefix" + }, + { + "title": "Host", + "value": "host" + }, + { + "title": "Domain", + "value": "domain" + }, + { + "title": "Any", + "value": "any" + }, + { + "title": "Custom", + "value": "custom" + } + ] + }, + "scope_include_rx": { + "type": "string", + "required": false, + "title": "Scope Include Regex", + "description": "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of seed)" + }, + "scope_exclude_rx": { + "type": "string", + "required": false, + "title": "Scope Exclude Regex", + "description": "Regex of page URLs that should be excluded from the crawl" + }, + "allow_hash_urls": { + "type": "boolean", + "required": false, + "title": "Allow Hashtag URLs", + "description": "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content" + }, + "mobile_device": { + "type": "string-enum", + "required": false, + "title": "As device", + "description": "Device to crawl as. 
See Puppeteer's Device.ts for a list", + "choices": [ + { + "title": "Blackberry Playbook", + "value": "Blackberry PlayBook" + }, + { + "title": "Blackberry Playbook Landscape", + "value": "Blackberry PlayBook landscape" + }, + { + "title": "Blackberry Z30", + "value": "BlackBerry Z30" + }, + { + "title": "Blackberry Z30 Landscape", + "value": "BlackBerry Z30 landscape" + }, + { + "title": "Galaxy Note 3", + "value": "Galaxy Note 3" + }, + { + "title": "Galaxy Note 3 Landscape", + "value": "Galaxy Note 3 landscape" + }, + { + "title": "Galaxy Note II", + "value": "Galaxy Note II" + }, + { + "title": "Galaxy Note II Landscape", + "value": "Galaxy Note II landscape" + }, + { + "title": "Galaxy S III", + "value": "Galaxy S III" + }, + { + "title": "Galaxy S III Landscape", + "value": "Galaxy S III landscape" + }, + { + "title": "Galaxy S5", + "value": "Galaxy S5" + }, + { + "title": "Galaxy S5 Landscape", + "value": "Galaxy S5 landscape" + }, + { + "title": "Galaxy S8", + "value": "Galaxy S8" + }, + { + "title": "Galaxy S8 Landscape", + "value": "Galaxy S8 landscape" + }, + { + "title": "Galaxy S9 Plus", + "value": "Galaxy S9+" + }, + { + "title": "Galaxy S9 Plus Landscape", + "value": "Galaxy S9+ landscape" + }, + { + "title": "Galaxy Tab S4", + "value": "Galaxy Tab S4" + }, + { + "title": "Galaxy Tab S4 Landscape", + "value": "Galaxy Tab S4 landscape" + }, + { + "title": "iPad", + "value": "iPad" + }, + { + "title": "iPad Landscape", + "value": "iPad landscape" + }, + { + "title": "iPad Gen 6", + "value": "iPad (gen 6)" + }, + { + "title": "iPad Gen 6 Landscape", + "value": "iPad (gen 6) landscape" + }, + { + "title": "iPad Gen 7", + "value": "iPad (gen 7)" + }, + { + "title": "iPad Gen 7 Landscape", + "value": "iPad (gen 7) landscape" + }, + { + "title": "iPad Mini", + "value": "iPad Mini" + }, + { + "title": "iPad Mini Landscape", + "value": "iPad Mini landscape" + }, + { + "title": "iPad Pro", + "value": "iPad Pro" + }, + { + "title": "iPad Pro Landscape", + 
"value": "iPad Pro landscape" + }, + { + "title": "iPad Pro 11", + "value": "iPad Pro 11" + }, + { + "title": "iPad Pro 11 Landscape", + "value": "iPad Pro 11 landscape" + }, + { + "title": "iPhone 4", + "value": "iPhone 4" + }, + { + "title": "iPhone 4 Landscape", + "value": "iPhone 4 landscape" + }, + { + "title": "iPhone 5", + "value": "iPhone 5" + }, + { + "title": "iPhone 5 Landscape", + "value": "iPhone 5 landscape" + }, + { + "title": "iPhone 6", + "value": "iPhone 6" + }, + { + "title": "iPhone 6 Landscape", + "value": "iPhone 6 landscape" + }, + { + "title": "iPhone 6 Plus", + "value": "iPhone 6 Plus" + }, + { + "title": "iPhone 6 Plus Landscape", + "value": "iPhone 6 Plus landscape" + }, + { + "title": "iPhone 7", + "value": "iPhone 7" + }, + { + "title": "iPhone 7 Landscape", + "value": "iPhone 7 landscape" + }, + { + "title": "iPhone 7 Plus", + "value": "iPhone 7 Plus" + }, + { + "title": "iPhone 7 Plus Landscape", + "value": "iPhone 7 Plus landscape" + }, + { + "title": "iPhone 8", + "value": "iPhone 8" + }, + { + "title": "iPhone 8 Landscape", + "value": "iPhone 8 landscape" + }, + { + "title": "iPhone 8 Plus", + "value": "iPhone 8 Plus" + }, + { + "title": "iPhone 8 Plus Landscape", + "value": "iPhone 8 Plus landscape" + }, + { + "title": "iPhone SE", + "value": "iPhone SE" + }, + { + "title": "iPhone SE Landscape", + "value": "iPhone SE landscape" + }, + { + "title": "iPhone X", + "value": "iPhone X" + }, + { + "title": "iPhone X Landscape", + "value": "iPhone X landscape" + }, + { + "title": "iPhone XR", + "value": "iPhone XR" + }, + { + "title": "iPhone XR Landscape", + "value": "iPhone XR landscape" + }, + { + "title": "iPhone 11", + "value": "iPhone 11" + }, + { + "title": "iPhone 11 Landscape", + "value": "iPhone 11 landscape" + }, + { + "title": "iPhone 11 Pro", + "value": "iPhone 11 Pro" + }, + { + "title": "iPhone 11 Pro Landscape", + "value": "iPhone 11 Pro landscape" + }, + { + "title": "iPhone 11 Pro Max", + "value": "iPhone 11 Pro Max" + 
}, + { + "title": "iPhone 11 Pro Max Landscape", + "value": "iPhone 11 Pro Max landscape" + }, + { + "title": "iPhone 12", + "value": "iPhone 12" + }, + { + "title": "iPhone 12 Landscape", + "value": "iPhone 12 landscape" + }, + { + "title": "iPhone 12 Pro", + "value": "iPhone 12 Pro" + }, + { + "title": "iPhone 12 Pro Landscape", + "value": "iPhone 12 Pro landscape" + }, + { + "title": "iPhone 12 Pro Max", + "value": "iPhone 12 Pro Max" + }, + { + "title": "iPhone 12 Pro Max Landscape", + "value": "iPhone 12 Pro Max landscape" + }, + { + "title": "iPhone 12 Mini", + "value": "iPhone 12 Mini" + }, + { + "title": "iPhone 12 Mini Landscape", + "value": "iPhone 12 Mini landscape" + }, + { + "title": "iPhone 13", + "value": "iPhone 13" + }, + { + "title": "iPhone 13 Landscape", + "value": "iPhone 13 landscape" + }, + { + "title": "iPhone 13 Pro", + "value": "iPhone 13 Pro" + }, + { + "title": "iPhone 13 Pro Landscape", + "value": "iPhone 13 Pro landscape" + }, + { + "title": "iPhone 13 Pro Max", + "value": "iPhone 13 Pro Max" + }, + { + "title": "iPhone 13 Pro Max Landscape", + "value": "iPhone 13 Pro Max landscape" + }, + { + "title": "iPhone 13 Mini", + "value": "iPhone 13 Mini" + }, + { + "title": "iPhone 13 Mini Landscape", + "value": "iPhone 13 Mini landscape" + }, + { + "title": "Jio Phone 2", + "value": "JioPhone 2" + }, + { + "title": "Jio Phone 2 Landscape", + "value": "JioPhone 2 landscape" + }, + { + "title": "Kindle Fire HDX", + "value": "Kindle Fire HDX" + }, + { + "title": "Kindle Fire HDX Landscape", + "value": "Kindle Fire HDX landscape" + }, + { + "title": "LG Optimus L70", + "value": "LG Optimus L70" + }, + { + "title": "LG Optimus L70 Landscape", + "value": "LG Optimus L70 landscape" + }, + { + "title": "Microsoft Lumia 550", + "value": "Microsoft Lumia 550" + }, + { + "title": "Microsoft Lumia 950", + "value": "Microsoft Lumia 950" + }, + { + "title": "Microsoft Lumia 950 Landscape", + "value": "Microsoft Lumia 950 landscape" + }, + { + "title": 
"Nexus 10", + "value": "Nexus 10" + }, + { + "title": "Nexus 10 Landscape", + "value": "Nexus 10 landscape" + }, + { + "title": "Nexus 4", + "value": "Nexus 4" + }, + { + "title": "Nexus 4 Landscape", + "value": "Nexus 4 landscape" + }, + { + "title": "Nexus 5", + "value": "Nexus 5" + }, + { + "title": "Nexus 5 Landscape", + "value": "Nexus 5 landscape" + }, + { + "title": "Nexus 5X", + "value": "Nexus 5X" + }, + { + "title": "Nexus 5X Landscape", + "value": "Nexus 5X landscape" + }, + { + "title": "Nexus 6", + "value": "Nexus 6" + }, + { + "title": "Nexus 6 Landscape", + "value": "Nexus 6 landscape" + }, + { + "title": "Nexus 6P", + "value": "Nexus 6P" + }, + { + "title": "Nexus 6P Landscape", + "value": "Nexus 6P landscape" + }, + { + "title": "Nexus 7", + "value": "Nexus 7" + }, + { + "title": "Nexus 7 Landscape", + "value": "Nexus 7 landscape" + }, + { + "title": "Nokia Lumia 520", + "value": "Nokia Lumia 520" + }, + { + "title": "Nokia Lumia 520 Landscape", + "value": "Nokia Lumia 520 landscape" + }, + { + "title": "Nokia N9", + "value": "Nokia N9" + }, + { + "title": "Nokia N9 Landscape", + "value": "Nokia N9 landscape" + }, + { + "title": "Pixel 2", + "value": "Pixel 2" + }, + { + "title": "Pixel 2 Landscape", + "value": "Pixel 2 landscape" + }, + { + "title": "Pixel 2 XL", + "value": "Pixel 2 XL" + }, + { + "title": "Pixel 2 XL Landscape", + "value": "Pixel 2 XL landscape" + }, + { + "title": "Pixel 3", + "value": "Pixel 3" + }, + { + "title": "Pixel 3 Landscape", + "value": "Pixel 3 landscape" + }, + { + "title": "Pixel 4", + "value": "Pixel 4" + }, + { + "title": "Pixel 4 Landscape", + "value": "Pixel 4 landscape" + }, + { + "title": "Pixel 4A 5G", + "value": "Pixel 4a (5G)" + }, + { + "title": "Pixel 4A 5G Landscape", + "value": "Pixel 4a (5G) landscape" + }, + { + "title": "Pixel 5", + "value": "Pixel 5" + }, + { + "title": "Pixel 5 Landscape", + "value": "Pixel 5 landscape" + }, + { + "title": "Moto G4", + "value": "Moto G4" + }, + { + "title": "Moto 
G4 Landscape", + "value": "Moto G4 landscape" + } + ] + }, + "select_links": { + "type": "string", + "required": false, + "title": "Select Links", + "description": "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]" + }, + "click_selector": { + "type": "string", + "required": false, + "title": "Click Selector", + "description": "Selector for elements to click when using the autoclick behavior. Default is 'a'" + }, + "block_rules": { + "type": "string", + "required": false, + "title": "Block Rules", + "description": "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe" + }, + "block_message": { + "type": "string", + "required": false, + "title": "Block Message", + "description": "If specified, when a URL is blocked, a record with this error message is added instead" + }, + "block_ads": { + "type": "boolean", + "required": false, + "title": "Block Ads", + "description": "If set, block advertisements from being loaded (based on Stephen Black's blocklist). Note that some bad domains are also blocked by zimit configuration even if this option is not set." + }, + "ad_block_message": { + "type": "string", + "required": false, + "title": "Ads Block Message", + "description": "If specified, when an ad is blocked, a record with this error message is added instead" + }, + "user_agent": { + "type": "string", + "required": false, + "title": "User Agent", + "description": "Override user-agent with specified" + }, + "user_agent_suffix": { + "type": "string", + "required": false, + "title": "User Agent Suffix", + "description": "Append suffix to existing browser user-agent. 
Defaults to +Zimit" + }, + "use_sitemap": { + "type": "string", + "required": false, + "title": "Sitemap URL", + "description": "Use as sitemap to get additional URLs for the crawl (usually at /sitemap.xml)" + }, + "sitemap_from_date": { + "type": "string", + "required": false, + "title": "Sitemap From Date", + "description": "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)" + }, + "sitemap_to_date": { + "type": "string", + "required": false, + "title": "Sitemap To Date", + "description": "If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)" + }, + "behavior_timeout": { + "type": "integer", + "required": false, + "title": "Behavior Timeout", + "description": "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish. Default is 90.", + "min": 0 + }, + "post_load_delay": { + "type": "integer", + "required": false, + "title": "Post Load Delay", + "description": "If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors. Default is 0.", + "min": 0 + }, + "page_extra_delay": { + "type": "integer", + "required": false, + "title": "Page Extra Delay", + "description": "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page. Default is 0.", + "min": 0 + }, + "dedup_policy": { + "type": "string-enum", + "required": false, + "title": "Dedup Policy", + "description": "Deduplication policy. One of skip, revisit or keep. 
Default is skip", + "choices": [ + { + "title": "Skip", + "value": "skip" + }, + { + "title": "Revisit", + "value": "revisit" + }, + { + "title": "Keep", + "value": "keep" + } + ] + }, + "screenshot": { + "type": "string", + "required": false, + "title": "Screenshot", + "description": "Screenshot options for crawler. One of view, thumbnail, fullPage, fullPageFinal or a comma-separated combination of those." + }, + "size_soft_limit": { + "type": "integer", + "required": false, + "title": "Size Soft Limit", + "description": "If set, save crawl state and stop crawl if WARC size exceeds this value. ZIM will still be created.", + "min": 0 + }, + "size_hard_limit": { + "type": "integer", + "required": false, + "title": "Size Hard Limit", + "description": "If set, exit crawler and fail the scraper immediately if WARC size exceeds this value", + "min": 0 + }, + "disk_utilization": { + "type": "integer", + "required": false, + "title": "Disk Utilization", + "description": "Save state and exit if disk utilization exceeds this percentage value. Default (if not set) is 90%. Set to 0 to disable disk utilization check.", + "min": 0 + }, + "time_soft_limit": { + "type": "integer", + "required": false, + "title": "Time Soft Limit", + "description": "If set, save crawl state and stop crawl if WARC(s) creation takes longer than this value, in seconds. ZIM will still be created.", + "min": 0 + }, + "time_hard_limit": { + "type": "integer", + "required": false, + "title": "Time Hard Limit", + "description": "If set, exit crawler and fail the scraper immediately if WARC(s) creation takes longer than this value, in seconds", + "min": 0 + }, + "net_idle_wait": { + "type": "integer", + "required": false, + "title": "Net Idle Wait", + "description": "If set, wait for network idle after page load and after behaviors are done (in seconds). If -1 (default), determine based on scope." 
+ }, + "origin_override": { + "type": "string", + "required": false, + "title": "Origin Override", + "description": "If set, will redirect requests from each origin in key to origin in the value, eg. https://host:port=http://alt-host:alt-port." + }, + "max_page_retries": { + "type": "integer", + "required": false, + "title": "Max Page Retries", + "description": "If set, number of times to retry a page that failed to load before page is considered to have failed. Default is 2.", + "min": 0 + }, + "fail_on_failed_seed": { + "type": "boolean", + "required": false, + "title": "Fail on failed seed", + "description": "If set, crawler will fail with exit code 1 if any seed fails. When combined with --failOnInvalidStatus, will result in crawl failing with exit code 1 if any seed has a 4xx/5xx response" + }, + "fail_on_invalid_status": { + "type": "boolean", + "required": false, + "title": "Fail on invalid status", + "description": "If set, will treat pages with 4xx or 5xx response as failures. When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses" + }, + "fail_on_failed_limit": { + "type": "integer", + "required": false, + "title": "Fail on failed - Limit", + "description": "If set, save state and exit if number of failed pages exceeds this value.", + "min": 0 + }, + "warcs": { + "type": "string", + "required": false, + "title": "WARC files", + "description": "Comma-separated list of WARC files to use as input." + }, + "verbose": { + "type": "boolean", + "required": false, + "title": "Verbose mode", + "description": "Whether to display additional logs" + }, + "keep": { + "type": "boolean", + "required": false, + "title": "Keep", + "description": "Should be True. Developer option: must be True if we want to keep the WARC files for artifacts archiving.", + "default": true + }, + "output": { + "type": "string", + "required": false, + "title": "Output folder", + "description": "Output folder for ZIM file(s). 
Leave it as `/output`", + "pattern": "^/output$" + }, + "admin_email": { + "type": "email", + "required": false, + "title": "Admin Email", + "description": "Admin Email for crawler: used in UserAgent so website admin can contact us", + "default": "contact+zimfarm@kiwix.org" + }, + "profile": { + "type": "string", + "required": false, + "title": "Browser profile", + "description": "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory for Browsertrix crawler." + }, + "behaviors": { + "type": "string", + "required": false, + "title": "Behaviors", + "description": "Which background behaviors to enable on each page. Defaults to autoplay,autofetch,siteSpecific." + }, + "depth": { + "type": "integer", + "required": false, + "title": "Depth", + "description": "The depth of the crawl for all seeds. Default is -1 (infinite).", + "min": -1 + }, + "zim_lang": { + "type": "string", + "required": false, + "title": "ZIM Language", + "description": "Language metadata of ZIM (warc2zim --lang param). ISO-639-3 code. 
Retrieved from homepage if found, fallback to `eng`", + "alias": "zim-lang", + "customValidator": "language_code" + }, + "long_description": { + "type": "string", + "required": false, + "title": "Long description", + "description": "Optional long description for your ZIM", + "minLength": 1, + "maxLength": 4000, + "alias": "long-description" + }, + "custom_css": { + "type": "url", + "required": false, + "title": "Custom CSS", + "description": "URL to a CSS file to inject into pages", + "alias": "custom-css" + }, + "charsets_to_try": { + "type": "string", + "required": false, + "title": "Charsets to try", + "description": "List of charsets to try decode content when charset is not found", + "alias": "charsets-to-try" + }, + "ignore_content_header_charsets": { + "type": "boolean", + "required": false, + "title": "Ignore Content Header Charsets", + "description": "Ignore the charsets specified in content headers - first bytes - typically because they are wrong.", + "alias": "ignore-content-header-charsets" + }, + "content_header_bytes_length": { + "type": "integer", + "required": false, + "title": "Content Header Bytes Length", + "description": "How many bytes to consider when searching for content charsets in header (default is 1024).", + "alias": "content-header-bytes-length", + "min": 0 + }, + "ignore_http_header_charsets": { + "type": "boolean", + "required": false, + "title": "Ignore HTTP Header Charsets", + "description": "Ignore the charsets specified in HTTP `Content-Type` headers, typically because they are wrong.", + "alias": "ignore-http-header-charsets" + }, + "encoding_aliases": { + "type": "string", + "required": false, + "title": "Encoding Aliases", + "description": "List of encoding/charset aliases to decode WARC content. Aliases are used when the encoding specified in upstream server exists in Python under a different name. 
This parameter is a single string, multiple values are separated by a comma, like in alias1=encoding1,alias2=encoding2.", + "alias": "encoding-aliases" + }, + "custom_behaviors": { + "type": "string", + "required": false, + "title": "Custom Behaviors", + "description": "JS code for custom behaviors to customize crawler. Single string with individual JS files URL/path separated by a comma.", + "alias": "custom-behaviours" + }, + "zimit_progress_file": { + "type": "string", + "required": false, + "title": "Zimit Progress File", + "description": "Scraping progress file. Leave it as `/output/task_progress.json`", + "alias": "zimit-progress-file", + "pattern": "^/output/task_progress\\.json$" + }, + "replay_viewer_source": { + "type": "url", + "required": false, + "title": "Replay Viewer Source", + "description": "URL from which to load the ReplayWeb.page replay viewer", + "alias": "replay-viewer-source" + }, + "zim_file": { + "type": "string", + "required": false, + "title": "ZIM filename", + "description": "ZIM file name (based on --name if not provided). 
Include {period} to insert date period dynamically", + "alias": "zim-file", + "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+_)([a-z0-9\\-\\.]+_|)([\\d]{4}-[\\d]{2}|\\{period\\}).zim$", + "relaxedPattern": "^[A-Za-z0-9._-]+$" + }, + "name": { + "type": "string", + "required": true, + "title": "ZIM name", + "description": "Name of the ZIM.", + "alias": "name", + "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$", + "relaxedPattern": "^[A-Za-z0-9._-]+$" + } + } +} From 4ec47cd6dd7c8c69fa390f69c488fbc5ff9d1966 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Wed, 8 Oct 2025 04:25:12 +0100 Subject: [PATCH 03/15] use base64 string as argument to workflow call --- .github/workflows/update-zim-offliner-definition.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml index 4662e62..ee26474 100644 --- a/.github/workflows/update-zim-offliner-definition.yaml +++ b/.github/workflows/update-zim-offliner-definition.yaml @@ -25,14 +25,14 @@ jobs: echo "File not found!" >&2 exit 1 fi - json=$(jq -c . offliner-definition.json) - echo "offliner_definition=$json" >> $GITHUB_OUTPUT + json_b64=$(base64 -w0 <<< "$(jq -c . 
offliner-definition.json)") + echo "offliner_definition_b64=$json_b64" >> $GITHUB_OUTPUT call-workflow: needs: prepare-json uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main with: version: ${{ github.event_name == 'release' && github.event.release.tag_name || 'dev' }} offliner: zimit - offliner_definition: ${{ needs.prepare-json.outputs.offliner_definition }} + offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }} secrets: zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }} From ad09665c4a93a503b0394f50a3835f69e6b6c6e5 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Fri, 10 Oct 2025 10:22:29 +0100 Subject: [PATCH 04/15] add workflow dispatch to update-offliner ci --- .github/workflows/update-zim-offliner-definition.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml index ee26474..982fe03 100644 --- a/.github/workflows/update-zim-offliner-definition.yaml +++ b/.github/workflows/update-zim-offliner-definition.yaml @@ -8,6 +8,13 @@ on: release: types: [published] + workflow_dispatch: + inputs: + version: + description: "Version to publish (leave blank to use 'dev')" + required: false + default: "dev" + jobs: prepare-json: runs-on: ubuntu-24.04 @@ -31,7 +38,7 @@ jobs: needs: prepare-json uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main with: - version: ${{ github.event_name == 'release' && github.event.release.tag_name || 'dev' }} + version: ${{ github.event_name == 'release' && github.event.release.tag_name || (github.event.inputs.version || 'dev') }} offliner: zimit offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }} secrets: From a9805c84c284fc23f0e6497b79cb42e53e4adb28 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Fri, 10 Oct 2025 10:34:26 +0100 Subject: [PATCH 05/15] set proper outputs 
name --- .github/workflows/update-zim-offliner-definition.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml index 982fe03..f481354 100644 --- a/.github/workflows/update-zim-offliner-definition.yaml +++ b/.github/workflows/update-zim-offliner-definition.yaml @@ -11,7 +11,7 @@ on: workflow_dispatch: inputs: version: - description: "Version to publish (leave blank to use 'dev')" + description: "Version to publish" required: false default: "dev" @@ -19,7 +19,7 @@ jobs: prepare-json: runs-on: ubuntu-24.04 outputs: - offliner_definition: ${{ steps.read-json.outputs.offliner_definition }} + offliner_definition_b64: ${{ steps.read-json.outputs.offliner_definition_b64 }} steps: - name: Checkout repository uses: actions/checkout@v4 From 44cf4218cb1940b4fd0cfa45032da1b8d3fdf130 Mon Sep 17 00:00:00 2001 From: Vitaly Zdanevich Date: Mon, 20 Oct 2025 01:22:31 +0400 Subject: [PATCH 06/15] README.md: add link to https://en.wikipedia.org/wiki/ZIM_(file_format) Signed-off-by: Vitaly Zdanevich --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 894f523..188615f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ Zimit ===== -Zimit is a scraper allowing to create ZIM file from any Web site. +Zimit is a scraper allowing to create [ZIM file](https://en.wikipedia.org/wiki/ZIM_(file_format)) from any Web site. [![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) From 611d2033f7500a117aed9069fcad0abd7384b14c Mon Sep 17 00:00:00 2001 From: Chris Routh Date: Thu, 6 Nov 2025 09:29:15 -0800 Subject: [PATCH 07/15] Issue #499 - Use build dir rather than random tmp dir when passed. 
--- src/zimit/zimit.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 02b167d..30c5de0 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -796,11 +796,13 @@ def run(raw_args): if known_args.adminEmail: user_agent_suffix += f" {known_args.adminEmail}" - # make temp dir for this crawl + # set temp dir to use for this crawl global temp_root_dir # noqa: PLW0603 if known_args.build: - temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.build, prefix=".tmp")) + # use build dir argument if passed + temp_root_dir = Path(known_args.build) else: + # make new randomized temp dir temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp")) seeds = [] From 4595d2a3027c82fb04629a5e949a8a12bb2483a5 Mon Sep 17 00:00:00 2001 From: Chris Routh Date: Thu, 6 Nov 2025 09:36:47 -0800 Subject: [PATCH 08/15] Issue #499 - Only register cleanup if neither build or keep arguments have been passed. --- src/zimit/zimit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 30c5de0..9ed8a20 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -856,7 +856,8 @@ def run(raw_args): logger.info("Exiting, invalid warc2zim params") return EXIT_CODE_WARC2ZIM_CHECK_FAILED - if not known_args.keep: + # only register cleanup when neither a build dir nor --keep has been passed. + if not known_args.build and not known_args.keep: atexit.register(cleanup) # copy / download custom behaviors to one single folder and configure crawler From 57a88434e22517f6cffb63070d7852b11ad2d7b8 Mon Sep 17 00:00:00 2001 From: Chris Routh Date: Thu, 6 Nov 2025 11:49:58 -0800 Subject: [PATCH 09/15] Issue #499 - Use all warc_directories found when no specific collection has been passed.
--- src/zimit/zimit.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 9ed8a20..a91c4e4 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -1079,12 +1079,11 @@ def run(raw_args): ) elif len(warc_dirs) > 1: logger.info( - "Found many WARC files directories, only most recently modified one" - " will be used" + "Found many WARC files directories, combining pages from all of them" ) for directory in warc_dirs: logger.info(f"- {directory}") - warc_files = [warc_dirs[-1]] + warc_files = warc_dirs logger.info("") logger.info("----------") From 6db73a0a83f6d3b028175ba3a918a75493340f70 Mon Sep 17 00:00:00 2001 From: Chris Routh Date: Thu, 6 Nov 2025 12:19:28 -0800 Subject: [PATCH 10/15] Issue #499 - Ensure build directory exists when passed. --- src/zimit/zimit.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index a91c4e4..fb070a0 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -801,6 +801,7 @@ def run(raw_args): if known_args.build: # use build dir argument if passed temp_root_dir = Path(known_args.build) + temp_root_dir.mkdir(parents=True, exist_ok=True) else: # make new randomized temp dir temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp")) From ef004f38635a2b1db1d1385212f913f04ee659e4 Mon Sep 17 00:00:00 2001 From: Chris Routh Date: Fri, 7 Nov 2025 11:33:01 -0800 Subject: [PATCH 11/15] Issue #499 Record changes in CHANGELOG --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e06e20..58fb40a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed +- Fix issues preventing interrupted crawls from being resumed. 
(#499) + - Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist. + - Use all warc_dirs found instead of just the latest so interrupted crawls use all collected pages across runs when an explicit collections directory is not passed. + - Don't clean up an explicitly passed build directory. + ## [3.0.5] - 2024-04-11 ### Changed From e30a82a91c4e75de290e04b6d9b56aa9d5832799 Mon Sep 17 00:00:00 2001 From: Chris Routh Date: Fri, 7 Nov 2025 12:59:25 -0800 Subject: [PATCH 12/15] PR #524 Fix line length. --- src/zimit/zimit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index fb070a0..e982cbd 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -1080,7 +1080,8 @@ def run(raw_args): ) elif len(warc_dirs) > 1: logger.info( - "Found many WARC files directories, combining pages from all of them" + "Found many WARC files directories, combining pages from all " + "of them" ) for directory in warc_dirs: logger.info(f"- {directory}") From aec19d95d2257f72445746f92759e9b88574a31a Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Mon, 15 Dec 2025 14:25:24 +0100 Subject: [PATCH 13/15] migrate custom_css and favicon flags to blob types --- offliner-definition.json | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/offliner-definition.json b/offliner-definition.json index c7fed57..89bdd51 100644 --- a/offliner-definition.json +++ b/offliner-definition.json @@ -38,7 +38,8 @@ "maxLength": 80 }, "favicon": { - "type": "url", + "type": "blob", + "kind": "image", "required": false, "title": "Illustration", "description": "URL for Illustration. 
" @@ -887,7 +888,8 @@ "alias": "long-description" }, "custom_css": { - "type": "url", + "type": "blob", + "kind": "image", "required": false, "title": "Custom CSS", "description": "URL to a CSS file to inject into pages", From 34ce7eb98dc7a35dac7f3824a856e5e1e23587bf Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 16 Dec 2025 16:33:53 +0000 Subject: [PATCH 14/15] Fix offliner definition --- offliner-definition.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/offliner-definition.json b/offliner-definition.json index 89bdd51..4000466 100644 --- a/offliner-definition.json +++ b/offliner-definition.json @@ -889,7 +889,7 @@ }, "custom_css": { "type": "blob", - "kind": "image", + "kind": "css", "required": false, "title": "Custom CSS", "description": "URL to a CSS file to inject into pages", From 81018f06fa15517917c4c6e52d0212ca669b35dc Mon Sep 17 00:00:00 2001 From: Aaryan Kumar Sinha Date: Sat, 13 Dec 2025 01:30:33 +0530 Subject: [PATCH 15/15] Added --overwrite flag to zimit --- CHANGELOG.md | 3 ++ offliner-definition.json | 6 +++ src/zimit/zimit.py | 6 ++- tests/conftest.py | 14 ++++++ tests/data/example-response.warc | Bin 0 -> 2272 bytes tests/test_overwrite.py | 83 +++++++++++++++++++++++++++++++ 6 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 tests/conftest.py create mode 100644 tests/data/example-response.warc create mode 100644 tests/test_overwrite.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 58fb40a..2a99b30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Added `--overwrite` flag to overwrite existing ZIM file if it exists (#399) + ### Changed - Fix issues preventing interrupted crawls from being resumed. (#499) - Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist. 
diff --git a/offliner-definition.json b/offliner-definition.json index 4000466..4bb68b5 100644 --- a/offliner-definition.json +++ b/offliner-definition.json @@ -970,6 +970,12 @@ "alias": "name", "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$", "relaxedPattern": "^[A-Za-z0-9._-]+$" + }, + "overwrite": { + "type": "boolean", + "required": false, + "title": "Overwrite", + "description": "Whether to overwrite existing ZIM file if it exists" } } } diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index e982cbd..b205007 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -849,6 +849,9 @@ def run(raw_args): warc2zim_args.append("--lang") warc2zim_args.append(known_args.zim_lang) + if known_args.overwrite: + warc2zim_args.append("--overwrite") + logger.info("----------") logger.info("Testing warc2zim args") logger.info("Running: warc2zim " + " ".join(warc2zim_args)) @@ -1036,7 +1039,6 @@ def run(raw_args): warc_files.append(Path(extract_path)) else: - logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") crawl = subprocess.run(crawler_args, check=False) if ( @@ -1091,7 +1093,7 @@ def run(raw_args): logger.info("----------") logger.info( f"Processing WARC files in/at " - f'{" ".join(str(warc_file) for warc_file in warc_files)}' + f"{' '.join(str(warc_file) for warc_file in warc_files)}" ) warc2zim_args.extend(str(warc_file) for warc_file in warc_files) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..d51650d --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,14 @@ +import pytest + +from zimit import zimit as app + +""" + cleanup disabled because atexit hooks run at the very end of the Python process + shutdown. By the time cleanup() is called, the logging module has already closed its + file streams. 
+""" + + +@pytest.fixture(autouse=True) +def disable_zimit_cleanup(monkeypatch): + monkeypatch.setattr(app, "cleanup", lambda: None) diff --git a/tests/data/example-response.warc b/tests/data/example-response.warc new file mode 100644 index 0000000000000000000000000000000000000000..143b947d121e61b479cf9cae596b3853a9a60633 GIT binary patch literal 2272 zcmb`I`9Bkk1IJ13mB}8VF(i@kY%^vP#gc21YjZ|(tKqG9zOV21Klpw>KYxDuyxy<(2SO1MX-n+EA32QhCSDKm^z*;tt%-oa zbzumE4h)IVMxfD1Jx$oZqp70_gTvu^a6R;OZ!|(H;-5YrE|U`=pae?&C9rW^Ya*p1 zbjT~mR!ua~n4kol@A;&6KjUFCyxAaDN7gUAWNv-;u#?O2K?`&&1yHTSX2yZ~do0BF z1v%l+=Di-I;tyLt)05F<6`6pGKI@`J(9hMq1{(0}2gk8{zb>Pdn ztf2h@%$|g4W_!Wub6PgVBlKrY&?+_I1R=Rwg_rSHm)Ep2@w(m?{$*hGDov_TiqXL} z0tcpoDm71%@65h^YyJA;--K40^~0$xbQB6a&<_M!xeX7W*;F6p>U2!VxOy1^y$a5Q z-pjO~%c9tE?}km)g1?1MA5sn`*w z$lKUUV=a7qe20eud{_YdRm1V!X&#}%_2<; zXY$Dd(Rrsz$A=p4{*@O+`xQl->mGC&Kt>|U@w4i`4>?aKf|ZRw&fGDFxi((>swq|x zEJ-enG&wo9g*lq8WCo_E$T*v2AT1<;SSM-eXF!jHexp|6`4&aoOOY-yjl?Oy14%U_ z7x;tIX6YVO19|aFr*itiluYb0eR&EgH!r|eIE(wiWj}%a%kT15Qt`cPy}M_!gXME^ z-*5=?N(cWyD!?to@xoeU!EuGI<4jAb=T8aHF|S$QzZ%#@x?aXqtJfMaNU0)v-@=d4 zVhUJS?IT!ujK6MK%=`r3gMMQ{Q`-za=tu1&>2F7R1@bzSQU+6QK5he86a*^%nVmee zZlYH*D|*?IPYec9Bf?!h)a3;k$h*MgqrHm>&Sm&Ams+1QAlFv_EKas}NY1s%CFspf zFi|5sG6(Z*1$gY>LEu?3NB|;sGsrO61_RhkBY$gXypsfgb@9Hoh>$adRqN~4Qg zl_)0UBp=DsOvUiy`T2mO49{tc%+KMV`7?C|dMl|0gd$8u(Qm&XJ+++rn_k~PGgMojC`zD!i@ z6PGKwA7;NuQ2=@C$oRj}X)uO2txn)@yLoe1YAGD@@j;}z) zb<{ILaFg#+BRTs7{n=Ng7}lyrUF>w~f#n2oN$klv((};%fk97HV_I}_!G=+MwVqac zXinZrw95%0>+%D}#`Mwxwx#h`z5a+FBG%+0A{ff1h zKR>m03#_$;o-QX4m(;D+HryntdXKkob^StIv-Z|`I^t2mx5lfc79i2WC#rOFYs{dI zg|<)8ih=r}UOoqvsUxlv)w0U;xHvmDIzI6k{LKK0bd4?1y_;Q9i%t|4)6CQtB6gU?})Q-i!JMs0gDfL|e|fCnnTq)tcbTzjc4<%Y+EE z#ni*By(?Y{UA8f=x(Iv?)b53Q5t8Yr?andDyRP*7EgFyQH42e zuacpvQqzv4Y={{0&aenFiM`WqwXlHV`sOAP(2mztg{DvB`LyC!M@VAyM7 zS14cud+eaoBLn0xaRwvtMs`zsRAdHj3)HX^$@~-ZAOq9ancd24=pmnCrrvma*d4A#_O6AbcZTEp5`+$D98#-XBzbn{k>%2>;U*#I@X_s6KHew>eu=87 RH3|9e>Fz|Z3l0B4?{AQ;Bs>5B literal 0 HcmV?d00001 diff --git 
a/tests/test_overwrite.py b/tests/test_overwrite.py new file mode 100644 index 0000000..e41baca --- /dev/null +++ b/tests/test_overwrite.py @@ -0,0 +1,83 @@ +import pathlib + +import pytest + +from zimit.zimit import run + +TEST_DATA_DIR = pathlib.Path(__file__).parent / "data" + + +def test_overwrite_flag_behaviour(tmp_path): + zim_output = "overwrite-test.zim" + output_path = tmp_path / zim_output + + # 1st run → creates file + result = run( + [ + "--seeds", + "https://example.com", + "--warcs", + str(TEST_DATA_DIR / "example-response.warc"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + ] + ) + assert result in (None, 100) + assert output_path.exists() + + # 2nd run, no overwrite → should fail + with pytest.raises(SystemExit) as exc: + run( + [ + "--seeds", + "https://example.com", + "--warcs", + str(TEST_DATA_DIR / "example-response.warc"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + ] + ) + assert exc.value.code == 2 + + # 3rd run, no overwrite and no --warcs → should fail + with pytest.raises(SystemExit) as exc: + run( + [ + "--seeds", + "https://example.com", + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + ] + ) + assert exc.value.code == 2 + + # 4th run, with overwrite → should succeed + result = run( + [ + "--seeds", + "https://example.com", + "--warcs", + str(TEST_DATA_DIR / "example-response.warc"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + "--overwrite", + ] + ) + assert result in (None, 100) + assert output_path.exists()