diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml new file mode 100644 index 0000000..f481354 --- /dev/null +++ b/.github/workflows/update-zim-offliner-definition.yaml @@ -0,0 +1,45 @@ +name: Update ZIMFarm Definitions + +on: + push: + branches: [main] + paths: + - "offliner-definition.json" + release: + types: [published] + + workflow_dispatch: + inputs: + version: + description: "Version to publish" + required: false + default: "dev" + +jobs: + prepare-json: + runs-on: ubuntu-24.04 + outputs: + offliner_definition_b64: ${{ steps.read-json.outputs.offliner_definition_b64 }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - id: read-json + run: | + if [ ! -f "offliner-definition.json" ]; then + echo "File not found!" >&2 + exit 1 + fi + json_b64=$(base64 -w0 <<< "$(jq -c . offliner-definition.json)") + echo "offliner_definition_b64=$json_b64" >> $GITHUB_OUTPUT + call-workflow: + needs: prepare-json + uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main + with: + version: ${{ github.event_name == 'release' && github.event.release.tag_name || (github.event.inputs.version || 'dev') }} + offliner: zimit + offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }} + secrets: + zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }} diff --git a/CHANGELOG.md b/CHANGELOG.md index bc99b8f..2a99b30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,17 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). +## [Unreleased] + +### Added +- Added `--overwrite` flag to overwrite existing ZIM file if it exists (#399) + +### Changed +- Fix issues preventing interrupted crawls from being resumed. 
(#499) + - Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist. + - Use all warc_dirs found instead of just the latest so interrupted crawls use all collected pages across runs when an explicit collections directory is not passed. + - Don't cleanup an explicitly passed build directory. + ## [3.0.5] - 2024-04-11 ### Changed diff --git a/README.md b/README.md index 894f523..188615f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ Zimit ===== -Zimit is a scraper allowing to create ZIM file from any Web site. +Zimit is a scraper allowing to create [ZIM file](https://en.wikipedia.org/wiki/ZIM_(file_format)) from any Web site. [![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) diff --git a/offliner-definition.json b/offliner-definition.json new file mode 100644 index 0000000..4bb68b5 --- /dev/null +++ b/offliner-definition.json @@ -0,0 +1,981 @@ +{ + "offliner_id": "zimit", + "stdOutput": true, + "stdStats": "zimit-progress-file", + "flags": { + "seeds": { + "type": "string", + "required": false, + "title": "Seeds", + "description": "The seed URL(s) to start crawling from. Multiple seed URLs must be separated by a comma (usually not needed, these are just the crawl seeds). First seed URL is used as ZIM homepage" + }, + "seed_file": { + "type": "string", + "required": false, + "title": "Seed File", + "description": "If set, read a list of seed urls, one per line. HTTPS URL to an online file." + }, + "lang": { + "type": "string", + "required": false, + "title": "Browser Language", + "description": "If set, sets the language used by the browser, should be ISO 639 language[-country] code, e.g. 
`en` or `en-GB`" + }, + "title": { + "type": "string", + "required": false, + "title": "Title", + "description": "Custom title for your ZIM. Defaults to title of main page", + "minLength": 1, + "maxLength": 30 + }, + "description": { + "type": "string", + "required": false, + "title": "Description", + "description": "Description for ZIM", + "minLength": 1, + "maxLength": 80 + }, + "favicon": { + "type": "blob", + "kind": "image", + "required": false, + "title": "Illustration", + "description": "URL for Illustration. " + }, + "tags": { + "type": "string", + "required": false, + "title": "ZIM Tags", + "description": "Single string with individual tags separated by a semicolon." + }, + "creator": { + "type": "string", + "required": false, + "title": "Creator", + "description": "Name of content creator" + }, + "publisher": { + "type": "string", + "required": false, + "title": "Publisher", + "isPublisher": true, + "description": "Custom publisher name (ZIM metadata). openZIM otherwise" + }, + "source": { + "type": "string", + "required": false, + "title": "Source", + "description": "Source name/URL of content" + }, + "workers": { + "type": "integer", + "required": false, + "title": "Workers", + "description": "The number of workers to run in parallel. Defaults to 1", + "min": 1 + }, + "wait_until": { + "type": "string", + "required": false, + "title": "WaitUntil", + "description": "Puppeteer page.goto() condition to wait for before continuing. One of load, domcontentloaded, networkidle0 or networkidle2, or a comma-separated combination of those. Default is load,networkidle2" + }, + "extra_hops": { + "type": "integer", + "required": false, + "title": "Extra Hops", + "description": "Number of extra 'hops' to follow, beyond the current scope. Default is 0", + "min": 0 + }, + "page_limit": { + "type": "integer", + "required": false, + "title": "Page Limit", + "description": "Limit crawl to this number of pages. 
Default is 0 (no-limit).", + "min": 0 + }, + "max_page_limit": { + "type": "integer", + "required": false, + "title": "Max Page Limit", + "description": "Maximum pages to crawl, overriding pageLimit if both are set. Default is 0 (no-limit)", + "min": 0 + }, + "page_load_timeout": { + "type": "integer", + "required": false, + "title": "Page Load Timeout", + "description": "Timeout for each page to load (in seconds). Default is 90", + "min": 0 + }, + "scope_type": { + "type": "string-enum", + "required": false, + "title": "Scope Type", + "description": "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom if scopeIncludeRx is set, prefix otherwise.", + "choices": [ + { + "title": "Page", + "value": "page" + }, + { + "title": "Page SPA", + "value": "page-spa" + }, + { + "title": "Prefix", + "value": "prefix" + }, + { + "title": "Host", + "value": "host" + }, + { + "title": "Domain", + "value": "domain" + }, + { + "title": "Any", + "value": "any" + }, + { + "title": "Custom", + "value": "custom" + } + ] + }, + "scope_include_rx": { + "type": "string", + "required": false, + "title": "Scope Include Regex", + "description": "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of seed)" + }, + "scope_exclude_rx": { + "type": "string", + "required": false, + "title": "Scope Exclude Regex", + "description": "Regex of page URLs that should be excluded from the crawl" + }, + "allow_hash_urls": { + "type": "boolean", + "required": false, + "title": "Allow Hashtag URLs", + "description": "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content" + }, + "mobile_device": { + "type": "string-enum", + "required": false, + "title": "As device", + "description": "Device to crawl as. 
See Puppeteer's Device.ts for a list", + "choices": [ + { + "title": "Blackberry Playbook", + "value": "Blackberry PlayBook" + }, + { + "title": "Blackberry Playbook Landscape", + "value": "Blackberry PlayBook landscape" + }, + { + "title": "Blackberry Z30", + "value": "BlackBerry Z30" + }, + { + "title": "Blackberry Z30 Landscape", + "value": "BlackBerry Z30 landscape" + }, + { + "title": "Galaxy Note 3", + "value": "Galaxy Note 3" + }, + { + "title": "Galaxy Note 3 Landscape", + "value": "Galaxy Note 3 landscape" + }, + { + "title": "Galaxy Note II", + "value": "Galaxy Note II" + }, + { + "title": "Galaxy Note II Landscape", + "value": "Galaxy Note II landscape" + }, + { + "title": "Galaxy S III", + "value": "Galaxy S III" + }, + { + "title": "Galaxy S III Landscape", + "value": "Galaxy S III landscape" + }, + { + "title": "Galaxy S5", + "value": "Galaxy S5" + }, + { + "title": "Galaxy S5 Landscape", + "value": "Galaxy S5 landscape" + }, + { + "title": "Galaxy S8", + "value": "Galaxy S8" + }, + { + "title": "Galaxy S8 Landscape", + "value": "Galaxy S8 landscape" + }, + { + "title": "Galaxy S9 Plus", + "value": "Galaxy S9+" + }, + { + "title": "Galaxy S9 Plus Landscape", + "value": "Galaxy S9+ landscape" + }, + { + "title": "Galaxy Tab S4", + "value": "Galaxy Tab S4" + }, + { + "title": "Galaxy Tab S4 Landscape", + "value": "Galaxy Tab S4 landscape" + }, + { + "title": "iPad", + "value": "iPad" + }, + { + "title": "iPad Landscape", + "value": "iPad landscape" + }, + { + "title": "iPad Gen 6", + "value": "iPad (gen 6)" + }, + { + "title": "iPad Gen 6 Landscape", + "value": "iPad (gen 6) landscape" + }, + { + "title": "iPad Gen 7", + "value": "iPad (gen 7)" + }, + { + "title": "iPad Gen 7 Landscape", + "value": "iPad (gen 7) landscape" + }, + { + "title": "iPad Mini", + "value": "iPad Mini" + }, + { + "title": "iPad Mini Landscape", + "value": "iPad Mini landscape" + }, + { + "title": "iPad Pro", + "value": "iPad Pro" + }, + { + "title": "iPad Pro Landscape", + 
"value": "iPad Pro landscape" + }, + { + "title": "iPad Pro 11", + "value": "iPad Pro 11" + }, + { + "title": "iPad Pro 11 Landscape", + "value": "iPad Pro 11 landscape" + }, + { + "title": "iPhone 4", + "value": "iPhone 4" + }, + { + "title": "iPhone 4 Landscape", + "value": "iPhone 4 landscape" + }, + { + "title": "iPhone 5", + "value": "iPhone 5" + }, + { + "title": "iPhone 5 Landscape", + "value": "iPhone 5 landscape" + }, + { + "title": "iPhone 6", + "value": "iPhone 6" + }, + { + "title": "iPhone 6 Landscape", + "value": "iPhone 6 landscape" + }, + { + "title": "iPhone 6 Plus", + "value": "iPhone 6 Plus" + }, + { + "title": "iPhone 6 Plus Landscape", + "value": "iPhone 6 Plus landscape" + }, + { + "title": "iPhone 7", + "value": "iPhone 7" + }, + { + "title": "iPhone 7 Landscape", + "value": "iPhone 7 landscape" + }, + { + "title": "iPhone 7 Plus", + "value": "iPhone 7 Plus" + }, + { + "title": "iPhone 7 Plus Landscape", + "value": "iPhone 7 Plus landscape" + }, + { + "title": "iPhone 8", + "value": "iPhone 8" + }, + { + "title": "iPhone 8 Landscape", + "value": "iPhone 8 landscape" + }, + { + "title": "iPhone 8 Plus", + "value": "iPhone 8 Plus" + }, + { + "title": "iPhone 8 Plus Landscape", + "value": "iPhone 8 Plus landscape" + }, + { + "title": "iPhone SE", + "value": "iPhone SE" + }, + { + "title": "iPhone SE Landscape", + "value": "iPhone SE landscape" + }, + { + "title": "iPhone X", + "value": "iPhone X" + }, + { + "title": "iPhone X Landscape", + "value": "iPhone X landscape" + }, + { + "title": "iPhone XR", + "value": "iPhone XR" + }, + { + "title": "iPhone XR Landscape", + "value": "iPhone XR landscape" + }, + { + "title": "iPhone 11", + "value": "iPhone 11" + }, + { + "title": "iPhone 11 Landscape", + "value": "iPhone 11 landscape" + }, + { + "title": "iPhone 11 Pro", + "value": "iPhone 11 Pro" + }, + { + "title": "iPhone 11 Pro Landscape", + "value": "iPhone 11 Pro landscape" + }, + { + "title": "iPhone 11 Pro Max", + "value": "iPhone 11 Pro Max" + 
}, + { + "title": "iPhone 11 Pro Max Landscape", + "value": "iPhone 11 Pro Max landscape" + }, + { + "title": "iPhone 12", + "value": "iPhone 12" + }, + { + "title": "iPhone 12 Landscape", + "value": "iPhone 12 landscape" + }, + { + "title": "iPhone 12 Pro", + "value": "iPhone 12 Pro" + }, + { + "title": "iPhone 12 Pro Landscape", + "value": "iPhone 12 Pro landscape" + }, + { + "title": "iPhone 12 Pro Max", + "value": "iPhone 12 Pro Max" + }, + { + "title": "iPhone 12 Pro Max Landscape", + "value": "iPhone 12 Pro Max landscape" + }, + { + "title": "iPhone 12 Mini", + "value": "iPhone 12 Mini" + }, + { + "title": "iPhone 12 Mini Landscape", + "value": "iPhone 12 Mini landscape" + }, + { + "title": "iPhone 13", + "value": "iPhone 13" + }, + { + "title": "iPhone 13 Landscape", + "value": "iPhone 13 landscape" + }, + { + "title": "iPhone 13 Pro", + "value": "iPhone 13 Pro" + }, + { + "title": "iPhone 13 Pro Landscape", + "value": "iPhone 13 Pro landscape" + }, + { + "title": "iPhone 13 Pro Max", + "value": "iPhone 13 Pro Max" + }, + { + "title": "iPhone 13 Pro Max Landscape", + "value": "iPhone 13 Pro Max landscape" + }, + { + "title": "iPhone 13 Mini", + "value": "iPhone 13 Mini" + }, + { + "title": "iPhone 13 Mini Landscape", + "value": "iPhone 13 Mini landscape" + }, + { + "title": "Jio Phone 2", + "value": "JioPhone 2" + }, + { + "title": "Jio Phone 2 Landscape", + "value": "JioPhone 2 landscape" + }, + { + "title": "Kindle Fire HDX", + "value": "Kindle Fire HDX" + }, + { + "title": "Kindle Fire HDX Landscape", + "value": "Kindle Fire HDX landscape" + }, + { + "title": "LG Optimus L70", + "value": "LG Optimus L70" + }, + { + "title": "LG Optimus L70 Landscape", + "value": "LG Optimus L70 landscape" + }, + { + "title": "Microsoft Lumia 550", + "value": "Microsoft Lumia 550" + }, + { + "title": "Microsoft Lumia 950", + "value": "Microsoft Lumia 950" + }, + { + "title": "Microsoft Lumia 950 Landscape", + "value": "Microsoft Lumia 950 landscape" + }, + { + "title": 
"Nexus 10", + "value": "Nexus 10" + }, + { + "title": "Nexus 10 Landscape", + "value": "Nexus 10 landscape" + }, + { + "title": "Nexus 4", + "value": "Nexus 4" + }, + { + "title": "Nexus 4 Landscape", + "value": "Nexus 4 landscape" + }, + { + "title": "Nexus 5", + "value": "Nexus 5" + }, + { + "title": "Nexus 5 Landscape", + "value": "Nexus 5 landscape" + }, + { + "title": "Nexus 5X", + "value": "Nexus 5X" + }, + { + "title": "Nexus 5X Landscape", + "value": "Nexus 5X landscape" + }, + { + "title": "Nexus 6", + "value": "Nexus 6" + }, + { + "title": "Nexus 6 Landscape", + "value": "Nexus 6 landscape" + }, + { + "title": "Nexus 6P", + "value": "Nexus 6P" + }, + { + "title": "Nexus 6P Landscape", + "value": "Nexus 6P landscape" + }, + { + "title": "Nexus 7", + "value": "Nexus 7" + }, + { + "title": "Nexus 7 Landscape", + "value": "Nexus 7 landscape" + }, + { + "title": "Nokia Lumia 520", + "value": "Nokia Lumia 520" + }, + { + "title": "Nokia Lumia 520 Landscape", + "value": "Nokia Lumia 520 landscape" + }, + { + "title": "Nokia N9", + "value": "Nokia N9" + }, + { + "title": "Nokia N9 Landscape", + "value": "Nokia N9 landscape" + }, + { + "title": "Pixel 2", + "value": "Pixel 2" + }, + { + "title": "Pixel 2 Landscape", + "value": "Pixel 2 landscape" + }, + { + "title": "Pixel 2 XL", + "value": "Pixel 2 XL" + }, + { + "title": "Pixel 2 XL Landscape", + "value": "Pixel 2 XL landscape" + }, + { + "title": "Pixel 3", + "value": "Pixel 3" + }, + { + "title": "Pixel 3 Landscape", + "value": "Pixel 3 landscape" + }, + { + "title": "Pixel 4", + "value": "Pixel 4" + }, + { + "title": "Pixel 4 Landscape", + "value": "Pixel 4 landscape" + }, + { + "title": "Pixel 4A 5G", + "value": "Pixel 4a (5G)" + }, + { + "title": "Pixel 4A 5G Landscape", + "value": "Pixel 4a (5G) landscape" + }, + { + "title": "Pixel 5", + "value": "Pixel 5" + }, + { + "title": "Pixel 5 Landscape", + "value": "Pixel 5 landscape" + }, + { + "title": "Moto G4", + "value": "Moto G4" + }, + { + "title": "Moto 
G4 Landscape", + "value": "Moto G4 landscape" + } + ] + }, + "select_links": { + "type": "string", + "required": false, + "title": "Select Links", + "description": "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]" + }, + "click_selector": { + "type": "string", + "required": false, + "title": "Click Selector", + "description": "Selector for elements to click when using the autoclick behavior. Default is 'a'" + }, + "block_rules": { + "type": "string", + "required": false, + "title": "Block Rules", + "description": "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe" + }, + "block_message": { + "type": "string", + "required": false, + "title": "Block Message", + "description": "If specified, when a URL is blocked, a record with this error message is added instead" + }, + "block_ads": { + "type": "boolean", + "required": false, + "title": "Block Ads", + "description": "If set, block advertisements from being loaded (based on Steven Black's blocklist). Note that some bad domains are also blocked by zimit configuration even if this option is not set." + }, + "ad_block_message": { + "type": "string", + "required": false, + "title": "Ads Block Message", + "description": "If specified, when an ad is blocked, a record with this error message is added instead" + }, + "user_agent": { + "type": "string", + "required": false, + "title": "User Agent", + "description": "Override user-agent with specified" + }, + "user_agent_suffix": { + "type": "string", + "required": false, + "title": "User Agent Suffix", + "description": "Append suffix to existing browser user-agent. 
Defaults to +Zimit" + }, + "use_sitemap": { + "type": "string", + "required": false, + "title": "Sitemap URL", + "description": "Use as sitemap to get additional URLs for the crawl (usually at /sitemap.xml)" + }, + "sitemap_from_date": { + "type": "string", + "required": false, + "title": "Sitemap From Date", + "description": "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)" + }, + "sitemap_to_date": { + "type": "string", + "required": false, + "title": "Sitemap To Date", + "description": "If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)" + }, + "behavior_timeout": { + "type": "integer", + "required": false, + "title": "Behavior Timeout", + "description": "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish. Default is 90.", + "min": 0 + }, + "post_load_delay": { + "type": "integer", + "required": false, + "title": "Post Load Delay", + "description": "If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors. Default is 0.", + "min": 0 + }, + "page_extra_delay": { + "type": "integer", + "required": false, + "title": "Page Extra Delay", + "description": "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page. Default is 0.", + "min": 0 + }, + "dedup_policy": { + "type": "string-enum", + "required": false, + "title": "Dedup Policy", + "description": "Deduplication policy. One of skip, revisit or keep. 
Default is skip", + "choices": [ + { + "title": "Skip", + "value": "skip" + }, + { + "title": "Revisit", + "value": "revisit" + }, + { + "title": "Keep", + "value": "keep" + } + ] + }, + "screenshot": { + "type": "string", + "required": false, + "title": "Screenshot", + "description": "Screenshot options for crawler. One of view, thumbnail, fullPage, fullPageFinal or a comma-separated combination of those." + }, + "size_soft_limit": { + "type": "integer", + "required": false, + "title": "Size Soft Limit", + "description": "If set, save crawl state and stop crawl if WARC size exceeds this value. ZIM will still be created.", + "min": 0 + }, + "size_hard_limit": { + "type": "integer", + "required": false, + "title": "Size Hard Limit", + "description": "If set, exit crawler and fail the scraper immediately if WARC size exceeds this value", + "min": 0 + }, + "disk_utilization": { + "type": "integer", + "required": false, + "title": "Disk Utilization", + "description": "Save state and exit if disk utilization exceeds this percentage value. Default (if not set) is 90%. Set to 0 to disable disk utilization check.", + "min": 0 + }, + "time_soft_limit": { + "type": "integer", + "required": false, + "title": "Time Soft Limit", + "description": "If set, save crawl state and stop crawl if WARC(s) creation takes longer than this value, in seconds. ZIM will still be created.", + "min": 0 + }, + "time_hard_limit": { + "type": "integer", + "required": false, + "title": "Time Hard Limit", + "description": "If set, exit crawler and fail the scraper immediately if WARC(s) creation takes longer than this value, in seconds", + "min": 0 + }, + "net_idle_wait": { + "type": "integer", + "required": false, + "title": "Net Idle Wait", + "description": "If set, wait for network idle after page load and after behaviors are done (in seconds). If -1 (default), determine based on scope." 
+ }, + "origin_override": { + "type": "string", + "required": false, + "title": "Origin Override", + "description": "If set, will redirect requests from each origin in key to origin in the value, eg. https://host:port=http://alt-host:alt-port." + }, + "max_page_retries": { + "type": "integer", + "required": false, + "title": "Max Page Retries", + "description": "If set, number of times to retry a page that failed to load before page is considered to have failed. Default is 2.", + "min": 0 + }, + "fail_on_failed_seed": { + "type": "boolean", + "required": false, + "title": "Fail on failed seed", + "description": "If set, the crawl fails if any seed fails to load" + }, + "fail_on_invalid_status": { + "type": "boolean", + "required": false, + "title": "Fail on invalid status", + "description": "If set, will treat pages with 4xx or 5xx response as failures. When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses" + }, + "fail_on_failed_limit": { + "type": "integer", + "required": false, + "title": "Fail on failed - Limit", + "description": "If set, save state and exit if number of failed pages exceeds this value.", + "min": 0 + }, + "warcs": { + "type": "string", + "required": false, + "title": "WARC files", + "description": "Comma-separated list of WARC files to use as input." + }, + "verbose": { + "type": "boolean", + "required": false, + "title": "Verbose mode", + "description": "Whether to display additional logs" + }, + "keep": { + "type": "boolean", + "required": false, + "title": "Keep", + "description": "Should be True. Developer option: must be True if we want to keep the WARC files for artifacts archiving.", + "default": true + }, + "output": { + "type": "string", + "required": false, + "title": "Output folder", + "description": "Output folder for ZIM file(s). 
Leave it as `/output`", + "pattern": "^/output$" + }, + "admin_email": { + "type": "email", + "required": false, + "title": "Admin Email", + "description": "Admin Email for crawler: used in UserAgent so website admin can contact us", + "default": "contact+zimfarm@kiwix.org" + }, + "profile": { + "type": "string", + "required": false, + "title": "Browser profile", + "description": "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory for Browsertrix crawler." + }, + "behaviors": { + "type": "string", + "required": false, + "title": "Behaviors", + "description": "Which background behaviors to enable on each page. Defaults to autoplay,autofetch,siteSpecific." + }, + "depth": { + "type": "integer", + "required": false, + "title": "Depth", + "description": "The depth of the crawl for all seeds. Default is -1 (infinite).", + "min": -1 + }, + "zim_lang": { + "type": "string", + "required": false, + "title": "ZIM Language", + "description": "Language metadata of ZIM (warc2zim --lang param). ISO-639-3 code. 
Retrieved from homepage if found, fallback to `eng`", + "alias": "zim-lang", + "customValidator": "language_code" + }, + "long_description": { + "type": "string", + "required": false, + "title": "Long description", + "description": "Optional long description for your ZIM", + "minLength": 1, + "maxLength": 4000, + "alias": "long-description" + }, + "custom_css": { + "type": "blob", + "kind": "css", + "required": false, + "title": "Custom CSS", + "description": "URL to a CSS file to inject into pages", + "alias": "custom-css" + }, + "charsets_to_try": { + "type": "string", + "required": false, + "title": "Charsets to try", + "description": "List of charsets to try decode content when charset is not found", + "alias": "charsets-to-try" + }, + "ignore_content_header_charsets": { + "type": "boolean", + "required": false, + "title": "Ignore Content Header Charsets", + "description": "Ignore the charsets specified in content headers - first bytes - typically because they are wrong.", + "alias": "ignore-content-header-charsets" + }, + "content_header_bytes_length": { + "type": "integer", + "required": false, + "title": "Content Header Bytes Length", + "description": "How many bytes to consider when searching for content charsets in header (default is 1024).", + "alias": "content-header-bytes-length", + "min": 0 + }, + "ignore_http_header_charsets": { + "type": "boolean", + "required": false, + "title": "Ignore HTTP Header Charsets", + "description": "Ignore the charsets specified in HTTP `Content-Type` headers, typically because they are wrong.", + "alias": "ignore-http-header-charsets" + }, + "encoding_aliases": { + "type": "string", + "required": false, + "title": "Encoding Aliases", + "description": "List of encoding/charset aliases to decode WARC content. Aliases are used when the encoding specified in upstream server exists in Python under a different name. 
This parameter is a single string; multiple values are separated by a comma, like in alias1=encoding1,alias2=encoding2.", + "alias": "encoding-aliases" + }, + "custom_behaviors": { + "type": "string", + "required": false, + "title": "Custom Behaviors", + "description": "JS code for custom behaviors to customize crawler. Single string with individual JS files URL/path separated by a comma.", + "alias": "custom-behaviours" + }, + "zimit_progress_file": { + "type": "string", + "required": false, + "title": "Zimit Progress File", + "description": "Scraping progress file. Leave it as `/output/task_progress.json`", + "alias": "zimit-progress-file", + "pattern": "^/output/task_progress\\.json$" + }, + "replay_viewer_source": { + "type": "url", + "required": false, + "title": "Replay Viewer Source", + "description": "URL from which to load the ReplayWeb.page replay viewer", + "alias": "replay-viewer-source" + }, + "zim_file": { + "type": "string", + "required": false, + "title": "ZIM filename", + "description": "ZIM file name (based on --name if not provided). 
Include {period} to insert date period dynamically", + "alias": "zim-file", + "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+_)([a-z0-9\\-\\.]+_|)([\\d]{4}-[\\d]{2}|\\{period\\}).zim$", + "relaxedPattern": "^[A-Za-z0-9._-]+$" + }, + "name": { + "type": "string", + "required": true, + "title": "ZIM name", + "description": "Name of the ZIM.", + "alias": "name", + "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$", + "relaxedPattern": "^[A-Za-z0-9._-]+$" + }, + "overwrite": { + "type": "boolean", + "required": false, + "title": "Overwrite", + "description": "Whether to overwrite existing ZIM file if it exists" + } + } +} diff --git a/pyproject.toml b/pyproject.toml index 9aa830a..e4e7696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim==2.2.2", + "warc2zim @ git+https://github.com/openzim/warc2zim@main", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index e94f36f..281b1bb 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "3.0.5" +__version__ = "3.0.6-dev0" diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 02b167d..b205007 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -796,11 +796,14 @@ def run(raw_args): if known_args.adminEmail: user_agent_suffix += f" {known_args.adminEmail}" - # make temp dir for this crawl + # set temp dir to use for this crawl global temp_root_dir # noqa: PLW0603 if known_args.build: - temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.build, prefix=".tmp")) + # use build dir argument if passed + temp_root_dir = Path(known_args.build) + temp_root_dir.mkdir(parents=True, exist_ok=True) else: + # make new randomized temp dir temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp")) seeds = [] @@ -846,6 +849,9 @@ def run(raw_args): 
warc2zim_args.append("--lang") warc2zim_args.append(known_args.zim_lang) + if known_args.overwrite: + warc2zim_args.append("--overwrite") + logger.info("----------") logger.info("Testing warc2zim args") logger.info("Running: warc2zim " + " ".join(warc2zim_args)) @@ -854,7 +860,8 @@ def run(raw_args): logger.info("Exiting, invalid warc2zim params") return EXIT_CODE_WARC2ZIM_CHECK_FAILED - if not known_args.keep: + # only trigger cleanup when neither keep nor a custom build dir is passed. + if not known_args.build and not known_args.keep: atexit.register(cleanup) # copy / download custom behaviors to one single folder and configure crawler @@ -1032,7 +1039,6 @@ def run(raw_args): warc_files.append(Path(extract_path)) else: - logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") crawl = subprocess.run(crawler_args, check=False) if ( @@ -1076,18 +1082,18 @@ def run(raw_args): ) elif len(warc_dirs) > 1: logger.info( - "Found many WARC files directories, only most recently modified one" - " will be used" + "Found many WARC files directories, combining pages from all " + "of them" ) for directory in warc_dirs: logger.info(f"- {directory}") - warc_files = [warc_dirs[-1]] + warc_files = warc_dirs logger.info("") logger.info("----------") logger.info( f"Processing WARC files in/at " - f'{" ".join(str(warc_file) for warc_file in warc_files)}' + f"{' '.join(str(warc_file) for warc_file in warc_files)}" ) warc2zim_args.extend(str(warc_file) for warc_file in warc_files) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..d51650d --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,14 @@ +import pytest + +from zimit import zimit as app + +""" + cleanup disabled because atexit hooks run at the very end of the Python process + shutdown. By the time cleanup() is called, the logging module has already closed its + file streams. 
+""" + + +@pytest.fixture(autouse=True) +def disable_zimit_cleanup(monkeypatch): + monkeypatch.setattr(app, "cleanup", lambda: None) diff --git a/tests/data/example-response.warc b/tests/data/example-response.warc new file mode 100644 index 0000000..143b947 Binary files /dev/null and b/tests/data/example-response.warc differ diff --git a/tests/test_overwrite.py b/tests/test_overwrite.py new file mode 100644 index 0000000..e41baca --- /dev/null +++ b/tests/test_overwrite.py @@ -0,0 +1,83 @@ +import pathlib + +import pytest + +from zimit.zimit import run + +TEST_DATA_DIR = pathlib.Path(__file__).parent / "data" + + +def test_overwrite_flag_behaviour(tmp_path): + zim_output = "overwrite-test.zim" + output_path = tmp_path / zim_output + + # 1st run → creates file + result = run( + [ + "--seeds", + "https://example.com", + "--warcs", + str(TEST_DATA_DIR / "example-response.warc"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + ] + ) + assert result in (None, 100) + assert output_path.exists() + + # 2nd run, no overwrite → should fail + with pytest.raises(SystemExit) as exc: + run( + [ + "--seeds", + "https://example.com", + "--warcs", + str(TEST_DATA_DIR / "example-response.warc"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + ] + ) + assert exc.value.code == 2 + + # same without --warcs, no overwrite → should fail + with pytest.raises(SystemExit) as exc: + run( + [ + "--seeds", + "https://example.com", + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + ] + ) + assert exc.value.code == 2 + + # 3rd run, with overwrite → should succeed + result = run( + [ + "--seeds", + "https://example.com", + "--warcs", + str(TEST_DATA_DIR / "example-response.warc"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + "--overwrite", + ] + ) + assert result in (None, 100) + assert output_path.exists()