diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml deleted file mode 100644 index f481354..0000000 --- a/.github/workflows/update-zim-offliner-definition.yaml +++ /dev/null @@ -1,45 +0,0 @@ -name: Update ZIMFarm Definitions - -on: - push: - branches: [main] - paths: - - "offliner-definition.json" - release: - types: [published] - - workflow_dispatch: - inputs: - version: - description: "Version to publish" - required: false - default: "dev" - -jobs: - prepare-json: - runs-on: ubuntu-24.04 - outputs: - offliner_definition_b64: ${{ steps.read-json.outputs.offliner_definition_b64 }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - id: read-json - run: | - if [ ! -f "offliner-definition.json" ]; then - echo "File not found!" >&2 - exit 1 - fi - json_b64=$(base64 -w0 <<< "$(jq -c . offliner-definition.json)") - echo "offliner_definition_b64=$json_b64" >> $GITHUB_OUTPUT - call-workflow: - needs: prepare-json - uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main - with: - version: ${{ github.event_name == 'release' && github.event.release.tag_name || (github.event.inputs.version || 'dev') }} - offliner: zimit - offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }} - secrets: - zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a99b30..bc99b8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,17 +5,6 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). -## [Unreleased] - -### Added -- Added `--overwrite` flag to overwrite existing ZIM file if it exists (#399) - -### Changed -- Fix issues preventing interrupted crawls from being resumed. (#499) - - Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist. - - Use all warc_dirs found instead of just the latest so interrupted crawls use all collected pages across runs when an explicit collections directory is not passed. - - Don't cleanup an explicitly passed build directory. - ## [3.0.5] - 2024-04-11 ### Changed diff --git a/README.md b/README.md index 188615f..894f523 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ Zimit ===== -Zimit is a scraper allowing to create [ZIM file](https://en.wikipedia.org/wiki/ZIM_(file_format)) from any Web site. +Zimit is a scraper allowing to create ZIM file from any Web site. [![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) diff --git a/offliner-definition.json b/offliner-definition.json deleted file mode 100644 index 4bb68b5..0000000 --- a/offliner-definition.json +++ /dev/null @@ -1,981 +0,0 @@ -{ - "offliner_id": "zimit", - "stdOutput": true, - "stdStats": "zimit-progress-file", - "flags": { - "seeds": { - "type": "string", - "required": false, - "title": "Seeds", - "description": "The seed URL(s) to start crawling from. Multile seed URL must be separated by a comma (usually not needed, these are just the crawl seeds). First seed URL is used as ZIM homepage" - }, - "seed_file": { - "type": "string", - "required": false, - "title": "Seed File", - "description": "If set, read a list of seed urls, one per line. HTTPS URL to an online file." - }, - "lang": { - "type": "string", - "required": false, - "title": "Browser Language", - "description": "If set, sets the language used by the browser, should be ISO 639 language[-country] code, e.g. `en` or `en-GB`" - }, - "title": { - "type": "string", - "required": false, - "title": "Title", - "description": "Custom title for your ZIM. Defaults to title of main page", - "minLength": 1, - "maxLength": 30 - }, - "description": { - "type": "string", - "required": false, - "title": "Description", - "description": "Description for ZIM", - "minLength": 1, - "maxLength": 80 - }, - "favicon": { - "type": "blob", - "kind": "image", - "required": false, - "title": "Illustration", - "description": "URL for Illustration. " - }, - "tags": { - "type": "string", - "required": false, - "title": "ZIM Tags", - "description": "Single string with individual tags separated by a semicolon." - }, - "creator": { - "type": "string", - "required": false, - "title": "Creator", - "description": "Name of content creator" - }, - "publisher": { - "type": "string", - "required": false, - "title": "Publisher", - "isPublisher": true, - "description": "Custom publisher name (ZIM metadata). openZIM otherwise" - }, - "source": { - "type": "string", - "required": false, - "title": "Source", - "description": "Source name/URL of content" - }, - "workers": { - "type": "integer", - "required": false, - "title": "Workers", - "description": "The number of workers to run in parallel. Defaults to 1", - "min": 1 - }, - "wait_until": { - "type": "string", - "required": false, - "title": "WaitUntil", - "description": "Puppeteer page.goto() condition to wait for before continuing. One of load, domcontentloaded, networkidle0 or networkidle2, or a comma-separated combination of those. Default is load,networkidle2" - }, - "extra_hops": { - "type": "integer", - "required": false, - "title": "Extra Hops", - "description": "Number of extra 'hops' to follow, beyond the current scope. Default is 0", - "min": 0 - }, - "page_limit": { - "type": "integer", - "required": false, - "title": "Page Limit", - "description": "Limit crawl to this number of pages. Default is 0 (no-limit).", - "min": 0 - }, - "max_page_limit": { - "type": "integer", - "required": false, - "title": "Max Page Limit", - "description": "Maximum pages to crawl, overriding pageLimit if both are set. Default is 0 (no-limit)", - "min": 0 - }, - "page_load_timeout": { - "type": "integer", - "required": false, - "title": "Page Load Timeout", - "description": "Timeout for each page to load (in seconds). Default is 90", - "min": 0 - }, - "scope_type": { - "type": "string-enum", - "required": false, - "title": "Scope Type", - "description": "A predfined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom if scopeIncludeRx is set, prefix otherwise.", - "choices": [ - { - "title": "Page", - "value": "page" - }, - { - "title": "Page SPA", - "value": "page-spa" - }, - { - "title": "Prefix", - "value": "prefix" - }, - { - "title": "Host", - "value": "host" - }, - { - "title": "Domain", - "value": "domain" - }, - { - "title": "Any", - "value": "any" - }, - { - "title": "Custom", - "value": "custom" - } - ] - }, - "scope_include_rx": { - "type": "string", - "required": false, - "title": "Scope Include Regex", - "description": "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of seed)" - }, - "scope_exclude_rx": { - "type": "string", - "required": false, - "title": "Scope Exclude Regex", - "description": "Regex of page URLs that should be excluded from the crawl" - }, - "allow_hash_urls": { - "type": "boolean", - "required": false, - "title": "Allow Hashtag URLs", - "description": "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content" - }, - "mobile_device": { - "type": "string-enum", - "required": false, - "title": "As device", - "description": "Device to crawl as. See Pupeeter's Device.ts for a list", - "choices": [ - { - "title": "Blackberry Playbook", - "value": "Blackberry PlayBook" - }, - { - "title": "Blackberry Playbook Landscape", - "value": "Blackberry PlayBook landscape" - }, - { - "title": "Blackberry Z30", - "value": "BlackBerry Z30" - }, - { - "title": "Blackberry Z30 Landscape", - "value": "BlackBerry Z30 landscape" - }, - { - "title": "Galaxy Note 3", - "value": "Galaxy Note 3" - }, - { - "title": "Galaxy Note 3 Landscape", - "value": "Galaxy Note 3 landscape" - }, - { - "title": "Galaxy Note II", - "value": "Galaxy Note II" - }, - { - "title": "Galaxy Note II Landscape", - "value": "Galaxy Note II landscape" - }, - { - "title": "Galaxy S III", - "value": "Galaxy S III" - }, - { - "title": "Galaxy S III Landscape", - "value": "Galaxy S III landscape" - }, - { - "title": "Galaxy S5", - "value": "Galaxy S5" - }, - { - "title": "Galaxy S5 Landscape", - "value": "Galaxy S5 landscape" - }, - { - "title": "Galaxy S8", - "value": "Galaxy S8" - }, - { - "title": "Galaxy S8 Landscape", - "value": "Galaxy S8 landscape" - }, - { - "title": "Galaxy S9 Plus", - "value": "Galaxy S9+" - }, - { - "title": "Galaxy S9 Plus Landscape", - "value": "Galaxy S9+ landscape" - }, - { - "title": "Galaxy Tab S4", - "value": "Galaxy Tab S4" - }, - { - "title": "Galaxy Tab S4 Landscape", - "value": "Galaxy Tab S4 landscape" - }, - { - "title": "iPad", - "value": "iPad" - }, - { - "title": "iPad Landscape", - "value": "iPad landscape" - }, - { - "title": "iPad Gen 6", - "value": "iPad (gen 6)" - }, - { - "title": "iPad Gen 6 Landscape", - "value": "iPad (gen 6) landscape" - }, - { - "title": "iPad Gen 7", - "value": "iPad (gen 7)" - }, - { - "title": "iPad Gen 7 Landscape", - "value": "iPad (gen 7) landscape" - }, - { - "title": "iPad Mini", - "value": "iPad Mini" - }, - { - "title": "iPad Mini Landscape", - "value": "iPad Mini landscape" - }, - { - "title": "iPad Pro", - "value": "iPad Pro" - }, - { - "title": "iPad Pro Landscape", - "value": "iPad Pro landscape" - }, - { - "title": "iPad Pro 11", - "value": "iPad Pro 11" - }, - { - "title": "iPad Pro 11 Landscape", - "value": "iPad Pro 11 landscape" - }, - { - "title": "iPhone 4", - "value": "iPhone 4" - }, - { - "title": "iPhone 4 Landscape", - "value": "iPhone 4 landscape" - }, - { - "title": "iPhone 5", - "value": "iPhone 5" - }, - { - "title": "iPhone 5 Landscape", - "value": "iPhone 5 landscape" - }, - { - "title": "iPhone 6", - "value": "iPhone 6" - }, - { - "title": "iPhone 6 Landscape", - "value": "iPhone 6 landscape" - }, - { - "title": "iPhone 6 Plus", - "value": "iPhone 6 Plus" - }, - { - "title": "iPhone 6 Plus Landscape", - "value": "iPhone 6 Plus landscape" - }, - { - "title": "iPhone 7", - "value": "iPhone 7" - }, - { - "title": "iPhone 7 Landscape", - "value": "iPhone 7 landscape" - }, - { - "title": "iPhone 7 Plus", - "value": "iPhone 7 Plus" - }, - { - "title": "iPhone 7 Plus Landscape", - "value": "iPhone 7 Plus landscape" - }, - { - "title": "iPhone 8", - "value": "iPhone 8" - }, - { - "title": "iPhone 8 Landscape", - "value": "iPhone 8 landscape" - }, - { - "title": "iPhone 8 Plus", - "value": "iPhone 8 Plus" - }, - { - "title": "iPhone 8 Plus Landscape", - "value": "iPhone 8 Plus landscape" - }, - { - "title": "iPhone SE", - "value": "iPhone SE" - }, - { - "title": "iPhone SE Landscape", - "value": "iPhone SE landscape" - }, - { - "title": "iPhone X", - "value": "iPhone X" - }, - { - "title": "iPhone X Landscape", - "value": "iPhone X landscape" - }, - { - "title": "iPhone XR", - "value": "iPhone XR" - }, - { - "title": "iPhone XR Landscape", - "value": "iPhone XR landscape" - }, - { - "title": "iPhone 11", - "value": "iPhone 11" - }, - { - "title": "iPhone 11 Landscape", - "value": "iPhone 11 landscape" - }, - { - "title": "iPhone 11 Pro", - "value": "iPhone 11 Pro" - }, - { - "title": "iPhone 11 Pro Landscape", - "value": "iPhone 11 Pro landscape" - }, - { - "title": "iPhone 11 Pro Max", - "value": "iPhone 11 Pro Max" - }, - { - "title": "iPhone 11 Pro Max Landscape", - "value": "iPhone 11 Pro Max landscape" - }, - { - "title": "iPhone 12", - "value": "iPhone 12" - }, - { - "title": "iPhone 12 Landscape", - "value": "iPhone 12 landscape" - }, - { - "title": "iPhone 12 Pro", - "value": "iPhone 12 Pro" - }, - { - "title": "iPhone 12 Pro Landscape", - "value": "iPhone 12 Pro landscape" - }, - { - "title": "iPhone 12 Pro Max", - "value": "iPhone 12 Pro Max" - }, - { - "title": "iPhone 12 Pro Max Landscape", - "value": "iPhone 12 Pro Max landscape" - }, - { - "title": "iPhone 12 Mini", - "value": "iPhone 12 Mini" - }, - { - "title": "iPhone 12 Mini Landscape", - "value": "iPhone 12 Mini landscape" - }, - { - "title": "iPhone 13", - "value": "iPhone 13" - }, - { - "title": "iPhone 13 Landscape", - "value": "iPhone 13 landscape" - }, - { - "title": "iPhone 13 Pro", - "value": "iPhone 13 Pro" - }, - { - "title": "iPhone 13 Pro Landscape", - "value": "iPhone 13 Pro landscape" - }, - { - "title": "iPhone 13 Pro Max", - "value": "iPhone 13 Pro Max" - }, - { - "title": "iPhone 13 Pro Max Landscape", - "value": "iPhone 13 Pro Max landscape" - }, - { - "title": "iPhone 13 Mini", - "value": "iPhone 13 Mini" - }, - { - "title": "iPhone 13 Mini Landscape", - "value": "iPhone 13 Mini landscape" - }, - { - "title": "Jio Phone 2", - "value": "JioPhone 2" - }, - { - "title": "Jio Phone 2 Landscape", - "value": "JioPhone 2 landscape" - }, - { - "title": "Kindle Fire HDX", - "value": "Kindle Fire HDX" - }, - { - "title": "Kindle Fire HDX Landscape", - "value": "Kindle Fire HDX landscape" - }, - { - "title": "LG Optimus L70", - "value": "LG Optimus L70" - }, - { - "title": "LG Optimus L70 Landscape", - "value": "LG Optimus L70 landscape" - }, - { - "title": "Microsoft Lumia 550", - "value": "Microsoft Lumia 550" - }, - { - "title": "Microsoft Lumia 950", - "value": "Microsoft Lumia 950" - }, - { - "title": "Microsoft Lumia 950 Landscape", - "value": "Microsoft Lumia 950 landscape" - }, - { - "title": "Nexus 10", - "value": "Nexus 10" - }, - { - "title": "Nexus 10 Landscape", - "value": "Nexus 10 landscape" - }, - { - "title": "Nexus 4", - "value": "Nexus 4" - }, - { - "title": "Nexus 4 Landscape", - "value": "Nexus 4 landscape" - }, - { - "title": "Nexus 5", - "value": "Nexus 5" - }, - { - "title": "Nexus 5 Landscape", - "value": "Nexus 5 landscape" - }, - { - "title": "Nexus 5X", - "value": "Nexus 5X" - }, - { - "title": "Nexus 5X Landscape", - "value": "Nexus 5X landscape" - }, - { - "title": "Nexus 6", - "value": "Nexus 6" - }, - { - "title": "Nexus 6 Landscape", - "value": "Nexus 6 landscape" - }, - { - "title": "Nexus 6P", - "value": "Nexus 6P" - }, - { - "title": "Nexus 6P Landscape", - "value": "Nexus 6P landscape" - }, - { - "title": "Nexus 7", - "value": "Nexus 7" - }, - { - "title": "Nexus 7 Landscape", - "value": "Nexus 7 landscape" - }, - { - "title": "Nokia Lumia 520", - "value": "Nokia Lumia 520" - }, - { - "title": "Nokia Lumia 520 Landscape", - "value": "Nokia Lumia 520 landscape" - }, - { - "title": "Nokia N9", - "value": "Nokia N9" - }, - { - "title": "Nokia N9 Landscape", - "value": "Nokia N9 landscape" - }, - { - "title": "Pixel 2", - "value": "Pixel 2" - }, - { - "title": "Pixel 2 Landscape", - "value": "Pixel 2 landscape" - }, - { - "title": "Pixel 2 XL", - "value": "Pixel 2 XL" - }, - { - "title": "Pixel 2 XL Landscape", - "value": "Pixel 2 XL landscape" - }, - { - "title": "Pixel 3", - "value": "Pixel 3" - }, - { - "title": "Pixel 3 Landscape", - "value": "Pixel 3 landscape" - }, - { - "title": "Pixel 4", - "value": "Pixel 4" - }, - { - "title": "Pixel 4 Landscape", - "value": "Pixel 4 landscape" - }, - { - "title": "Pixel 4A 5G", - "value": "Pixel 4a (5G)" - }, - { - "title": "Pixel 4A 5G Landscape", - "value": "Pixel 4a (5G) landscape" - }, - { - "title": "Pixel 5", - "value": "Pixel 5" - }, - { - "title": "Pixel 5 Landscape", - "value": "Pixel 5 landscape" - }, - { - "title": "Moto G4", - "value": "Moto G4" - }, - { - "title": "Moto G4 Landscape", - "value": "Moto G4 landscape" - } - ] - }, - "select_links": { - "type": "string", - "required": false, - "title": "Select Links", - "description": "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]" - }, - "click_selector": { - "type": "string", - "required": false, - "title": "Click Selector", - "description": "Selector for elements to click when using the autoclick behavior. Default is 'a'" - }, - "block_rules": { - "type": "string", - "required": false, - "title": "Block Rules", - "description": "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe" - }, - "block_message": { - "type": "string", - "required": false, - "title": "Block Message", - "description": "If specified, when a URL is blocked, a record with this error message is added instead" - }, - "block_ads": { - "type": "boolean", - "required": false, - "title": "Block Ads", - "description": "If set, block advertisements from being loaded (based on Stephen Black's blocklist). Note that some bad domains are also blocked by zimit configuration even if this option is not set." - }, - "ad_block_message": { - "type": "string", - "required": false, - "title": "Ads Block Message", - "description": "If specified, when an ad is blocked, a record with this error message is added instead" - }, - "user_agent": { - "type": "string", - "required": false, - "title": "User Agent", - "description": "Override user-agent with specified" - }, - "user_agent_suffix": { - "type": "string", - "required": false, - "title": "User Agent Suffix", - "description": "Append suffix to existing browser user-agent. Defaults to +Zimit" - }, - "use_sitemap": { - "type": "string", - "required": false, - "title": "Sitemap URL", - "description": "Use as sitemap to get additional URLs for the crawl (usually at /sitemap.xml)" - }, - "sitemap_from_date": { - "type": "string", - "required": false, - "title": "Sitemap From Date", - "description": "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)" - }, - "sitemap_to_date": { - "type": "string", - "required": false, - "title": "Sitemap To Date", - "description": "If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)" - }, - "behavior_timeout": { - "type": "integer", - "required": false, - "title": "Behavior Timeout", - "description": "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish. Default is 90.", - "min": 0 - }, - "post_load_delay": { - "type": "integer", - "required": false, - "title": "Post Load Delay", - "description": "If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors. Default is 0.", - "min": 0 - }, - "page_extra_delay": { - "type": "integer", - "required": false, - "title": "Page Extra Delay", - "description": "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page. Default is 0.", - "min": 0 - }, - "dedup_policy": { - "type": "string-enum", - "required": false, - "title": "Dedup Policy", - "description": "Deduplication policy. One of skip, revisit or keep. Default is skip", - "choices": [ - { - "title": "Skip", - "value": "skip" - }, - { - "title": "Revisit", - "value": "revisit" - }, - { - "title": "Keep", - "value": "keep" - } - ] - }, - "screenshot": { - "type": "string", - "required": false, - "title": "Screenshot", - "description": "Screenshot options for crawler. One of view, thumbnail, fullPage, fullPageFinal or a comma-separated combination of those." - }, - "size_soft_limit": { - "type": "integer", - "required": false, - "title": "Size Soft Limit", - "description": "If set, save crawl state and stop crawl if WARC size exceeds this value. ZIM will still be created.", - "min": 0 - }, - "size_hard_limit": { - "type": "integer", - "required": false, - "title": "Size Hard Limit", - "description": "If set, exit crawler and fail the scraper immediately if WARC size exceeds this value", - "min": 0 - }, - "disk_utilization": { - "type": "integer", - "required": false, - "title": "Disk Utilization", - "description": "Save state and exit if disk utilization exceeds this percentage value. Default (if not set) is 90%. Set to 0 to disable disk utilization check.", - "min": 0 - }, - "time_soft_limit": { - "type": "integer", - "required": false, - "title": "Time Soft Limit", - "description": "If set, save crawl state and stop crawl if WARC(s) creation takes longer than this value, in seconds. ZIM will still be created.", - "min": 0 - }, - "time_hard_limit": { - "type": "integer", - "required": false, - "title": "Time Hard Limit", - "description": "If set, exit crawler and fail the scraper immediately if WARC(s) creation takes longer than this value, in seconds", - "min": 0 - }, - "net_idle_wait": { - "type": "integer", - "required": false, - "title": "Net Idle Wait", - "description": "If set, wait for network idle after page load and after behaviors are done (in seconds). If -1 (default), determine based on scope." - }, - "origin_override": { - "type": "string", - "required": false, - "title": "Origin Override", - "description": "If set, will redirect requests from each origin in key to origin in the value, eg. https://host:port=http://alt-host:alt-port." - }, - "max_page_retries": { - "type": "integer", - "required": false, - "title": "Max Page Retries", - "description": "If set, number of times to retry a page that failed to load before page is considered to have failed. Default is 2.", - "min": 0 - }, - "fail_on_failed_seed": { - "type": "boolean", - "required": false, - "title": "Fail on failed seed", - "description": "Whether to display additional logs" - }, - "fail_on_invalid_status": { - "type": "boolean", - "required": false, - "title": "Fail on invalid status", - "description": "If set, will treat pages with 4xx or 5xx response as failures. When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses" - }, - "fail_on_failed_limit": { - "type": "integer", - "required": false, - "title": "Fail on failed - Limit", - "description": "If set, save state and exit if number of failed pages exceeds this value.", - "min": 0 - }, - "warcs": { - "type": "string", - "required": false, - "title": "WARC files", - "description": "Comma-separated list of WARC files to use as input." - }, - "verbose": { - "type": "boolean", - "required": false, - "title": "Verbose mode", - "description": "Whether to display additional logs" - }, - "keep": { - "type": "boolean", - "required": false, - "title": "Keep", - "description": "Should be True. Developer option: must be True if we want to keep the WARC files for artifacts archiving.", - "default": true - }, - "output": { - "type": "string", - "required": false, - "title": "Output folder", - "description": "Output folder for ZIM file(s). Leave it as `/output`", - "pattern": "^/output$" - }, - "admin_email": { - "type": "email", - "required": false, - "title": "Admin Email", - "description": "Admin Email for crawler: used in UserAgent so website admin can contact us", - "default": "contact+zimfarm@kiwix.org" - }, - "profile": { - "type": "string", - "required": false, - "title": "Browser profile", - "description": "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory for Browsertrix crawler." - }, - "behaviors": { - "type": "string", - "required": false, - "title": "Behaviors", - "description": "Which background behaviors to enable on each page. Defaults to autoplay,autofetch,siteSpecific." - }, - "depth": { - "type": "integer", - "required": false, - "title": "Depth", - "description": "The depth of the crawl for all seeds. Default is -1 (infinite).", - "min": -1 - }, - "zim_lang": { - "type": "string", - "required": false, - "title": "ZIM Language", - "description": "Language metadata of ZIM (warc2zim --lang param). ISO-639-3 code. Retrieved from homepage if found, fallback to `eng`", - "alias": "zim-lang", - "customValidator": "language_code" - }, - "long_description": { - "type": "string", - "required": false, - "title": "Long description", - "description": "Optional long description for your ZIM", - "minLength": 1, - "maxLength": 4000, - "alias": "long-description" - }, - "custom_css": { - "type": "blob", - "kind": "css", - "required": false, - "title": "Custom CSS", - "description": "URL to a CSS file to inject into pages", - "alias": "custom-css" - }, - "charsets_to_try": { - "type": "string", - "required": false, - "title": "Charsets to try", - "description": "List of charsets to try decode content when charset is not found", - "alias": "charsets-to-try" - }, - "ignore_content_header_charsets": { - "type": "boolean", - "required": false, - "title": "Ignore Content Header Charsets", - "description": "Ignore the charsets specified in content headers - first bytes - typically because they are wrong.", - "alias": "ignore-content-header-charsets" - }, - "content_header_bytes_length": { - "type": "integer", - "required": false, - "title": "Content Header Bytes Length", - "description": "How many bytes to consider when searching for content charsets in header (default is 1024).", - "alias": "content-header-bytes-length", - "min": 0 - }, - "ignore_http_header_charsets": { - "type": "boolean", - "required": false, - "title": "Ignore HTTP Header Charsets", - "description": "Ignore the charsets specified in HTTP `Content-Type` headers, typically because they are wrong.", - "alias": "ignore-http-header-charsets" - }, - "encoding_aliases": { - "type": "string", - "required": false, - "title": "Encoding Aliases", - "description": "List of encoding/charset aliases to decode WARC content. Aliases are used when the encoding specified in upstream server exists in Python under a different name. This parameter is single string, multiple values are separated by a comma, like in alias1=encoding1,alias2=encoding2.", - "alias": "encoding-aliases" - }, - "custom_behaviors": { - "type": "string", - "required": false, - "title": "Custom Behaviors", - "description": "JS code for custom behaviors to customize crawler. Single string with individual JS files URL/path separated by a comma.", - "alias": "custom-behaviours" - }, - "zimit_progress_file": { - "type": "string", - "required": false, - "title": "Zimit Progress File", - "description": "Scraping progress file. Leave it as `/output/task_progress.json`", - "alias": "zimit-progress-file", - "pattern": "^/output/task_progress\\.json$" - }, - "replay_viewer_source": { - "type": "url", - "required": false, - "title": "Replay Viewer Source", - "description": "URL from which to load the ReplayWeb.page replay viewer from", - "alias": "replay-viewer-source" - }, - "zim_file": { - "type": "string", - "required": false, - "title": "ZIM filename", - "description": "ZIM file name (based on --name if not provided). Include {period} to insert date period dynamically", - "alias": "zim-file", - "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+_)([a-z0-9\\-\\.]+_|)([\\d]{4}-[\\d]{2}|\\{period\\}).zim$", - "relaxedPattern": "^[A-Za-z0-9._-]+$" - }, - "name": { - "type": "string", - "required": true, - "title": "ZIM name", - "description": "Name of the ZIM.", - "alias": "name", - "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$", - "relaxedPattern": "^[A-Za-z0-9._-]+$" - }, - "overwrite": { - "type": "boolean", - "required": false, - "title": "Overwrite", - "description": "Whether to overwrite existing ZIM file if it exists" - } - } -} diff --git a/pyproject.toml b/pyproject.toml index e4e7696..9aa830a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim @ git+https://github.com/openzim/warc2zim@main", + "warc2zim==2.2.2", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 281b1bb..e94f36f 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "3.0.6-dev0" +__version__ = "3.0.5" diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index b205007..02b167d 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -796,14 +796,11 @@ def run(raw_args): if known_args.adminEmail: user_agent_suffix += f" {known_args.adminEmail}" - # set temp dir to use for this crawl + # make temp dir for this crawl global temp_root_dir # noqa: PLW0603 if known_args.build: - # use build dir argument if passed - temp_root_dir = Path(known_args.build) - temp_root_dir.mkdir(parents=True, exist_ok=True) + temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.build, prefix=".tmp")) else: - # make new randomized temp dir temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp")) seeds = [] @@ -849,9 +846,6 @@ def run(raw_args): warc2zim_args.append("--lang") warc2zim_args.append(known_args.zim_lang) - if known_args.overwrite: - warc2zim_args.append("--overwrite") - logger.info("----------") logger.info("Testing warc2zim args") logger.info("Running: warc2zim " + " ".join(warc2zim_args)) @@ -860,8 +854,7 @@ def run(raw_args): logger.info("Exiting, invalid warc2zim params") return EXIT_CODE_WARC2ZIM_CHECK_FAILED - # only trigger cleanup when the keep argument is passed without a custom build dir. - if not known_args.build and not known_args.keep: + if not known_args.keep: atexit.register(cleanup) # copy / download custom behaviors to one single folder and configure crawler @@ -1039,6 +1032,7 @@ def run(raw_args): warc_files.append(Path(extract_path)) else: + logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") crawl = subprocess.run(crawler_args, check=False) if ( @@ -1082,18 +1076,18 @@ def run(raw_args): ) elif len(warc_dirs) > 1: logger.info( - "Found many WARC files directories, combining pages from all " - "of them" + "Found many WARC files directories, only most recently modified one" + " will be used" ) for directory in warc_dirs: logger.info(f"- {directory}") - warc_files = warc_dirs + warc_files = [warc_dirs[-1]] logger.info("") logger.info("----------") logger.info( f"Processing WARC files in/at " - f"{' '.join(str(warc_file) for warc_file in warc_files)}" + f'{" ".join(str(warc_file) for warc_file in warc_files)}' ) warc2zim_args.extend(str(warc_file) for warc_file in warc_files) diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index d51650d..0000000 --- a/tests/conftest.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest - -from zimit import zimit as app - -""" - cleanup disabled because atexit hooks run at the very end of the Python process - shutdown. By the time cleanup() is called, the logging module has already closed its - file streams. -""" - - -@pytest.fixture(autouse=True) -def disable_zimit_cleanup(monkeypatch): - monkeypatch.setattr(app, "cleanup", lambda: None) diff --git a/tests/data/example-response.warc b/tests/data/example-response.warc deleted file mode 100644 index 143b947..0000000 Binary files a/tests/data/example-response.warc and /dev/null differ diff --git a/tests/test_overwrite.py b/tests/test_overwrite.py deleted file mode 100644 index e41baca..0000000 --- a/tests/test_overwrite.py +++ /dev/null @@ -1,83 +0,0 @@ -import pathlib - -import pytest - -from zimit.zimit import run - -TEST_DATA_DIR = pathlib.Path(__file__).parent / "data" - - -def test_overwrite_flag_behaviour(tmp_path): - zim_output = "overwrite-test.zim" - output_path = tmp_path / zim_output - - # 1st run → creates file - result = run( - [ - "--seeds", - "https://example.com", - "--warcs", - str(TEST_DATA_DIR / "example-response.warc"), - "--output", - str(tmp_path), - "--zim-file", - zim_output, - "--name", - "overwrite-test", - ] - ) - assert result in (None, 100) - assert output_path.exists() - - # 2nd run, no overwrite → should fail - with pytest.raises(SystemExit) as exc: - run( - [ - "--seeds", - "https://example.com", - "--warcs", - str(TEST_DATA_DIR / "example-response.warc"), - "--output", - str(tmp_path), - "--zim-file", - zim_output, - "--name", - "overwrite-test", - ] - ) - assert exc.value.code == 2 - - # 2nd run, no overwrite → should fail - with pytest.raises(SystemExit) as exc: - run( - [ - "--seeds", - "https://example.com", - "--output", - str(tmp_path), - "--zim-file", - zim_output, - "--name", - "overwrite-test", - ] - ) - assert exc.value.code == 2 - - # 3rd run, with overwrite → should succeed - result = run( - [ - "--seeds", - "https://example.com", - "--warcs", - str(TEST_DATA_DIR / "example-response.warc"), - "--output", - str(tmp_path), - "--zim-file", - zim_output, - "--name", - "overwrite-test", - "--overwrite", - ] - ) - assert result in (None, 100) - assert output_path.exists()