diff --git a/.github/workflows/DailyTests.yaml b/.github/workflows/DailyTests.yaml index 0585721..2bc9bc5 100644 --- a/.github/workflows/DailyTests.yaml +++ b/.github/workflows/DailyTests.yaml @@ -18,7 +18,7 @@ jobs: run: docker build -t local-zimit . - name: run crawl of test website - run: docker run -v $PWD/output:/output local-zimit zimit --url https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim + run: docker run -v $PWD/output:/output local-zimit zimit --seeds https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim - name: archive ZIM uses: actions/upload-artifact@v4 diff --git a/.github/workflows/Publish.yml b/.github/workflows/Publish.yml index b6660d0..1ddb343 100644 --- a/.github/workflows/Publish.yml +++ b/.github/workflows/Publish.yml @@ -5,8 +5,9 @@ on: types: [published] jobs: - publish: - runs-on: ubuntu-22.04 + publish-amd64: + runs-on: ubuntu-24.04 + name: "Publish for AMD64" steps: - uses: actions/checkout@v4 @@ -19,11 +20,34 @@ jobs: latest-on-tag: true restrict-to: openzim/zimit registries: ghcr.io - credentials: + credentials: | GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} repo_description: auto repo_overview: auto platforms: | linux/amd64 - linux/arm64 + + # Disabled for now, see https://github.com/openzim/zimit/issues/463 + # publish-arm64: + # runs-on: ubuntu-24.04 + # name: "Publish for ARM64" + # + # steps: + # - uses: actions/checkout@v4 + # + # - name: Build and push Docker image + # uses: openzim/docker-publish-action@v10 + # with: + # image-name: openzim/zimit + # tag-pattern: /^v([0-9.]+)$/ + # latest-on-tag: true + # restrict-to: openzim/zimit + # registries: ghcr.io + # credentials: | + # GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + # GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + # repo_description: auto + # repo_overview: auto + # platforms: | + # linux/arm64 diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml index 5e2431e..1cbecea 100644 --- a/.github/workflows/PublishDockerDevImage.yaml +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -7,8 +7,9 @@ on: workflow_dispatch: jobs: - publish: - runs-on: ubuntu-22.04 + publish-amd64: + runs-on: ubuntu-24.04 + name: "Publish for AMD64" steps: - uses: actions/checkout@v4 @@ -21,11 +22,34 @@ jobs: latest-on-tag: false restrict-to: openzim/zimit registries: ghcr.io - credentials: + credentials: | GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} repo_description: auto repo_overview: auto platforms: | linux/amd64 - linux/arm64 + + # Disabled for now, see https://github.com/openzim/zimit/issues/463 + # publish-arm64: + # runs-on: ubuntu-24.04-arm + # name: "Publish for ARM64" + # + # steps: + # - uses: actions/checkout@v4 + # + # - name: Build and push Docker image + # uses: openzim/docker-publish-action@v10 + # with: + # image-name: openzim/zimit + # manual-tag: dev + # latest-on-tag: false + # restrict-to: openzim/zimit + # registries: ghcr.io + # credentials: | + # GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + # GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + # repo_description: auto + # repo_overview: auto + # platforms: | + # linux/arm64 diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml index 9e21fa7..8c74b21 100644 --- a/.github/workflows/Tests.yaml +++ b/.github/workflows/Tests.yaml @@ -57,13 +57,25 @@ jobs: uses: actions/checkout@v4 - name: build image - run: docker build 
-t zimit . + run: docker build -t local-zimit . - name: ensure help display without issue - run: docker run -v $PWD/output:/output zimit zimit --help + run: docker run -v $PWD/output:/output local-zimit zimit --help - - name: run crawl - run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep + - name: run crawl with soft size limit + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizesoftlimit.json + + - name: run crawl with hard size limit + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizehardlimit.json || true + + - name: run crawl with soft time limit + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timesoftlimit.json + + - name: run crawl with hard time limit + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timehardlimit.json || true + + - name: run standard crawl + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats.json --statsFilename /output/crawl.json --warc2zim-progress-file /output/warc2zim.json --keep - name: run integration test suite - run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" + run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml new file mode 100644 index 0000000..f481354 --- /dev/null +++ b/.github/workflows/update-zim-offliner-definition.yaml @@ -0,0 +1,45 @@ +name: Update ZIMFarm Definitions + +on: + push: + branches: [main] + paths: + - "offliner-definition.json" + release: + types: [published] + + workflow_dispatch: + inputs: + version: + description: "Version to publish" + required: false + default: "dev" + +jobs: + prepare-json: + runs-on: ubuntu-24.04 + outputs: + offliner_definition_b64: ${{ steps.read-json.outputs.offliner_definition_b64 }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - id: read-json + run: | + if [ ! -f "offliner-definition.json" ]; then + echo "File not found!" 
>&2 + exit 1 + fi + json_b64=$(base64 -w0 <<< "$(jq -c . offliner-definition.json)") + echo "offliner_definition_b64=$json_b64" >> $GITHUB_OUTPUT + call-workflow: + needs: prepare-json + uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main + with: + version: ${{ github.event_name == 'release' && github.event.release.tag_name || (github.event.inputs.version || 'dev') }} + offliner: zimit + offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }} + secrets: + zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4f91d0b..b362d62 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,20 +2,20 @@ # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - repo: https://github.com/psf/black - rev: "24.10.0" + rev: "25.1.0" hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.9 + rev: v0.9.4 hooks: - id: ruff - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.383 + rev: v1.1.393 hooks: - id: pyright name: pyright (system) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2af6e6f..2a99b30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,92 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). +## [Unreleased] + +### Added +- Added `--overwrite` flag to overwrite existing ZIM file if it exists (#399) + +### Changed +- Fix issues preventing interrupted crawls from being resumed. (#499) + - Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist. + - Use all warc_dirs found instead of just the latest so interrupted crawls use all collected pages across runs when an explicit collections directory is not passed. + - Don't cleanup an explicitly passed build directory. 
+ +## [3.0.5] - 2025-04-11 + +### Changed + +- Upgrade to browsertrix crawler 1.6.0 (#493) + +## [3.0.4] - 2025-04-04 + +### Changed + +- Upgrade to browsertrix crawler 1.5.10 (#491) + +## [3.0.3] - 2025-02-28 + +### Changed + +- Upgrade to browsertrix crawler 1.5.7 (#483) + +## [3.0.2] - 2025-02-27 + +### Changed + +- Upgrade to browsertrix crawler 1.5.6 (#482) + +## [3.0.1] - 2025-02-24 + +### Changed + +- Upgrade to browsertrix crawler 1.5.4 (#476) + +## [3.0.0] - 2025-02-17 + +### Changed + +- Change solution to report partial ZIM to the Zimfarm and other clients (#304) +- Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468) +- Add many missing Browsertrix Crawler arguments; drop default overrides by zimit; drop `--noMobileDevice` setting (not needed anymore) (#433) +- Document all Browsertrix Crawler default argument values (#416) +- Use preferred Browsertrix Crawler argument names: (part of #471) + - `--seeds` instead of `--url` + - `--seedFile` instead of `--urlFile` + - `--pageLimit` instead of `--limit` + - `--pageLoadTimeout` instead of `--timeout` + - `--scopeIncludeRx` instead of `--include` + - `--scopeExcludeRx` instead of `--exclude` + - `--pageExtraDelay` instead of `--delay` +- Remove confusion between zimit, warc2zim and crawler stats filenames (part of #471) + - `--statsFilename` is now the crawler stats file (since it is the same name, just like other arguments) + - `--zimit-progress-file` is now the zimit stats location + - `--warc2zim-progress-file` is the warc2zim stats location + - all are optional values, if not set and needed temporary files are used + +### Fixed + +- Do not create the ZIM when crawl is incomplete (#444) + +## [2.1.8] - 2025-02-07 + +### Changed + +- Upgrade to browsertrix crawler 1.5.1, Python 3.13 and others (#462 + #464) + +## [2.1.7] - 2025-01-10 + +### Changed + +- Upgrade to browsertrix crawler 1.4.2 (#450) +- Upgrade to warc2zim 2.2.0 + +## [2.1.6] - 2024-11-07 + +### Changed + +- Upgrade to browsertrix crawler 1.3.5 (#426) + ## [2.1.5] - 2024-11-01 ### Changed diff --git a/Dockerfile b/Dockerfile index 5be84c4..9666c0b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,16 @@ -FROM webrecorder/browsertrix-crawler:1.3.4 -LABEL org.opencontainers.image.source https://github.com/openzim/zimit +FROM webrecorder/browsertrix-crawler:1.6.0 +LABEL org.opencontainers.image.source=https://github.com/openzim/zimit + +# add deadsnakes ppa for latest Python on Ubuntu +RUN add-apt-repository ppa:deadsnakes/ppa -y RUN apt-get update \ && apt-get install -qqy --no-install-recommends \ libmagic1 \ - python3.12-venv \ + python3.13-venv \ && rm -rf /var/lib/apt/lists/* \ # python setup (in venv not to conflict with browsertrix) - && python3.12 -m venv /app/zimit \ + && python3.13 -m venv /app/zimit \ # placeholder (default output location) && mkdir -p /output \ # disable chrome upgrade diff --git a/README.md b/README.md index e01abbc..188615f 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,15 @@ Zimit ===== -Zimit is a scraper allowing to create ZIM file from any Web site. +Zimit is a scraper allowing you to create a [ZIM file](https://en.wikipedia.org/wiki/ZIM_(file_format)) from any Web site. 
[![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) -[![Docker](https://ghcr-badge.deta.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit) +[![Docker](https://ghcr-badge.egpl.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit) Zimit adheres to openZIM's [Contribution Guidelines](https://github.com/openzim/overview/wiki/Contributing). -Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/docs/Policy.md) **v1.0.1**. +Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/blob/main/docs/Policy.md) **v1.0.1**. Capabilities and known limitations -------------------- @@ -38,24 +38,23 @@ Usage `zimit` is intended to be run in Docker. Docker image is published at https://github.com/orgs/openzim/packages/container/package/zimit. -The image accepts the following parameters, **as well as any of the [warc2zim](https://github.com/openzim/warc2zim) ones**; useful for setting metadata, for instance: +The image accepts the following parameters, **as well as any of the [Browsertrix Crawler](https://crawler.docs.browsertrix.com/user-guide/cli-options/) and [warc2zim](https://github.com/openzim/warc2zim) ones**: -- Required: `--url URL` - the url to be crawled +- Required: `--seeds URL` - the URL to start crawling from; multiple URLs can be separated by a comma (even if **usually not needed**, these are just the **seeds** of the crawl); first seed URL is used as ZIM homepage - Required: `--name` - Name of ZIM file - `--output` - output directory (defaults to `/output`) -- `--limit U` - Limit capture to at most U URLs -- `--behaviors` - Control which browsertrix behaviors are ran (defaults to `autoplay,autofetch,siteSpecific`, adding `autoscroll` to the list is possible to automatically scroll the pages and fetch resources which are lazy loaded) -- `--exclude ` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded. +- `--pageLimit U` - Limit capture to at most U URLs +- `--scopeExcludeRx ` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--scopeExcludeRx="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded. - `--workers N` - number of crawl workers to be run in parallel -- `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example). -- `--keep` - if set, keep the WARC files in a temp directory inside the output directory +- `--waitUntil` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--waitUntil domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example). 
+- `--keep` - in case of failure, WARC files and other temporary files (which are stored as a subfolder of the output directory) are always kept, otherwise they are automatically deleted. Use this flag to always keep WARC files, even in case of success. Example command: ```bash docker run ghcr.io/openzim/zimit zimit --help docker run ghcr.io/openzim/zimit warc2zim --help -docker run -v /output:/output ghcr.io/openzim/zimit zimit --url URL --name myzimfile +docker run -v /output:/output ghcr.io/openzim/zimit zimit --seeds URL --name myzimfile ``` **Note**: Image automatically filters out a large number of ads by using the 3 blocklists from [anudeepND](https://github.com/anudeepND/blacklist). If you don't want this filtering, disable the image's entrypoint in your container (`docker run --entrypoint="" ghcr.io/openzim/zimit ...`). diff --git a/offliner-definition.json b/offliner-definition.json new file mode 100644 index 0000000..4bb68b5 --- /dev/null +++ b/offliner-definition.json @@ -0,0 +1,981 @@ +{ + "offliner_id": "zimit", + "stdOutput": true, + "stdStats": "zimit-progress-file", + "flags": { + "seeds": { + "type": "string", + "required": false, + "title": "Seeds", + "description": "The seed URL(s) to start crawling from. Multiple seed URLs must be separated by a comma (usually not needed, these are just the crawl seeds). First seed URL is used as ZIM homepage" + }, + "seed_file": { + "type": "string", + "required": false, + "title": "Seed File", + "description": "If set, read a list of seed urls, one per line. HTTPS URL to an online file." + }, + "lang": { + "type": "string", + "required": false, + "title": "Browser Language", + "description": "If set, sets the language used by the browser, should be ISO 639 language[-country] code, e.g. `en` or `en-GB`" + }, + "title": { + "type": "string", + "required": false, + "title": "Title", + "description": "Custom title for your ZIM. Defaults to title of main page", + "minLength": 1, + "maxLength": 30 + }, + "description": { + "type": "string", + "required": false, + "title": "Description", + "description": "Description for ZIM", + "minLength": 1, + "maxLength": 80 + }, + "favicon": { + "type": "blob", + "kind": "image", + "required": false, + "title": "Illustration", + "description": "URL for Illustration. " + }, + "tags": { + "type": "string", + "required": false, + "title": "ZIM Tags", + "description": "Single string with individual tags separated by a semicolon." + }, + "creator": { + "type": "string", + "required": false, + "title": "Creator", + "description": "Name of content creator" + }, + "publisher": { + "type": "string", + "required": false, + "title": "Publisher", + "isPublisher": true, + "description": "Custom publisher name (ZIM metadata). openZIM otherwise" + }, + "source": { + "type": "string", + "required": false, + "title": "Source", + "description": "Source name/URL of content" + }, + "workers": { + "type": "integer", + "required": false, + "title": "Workers", + "description": "The number of workers to run in parallel. Defaults to 1", + "min": 1 + }, + "wait_until": { + "type": "string", + "required": false, + "title": "WaitUntil", + "description": "Puppeteer page.goto() condition to wait for before continuing. One of load, domcontentloaded, networkidle0 or networkidle2, or a comma-separated combination of those. Default is load,networkidle2" + }, + "extra_hops": { + "type": "integer", + "required": false, + "title": "Extra Hops", + "description": "Number of extra 'hops' to follow, beyond the current scope. 
Default is 0", + "min": 0 + }, + "page_limit": { + "type": "integer", + "required": false, + "title": "Page Limit", + "description": "Limit crawl to this number of pages. Default is 0 (no-limit).", + "min": 0 + }, + "max_page_limit": { + "type": "integer", + "required": false, + "title": "Max Page Limit", + "description": "Maximum pages to crawl, overriding pageLimit if both are set. Default is 0 (no-limit)", + "min": 0 + }, + "page_load_timeout": { + "type": "integer", + "required": false, + "title": "Page Load Timeout", + "description": "Timeout for each page to load (in seconds). Default is 90", + "min": 0 + }, + "scope_type": { + "type": "string-enum", + "required": false, + "title": "Scope Type", + "description": "A predfined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom if scopeIncludeRx is set, prefix otherwise.", + "choices": [ + { + "title": "Page", + "value": "page" + }, + { + "title": "Page SPA", + "value": "page-spa" + }, + { + "title": "Prefix", + "value": "prefix" + }, + { + "title": "Host", + "value": "host" + }, + { + "title": "Domain", + "value": "domain" + }, + { + "title": "Any", + "value": "any" + }, + { + "title": "Custom", + "value": "custom" + } + ] + }, + "scope_include_rx": { + "type": "string", + "required": false, + "title": "Scope Include Regex", + "description": "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of seed)" + }, + "scope_exclude_rx": { + "type": "string", + "required": false, + "title": "Scope Exclude Regex", + "description": "Regex of page URLs that should be excluded from the crawl" + }, + "allow_hash_urls": { + "type": "boolean", + "required": false, + "title": "Allow Hashtag URLs", + "description": "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content" + }, + "mobile_device": { + "type": "string-enum", + "required": false, + "title": "As device", + "description": "Device to crawl as. 
See Puppeteer's Device.ts for a list", + "choices": [ + { + "title": "Blackberry Playbook", + "value": "Blackberry PlayBook" + }, + { + "title": "Blackberry Playbook Landscape", + "value": "Blackberry PlayBook landscape" + }, + { + "title": "Blackberry Z30", + "value": "BlackBerry Z30" + }, + { + "title": "Blackberry Z30 Landscape", + "value": "BlackBerry Z30 landscape" + }, + { + "title": "Galaxy Note 3", + "value": "Galaxy Note 3" + }, + { + "title": "Galaxy Note 3 Landscape", + "value": "Galaxy Note 3 landscape" + }, + { + "title": "Galaxy Note II", + "value": "Galaxy Note II" + }, + { + "title": "Galaxy Note II Landscape", + "value": "Galaxy Note II landscape" + }, + { + "title": "Galaxy S III", + "value": "Galaxy S III" + }, + { + "title": "Galaxy S III Landscape", + "value": "Galaxy S III landscape" + }, + { + "title": "Galaxy S5", + "value": "Galaxy S5" + }, + { + "title": "Galaxy S5 Landscape", + "value": "Galaxy S5 landscape" + }, + { + "title": "Galaxy S8", + "value": "Galaxy S8" + }, + { + "title": "Galaxy S8 Landscape", + "value": "Galaxy S8 landscape" + }, + { + "title": "Galaxy S9 Plus", + "value": "Galaxy S9+" + }, + { + "title": "Galaxy S9 Plus Landscape", + "value": "Galaxy S9+ landscape" + }, + { + "title": "Galaxy Tab S4", + "value": "Galaxy Tab S4" + }, + { + "title": "Galaxy Tab S4 Landscape", + "value": "Galaxy Tab S4 landscape" + }, + { + "title": "iPad", + "value": "iPad" + }, + { + "title": "iPad Landscape", + "value": "iPad landscape" + }, + { + "title": "iPad Gen 6", + "value": "iPad (gen 6)" + }, + { + "title": "iPad Gen 6 Landscape", + "value": "iPad (gen 6) landscape" + }, + { + "title": "iPad Gen 7", + "value": "iPad (gen 7)" + }, + { + "title": "iPad Gen 7 Landscape", + "value": "iPad (gen 7) landscape" + }, + { + "title": "iPad Mini", + "value": "iPad Mini" + }, + { + "title": "iPad Mini Landscape", + "value": "iPad Mini landscape" + }, + { + "title": "iPad Pro", + "value": "iPad Pro" + }, + { + "title": "iPad Pro Landscape", + "value": "iPad Pro landscape" + }, + { + "title": "iPad Pro 11", + "value": "iPad Pro 11" + }, + { + "title": "iPad Pro 11 Landscape", + "value": "iPad Pro 11 landscape" + }, + { + "title": "iPhone 4", + "value": "iPhone 4" + }, + { + "title": "iPhone 4 Landscape", + "value": "iPhone 4 landscape" + }, + { + "title": "iPhone 5", + "value": "iPhone 5" + }, + { + "title": "iPhone 5 Landscape", + "value": "iPhone 5 landscape" + }, + { + "title": "iPhone 6", + "value": "iPhone 6" + }, + { + "title": "iPhone 6 Landscape", + "value": "iPhone 6 landscape" + }, + { + "title": "iPhone 6 Plus", + "value": "iPhone 6 Plus" + }, + { + "title": "iPhone 6 Plus Landscape", + "value": "iPhone 6 Plus landscape" + }, + { + "title": "iPhone 7", + "value": "iPhone 7" + }, + { + "title": "iPhone 7 Landscape", + "value": "iPhone 7 landscape" + }, + { + "title": "iPhone 7 Plus", + "value": "iPhone 7 Plus" + }, + { + "title": "iPhone 7 Plus Landscape", + "value": "iPhone 7 Plus landscape" + }, + { + "title": "iPhone 8", + "value": "iPhone 8" + }, + { + "title": "iPhone 8 Landscape", + "value": "iPhone 8 landscape" + }, + { + "title": "iPhone 8 Plus", + "value": "iPhone 8 Plus" + }, + { + "title": "iPhone 8 Plus Landscape", + "value": "iPhone 8 Plus landscape" + }, + { + "title": "iPhone SE", + "value": "iPhone SE" + }, + { + "title": "iPhone SE Landscape", + "value": "iPhone SE landscape" + }, + { + "title": "iPhone X", + "value": "iPhone X" + }, + { + "title": "iPhone X Landscape", + "value": "iPhone X landscape" + }, + { + "title": "iPhone XR", + "value": 
"iPhone XR" + }, + { + "title": "iPhone XR Landscape", + "value": "iPhone XR landscape" + }, + { + "title": "iPhone 11", + "value": "iPhone 11" + }, + { + "title": "iPhone 11 Landscape", + "value": "iPhone 11 landscape" + }, + { + "title": "iPhone 11 Pro", + "value": "iPhone 11 Pro" + }, + { + "title": "iPhone 11 Pro Landscape", + "value": "iPhone 11 Pro landscape" + }, + { + "title": "iPhone 11 Pro Max", + "value": "iPhone 11 Pro Max" + }, + { + "title": "iPhone 11 Pro Max Landscape", + "value": "iPhone 11 Pro Max landscape" + }, + { + "title": "iPhone 12", + "value": "iPhone 12" + }, + { + "title": "iPhone 12 Landscape", + "value": "iPhone 12 landscape" + }, + { + "title": "iPhone 12 Pro", + "value": "iPhone 12 Pro" + }, + { + "title": "iPhone 12 Pro Landscape", + "value": "iPhone 12 Pro landscape" + }, + { + "title": "iPhone 12 Pro Max", + "value": "iPhone 12 Pro Max" + }, + { + "title": "iPhone 12 Pro Max Landscape", + "value": "iPhone 12 Pro Max landscape" + }, + { + "title": "iPhone 12 Mini", + "value": "iPhone 12 Mini" + }, + { + "title": "iPhone 12 Mini Landscape", + "value": "iPhone 12 Mini landscape" + }, + { + "title": "iPhone 13", + "value": "iPhone 13" + }, + { + "title": "iPhone 13 Landscape", + "value": "iPhone 13 landscape" + }, + { + "title": "iPhone 13 Pro", + "value": "iPhone 13 Pro" + }, + { + "title": "iPhone 13 Pro Landscape", + "value": "iPhone 13 Pro landscape" + }, + { + "title": "iPhone 13 Pro Max", + "value": "iPhone 13 Pro Max" + }, + { + "title": "iPhone 13 Pro Max Landscape", + "value": "iPhone 13 Pro Max landscape" + }, + { + "title": "iPhone 13 Mini", + "value": "iPhone 13 Mini" + }, + { + "title": "iPhone 13 Mini Landscape", + "value": "iPhone 13 Mini landscape" + }, + { + "title": "Jio Phone 2", + "value": "JioPhone 2" + }, + { + "title": "Jio Phone 2 Landscape", + "value": "JioPhone 2 landscape" + }, + { + "title": "Kindle Fire HDX", + "value": "Kindle Fire HDX" + }, + { + "title": "Kindle Fire HDX Landscape", + "value": "Kindle Fire HDX landscape" + }, + { + "title": "LG Optimus L70", + "value": "LG Optimus L70" + }, + { + "title": "LG Optimus L70 Landscape", + "value": "LG Optimus L70 landscape" + }, + { + "title": "Microsoft Lumia 550", + "value": "Microsoft Lumia 550" + }, + { + "title": "Microsoft Lumia 950", + "value": "Microsoft Lumia 950" + }, + { + "title": "Microsoft Lumia 950 Landscape", + "value": "Microsoft Lumia 950 landscape" + }, + { + "title": "Nexus 10", + "value": "Nexus 10" + }, + { + "title": "Nexus 10 Landscape", + "value": "Nexus 10 landscape" + }, + { + "title": "Nexus 4", + "value": "Nexus 4" + }, + { + "title": "Nexus 4 Landscape", + "value": "Nexus 4 landscape" + }, + { + "title": "Nexus 5", + "value": "Nexus 5" + }, + { + "title": "Nexus 5 Landscape", + "value": "Nexus 5 landscape" + }, + { + "title": "Nexus 5X", + "value": "Nexus 5X" + }, + { + "title": "Nexus 5X Landscape", + "value": "Nexus 5X landscape" + }, + { + "title": "Nexus 6", + "value": "Nexus 6" + }, + { + "title": "Nexus 6 Landscape", + "value": "Nexus 6 landscape" + }, + { + "title": "Nexus 6P", + "value": "Nexus 6P" + }, + { + "title": "Nexus 6P Landscape", + "value": "Nexus 6P landscape" + }, + { + "title": "Nexus 7", + "value": "Nexus 7" + }, + { + "title": "Nexus 7 Landscape", + "value": "Nexus 7 landscape" + }, + { + "title": "Nokia Lumia 520", + "value": "Nokia Lumia 520" + }, + { + "title": "Nokia Lumia 520 Landscape", + "value": "Nokia Lumia 520 landscape" + }, + { + "title": "Nokia N9", + "value": "Nokia N9" + }, + { + "title": "Nokia N9 Landscape", + 
"value": "Nokia N9 landscape" + }, + { + "title": "Pixel 2", + "value": "Pixel 2" + }, + { + "title": "Pixel 2 Landscape", + "value": "Pixel 2 landscape" + }, + { + "title": "Pixel 2 XL", + "value": "Pixel 2 XL" + }, + { + "title": "Pixel 2 XL Landscape", + "value": "Pixel 2 XL landscape" + }, + { + "title": "Pixel 3", + "value": "Pixel 3" + }, + { + "title": "Pixel 3 Landscape", + "value": "Pixel 3 landscape" + }, + { + "title": "Pixel 4", + "value": "Pixel 4" + }, + { + "title": "Pixel 4 Landscape", + "value": "Pixel 4 landscape" + }, + { + "title": "Pixel 4A 5G", + "value": "Pixel 4a (5G)" + }, + { + "title": "Pixel 4A 5G Landscape", + "value": "Pixel 4a (5G) landscape" + }, + { + "title": "Pixel 5", + "value": "Pixel 5" + }, + { + "title": "Pixel 5 Landscape", + "value": "Pixel 5 landscape" + }, + { + "title": "Moto G4", + "value": "Moto G4" + }, + { + "title": "Moto G4 Landscape", + "value": "Moto G4 landscape" + } + ] + }, + "select_links": { + "type": "string", + "required": false, + "title": "Select Links", + "description": "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]" + }, + "click_selector": { + "type": "string", + "required": false, + "title": "Click Selector", + "description": "Selector for elements to click when using the autoclick behavior. Default is 'a'" + }, + "block_rules": { + "type": "string", + "required": false, + "title": "Block Rules", + "description": "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe" + }, + "block_message": { + "type": "string", + "required": false, + "title": "Block Message", + "description": "If specified, when a URL is blocked, a record with this error message is added instead" + }, + "block_ads": { + "type": "boolean", + "required": false, + "title": "Block Ads", + "description": "If set, block advertisements from being loaded (based on Stephen Black's blocklist). Note that some bad domains are also blocked by zimit configuration even if this option is not set." + }, + "ad_block_message": { + "type": "string", + "required": false, + "title": "Ads Block Message", + "description": "If specified, when an ad is blocked, a record with this error message is added instead" + }, + "user_agent": { + "type": "string", + "required": false, + "title": "User Agent", + "description": "Override user-agent with specified" + }, + "user_agent_suffix": { + "type": "string", + "required": false, + "title": "User Agent Suffix", + "description": "Append suffix to existing browser user-agent. Defaults to +Zimit" + }, + "use_sitemap": { + "type": "string", + "required": false, + "title": "Sitemap URL", + "description": "Use as sitemap to get additional URLs for the crawl (usually at /sitemap.xml)" + }, + "sitemap_from_date": { + "type": "string", + "required": false, + "title": "Sitemap From Date", + "description": "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)" + }, + "sitemap_to_date": { + "type": "string", + "required": false, + "title": "Sitemap To Date", + "description": "If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)" + }, + "behavior_timeout": { + "type": "integer", + "required": false, + "title": "Behavior Timeout", + "description": "If >0, timeout (in seconds) for in-page behavior will run on each page. 
If 0, a behavior can run until finish. Default is 90.", + "min": 0 + }, + "post_load_delay": { + "type": "integer", + "required": false, + "title": "Post Load Delay", + "description": "If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors. Default is 0.", + "min": 0 + }, + "page_extra_delay": { + "type": "integer", + "required": false, + "title": "Page Extra Delay", + "description": "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page. Default is 0.", + "min": 0 + }, + "dedup_policy": { + "type": "string-enum", + "required": false, + "title": "Dedup Policy", + "description": "Deduplication policy. One of skip, revisit or keep. Default is skip", + "choices": [ + { + "title": "Skip", + "value": "skip" + }, + { + "title": "Revisit", + "value": "revisit" + }, + { + "title": "Keep", + "value": "keep" + } + ] + }, + "screenshot": { + "type": "string", + "required": false, + "title": "Screenshot", + "description": "Screenshot options for crawler. One of view, thumbnail, fullPage, fullPageFinal or a comma-separated combination of those." + }, + "size_soft_limit": { + "type": "integer", + "required": false, + "title": "Size Soft Limit", + "description": "If set, save crawl state and stop crawl if WARC size exceeds this value. ZIM will still be created.", + "min": 0 + }, + "size_hard_limit": { + "type": "integer", + "required": false, + "title": "Size Hard Limit", + "description": "If set, exit crawler and fail the scraper immediately if WARC size exceeds this value", + "min": 0 + }, + "disk_utilization": { + "type": "integer", + "required": false, + "title": "Disk Utilization", + "description": "Save state and exit if disk utilization exceeds this percentage value. Default (if not set) is 90%. Set to 0 to disable disk utilization check.", + "min": 0 + }, + "time_soft_limit": { + "type": "integer", + "required": false, + "title": "Time Soft Limit", + "description": "If set, save crawl state and stop crawl if WARC(s) creation takes longer than this value, in seconds. ZIM will still be created.", + "min": 0 + }, + "time_hard_limit": { + "type": "integer", + "required": false, + "title": "Time Hard Limit", + "description": "If set, exit crawler and fail the scraper immediately if WARC(s) creation takes longer than this value, in seconds", + "min": 0 + }, + "net_idle_wait": { + "type": "integer", + "required": false, + "title": "Net Idle Wait", + "description": "If set, wait for network idle after page load and after behaviors are done (in seconds). If -1 (default), determine based on scope." + }, + "origin_override": { + "type": "string", + "required": false, + "title": "Origin Override", + "description": "If set, will redirect requests from each origin in key to origin in the value, eg. https://host:port=http://alt-host:alt-port." + }, + "max_page_retries": { + "type": "integer", + "required": false, + "title": "Max Page Retries", + "description": "If set, number of times to retry a page that failed to load before page is considered to have failed. Default is 2.", + "min": 0 + }, + "fail_on_failed_seed": { + "type": "boolean", + "required": false, + "title": "Fail on failed seed", + "description": "If set, crawler will fail with exit code 1 if any seed fails. When combined with Fail on invalid status, will result in crawl failing with exit code 1 if any seed has a 4xx/5xx response" + }, + "fail_on_invalid_status": { + "type": "boolean", + "required": false, + "title": "Fail on invalid status", + "description": "If set, will treat pages with 4xx or 5xx response as failures. 
When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses" + }, + "fail_on_failed_limit": { + "type": "integer", + "required": false, + "title": "Fail on failed - Limit", + "description": "If set, save state and exit if number of failed pages exceeds this value.", + "min": 0 + }, + "warcs": { + "type": "string", + "required": false, + "title": "WARC files", + "description": "Comma-separated list of WARC files to use as input." + }, + "verbose": { + "type": "boolean", + "required": false, + "title": "Verbose mode", + "description": "Whether to display additional logs" + }, + "keep": { + "type": "boolean", + "required": false, + "title": "Keep", + "description": "Should be True. Developer option: must be True if we want to keep the WARC files for artifacts archiving.", + "default": true + }, + "output": { + "type": "string", + "required": false, + "title": "Output folder", + "description": "Output folder for ZIM file(s). Leave it as `/output`", + "pattern": "^/output$" + }, + "admin_email": { + "type": "email", + "required": false, + "title": "Admin Email", + "description": "Admin Email for crawler: used in UserAgent so website admin can contact us", + "default": "contact+zimfarm@kiwix.org" + }, + "profile": { + "type": "string", + "required": false, + "title": "Browser profile", + "description": "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory for Browsertrix crawler." + }, + "behaviors": { + "type": "string", + "required": false, + "title": "Behaviors", + "description": "Which background behaviors to enable on each page. Defaults to autoplay,autofetch,siteSpecific." + }, + "depth": { + "type": "integer", + "required": false, + "title": "Depth", + "description": "The depth of the crawl for all seeds. Default is -1 (infinite).", + "min": -1 + }, + "zim_lang": { + "type": "string", + "required": false, + "title": "ZIM Language", + "description": "Language metadata of ZIM (warc2zim --lang param). ISO-639-3 code. 
Retrieved from homepage if found, fallback to `eng`", + "alias": "zim-lang", + "customValidator": "language_code" + }, + "long_description": { + "type": "string", + "required": false, + "title": "Long description", + "description": "Optional long description for your ZIM", + "minLength": 1, + "maxLength": 4000, + "alias": "long-description" + }, + "custom_css": { + "type": "blob", + "kind": "css", + "required": false, + "title": "Custom CSS", + "description": "URL to a CSS file to inject into pages", + "alias": "custom-css" + }, + "charsets_to_try": { + "type": "string", + "required": false, + "title": "Charsets to try", + "description": "List of charsets to try decode content when charset is not found", + "alias": "charsets-to-try" + }, + "ignore_content_header_charsets": { + "type": "boolean", + "required": false, + "title": "Ignore Content Header Charsets", + "description": "Ignore the charsets specified in content headers - first bytes - typically because they are wrong.", + "alias": "ignore-content-header-charsets" + }, + "content_header_bytes_length": { + "type": "integer", + "required": false, + "title": "Content Header Bytes Length", + "description": "How many bytes to consider when searching for content charsets in header (default is 1024).", + "alias": "content-header-bytes-length", + "min": 0 + }, + "ignore_http_header_charsets": { + "type": "boolean", + "required": false, + "title": "Ignore HTTP Header Charsets", + "description": "Ignore the charsets specified in HTTP `Content-Type` headers, typically because they are wrong.", + "alias": "ignore-http-header-charsets" + }, + "encoding_aliases": { + "type": "string", + "required": false, + "title": "Encoding Aliases", + "description": "List of encoding/charset aliases to decode WARC content. Aliases are used when the encoding specified in upstream server exists in Python under a different name. This parameter is single string, multiple values are separated by a comma, like in alias1=encoding1,alias2=encoding2.", + "alias": "encoding-aliases" + }, + "custom_behaviors": { + "type": "string", + "required": false, + "title": "Custom Behaviors", + "description": "JS code for custom behaviors to customize crawler. Single string with individual JS files URL/path separated by a comma.", + "alias": "custom-behaviours" + }, + "zimit_progress_file": { + "type": "string", + "required": false, + "title": "Zimit Progress File", + "description": "Scraping progress file. Leave it as `/output/task_progress.json`", + "alias": "zimit-progress-file", + "pattern": "^/output/task_progress\\.json$" + }, + "replay_viewer_source": { + "type": "url", + "required": false, + "title": "Replay Viewer Source", + "description": "URL from which to load the ReplayWeb.page replay viewer from", + "alias": "replay-viewer-source" + }, + "zim_file": { + "type": "string", + "required": false, + "title": "ZIM filename", + "description": "ZIM file name (based on --name if not provided). 
Include {period} to insert date period dynamically", + "alias": "zim-file", + "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+_)([a-z0-9\\-\\.]+_|)([\\d]{4}-[\\d]{2}|\\{period\\}).zim$", + "relaxedPattern": "^[A-Za-z0-9._-]+$" + }, + "name": { + "type": "string", + "required": true, + "title": "ZIM name", + "description": "Name of the ZIM.", + "alias": "name", + "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$", + "relaxedPattern": "^[A-Za-z0-9._-]+$" + }, + "overwrite": { + "type": "boolean", + "required": false, + "title": "Overwrite", + "description": "Whether to overwrite existing ZIM file if it exists" + } + } +} diff --git a/pyproject.toml b/pyproject.toml index b213161..e4e7696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,17 +1,17 @@ [build-system] -requires = ["hatchling", "hatch-openzim==0.2.0"] +requires = ["hatchling", "hatch-openzim"] build-backend = "hatchling.build" [project] name = "zimit" -requires-python = ">=3.12,<3.13" +requires-python = ">=3.13,<3.14" description = "Make ZIM file from any website through crawling" readme = "README.md" dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim==2.1.3", + "warc2zim @ git+https://github.com/openzim/warc2zim@main", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] @@ -26,20 +26,20 @@ scripts = [ "invoke==2.2.0", ] lint = [ - "black==24.10.0", - "ruff==0.6.9", + "black==25.1.0", + "ruff==0.9.4", ] check = [ - "pyright==1.1.383", + "pyright==1.1.393", ] test = [ - "pytest==8.3.3", - "coverage==7.6.1", + "pytest==8.3.4", + "coverage==7.6.10", ] dev = [ - "pre-commit==4.0.0", - "debugpy==1.8.6", - "selenium==4.25.0", # used in daily tests, convenient for dev purpose (autocompletion) + "pre-commit==4.1.0", + "debugpy==1.8.12", + "selenium==4.28.1", # used in daily tests, convenient for dev purpose (autocompletion) "zimit[scripts]", "zimit[lint]", "zimit[test]", @@ -95,10 +95,10 @@ all = "inv checkall --args '{args}'" [tool.black] line-length = 88 -target-version = ['py312'] +target-version = ['py313'] [tool.ruff] -target-version = "py312" +target-version = "py313" line-length = 88 src = ["src"] @@ -221,5 +221,5 @@ exclude_lines = [ include = ["src", "tests", "tasks.py"] exclude = [".env/**", ".venv/**"] extraPaths = ["src"] -pythonVersion = "3.12" +pythonVersion = "3.13" typeCheckingMode="basic" diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 0b167e6..281b1bb 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "2.1.5" +__version__ = "3.0.6-dev0" diff --git a/src/zimit/constants.py b/src/zimit/constants.py index f81905a..35baeb9 100644 --- a/src/zimit/constants.py +++ b/src/zimit/constants.py @@ -3,7 +3,8 @@ import logging from zimscraperlib.logging import getLogger EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2 -EXIT_CODE_CRAWLER_LIMIT_HIT = 11 +EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT = 14 +EXIT_CODE_CRAWLER_TIME_LIMIT_HIT = 15 NORMAL_WARC2ZIM_EXIT_CODE = 100 REQUESTS_TIMEOUT = 10 diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 44c6d4f..b205007 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -25,26 +25,28 @@ from zimscraperlib.uri import rebuild_uri from zimit.__about__ import __version__ from zimit.constants import ( - EXIT_CODE_CRAWLER_LIMIT_HIT, + EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT, + EXIT_CODE_CRAWLER_TIME_LIMIT_HIT, EXIT_CODE_WARC2ZIM_CHECK_FAILED, NORMAL_WARC2ZIM_EXIT_CODE, logger, ) from zimit.utils import download_file +temp_root_dir: Path | None = None + class 
ProgressFileWatcher: - def __init__(self, output_dir: Path, stats_path: Path): - self.crawl_path = output_dir / "crawl.json" - self.warc2zim_path = output_dir / "warc2zim.json" - self.stats_path = stats_path - - if not self.stats_path.is_absolute(): - self.stats_path = output_dir / self.stats_path + def __init__( + self, crawl_stats_path: Path, warc2zim_stats_path, zimit_stats_path: Path + ): + self.crawl_stats_path = crawl_stats_path + self.warc2zim_stats_path = warc2zim_stats_path + self.zimit_stats_path = zimit_stats_path # touch them all so inotify is not unhappy on add_watch - self.crawl_path.touch() - self.warc2zim_path.touch() + self.crawl_stats_path.touch() + self.warc2zim_stats_path.touch() self.process = None def stop(self): @@ -56,40 +58,28 @@ class ProgressFileWatcher: def watch(self): self.process = Process( target=self.inotify_watcher, - args=(str(self.crawl_path), str(self.warc2zim_path), str(self.stats_path)), + args=( + str(self.crawl_stats_path), + str(self.warc2zim_stats_path), + str(self.zimit_stats_path), + ), ) self.process.daemon = True self.process.start() - @staticmethod - def inotify_watcher(crawl_fpath: str, warc2zim_fpath: str, output_fpath: str): + def inotify_watcher(self, crawl_fpath: str, warc2zim_fpath: str, zimit_fpath: str): ino = inotify.adapters.Inotify() ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) # pyright: ignore ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) # pyright: ignore - class Limit: - def __init__(self): - self.max = self.hit = None - - @property - def as_dict(self): - return {"max": self.max, "hit": self.hit} - - # limit is only reported by crawl but needs to be reported up - limit = Limit() - - def crawl_conv(data, limit): + def crawl_conv(data): # we consider crawl to be 90% of the workload so total = craw_total * 90% - # limit = {"max": data["limit"]["max"], "hit": data["limit"]["hit"]} - limit.max = data["limit"]["max"] - limit.hit = data["limit"]["hit"] return { "done": data["crawled"], "total": int(data["total"] / 0.9), - "limit": limit.as_dict, } - def warc2zim_conv(data, limit): + def warc2zim_conv(data): # we consider warc2zim to be 10% of the workload so # warc2zim_total = 10% and total = 90 + warc2zim_total * 10% return { @@ -98,7 +88,6 @@ class ProgressFileWatcher: * (0.9 + (float(data["written"]) / data["total"]) / 10) ), "total": data["total"], - "limit": limit.as_dict, } for _, _, fpath, _ in ino.event_gen(yield_nones=False): # pyright: ignore @@ -108,128 +97,305 @@ class ProgressFileWatcher: # open input and output separatly as to not clear output on error with open(fpath) as ifh: try: - out = func(json.load(ifh), limit) + out = func(json.load(ifh)) except Exception: # nosec # noqa: S112 # simply ignore progress update should an error arise # might be malformed input for instance continue if not out: continue - with open(output_fpath, "w") as ofh: + with open(zimit_fpath, "w") as ofh: json.dump(out, ofh) +def cleanup(): + if not temp_root_dir: + logger.warning("Temporary root dir not already set, cannot clean this up") + return + logger.info("") + logger.info("----------") + logger.info(f"Cleanup, removing temp dir: {temp_root_dir}") + shutil.rmtree(temp_root_dir) + + +def cancel_cleanup(): + logger.info( + f"Temporary files have been kept in {temp_root_dir}, please clean them" + " up manually once you don't need them anymore" + ) + atexit.unregister(cleanup) + + def run(raw_args): parser = ArgumentParser( description="Run a browser-based crawl on the specified URL and convert to ZIM" ) - 
parser.add_argument("-u", "--url", help="The URL to start crawling from") - parser.add_argument("--title", help="ZIM title") - parser.add_argument("--description", help="ZIM description") + parser.add_argument( + "--seeds", + help="The seed URL(s) to start crawling from. Multile seed URL must be " + "separated by a comma (usually not needed, these are just the crawl seeds). " + "First seed URL is used as ZIM homepage", + ) + + parser.add_argument("--title", help="WARC and ZIM title") + parser.add_argument("--description", help="WARC and ZIM description") parser.add_argument("--long-description", help="ZIM long description metadata") parser.add_argument( - "--urlFile", - help="If set, read a list of seed urls, one per line, from the specified", + "--seedFile", + help="If set, read a list of seed urls, one per line. Can be a local file or " + "the HTTP(s) URL to an online file.", ) - parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers") + parser.add_argument( + "-w", "--workers", type=int, help="Number of parallel workers. Default is 1." + ) + + parser.add_argument( + "--crawlId", + help="A user provided ID for this crawl or crawl configuration (can also be " + "set via CRAWL_ID env var, defaults to machine hostname)", + ) parser.add_argument( "--waitUntil", help="Puppeteer page.goto() condition to wait for before continuing. One of " "load, domcontentloaded, networkidle0 or networkidle2, or a " - "comma-separated combination of those.", - default="load", + "comma-separated combination of those. Default is load,networkidle2", ) parser.add_argument( - "--depth", help="The depth of the crawl for all seeds", type=int, default=-1 + "--depth", + help="The depth of the crawl for all seeds. Default is -1 (infinite).", + type=int, ) parser.add_argument( "--extraHops", - help="Number of extra 'hops' to follow, beyond the current scope", + help="Number of extra 'hops' to follow, beyond the current scope. " + "Default is 0.", type=int, ) - parser.add_argument("--limit", help="Limit crawl to this number of pages", type=int) + parser.add_argument( + "--pageLimit", + help="Limit crawl to this number of pages. Default is 0 (no limit).", + type=int, + ) parser.add_argument( "--maxPageLimit", - help="Maximum pages to crawl, overriding pageLimit if both are set", + help="Maximum pages to crawl, overriding pageLimit if both are set. Default is " + "0 (no limit)", type=int, ) parser.add_argument( - "--timeout", - help="Timeout for each page to load (in seconds)", + "--pageLoadTimeout", + help="Timeout for each page to load (in seconds). Default is 90 secs.", type=int, - default=90, ) parser.add_argument( "--scopeType", help="A predfined scope of the crawl. For more customization, " - "use 'custom' and set scopeIncludeRx regexes", + "use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. 
Default is custom" + "if scopeIncludeRx is set, prefix otherwise.", choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"], ) parser.add_argument( - "--include", - help="Regex of page URLs that should be " - "included in the crawl (defaults to " + "--scopeIncludeRx", + help="Regex of page URLs that should be included in the crawl (defaults to " "the immediate directory of URL)", ) parser.add_argument( - "--exclude", + "--scopeExcludeRx", help="Regex of page URLs that should be excluded from the crawl", ) + parser.add_argument( + "--allowHashUrls", + help="Allow Hashtag URLs, useful for single-page-application crawling or " + "when different hashtags load dynamic content", + action="store_true", + ) + + parser.add_argument( + "--selectLinks", + help="One or more selectors for extracting links, in the format " + "[css selector]->[property to use],[css selector]->@[attribute to use]", + ) + + parser.add_argument( + "--clickSelector", + help="Selector for elements to click when using the autoclick behavior. Default" + " is 'a'", + ) + + parser.add_argument( + "--blockRules", + help="Additional rules for blocking certain URLs from being loaded, by URL " + "regex and optionally via text match in an iframe", + ) + + parser.add_argument( + "--blockMessage", + help="If specified, when a URL is blocked, a record with this error message is" + " added instead", + ) + + parser.add_argument( + "--blockAds", + help="If set, block advertisements from being loaded (based on Stephen Black's" + " blocklist). Note that some bad domains are also blocked by zimit" + " configuration even if this option is not set.", + ) + + parser.add_argument( + "--adBlockMessage", + help="If specified, when an ad is blocked, a record with this error message is" + " added instead", + ) + parser.add_argument( "--collection", help="Collection name to crawl to (replay will be accessible " - "under this name in pywb preview) instead of crawl-@ts", + "under this name in pywb preview). Default is crawl-@ts.", ) parser.add_argument( - "--allowHashUrls", - help="Allow Hashtag URLs, useful for " - "single-page-application crawling or " - "when different hashtags load dynamic " - "content", + "--headless", + help="Run in headless mode, otherwise start xvfb", action="store_true", ) parser.add_argument( - "--lang", - help="if set, sets the language used by the browser, should be ISO 639 " - "language[-country] code", + "--driver", + help="Custom driver for the crawler, if any", ) parser.add_argument( - "--zim-lang", - help="Language metadata of ZIM " - "(warc2zim --lang param). ISO-639-3 code. " - "Retrieved from homepage if found, fallback to `eng`", + "--generateCDX", + help="If set, generate index (CDXJ) for use with pywb after crawl is done", + action="store_true", ) + parser.add_argument( + "--combineWARC", + help="If set, combine the warcs", + action="store_true", + ) + + parser.add_argument( + "--rolloverSize", + help="If set, declare the rollover size. 
Default is 1000000000.", + type=int, + ) + + parser.add_argument( + "--generateWACZ", + help="If set, generate WACZ on disk", + action="store_true", + ) + + parser.add_argument( + "--logging", + help="Crawler logging configuration", + ) + + parser.add_argument( + "--logLevel", + help="Comma-separated list of log levels to include in logs", + ) + + parser.add_argument( + "--logContext", + help="Comma-separated list of contexts to include in logs", + choices=[ + "general", + "worker", + "recorder", + "recorderNetwork", + "writer", + "state", + "redis", + "storage", + "text", + "exclusion", + "screenshots", + "screencast", + "originOverride", + "healthcheck", + "browser", + "blocking", + "behavior", + "behaviorScript", + "jsError", + "fetch", + "pageStatus", + "memoryStatus", + "crawlStatus", + "links", + "sitemap", + "wacz", + "replay", + "proxy", + ], + ) + + parser.add_argument( + "--logExcludeContext", + help="Comma-separated list of contexts to NOT include in logs. Default is " + "recorderNetwork,jsError,screencast", + choices=[ + "general", + "worker", + "recorder", + "recorderNetwork", + "writer", + "state", + "redis", + "storage", + "text", + "exclusion", + "screenshots", + "screencast", + "originOverride", + "healthcheck", + "browser", + "blocking", + "behavior", + "behaviorScript", + "jsError", + "fetch", + "pageStatus", + "memoryStatus", + "crawlStatus", + "links", + "sitemap", + "wacz", + "replay", + "proxy", + ], + ) + + parser.add_argument( + "--text", + help="Extract initial (default) or final text to pages.jsonl or WARC resource" + " record(s)", + ) + + # cwd is manipulated directly by zimit, based on --output / --build, we do not want + # to expose this setting + parser.add_argument( "--mobileDevice", help="Emulate mobile device by name from " "https://github.com/puppeteer/puppeteer/blob/" "main/packages/puppeteer-core/src/common/Device.ts", - default="Pixel 2", - ) - - parser.add_argument( - "--noMobileDevice", - help="Do not emulate a mobile device (use at your own risk, behavior is" - "uncertain)", - action="store_true", - default=False, ) parser.add_argument( @@ -251,27 +417,71 @@ def run(raw_args): "(usually /sitemap.xml)", ) + parser.add_argument( + "--sitemapFromDate", + help="If set, filter URLs from sitemaps to those greater than or equal to (>=)" + " provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)", + ) + + parser.add_argument( + "--sitemapToDate", + help="If set, filter URLs from sitemaps to those less than or equal to (<=) " + "provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)", + ) + + parser.add_argument( + "--statsFilename", + help="If set, output crawl stats as JSON to this file. Relative filename " + "resolves to output directory, see --output.", + ) + + parser.add_argument( + "--zimit-progress-file", + help="If set, output zimit stats as JSON to this file. Forces the creation of" + "crawler and warc2zim stats as well. If --statsFilename and/or " + "--warc2zim-progress-file are not set, default temporary files will be used. " + "Relative filename resolves to output directory, see --output.", + ) + + parser.add_argument( + "--warc2zim-progress-file", + help="If set, output warc2zim stats as JSON to this file. Relative filename " + "resolves to output directory, see --output.", + ) + parser.add_argument( "--behaviors", - help="Which background behaviors to enable on each page", - default="autoplay,autofetch,siteSpecific", + help="Which background behaviors to enable on each page. 
Default is autoplay," + "autofetch,autoscroll,siteSpecific", ) parser.add_argument( "--behaviorTimeout", help="If >0, timeout (in seconds) for in-page behavior will run on each page. " - "If 0, a behavior can run until finish", + "If 0, a behavior can run until finish. Default is 90.", type=int, - default=90, ) parser.add_argument( - "--delay", - help="If >0, amount of time to sleep (in seconds) after behaviors " - "before moving on to next page", + "--postLoadDelay", + help="If >0, amount of time to sleep (in seconds) after page has loaded, before" + " taking screenshots / getting text / running behaviors. Default is 0.", type=int, ) + parser.add_argument( + "--pageExtraDelay", + help="If >0, amount of time to sleep (in seconds) after behaviors " + "before moving on to next page. Default is 0.", + type=int, + ) + + parser.add_argument( + "--dedupPolicy", + help="Deduplication policy. Default is skip", + choices=["skip", "revisit", "keep"], + ) + parser.add_argument( "--profile", help="Path or HTTP(S) URL to tar.gz file which contains the browser profile " @@ -279,8 +489,61 @@ def run(raw_args): ) parser.add_argument( - "--sizeLimit", - help="If set, save state and exit if size limit exceeds this value", + "--screenshot", + help="Screenshot options for crawler. One of view, thumbnail, fullPage, " + "fullPageFinal or a comma-separated combination of those.", + ) + + parser.add_argument( + "--screencastPort", + help="If set to a non-zero value, starts an HTTP server with screencast " + "accessible on this port.", + type=int, + ) + + parser.add_argument( + "--screencastRedis", + help="If set, will use the state store redis pubsub for screencasting", + action="store_true", + ) + + parser.add_argument( + "--warcInfo", + help="Optional fields added to the warcinfo record in combined WARCs", + ) + + parser.add_argument( + "--saveState", + help="If the crawl state should be serialized to the crawls/ directory. " + "Defaults to 'partial', only saved when crawl is interrupted", + choices=["never", "partial", "always"], + ) + + parser.add_argument( + "--saveStateInterval", + help="If save state is set to 'always', also save state during the crawl at " + "this interval (in seconds). Default to 300.", + type=int, + ) + + parser.add_argument( + "--saveStateHistory", + help="Number of save states to keep during the duration of a crawl. " + "Default to 5.", + type=int, + ) + + size_group = parser.add_mutually_exclusive_group() + size_group.add_argument( + "--sizeSoftLimit", + help="If set, save crawl state and stop crawl if WARC size exceeds this value. " + "ZIM will still be created.", + type=int, + ) + size_group.add_argument( + "--sizeHardLimit", + help="If set, exit crawler and fail the scraper immediately if WARC size " + "exceeds this value", type=int, ) @@ -292,9 +555,17 @@ def run(raw_args): default=90, ) - parser.add_argument( - "--timeLimit", - help="If set, save state and exit after time limit, in seconds", + time_group = parser.add_mutually_exclusive_group() + time_group.add_argument( + "--timeSoftLimit", + help="If set, save crawl state and stop crawl if WARC WARC(s) creation takes " + "longer than this value, in seconds. 
ZIM will still be created.", + type=int, + ) + time_group.add_argument( + "--timeHardLimit", + help="If set, exit crawler and fail the scraper immediately if WARC(s) creation" + " takes longer than this value, in seconds", type=int, ) @@ -309,16 +580,150 @@ def run(raw_args): help="overwrite current crawl data: if set, existing collection directory " "will be deleted before crawl is started", action="store_true", - default=False, + ) + + parser.add_argument( + "--waitOnDone", + help="if set, wait for interrupt signal when finished instead of exiting", + action="store_true", + ) + + parser.add_argument( + "--restartsOnError", + help="if set, assume will be restarted if interrupted, don't run post-crawl " + "processes on interrupt", + action="store_true", + ) + + parser.add_argument( + "--netIdleWait", + help="If set, wait for network idle after page load and after behaviors are " + "done (in seconds). if -1 (default), determine based on scope.", + type=int, + ) + + parser.add_argument( + "--lang", + help="if set, sets the language used by the browser, should be ISO 639 " + "language[-country] code", + ) + + parser.add_argument( + "--originOverride", + help="if set, will redirect requests from each origin in key to origin in the " + "value, eg. --originOverride https://host:port=http://alt-host:alt-port", + ) + + parser.add_argument( + "--logErrorsToRedis", + help="If set, write error messages to redis", + action="store_true", + ) + + parser.add_argument( + "--writePagesToRedis", + help="If set, write page objects to redis", + action="store_true", + ) + + parser.add_argument( + "--maxPageRetries", + help="If set, number of times to retry a page that failed to load before page" + " is considered to have failed. Default is 2.", + type=int, + ) + + parser.add_argument( + "--failOnFailedSeed", + help="If set, crawler will fail with exit code 1 if any seed fails. When " + "combined with --failOnInvalidStatus, will result in crawl failing with exit " + "code 1 if any seed has a 4xx/5xx response", + action="store_true", + ) + + parser.add_argument( + "--failOnFailedLimit", + help="If set, save state and exit if number of failed pages exceeds this value", + action="store_true", + ) + + parser.add_argument( + "--failOnInvalidStatus", + help="If set, will treat pages with 4xx or 5xx response as failures. When " + "combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl " + "failing due to non-200 responses", + action="store_true", + ) + + # customBehaviors not included because it has special handling + # debugAccessRedis not included due to custom redis engine in zimit + + parser.add_argument( + "--debugAccessBrowser", + help="if set, allow debugging browser on port 9222 via CDP", + action="store_true", + ) + + parser.add_argument( + "--warcPrefix", + help="prefix for WARC files generated, including WARCs added to WACZ", + ) + + parser.add_argument( + "--serviceWorker", + help="service worker handling: disabled, enabled or disabled-if-profile. " + "Default: disabled.", + ) + + parser.add_argument( + "--proxyServer", + help="if set, will use specified proxy server. Takes precedence over any env " + "var proxy settings", + ) + + parser.add_argument( + "--dryRun", + help="If true, no archive data is written to disk, only pages and logs (and " + "optionally saved state).", + action="store_true", + ) + + parser.add_argument( + "--qaSource", + help="Required for QA mode. 
Path to the source WACZ or multi WACZ file for QA", + ) + + parser.add_argument( + "--qaDebugImageDiff", + help="if specified, will write crawl.png, replay.png and diff.png for each " + "page where they're different", + action="store_true", + ) + + parser.add_argument( + "--sshProxyPrivateKeyFile", + help="path to SSH private key for SOCKS5 over SSH proxy connection", + ) + + parser.add_argument( + "--sshProxyKnownHostsFile", + help="path to SSH known hosts file for SOCKS5 over SSH proxy connection", ) parser.add_argument( "--keep", - help="If set, keep WARC files after crawl, don't delete", + help="In case of failure, WARC files and other temporary files (which are " + "stored as a subfolder of output directory) are always kept, otherwise " + "they are automatically deleted. Use this flag to always keep WARC files, " + "even in case of success.", action="store_true", ) - parser.add_argument("--output", help="Output directory for ZIM", default="/output") + parser.add_argument( + "--output", + help="Output directory for ZIM. Default to /output.", + default="/output", + ) parser.add_argument( "--build", @@ -332,11 +737,6 @@ def run(raw_args): help="[warc2zim] Custom CSS file URL/path to inject into all articles", ) - parser.add_argument( - "--statsFilename", - help="If set, output stats as JSON to this file", - ) - parser.add_argument( "--config", help="Path to YAML config file. If set, browsertrix-crawler will use this file" @@ -351,8 +751,10 @@ def run(raw_args): ) parser.add_argument( - "--logging", - help="Crawler logging configuration", + "--zim-lang", + help="Language metadata of ZIM " + "(warc2zim --lang param). ISO-639-3 code. " + "Retrieved from homepage if found, fallback to `eng`", ) parser.add_argument( @@ -369,7 +771,16 @@ def run(raw_args): "path/URLs separated by comma", ) - zimit_args, warc2zim_args = parser.parse_known_args(raw_args) + parser.add_argument( + "--acceptable-crawler-exit-codes", + help="Non-zero crawler exit codes to consider as acceptable to continue with " + " conversion of WARC to ZIM. Flag partialZim will be set in statsFilename (if " + " used). 
Single value with individual error codes separated by comma", + ) + + # by design, all unknown args are for warc2zim ; known one are either for crawler + # or shared + known_args, warc2zim_args = parser.parse_known_args(raw_args) # pass a scraper suffix to warc2zim so that both zimit and warc2zim versions are # associated with the ZIM ; make it a CSV for easier parsing @@ -377,39 +788,69 @@ def run(raw_args): warc2zim_args.append(f"zimit {__version__}") # pass url and output to warc2zim also - if zimit_args.output: + if known_args.output: warc2zim_args.append("--output") - warc2zim_args.append(zimit_args.output) + warc2zim_args.append(known_args.output) - url = zimit_args.url + user_agent_suffix = known_args.userAgentSuffix + if known_args.adminEmail: + user_agent_suffix += f" {known_args.adminEmail}" - user_agent_suffix = zimit_args.userAgentSuffix - if zimit_args.adminEmail: - user_agent_suffix += f" {zimit_args.adminEmail}" + # set temp dir to use for this crawl + global temp_root_dir # noqa: PLW0603 + if known_args.build: + # use build dir argument if passed + temp_root_dir = Path(known_args.build) + temp_root_dir.mkdir(parents=True, exist_ok=True) + else: + # make new randomized temp dir + temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp")) - if url: - url = get_cleaned_url(url) - warc2zim_args.append("--url") - warc2zim_args.append(url) + seeds = [] + if known_args.seeds: + seeds += [get_cleaned_url(url) for url in known_args.seeds.split(",")] + if known_args.seedFile: + if re.match(r"^https?\://", known_args.seedFile): + with tempfile.NamedTemporaryFile( + dir=temp_root_dir, + prefix="seeds_", + suffix=".txt", + delete_on_close=True, + ) as filename: + seed_file = Path(filename.name) + download_file(known_args.seedFile, seed_file) + seeds += [ + get_cleaned_url(url) for url in seed_file.read_text().splitlines() + ] + else: + seeds += [ + get_cleaned_url(url) + for url in Path(known_args.seedFile).read_text().splitlines() + ] + warc2zim_args.append("--url") + warc2zim_args.append(seeds[0]) - if zimit_args.custom_css: - warc2zim_args += ["--custom-css", zimit_args.custom_css] + if known_args.custom_css: + warc2zim_args += ["--custom-css", known_args.custom_css] - if zimit_args.title: + if known_args.title: warc2zim_args.append("--title") - warc2zim_args.append(zimit_args.title) + warc2zim_args.append(known_args.title) - if zimit_args.description: + if known_args.description: warc2zim_args.append("--description") - warc2zim_args.append(zimit_args.description) + warc2zim_args.append(known_args.description) - if zimit_args.long_description: + if known_args.long_description: warc2zim_args.append("--long-description") - warc2zim_args.append(zimit_args.long_description) + warc2zim_args.append(known_args.long_description) - if zimit_args.zim_lang: + if known_args.zim_lang: warc2zim_args.append("--lang") - warc2zim_args.append(zimit_args.zim_lang) + warc2zim_args.append(known_args.zim_lang) + + if known_args.overwrite: + warc2zim_args.append("--overwrite") logger.info("----------") logger.info("Testing warc2zim args") @@ -419,29 +860,17 @@ def run(raw_args): logger.info("Exiting, invalid warc2zim params") return EXIT_CODE_WARC2ZIM_CHECK_FAILED - # make temp dir for this crawl - if zimit_args.build: - temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp")) - else: - temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) - - if not zimit_args.keep: - - def cleanup(): - logger.info("") - logger.info("----------") - 
logger.info(f"Cleanup, removing temp dir: {temp_root_dir}") - shutil.rmtree(temp_root_dir) - + # only trigger cleanup when the keep argument is passed without a custom build dir. + if not known_args.build and not known_args.keep: atexit.register(cleanup) # copy / download custom behaviors to one single folder and configure crawler - if zimit_args.custom_behaviors: + if known_args.custom_behaviors: behaviors_dir = temp_root_dir / "custom-behaviors" behaviors_dir.mkdir() for custom_behavior in [ custom_behavior.strip() - for custom_behavior in zimit_args.custom_behaviors.split(",") + for custom_behavior in known_args.custom_behaviors.split(",") ]: behaviors_file = tempfile.NamedTemporaryFile( dir=behaviors_dir, @@ -461,55 +890,100 @@ def run(raw_args): f"to {behaviors_file.name}" ) shutil.copy(custom_behavior, behaviors_file.name) - zimit_args.customBehaviors = str(behaviors_dir) + known_args.customBehaviors = str(behaviors_dir) else: - zimit_args.customBehaviors = None + known_args.customBehaviors = None - cmd_args = get_node_cmd_line(zimit_args) - if url: - cmd_args.append("--url") - cmd_args.append(url) + crawler_args = get_crawler_cmd_line(known_args) + for seed in seeds: + crawler_args.append("--seeds") + crawler_args.append(seed) - cmd_args.append("--userAgentSuffix") - cmd_args.append(user_agent_suffix) + crawler_args.append("--userAgentSuffix") + crawler_args.append(user_agent_suffix) - if not zimit_args.noMobileDevice: - cmd_args.append("--mobileDevice") - cmd_args.append(zimit_args.mobileDevice) + crawler_args.append("--cwd") + crawler_args.append(str(temp_root_dir)) - cmd_args.append("--cwd") - cmd_args.append(str(temp_root_dir)) + output_dir = Path(known_args.output) + warc2zim_stats_file = ( + Path(known_args.warc2zim_progress_file) + if known_args.warc2zim_progress_file + else temp_root_dir / "warc2zim.json" + ) + if not warc2zim_stats_file.is_absolute(): + warc2zim_stats_file = output_dir / warc2zim_stats_file + warc2zim_stats_file.parent.mkdir(parents=True, exist_ok=True) + warc2zim_stats_file.unlink(missing_ok=True) - # setup inotify crawler progress watcher - if zimit_args.statsFilename: + crawler_stats_file = ( + Path(known_args.statsFilename) + if known_args.statsFilename + else temp_root_dir / "crawl.json" + ) + if not crawler_stats_file.is_absolute(): + crawler_stats_file = output_dir / crawler_stats_file + crawler_stats_file.parent.mkdir(parents=True, exist_ok=True) + crawler_stats_file.unlink(missing_ok=True) + + zimit_stats_file = ( + Path(known_args.zimit_progress_file) + if known_args.zimit_progress_file + else temp_root_dir / "stats.json" + ) + if not zimit_stats_file.is_absolute(): + zimit_stats_file = output_dir / zimit_stats_file + zimit_stats_file.parent.mkdir(parents=True, exist_ok=True) + zimit_stats_file.unlink(missing_ok=True) + + if known_args.zimit_progress_file: + # setup inotify crawler progress watcher watcher = ProgressFileWatcher( - Path(zimit_args.output), Path(zimit_args.statsFilename) + zimit_stats_path=zimit_stats_file, + crawl_stats_path=crawler_stats_file, + warc2zim_stats_path=warc2zim_stats_file, + ) + logger.info( + f"Writing zimit progress to {watcher.zimit_stats_path}, crawler progress to" + f" {watcher.crawl_stats_path} and warc2zim progress to " + f"{watcher.warc2zim_stats_path}" ) - logger.info(f"Writing progress to {watcher.stats_path}") # update crawler command - cmd_args.append("--statsFilename") - cmd_args.append(str(watcher.crawl_path)) + crawler_args.append("--statsFilename") + crawler_args.append(str(crawler_stats_file)) # 
update warc2zim command warc2zim_args.append("-v") warc2zim_args.append("--progress-file") - warc2zim_args.append(str(watcher.warc2zim_path)) + warc2zim_args.append(str(warc2zim_stats_file)) watcher.watch() + else: + if known_args.statsFilename: + logger.info(f"Writing crawler progress to {crawler_stats_file}") + crawler_args.append("--statsFilename") + crawler_args.append(str(crawler_stats_file)) + if known_args.warc2zim_progress_file: + logger.info(f"Writing warc2zim progress to {warc2zim_stats_file}") + warc2zim_args.append("-v") + warc2zim_args.append("--progress-file") + warc2zim_args.append(str(warc2zim_stats_file)) - cmd_line = " ".join(cmd_args) + cmd_line = " ".join(crawler_args) logger.info("") logger.info("----------") logger.info( f"Output to tempdir: {temp_root_dir} - " - f"{'will keep' if zimit_args.keep else 'will delete'}" + f"{'will keep' if known_args.keep else 'will delete'}" ) + partial_zim = False + # if warc files are passed, do not run browsertrix crawler but fetch the files if # they are provided as an HTTP URL + extract the archive if it is a tar.gz warc_files: list[Path] = [] - if zimit_args.warcs: + if known_args.warcs: for warc_location in [ - warc_location.strip() for warc_location in zimit_args.warcs.split(",") + warc_location.strip() for warc_location in known_args.warcs.split(",") ]: suffix = "".join(Path(urllib.parse.urlparse(warc_location).path).suffixes) if suffix not in {".tar", ".tar.gz", ".warc", ".warc.gz"}: @@ -565,17 +1039,36 @@ def run(raw_args): warc_files.append(Path(extract_path)) else: - logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") - crawl = subprocess.run(cmd_args, check=False) - if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT: - logger.info("crawl interupted by a limit") + crawl = subprocess.run(crawler_args, check=False) + if ( + crawl.returncode == EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT + and known_args.sizeSoftLimit + ): + logger.info( + "Crawl size soft limit hit. Continuing with warc2zim conversion." + ) + if known_args.zimit_progress_file: + partial_zim = True + elif ( + crawl.returncode == EXIT_CODE_CRAWLER_TIME_LIMIT_HIT + and known_args.timeSoftLimit + ): + logger.info( + "Crawl time soft limit hit. Continuing with warc2zim conversion." 
+ ) + if known_args.zimit_progress_file: + partial_zim = True elif crawl.returncode != 0: - raise subprocess.CalledProcessError(crawl.returncode, cmd_args) + logger.error( + f"Crawl returned an error: {crawl.returncode}, scraper exiting" + ) + cancel_cleanup() + return crawl.returncode - if zimit_args.collection: + if known_args.collection: warc_files = [ - temp_root_dir.joinpath(f"collections/{zimit_args.collection}/archive/") + temp_root_dir.joinpath(f"collections/{known_args.collection}/archive/") ] else: @@ -589,24 +1082,36 @@ def run(raw_args): ) elif len(warc_dirs) > 1: logger.info( - "Found many WARC files directories, only most recently modified one" - " will be used" + "Found many WARC files directories, combining pages from all " + "of them" ) for directory in warc_dirs: logger.info(f"- {directory}") - warc_files = [warc_dirs[-1]] + warc_files = warc_dirs logger.info("") logger.info("----------") logger.info( f"Processing WARC files in/at " - f'{" ".join(str(warc_file) for warc_file in warc_files)}' + f"{' '.join(str(warc_file) for warc_file in warc_files)}" ) warc2zim_args.extend(str(warc_file) for warc_file in warc_files) logger.info(f"Calling warc2zim with these args: {warc2zim_args}") - return warc2zim(warc2zim_args) + warc2zim_exit_code = warc2zim(warc2zim_args) + + if known_args.zimit_progress_file: + stats_content = json.loads(zimit_stats_file.read_bytes()) + stats_content["partialZim"] = partial_zim + zimit_stats_file.write_text(json.dumps(stats_content)) + + # also call cancel_cleanup when --keep, even if it is not supposed to be registered, + # so that we will display temporary files location just like in other situations + if warc2zim_exit_code or known_args.keep: + cancel_cleanup() + + return warc2zim_exit_code def get_cleaned_url(url: str): @@ -621,39 +1126,92 @@ def get_cleaned_url(url: str): return parsed_url.geturl() -def get_node_cmd_line(args): - node_cmd = ["crawl", "--failOnFailedSeed"] +def get_crawler_cmd_line(args): + """Build the command line for Browsertrix crawler""" + node_cmd = ["crawl"] for arg in [ - "workers", - "waitUntil", - "urlFile", "title", "description", + "workers", + "crawlId", + "waitUntil", "depth", "extraHops", - "limit", + "pageLimit", "maxPageLimit", - "timeout", + "pageLoadTimeout", "scopeType", - "include", - "exclude", + "scopeIncludeRx", + "scopeExcludeRx", "collection", "allowHashUrls", - "lang", + "selectLinks", + "clickSelector", + "blockRules", + "blockMessage", + "blockAds", + "adBlockMessage", + "collection", + "headless", + "driver", + "generateCDX", + "combineWARC", + "rolloverSize", + "generateWACZ", + "logging", + "logLevel", + "logContext", + "logExcludeContext", + "text", + "mobileDevice", "userAgent", + # userAgentSuffix (manipulated), "useSitemap", + "sitemapFromDate", + "sitemapToDate", + # statsFilename (manipulated), "behaviors", "behaviorTimeout", - "delay", + "postLoadDelay", + "pageExtraDelay", + "dedupPolicy", "profile", - "sizeLimit", + "screenshot", + "screencastPort", + "screencastRedis", + "warcInfo", + "saveState", + "saveStateInterval", + "saveStateHistory", + "sizeSoftLimit", + "sizeHardLimit", "diskUtilization", - "timeLimit", + "timeSoftLimit", + "timeHardLimit", "healthCheckPort", "overwrite", - "config", - "logging", + "waitOnDone", + "restartsOnError", + "netIdleWait", + "lang", + "originOverride", + "logErrorsToRedis", + "writePagesToRedis", + "maxPageRetries", + "failOnFailedSeed", + "failOnFailedLimit", + "failOnInvalidStatus", + "debugAccessBrowser", + "warcPrefix", + "serviceWorker", + 
"proxyServer", + "dryRun", + "qaSource", + "qaDebugImageDiff", + "sshProxyPrivateKeyFile", + "sshProxyKnownHostsFile", "customBehaviors", + "config", ]: value = getattr(args, arg) if arg == "userAgent": @@ -668,7 +1226,14 @@ def get_node_cmd_line(args): continue if value is None or (isinstance(value, bool) and value is False): continue - node_cmd.append("--" + arg) + node_cmd.append( + "--" + + ( + "sizeLimit" + if arg in ["sizeSoftLimit", "sizeHardLimit"] + else "timeLimit" if arg in ["timeSoftLimit", "timeHardLimit"] else arg + ) + ) if not isinstance(value, bool): node_cmd.append(str(value)) @@ -685,7 +1250,7 @@ def sigint_handler(*args): # noqa: ARG001 def zimit(): - run(sys.argv[1:]) + sys.exit(run(sys.argv[1:])) signal.signal(signal.SIGINT, sigint_handler) diff --git a/tests-daily/Dockerfile b/tests-daily/Dockerfile index f6118fe..22d45ef 100644 --- a/tests-daily/Dockerfile +++ b/tests-daily/Dockerfile @@ -1,5 +1,5 @@ # Let's extract kiwix-tools as usual on alpine temporary build container -FROM alpine:3.18 as kiwix-serve +FROM alpine:3.21 as kiwix-serve LABEL org.opencontainers.image.source https://github.com/openzim/kiwix-tools # TARGETPLATFORM is injected by docker build @@ -30,7 +30,7 @@ RUN set -e && \ curl -k -L $url | tar -xz -C /kiwix-serve --strip-components 1 # Build real "workload" container -FROM python:3.12-slim-bookworm +FROM python:3.13-slim-bookworm # Add kiwix-serve COPY --from=kiwix-serve /kiwix-serve /usr/local/bin @@ -70,6 +70,6 @@ RUN rm /tmp/chrome-linux64.zip /tmp/chromedriver-linux64.zip /tmp/versions.json RUN \ python -m pip install --no-cache-dir -U \ pip \ - selenium==4.23.0 \ - pytest==8.2.2 \ + selenium==4.28.1 \ + pytest==8.3.4 \ && mkdir -p /work diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 16ab337..7e79f52 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -1,30 +1,55 @@ import glob import json import os +from pathlib import Path +import pytest from warcio import ArchiveIterator from zimscraperlib.zim import Archive -def test_is_file(): +@pytest.mark.parametrize( + "filename", + [ + pytest.param("/output/tests_en_onepage.zim", id="onepage"), + pytest.param("/output/tests_en_sizesoftlimit.zim", id="sizesoftlimit"), + pytest.param("/output/tests_en_timesoftlimit.zim", id="timesoftlimit"), + ], +) +def test_zim_created(filename): """Ensure ZIM file exists""" - assert os.path.isfile("/output/isago.zim") + assert os.path.isfile(filename) + + +@pytest.mark.parametrize( + "filename", + [ + pytest.param("/output/tests_en_sizehardlimit.zim", id="sizehardlimit"), + pytest.param("/output/tests_en_timehardlimit.zim", id="timehardlimit"), + ], +) +def test_zim_not_created(filename): + """Ensure ZIM file does not exists""" + assert not os.path.exists(filename) def test_zim_main_page(): - """Main page specified, http://isago.rskg.org/, was a redirect to https + """Main page specified, http://website.test.openzim.org/http-return-codes.html, + was a redirect to https Ensure main page is the redirected page""" - main_entry = Archive("/output/isago.zim").main_entry + main_entry = Archive(Path("/output/tests_en_onepage.zim")).main_entry assert main_entry.is_redirect - assert main_entry.get_redirect_entry().path == "isago.rskg.org/" + assert ( + main_entry.get_redirect_entry().path + == "website.test.openzim.org/http-return-codes.html" + ) def test_zim_scraper(): - """Main page specified, http://isago.rskg.org/, was a redirect to https - Ensure main page is the redirected page""" + """Check 
content of scraper metadata""" - zim_fh = Archive("/output/isago.zim") + zim_fh = Archive(Path("/output/tests_en_onepage.zim")) scraper = zim_fh.get_text_metadata("Scraper") assert "zimit " in scraper assert "warc2zim " in scraper @@ -33,18 +58,28 @@ def test_zim_scraper(): def test_files_list(): """Check that expected files are present in the ZIM at proper path""" - zim_fh = Archive("/output/isago.zim") + zim_fh = Archive(Path("/output/tests_en_onepage.zim")) for expected_entry in [ "_zim_static/__wb_module_decl.js", "_zim_static/wombat.js", "_zim_static/wombatSetup.js", - "isago.rskg.org/", - "isago.rskg.org/a-propos", - "isago.rskg.org/conseils", - "isago.rskg.org/faq", - "isago.rskg.org/static/favicon256.png", - "isago.rskg.org/static/tarifs-isago.pdf", - "maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css", + "website.test.openzim.org/http-return-codes.html", + "website.test.openzim.org/200-response", + "website.test.openzim.org/201-response", + "website.test.openzim.org/202-response", + "website.test.openzim.org/301-external-redirect-ok", + "website.test.openzim.org/301-internal-redirect-ok", + "website.test.openzim.org/302-external-redirect-ok", + "website.test.openzim.org/302-internal-redirect-ok", + "website.test.openzim.org/307-external-redirect-ok", + "website.test.openzim.org/307-internal-redirect-ok", + "website.test.openzim.org/308-external-redirect-ok", + "website.test.openzim.org/308-internal-redirect-ok", + "website.test.openzim.org/http-return-codes.html", + "website.test.openzim.org/icons/favicon.ico", + "website.test.openzim.org/icons/site.webmanifest", + "website.test.openzim.org/internal_redirect_target.html", + "www.example.com/", ]: assert zim_fh.get_content(expected_entry) @@ -71,24 +106,40 @@ def test_user_agent(): assert found -def test_stats_output(): - with open("/output/crawl.json") as fh: - assert json.loads(fh.read()) == { - "crawled": 5, - "pending": 0, - "pendingPages": [], - "total": 5, - "failed": 0, - "limit": {"max": 0, "hit": False}, - } - with open("/output/warc2zim.json") as fh: - assert json.loads(fh.read()) == { - "written": 7, - "total": 7, - } - with open("/output/stats.json") as fh: - assert json.loads(fh.read()) == { - "done": 7, - "total": 7, - "limit": {"max": 0, "hit": False}, - } +def test_stats_output_standard(): + assert json.loads(Path("/output/crawl.json").read_bytes()) == { + "crawled": 17, + "pending": 0, + "pendingPages": [], + "total": 35, + "failed": 18, + "limit": {"max": 0, "hit": False}, + } + + assert json.loads(Path("/output/warc2zim.json").read_bytes()) == { + "written": 8, + "total": 8, + } + + assert json.loads(Path("/output/stats.json").read_bytes()) == { + "done": 8, + "total": 8, + "partialZim": False, + } + + +@pytest.mark.parametrize( + "filename", + [ + pytest.param("/output/stats_sizesoftlimit.json", id="sizesoftlimit"), + pytest.param("/output/stats_timesoftlimit.json", id="timesoftlimit"), + ], +) +def test_stats_output_softlimit(filename): + file = Path(filename) + assert file.exists + content = json.loads(file.read_bytes()) + assert "done" in content + assert "total" in content + assert "partialZim" in content + assert content["partialZim"] diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..d51650d --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,14 @@ +import pytest + +from zimit import zimit as app + +""" + cleanup disabled because atexit hooks run at the very end of the Python process + shutdown. 
By the time cleanup() is called, the logging module has already closed its + file streams. +""" + + +@pytest.fixture(autouse=True) +def disable_zimit_cleanup(monkeypatch): + monkeypatch.setattr(app, "cleanup", lambda: None) diff --git a/tests/data/example-response.warc b/tests/data/example-response.warc new file mode 100644 index 0000000..143b947 Binary files /dev/null and b/tests/data/example-response.warc differ diff --git a/tests/test_overwrite.py b/tests/test_overwrite.py new file mode 100644 index 0000000..e41baca --- /dev/null +++ b/tests/test_overwrite.py @@ -0,0 +1,83 @@ +import pathlib + +import pytest + +from zimit.zimit import run + +TEST_DATA_DIR = pathlib.Path(__file__).parent / "data" + + +def test_overwrite_flag_behaviour(tmp_path): + zim_output = "overwrite-test.zim" + output_path = tmp_path / zim_output + + # 1st run → creates file + result = run( + [ + "--seeds", + "https://example.com", + "--warcs", + str(TEST_DATA_DIR / "example-response.warc"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + ] + ) + assert result in (None, 100) + assert output_path.exists() + + # 2nd run, no overwrite → should fail + with pytest.raises(SystemExit) as exc: + run( + [ + "--seeds", + "https://example.com", + "--warcs", + str(TEST_DATA_DIR / "example-response.warc"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + ] + ) + assert exc.value.code == 2 + + # 2nd run, no overwrite → should fail + with pytest.raises(SystemExit) as exc: + run( + [ + "--seeds", + "https://example.com", + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + ] + ) + assert exc.value.code == 2 + + # 3rd run, with overwrite → should succeed + result = run( + [ + "--seeds", + "https://example.com", + "--warcs", + str(TEST_DATA_DIR / "example-response.warc"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + "--overwrite", + ] + ) + assert result in (None, 100) + assert output_path.exists()
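
For reference, a minimal sketch of how the renamed limit and progress options defined in this diff could be exercised through the same run() entry point that tests/test_overwrite.py uses above. The seed URL, output path, names, and limit values below are illustrative placeholders only and not part of the patch; the flag names are taken from the argparse definitions earlier in the diff.

    from zimit.zimit import run

    # Hypothetical invocation sketch: crawl a single seed, stop the crawl (but
    # still build the ZIM) once the WARC data exceeds ~1 GB or the crawl runs
    # longer than an hour, and write zimit's aggregated progress file, which is
    # resolved relative to --output per the help text above. All values are
    # placeholders chosen for illustration.
    run(
        [
            "--seeds", "https://example.com",
            "--output", "/output",
            "--name", "limits-demo",             # placeholder ZIM name
            "--zim-file", "limits-demo.zim",     # placeholder ZIM filename
            "--sizeSoftLimit", "1000000000",     # WARC size threshold, in bytes
            "--timeSoftLimit", "3600",           # crawl time threshold, in seconds
            "--zimit-progress-file", "stats.json",
        ]
    )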