diff --git a/.github/workflows/DailyTests.yaml b/.github/workflows/DailyTests.yaml index 2bc9bc5..0585721 100644 --- a/.github/workflows/DailyTests.yaml +++ b/.github/workflows/DailyTests.yaml @@ -18,7 +18,7 @@ jobs: run: docker build -t local-zimit . - name: run crawl of test website - run: docker run -v $PWD/output:/output local-zimit zimit --seeds https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim + run: docker run -v $PWD/output:/output local-zimit zimit --url https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim - name: archive ZIM uses: actions/upload-artifact@v4 diff --git a/.github/workflows/Publish.yml b/.github/workflows/Publish.yml index 1ddb343..b6660d0 100644 --- a/.github/workflows/Publish.yml +++ b/.github/workflows/Publish.yml @@ -5,9 +5,8 @@ on: types: [published] jobs: - publish-amd64: - runs-on: ubuntu-24.04 - name: "Publish for AMD64" + publish: + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 @@ -20,34 +19,11 @@ jobs: latest-on-tag: true restrict-to: openzim/zimit registries: ghcr.io - credentials: | + credentials: GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} repo_description: auto repo_overview: auto platforms: | linux/amd64 - - # Disabled for now, see https://github.com/openzim/zimit/issues/463 - # publish-arm64: - # runs-on: ubuntu-24.04 - # name: "Publish for ARM64" - # - # steps: - # - uses: actions/checkout@v4 - # - # - name: Build and push Docker image - # uses: openzim/docker-publish-action@v10 - # with: - # image-name: openzim/zimit - # tag-pattern: /^v([0-9.]+)$/ - # latest-on-tag: true - # restrict-to: openzim/zimit - # registries: ghcr.io - # credentials: | - # GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} - # GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} - # repo_description: auto - # repo_overview: auto - # platforms: | - # linux/arm64 + linux/arm64 diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml index 1cbecea..5e2431e 100644 --- a/.github/workflows/PublishDockerDevImage.yaml +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -7,9 +7,8 @@ on: workflow_dispatch: jobs: - publish-amd64: - runs-on: ubuntu-24.04 - name: "Publish for AMD64" + publish: + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 @@ -22,34 +21,11 @@ jobs: latest-on-tag: false restrict-to: openzim/zimit registries: ghcr.io - credentials: | + credentials: GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} repo_description: auto repo_overview: auto platforms: | linux/amd64 - - # Disabled for now, see https://github.com/openzim/zimit/issues/463 - # publish-arm64: - # runs-on: ubuntu-24.04-arm - # name: "Publish for ARM64" - # - # steps: - # - uses: actions/checkout@v4 - # - # - name: Build and push Docker image - # uses: openzim/docker-publish-action@v10 - # with: - # image-name: openzim/zimit - # manual-tag: dev - # latest-on-tag: false - # restrict-to: openzim/zimit - # registries: ghcr.io - # credentials: | - # GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} - # GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} - # repo_description: auto - # repo_overview: auto - # platforms: | - # linux/arm64 + linux/arm64 diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml index 8c74b21..9e21fa7 100644 --- a/.github/workflows/Tests.yaml +++ b/.github/workflows/Tests.yaml @@ -57,25 +57,13 @@ jobs: uses: actions/checkout@v4 - name: build image - run: docker build 
-t local-zimit . + run: docker build -t zimit . - name: ensure help display without issue - run: docker run -v $PWD/output:/output local-zimit zimit --help + run: docker run -v $PWD/output:/output zimit zimit --help - - name: run crawl with soft size limit - run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizesoftlimit.json - - - name: run crawl with hard size limit - run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizehardlimit.json || true - - - name: run crawl with soft time limit - run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timesoftlimit.json - - - name: run crawl with hard time limit - run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timehardlimit.json || true - - - name: run standard crawl - run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats.json --statsFilename /output/crawl.json --warc2zim-progress-file /output/warc2zim.json --keep + - name: run crawl + run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep - name: run integration test suite - run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" + run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml deleted file mode 100644 index f481354..0000000 --- a/.github/workflows/update-zim-offliner-definition.yaml +++ /dev/null @@ -1,45 +0,0 @@ -name: Update ZIMFarm Definitions - -on: - push: - branches: [main] - paths: - - "offliner-definition.json" - release: - types: [published] - - workflow_dispatch: - inputs: - version: - description: "Version to publish" - required: false - default: "dev" - -jobs: - prepare-json: - runs-on: ubuntu-24.04 - outputs: - offliner_definition_b64: ${{ steps.read-json.outputs.offliner_definition_b64 }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - id: read-json - run: | - if [ ! -f "offliner-definition.json" ]; then - echo "File not found!" 
>&2 - exit 1 - fi - json_b64=$(base64 -w0 <<< "$(jq -c . offliner-definition.json)") - echo "offliner_definition_b64=$json_b64" >> $GITHUB_OUTPUT - call-workflow: - needs: prepare-json - uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main - with: - version: ${{ github.event_name == 'release' && github.event.release.tag_name || (github.event.inputs.version || 'dev') }} - offliner: zimit - offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }} - secrets: - zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b362d62..4f91d0b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,20 +2,20 @@ # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v4.4.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - repo: https://github.com/psf/black - rev: "25.1.0" + rev: "24.10.0" hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.4 + rev: v0.6.9 hooks: - id: ruff - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.393 + rev: v1.1.383 hooks: - id: pyright name: pyright (system) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a99b30..6beb584 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,86 +5,6 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). -## [Unreleased] - -### Added -- Added `--overwrite` flag to overwrite existing ZIM file if it exists (#399) - -### Changed -- Fix issues preventing interrupted crawls from being resumed. (#499) - - Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist. - - Use all warc_dirs found instead of just the latest so interrupted crawls use all collected pages across runs when an explicit collections directory is not passed. - - Don't cleanup an explicitly passed build directory. 
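For context, the `update-zim-offliner-definition.yaml` workflow deleted above ships the offliner definition to the Zimfarm reusable workflow as a single base64 string. A minimal sketch of how that payload can be reproduced and inspected locally, purely illustrative since both the workflow and `offliner-definition.json` are removed by this diff; it assumes GNU coreutils `base64` and `jq`, and a checkout where the file still exists:

```bash
# Re-create the encoded output exactly as the workflow step does
# (the <<< here-string appends a trailing newline before encoding, as in the workflow)
json_b64=$(base64 -w0 <<< "$(jq -c . offliner-definition.json)")

# Decode it back to pretty-printed JSON to verify what the downstream workflow receives
echo "$json_b64" | base64 -d | jq .
```

Decoding and re-encoding this way is a quick check that the JSON is well-formed before it reaches the `call-workflow` job.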
- -## [3.0.5] - 2024-04-11 - -### Changed - -- Upgrade to browsertrix crawler 1.6.0 (#493) - -## [3.0.4] - 2024-04-04 - -### Changed - -- Upgrade to browsertrix crawler 1.5.10 (#491) - -## [3.0.3] - 2024-02-28 - -### Changed - -- Upgrade to browsertrix crawler 1.5.7 (#483) - -## [3.0.2] - 2024-02-27 - -### Changed - -- Upgrade to browsertrix crawler 1.5.6 (#482) - -## [3.0.1] - 2024-02-24 - -### Changed - -- Upgrade to browsertrix crawler 1.5.4 (#476) - -## [3.0.0] - 2024-02-17 - -### Changed - -- Change solution to report partial ZIM to the Zimfarm and other clients (#304) -- Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468) -- Add many missing Browsertrix Crawler arguments ; drop default overrides by zimit ; drop `--noMobileDevice` setting (not needed anymore) (#433) -- Document all Browsertrix Crawler default arguments values (#416) -- Use preferred Browsertrix Crawler arguments names: (part of #471) - - `--seeds` instead of `--url` - - `--seedFile` instead of `--urlFile` - - `--pageLimit` instead of `--limit` - - `--pageLoadTimeout` instead of `--timeout` - - `--scopeIncludeRx` instead of `--include` - - `--scopeExcludeRx` instead of `--exclude` - - `--pageExtraDelay` instead of `--delay` -- Remove confusion between zimit, warc2zim and crawler stats filenames (part of #471) - - `--statsFilename` is now the crawler stats file (since it is the same name, just like other arguments) - - `--zimit-progress-file` is now the zimit stats location - - `--warc2zim-progress-file` is the warc2zim stats location - - all are optional values, if not set and needed temporary files are used - -### Fixed - -- Do not create the ZIM when crawl is incomplete (#444) - -## [2.1.8] - 2024-02-07 - -### Changed - -- Upgrade to browsertrix crawler 1.5.1, Python 3.13 and others (#462 + #464) - -## [2.1.7] - 2024-01-10 - -### Changed - -- Upgrade to browsertrix crawler 1.4.2 (#450) -- Upgrade to warc2zim 2.2.0 - ## [2.1.6] - 2024-11-07 ### Changed diff --git a/Dockerfile b/Dockerfile index 9666c0b..bac1b30 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,16 +1,13 @@ -FROM webrecorder/browsertrix-crawler:1.6.0 -LABEL org.opencontainers.image.source=https://github.com/openzim/zimit - -# add deadsnakes ppa for latest Python on Ubuntu -RUN add-apt-repository ppa:deadsnakes/ppa -y +FROM webrecorder/browsertrix-crawler:1.3.5 +LABEL org.opencontainers.image.source https://github.com/openzim/zimit RUN apt-get update \ && apt-get install -qqy --no-install-recommends \ libmagic1 \ - python3.13-venv \ + python3.12-venv \ && rm -rf /var/lib/apt/lists/* \ # python setup (in venv not to conflict with browsertrix) - && python3.13 -m venv /app/zimit \ + && python3.12 -m venv /app/zimit \ # placeholder (default output location) && mkdir -p /output \ # disable chrome upgrade diff --git a/README.md b/README.md index 188615f..9bfba9b 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ Zimit ===== -Zimit is a scraper allowing to create [ZIM file](https://en.wikipedia.org/wiki/ZIM_(file_format)) from any Web site. +Zimit is a scraper allowing to create ZIM file from any Web site. 
[![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) @@ -9,7 +9,7 @@ Zimit is a scraper allowing to create [ZIM file](https://en.wikipedia.org/wiki/Z Zimit adheres to openZIM's [Contribution Guidelines](https://github.com/openzim/overview/wiki/Contributing). -Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/blob/main/docs/Policy.md) **v1.0.1**. +Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/docs/Policy.md) **v1.0.1**. Capabilities and known limitations -------------------- @@ -38,23 +38,24 @@ Usage `zimit` is intended to be run in Docker. Docker image is published at https://github.com/orgs/openzim/packages/container/package/zimit. -The image accepts the following parameters, **as well as any of the [Browsertrix crawler](https://crawler.docs.browsertrix.com/user-guide/cli-options/) and [warc2zim](https://github.com/openzim/warc2zim) ones**: +The image accepts the following parameters, **as well as any of the [warc2zim](https://github.com/openzim/warc2zim) ones**; useful for setting metadata, for instance: -- Required: `--seeds URL` - the url to start crawling from ; multiple URLs can be separated by a comma (even if **usually not needed**, these are just the **seeds** of the crawl) ; first seed URL is used as ZIM homepage +- Required: `--url URL` - the url to be crawled - Required: `--name` - Name of ZIM file - `--output` - output directory (defaults to `/output`) -- `--pageLimit U` - Limit capture to at most U URLs -- `--scopeExcludeRx ` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--scopeExcludeRx="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded. +- `--limit U` - Limit capture to at most U URLs +- `--behaviors` - Control which browsertrix behaviors are ran (defaults to `autoplay,autofetch,siteSpecific`, adding `autoscroll` to the list is possible to automatically scroll the pages and fetch resources which are lazy loaded) +- `--exclude ` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded. - `--workers N` - number of crawl workers to be run in parallel -- `--waitUntil` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--waitUntil domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example). -- `--keep` - in case of failure, WARC files and other temporary files (which are stored as a subfolder of output directory) are always kept, otherwise they are automatically deleted. Use this flag to always keep WARC files, even in case of success. +- `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). 
The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example). +- `--keep` - if set, keep the WARC files in a temp directory inside the output directory Example command: ```bash docker run ghcr.io/openzim/zimit zimit --help docker run ghcr.io/openzim/zimit warc2zim --help -docker run -v /output:/output ghcr.io/openzim/zimit zimit --seeds URL --name myzimfile +docker run -v /output:/output ghcr.io/openzim/zimit zimit --url URL --name myzimfile ``` **Note**: Image automatically filters out a large number of ads by using the 3 blocklists from [anudeepND](https://github.com/anudeepND/blacklist). If you don't want this filtering, disable the image's entrypoint in your container (`docker run --entrypoint="" ghcr.io/openzim/zimit ...`). diff --git a/offliner-definition.json b/offliner-definition.json deleted file mode 100644 index 4bb68b5..0000000 --- a/offliner-definition.json +++ /dev/null @@ -1,981 +0,0 @@ -{ - "offliner_id": "zimit", - "stdOutput": true, - "stdStats": "zimit-progress-file", - "flags": { - "seeds": { - "type": "string", - "required": false, - "title": "Seeds", - "description": "The seed URL(s) to start crawling from. Multile seed URL must be separated by a comma (usually not needed, these are just the crawl seeds). First seed URL is used as ZIM homepage" - }, - "seed_file": { - "type": "string", - "required": false, - "title": "Seed File", - "description": "If set, read a list of seed urls, one per line. HTTPS URL to an online file." - }, - "lang": { - "type": "string", - "required": false, - "title": "Browser Language", - "description": "If set, sets the language used by the browser, should be ISO 639 language[-country] code, e.g. `en` or `en-GB`" - }, - "title": { - "type": "string", - "required": false, - "title": "Title", - "description": "Custom title for your ZIM. Defaults to title of main page", - "minLength": 1, - "maxLength": 30 - }, - "description": { - "type": "string", - "required": false, - "title": "Description", - "description": "Description for ZIM", - "minLength": 1, - "maxLength": 80 - }, - "favicon": { - "type": "blob", - "kind": "image", - "required": false, - "title": "Illustration", - "description": "URL for Illustration. " - }, - "tags": { - "type": "string", - "required": false, - "title": "ZIM Tags", - "description": "Single string with individual tags separated by a semicolon." - }, - "creator": { - "type": "string", - "required": false, - "title": "Creator", - "description": "Name of content creator" - }, - "publisher": { - "type": "string", - "required": false, - "title": "Publisher", - "isPublisher": true, - "description": "Custom publisher name (ZIM metadata). openZIM otherwise" - }, - "source": { - "type": "string", - "required": false, - "title": "Source", - "description": "Source name/URL of content" - }, - "workers": { - "type": "integer", - "required": false, - "title": "Workers", - "description": "The number of workers to run in parallel. Defaults to 1", - "min": 1 - }, - "wait_until": { - "type": "string", - "required": false, - "title": "WaitUntil", - "description": "Puppeteer page.goto() condition to wait for before continuing. One of load, domcontentloaded, networkidle0 or networkidle2, or a comma-separated combination of those. Default is load,networkidle2" - }, - "extra_hops": { - "type": "integer", - "required": false, - "title": "Extra Hops", - "description": "Number of extra 'hops' to follow, beyond the current scope. 
Default is 0", - "min": 0 - }, - "page_limit": { - "type": "integer", - "required": false, - "title": "Page Limit", - "description": "Limit crawl to this number of pages. Default is 0 (no-limit).", - "min": 0 - }, - "max_page_limit": { - "type": "integer", - "required": false, - "title": "Max Page Limit", - "description": "Maximum pages to crawl, overriding pageLimit if both are set. Default is 0 (no-limit)", - "min": 0 - }, - "page_load_timeout": { - "type": "integer", - "required": false, - "title": "Page Load Timeout", - "description": "Timeout for each page to load (in seconds). Default is 90", - "min": 0 - }, - "scope_type": { - "type": "string-enum", - "required": false, - "title": "Scope Type", - "description": "A predfined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom if scopeIncludeRx is set, prefix otherwise.", - "choices": [ - { - "title": "Page", - "value": "page" - }, - { - "title": "Page SPA", - "value": "page-spa" - }, - { - "title": "Prefix", - "value": "prefix" - }, - { - "title": "Host", - "value": "host" - }, - { - "title": "Domain", - "value": "domain" - }, - { - "title": "Any", - "value": "any" - }, - { - "title": "Custom", - "value": "custom" - } - ] - }, - "scope_include_rx": { - "type": "string", - "required": false, - "title": "Scope Include Regex", - "description": "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of seed)" - }, - "scope_exclude_rx": { - "type": "string", - "required": false, - "title": "Scope Exclude Regex", - "description": "Regex of page URLs that should be excluded from the crawl" - }, - "allow_hash_urls": { - "type": "boolean", - "required": false, - "title": "Allow Hashtag URLs", - "description": "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content" - }, - "mobile_device": { - "type": "string-enum", - "required": false, - "title": "As device", - "description": "Device to crawl as. 
See Pupeeter's Device.ts for a list", - "choices": [ - { - "title": "Blackberry Playbook", - "value": "Blackberry PlayBook" - }, - { - "title": "Blackberry Playbook Landscape", - "value": "Blackberry PlayBook landscape" - }, - { - "title": "Blackberry Z30", - "value": "BlackBerry Z30" - }, - { - "title": "Blackberry Z30 Landscape", - "value": "BlackBerry Z30 landscape" - }, - { - "title": "Galaxy Note 3", - "value": "Galaxy Note 3" - }, - { - "title": "Galaxy Note 3 Landscape", - "value": "Galaxy Note 3 landscape" - }, - { - "title": "Galaxy Note II", - "value": "Galaxy Note II" - }, - { - "title": "Galaxy Note II Landscape", - "value": "Galaxy Note II landscape" - }, - { - "title": "Galaxy S III", - "value": "Galaxy S III" - }, - { - "title": "Galaxy S III Landscape", - "value": "Galaxy S III landscape" - }, - { - "title": "Galaxy S5", - "value": "Galaxy S5" - }, - { - "title": "Galaxy S5 Landscape", - "value": "Galaxy S5 landscape" - }, - { - "title": "Galaxy S8", - "value": "Galaxy S8" - }, - { - "title": "Galaxy S8 Landscape", - "value": "Galaxy S8 landscape" - }, - { - "title": "Galaxy S9 Plus", - "value": "Galaxy S9+" - }, - { - "title": "Galaxy S9 Plus Landscape", - "value": "Galaxy S9+ landscape" - }, - { - "title": "Galaxy Tab S4", - "value": "Galaxy Tab S4" - }, - { - "title": "Galaxy Tab S4 Landscape", - "value": "Galaxy Tab S4 landscape" - }, - { - "title": "iPad", - "value": "iPad" - }, - { - "title": "iPad Landscape", - "value": "iPad landscape" - }, - { - "title": "iPad Gen 6", - "value": "iPad (gen 6)" - }, - { - "title": "iPad Gen 6 Landscape", - "value": "iPad (gen 6) landscape" - }, - { - "title": "iPad Gen 7", - "value": "iPad (gen 7)" - }, - { - "title": "iPad Gen 7 Landscape", - "value": "iPad (gen 7) landscape" - }, - { - "title": "iPad Mini", - "value": "iPad Mini" - }, - { - "title": "iPad Mini Landscape", - "value": "iPad Mini landscape" - }, - { - "title": "iPad Pro", - "value": "iPad Pro" - }, - { - "title": "iPad Pro Landscape", - "value": "iPad Pro landscape" - }, - { - "title": "iPad Pro 11", - "value": "iPad Pro 11" - }, - { - "title": "iPad Pro 11 Landscape", - "value": "iPad Pro 11 landscape" - }, - { - "title": "iPhone 4", - "value": "iPhone 4" - }, - { - "title": "iPhone 4 Landscape", - "value": "iPhone 4 landscape" - }, - { - "title": "iPhone 5", - "value": "iPhone 5" - }, - { - "title": "iPhone 5 Landscape", - "value": "iPhone 5 landscape" - }, - { - "title": "iPhone 6", - "value": "iPhone 6" - }, - { - "title": "iPhone 6 Landscape", - "value": "iPhone 6 landscape" - }, - { - "title": "iPhone 6 Plus", - "value": "iPhone 6 Plus" - }, - { - "title": "iPhone 6 Plus Landscape", - "value": "iPhone 6 Plus landscape" - }, - { - "title": "iPhone 7", - "value": "iPhone 7" - }, - { - "title": "iPhone 7 Landscape", - "value": "iPhone 7 landscape" - }, - { - "title": "iPhone 7 Plus", - "value": "iPhone 7 Plus" - }, - { - "title": "iPhone 7 Plus Landscape", - "value": "iPhone 7 Plus landscape" - }, - { - "title": "iPhone 8", - "value": "iPhone 8" - }, - { - "title": "iPhone 8 Landscape", - "value": "iPhone 8 landscape" - }, - { - "title": "iPhone 8 Plus", - "value": "iPhone 8 Plus" - }, - { - "title": "iPhone 8 Plus Landscape", - "value": "iPhone 8 Plus landscape" - }, - { - "title": "iPhone SE", - "value": "iPhone SE" - }, - { - "title": "iPhone SE Landscape", - "value": "iPhone SE landscape" - }, - { - "title": "iPhone X", - "value": "iPhone X" - }, - { - "title": "iPhone X Landscape", - "value": "iPhone X landscape" - }, - { - "title": "iPhone XR", - "value": 
"iPhone XR" - }, - { - "title": "iPhone XR Landscape", - "value": "iPhone XR landscape" - }, - { - "title": "iPhone 11", - "value": "iPhone 11" - }, - { - "title": "iPhone 11 Landscape", - "value": "iPhone 11 landscape" - }, - { - "title": "iPhone 11 Pro", - "value": "iPhone 11 Pro" - }, - { - "title": "iPhone 11 Pro Landscape", - "value": "iPhone 11 Pro landscape" - }, - { - "title": "iPhone 11 Pro Max", - "value": "iPhone 11 Pro Max" - }, - { - "title": "iPhone 11 Pro Max Landscape", - "value": "iPhone 11 Pro Max landscape" - }, - { - "title": "iPhone 12", - "value": "iPhone 12" - }, - { - "title": "iPhone 12 Landscape", - "value": "iPhone 12 landscape" - }, - { - "title": "iPhone 12 Pro", - "value": "iPhone 12 Pro" - }, - { - "title": "iPhone 12 Pro Landscape", - "value": "iPhone 12 Pro landscape" - }, - { - "title": "iPhone 12 Pro Max", - "value": "iPhone 12 Pro Max" - }, - { - "title": "iPhone 12 Pro Max Landscape", - "value": "iPhone 12 Pro Max landscape" - }, - { - "title": "iPhone 12 Mini", - "value": "iPhone 12 Mini" - }, - { - "title": "iPhone 12 Mini Landscape", - "value": "iPhone 12 Mini landscape" - }, - { - "title": "iPhone 13", - "value": "iPhone 13" - }, - { - "title": "iPhone 13 Landscape", - "value": "iPhone 13 landscape" - }, - { - "title": "iPhone 13 Pro", - "value": "iPhone 13 Pro" - }, - { - "title": "iPhone 13 Pro Landscape", - "value": "iPhone 13 Pro landscape" - }, - { - "title": "iPhone 13 Pro Max", - "value": "iPhone 13 Pro Max" - }, - { - "title": "iPhone 13 Pro Max Landscape", - "value": "iPhone 13 Pro Max landscape" - }, - { - "title": "iPhone 13 Mini", - "value": "iPhone 13 Mini" - }, - { - "title": "iPhone 13 Mini Landscape", - "value": "iPhone 13 Mini landscape" - }, - { - "title": "Jio Phone 2", - "value": "JioPhone 2" - }, - { - "title": "Jio Phone 2 Landscape", - "value": "JioPhone 2 landscape" - }, - { - "title": "Kindle Fire HDX", - "value": "Kindle Fire HDX" - }, - { - "title": "Kindle Fire HDX Landscape", - "value": "Kindle Fire HDX landscape" - }, - { - "title": "LG Optimus L70", - "value": "LG Optimus L70" - }, - { - "title": "LG Optimus L70 Landscape", - "value": "LG Optimus L70 landscape" - }, - { - "title": "Microsoft Lumia 550", - "value": "Microsoft Lumia 550" - }, - { - "title": "Microsoft Lumia 950", - "value": "Microsoft Lumia 950" - }, - { - "title": "Microsoft Lumia 950 Landscape", - "value": "Microsoft Lumia 950 landscape" - }, - { - "title": "Nexus 10", - "value": "Nexus 10" - }, - { - "title": "Nexus 10 Landscape", - "value": "Nexus 10 landscape" - }, - { - "title": "Nexus 4", - "value": "Nexus 4" - }, - { - "title": "Nexus 4 Landscape", - "value": "Nexus 4 landscape" - }, - { - "title": "Nexus 5", - "value": "Nexus 5" - }, - { - "title": "Nexus 5 Landscape", - "value": "Nexus 5 landscape" - }, - { - "title": "Nexus 5X", - "value": "Nexus 5X" - }, - { - "title": "Nexus 5X Landscape", - "value": "Nexus 5X landscape" - }, - { - "title": "Nexus 6", - "value": "Nexus 6" - }, - { - "title": "Nexus 6 Landscape", - "value": "Nexus 6 landscape" - }, - { - "title": "Nexus 6P", - "value": "Nexus 6P" - }, - { - "title": "Nexus 6P Landscape", - "value": "Nexus 6P landscape" - }, - { - "title": "Nexus 7", - "value": "Nexus 7" - }, - { - "title": "Nexus 7 Landscape", - "value": "Nexus 7 landscape" - }, - { - "title": "Nokia Lumia 520", - "value": "Nokia Lumia 520" - }, - { - "title": "Nokia Lumia 520 Landscape", - "value": "Nokia Lumia 520 landscape" - }, - { - "title": "Nokia N9", - "value": "Nokia N9" - }, - { - "title": "Nokia N9 Landscape", - 
"value": "Nokia N9 landscape" - }, - { - "title": "Pixel 2", - "value": "Pixel 2" - }, - { - "title": "Pixel 2 Landscape", - "value": "Pixel 2 landscape" - }, - { - "title": "Pixel 2 XL", - "value": "Pixel 2 XL" - }, - { - "title": "Pixel 2 XL Landscape", - "value": "Pixel 2 XL landscape" - }, - { - "title": "Pixel 3", - "value": "Pixel 3" - }, - { - "title": "Pixel 3 Landscape", - "value": "Pixel 3 landscape" - }, - { - "title": "Pixel 4", - "value": "Pixel 4" - }, - { - "title": "Pixel 4 Landscape", - "value": "Pixel 4 landscape" - }, - { - "title": "Pixel 4A 5G", - "value": "Pixel 4a (5G)" - }, - { - "title": "Pixel 4A 5G Landscape", - "value": "Pixel 4a (5G) landscape" - }, - { - "title": "Pixel 5", - "value": "Pixel 5" - }, - { - "title": "Pixel 5 Landscape", - "value": "Pixel 5 landscape" - }, - { - "title": "Moto G4", - "value": "Moto G4" - }, - { - "title": "Moto G4 Landscape", - "value": "Moto G4 landscape" - } - ] - }, - "select_links": { - "type": "string", - "required": false, - "title": "Select Links", - "description": "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]" - }, - "click_selector": { - "type": "string", - "required": false, - "title": "Click Selector", - "description": "Selector for elements to click when using the autoclick behavior. Default is 'a'" - }, - "block_rules": { - "type": "string", - "required": false, - "title": "Block Rules", - "description": "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe" - }, - "block_message": { - "type": "string", - "required": false, - "title": "Block Message", - "description": "If specified, when a URL is blocked, a record with this error message is added instead" - }, - "block_ads": { - "type": "boolean", - "required": false, - "title": "Block Ads", - "description": "If set, block advertisements from being loaded (based on Stephen Black's blocklist). Note that some bad domains are also blocked by zimit configuration even if this option is not set." - }, - "ad_block_message": { - "type": "string", - "required": false, - "title": "Ads Block Message", - "description": "If specified, when an ad is blocked, a record with this error message is added instead" - }, - "user_agent": { - "type": "string", - "required": false, - "title": "User Agent", - "description": "Override user-agent with specified" - }, - "user_agent_suffix": { - "type": "string", - "required": false, - "title": "User Agent Suffix", - "description": "Append suffix to existing browser user-agent. Defaults to +Zimit" - }, - "use_sitemap": { - "type": "string", - "required": false, - "title": "Sitemap URL", - "description": "Use as sitemap to get additional URLs for the crawl (usually at /sitemap.xml)" - }, - "sitemap_from_date": { - "type": "string", - "required": false, - "title": "Sitemap From Date", - "description": "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)" - }, - "sitemap_to_date": { - "type": "string", - "required": false, - "title": "Sitemap To Date", - "description": "If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)" - }, - "behavior_timeout": { - "type": "integer", - "required": false, - "title": "Behavior Timeout", - "description": "If >0, timeout (in seconds) for in-page behavior will run on each page. 
If 0, a behavior can run until finish. Default is 90.", - "min": 0 - }, - "post_load_delay": { - "type": "integer", - "required": false, - "title": "Post Load Delay", - "description": "If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors. Default is 0.", - "min": 0 - }, - "page_extra_delay": { - "type": "integer", - "required": false, - "title": "Page Extra Delay", - "description": "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page. Default is 0.", - "min": 0 - }, - "dedup_policy": { - "type": "string-enum", - "required": false, - "title": "Dedup Policy", - "description": "Deduplication policy. One of skip, revisit or keep. Default is skip", - "choices": [ - { - "title": "Skip", - "value": "skip" - }, - { - "title": "Revisit", - "value": "revisit" - }, - { - "title": "Keep", - "value": "keep" - } - ] - }, - "screenshot": { - "type": "string", - "required": false, - "title": "Screenshot", - "description": "Screenshot options for crawler. One of view, thumbnail, fullPage, fullPageFinal or a comma-separated combination of those." - }, - "size_soft_limit": { - "type": "integer", - "required": false, - "title": "Size Soft Limit", - "description": "If set, save crawl state and stop crawl if WARC size exceeds this value. ZIM will still be created.", - "min": 0 - }, - "size_hard_limit": { - "type": "integer", - "required": false, - "title": "Size Hard Limit", - "description": "If set, exit crawler and fail the scraper immediately if WARC size exceeds this value", - "min": 0 - }, - "disk_utilization": { - "type": "integer", - "required": false, - "title": "Disk Utilization", - "description": "Save state and exit if disk utilization exceeds this percentage value. Default (if not set) is 90%. Set to 0 to disable disk utilization check.", - "min": 0 - }, - "time_soft_limit": { - "type": "integer", - "required": false, - "title": "Time Soft Limit", - "description": "If set, save crawl state and stop crawl if WARC(s) creation takes longer than this value, in seconds. ZIM will still be created.", - "min": 0 - }, - "time_hard_limit": { - "type": "integer", - "required": false, - "title": "Time Hard Limit", - "description": "If set, exit crawler and fail the scraper immediately if WARC(s) creation takes longer than this value, in seconds", - "min": 0 - }, - "net_idle_wait": { - "type": "integer", - "required": false, - "title": "Net Idle Wait", - "description": "If set, wait for network idle after page load and after behaviors are done (in seconds). If -1 (default), determine based on scope." - }, - "origin_override": { - "type": "string", - "required": false, - "title": "Origin Override", - "description": "If set, will redirect requests from each origin in key to origin in the value, eg. https://host:port=http://alt-host:alt-port." - }, - "max_page_retries": { - "type": "integer", - "required": false, - "title": "Max Page Retries", - "description": "If set, number of times to retry a page that failed to load before page is considered to have failed. Default is 2.", - "min": 0 - }, - "fail_on_failed_seed": { - "type": "boolean", - "required": false, - "title": "Fail on failed seed", - "description": "Whether to display additional logs" - }, - "fail_on_invalid_status": { - "type": "boolean", - "required": false, - "title": "Fail on invalid status", - "description": "If set, will treat pages with 4xx or 5xx response as failures. 
When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses" - }, - "fail_on_failed_limit": { - "type": "integer", - "required": false, - "title": "Fail on failed - Limit", - "description": "If set, save state and exit if number of failed pages exceeds this value.", - "min": 0 - }, - "warcs": { - "type": "string", - "required": false, - "title": "WARC files", - "description": "Comma-separated list of WARC files to use as input." - }, - "verbose": { - "type": "boolean", - "required": false, - "title": "Verbose mode", - "description": "Whether to display additional logs" - }, - "keep": { - "type": "boolean", - "required": false, - "title": "Keep", - "description": "Should be True. Developer option: must be True if we want to keep the WARC files for artifacts archiving.", - "default": true - }, - "output": { - "type": "string", - "required": false, - "title": "Output folder", - "description": "Output folder for ZIM file(s). Leave it as `/output`", - "pattern": "^/output$" - }, - "admin_email": { - "type": "email", - "required": false, - "title": "Admin Email", - "description": "Admin Email for crawler: used in UserAgent so website admin can contact us", - "default": "contact+zimfarm@kiwix.org" - }, - "profile": { - "type": "string", - "required": false, - "title": "Browser profile", - "description": "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory for Browsertrix crawler." - }, - "behaviors": { - "type": "string", - "required": false, - "title": "Behaviors", - "description": "Which background behaviors to enable on each page. Defaults to autoplay,autofetch,siteSpecific." - }, - "depth": { - "type": "integer", - "required": false, - "title": "Depth", - "description": "The depth of the crawl for all seeds. Default is -1 (infinite).", - "min": -1 - }, - "zim_lang": { - "type": "string", - "required": false, - "title": "ZIM Language", - "description": "Language metadata of ZIM (warc2zim --lang param). ISO-639-3 code. 
Retrieved from homepage if found, fallback to `eng`", - "alias": "zim-lang", - "customValidator": "language_code" - }, - "long_description": { - "type": "string", - "required": false, - "title": "Long description", - "description": "Optional long description for your ZIM", - "minLength": 1, - "maxLength": 4000, - "alias": "long-description" - }, - "custom_css": { - "type": "blob", - "kind": "css", - "required": false, - "title": "Custom CSS", - "description": "URL to a CSS file to inject into pages", - "alias": "custom-css" - }, - "charsets_to_try": { - "type": "string", - "required": false, - "title": "Charsets to try", - "description": "List of charsets to try decode content when charset is not found", - "alias": "charsets-to-try" - }, - "ignore_content_header_charsets": { - "type": "boolean", - "required": false, - "title": "Ignore Content Header Charsets", - "description": "Ignore the charsets specified in content headers - first bytes - typically because they are wrong.", - "alias": "ignore-content-header-charsets" - }, - "content_header_bytes_length": { - "type": "integer", - "required": false, - "title": "Content Header Bytes Length", - "description": "How many bytes to consider when searching for content charsets in header (default is 1024).", - "alias": "content-header-bytes-length", - "min": 0 - }, - "ignore_http_header_charsets": { - "type": "boolean", - "required": false, - "title": "Ignore HTTP Header Charsets", - "description": "Ignore the charsets specified in HTTP `Content-Type` headers, typically because they are wrong.", - "alias": "ignore-http-header-charsets" - }, - "encoding_aliases": { - "type": "string", - "required": false, - "title": "Encoding Aliases", - "description": "List of encoding/charset aliases to decode WARC content. Aliases are used when the encoding specified in upstream server exists in Python under a different name. This parameter is single string, multiple values are separated by a comma, like in alias1=encoding1,alias2=encoding2.", - "alias": "encoding-aliases" - }, - "custom_behaviors": { - "type": "string", - "required": false, - "title": "Custom Behaviors", - "description": "JS code for custom behaviors to customize crawler. Single string with individual JS files URL/path separated by a comma.", - "alias": "custom-behaviours" - }, - "zimit_progress_file": { - "type": "string", - "required": false, - "title": "Zimit Progress File", - "description": "Scraping progress file. Leave it as `/output/task_progress.json`", - "alias": "zimit-progress-file", - "pattern": "^/output/task_progress\\.json$" - }, - "replay_viewer_source": { - "type": "url", - "required": false, - "title": "Replay Viewer Source", - "description": "URL from which to load the ReplayWeb.page replay viewer from", - "alias": "replay-viewer-source" - }, - "zim_file": { - "type": "string", - "required": false, - "title": "ZIM filename", - "description": "ZIM file name (based on --name if not provided). 
Include {period} to insert date period dynamically", - "alias": "zim-file", - "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+_)([a-z0-9\\-\\.]+_|)([\\d]{4}-[\\d]{2}|\\{period\\}).zim$", - "relaxedPattern": "^[A-Za-z0-9._-]+$" - }, - "name": { - "type": "string", - "required": true, - "title": "ZIM name", - "description": "Name of the ZIM.", - "alias": "name", - "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$", - "relaxedPattern": "^[A-Za-z0-9._-]+$" - }, - "overwrite": { - "type": "boolean", - "required": false, - "title": "Overwrite", - "description": "Whether to overwrite existing ZIM file if it exists" - } - } -} diff --git a/pyproject.toml b/pyproject.toml index e4e7696..b213161 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,17 +1,17 @@ [build-system] -requires = ["hatchling", "hatch-openzim"] +requires = ["hatchling", "hatch-openzim==0.2.0"] build-backend = "hatchling.build" [project] name = "zimit" -requires-python = ">=3.13,<3.14" +requires-python = ">=3.12,<3.13" description = "Make ZIM file from any website through crawling" readme = "README.md" dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim @ git+https://github.com/openzim/warc2zim@main", + "warc2zim==2.1.3", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] @@ -26,20 +26,20 @@ scripts = [ "invoke==2.2.0", ] lint = [ - "black==25.1.0", - "ruff==0.9.4", + "black==24.10.0", + "ruff==0.6.9", ] check = [ - "pyright==1.1.393", + "pyright==1.1.383", ] test = [ - "pytest==8.3.4", - "coverage==7.6.10", + "pytest==8.3.3", + "coverage==7.6.1", ] dev = [ - "pre-commit==4.1.0", - "debugpy==1.8.12", - "selenium==4.28.1", # used in daily tests, convenient for dev purpose (autocompletion) + "pre-commit==4.0.0", + "debugpy==1.8.6", + "selenium==4.25.0", # used in daily tests, convenient for dev purpose (autocompletion) "zimit[scripts]", "zimit[lint]", "zimit[test]", @@ -95,10 +95,10 @@ all = "inv checkall --args '{args}'" [tool.black] line-length = 88 -target-version = ['py313'] +target-version = ['py312'] [tool.ruff] -target-version = "py313" +target-version = "py312" line-length = 88 src = ["src"] @@ -221,5 +221,5 @@ exclude_lines = [ include = ["src", "tests", "tasks.py"] exclude = [".env/**", ".venv/**"] extraPaths = ["src"] -pythonVersion = "3.13" +pythonVersion = "3.12" typeCheckingMode="basic" diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 281b1bb..edc60b3 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "3.0.6-dev0" +__version__ = "2.1.6" diff --git a/src/zimit/constants.py b/src/zimit/constants.py index 35baeb9..f81905a 100644 --- a/src/zimit/constants.py +++ b/src/zimit/constants.py @@ -3,8 +3,7 @@ import logging from zimscraperlib.logging import getLogger EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2 -EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT = 14 -EXIT_CODE_CRAWLER_TIME_LIMIT_HIT = 15 +EXIT_CODE_CRAWLER_LIMIT_HIT = 11 NORMAL_WARC2ZIM_EXIT_CODE = 100 REQUESTS_TIMEOUT = 10 diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index b205007..44c6d4f 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -25,28 +25,26 @@ from zimscraperlib.uri import rebuild_uri from zimit.__about__ import __version__ from zimit.constants import ( - EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT, - EXIT_CODE_CRAWLER_TIME_LIMIT_HIT, + EXIT_CODE_CRAWLER_LIMIT_HIT, EXIT_CODE_WARC2ZIM_CHECK_FAILED, NORMAL_WARC2ZIM_EXIT_CODE, logger, ) from zimit.utils import download_file -temp_root_dir: Path | None = None - class 
ProgressFileWatcher: - def __init__( - self, crawl_stats_path: Path, warc2zim_stats_path, zimit_stats_path: Path - ): - self.crawl_stats_path = crawl_stats_path - self.warc2zim_stats_path = warc2zim_stats_path - self.zimit_stats_path = zimit_stats_path + def __init__(self, output_dir: Path, stats_path: Path): + self.crawl_path = output_dir / "crawl.json" + self.warc2zim_path = output_dir / "warc2zim.json" + self.stats_path = stats_path + + if not self.stats_path.is_absolute(): + self.stats_path = output_dir / self.stats_path # touch them all so inotify is not unhappy on add_watch - self.crawl_stats_path.touch() - self.warc2zim_stats_path.touch() + self.crawl_path.touch() + self.warc2zim_path.touch() self.process = None def stop(self): @@ -58,28 +56,40 @@ class ProgressFileWatcher: def watch(self): self.process = Process( target=self.inotify_watcher, - args=( - str(self.crawl_stats_path), - str(self.warc2zim_stats_path), - str(self.zimit_stats_path), - ), + args=(str(self.crawl_path), str(self.warc2zim_path), str(self.stats_path)), ) self.process.daemon = True self.process.start() - def inotify_watcher(self, crawl_fpath: str, warc2zim_fpath: str, zimit_fpath: str): + @staticmethod + def inotify_watcher(crawl_fpath: str, warc2zim_fpath: str, output_fpath: str): ino = inotify.adapters.Inotify() ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) # pyright: ignore ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) # pyright: ignore - def crawl_conv(data): + class Limit: + def __init__(self): + self.max = self.hit = None + + @property + def as_dict(self): + return {"max": self.max, "hit": self.hit} + + # limit is only reported by crawl but needs to be reported up + limit = Limit() + + def crawl_conv(data, limit): # we consider crawl to be 90% of the workload so total = craw_total * 90% + # limit = {"max": data["limit"]["max"], "hit": data["limit"]["hit"]} + limit.max = data["limit"]["max"] + limit.hit = data["limit"]["hit"] return { "done": data["crawled"], "total": int(data["total"] / 0.9), + "limit": limit.as_dict, } - def warc2zim_conv(data): + def warc2zim_conv(data, limit): # we consider warc2zim to be 10% of the workload so # warc2zim_total = 10% and total = 90 + warc2zim_total * 10% return { @@ -88,6 +98,7 @@ class ProgressFileWatcher: * (0.9 + (float(data["written"]) / data["total"]) / 10) ), "total": data["total"], + "limit": limit.as_dict, } for _, _, fpath, _ in ino.event_gen(yield_nones=False): # pyright: ignore @@ -97,305 +108,128 @@ class ProgressFileWatcher: # open input and output separatly as to not clear output on error with open(fpath) as ifh: try: - out = func(json.load(ifh)) + out = func(json.load(ifh), limit) except Exception: # nosec # noqa: S112 # simply ignore progress update should an error arise # might be malformed input for instance continue if not out: continue - with open(zimit_fpath, "w") as ofh: + with open(output_fpath, "w") as ofh: json.dump(out, ofh) -def cleanup(): - if not temp_root_dir: - logger.warning("Temporary root dir not already set, cannot clean this up") - return - logger.info("") - logger.info("----------") - logger.info(f"Cleanup, removing temp dir: {temp_root_dir}") - shutil.rmtree(temp_root_dir) - - -def cancel_cleanup(): - logger.info( - f"Temporary files have been kept in {temp_root_dir}, please clean them" - " up manually once you don't need them anymore" - ) - atexit.unregister(cleanup) - - def run(raw_args): parser = ArgumentParser( description="Run a browser-based crawl on the specified URL and convert to ZIM" ) - 
parser.add_argument( - "--seeds", - help="The seed URL(s) to start crawling from. Multile seed URL must be " - "separated by a comma (usually not needed, these are just the crawl seeds). " - "First seed URL is used as ZIM homepage", - ) - - parser.add_argument("--title", help="WARC and ZIM title") - parser.add_argument("--description", help="WARC and ZIM description") + parser.add_argument("-u", "--url", help="The URL to start crawling from") + parser.add_argument("--title", help="ZIM title") + parser.add_argument("--description", help="ZIM description") parser.add_argument("--long-description", help="ZIM long description metadata") parser.add_argument( - "--seedFile", - help="If set, read a list of seed urls, one per line. Can be a local file or " - "the HTTP(s) URL to an online file.", + "--urlFile", + help="If set, read a list of seed urls, one per line, from the specified", ) - parser.add_argument( - "-w", "--workers", type=int, help="Number of parallel workers. Default is 1." - ) - - parser.add_argument( - "--crawlId", - help="A user provided ID for this crawl or crawl configuration (can also be " - "set via CRAWL_ID env var, defaults to machine hostname)", - ) + parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers") parser.add_argument( "--waitUntil", help="Puppeteer page.goto() condition to wait for before continuing. One of " "load, domcontentloaded, networkidle0 or networkidle2, or a " - "comma-separated combination of those. Default is load,networkidle2", + "comma-separated combination of those.", + default="load", ) parser.add_argument( - "--depth", - help="The depth of the crawl for all seeds. Default is -1 (infinite).", - type=int, + "--depth", help="The depth of the crawl for all seeds", type=int, default=-1 ) parser.add_argument( "--extraHops", - help="Number of extra 'hops' to follow, beyond the current scope. " - "Default is 0.", + help="Number of extra 'hops' to follow, beyond the current scope", type=int, ) - parser.add_argument( - "--pageLimit", - help="Limit crawl to this number of pages. Default is 0 (no limit).", - type=int, - ) + parser.add_argument("--limit", help="Limit crawl to this number of pages", type=int) parser.add_argument( "--maxPageLimit", - help="Maximum pages to crawl, overriding pageLimit if both are set. Default is " - "0 (no limit)", + help="Maximum pages to crawl, overriding pageLimit if both are set", type=int, ) parser.add_argument( - "--pageLoadTimeout", - help="Timeout for each page to load (in seconds). Default is 90 secs.", + "--timeout", + help="Timeout for each page to load (in seconds)", type=int, + default=90, ) parser.add_argument( "--scopeType", help="A predfined scope of the crawl. For more customization, " - "use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. 
Default is custom" - "if scopeIncludeRx is set, prefix otherwise.", + "use 'custom' and set scopeIncludeRx regexes", choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"], ) parser.add_argument( - "--scopeIncludeRx", - help="Regex of page URLs that should be included in the crawl (defaults to " + "--include", + help="Regex of page URLs that should be " + "included in the crawl (defaults to " "the immediate directory of URL)", ) parser.add_argument( - "--scopeExcludeRx", + "--exclude", help="Regex of page URLs that should be excluded from the crawl", ) - parser.add_argument( - "--allowHashUrls", - help="Allow Hashtag URLs, useful for single-page-application crawling or " - "when different hashtags load dynamic content", - action="store_true", - ) - - parser.add_argument( - "--selectLinks", - help="One or more selectors for extracting links, in the format " - "[css selector]->[property to use],[css selector]->@[attribute to use]", - ) - - parser.add_argument( - "--clickSelector", - help="Selector for elements to click when using the autoclick behavior. Default" - " is 'a'", - ) - - parser.add_argument( - "--blockRules", - help="Additional rules for blocking certain URLs from being loaded, by URL " - "regex and optionally via text match in an iframe", - ) - - parser.add_argument( - "--blockMessage", - help="If specified, when a URL is blocked, a record with this error message is" - " added instead", - ) - - parser.add_argument( - "--blockAds", - help="If set, block advertisements from being loaded (based on Stephen Black's" - " blocklist). Note that some bad domains are also blocked by zimit" - " configuration even if this option is not set.", - ) - - parser.add_argument( - "--adBlockMessage", - help="If specified, when an ad is blocked, a record with this error message is" - " added instead", - ) - parser.add_argument( "--collection", help="Collection name to crawl to (replay will be accessible " - "under this name in pywb preview). Default is crawl-@ts.", + "under this name in pywb preview) instead of crawl-@ts", ) parser.add_argument( - "--headless", - help="Run in headless mode, otherwise start xvfb", + "--allowHashUrls", + help="Allow Hashtag URLs, useful for " + "single-page-application crawling or " + "when different hashtags load dynamic " + "content", action="store_true", ) parser.add_argument( - "--driver", - help="Custom driver for the crawler, if any", + "--lang", + help="if set, sets the language used by the browser, should be ISO 639 " + "language[-country] code", ) parser.add_argument( - "--generateCDX", - help="If set, generate index (CDXJ) for use with pywb after crawl is done", - action="store_true", + "--zim-lang", + help="Language metadata of ZIM " + "(warc2zim --lang param). ISO-639-3 code. " + "Retrieved from homepage if found, fallback to `eng`", ) - parser.add_argument( - "--combineWARC", - help="If set, combine the warcs", - action="store_true", - ) - - parser.add_argument( - "--rolloverSize", - help="If set, declare the rollover size. 
Default is 1000000000.", - type=int, - ) - - parser.add_argument( - "--generateWACZ", - help="If set, generate WACZ on disk", - action="store_true", - ) - - parser.add_argument( - "--logging", - help="Crawler logging configuration", - ) - - parser.add_argument( - "--logLevel", - help="Comma-separated list of log levels to include in logs", - ) - - parser.add_argument( - "--logContext", - help="Comma-separated list of contexts to include in logs", - choices=[ - "general", - "worker", - "recorder", - "recorderNetwork", - "writer", - "state", - "redis", - "storage", - "text", - "exclusion", - "screenshots", - "screencast", - "originOverride", - "healthcheck", - "browser", - "blocking", - "behavior", - "behaviorScript", - "jsError", - "fetch", - "pageStatus", - "memoryStatus", - "crawlStatus", - "links", - "sitemap", - "wacz", - "replay", - "proxy", - ], - ) - - parser.add_argument( - "--logExcludeContext", - help="Comma-separated list of contexts to NOT include in logs. Default is " - "recorderNetwork,jsError,screencast", - choices=[ - "general", - "worker", - "recorder", - "recorderNetwork", - "writer", - "state", - "redis", - "storage", - "text", - "exclusion", - "screenshots", - "screencast", - "originOverride", - "healthcheck", - "browser", - "blocking", - "behavior", - "behaviorScript", - "jsError", - "fetch", - "pageStatus", - "memoryStatus", - "crawlStatus", - "links", - "sitemap", - "wacz", - "replay", - "proxy", - ], - ) - - parser.add_argument( - "--text", - help="Extract initial (default) or final text to pages.jsonl or WARC resource" - " record(s)", - ) - - # cwd is manipulated directly by zimit, based on --output / --build, we do not want - # to expose this setting - parser.add_argument( "--mobileDevice", help="Emulate mobile device by name from " "https://github.com/puppeteer/puppeteer/blob/" "main/packages/puppeteer-core/src/common/Device.ts", + default="Pixel 2", + ) + + parser.add_argument( + "--noMobileDevice", + help="Do not emulate a mobile device (use at your own risk, behavior is" + "uncertain)", + action="store_true", + default=False, ) parser.add_argument( @@ -417,71 +251,27 @@ def run(raw_args): "(usually /sitemap.xml)", ) - parser.add_argument( - "--sitemapFromDate", - help="If set, filter URLs from sitemaps to those greater than or equal to (>=)" - " provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)", - ) - - parser.add_argument( - "--sitemapToDate", - help="If set, filter URLs from sitemaps to those less than or equal to (<=) " - "provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)", - ) - - parser.add_argument( - "--statsFilename", - help="If set, output crawl stats as JSON to this file. Relative filename " - "resolves to output directory, see --output.", - ) - - parser.add_argument( - "--zimit-progress-file", - help="If set, output zimit stats as JSON to this file. Forces the creation of" - "crawler and warc2zim stats as well. If --statsFilename and/or " - "--warc2zim-progress-file are not set, default temporary files will be used. " - "Relative filename resolves to output directory, see --output.", - ) - - parser.add_argument( - "--warc2zim-progress-file", - help="If set, output warc2zim stats as JSON to this file. Relative filename " - "resolves to output directory, see --output.", - ) - parser.add_argument( "--behaviors", - help="Which background behaviors to enable on each page. 
Default is autoplay," - "autofetch,autoscroll,siteSpecific", + help="Which background behaviors to enable on each page", + default="autoplay,autofetch,siteSpecific", ) parser.add_argument( "--behaviorTimeout", help="If >0, timeout (in seconds) for in-page behavior will run on each page. " - "If 0, a behavior can run until finish. Default is 90.", + "If 0, a behavior can run until finish", type=int, + default=90, ) parser.add_argument( - "--postLoadDelay", - help="If >0, amount of time to sleep (in seconds) after page has loaded, before" - " taking screenshots / getting text / running behaviors. Default is 0.", - type=int, - ) - - parser.add_argument( - "--pageExtraDelay", + "--delay", help="If >0, amount of time to sleep (in seconds) after behaviors " - "before moving on to next page. Default is 0.", + "before moving on to next page", type=int, ) - parser.add_argument( - "--dedupPolicy", - help="Deduplication policy. Default is skip", - choices=["skip", "revisit", "keep"], - ) - parser.add_argument( "--profile", help="Path or HTTP(S) URL to tar.gz file which contains the browser profile " @@ -489,61 +279,8 @@ def run(raw_args): ) parser.add_argument( - "--screenshot", - help="Screenshot options for crawler. One of view, thumbnail, fullPage, " - "fullPageFinal or a comma-separated combination of those.", - ) - - parser.add_argument( - "--screencastPort", - help="If set to a non-zero value, starts an HTTP server with screencast " - "accessible on this port.", - type=int, - ) - - parser.add_argument( - "--screencastRedis", - help="If set, will use the state store redis pubsub for screencasting", - action="store_true", - ) - - parser.add_argument( - "--warcInfo", - help="Optional fields added to the warcinfo record in combined WARCs", - ) - - parser.add_argument( - "--saveState", - help="If the crawl state should be serialized to the crawls/ directory. " - "Defaults to 'partial', only saved when crawl is interrupted", - choices=["never", "partial", "always"], - ) - - parser.add_argument( - "--saveStateInterval", - help="If save state is set to 'always', also save state during the crawl at " - "this interval (in seconds). Default to 300.", - type=int, - ) - - parser.add_argument( - "--saveStateHistory", - help="Number of save states to keep during the duration of a crawl. " - "Default to 5.", - type=int, - ) - - size_group = parser.add_mutually_exclusive_group() - size_group.add_argument( - "--sizeSoftLimit", - help="If set, save crawl state and stop crawl if WARC size exceeds this value. " - "ZIM will still be created.", - type=int, - ) - size_group.add_argument( - "--sizeHardLimit", - help="If set, exit crawler and fail the scraper immediately if WARC size " - "exceeds this value", + "--sizeLimit", + help="If set, save state and exit if size limit exceeds this value", type=int, ) @@ -555,17 +292,9 @@ def run(raw_args): default=90, ) - time_group = parser.add_mutually_exclusive_group() - time_group.add_argument( - "--timeSoftLimit", - help="If set, save crawl state and stop crawl if WARC WARC(s) creation takes " - "longer than this value, in seconds. 
ZIM will still be created.", - type=int, - ) - time_group.add_argument( - "--timeHardLimit", - help="If set, exit crawler and fail the scraper immediately if WARC(s) creation" - " takes longer than this value, in seconds", + parser.add_argument( + "--timeLimit", + help="If set, save state and exit after time limit, in seconds", type=int, ) @@ -580,150 +309,16 @@ def run(raw_args): help="overwrite current crawl data: if set, existing collection directory " "will be deleted before crawl is started", action="store_true", - ) - - parser.add_argument( - "--waitOnDone", - help="if set, wait for interrupt signal when finished instead of exiting", - action="store_true", - ) - - parser.add_argument( - "--restartsOnError", - help="if set, assume will be restarted if interrupted, don't run post-crawl " - "processes on interrupt", - action="store_true", - ) - - parser.add_argument( - "--netIdleWait", - help="If set, wait for network idle after page load and after behaviors are " - "done (in seconds). if -1 (default), determine based on scope.", - type=int, - ) - - parser.add_argument( - "--lang", - help="if set, sets the language used by the browser, should be ISO 639 " - "language[-country] code", - ) - - parser.add_argument( - "--originOverride", - help="if set, will redirect requests from each origin in key to origin in the " - "value, eg. --originOverride https://host:port=http://alt-host:alt-port", - ) - - parser.add_argument( - "--logErrorsToRedis", - help="If set, write error messages to redis", - action="store_true", - ) - - parser.add_argument( - "--writePagesToRedis", - help="If set, write page objects to redis", - action="store_true", - ) - - parser.add_argument( - "--maxPageRetries", - help="If set, number of times to retry a page that failed to load before page" - " is considered to have failed. Default is 2.", - type=int, - ) - - parser.add_argument( - "--failOnFailedSeed", - help="If set, crawler will fail with exit code 1 if any seed fails. When " - "combined with --failOnInvalidStatus, will result in crawl failing with exit " - "code 1 if any seed has a 4xx/5xx response", - action="store_true", - ) - - parser.add_argument( - "--failOnFailedLimit", - help="If set, save state and exit if number of failed pages exceeds this value", - action="store_true", - ) - - parser.add_argument( - "--failOnInvalidStatus", - help="If set, will treat pages with 4xx or 5xx response as failures. When " - "combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl " - "failing due to non-200 responses", - action="store_true", - ) - - # customBehaviors not included because it has special handling - # debugAccessRedis not included due to custom redis engine in zimit - - parser.add_argument( - "--debugAccessBrowser", - help="if set, allow debugging browser on port 9222 via CDP", - action="store_true", - ) - - parser.add_argument( - "--warcPrefix", - help="prefix for WARC files generated, including WARCs added to WACZ", - ) - - parser.add_argument( - "--serviceWorker", - help="service worker handling: disabled, enabled or disabled-if-profile. " - "Default: disabled.", - ) - - parser.add_argument( - "--proxyServer", - help="if set, will use specified proxy server. Takes precedence over any env " - "var proxy settings", - ) - - parser.add_argument( - "--dryRun", - help="If true, no archive data is written to disk, only pages and logs (and " - "optionally saved state).", - action="store_true", - ) - - parser.add_argument( - "--qaSource", - help="Required for QA mode. 
Path to the source WACZ or multi WACZ file for QA", - ) - - parser.add_argument( - "--qaDebugImageDiff", - help="if specified, will write crawl.png, replay.png and diff.png for each " - "page where they're different", - action="store_true", - ) - - parser.add_argument( - "--sshProxyPrivateKeyFile", - help="path to SSH private key for SOCKS5 over SSH proxy connection", - ) - - parser.add_argument( - "--sshProxyKnownHostsFile", - help="path to SSH known hosts file for SOCKS5 over SSH proxy connection", + default=False, ) parser.add_argument( "--keep", - help="In case of failure, WARC files and other temporary files (which are " - "stored as a subfolder of output directory) are always kept, otherwise " - "they are automatically deleted. Use this flag to always keep WARC files, " - "even in case of success.", + help="If set, keep WARC files after crawl, don't delete", action="store_true", ) - parser.add_argument( - "--output", - help="Output directory for ZIM. Default to /output.", - default="/output", - ) + parser.add_argument("--output", help="Output directory for ZIM", default="/output") parser.add_argument( "--build", @@ -737,6 +332,11 @@ def run(raw_args): help="[warc2zim] Custom CSS file URL/path to inject into all articles", ) + parser.add_argument( + "--statsFilename", + help="If set, output stats as JSON to this file", + ) + parser.add_argument( "--config", help="Path to YAML config file. If set, browsertrix-crawler will use this file" @@ -751,10 +351,8 @@ def run(raw_args): ) parser.add_argument( - "--zim-lang", - help="Language metadata of ZIM " - "(warc2zim --lang param). ISO-639-3 code. " - "Retrieved from homepage if found, fallback to `eng`", + "--logging", + help="Crawler logging configuration", ) parser.add_argument( @@ -771,16 +369,7 @@ def run(raw_args): "path/URLs separated by comma", ) - parser.add_argument( - "--acceptable-crawler-exit-codes", - help="Non-zero crawler exit codes to consider as acceptable to continue with " - " conversion of WARC to ZIM. Flag partialZim will be set in statsFilename (if " - " used). 
Single value with individual error codes separated by comma", - ) - - # by design, all unknown args are for warc2zim ; known one are either for crawler - # or shared - known_args, warc2zim_args = parser.parse_known_args(raw_args) + zimit_args, warc2zim_args = parser.parse_known_args(raw_args) # pass a scraper suffix to warc2zim so that both zimit and warc2zim versions are # associated with the ZIM ; make it a CSV for easier parsing @@ -788,69 +377,39 @@ def run(raw_args): warc2zim_args.append(f"zimit {__version__}") # pass url and output to warc2zim also - if known_args.output: + if zimit_args.output: warc2zim_args.append("--output") - warc2zim_args.append(known_args.output) + warc2zim_args.append(zimit_args.output) - user_agent_suffix = known_args.userAgentSuffix - if known_args.adminEmail: - user_agent_suffix += f" {known_args.adminEmail}" + url = zimit_args.url - # set temp dir to use for this crawl - global temp_root_dir # noqa: PLW0603 - if known_args.build: - # use build dir argument if passed - temp_root_dir = Path(known_args.build) - temp_root_dir.mkdir(parents=True, exist_ok=True) - else: - # make new randomized temp dir - temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp")) + user_agent_suffix = zimit_args.userAgentSuffix + if zimit_args.adminEmail: + user_agent_suffix += f" {zimit_args.adminEmail}" - seeds = [] - if known_args.seeds: - seeds += [get_cleaned_url(url) for url in known_args.seeds.split(",")] - if known_args.seedFile: - if re.match(r"^https?\://", known_args.seedFile): - with tempfile.NamedTemporaryFile( - dir=temp_root_dir, - prefix="seeds_", - suffix=".txt", - delete_on_close=True, - ) as filename: - seed_file = Path(filename.name) - download_file(known_args.seedFile, seed_file) - seeds += [ - get_cleaned_url(url) for url in seed_file.read_text().splitlines() - ] - else: - seeds += [ - get_cleaned_url(url) - for url in Path(known_args.seedFile).read_text().splitlines() - ] - warc2zim_args.append("--url") - warc2zim_args.append(seeds[0]) + if url: + url = get_cleaned_url(url) + warc2zim_args.append("--url") + warc2zim_args.append(url) - if known_args.custom_css: - warc2zim_args += ["--custom-css", known_args.custom_css] + if zimit_args.custom_css: + warc2zim_args += ["--custom-css", zimit_args.custom_css] - if known_args.title: + if zimit_args.title: warc2zim_args.append("--title") - warc2zim_args.append(known_args.title) + warc2zim_args.append(zimit_args.title) - if known_args.description: + if zimit_args.description: warc2zim_args.append("--description") - warc2zim_args.append(known_args.description) + warc2zim_args.append(zimit_args.description) - if known_args.long_description: + if zimit_args.long_description: warc2zim_args.append("--long-description") - warc2zim_args.append(known_args.long_description) + warc2zim_args.append(zimit_args.long_description) - if known_args.zim_lang: + if zimit_args.zim_lang: warc2zim_args.append("--lang") - warc2zim_args.append(known_args.zim_lang) - - if known_args.overwrite: - warc2zim_args.append("--overwrite") + warc2zim_args.append(zimit_args.zim_lang) logger.info("----------") logger.info("Testing warc2zim args") @@ -860,17 +419,29 @@ def run(raw_args): logger.info("Exiting, invalid warc2zim params") return EXIT_CODE_WARC2ZIM_CHECK_FAILED - # only trigger cleanup when the keep argument is passed without a custom build dir. 
- if not known_args.build and not known_args.keep: + # make temp dir for this crawl + if zimit_args.build: + temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp")) + else: + temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) + + if not zimit_args.keep: + + def cleanup(): + logger.info("") + logger.info("----------") + logger.info(f"Cleanup, removing temp dir: {temp_root_dir}") + shutil.rmtree(temp_root_dir) + atexit.register(cleanup) # copy / download custom behaviors to one single folder and configure crawler - if known_args.custom_behaviors: + if zimit_args.custom_behaviors: behaviors_dir = temp_root_dir / "custom-behaviors" behaviors_dir.mkdir() for custom_behavior in [ custom_behavior.strip() - for custom_behavior in known_args.custom_behaviors.split(",") + for custom_behavior in zimit_args.custom_behaviors.split(",") ]: behaviors_file = tempfile.NamedTemporaryFile( dir=behaviors_dir, @@ -890,100 +461,55 @@ def run(raw_args): f"to {behaviors_file.name}" ) shutil.copy(custom_behavior, behaviors_file.name) - known_args.customBehaviors = str(behaviors_dir) + zimit_args.customBehaviors = str(behaviors_dir) else: - known_args.customBehaviors = None + zimit_args.customBehaviors = None - crawler_args = get_crawler_cmd_line(known_args) - for seed in seeds: - crawler_args.append("--seeds") - crawler_args.append(seed) + cmd_args = get_node_cmd_line(zimit_args) + if url: + cmd_args.append("--url") + cmd_args.append(url) - crawler_args.append("--userAgentSuffix") - crawler_args.append(user_agent_suffix) + cmd_args.append("--userAgentSuffix") + cmd_args.append(user_agent_suffix) - crawler_args.append("--cwd") - crawler_args.append(str(temp_root_dir)) + if not zimit_args.noMobileDevice: + cmd_args.append("--mobileDevice") + cmd_args.append(zimit_args.mobileDevice) - output_dir = Path(known_args.output) - warc2zim_stats_file = ( - Path(known_args.warc2zim_progress_file) - if known_args.warc2zim_progress_file - else temp_root_dir / "warc2zim.json" - ) - if not warc2zim_stats_file.is_absolute(): - warc2zim_stats_file = output_dir / warc2zim_stats_file - warc2zim_stats_file.parent.mkdir(parents=True, exist_ok=True) - warc2zim_stats_file.unlink(missing_ok=True) + cmd_args.append("--cwd") + cmd_args.append(str(temp_root_dir)) - crawler_stats_file = ( - Path(known_args.statsFilename) - if known_args.statsFilename - else temp_root_dir / "crawl.json" - ) - if not crawler_stats_file.is_absolute(): - crawler_stats_file = output_dir / crawler_stats_file - crawler_stats_file.parent.mkdir(parents=True, exist_ok=True) - crawler_stats_file.unlink(missing_ok=True) - - zimit_stats_file = ( - Path(known_args.zimit_progress_file) - if known_args.zimit_progress_file - else temp_root_dir / "stats.json" - ) - if not zimit_stats_file.is_absolute(): - zimit_stats_file = output_dir / zimit_stats_file - zimit_stats_file.parent.mkdir(parents=True, exist_ok=True) - zimit_stats_file.unlink(missing_ok=True) - - if known_args.zimit_progress_file: - # setup inotify crawler progress watcher + # setup inotify crawler progress watcher + if zimit_args.statsFilename: watcher = ProgressFileWatcher( - zimit_stats_path=zimit_stats_file, - crawl_stats_path=crawler_stats_file, - warc2zim_stats_path=warc2zim_stats_file, - ) - logger.info( - f"Writing zimit progress to {watcher.zimit_stats_path}, crawler progress to" - f" {watcher.crawl_stats_path} and warc2zim progress to " - f"{watcher.warc2zim_stats_path}" + Path(zimit_args.output), Path(zimit_args.statsFilename) ) + logger.info(f"Writing 
progress to {watcher.stats_path}") # update crawler command - crawler_args.append("--statsFilename") - crawler_args.append(str(crawler_stats_file)) + cmd_args.append("--statsFilename") + cmd_args.append(str(watcher.crawl_path)) # update warc2zim command warc2zim_args.append("-v") warc2zim_args.append("--progress-file") - warc2zim_args.append(str(warc2zim_stats_file)) + warc2zim_args.append(str(watcher.warc2zim_path)) watcher.watch() - else: - if known_args.statsFilename: - logger.info(f"Writing crawler progress to {crawler_stats_file}") - crawler_args.append("--statsFilename") - crawler_args.append(str(crawler_stats_file)) - if known_args.warc2zim_progress_file: - logger.info(f"Writing warc2zim progress to {warc2zim_stats_file}") - warc2zim_args.append("-v") - warc2zim_args.append("--progress-file") - warc2zim_args.append(str(warc2zim_stats_file)) - cmd_line = " ".join(crawler_args) + cmd_line = " ".join(cmd_args) logger.info("") logger.info("----------") logger.info( f"Output to tempdir: {temp_root_dir} - " - f"{'will keep' if known_args.keep else 'will delete'}" + f"{'will keep' if zimit_args.keep else 'will delete'}" ) - partial_zim = False - # if warc files are passed, do not run browsertrix crawler but fetch the files if # they are provided as an HTTP URL + extract the archive if it is a tar.gz warc_files: list[Path] = [] - if known_args.warcs: + if zimit_args.warcs: for warc_location in [ - warc_location.strip() for warc_location in known_args.warcs.split(",") + warc_location.strip() for warc_location in zimit_args.warcs.split(",") ]: suffix = "".join(Path(urllib.parse.urlparse(warc_location).path).suffixes) if suffix not in {".tar", ".tar.gz", ".warc", ".warc.gz"}: @@ -1039,36 +565,17 @@ def run(raw_args): warc_files.append(Path(extract_path)) else: - logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") - crawl = subprocess.run(crawler_args, check=False) - if ( - crawl.returncode == EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT - and known_args.sizeSoftLimit - ): - logger.info( - "Crawl size soft limit hit. Continuing with warc2zim conversion." - ) - if known_args.zimit_progress_file: - partial_zim = True - elif ( - crawl.returncode == EXIT_CODE_CRAWLER_TIME_LIMIT_HIT - and known_args.timeSoftLimit - ): - logger.info( - "Crawl time soft limit hit. Continuing with warc2zim conversion." 
- ) - if known_args.zimit_progress_file: - partial_zim = True - elif crawl.returncode != 0: - logger.error( - f"Crawl returned an error: {crawl.returncode}, scraper exiting" - ) - cancel_cleanup() - return crawl.returncode - if known_args.collection: + logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") + crawl = subprocess.run(cmd_args, check=False) + if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT: + logger.info("crawl interupted by a limit") + elif crawl.returncode != 0: + raise subprocess.CalledProcessError(crawl.returncode, cmd_args) + + if zimit_args.collection: warc_files = [ - temp_root_dir.joinpath(f"collections/{known_args.collection}/archive/") + temp_root_dir.joinpath(f"collections/{zimit_args.collection}/archive/") ] else: @@ -1082,36 +589,24 @@ def run(raw_args): ) elif len(warc_dirs) > 1: logger.info( - "Found many WARC files directories, combining pages from all " - "of them" + "Found many WARC files directories, only most recently modified one" + " will be used" ) for directory in warc_dirs: logger.info(f"- {directory}") - warc_files = warc_dirs + warc_files = [warc_dirs[-1]] logger.info("") logger.info("----------") logger.info( f"Processing WARC files in/at " - f"{' '.join(str(warc_file) for warc_file in warc_files)}" + f'{" ".join(str(warc_file) for warc_file in warc_files)}' ) warc2zim_args.extend(str(warc_file) for warc_file in warc_files) logger.info(f"Calling warc2zim with these args: {warc2zim_args}") - warc2zim_exit_code = warc2zim(warc2zim_args) - - if known_args.zimit_progress_file: - stats_content = json.loads(zimit_stats_file.read_bytes()) - stats_content["partialZim"] = partial_zim - zimit_stats_file.write_text(json.dumps(stats_content)) - - # also call cancel_cleanup when --keep, even if it is not supposed to be registered, - # so that we will display temporary files location just like in other situations - if warc2zim_exit_code or known_args.keep: - cancel_cleanup() - - return warc2zim_exit_code + return warc2zim(warc2zim_args) def get_cleaned_url(url: str): @@ -1126,92 +621,39 @@ def get_cleaned_url(url: str): return parsed_url.geturl() -def get_crawler_cmd_line(args): - """Build the command line for Browsertrix crawler""" - node_cmd = ["crawl"] +def get_node_cmd_line(args): + node_cmd = ["crawl", "--failOnFailedSeed"] for arg in [ + "workers", + "waitUntil", + "urlFile", "title", "description", - "workers", - "crawlId", - "waitUntil", "depth", "extraHops", - "pageLimit", + "limit", "maxPageLimit", - "pageLoadTimeout", + "timeout", "scopeType", - "scopeIncludeRx", - "scopeExcludeRx", + "include", + "exclude", "collection", "allowHashUrls", - "selectLinks", - "clickSelector", - "blockRules", - "blockMessage", - "blockAds", - "adBlockMessage", - "collection", - "headless", - "driver", - "generateCDX", - "combineWARC", - "rolloverSize", - "generateWACZ", - "logging", - "logLevel", - "logContext", - "logExcludeContext", - "text", - "mobileDevice", + "lang", "userAgent", - # userAgentSuffix (manipulated), "useSitemap", - "sitemapFromDate", - "sitemapToDate", - # statsFilename (manipulated), "behaviors", "behaviorTimeout", - "postLoadDelay", - "pageExtraDelay", - "dedupPolicy", + "delay", "profile", - "screenshot", - "screencastPort", - "screencastRedis", - "warcInfo", - "saveState", - "saveStateInterval", - "saveStateHistory", - "sizeSoftLimit", - "sizeHardLimit", + "sizeLimit", "diskUtilization", - "timeSoftLimit", - "timeHardLimit", + "timeLimit", "healthCheckPort", "overwrite", - "waitOnDone", - "restartsOnError", - "netIdleWait", - "lang", - 
"originOverride", - "logErrorsToRedis", - "writePagesToRedis", - "maxPageRetries", - "failOnFailedSeed", - "failOnFailedLimit", - "failOnInvalidStatus", - "debugAccessBrowser", - "warcPrefix", - "serviceWorker", - "proxyServer", - "dryRun", - "qaSource", - "qaDebugImageDiff", - "sshProxyPrivateKeyFile", - "sshProxyKnownHostsFile", - "customBehaviors", "config", + "logging", + "customBehaviors", ]: value = getattr(args, arg) if arg == "userAgent": @@ -1226,14 +668,7 @@ def get_crawler_cmd_line(args): continue if value is None or (isinstance(value, bool) and value is False): continue - node_cmd.append( - "--" - + ( - "sizeLimit" - if arg in ["sizeSoftLimit", "sizeHardLimit"] - else "timeLimit" if arg in ["timeSoftLimit", "timeHardLimit"] else arg - ) - ) + node_cmd.append("--" + arg) if not isinstance(value, bool): node_cmd.append(str(value)) @@ -1250,7 +685,7 @@ def sigint_handler(*args): # noqa: ARG001 def zimit(): - sys.exit(run(sys.argv[1:])) + run(sys.argv[1:]) signal.signal(signal.SIGINT, sigint_handler) diff --git a/tests-daily/Dockerfile b/tests-daily/Dockerfile index 22d45ef..f6118fe 100644 --- a/tests-daily/Dockerfile +++ b/tests-daily/Dockerfile @@ -1,5 +1,5 @@ # Let's extract kiwix-tools as usual on alpine temporary build container -FROM alpine:3.21 as kiwix-serve +FROM alpine:3.18 as kiwix-serve LABEL org.opencontainers.image.source https://github.com/openzim/kiwix-tools # TARGETPLATFORM is injected by docker build @@ -30,7 +30,7 @@ RUN set -e && \ curl -k -L $url | tar -xz -C /kiwix-serve --strip-components 1 # Build real "workload" container -FROM python:3.13-slim-bookworm +FROM python:3.12-slim-bookworm # Add kiwix-serve COPY --from=kiwix-serve /kiwix-serve /usr/local/bin @@ -70,6 +70,6 @@ RUN rm /tmp/chrome-linux64.zip /tmp/chromedriver-linux64.zip /tmp/versions.json RUN \ python -m pip install --no-cache-dir -U \ pip \ - selenium==4.28.1 \ - pytest==8.3.4 \ + selenium==4.23.0 \ + pytest==8.2.2 \ && mkdir -p /work diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 7e79f52..16ab337 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -1,55 +1,30 @@ import glob import json import os -from pathlib import Path -import pytest from warcio import ArchiveIterator from zimscraperlib.zim import Archive -@pytest.mark.parametrize( - "filename", - [ - pytest.param("/output/tests_en_onepage.zim", id="onepage"), - pytest.param("/output/tests_en_sizesoftlimit.zim", id="sizesoftlimit"), - pytest.param("/output/tests_en_timesoftlimit.zim", id="timesoftlimit"), - ], -) -def test_zim_created(filename): +def test_is_file(): """Ensure ZIM file exists""" - assert os.path.isfile(filename) - - -@pytest.mark.parametrize( - "filename", - [ - pytest.param("/output/tests_en_sizehardlimit.zim", id="sizehardlimit"), - pytest.param("/output/tests_en_timehardlimit.zim", id="timehardlimit"), - ], -) -def test_zim_not_created(filename): - """Ensure ZIM file does not exists""" - assert not os.path.exists(filename) + assert os.path.isfile("/output/isago.zim") def test_zim_main_page(): - """Main page specified, http://website.test.openzim.org/http-return-codes.html, - was a redirect to https + """Main page specified, http://isago.rskg.org/, was a redirect to https Ensure main page is the redirected page""" - main_entry = Archive(Path("/output/tests_en_onepage.zim")).main_entry + main_entry = Archive("/output/isago.zim").main_entry assert main_entry.is_redirect - assert ( - main_entry.get_redirect_entry().path - == 
"website.test.openzim.org/http-return-codes.html" - ) + assert main_entry.get_redirect_entry().path == "isago.rskg.org/" def test_zim_scraper(): - """Check content of scraper metadata""" + """Main page specified, http://isago.rskg.org/, was a redirect to https + Ensure main page is the redirected page""" - zim_fh = Archive(Path("/output/tests_en_onepage.zim")) + zim_fh = Archive("/output/isago.zim") scraper = zim_fh.get_text_metadata("Scraper") assert "zimit " in scraper assert "warc2zim " in scraper @@ -58,28 +33,18 @@ def test_zim_scraper(): def test_files_list(): """Check that expected files are present in the ZIM at proper path""" - zim_fh = Archive(Path("/output/tests_en_onepage.zim")) + zim_fh = Archive("/output/isago.zim") for expected_entry in [ "_zim_static/__wb_module_decl.js", "_zim_static/wombat.js", "_zim_static/wombatSetup.js", - "website.test.openzim.org/http-return-codes.html", - "website.test.openzim.org/200-response", - "website.test.openzim.org/201-response", - "website.test.openzim.org/202-response", - "website.test.openzim.org/301-external-redirect-ok", - "website.test.openzim.org/301-internal-redirect-ok", - "website.test.openzim.org/302-external-redirect-ok", - "website.test.openzim.org/302-internal-redirect-ok", - "website.test.openzim.org/307-external-redirect-ok", - "website.test.openzim.org/307-internal-redirect-ok", - "website.test.openzim.org/308-external-redirect-ok", - "website.test.openzim.org/308-internal-redirect-ok", - "website.test.openzim.org/http-return-codes.html", - "website.test.openzim.org/icons/favicon.ico", - "website.test.openzim.org/icons/site.webmanifest", - "website.test.openzim.org/internal_redirect_target.html", - "www.example.com/", + "isago.rskg.org/", + "isago.rskg.org/a-propos", + "isago.rskg.org/conseils", + "isago.rskg.org/faq", + "isago.rskg.org/static/favicon256.png", + "isago.rskg.org/static/tarifs-isago.pdf", + "maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css", ]: assert zim_fh.get_content(expected_entry) @@ -106,40 +71,24 @@ def test_user_agent(): assert found -def test_stats_output_standard(): - assert json.loads(Path("/output/crawl.json").read_bytes()) == { - "crawled": 17, - "pending": 0, - "pendingPages": [], - "total": 35, - "failed": 18, - "limit": {"max": 0, "hit": False}, - } - - assert json.loads(Path("/output/warc2zim.json").read_bytes()) == { - "written": 8, - "total": 8, - } - - assert json.loads(Path("/output/stats.json").read_bytes()) == { - "done": 8, - "total": 8, - "partialZim": False, - } - - -@pytest.mark.parametrize( - "filename", - [ - pytest.param("/output/stats_sizesoftlimit.json", id="sizesoftlimit"), - pytest.param("/output/stats_timesoftlimit.json", id="timesoftlimit"), - ], -) -def test_stats_output_softlimit(filename): - file = Path(filename) - assert file.exists - content = json.loads(file.read_bytes()) - assert "done" in content - assert "total" in content - assert "partialZim" in content - assert content["partialZim"] +def test_stats_output(): + with open("/output/crawl.json") as fh: + assert json.loads(fh.read()) == { + "crawled": 5, + "pending": 0, + "pendingPages": [], + "total": 5, + "failed": 0, + "limit": {"max": 0, "hit": False}, + } + with open("/output/warc2zim.json") as fh: + assert json.loads(fh.read()) == { + "written": 7, + "total": 7, + } + with open("/output/stats.json") as fh: + assert json.loads(fh.read()) == { + "done": 7, + "total": 7, + "limit": {"max": 0, "hit": False}, + } diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 
d51650d..0000000 --- a/tests/conftest.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest - -from zimit import zimit as app - -""" - cleanup disabled because atexit hooks run at the very end of the Python process - shutdown. By the time cleanup() is called, the logging module has already closed its - file streams. -""" - - -@pytest.fixture(autouse=True) -def disable_zimit_cleanup(monkeypatch): - monkeypatch.setattr(app, "cleanup", lambda: None) diff --git a/tests/data/example-response.warc b/tests/data/example-response.warc deleted file mode 100644 index 143b947..0000000 Binary files a/tests/data/example-response.warc and /dev/null differ diff --git a/tests/test_overwrite.py b/tests/test_overwrite.py deleted file mode 100644 index e41baca..0000000 --- a/tests/test_overwrite.py +++ /dev/null @@ -1,83 +0,0 @@ -import pathlib - -import pytest - -from zimit.zimit import run - -TEST_DATA_DIR = pathlib.Path(__file__).parent / "data" - - -def test_overwrite_flag_behaviour(tmp_path): - zim_output = "overwrite-test.zim" - output_path = tmp_path / zim_output - - # 1st run → creates file - result = run( - [ - "--seeds", - "https://example.com", - "--warcs", - str(TEST_DATA_DIR / "example-response.warc"), - "--output", - str(tmp_path), - "--zim-file", - zim_output, - "--name", - "overwrite-test", - ] - ) - assert result in (None, 100) - assert output_path.exists() - - # 2nd run, no overwrite → should fail - with pytest.raises(SystemExit) as exc: - run( - [ - "--seeds", - "https://example.com", - "--warcs", - str(TEST_DATA_DIR / "example-response.warc"), - "--output", - str(tmp_path), - "--zim-file", - zim_output, - "--name", - "overwrite-test", - ] - ) - assert exc.value.code == 2 - - # 2nd run, no overwrite → should fail - with pytest.raises(SystemExit) as exc: - run( - [ - "--seeds", - "https://example.com", - "--output", - str(tmp_path), - "--zim-file", - zim_output, - "--name", - "overwrite-test", - ] - ) - assert exc.value.code == 2 - - # 3rd run, with overwrite → should succeed - result = run( - [ - "--seeds", - "https://example.com", - "--warcs", - str(TEST_DATA_DIR / "example-response.warc"), - "--output", - str(tmp_path), - "--zim-file", - zim_output, - "--name", - "overwrite-test", - "--overwrite", - ] - ) - assert result in (None, 100) - assert output_path.exists()
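
The `+` side of this patch restores the single-URL command line (`--url`, `--sizeLimit`, `--timeLimit`, `--statsFilename`, `--keep`) in place of the multi-seed interface removed above. The snippet below is a minimal sketch, not part of the patch: it drives that restored interface from Python in the same way the deleted tests/test_overwrite.py drove `run()`; the URL, name and limit values are illustrative assumptions, and actually executing it starts a real crawl, so it needs the zimit image's environment (browsertrix-crawler and warc2zim available).

    from zimit.zimit import run

    # Every flag below appears either on the "+" side of this diff or in the removed
    # tests; the concrete values are made-up examples.
    exit_code = run(
        [
            "--url", "https://example.com",       # crawl seed (cleaned by get_cleaned_url)
            "--name", "example",                  # passed through to warc2zim
            "--zim-file", "example.zim",          # passed through to warc2zim
            "--output", "/output",                # default output directory for the ZIM
            "--sizeLimit", "2147483648",          # save state and exit once crawl size exceeds ~2 GiB
            "--timeLimit", "7200",                # save state and exit after two hours
            "--statsFilename", "stats.json",      # write crawl/warc2zim progress as JSON
            "--keep",                             # keep WARC files after the crawl
        ]
    )

On this side of the diff, run() returns warc2zim's exit code (or EXIT_CODE_WARC2ZIM_CHECK_FAILED when the warc2zim arguments do not validate), and the zimit console entry point simply calls run(sys.argv[1:]), so the same flags map one-to-one onto the command-line tool.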