mirror of
https://github.com/openzim/zimit.git
synced 2025-12-31 12:33:15 +00:00
Compare commits
No commits in common. "main" and "v2.1.5" have entirely different histories.
19 changed files with 278 additions and 2166 deletions
2
.github/workflows/DailyTests.yaml
vendored
2
.github/workflows/DailyTests.yaml
vendored
|
|
@ -18,7 +18,7 @@ jobs:
|
|||
run: docker build -t local-zimit .
|
||||
|
||||
- name: run crawl of test website
|
||||
run: docker run -v $PWD/output:/output local-zimit zimit --seeds https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim
|
||||
run: docker run -v $PWD/output:/output local-zimit zimit --url https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim
|
||||
|
||||
- name: archive ZIM
|
||||
uses: actions/upload-artifact@v4
|
||||
|
|
|
|||
32
.github/workflows/Publish.yml
vendored
32
.github/workflows/Publish.yml
vendored
|
|
@ -5,9 +5,8 @@ on:
|
|||
types: [published]
|
||||
|
||||
jobs:
|
||||
publish-amd64:
|
||||
runs-on: ubuntu-24.04
|
||||
name: "Publish for AMD64"
|
||||
publish:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
|
@ -20,34 +19,11 @@ jobs:
|
|||
latest-on-tag: true
|
||||
restrict-to: openzim/zimit
|
||||
registries: ghcr.io
|
||||
credentials: |
|
||||
credentials:
|
||||
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
|
||||
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
|
||||
repo_description: auto
|
||||
repo_overview: auto
|
||||
platforms: |
|
||||
linux/amd64
|
||||
|
||||
# Disabled for now, see https://github.com/openzim/zimit/issues/463
|
||||
# publish-arm64:
|
||||
# runs-on: ubuntu-24.04
|
||||
# name: "Publish for ARM64"
|
||||
#
|
||||
# steps:
|
||||
# - uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Build and push Docker image
|
||||
# uses: openzim/docker-publish-action@v10
|
||||
# with:
|
||||
# image-name: openzim/zimit
|
||||
# tag-pattern: /^v([0-9.]+)$/
|
||||
# latest-on-tag: true
|
||||
# restrict-to: openzim/zimit
|
||||
# registries: ghcr.io
|
||||
# credentials: |
|
||||
# GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
|
||||
# GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
|
||||
# repo_description: auto
|
||||
# repo_overview: auto
|
||||
# platforms: |
|
||||
# linux/arm64
|
||||
linux/arm64
|
||||
|
|
|
|||
32
.github/workflows/PublishDockerDevImage.yaml
vendored
32
.github/workflows/PublishDockerDevImage.yaml
vendored
|
|
@ -7,9 +7,8 @@ on:
|
|||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
publish-amd64:
|
||||
runs-on: ubuntu-24.04
|
||||
name: "Publish for AMD64"
|
||||
publish:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
|
@ -22,34 +21,11 @@ jobs:
|
|||
latest-on-tag: false
|
||||
restrict-to: openzim/zimit
|
||||
registries: ghcr.io
|
||||
credentials: |
|
||||
credentials:
|
||||
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
|
||||
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
|
||||
repo_description: auto
|
||||
repo_overview: auto
|
||||
platforms: |
|
||||
linux/amd64
|
||||
|
||||
# Disabled for now, see https://github.com/openzim/zimit/issues/463
|
||||
# publish-arm64:
|
||||
# runs-on: ubuntu-24.04-arm
|
||||
# name: "Publish for ARM64"
|
||||
#
|
||||
# steps:
|
||||
# - uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Build and push Docker image
|
||||
# uses: openzim/docker-publish-action@v10
|
||||
# with:
|
||||
# image-name: openzim/zimit
|
||||
# manual-tag: dev
|
||||
# latest-on-tag: false
|
||||
# restrict-to: openzim/zimit
|
||||
# registries: ghcr.io
|
||||
# credentials: |
|
||||
# GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
|
||||
# GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
|
||||
# repo_description: auto
|
||||
# repo_overview: auto
|
||||
# platforms: |
|
||||
# linux/arm64
|
||||
linux/arm64
|
||||
|
|
|
|||
22
.github/workflows/Tests.yaml
vendored
22
.github/workflows/Tests.yaml
vendored
|
|
@ -57,25 +57,13 @@ jobs:
|
|||
uses: actions/checkout@v4
|
||||
|
||||
- name: build image
|
||||
run: docker build -t local-zimit .
|
||||
run: docker build -t zimit .
|
||||
|
||||
- name: ensure help display without issue
|
||||
run: docker run -v $PWD/output:/output local-zimit zimit --help
|
||||
run: docker run -v $PWD/output:/output zimit zimit --help
|
||||
|
||||
- name: run crawl with soft size limit
|
||||
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizesoftlimit.json
|
||||
|
||||
- name: run crawl with hard size limit
|
||||
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizehardlimit.json || true
|
||||
|
||||
- name: run crawl with soft time limit
|
||||
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timesoftlimit.json
|
||||
|
||||
- name: run crawl with hard time limit
|
||||
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timehardlimit.json || true
|
||||
|
||||
- name: run standard crawl
|
||||
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats.json --statsFilename /output/crawl.json --warc2zim-progress-file /output/warc2zim.json --keep
|
||||
- name: run crawl
|
||||
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
|
||||
|
||||
- name: run integration test suite
|
||||
run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
|
||||
run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
|
||||
|
|
|
|||
|
|
@ -1,45 +0,0 @@
|
|||
name: Update ZIMFarm Definitions
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- "offliner-definition.json"
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
version:
|
||||
description: "Version to publish"
|
||||
required: false
|
||||
default: "dev"
|
||||
|
||||
jobs:
|
||||
prepare-json:
|
||||
runs-on: ubuntu-24.04
|
||||
outputs:
|
||||
offliner_definition_b64: ${{ steps.read-json.outputs.offliner_definition_b64 }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- id: read-json
|
||||
run: |
|
||||
if [ ! -f "offliner-definition.json" ]; then
|
||||
echo "File not found!" >&2
|
||||
exit 1
|
||||
fi
|
||||
json_b64=$(base64 -w0 <<< "$(jq -c . offliner-definition.json)")
|
||||
echo "offliner_definition_b64=$json_b64" >> $GITHUB_OUTPUT
|
||||
call-workflow:
|
||||
needs: prepare-json
|
||||
uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main
|
||||
with:
|
||||
version: ${{ github.event_name == 'release' && github.event.release.tag_name || (github.event.inputs.version || 'dev') }}
|
||||
offliner: zimit
|
||||
offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }}
|
||||
secrets:
|
||||
zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }}
|
||||
|
|
@ -2,20 +2,20 @@
|
|||
# See https://pre-commit.com/hooks.html for more hooks
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v5.0.0
|
||||
rev: v4.4.0
|
||||
hooks:
|
||||
- id: trailing-whitespace
|
||||
- id: end-of-file-fixer
|
||||
- repo: https://github.com/psf/black
|
||||
rev: "25.1.0"
|
||||
rev: "24.10.0"
|
||||
hooks:
|
||||
- id: black
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.9.4
|
||||
rev: v0.6.9
|
||||
hooks:
|
||||
- id: ruff
|
||||
- repo: https://github.com/RobertCraigie/pyright-python
|
||||
rev: v1.1.393
|
||||
rev: v1.1.383
|
||||
hooks:
|
||||
- id: pyright
|
||||
name: pyright (system)
|
||||
|
|
|
|||
86
CHANGELOG.md
86
CHANGELOG.md
|
|
@ -5,92 +5,6 @@ All notable changes to this project are documented in this file.
|
|||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- Added `--overwrite` flag to overwrite existing ZIM file if it exists (#399)
|
||||
|
||||
### Changed
|
||||
- Fix issues preventing interrupted crawls from being resumed. (#499)
|
||||
- Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist.
|
||||
- Use all warc_dirs found instead of just the latest so interrupted crawls use all collected pages across runs when an explicit collections directory is not passed.
|
||||
- Don't cleanup an explicitly passed build directory.
|
||||
|
||||
## [3.0.5] - 2024-04-11
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.6.0 (#493)
|
||||
|
||||
## [3.0.4] - 2024-04-04
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.5.10 (#491)
|
||||
|
||||
## [3.0.3] - 2024-02-28
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.5.7 (#483)
|
||||
|
||||
## [3.0.2] - 2024-02-27
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.5.6 (#482)
|
||||
|
||||
## [3.0.1] - 2024-02-24
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.5.4 (#476)
|
||||
|
||||
## [3.0.0] - 2024-02-17
|
||||
|
||||
### Changed
|
||||
|
||||
- Change solution to report partial ZIM to the Zimfarm and other clients (#304)
|
||||
- Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
|
||||
- Add many missing Browsertrix Crawler arguments ; drop default overrides by zimit ; drop `--noMobileDevice` setting (not needed anymore) (#433)
|
||||
- Document all Browsertrix Crawler default arguments values (#416)
|
||||
- Use preferred Browsertrix Crawler arguments names: (part of #471)
|
||||
- `--seeds` instead of `--url`
|
||||
- `--seedFile` instead of `--urlFile`
|
||||
- `--pageLimit` instead of `--limit`
|
||||
- `--pageLoadTimeout` instead of `--timeout`
|
||||
- `--scopeIncludeRx` instead of `--include`
|
||||
- `--scopeExcludeRx` instead of `--exclude`
|
||||
- `--pageExtraDelay` instead of `--delay`
|
||||
- Remove confusion between zimit, warc2zim and crawler stats filenames (part of #471)
|
||||
- `--statsFilename` is now the crawler stats file (since it is the same name, just like other arguments)
|
||||
- `--zimit-progress-file` is now the zimit stats location
|
||||
- `--warc2zim-progress-file` is the warc2zim stats location
|
||||
- all are optional values, if not set and needed temporary files are used
|
||||
|
||||
### Fixed
|
||||
|
||||
- Do not create the ZIM when crawl is incomplete (#444)
|
||||
|
||||
## [2.1.8] - 2024-02-07
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.5.1, Python 3.13 and others (#462 + #464)
|
||||
|
||||
## [2.1.7] - 2024-01-10
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.4.2 (#450)
|
||||
- Upgrade to warc2zim 2.2.0
|
||||
|
||||
## [2.1.6] - 2024-11-07
|
||||
|
||||
### Changed
|
||||
|
||||
- Upgrade to browsertrix crawler 1.3.5 (#426)
|
||||
|
||||
## [2.1.5] - 2024-11-01
|
||||
|
||||
### Changed
|
||||
|
|
|
|||
11
Dockerfile
11
Dockerfile
|
|
@ -1,16 +1,13 @@
|
|||
FROM webrecorder/browsertrix-crawler:1.6.0
|
||||
LABEL org.opencontainers.image.source=https://github.com/openzim/zimit
|
||||
|
||||
# add deadsnakes ppa for latest Python on Ubuntu
|
||||
RUN add-apt-repository ppa:deadsnakes/ppa -y
|
||||
FROM webrecorder/browsertrix-crawler:1.3.4
|
||||
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -qqy --no-install-recommends \
|
||||
libmagic1 \
|
||||
python3.13-venv \
|
||||
python3.12-venv \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
# python setup (in venv not to conflict with browsertrix)
|
||||
&& python3.13 -m venv /app/zimit \
|
||||
&& python3.12 -m venv /app/zimit \
|
||||
# placeholder (default output location)
|
||||
&& mkdir -p /output \
|
||||
# disable chrome upgrade
|
||||
|
|
|
|||
21
README.md
21
README.md
|
|
@ -1,15 +1,15 @@
|
|||
Zimit
|
||||
=====
|
||||
|
||||
Zimit is a scraper allowing to create [ZIM file](https://en.wikipedia.org/wiki/ZIM_(file_format)) from any Web site.
|
||||
Zimit is a scraper allowing to create ZIM file from any Web site.
|
||||
|
||||
[](https://www.codefactor.io/repository/github/openzim/zimit)
|
||||
[](https://www.gnu.org/licenses/gpl-3.0)
|
||||
[](https://ghcr.io/openzim/zimit)
|
||||
[](https://ghcr.io/openzim/zimit)
|
||||
|
||||
Zimit adheres to openZIM's [Contribution Guidelines](https://github.com/openzim/overview/wiki/Contributing).
|
||||
|
||||
Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/blob/main/docs/Policy.md) **v1.0.1**.
|
||||
Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/docs/Policy.md) **v1.0.1**.
|
||||
|
||||
Capabilities and known limitations
|
||||
--------------------
|
||||
|
|
@ -38,23 +38,24 @@ Usage
|
|||
|
||||
`zimit` is intended to be run in Docker. Docker image is published at https://github.com/orgs/openzim/packages/container/package/zimit.
|
||||
|
||||
The image accepts the following parameters, **as well as any of the [Browsertrix crawler](https://crawler.docs.browsertrix.com/user-guide/cli-options/) and [warc2zim](https://github.com/openzim/warc2zim) ones**:
|
||||
The image accepts the following parameters, **as well as any of the [warc2zim](https://github.com/openzim/warc2zim) ones**; useful for setting metadata, for instance:
|
||||
|
||||
- Required: `--seeds URL` - the url to start crawling from ; multiple URLs can be separated by a comma (even if **usually not needed**, these are just the **seeds** of the crawl) ; first seed URL is used as ZIM homepage
|
||||
- Required: `--url URL` - the url to be crawled
|
||||
- Required: `--name` - Name of ZIM file
|
||||
- `--output` - output directory (defaults to `/output`)
|
||||
- `--pageLimit U` - Limit capture to at most U URLs
|
||||
- `--scopeExcludeRx <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--scopeExcludeRx="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded.
|
||||
- `--limit U` - Limit capture to at most U URLs
|
||||
- `--behaviors` - Control which browsertrix behaviors are run (defaults to `autoplay,autofetch,siteSpecific`, adding `autoscroll` to the list is possible to automatically scroll the pages and fetch resources which are lazy loaded)
|
||||
- `--exclude <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded.
|
||||
- `--workers N` - number of crawl workers to be run in parallel
|
||||
- `--waitUntil` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--waitUntil domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
|
||||
- `--keep` - in case of failure, WARC files and other temporary files (which are stored as a subfolder of output directory) are always kept, otherwise they are automatically deleted. Use this flag to always keep WARC files, even in case of success.
|
||||
- `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
|
||||
- `--keep` - if set, keep the WARC files in a temp directory inside the output directory
|
||||
|
||||
Example command:
|
||||
|
||||
```bash
|
||||
docker run ghcr.io/openzim/zimit zimit --help
|
||||
docker run ghcr.io/openzim/zimit warc2zim --help
|
||||
docker run -v /output:/output ghcr.io/openzim/zimit zimit --seeds URL --name myzimfile
|
||||
docker run -v /output:/output ghcr.io/openzim/zimit zimit --url URL --name myzimfile
|
||||
```
|
||||
|
||||
**Note**: Image automatically filters out a large number of ads by using the 3 blocklists from [anudeepND](https://github.com/anudeepND/blacklist). If you don't want this filtering, disable the image's entrypoint in your container (`docker run --entrypoint="" ghcr.io/openzim/zimit ...`).
|
||||
|
|
|
|||
|
|
@ -1,981 +0,0 @@
|
|||
{
|
||||
"offliner_id": "zimit",
|
||||
"stdOutput": true,
|
||||
"stdStats": "zimit-progress-file",
|
||||
"flags": {
|
||||
"seeds": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Seeds",
|
||||
"description": "The seed URL(s) to start crawling from. Multiple seed URLs must be separated by a comma (usually not needed, these are just the crawl seeds). First seed URL is used as ZIM homepage"
|
||||
},
|
||||
"seed_file": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Seed File",
|
||||
"description": "If set, read a list of seed urls, one per line. HTTPS URL to an online file."
|
||||
},
|
||||
"lang": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Browser Language",
|
||||
"description": "If set, sets the language used by the browser, should be ISO 639 language[-country] code, e.g. `en` or `en-GB`"
|
||||
},
|
||||
"title": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Title",
|
||||
"description": "Custom title for your ZIM. Defaults to title of main page",
|
||||
"minLength": 1,
|
||||
"maxLength": 30
|
||||
},
|
||||
"description": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Description",
|
||||
"description": "Description for ZIM",
|
||||
"minLength": 1,
|
||||
"maxLength": 80
|
||||
},
|
||||
"favicon": {
|
||||
"type": "blob",
|
||||
"kind": "image",
|
||||
"required": false,
|
||||
"title": "Illustration",
|
||||
"description": "URL for Illustration. "
|
||||
},
|
||||
"tags": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "ZIM Tags",
|
||||
"description": "Single string with individual tags separated by a semicolon."
|
||||
},
|
||||
"creator": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Creator",
|
||||
"description": "Name of content creator"
|
||||
},
|
||||
"publisher": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Publisher",
|
||||
"isPublisher": true,
|
||||
"description": "Custom publisher name (ZIM metadata). openZIM otherwise"
|
||||
},
|
||||
"source": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Source",
|
||||
"description": "Source name/URL of content"
|
||||
},
|
||||
"workers": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Workers",
|
||||
"description": "The number of workers to run in parallel. Defaults to 1",
|
||||
"min": 1
|
||||
},
|
||||
"wait_until": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "WaitUntil",
|
||||
"description": "Puppeteer page.goto() condition to wait for before continuing. One of load, domcontentloaded, networkidle0 or networkidle2, or a comma-separated combination of those. Default is load,networkidle2"
|
||||
},
|
||||
"extra_hops": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Extra Hops",
|
||||
"description": "Number of extra 'hops' to follow, beyond the current scope. Default is 0",
|
||||
"min": 0
|
||||
},
|
||||
"page_limit": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Page Limit",
|
||||
"description": "Limit crawl to this number of pages. Default is 0 (no-limit).",
|
||||
"min": 0
|
||||
},
|
||||
"max_page_limit": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Max Page Limit",
|
||||
"description": "Maximum pages to crawl, overriding pageLimit if both are set. Default is 0 (no-limit)",
|
||||
"min": 0
|
||||
},
|
||||
"page_load_timeout": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Page Load Timeout",
|
||||
"description": "Timeout for each page to load (in seconds). Default is 90",
|
||||
"min": 0
|
||||
},
|
||||
"scope_type": {
|
||||
"type": "string-enum",
|
||||
"required": false,
|
||||
"title": "Scope Type",
|
||||
"description": "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom if scopeIncludeRx is set, prefix otherwise.",
|
||||
"choices": [
|
||||
{
|
||||
"title": "Page",
|
||||
"value": "page"
|
||||
},
|
||||
{
|
||||
"title": "Page SPA",
|
||||
"value": "page-spa"
|
||||
},
|
||||
{
|
||||
"title": "Prefix",
|
||||
"value": "prefix"
|
||||
},
|
||||
{
|
||||
"title": "Host",
|
||||
"value": "host"
|
||||
},
|
||||
{
|
||||
"title": "Domain",
|
||||
"value": "domain"
|
||||
},
|
||||
{
|
||||
"title": "Any",
|
||||
"value": "any"
|
||||
},
|
||||
{
|
||||
"title": "Custom",
|
||||
"value": "custom"
|
||||
}
|
||||
]
|
||||
},
|
||||
"scope_include_rx": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Scope Include Regex",
|
||||
"description": "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of seed)"
|
||||
},
|
||||
"scope_exclude_rx": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Scope Exclude Regex",
|
||||
"description": "Regex of page URLs that should be excluded from the crawl"
|
||||
},
|
||||
"allow_hash_urls": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Allow Hashtag URLs",
|
||||
"description": "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content"
|
||||
},
|
||||
"mobile_device": {
|
||||
"type": "string-enum",
|
||||
"required": false,
|
||||
"title": "As device",
|
||||
"description": "Device to crawl as. See Puppeteer's Device.ts for a list",
|
||||
"choices": [
|
||||
{
|
||||
"title": "Blackberry Playbook",
|
||||
"value": "Blackberry PlayBook"
|
||||
},
|
||||
{
|
||||
"title": "Blackberry Playbook Landscape",
|
||||
"value": "Blackberry PlayBook landscape"
|
||||
},
|
||||
{
|
||||
"title": "Blackberry Z30",
|
||||
"value": "BlackBerry Z30"
|
||||
},
|
||||
{
|
||||
"title": "Blackberry Z30 Landscape",
|
||||
"value": "BlackBerry Z30 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy Note 3",
|
||||
"value": "Galaxy Note 3"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy Note 3 Landscape",
|
||||
"value": "Galaxy Note 3 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy Note II",
|
||||
"value": "Galaxy Note II"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy Note II Landscape",
|
||||
"value": "Galaxy Note II landscape"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S III",
|
||||
"value": "Galaxy S III"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S III Landscape",
|
||||
"value": "Galaxy S III landscape"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S5",
|
||||
"value": "Galaxy S5"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S5 Landscape",
|
||||
"value": "Galaxy S5 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S8",
|
||||
"value": "Galaxy S8"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S8 Landscape",
|
||||
"value": "Galaxy S8 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S9 Plus",
|
||||
"value": "Galaxy S9+"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy S9 Plus Landscape",
|
||||
"value": "Galaxy S9+ landscape"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy Tab S4",
|
||||
"value": "Galaxy Tab S4"
|
||||
},
|
||||
{
|
||||
"title": "Galaxy Tab S4 Landscape",
|
||||
"value": "Galaxy Tab S4 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPad",
|
||||
"value": "iPad"
|
||||
},
|
||||
{
|
||||
"title": "iPad Landscape",
|
||||
"value": "iPad landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPad Gen 6",
|
||||
"value": "iPad (gen 6)"
|
||||
},
|
||||
{
|
||||
"title": "iPad Gen 6 Landscape",
|
||||
"value": "iPad (gen 6) landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPad Gen 7",
|
||||
"value": "iPad (gen 7)"
|
||||
},
|
||||
{
|
||||
"title": "iPad Gen 7 Landscape",
|
||||
"value": "iPad (gen 7) landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPad Mini",
|
||||
"value": "iPad Mini"
|
||||
},
|
||||
{
|
||||
"title": "iPad Mini Landscape",
|
||||
"value": "iPad Mini landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPad Pro",
|
||||
"value": "iPad Pro"
|
||||
},
|
||||
{
|
||||
"title": "iPad Pro Landscape",
|
||||
"value": "iPad Pro landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPad Pro 11",
|
||||
"value": "iPad Pro 11"
|
||||
},
|
||||
{
|
||||
"title": "iPad Pro 11 Landscape",
|
||||
"value": "iPad Pro 11 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 4",
|
||||
"value": "iPhone 4"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 4 Landscape",
|
||||
"value": "iPhone 4 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 5",
|
||||
"value": "iPhone 5"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 5 Landscape",
|
||||
"value": "iPhone 5 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 6",
|
||||
"value": "iPhone 6"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 6 Landscape",
|
||||
"value": "iPhone 6 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 6 Plus",
|
||||
"value": "iPhone 6 Plus"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 6 Plus Landscape",
|
||||
"value": "iPhone 6 Plus landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 7",
|
||||
"value": "iPhone 7"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 7 Landscape",
|
||||
"value": "iPhone 7 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 7 Plus",
|
||||
"value": "iPhone 7 Plus"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 7 Plus Landscape",
|
||||
"value": "iPhone 7 Plus landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 8",
|
||||
"value": "iPhone 8"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 8 Landscape",
|
||||
"value": "iPhone 8 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 8 Plus",
|
||||
"value": "iPhone 8 Plus"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 8 Plus Landscape",
|
||||
"value": "iPhone 8 Plus landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone SE",
|
||||
"value": "iPhone SE"
|
||||
},
|
||||
{
|
||||
"title": "iPhone SE Landscape",
|
||||
"value": "iPhone SE landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone X",
|
||||
"value": "iPhone X"
|
||||
},
|
||||
{
|
||||
"title": "iPhone X Landscape",
|
||||
"value": "iPhone X landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone XR",
|
||||
"value": "iPhone XR"
|
||||
},
|
||||
{
|
||||
"title": "iPhone XR Landscape",
|
||||
"value": "iPhone XR landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 11",
|
||||
"value": "iPhone 11"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 11 Landscape",
|
||||
"value": "iPhone 11 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 11 Pro",
|
||||
"value": "iPhone 11 Pro"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 11 Pro Landscape",
|
||||
"value": "iPhone 11 Pro landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 11 Pro Max",
|
||||
"value": "iPhone 11 Pro Max"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 11 Pro Max Landscape",
|
||||
"value": "iPhone 11 Pro Max landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12",
|
||||
"value": "iPhone 12"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12 Landscape",
|
||||
"value": "iPhone 12 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12 Pro",
|
||||
"value": "iPhone 12 Pro"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12 Pro Landscape",
|
||||
"value": "iPhone 12 Pro landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12 Pro Max",
|
||||
"value": "iPhone 12 Pro Max"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12 Pro Max Landscape",
|
||||
"value": "iPhone 12 Pro Max landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12 Mini",
|
||||
"value": "iPhone 12 Mini"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 12 Mini Landscape",
|
||||
"value": "iPhone 12 Mini landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13",
|
||||
"value": "iPhone 13"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13 Landscape",
|
||||
"value": "iPhone 13 landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13 Pro",
|
||||
"value": "iPhone 13 Pro"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13 Pro Landscape",
|
||||
"value": "iPhone 13 Pro landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13 Pro Max",
|
||||
"value": "iPhone 13 Pro Max"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13 Pro Max Landscape",
|
||||
"value": "iPhone 13 Pro Max landscape"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13 Mini",
|
||||
"value": "iPhone 13 Mini"
|
||||
},
|
||||
{
|
||||
"title": "iPhone 13 Mini Landscape",
|
||||
"value": "iPhone 13 Mini landscape"
|
||||
},
|
||||
{
|
||||
"title": "Jio Phone 2",
|
||||
"value": "JioPhone 2"
|
||||
},
|
||||
{
|
||||
"title": "Jio Phone 2 Landscape",
|
||||
"value": "JioPhone 2 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Kindle Fire HDX",
|
||||
"value": "Kindle Fire HDX"
|
||||
},
|
||||
{
|
||||
"title": "Kindle Fire HDX Landscape",
|
||||
"value": "Kindle Fire HDX landscape"
|
||||
},
|
||||
{
|
||||
"title": "LG Optimus L70",
|
||||
"value": "LG Optimus L70"
|
||||
},
|
||||
{
|
||||
"title": "LG Optimus L70 Landscape",
|
||||
"value": "LG Optimus L70 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Microsoft Lumia 550",
|
||||
"value": "Microsoft Lumia 550"
|
||||
},
|
||||
{
|
||||
"title": "Microsoft Lumia 950",
|
||||
"value": "Microsoft Lumia 950"
|
||||
},
|
||||
{
|
||||
"title": "Microsoft Lumia 950 Landscape",
|
||||
"value": "Microsoft Lumia 950 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 10",
|
||||
"value": "Nexus 10"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 10 Landscape",
|
||||
"value": "Nexus 10 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 4",
|
||||
"value": "Nexus 4"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 4 Landscape",
|
||||
"value": "Nexus 4 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 5",
|
||||
"value": "Nexus 5"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 5 Landscape",
|
||||
"value": "Nexus 5 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 5X",
|
||||
"value": "Nexus 5X"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 5X Landscape",
|
||||
"value": "Nexus 5X landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 6",
|
||||
"value": "Nexus 6"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 6 Landscape",
|
||||
"value": "Nexus 6 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 6P",
|
||||
"value": "Nexus 6P"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 6P Landscape",
|
||||
"value": "Nexus 6P landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 7",
|
||||
"value": "Nexus 7"
|
||||
},
|
||||
{
|
||||
"title": "Nexus 7 Landscape",
|
||||
"value": "Nexus 7 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nokia Lumia 520",
|
||||
"value": "Nokia Lumia 520"
|
||||
},
|
||||
{
|
||||
"title": "Nokia Lumia 520 Landscape",
|
||||
"value": "Nokia Lumia 520 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Nokia N9",
|
||||
"value": "Nokia N9"
|
||||
},
|
||||
{
|
||||
"title": "Nokia N9 Landscape",
|
||||
"value": "Nokia N9 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 2",
|
||||
"value": "Pixel 2"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 2 Landscape",
|
||||
"value": "Pixel 2 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 2 XL",
|
||||
"value": "Pixel 2 XL"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 2 XL Landscape",
|
||||
"value": "Pixel 2 XL landscape"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 3",
|
||||
"value": "Pixel 3"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 3 Landscape",
|
||||
"value": "Pixel 3 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 4",
|
||||
"value": "Pixel 4"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 4 Landscape",
|
||||
"value": "Pixel 4 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 4A 5G",
|
||||
"value": "Pixel 4a (5G)"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 4A 5G Landscape",
|
||||
"value": "Pixel 4a (5G) landscape"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 5",
|
||||
"value": "Pixel 5"
|
||||
},
|
||||
{
|
||||
"title": "Pixel 5 Landscape",
|
||||
"value": "Pixel 5 landscape"
|
||||
},
|
||||
{
|
||||
"title": "Moto G4",
|
||||
"value": "Moto G4"
|
||||
},
|
||||
{
|
||||
"title": "Moto G4 Landscape",
|
||||
"value": "Moto G4 landscape"
|
||||
}
|
||||
]
|
||||
},
|
||||
"select_links": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Select Links",
|
||||
"description": "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]"
|
||||
},
|
||||
"click_selector": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Click Selector",
|
||||
"description": "Selector for elements to click when using the autoclick behavior. Default is 'a'"
|
||||
},
|
||||
"block_rules": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Block Rules",
|
||||
"description": "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe"
|
||||
},
|
||||
"block_message": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Block Message",
|
||||
"description": "If specified, when a URL is blocked, a record with this error message is added instead"
|
||||
},
|
||||
"block_ads": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Block Ads",
|
||||
"description": "If set, block advertisements from being loaded (based on Steven Black's blocklist). Note that some bad domains are also blocked by zimit configuration even if this option is not set."
|
||||
},
|
||||
"ad_block_message": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Ads Block Message",
|
||||
"description": "If specified, when an ad is blocked, a record with this error message is added instead"
|
||||
},
|
||||
"user_agent": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "User Agent",
|
||||
"description": "Override user-agent with the specified string"
|
||||
},
|
||||
"user_agent_suffix": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "User Agent Suffix",
|
||||
"description": "Append suffix to existing browser user-agent. Defaults to +Zimit"
|
||||
},
|
||||
"use_sitemap": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Sitemap URL",
|
||||
"description": "Use as sitemap to get additional URLs for the crawl (usually at /sitemap.xml)"
|
||||
},
|
||||
"sitemap_from_date": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Sitemap From Date",
|
||||
"description": "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)"
|
||||
},
|
||||
"sitemap_to_date": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Sitemap To Date",
|
||||
"description": "If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)"
|
||||
},
|
||||
"behavior_timeout": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Behavior Timeout",
|
||||
"description": "If >0, maximum time (in seconds) that in-page behaviors may run on each page. If 0, a behavior can run until it finishes. Default is 90.",
|
||||
"min": 0
|
||||
},
|
||||
"post_load_delay": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Post Load Delay",
|
||||
"description": "If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors. Default is 0.",
|
||||
"min": 0
|
||||
},
|
||||
"page_extra_delay": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Page Extra Delay",
|
||||
"description": "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page. Default is 0.",
|
||||
"min": 0
|
||||
},
|
||||
"dedup_policy": {
|
||||
"type": "string-enum",
|
||||
"required": false,
|
||||
"title": "Dedup Policy",
|
||||
"description": "Deduplication policy. One of skip, revisit or keep. Default is skip",
|
||||
"choices": [
|
||||
{
|
||||
"title": "Skip",
|
||||
"value": "skip"
|
||||
},
|
||||
{
|
||||
"title": "Revisit",
|
||||
"value": "revisit"
|
||||
},
|
||||
{
|
||||
"title": "Keep",
|
||||
"value": "keep"
|
||||
}
|
||||
]
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Screenshot",
|
||||
"description": "Screenshot options for crawler. One of view, thumbnail, fullPage, fullPageFinal or a comma-separated combination of those."
|
||||
},
|
||||
"size_soft_limit": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Size Soft Limit",
|
||||
"description": "If set, save crawl state and stop crawl if WARC size exceeds this value. ZIM will still be created.",
|
||||
"min": 0
|
||||
},
|
||||
"size_hard_limit": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Size Hard Limit",
|
||||
"description": "If set, exit crawler and fail the scraper immediately if WARC size exceeds this value",
|
||||
"min": 0
|
||||
},
|
||||
"disk_utilization": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Disk Utilization",
|
||||
"description": "Save state and exit if disk utilization exceeds this percentage value. Default (if not set) is 90%. Set to 0 to disable disk utilization check.",
|
||||
"min": 0
|
||||
},
|
||||
"time_soft_limit": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Time Soft Limit",
|
||||
"description": "If set, save crawl state and stop crawl if WARC(s) creation takes longer than this value, in seconds. ZIM will still be created.",
|
||||
"min": 0
|
||||
},
|
||||
"time_hard_limit": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Time Hard Limit",
|
||||
"description": "If set, exit crawler and fail the scraper immediately if WARC(s) creation takes longer than this value, in seconds",
|
||||
"min": 0
|
||||
},
|
||||
"net_idle_wait": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Net Idle Wait",
|
||||
"description": "If set, wait for network idle after page load and after behaviors are done (in seconds). If -1 (default), determine based on scope."
|
||||
},
|
||||
"origin_override": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Origin Override",
|
||||
"description": "If set, will redirect requests from each origin in key to origin in the value, eg. https://host:port=http://alt-host:alt-port."
|
||||
},
|
||||
"max_page_retries": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Max Page Retries",
|
||||
"description": "If set, number of times to retry a page that failed to load before page is considered to have failed. Default is 2.",
|
||||
"min": 0
|
||||
},
|
||||
"fail_on_failed_seed": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Fail on failed seed",
|
||||
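"description": "If set, crawler will fail with exit code 1 if any seed fails. When combined with --failOnInvalidStatus, will result in crawl failing with exit code 1 if any seed has a 4xx/5xx response"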
|
||||
},
|
||||
"fail_on_invalid_status": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Fail on invalid status",
|
||||
"description": "If set, will treat pages with 4xx or 5xx response as failures. When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses"
|
||||
},
|
||||
"fail_on_failed_limit": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Fail on failed - Limit",
|
||||
"description": "If set, save state and exit if number of failed pages exceeds this value.",
|
||||
"min": 0
|
||||
},
|
||||
"warcs": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "WARC files",
|
||||
"description": "Comma-separated list of WARC files to use as input."
|
||||
},
|
||||
"verbose": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Verbose mode",
|
||||
"description": "Whether to display additional logs"
|
||||
},
|
||||
"keep": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Keep",
|
||||
"description": "Should be True. Developer option: must be True if we want to keep the WARC files for artifacts archiving.",
|
||||
"default": true
|
||||
},
|
||||
"output": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Output folder",
|
||||
"description": "Output folder for ZIM file(s). Leave it as `/output`",
|
||||
"pattern": "^/output$"
|
||||
},
|
||||
"admin_email": {
|
||||
"type": "email",
|
||||
"required": false,
|
||||
"title": "Admin Email",
|
||||
"description": "Admin Email for crawler: used in UserAgent so website admin can contact us",
|
||||
"default": "contact+zimfarm@kiwix.org"
|
||||
},
|
||||
"profile": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Browser profile",
|
||||
"description": "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory for Browsertrix crawler."
|
||||
},
|
||||
"behaviors": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Behaviors",
|
||||
"description": "Which background behaviors to enable on each page. Defaults to autoplay,autofetch,siteSpecific."
|
||||
},
|
||||
"depth": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Depth",
|
||||
"description": "The depth of the crawl for all seeds. Default is -1 (infinite).",
|
||||
"min": -1
|
||||
},
|
||||
"zim_lang": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "ZIM Language",
|
||||
"description": "Language metadata of ZIM (warc2zim --lang param). ISO-639-3 code. Retrieved from homepage if found, fallback to `eng`",
|
||||
"alias": "zim-lang",
|
||||
"customValidator": "language_code"
|
||||
},
|
||||
"long_description": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Long description",
|
||||
"description": "Optional long description for your ZIM",
|
||||
"minLength": 1,
|
||||
"maxLength": 4000,
|
||||
"alias": "long-description"
|
||||
},
|
||||
"custom_css": {
|
||||
"type": "blob",
|
||||
"kind": "css",
|
||||
"required": false,
|
||||
"title": "Custom CSS",
|
||||
"description": "URL to a CSS file to inject into pages",
|
||||
"alias": "custom-css"
|
||||
},
|
||||
"charsets_to_try": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Charsets to try",
|
||||
"description": "List of charsets to try when decoding content whose charset is not found",
|
||||
"alias": "charsets-to-try"
|
||||
},
|
||||
"ignore_content_header_charsets": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Ignore Content Header Charsets",
|
||||
"description": "Ignore the charsets specified in content headers - first bytes - typically because they are wrong.",
|
||||
"alias": "ignore-content-header-charsets"
|
||||
},
|
||||
"content_header_bytes_length": {
|
||||
"type": "integer",
|
||||
"required": false,
|
||||
"title": "Content Header Bytes Length",
|
||||
"description": "How many bytes to consider when searching for content charsets in header (default is 1024).",
|
||||
"alias": "content-header-bytes-length",
|
||||
"min": 0
|
||||
},
|
||||
"ignore_http_header_charsets": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Ignore HTTP Header Charsets",
|
||||
"description": "Ignore the charsets specified in HTTP `Content-Type` headers, typically because they are wrong.",
|
||||
"alias": "ignore-http-header-charsets"
|
||||
},
|
||||
"encoding_aliases": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Encoding Aliases",
|
||||
"description": "List of encoding/charset aliases to decode WARC content. Aliases are used when the encoding specified in upstream server exists in Python under a different name. This parameter is a single string; multiple values are separated by a comma, like in alias1=encoding1,alias2=encoding2.",
|
||||
"alias": "encoding-aliases"
|
||||
},
|
||||
"custom_behaviors": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Custom Behaviors",
|
||||
"description": "JS code for custom behaviors to customize crawler. Single string with individual JS files URL/path separated by a comma.",
|
||||
"alias": "custom-behaviours"
|
||||
},
|
||||
"zimit_progress_file": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "Zimit Progress File",
|
||||
"description": "Scraping progress file. Leave it as `/output/task_progress.json`",
|
||||
"alias": "zimit-progress-file",
|
||||
"pattern": "^/output/task_progress\\.json$"
|
||||
},
|
||||
"replay_viewer_source": {
|
||||
"type": "url",
|
||||
"required": false,
|
||||
"title": "Replay Viewer Source",
|
||||
"description": "URL from which to load the ReplayWeb.page replay viewer",
|
||||
"alias": "replay-viewer-source"
|
||||
},
|
||||
"zim_file": {
|
||||
"type": "string",
|
||||
"required": false,
|
||||
"title": "ZIM filename",
|
||||
"description": "ZIM file name (based on --name if not provided). Include {period} to insert date period dynamically",
|
||||
"alias": "zim-file",
|
||||
"pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+_)([a-z0-9\\-\\.]+_|)([\\d]{4}-[\\d]{2}|\\{period\\}).zim$",
|
||||
"relaxedPattern": "^[A-Za-z0-9._-]+$"
|
||||
},
|
||||
"name": {
|
||||
"type": "string",
|
||||
"required": true,
|
||||
"title": "ZIM name",
|
||||
"description": "Name of the ZIM.",
|
||||
"alias": "name",
|
||||
"pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$",
|
||||
"relaxedPattern": "^[A-Za-z0-9._-]+$"
|
||||
},
|
||||
"overwrite": {
|
||||
"type": "boolean",
|
||||
"required": false,
|
||||
"title": "Overwrite",
|
||||
"description": "Whether to overwrite existing ZIM file if it exists"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,17 +1,17 @@
|
|||
[build-system]
|
||||
requires = ["hatchling", "hatch-openzim"]
|
||||
requires = ["hatchling", "hatch-openzim==0.2.0"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "zimit"
|
||||
requires-python = ">=3.13,<3.14"
|
||||
requires-python = ">=3.12,<3.13"
|
||||
description = "Make ZIM file from any website through crawling"
|
||||
readme = "README.md"
|
||||
dependencies = [
|
||||
"requests==2.32.3",
|
||||
"inotify==0.2.10",
|
||||
"tld==0.13",
|
||||
"warc2zim @ git+https://github.com/openzim/warc2zim@main",
|
||||
"warc2zim==2.1.3",
|
||||
]
|
||||
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
|
||||
|
||||
|
|
@ -26,20 +26,20 @@ scripts = [
|
|||
"invoke==2.2.0",
|
||||
]
|
||||
lint = [
|
||||
"black==25.1.0",
|
||||
"ruff==0.9.4",
|
||||
"black==24.10.0",
|
||||
"ruff==0.6.9",
|
||||
]
|
||||
check = [
|
||||
"pyright==1.1.393",
|
||||
"pyright==1.1.383",
|
||||
]
|
||||
test = [
|
||||
"pytest==8.3.4",
|
||||
"coverage==7.6.10",
|
||||
"pytest==8.3.3",
|
||||
"coverage==7.6.1",
|
||||
]
|
||||
dev = [
|
||||
"pre-commit==4.1.0",
|
||||
"debugpy==1.8.12",
|
||||
"selenium==4.28.1", # used in daily tests, convenient for dev purpose (autocompletion)
|
||||
"pre-commit==4.0.0",
|
||||
"debugpy==1.8.6",
|
||||
"selenium==4.25.0", # used in daily tests, convenient for dev purpose (autocompletion)
|
||||
"zimit[scripts]",
|
||||
"zimit[lint]",
|
||||
"zimit[test]",
|
||||
|
|
@ -95,10 +95,10 @@ all = "inv checkall --args '{args}'"
|
|||
|
||||
[tool.black]
|
||||
line-length = 88
|
||||
target-version = ['py313']
|
||||
target-version = ['py312']
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py313"
|
||||
target-version = "py312"
|
||||
line-length = 88
|
||||
src = ["src"]
|
||||
|
||||
|
|
@ -221,5 +221,5 @@ exclude_lines = [
|
|||
include = ["src", "tests", "tasks.py"]
|
||||
exclude = [".env/**", ".venv/**"]
|
||||
extraPaths = ["src"]
|
||||
pythonVersion = "3.13"
|
||||
pythonVersion = "3.12"
|
||||
typeCheckingMode="basic"
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
__version__ = "3.0.6-dev0"
|
||||
__version__ = "2.1.5"
|
||||
|
|
|
|||
|
|
@ -3,8 +3,7 @@ import logging
|
|||
from zimscraperlib.logging import getLogger
|
||||
|
||||
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
|
||||
EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT = 14
|
||||
EXIT_CODE_CRAWLER_TIME_LIMIT_HIT = 15
|
||||
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
|
||||
NORMAL_WARC2ZIM_EXIT_CODE = 100
|
||||
REQUESTS_TIMEOUT = 10
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -1,5 +1,5 @@
|
|||
# Let's extract kiwix-tools as usual on alpine temporary build container
|
||||
FROM alpine:3.21 as kiwix-serve
|
||||
FROM alpine:3.18 as kiwix-serve
|
||||
LABEL org.opencontainers.image.source https://github.com/openzim/kiwix-tools
|
||||
|
||||
# TARGETPLATFORM is injected by docker build
|
||||
|
|
@ -30,7 +30,7 @@ RUN set -e && \
|
|||
curl -k -L $url | tar -xz -C /kiwix-serve --strip-components 1
|
||||
|
||||
# Build real "workload" container
|
||||
FROM python:3.13-slim-bookworm
|
||||
FROM python:3.12-slim-bookworm
|
||||
|
||||
# Add kiwix-serve
|
||||
COPY --from=kiwix-serve /kiwix-serve /usr/local/bin
|
||||
|
|
@ -70,6 +70,6 @@ RUN rm /tmp/chrome-linux64.zip /tmp/chromedriver-linux64.zip /tmp/versions.json
|
|||
RUN \
|
||||
python -m pip install --no-cache-dir -U \
|
||||
pip \
|
||||
selenium==4.28.1 \
|
||||
pytest==8.3.4 \
|
||||
selenium==4.23.0 \
|
||||
pytest==8.2.2 \
|
||||
&& mkdir -p /work
|
||||
|
|
|
|||
|
|
@ -1,55 +1,30 @@
|
|||
import glob
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from warcio import ArchiveIterator
|
||||
from zimscraperlib.zim import Archive
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
[
|
||||
pytest.param("/output/tests_en_onepage.zim", id="onepage"),
|
||||
pytest.param("/output/tests_en_sizesoftlimit.zim", id="sizesoftlimit"),
|
||||
pytest.param("/output/tests_en_timesoftlimit.zim", id="timesoftlimit"),
|
||||
],
|
||||
)
|
||||
def test_zim_created(filename):
|
||||
def test_is_file():
|
||||
"""Ensure ZIM file exists"""
|
||||
assert os.path.isfile(filename)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "filename",
    [
        pytest.param("/output/tests_en_sizehardlimit.zim", id="sizehardlimit"),
        pytest.param("/output/tests_en_timehardlimit.zim", id="timehardlimit"),
    ],
)
def test_zim_not_created(filename):
    """Ensure no ZIM file exists at ``filename`` (hard-limit scenarios)."""
    zim_is_present = os.path.exists(filename)
    assert not zim_is_present
|
||||
assert os.path.isfile("/output/isago.zim")
|
||||
|
||||
|
||||
def test_zim_main_page():
|
||||
"""Main page specified, http://website.test.openzim.org/http-return-codes.html,
|
||||
was a redirect to https
|
||||
"""Main page specified, http://isago.rskg.org/, was a redirect to https
|
||||
Ensure main page is the redirected page"""
|
||||
|
||||
main_entry = Archive(Path("/output/tests_en_onepage.zim")).main_entry
|
||||
main_entry = Archive("/output/isago.zim").main_entry
|
||||
assert main_entry.is_redirect
|
||||
assert (
|
||||
main_entry.get_redirect_entry().path
|
||||
== "website.test.openzim.org/http-return-codes.html"
|
||||
)
|
||||
assert main_entry.get_redirect_entry().path == "isago.rskg.org/"
|
||||
|
||||
|
||||
def test_zim_scraper():
|
||||
"""Check content of scraper metadata"""
|
||||
"""Main page specified, http://isago.rskg.org/, was a redirect to https
|
||||
Ensure main page is the redirected page"""
|
||||
|
||||
zim_fh = Archive(Path("/output/tests_en_onepage.zim"))
|
||||
zim_fh = Archive("/output/isago.zim")
|
||||
scraper = zim_fh.get_text_metadata("Scraper")
|
||||
assert "zimit " in scraper
|
||||
assert "warc2zim " in scraper
|
||||
|
|
@ -58,28 +33,18 @@ def test_zim_scraper():
|
|||
|
||||
def test_files_list():
|
||||
"""Check that expected files are present in the ZIM at proper path"""
|
||||
zim_fh = Archive(Path("/output/tests_en_onepage.zim"))
|
||||
zim_fh = Archive("/output/isago.zim")
|
||||
for expected_entry in [
|
||||
"_zim_static/__wb_module_decl.js",
|
||||
"_zim_static/wombat.js",
|
||||
"_zim_static/wombatSetup.js",
|
||||
"website.test.openzim.org/http-return-codes.html",
|
||||
"website.test.openzim.org/200-response",
|
||||
"website.test.openzim.org/201-response",
|
||||
"website.test.openzim.org/202-response",
|
||||
"website.test.openzim.org/301-external-redirect-ok",
|
||||
"website.test.openzim.org/301-internal-redirect-ok",
|
||||
"website.test.openzim.org/302-external-redirect-ok",
|
||||
"website.test.openzim.org/302-internal-redirect-ok",
|
||||
"website.test.openzim.org/307-external-redirect-ok",
|
||||
"website.test.openzim.org/307-internal-redirect-ok",
|
||||
"website.test.openzim.org/308-external-redirect-ok",
|
||||
"website.test.openzim.org/308-internal-redirect-ok",
|
||||
"website.test.openzim.org/http-return-codes.html",
|
||||
"website.test.openzim.org/icons/favicon.ico",
|
||||
"website.test.openzim.org/icons/site.webmanifest",
|
||||
"website.test.openzim.org/internal_redirect_target.html",
|
||||
"www.example.com/",
|
||||
"isago.rskg.org/",
|
||||
"isago.rskg.org/a-propos",
|
||||
"isago.rskg.org/conseils",
|
||||
"isago.rskg.org/faq",
|
||||
"isago.rskg.org/static/favicon256.png",
|
||||
"isago.rskg.org/static/tarifs-isago.pdf",
|
||||
"maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css",
|
||||
]:
|
||||
assert zim_fh.get_content(expected_entry)
|
||||
|
||||
|
|
@ -106,40 +71,24 @@ def test_user_agent():
|
|||
assert found
|
||||
|
||||
|
||||
def test_stats_output_standard():
    """Compare the three JSON progress files in ``/output`` with expected values.

    The files are checked in the order crawl.json, warc2zim.json, stats.json;
    each must match its expected dictionary exactly.
    """
    expected_by_file = {
        "/output/crawl.json": {
            "crawled": 17,
            "pending": 0,
            "pendingPages": [],
            "total": 35,
            "failed": 18,
            "limit": {"max": 0, "hit": False},
        },
        "/output/warc2zim.json": {
            "written": 8,
            "total": 8,
        },
        "/output/stats.json": {
            "done": 8,
            "total": 8,
            "partialZim": False,
        },
    }

    # dicts preserve insertion order, so files are checked in the order above
    for location, expected in expected_by_file.items():
        assert json.loads(Path(location).read_bytes()) == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "filename",
    [
        pytest.param("/output/stats_sizesoftlimit.json", id="sizesoftlimit"),
        pytest.param("/output/stats_timesoftlimit.json", id="timesoftlimit"),
    ],
)
def test_stats_output_softlimit(filename):
    """Check the stats file written when a soft limit was hit.

    The file at ``filename`` must exist, contain the ``done``, ``total`` and
    ``partialZim`` keys, and have a truthy ``partialZim`` value.
    """
    file = Path(filename)
    # `Path.exists` is a method: without the call the assertion tested the
    # bound-method object, which is always truthy, and could never fail.
    assert file.exists()
    content = json.loads(file.read_bytes())
    assert "done" in content
    assert "total" in content
    assert "partialZim" in content
    assert content["partialZim"]
|
||||
def test_stats_output():
    """Compare the three JSON progress files in ``/output`` with expected values.

    The files are checked in the order crawl.json, warc2zim.json, stats.json;
    each must match its expected dictionary exactly.
    """
    expectations = [
        (
            "/output/crawl.json",
            {
                "crawled": 5,
                "pending": 0,
                "pendingPages": [],
                "total": 5,
                "failed": 0,
                "limit": {"max": 0, "hit": False},
            },
        ),
        (
            "/output/warc2zim.json",
            {
                "written": 7,
                "total": 7,
            },
        ),
        (
            "/output/stats.json",
            {
                "done": 7,
                "total": 7,
                "limit": {"max": 0, "hit": False},
            },
        ),
    ]

    for location, expected in expectations:
        with open(location) as fh:
            actual = json.loads(fh.read())
        assert actual == expected
|
||||
|
|
|
|||
|
|
@ -1,14 +0,0 @@
|
|||
import pytest
|
||||
|
||||
from zimit import zimit as app
|
||||
|
||||
"""
|
||||
cleanup disabled because atexit hooks run at the very end of the Python process
|
||||
shutdown. By the time cleanup() is called, the logging module has already closed its
|
||||
file streams.
|
||||
"""
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def disable_zimit_cleanup(monkeypatch):
    """Swap ``app.cleanup`` for a no-op for the duration of each test.

    Declared ``autouse`` so it applies without being requested explicitly.
    """

    def _skip_cleanup():
        # intentionally does nothing
        return None

    monkeypatch.setattr(app, "cleanup", _skip_cleanup)
|
||||
Binary file not shown.
|
|
@ -1,83 +0,0 @@
|
|||
import pathlib
|
||||
|
||||
import pytest
|
||||
|
||||
from zimit.zimit import run
|
||||
|
||||
TEST_DATA_DIR = pathlib.Path(__file__).parent / "data"
|
||||
|
||||
|
||||
def test_overwrite_flag_behaviour(tmp_path):
    """Exercise ``run`` against an output ZIM that already exists.

    Four scenarios run in sequence against the same output path:
    creation, re-run from WARC without ``--overwrite``, re-run without WARC
    and without ``--overwrite``, and finally a re-run with ``--overwrite``.
    """
    zim_output = "overwrite-test.zim"
    output_path = tmp_path / zim_output

    # Argument groups shared by the scenarios; a fresh list is built per call.
    seed_args = ["--seeds", "https://example.com"]
    warc_args = ["--warcs", str(TEST_DATA_DIR / "example-response.warc")]
    target_args = [
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_output,
        "--name",
        "overwrite-test",
    ]

    # 1st run → creates file
    result = run([*seed_args, *warc_args, *target_args])
    assert result in (None, 100)
    assert output_path.exists()

    # 2nd run, WARC input, no overwrite → should fail
    with pytest.raises(SystemExit) as exc:
        run([*seed_args, *warc_args, *target_args])
    assert exc.value.code == 2

    # 3rd run, no WARC input, no overwrite → should fail as well
    with pytest.raises(SystemExit) as exc:
        run([*seed_args, *target_args])
    assert exc.value.code == 2

    # 4th run, with overwrite → should succeed
    result = run([*seed_args, *warc_args, *target_args, "--overwrite"])
    assert result in (None, 100)
    assert output_path.exists()
|
||||
Loading…
Add table
Add a link
Reference in a new issue