Compare commits

..

No commits in common. "main" and "v2.1.5" have entirely different histories.
main ... v2.1.5

19 changed files with 278 additions and 2166 deletions

View file

@ -18,7 +18,7 @@ jobs:
run: docker build -t local-zimit .
- name: run crawl of test website
run: docker run -v $PWD/output:/output local-zimit zimit --seeds https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim
run: docker run -v $PWD/output:/output local-zimit zimit --url https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim
- name: archive ZIM
uses: actions/upload-artifact@v4

View file

@ -5,9 +5,8 @@ on:
types: [published]
jobs:
publish-amd64:
runs-on: ubuntu-24.04
name: "Publish for AMD64"
publish:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
@ -20,34 +19,11 @@ jobs:
latest-on-tag: true
restrict-to: openzim/zimit
registries: ghcr.io
credentials: |
credentials:
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
repo_description: auto
repo_overview: auto
platforms: |
linux/amd64
# Disabled for now, see https://github.com/openzim/zimit/issues/463
# publish-arm64:
# runs-on: ubuntu-24.04
# name: "Publish for ARM64"
#
# steps:
# - uses: actions/checkout@v4
#
# - name: Build and push Docker image
# uses: openzim/docker-publish-action@v10
# with:
# image-name: openzim/zimit
# tag-pattern: /^v([0-9.]+)$/
# latest-on-tag: true
# restrict-to: openzim/zimit
# registries: ghcr.io
# credentials: |
# GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
# GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
# repo_description: auto
# repo_overview: auto
# platforms: |
# linux/arm64
linux/arm64

View file

@ -7,9 +7,8 @@ on:
workflow_dispatch:
jobs:
publish-amd64:
runs-on: ubuntu-24.04
name: "Publish for AMD64"
publish:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
@ -22,34 +21,11 @@ jobs:
latest-on-tag: false
restrict-to: openzim/zimit
registries: ghcr.io
credentials: |
credentials:
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
repo_description: auto
repo_overview: auto
platforms: |
linux/amd64
# Disabled for now, see https://github.com/openzim/zimit/issues/463
# publish-arm64:
# runs-on: ubuntu-24.04-arm
# name: "Publish for ARM64"
#
# steps:
# - uses: actions/checkout@v4
#
# - name: Build and push Docker image
# uses: openzim/docker-publish-action@v10
# with:
# image-name: openzim/zimit
# manual-tag: dev
# latest-on-tag: false
# restrict-to: openzim/zimit
# registries: ghcr.io
# credentials: |
# GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
# GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
# repo_description: auto
# repo_overview: auto
# platforms: |
# linux/arm64
linux/arm64

View file

@ -57,25 +57,13 @@ jobs:
uses: actions/checkout@v4
- name: build image
run: docker build -t local-zimit .
run: docker build -t zimit .
- name: ensure help display without issue
run: docker run -v $PWD/output:/output local-zimit zimit --help
run: docker run -v $PWD/output:/output zimit zimit --help
- name: run crawl with soft size limit
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizesoftlimit.json
- name: run crawl with hard size limit
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizehardlimit.json || true
- name: run crawl with soft time limit
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timesoftlimit.json
- name: run crawl with hard time limit
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timehardlimit.json || true
- name: run standard crawl
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats.json --statsFilename /output/crawl.json --warc2zim-progress-file /output/warc2zim.json --keep
- name: run crawl
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
- name: run integration test suite
run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"

View file

@ -1,45 +0,0 @@
name: Update ZIMFarm Definitions
on:
push:
branches: [main]
paths:
- "offliner-definition.json"
release:
types: [published]
workflow_dispatch:
inputs:
version:
description: "Version to publish"
required: false
default: "dev"
jobs:
prepare-json:
runs-on: ubuntu-24.04
outputs:
offliner_definition_b64: ${{ steps.read-json.outputs.offliner_definition_b64 }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0
- id: read-json
run: |
if [ ! -f "offliner-definition.json" ]; then
echo "File not found!" >&2
exit 1
fi
json_b64=$(base64 -w0 <<< "$(jq -c . offliner-definition.json)")
echo "offliner_definition_b64=$json_b64" >> $GITHUB_OUTPUT
call-workflow:
needs: prepare-json
uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main
with:
version: ${{ github.event_name == 'release' && github.event.release.tag_name || (github.event.inputs.version || 'dev') }}
offliner: zimit
offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }}
secrets:
zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }}

View file

@ -2,20 +2,20 @@
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
rev: v4.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- repo: https://github.com/psf/black
rev: "25.1.0"
rev: "24.10.0"
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.4
rev: v0.6.9
hooks:
- id: ruff
- repo: https://github.com/RobertCraigie/pyright-python
rev: v1.1.393
rev: v1.1.383
hooks:
- id: pyright
name: pyright (system)

View file

@ -5,92 +5,6 @@ All notable changes to this project are documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
## [Unreleased]
### Added
- Added `--overwrite` flag to overwrite existing ZIM file if it exists (#399)
### Changed
- Fix issues preventing interrupted crawls from being resumed. (#499)
- Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist.
- Use all warc_dirs found instead of just the latest so interrupted crawls use all collected pages across runs when an explicit collections directory is not passed.
- Don't cleanup an explicitly passed build directory.
## [3.0.5] - 2025-04-11
### Changed
- Upgrade to browsertrix crawler 1.6.0 (#493)
## [3.0.4] - 2025-04-04
### Changed
- Upgrade to browsertrix crawler 1.5.10 (#491)
## [3.0.3] - 2025-02-28
### Changed
- Upgrade to browsertrix crawler 1.5.7 (#483)
## [3.0.2] - 2025-02-27
### Changed
- Upgrade to browsertrix crawler 1.5.6 (#482)
## [3.0.1] - 2025-02-24
### Changed
- Upgrade to browsertrix crawler 1.5.4 (#476)
## [3.0.0] - 2025-02-17
### Changed
- Change solution to report partial ZIM to the Zimfarm and other clients (#304)
- Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
- Add many missing Browsertrix Crawler arguments ; drop default overrides by zimit ; drop `--noMobileDevice` setting (not needed anymore) (#433)
- Document all Browsertrix Crawler default arguments values (#416)
- Use preferred Browsertrix Crawler arguments names: (part of #471)
- `--seeds` instead of `--url`
- `--seedFile` instead of `--urlFile`
- `--pageLimit` instead of `--limit`
- `--pageLoadTimeout` instead of `--timeout`
- `--scopeIncludeRx` instead of `--include`
- `--scopeExcludeRx` instead of `--exclude`
- `--pageExtraDelay` instead of `--delay`
- Remove confusion between zimit, warc2zim and crawler stats filenames (part of #471)
- `--statsFilename` is now the crawler stats file (since it is the same name, just like other arguments)
- `--zimit-progress-file` is now the zimit stats location
- `--warc2zim-progress-file` is the warc2zim stats location
- all are optional values, if not set and needed temporary files are used
### Fixed
- Do not create the ZIM when crawl is incomplete (#444)
## [2.1.8] - 2025-02-07
### Changed
- Upgrade to browsertrix crawler 1.5.1, Python 3.13 and others (#462 + #464)
## [2.1.7] - 2025-01-10
### Changed
- Upgrade to browsertrix crawler 1.4.2 (#450)
- Upgrade to warc2zim 2.2.0
## [2.1.6] - 2024-11-07
### Changed
- Upgrade to browsertrix crawler 1.3.5 (#426)
## [2.1.5] - 2024-11-01
### Changed

View file

@ -1,16 +1,13 @@
FROM webrecorder/browsertrix-crawler:1.6.0
LABEL org.opencontainers.image.source=https://github.com/openzim/zimit
# add deadsnakes ppa for latest Python on Ubuntu
RUN add-apt-repository ppa:deadsnakes/ppa -y
FROM webrecorder/browsertrix-crawler:1.3.4
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
RUN apt-get update \
&& apt-get install -qqy --no-install-recommends \
libmagic1 \
python3.13-venv \
python3.12-venv \
&& rm -rf /var/lib/apt/lists/* \
# python setup (in venv not to conflict with browsertrix)
&& python3.13 -m venv /app/zimit \
&& python3.12 -m venv /app/zimit \
# placeholder (default output location)
&& mkdir -p /output \
# disable chrome upgrade

View file

@ -1,15 +1,15 @@
Zimit
=====
Zimit is a scraper that creates a [ZIM file](https://en.wikipedia.org/wiki/ZIM_(file_format)) from any website.
Zimit is a scraper that creates a ZIM file from any website.
[![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit)
[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
[![Docker](https://ghcr-badge.egpl.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit)
[![Docker](https://ghcr-badge.deta.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit)
Zimit adheres to openZIM's [Contribution Guidelines](https://github.com/openzim/overview/wiki/Contributing).
Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/blob/main/docs/Policy.md) **v1.0.1**.
Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/blob/main/docs/Policy.md) **v1.0.1**.
Capabilities and known limitations
--------------------
@ -38,23 +38,24 @@ Usage
`zimit` is intended to be run in Docker. Docker image is published at https://github.com/orgs/openzim/packages/container/package/zimit.
The image accepts the following parameters, **as well as any of the [Browsertrix crawler](https://crawler.docs.browsertrix.com/user-guide/cli-options/) and [warc2zim](https://github.com/openzim/warc2zim) ones**:
The image accepts the following parameters, **as well as any of the [warc2zim](https://github.com/openzim/warc2zim) ones**; useful for setting metadata, for instance:
- Required: `--seeds URL` - the url to start crawling from ; multiple URLs can be separated by a comma (even if **usually not needed**, these are just the **seeds** of the crawl) ; first seed URL is used as ZIM homepage
- Required: `--url URL` - the url to be crawled
- Required: `--name` - Name of ZIM file
- `--output` - output directory (defaults to `/output`)
- `--pageLimit U` - Limit capture to at most U URLs
- `--scopeExcludeRx <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--scopeExcludeRx="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded.
- `--limit U` - Limit capture to at most U URLs
- `--behaviors` - Control which browsertrix behaviors are run (defaults to `autoplay,autofetch,siteSpecific`; adding `autoscroll` to the list makes the crawler automatically scroll the pages and fetch resources which are lazy loaded)
- `--exclude <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded.
- `--workers N` - number of crawl workers to be run in parallel
- `--waitUntil` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--waitUntil domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
- `--keep` - in case of failure, WARC files and other temporary files (which are stored as a subfolder of output directory) are always kept, otherwise they are automatically deleted. Use this flag to always keep WARC files, even in case of success.
- `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
- `--keep` - if set, keep the WARC files in a temp directory inside the output directory
Example command:
```bash
docker run ghcr.io/openzim/zimit zimit --help
docker run ghcr.io/openzim/zimit warc2zim --help
docker run -v /output:/output ghcr.io/openzim/zimit zimit --seeds URL --name myzimfile
docker run -v /output:/output ghcr.io/openzim/zimit zimit --url URL --name myzimfile
```
**Note**: Image automatically filters out a large number of ads by using the 3 blocklists from [anudeepND](https://github.com/anudeepND/blacklist). If you don't want this filtering, disable the image's entrypoint in your container (`docker run --entrypoint="" ghcr.io/openzim/zimit ...`).

View file

@ -1,981 +0,0 @@
{
"offliner_id": "zimit",
"stdOutput": true,
"stdStats": "zimit-progress-file",
"flags": {
"seeds": {
"type": "string",
"required": false,
"title": "Seeds",
"description": "The seed URL(s) to start crawling from. Multile seed URL must be separated by a comma (usually not needed, these are just the crawl seeds). First seed URL is used as ZIM homepage"
},
"seed_file": {
"type": "string",
"required": false,
"title": "Seed File",
"description": "If set, read a list of seed urls, one per line. HTTPS URL to an online file."
},
"lang": {
"type": "string",
"required": false,
"title": "Browser Language",
"description": "If set, sets the language used by the browser, should be ISO 639 language[-country] code, e.g. `en` or `en-GB`"
},
"title": {
"type": "string",
"required": false,
"title": "Title",
"description": "Custom title for your ZIM. Defaults to title of main page",
"minLength": 1,
"maxLength": 30
},
"description": {
"type": "string",
"required": false,
"title": "Description",
"description": "Description for ZIM",
"minLength": 1,
"maxLength": 80
},
"favicon": {
"type": "blob",
"kind": "image",
"required": false,
"title": "Illustration",
"description": "URL for Illustration. "
},
"tags": {
"type": "string",
"required": false,
"title": "ZIM Tags",
"description": "Single string with individual tags separated by a semicolon."
},
"creator": {
"type": "string",
"required": false,
"title": "Creator",
"description": "Name of content creator"
},
"publisher": {
"type": "string",
"required": false,
"title": "Publisher",
"isPublisher": true,
"description": "Custom publisher name (ZIM metadata). openZIM otherwise"
},
"source": {
"type": "string",
"required": false,
"title": "Source",
"description": "Source name/URL of content"
},
"workers": {
"type": "integer",
"required": false,
"title": "Workers",
"description": "The number of workers to run in parallel. Defaults to 1",
"min": 1
},
"wait_until": {
"type": "string",
"required": false,
"title": "WaitUntil",
"description": "Puppeteer page.goto() condition to wait for before continuing. One of load, domcontentloaded, networkidle0 or networkidle2, or a comma-separated combination of those. Default is load,networkidle2"
},
"extra_hops": {
"type": "integer",
"required": false,
"title": "Extra Hops",
"description": "Number of extra 'hops' to follow, beyond the current scope. Default is 0",
"min": 0
},
"page_limit": {
"type": "integer",
"required": false,
"title": "Page Limit",
"description": "Limit crawl to this number of pages. Default is 0 (no-limit).",
"min": 0
},
"max_page_limit": {
"type": "integer",
"required": false,
"title": "Max Page Limit",
"description": "Maximum pages to crawl, overriding pageLimit if both are set. Default is 0 (no-limit)",
"min": 0
},
"page_load_timeout": {
"type": "integer",
"required": false,
"title": "Page Load Timeout",
"description": "Timeout for each page to load (in seconds). Default is 90",
"min": 0
},
"scope_type": {
"type": "string-enum",
"required": false,
"title": "Scope Type",
"description": "A predfined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom if scopeIncludeRx is set, prefix otherwise.",
"choices": [
{
"title": "Page",
"value": "page"
},
{
"title": "Page SPA",
"value": "page-spa"
},
{
"title": "Prefix",
"value": "prefix"
},
{
"title": "Host",
"value": "host"
},
{
"title": "Domain",
"value": "domain"
},
{
"title": "Any",
"value": "any"
},
{
"title": "Custom",
"value": "custom"
}
]
},
"scope_include_rx": {
"type": "string",
"required": false,
"title": "Scope Include Regex",
"description": "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of seed)"
},
"scope_exclude_rx": {
"type": "string",
"required": false,
"title": "Scope Exclude Regex",
"description": "Regex of page URLs that should be excluded from the crawl"
},
"allow_hash_urls": {
"type": "boolean",
"required": false,
"title": "Allow Hashtag URLs",
"description": "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content"
},
"mobile_device": {
"type": "string-enum",
"required": false,
"title": "As device",
"description": "Device to crawl as. See Pupeeter's Device.ts for a list",
"choices": [
{
"title": "Blackberry Playbook",
"value": "Blackberry PlayBook"
},
{
"title": "Blackberry Playbook Landscape",
"value": "Blackberry PlayBook landscape"
},
{
"title": "Blackberry Z30",
"value": "BlackBerry Z30"
},
{
"title": "Blackberry Z30 Landscape",
"value": "BlackBerry Z30 landscape"
},
{
"title": "Galaxy Note 3",
"value": "Galaxy Note 3"
},
{
"title": "Galaxy Note 3 Landscape",
"value": "Galaxy Note 3 landscape"
},
{
"title": "Galaxy Note II",
"value": "Galaxy Note II"
},
{
"title": "Galaxy Note II Landscape",
"value": "Galaxy Note II landscape"
},
{
"title": "Galaxy S III",
"value": "Galaxy S III"
},
{
"title": "Galaxy S III Landscape",
"value": "Galaxy S III landscape"
},
{
"title": "Galaxy S5",
"value": "Galaxy S5"
},
{
"title": "Galaxy S5 Landscape",
"value": "Galaxy S5 landscape"
},
{
"title": "Galaxy S8",
"value": "Galaxy S8"
},
{
"title": "Galaxy S8 Landscape",
"value": "Galaxy S8 landscape"
},
{
"title": "Galaxy S9 Plus",
"value": "Galaxy S9+"
},
{
"title": "Galaxy S9 Plus Landscape",
"value": "Galaxy S9+ landscape"
},
{
"title": "Galaxy Tab S4",
"value": "Galaxy Tab S4"
},
{
"title": "Galaxy Tab S4 Landscape",
"value": "Galaxy Tab S4 landscape"
},
{
"title": "iPad",
"value": "iPad"
},
{
"title": "iPad Landscape",
"value": "iPad landscape"
},
{
"title": "iPad Gen 6",
"value": "iPad (gen 6)"
},
{
"title": "iPad Gen 6 Landscape",
"value": "iPad (gen 6) landscape"
},
{
"title": "iPad Gen 7",
"value": "iPad (gen 7)"
},
{
"title": "iPad Gen 7 Landscape",
"value": "iPad (gen 7) landscape"
},
{
"title": "iPad Mini",
"value": "iPad Mini"
},
{
"title": "iPad Mini Landscape",
"value": "iPad Mini landscape"
},
{
"title": "iPad Pro",
"value": "iPad Pro"
},
{
"title": "iPad Pro Landscape",
"value": "iPad Pro landscape"
},
{
"title": "iPad Pro 11",
"value": "iPad Pro 11"
},
{
"title": "iPad Pro 11 Landscape",
"value": "iPad Pro 11 landscape"
},
{
"title": "iPhone 4",
"value": "iPhone 4"
},
{
"title": "iPhone 4 Landscape",
"value": "iPhone 4 landscape"
},
{
"title": "iPhone 5",
"value": "iPhone 5"
},
{
"title": "iPhone 5 Landscape",
"value": "iPhone 5 landscape"
},
{
"title": "iPhone 6",
"value": "iPhone 6"
},
{
"title": "iPhone 6 Landscape",
"value": "iPhone 6 landscape"
},
{
"title": "iPhone 6 Plus",
"value": "iPhone 6 Plus"
},
{
"title": "iPhone 6 Plus Landscape",
"value": "iPhone 6 Plus landscape"
},
{
"title": "iPhone 7",
"value": "iPhone 7"
},
{
"title": "iPhone 7 Landscape",
"value": "iPhone 7 landscape"
},
{
"title": "iPhone 7 Plus",
"value": "iPhone 7 Plus"
},
{
"title": "iPhone 7 Plus Landscape",
"value": "iPhone 7 Plus landscape"
},
{
"title": "iPhone 8",
"value": "iPhone 8"
},
{
"title": "iPhone 8 Landscape",
"value": "iPhone 8 landscape"
},
{
"title": "iPhone 8 Plus",
"value": "iPhone 8 Plus"
},
{
"title": "iPhone 8 Plus Landscape",
"value": "iPhone 8 Plus landscape"
},
{
"title": "iPhone SE",
"value": "iPhone SE"
},
{
"title": "iPhone SE Landscape",
"value": "iPhone SE landscape"
},
{
"title": "iPhone X",
"value": "iPhone X"
},
{
"title": "iPhone X Landscape",
"value": "iPhone X landscape"
},
{
"title": "iPhone XR",
"value": "iPhone XR"
},
{
"title": "iPhone XR Landscape",
"value": "iPhone XR landscape"
},
{
"title": "iPhone 11",
"value": "iPhone 11"
},
{
"title": "iPhone 11 Landscape",
"value": "iPhone 11 landscape"
},
{
"title": "iPhone 11 Pro",
"value": "iPhone 11 Pro"
},
{
"title": "iPhone 11 Pro Landscape",
"value": "iPhone 11 Pro landscape"
},
{
"title": "iPhone 11 Pro Max",
"value": "iPhone 11 Pro Max"
},
{
"title": "iPhone 11 Pro Max Landscape",
"value": "iPhone 11 Pro Max landscape"
},
{
"title": "iPhone 12",
"value": "iPhone 12"
},
{
"title": "iPhone 12 Landscape",
"value": "iPhone 12 landscape"
},
{
"title": "iPhone 12 Pro",
"value": "iPhone 12 Pro"
},
{
"title": "iPhone 12 Pro Landscape",
"value": "iPhone 12 Pro landscape"
},
{
"title": "iPhone 12 Pro Max",
"value": "iPhone 12 Pro Max"
},
{
"title": "iPhone 12 Pro Max Landscape",
"value": "iPhone 12 Pro Max landscape"
},
{
"title": "iPhone 12 Mini",
"value": "iPhone 12 Mini"
},
{
"title": "iPhone 12 Mini Landscape",
"value": "iPhone 12 Mini landscape"
},
{
"title": "iPhone 13",
"value": "iPhone 13"
},
{
"title": "iPhone 13 Landscape",
"value": "iPhone 13 landscape"
},
{
"title": "iPhone 13 Pro",
"value": "iPhone 13 Pro"
},
{
"title": "iPhone 13 Pro Landscape",
"value": "iPhone 13 Pro landscape"
},
{
"title": "iPhone 13 Pro Max",
"value": "iPhone 13 Pro Max"
},
{
"title": "iPhone 13 Pro Max Landscape",
"value": "iPhone 13 Pro Max landscape"
},
{
"title": "iPhone 13 Mini",
"value": "iPhone 13 Mini"
},
{
"title": "iPhone 13 Mini Landscape",
"value": "iPhone 13 Mini landscape"
},
{
"title": "Jio Phone 2",
"value": "JioPhone 2"
},
{
"title": "Jio Phone 2 Landscape",
"value": "JioPhone 2 landscape"
},
{
"title": "Kindle Fire HDX",
"value": "Kindle Fire HDX"
},
{
"title": "Kindle Fire HDX Landscape",
"value": "Kindle Fire HDX landscape"
},
{
"title": "LG Optimus L70",
"value": "LG Optimus L70"
},
{
"title": "LG Optimus L70 Landscape",
"value": "LG Optimus L70 landscape"
},
{
"title": "Microsoft Lumia 550",
"value": "Microsoft Lumia 550"
},
{
"title": "Microsoft Lumia 950",
"value": "Microsoft Lumia 950"
},
{
"title": "Microsoft Lumia 950 Landscape",
"value": "Microsoft Lumia 950 landscape"
},
{
"title": "Nexus 10",
"value": "Nexus 10"
},
{
"title": "Nexus 10 Landscape",
"value": "Nexus 10 landscape"
},
{
"title": "Nexus 4",
"value": "Nexus 4"
},
{
"title": "Nexus 4 Landscape",
"value": "Nexus 4 landscape"
},
{
"title": "Nexus 5",
"value": "Nexus 5"
},
{
"title": "Nexus 5 Landscape",
"value": "Nexus 5 landscape"
},
{
"title": "Nexus 5X",
"value": "Nexus 5X"
},
{
"title": "Nexus 5X Landscape",
"value": "Nexus 5X landscape"
},
{
"title": "Nexus 6",
"value": "Nexus 6"
},
{
"title": "Nexus 6 Landscape",
"value": "Nexus 6 landscape"
},
{
"title": "Nexus 6P",
"value": "Nexus 6P"
},
{
"title": "Nexus 6P Landscape",
"value": "Nexus 6P landscape"
},
{
"title": "Nexus 7",
"value": "Nexus 7"
},
{
"title": "Nexus 7 Landscape",
"value": "Nexus 7 landscape"
},
{
"title": "Nokia Lumia 520",
"value": "Nokia Lumia 520"
},
{
"title": "Nokia Lumia 520 Landscape",
"value": "Nokia Lumia 520 landscape"
},
{
"title": "Nokia N9",
"value": "Nokia N9"
},
{
"title": "Nokia N9 Landscape",
"value": "Nokia N9 landscape"
},
{
"title": "Pixel 2",
"value": "Pixel 2"
},
{
"title": "Pixel 2 Landscape",
"value": "Pixel 2 landscape"
},
{
"title": "Pixel 2 XL",
"value": "Pixel 2 XL"
},
{
"title": "Pixel 2 XL Landscape",
"value": "Pixel 2 XL landscape"
},
{
"title": "Pixel 3",
"value": "Pixel 3"
},
{
"title": "Pixel 3 Landscape",
"value": "Pixel 3 landscape"
},
{
"title": "Pixel 4",
"value": "Pixel 4"
},
{
"title": "Pixel 4 Landscape",
"value": "Pixel 4 landscape"
},
{
"title": "Pixel 4A 5G",
"value": "Pixel 4a (5G)"
},
{
"title": "Pixel 4A 5G Landscape",
"value": "Pixel 4a (5G) landscape"
},
{
"title": "Pixel 5",
"value": "Pixel 5"
},
{
"title": "Pixel 5 Landscape",
"value": "Pixel 5 landscape"
},
{
"title": "Moto G4",
"value": "Moto G4"
},
{
"title": "Moto G4 Landscape",
"value": "Moto G4 landscape"
}
]
},
"select_links": {
"type": "string",
"required": false,
"title": "Select Links",
"description": "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]"
},
"click_selector": {
"type": "string",
"required": false,
"title": "Click Selector",
"description": "Selector for elements to click when using the autoclick behavior. Default is 'a'"
},
"block_rules": {
"type": "string",
"required": false,
"title": "Block Rules",
"description": "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe"
},
"block_message": {
"type": "string",
"required": false,
"title": "Block Message",
"description": "If specified, when a URL is blocked, a record with this error message is added instead"
},
"block_ads": {
"type": "boolean",
"required": false,
"title": "Block Ads",
"description": "If set, block advertisements from being loaded (based on Stephen Black's blocklist). Note that some bad domains are also blocked by zimit configuration even if this option is not set."
},
"ad_block_message": {
"type": "string",
"required": false,
"title": "Ads Block Message",
"description": "If specified, when an ad is blocked, a record with this error message is added instead"
},
"user_agent": {
"type": "string",
"required": false,
"title": "User Agent",
"description": "Override user-agent with specified"
},
"user_agent_suffix": {
"type": "string",
"required": false,
"title": "User Agent Suffix",
"description": "Append suffix to existing browser user-agent. Defaults to +Zimit"
},
"use_sitemap": {
"type": "string",
"required": false,
"title": "Sitemap URL",
"description": "Use as sitemap to get additional URLs for the crawl (usually at /sitemap.xml)"
},
"sitemap_from_date": {
"type": "string",
"required": false,
"title": "Sitemap From Date",
"description": "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)"
},
"sitemap_to_date": {
"type": "string",
"required": false,
"title": "Sitemap To Date",
"description": "If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)"
},
"behavior_timeout": {
"type": "integer",
"required": false,
"title": "Behavior Timeout",
"description": "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish. Default is 90.",
"min": 0
},
"post_load_delay": {
"type": "integer",
"required": false,
"title": "Post Load Delay",
"description": "If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors. Default is 0.",
"min": 0
},
"page_extra_delay": {
"type": "integer",
"required": false,
"title": "Page Extra Delay",
"description": "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page. Default is 0.",
"min": 0
},
"dedup_policy": {
"type": "string-enum",
"required": false,
"title": "Dedup Policy",
"description": "Deduplication policy. One of skip, revisit or keep. Default is skip",
"choices": [
{
"title": "Skip",
"value": "skip"
},
{
"title": "Revisit",
"value": "revisit"
},
{
"title": "Keep",
"value": "keep"
}
]
},
"screenshot": {
"type": "string",
"required": false,
"title": "Screenshot",
"description": "Screenshot options for crawler. One of view, thumbnail, fullPage, fullPageFinal or a comma-separated combination of those."
},
"size_soft_limit": {
"type": "integer",
"required": false,
"title": "Size Soft Limit",
"description": "If set, save crawl state and stop crawl if WARC size exceeds this value. ZIM will still be created.",
"min": 0
},
"size_hard_limit": {
"type": "integer",
"required": false,
"title": "Size Hard Limit",
"description": "If set, exit crawler and fail the scraper immediately if WARC size exceeds this value",
"min": 0
},
"disk_utilization": {
"type": "integer",
"required": false,
"title": "Disk Utilization",
"description": "Save state and exit if disk utilization exceeds this percentage value. Default (if not set) is 90%. Set to 0 to disable disk utilization check.",
"min": 0
},
"time_soft_limit": {
"type": "integer",
"required": false,
"title": "Time Soft Limit",
"description": "If set, save crawl state and stop crawl if WARC(s) creation takes longer than this value, in seconds. ZIM will still be created.",
"min": 0
},
"time_hard_limit": {
"type": "integer",
"required": false,
"title": "Time Hard Limit",
"description": "If set, exit crawler and fail the scraper immediately if WARC(s) creation takes longer than this value, in seconds",
"min": 0
},
"net_idle_wait": {
"type": "integer",
"required": false,
"title": "Net Idle Wait",
"description": "If set, wait for network idle after page load and after behaviors are done (in seconds). If -1 (default), determine based on scope."
},
"origin_override": {
"type": "string",
"required": false,
"title": "Origin Override",
"description": "If set, will redirect requests from each origin in key to origin in the value, eg. https://host:port=http://alt-host:alt-port."
},
"max_page_retries": {
"type": "integer",
"required": false,
"title": "Max Page Retries",
"description": "If set, number of times to retry a page that failed to load before page is considered to have failed. Default is 2.",
"min": 0
},
"fail_on_failed_seed": {
"type": "boolean",
"required": false,
"title": "Fail on failed seed",
"description": "If set, the crawl fails if any seed fails to load."
},
"fail_on_invalid_status": {
"type": "boolean",
"required": false,
"title": "Fail on invalid status",
"description": "If set, will treat pages with 4xx or 5xx response as failures. When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses"
},
"fail_on_failed_limit": {
"type": "integer",
"required": false,
"title": "Fail on failed - Limit",
"description": "If set, save state and exit if number of failed pages exceeds this value.",
"min": 0
},
"warcs": {
"type": "string",
"required": false,
"title": "WARC files",
"description": "Comma-separated list of WARC files to use as input."
},
"verbose": {
"type": "boolean",
"required": false,
"title": "Verbose mode",
"description": "Whether to display additional logs"
},
"keep": {
"type": "boolean",
"required": false,
"title": "Keep",
"description": "Should be True. Developer option: must be True if we want to keep the WARC files for artifacts archiving.",
"default": true
},
"output": {
"type": "string",
"required": false,
"title": "Output folder",
"description": "Output folder for ZIM file(s). Leave it as `/output`",
"pattern": "^/output$"
},
"admin_email": {
"type": "email",
"required": false,
"title": "Admin Email",
"description": "Admin Email for crawler: used in UserAgent so website admin can contact us",
"default": "contact+zimfarm@kiwix.org"
},
"profile": {
"type": "string",
"required": false,
"title": "Browser profile",
"description": "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory for Browsertrix crawler."
},
"behaviors": {
"type": "string",
"required": false,
"title": "Behaviors",
"description": "Which background behaviors to enable on each page. Defaults to autoplay,autofetch,siteSpecific."
},
"depth": {
"type": "integer",
"required": false,
"title": "Depth",
"description": "The depth of the crawl for all seeds. Default is -1 (infinite).",
"min": -1
},
"zim_lang": {
"type": "string",
"required": false,
"title": "ZIM Language",
"description": "Language metadata of ZIM (warc2zim --lang param). ISO-639-3 code. Retrieved from homepage if found, fallback to `eng`",
"alias": "zim-lang",
"customValidator": "language_code"
},
"long_description": {
"type": "string",
"required": false,
"title": "Long description",
"description": "Optional long description for your ZIM",
"minLength": 1,
"maxLength": 4000,
"alias": "long-description"
},
"custom_css": {
"type": "blob",
"kind": "css",
"required": false,
"title": "Custom CSS",
"description": "URL to a CSS file to inject into pages",
"alias": "custom-css"
},
"charsets_to_try": {
"type": "string",
"required": false,
"title": "Charsets to try",
"description": "List of charsets to try when decoding content whose charset is not found",
"alias": "charsets-to-try"
},
"ignore_content_header_charsets": {
"type": "boolean",
"required": false,
"title": "Ignore Content Header Charsets",
"description": "Ignore the charsets specified in content headers - first bytes - typically because they are wrong.",
"alias": "ignore-content-header-charsets"
},
"content_header_bytes_length": {
"type": "integer",
"required": false,
"title": "Content Header Bytes Length",
"description": "How many bytes to consider when searching for content charsets in header (default is 1024).",
"alias": "content-header-bytes-length",
"min": 0
},
"ignore_http_header_charsets": {
"type": "boolean",
"required": false,
"title": "Ignore HTTP Header Charsets",
"description": "Ignore the charsets specified in HTTP `Content-Type` headers, typically because they are wrong.",
"alias": "ignore-http-header-charsets"
},
"encoding_aliases": {
"type": "string",
"required": false,
"title": "Encoding Aliases",
"description": "List of encoding/charset aliases to decode WARC content. Aliases are used when the encoding specified in upstream server exists in Python under a different name. This parameter is single string, multiple values are separated by a comma, like in alias1=encoding1,alias2=encoding2.",
"alias": "encoding-aliases"
},
"custom_behaviors": {
"type": "string",
"required": false,
"title": "Custom Behaviors",
"description": "JS code for custom behaviors to customize crawler. Single string with individual JS files URL/path separated by a comma.",
"alias": "custom-behaviours"
},
"zimit_progress_file": {
"type": "string",
"required": false,
"title": "Zimit Progress File",
"description": "Scraping progress file. Leave it as `/output/task_progress.json`",
"alias": "zimit-progress-file",
"pattern": "^/output/task_progress\\.json$"
},
"replay_viewer_source": {
"type": "url",
"required": false,
"title": "Replay Viewer Source",
"description": "URL from which to load the ReplayWeb.page replay viewer",
"alias": "replay-viewer-source"
},
"zim_file": {
"type": "string",
"required": false,
"title": "ZIM filename",
"description": "ZIM file name (based on --name if not provided). Include {period} to insert date period dynamically",
"alias": "zim-file",
"pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+_)([a-z0-9\\-\\.]+_|)([\\d]{4}-[\\d]{2}|\\{period\\}).zim$",
"relaxedPattern": "^[A-Za-z0-9._-]+$"
},
"name": {
"type": "string",
"required": true,
"title": "ZIM name",
"description": "Name of the ZIM.",
"alias": "name",
"pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$",
"relaxedPattern": "^[A-Za-z0-9._-]+$"
},
"overwrite": {
"type": "boolean",
"required": false,
"title": "Overwrite",
"description": "Whether to overwrite existing ZIM file if it exists"
}
}
}

View file

@ -1,17 +1,17 @@
[build-system]
requires = ["hatchling", "hatch-openzim"]
requires = ["hatchling", "hatch-openzim==0.2.0"]
build-backend = "hatchling.build"
[project]
name = "zimit"
requires-python = ">=3.13,<3.14"
requires-python = ">=3.12,<3.13"
description = "Make ZIM file from any website through crawling"
readme = "README.md"
dependencies = [
"requests==2.32.3",
"inotify==0.2.10",
"tld==0.13",
"warc2zim @ git+https://github.com/openzim/warc2zim@main",
"warc2zim==2.1.3",
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
@ -26,20 +26,20 @@ scripts = [
"invoke==2.2.0",
]
lint = [
"black==25.1.0",
"ruff==0.9.4",
"black==24.10.0",
"ruff==0.6.9",
]
check = [
"pyright==1.1.393",
"pyright==1.1.383",
]
test = [
"pytest==8.3.4",
"coverage==7.6.10",
"pytest==8.3.3",
"coverage==7.6.1",
]
dev = [
"pre-commit==4.1.0",
"debugpy==1.8.12",
"selenium==4.28.1", # used in daily tests, convenient for dev purpose (autocompletion)
"pre-commit==4.0.0",
"debugpy==1.8.6",
"selenium==4.25.0", # used in daily tests, convenient for dev purpose (autocompletion)
"zimit[scripts]",
"zimit[lint]",
"zimit[test]",
@ -95,10 +95,10 @@ all = "inv checkall --args '{args}'"
[tool.black]
line-length = 88
target-version = ['py313']
target-version = ['py312']
[tool.ruff]
target-version = "py313"
target-version = "py312"
line-length = 88
src = ["src"]
@ -221,5 +221,5 @@ exclude_lines = [
include = ["src", "tests", "tasks.py"]
exclude = [".env/**", ".venv/**"]
extraPaths = ["src"]
pythonVersion = "3.13"
pythonVersion = "3.12"
typeCheckingMode="basic"

View file

@ -1 +1 @@
__version__ = "3.0.6-dev0"
__version__ = "2.1.5"

View file

@ -3,8 +3,7 @@ import logging
from zimscraperlib.logging import getLogger
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT = 14
EXIT_CODE_CRAWLER_TIME_LIMIT_HIT = 15
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
NORMAL_WARC2ZIM_EXIT_CODE = 100
REQUESTS_TIMEOUT = 10

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,5 @@
# Let's extract kiwix-tools as usual on alpine temporary build container
FROM alpine:3.21 as kiwix-serve
FROM alpine:3.18 as kiwix-serve
LABEL org.opencontainers.image.source https://github.com/openzim/kiwix-tools
# TARGETPLATFORM is injected by docker build
@ -30,7 +30,7 @@ RUN set -e && \
curl -k -L $url | tar -xz -C /kiwix-serve --strip-components 1
# Build real "workload" container
FROM python:3.13-slim-bookworm
FROM python:3.12-slim-bookworm
# Add kiwix-serve
COPY --from=kiwix-serve /kiwix-serve /usr/local/bin
@ -70,6 +70,6 @@ RUN rm /tmp/chrome-linux64.zip /tmp/chromedriver-linux64.zip /tmp/versions.json
RUN \
python -m pip install --no-cache-dir -U \
pip \
selenium==4.28.1 \
pytest==8.3.4 \
selenium==4.23.0 \
pytest==8.2.2 \
&& mkdir -p /work

View file

@ -1,55 +1,30 @@
import glob
import json
import os
from pathlib import Path
import pytest
from warcio import ArchiveIterator
from zimscraperlib.zim import Archive
@pytest.mark.parametrize(
"filename",
[
pytest.param("/output/tests_en_onepage.zim", id="onepage"),
pytest.param("/output/tests_en_sizesoftlimit.zim", id="sizesoftlimit"),
pytest.param("/output/tests_en_timesoftlimit.zim", id="timesoftlimit"),
],
)
def test_zim_created(filename):
def test_is_file():
"""Ensure ZIM file exists"""
assert os.path.isfile(filename)
@pytest.mark.parametrize(
"filename",
[
pytest.param("/output/tests_en_sizehardlimit.zim", id="sizehardlimit"),
pytest.param("/output/tests_en_timehardlimit.zim", id="timehardlimit"),
],
)
def test_zim_not_created(filename):
"""Ensure ZIM file does not exists"""
assert not os.path.exists(filename)
assert os.path.isfile("/output/isago.zim")
def test_zim_main_page():
"""Main page specified, http://website.test.openzim.org/http-return-codes.html,
was a redirect to https
"""Main page specified, http://isago.rskg.org/, was a redirect to https
Ensure main page is the redirected page"""
main_entry = Archive(Path("/output/tests_en_onepage.zim")).main_entry
main_entry = Archive("/output/isago.zim").main_entry
assert main_entry.is_redirect
assert (
main_entry.get_redirect_entry().path
== "website.test.openzim.org/http-return-codes.html"
)
assert main_entry.get_redirect_entry().path == "isago.rskg.org/"
def test_zim_scraper():
"""Check content of scraper metadata"""
"""Main page specified, http://isago.rskg.org/, was a redirect to https
Ensure main page is the redirected page"""
zim_fh = Archive(Path("/output/tests_en_onepage.zim"))
zim_fh = Archive("/output/isago.zim")
scraper = zim_fh.get_text_metadata("Scraper")
assert "zimit " in scraper
assert "warc2zim " in scraper
@ -58,28 +33,18 @@ def test_zim_scraper():
def test_files_list():
"""Check that expected files are present in the ZIM at proper path"""
zim_fh = Archive(Path("/output/tests_en_onepage.zim"))
zim_fh = Archive("/output/isago.zim")
for expected_entry in [
"_zim_static/__wb_module_decl.js",
"_zim_static/wombat.js",
"_zim_static/wombatSetup.js",
"website.test.openzim.org/http-return-codes.html",
"website.test.openzim.org/200-response",
"website.test.openzim.org/201-response",
"website.test.openzim.org/202-response",
"website.test.openzim.org/301-external-redirect-ok",
"website.test.openzim.org/301-internal-redirect-ok",
"website.test.openzim.org/302-external-redirect-ok",
"website.test.openzim.org/302-internal-redirect-ok",
"website.test.openzim.org/307-external-redirect-ok",
"website.test.openzim.org/307-internal-redirect-ok",
"website.test.openzim.org/308-external-redirect-ok",
"website.test.openzim.org/308-internal-redirect-ok",
"website.test.openzim.org/http-return-codes.html",
"website.test.openzim.org/icons/favicon.ico",
"website.test.openzim.org/icons/site.webmanifest",
"website.test.openzim.org/internal_redirect_target.html",
"www.example.com/",
"isago.rskg.org/",
"isago.rskg.org/a-propos",
"isago.rskg.org/conseils",
"isago.rskg.org/faq",
"isago.rskg.org/static/favicon256.png",
"isago.rskg.org/static/tarifs-isago.pdf",
"maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css",
]:
assert zim_fh.get_content(expected_entry)
@ -106,40 +71,24 @@ def test_user_agent():
assert found
def test_stats_output_standard():
assert json.loads(Path("/output/crawl.json").read_bytes()) == {
"crawled": 17,
"pending": 0,
"pendingPages": [],
"total": 35,
"failed": 18,
"limit": {"max": 0, "hit": False},
}
assert json.loads(Path("/output/warc2zim.json").read_bytes()) == {
"written": 8,
"total": 8,
}
assert json.loads(Path("/output/stats.json").read_bytes()) == {
"done": 8,
"total": 8,
"partialZim": False,
}
@pytest.mark.parametrize(
    "filename",
    [
        pytest.param("/output/stats_sizesoftlimit.json", id="sizesoftlimit"),
        pytest.param("/output/stats_timesoftlimit.json", id="timesoftlimit"),
    ],
)
def test_stats_output_softlimit(filename):
    """Check the stats file written by a soft-limit run.

    The file must exist, expose the ``done``, ``total`` and ``partialZim`` keys,
    and have a truthy ``partialZim`` value.
    """
    file = Path(filename)
    # `exists` must be called: asserting the bound method itself is always truthy
    # and would never detect a missing file.
    assert file.exists()
    content = json.loads(file.read_bytes())
    assert "done" in content
    assert "total" in content
    assert "partialZim" in content
    assert content["partialZim"]
def test_stats_output():
with open("/output/crawl.json") as fh:
assert json.loads(fh.read()) == {
"crawled": 5,
"pending": 0,
"pendingPages": [],
"total": 5,
"failed": 0,
"limit": {"max": 0, "hit": False},
}
with open("/output/warc2zim.json") as fh:
assert json.loads(fh.read()) == {
"written": 7,
"total": 7,
}
with open("/output/stats.json") as fh:
assert json.loads(fh.read()) == {
"done": 7,
"total": 7,
"limit": {"max": 0, "hit": False},
}

View file

@ -1,14 +0,0 @@
import pytest
from zimit import zimit as app
"""
cleanup disabled because atexit hooks run at the very end of the Python process
shutdown. By the time cleanup() is called, the logging module has already closed its
file streams.
"""
@pytest.fixture(autouse=True)
def disable_zimit_cleanup(monkeypatch):
monkeypatch.setattr(app, "cleanup", lambda: None)

Binary file not shown.

View file

@ -1,83 +0,0 @@
import pathlib
import pytest
from zimit.zimit import run
TEST_DATA_DIR = pathlib.Path(__file__).parent / "data"
def test_overwrite_flag_behaviour(tmp_path):
    """Check that an existing ZIM file is only replaced when --overwrite is passed.

    zimit is run four times against the same ZIM file inside ``tmp_path``:

    1. with a WARC input: the ZIM file gets created;
    2. same arguments again: must exit with code 2;
    3. without ``--warcs`` (and still without ``--overwrite``): must exit with code 2;
    4. with a WARC input and ``--overwrite``: must succeed.
    """
    zim_output = "overwrite-test.zim"
    output_path = tmp_path / zim_output

    # Argument fragments shared by all runs; they are concatenated in the same
    # order for every invocation (seeds, warcs, output/zim-file/name, overwrite).
    seed_args = ["--seeds", "https://example.com"]
    warc_args = ["--warcs", str(TEST_DATA_DIR / "example-response.warc")]
    zim_args = [
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_output,
        "--name",
        "overwrite-test",
    ]

    # 1st run → creates file
    result = run([*seed_args, *warc_args, *zim_args])
    assert result in (None, 100)
    assert output_path.exists()

    # 2nd run, same arguments, no overwrite → should fail
    with pytest.raises(SystemExit) as exc:
        run([*seed_args, *warc_args, *zim_args])
    assert exc.value.code == 2

    # 3rd run, no --warcs and no overwrite → should fail as well
    # NOTE(review): presumably the existing-file check happens before any crawl
    # is started, so no network access is needed here - confirm.
    with pytest.raises(SystemExit) as exc:
        run([*seed_args, *zim_args])
    assert exc.value.code == 2

    # 4th run, with overwrite → should succeed
    result = run([*seed_args, *warc_args, *zim_args, "--overwrite"])
    assert result in (None, 100)
    assert output_path.exists()