From 15b72022cebf42143dda2eb32b6508b4576bcc5f Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 7 Nov 2024 10:03:03 +0000 Subject: [PATCH 01/65] Prepare for 2.1.7 --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- src/zimit/__about__.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6beb584..cdd3fd2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). +## [Unreleased] + ## [2.1.6] - 2024-11-07 ### Changed diff --git a/pyproject.toml b/pyproject.toml index b213161..ffedf8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim==2.1.3", + "warc2zim @ git+https://github.com/openzim/warc2zim@main", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index edc60b3..63e60b5 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "2.1.6" +__version__ = "2.1.7-dev0" From bfa226bf81da7c5c7c1a624f22f3063abd5058c9 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 8 Nov 2024 14:22:35 +0000 Subject: [PATCH 02/65] Properly exit with code --- src/zimit/zimit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 44c6d4f..ec989f1 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -685,7 +685,7 @@ def sigint_handler(*args): # noqa: ARG001 def zimit(): - run(sys.argv[1:]) + sys.exit(run(sys.argv[1:])) signal.signal(signal.SIGINT, sigint_handler) From 16a4f8d4d830932632999109a9965c9c6f108277 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 15 Nov 2024 15:46:40 +0000 Subject: [PATCH 03/65] Upgrade to browsertrix crawler 1.4.0-beta.0 --- CHANGELOG.md | 4 ++++ Dockerfile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cdd3fd2..20dce9f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Upgrade to browsertrix crawler 1.4.0-beta.0 (#434) + ## [2.1.6] - 2024-11-07 ### Changed diff --git a/Dockerfile b/Dockerfile index bac1b30..67e0e18 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.3.5 +FROM webrecorder/browsertrix-crawler:1.4.0-beta.0 LABEL org.opencontainers.image.source https://github.com/openzim/zimit RUN apt-get update \ From 00d2433383c32e539a84ac36a19746d6aaf9c6a0 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 9 Jan 2025 09:06:08 +0000 Subject: [PATCH 04/65] Upgrade to browsertrix crawler 1.4.2 --- CHANGELOG.md | 2 +- Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 20dce9f..0711aea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Upgrade to browsertrix crawler 1.4.0-beta.0 (#434) +- Upgrade to browsertrix crawler 1.4.2 (#450) ## [2.1.6] - 2024-11-07 diff --git a/Dockerfile b/Dockerfile index 67e0e18..9b304d0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.4.0-beta.0 
+FROM webrecorder/browsertrix-crawler:1.4.2 LABEL org.opencontainers.image.source https://github.com/openzim/zimit RUN apt-get update \ From 8d42a8dd93ba12aeb75d6294ff9d55ed51e6de5a Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 9 Jan 2025 10:41:05 +0000 Subject: [PATCH 05/65] Move integration tests to test website --- .github/workflows/Tests.yaml | 2 +- tests-integration/integration.py | 83 ++++++++++++++++++-------------- 2 files changed, 49 insertions(+), 36 deletions(-) diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml index 9e21fa7..592a5aa 100644 --- a/.github/workflows/Tests.yaml +++ b/.github/workflows/Tests.yaml @@ -63,7 +63,7 @@ jobs: run: docker run -v $PWD/output:/output zimit zimit --help - name: run crawl - run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep + run: docker run -v $PWD/output:/output zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep - name: run integration test suite run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 16ab337..9d37b0f 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -1,6 +1,7 @@ import glob import json import os +from pathlib import Path from warcio import ArchiveIterator from zimscraperlib.zim import Archive @@ -8,23 +9,26 @@ from zimscraperlib.zim import Archive def test_is_file(): """Ensure ZIM file exists""" - assert os.path.isfile("/output/isago.zim") + assert os.path.isfile("/output/tests_en_onepage.zim") def test_zim_main_page(): - """Main page specified, http://isago.rskg.org/, was a redirect to https + """Main page specified, http://website.test.openzim.org/http-return-codes.html, + was a redirect to https Ensure main page is the redirected page""" - main_entry = Archive("/output/isago.zim").main_entry + main_entry = Archive("/output/tests_en_onepage.zim").main_entry assert main_entry.is_redirect - assert main_entry.get_redirect_entry().path == "isago.rskg.org/" + assert ( + main_entry.get_redirect_entry().path + == "website.test.openzim.org/http-return-codes.html" + ) def test_zim_scraper(): - """Main page specified, http://isago.rskg.org/, was a redirect to https - Ensure main page is the redirected page""" + """Check content of scraper metadata""" - zim_fh = Archive("/output/isago.zim") + zim_fh = Archive("/output/tests_en_onepage.zim") scraper = zim_fh.get_text_metadata("Scraper") assert "zimit " in scraper assert "warc2zim " in scraper @@ -33,18 +37,28 @@ def test_zim_scraper(): def test_files_list(): """Check that expected files are present in the ZIM at proper path""" - zim_fh = Archive("/output/isago.zim") + zim_fh = Archive("/output/tests_en_onepage.zim") for expected_entry in [ "_zim_static/__wb_module_decl.js", "_zim_static/wombat.js", "_zim_static/wombatSetup.js", - "isago.rskg.org/", - "isago.rskg.org/a-propos", - "isago.rskg.org/conseils", - "isago.rskg.org/faq", - "isago.rskg.org/static/favicon256.png", - "isago.rskg.org/static/tarifs-isago.pdf", - 
"maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css", + "website.test.openzim.org/http-return-codes.html", + "website.test.openzim.org/200-response", + "website.test.openzim.org/201-response", + "website.test.openzim.org/202-response", + "website.test.openzim.org/301-external-redirect-ok", + "website.test.openzim.org/301-internal-redirect-ok", + "website.test.openzim.org/302-external-redirect-ok", + "website.test.openzim.org/302-internal-redirect-ok", + "website.test.openzim.org/307-external-redirect-ok", + "website.test.openzim.org/307-internal-redirect-ok", + "website.test.openzim.org/308-external-redirect-ok", + "website.test.openzim.org/308-internal-redirect-ok", + "website.test.openzim.org/http-return-codes.html", + "website.test.openzim.org/icons/favicon.ico", + "website.test.openzim.org/icons/site.webmanifest", + "website.test.openzim.org/internal_redirect_target.html", + "www.example.com/", ]: assert zim_fh.get_content(expected_entry) @@ -72,23 +86,22 @@ def test_user_agent(): def test_stats_output(): - with open("/output/crawl.json") as fh: - assert json.loads(fh.read()) == { - "crawled": 5, - "pending": 0, - "pendingPages": [], - "total": 5, - "failed": 0, - "limit": {"max": 0, "hit": False}, - } - with open("/output/warc2zim.json") as fh: - assert json.loads(fh.read()) == { - "written": 7, - "total": 7, - } - with open("/output/stats.json") as fh: - assert json.loads(fh.read()) == { - "done": 7, - "total": 7, - "limit": {"max": 0, "hit": False}, - } + assert json.loads(Path("/output/crawl.json").read_bytes()) == { + "crawled": 35, + "pending": 0, + "pendingPages": [], + "total": 35, + "failed": 18, + "limit": {"max": 0, "hit": False}, + } + + assert json.loads(Path("/output/warc2zim.json").read_bytes()) == { + "written": 8, + "total": 8, + } + + assert json.loads(Path("/output/stats.json").read_bytes()) == { + "done": 8, + "total": 8, + "limit": {"max": 0, "hit": False}, + } From 97ea6dfd7b8d74026e859a49c1681af88e1cbcb8 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 9 Jan 2025 10:41:22 +0000 Subject: [PATCH 06/65] Fix Docker label to follow new convention --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 9b304d0..9d88f45 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ FROM webrecorder/browsertrix-crawler:1.4.2 -LABEL org.opencontainers.image.source https://github.com/openzim/zimit +LABEL org.opencontainers.image.source=https://github.com/openzim/zimit RUN apt-get update \ && apt-get install -qqy --no-install-recommends \ From 14670d4c6959f2c55851c2f9df4df0505c930d0b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 10 Jan 2025 10:24:47 +0000 Subject: [PATCH 07/65] Release 2.1.7 --- CHANGELOG.md | 3 ++- pyproject.toml | 5 +---- src/zimit/__about__.py | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0711aea..d0e9044 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,12 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). 
-## [Unreleased] +## [2.1.7] - 2024-01-10 ### Changed - Upgrade to browsertrix crawler 1.4.2 (#450) +- Upgrade to warc2zim 2.2.0 ## [2.1.6] - 2024-11-07 diff --git a/pyproject.toml b/pyproject.toml index ffedf8b..e522b95 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,16 +11,13 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim @ git+https://github.com/openzim/warc2zim@main", + "warc2zim==2.2.0", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] [tool.hatch.metadata.hooks.openzim-metadata] kind = "scraper" -[tool.hatch.metadata] -allow-direct-references = true # to be removed once we use a released warc2zim version - [project.optional-dependencies] scripts = [ "invoke==2.2.0", diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 63e60b5..b6d2a0c 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "2.1.7-dev0" +__version__ = "2.1.7" From 4835adbdd7c7d43225a62f86d0128285af2ea3d4 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 10 Jan 2025 12:41:01 +0000 Subject: [PATCH 08/65] Prepare for 2.1.8 --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- src/zimit/__about__.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0e9044..c437da8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). +## [Unreleased] + ## [2.1.7] - 2024-01-10 ### Changed diff --git a/pyproject.toml b/pyproject.toml index e522b95..1bc23a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim==2.2.0", + "warc2zim @ git+https://github.com/openzim/warc2zim@main", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index b6d2a0c..72f5b42 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "2.1.7" +__version__ = "2.1.8-dev0" From 0cb84f212677e1794913489cb6aefce7715d283e Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 10 Jan 2025 12:41:01 +0000 Subject: [PATCH 09/65] Prepare for 2.1.8 --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 1bc23a7..ffedf8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,9 @@ dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] [tool.hatch.metadata.hooks.openzim-metadata] kind = "scraper" +[tool.hatch.metadata] +allow-direct-references = true # to be removed once we use a released warc2zim version + [project.optional-dependencies] scripts = [ "invoke==2.2.0", From 0f136d2f2f49d583d0768b0e46dc763afb5c5169 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 4 Feb 2025 15:12:49 +0000 Subject: [PATCH 10/65] Upgrade Python 3.13, Crawler 1.5.0 and others --- .pre-commit-config.yaml | 8 ++++---- CHANGELOG.md | 4 ++++ Dockerfile | 9 ++++++--- pyproject.toml | 26 +++++++++++++------------- tests-daily/Dockerfile | 8 ++++---- tests-integration/integration.py | 6 +++--- 6 files changed, 34 insertions(+), 27 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4f91d0b..b362d62 100644 --- a/.pre-commit-config.yaml +++ 
b/.pre-commit-config.yaml @@ -2,20 +2,20 @@ # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - repo: https://github.com/psf/black - rev: "24.10.0" + rev: "25.1.0" hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.9 + rev: v0.9.4 hooks: - id: ruff - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.383 + rev: v1.1.393 hooks: - id: pyright name: pyright (system) diff --git a/CHANGELOG.md b/CHANGELOG.md index c437da8..4033a33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Upgrade to browsertrix crawler 1.5.0, Python 3.13 and others (#462) + ## [2.1.7] - 2024-01-10 ### Changed diff --git a/Dockerfile b/Dockerfile index 9d88f45..d2854dc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,16 @@ -FROM webrecorder/browsertrix-crawler:1.4.2 +FROM webrecorder/browsertrix-crawler:1.5.0 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit +# add deadsnakes ppa for latest Python on Ubuntu +RUN add-apt-repository ppa:deadsnakes/ppa -y + RUN apt-get update \ && apt-get install -qqy --no-install-recommends \ libmagic1 \ - python3.12-venv \ + python3.13-venv \ && rm -rf /var/lib/apt/lists/* \ # python setup (in venv not to conflict with browsertrix) - && python3.12 -m venv /app/zimit \ + && python3.13 -m venv /app/zimit \ # placeholder (default output location) && mkdir -p /output \ # disable chrome upgrade diff --git a/pyproject.toml b/pyproject.toml index ffedf8b..e4e7696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,10 @@ [build-system] -requires = ["hatchling", "hatch-openzim==0.2.0"] +requires = ["hatchling", "hatch-openzim"] build-backend = "hatchling.build" [project] name = "zimit" -requires-python = ">=3.12,<3.13" +requires-python = ">=3.13,<3.14" description = "Make ZIM file from any website through crawling" readme = "README.md" dependencies = [ @@ -26,20 +26,20 @@ scripts = [ "invoke==2.2.0", ] lint = [ - "black==24.10.0", - "ruff==0.6.9", + "black==25.1.0", + "ruff==0.9.4", ] check = [ - "pyright==1.1.383", + "pyright==1.1.393", ] test = [ - "pytest==8.3.3", - "coverage==7.6.1", + "pytest==8.3.4", + "coverage==7.6.10", ] dev = [ - "pre-commit==4.0.0", - "debugpy==1.8.6", - "selenium==4.25.0", # used in daily tests, convenient for dev purpose (autocompletion) + "pre-commit==4.1.0", + "debugpy==1.8.12", + "selenium==4.28.1", # used in daily tests, convenient for dev purpose (autocompletion) "zimit[scripts]", "zimit[lint]", "zimit[test]", @@ -95,10 +95,10 @@ all = "inv checkall --args '{args}'" [tool.black] line-length = 88 -target-version = ['py312'] +target-version = ['py313'] [tool.ruff] -target-version = "py312" +target-version = "py313" line-length = 88 src = ["src"] @@ -221,5 +221,5 @@ exclude_lines = [ include = ["src", "tests", "tasks.py"] exclude = [".env/**", ".venv/**"] extraPaths = ["src"] -pythonVersion = "3.12" +pythonVersion = "3.13" typeCheckingMode="basic" diff --git a/tests-daily/Dockerfile b/tests-daily/Dockerfile index f6118fe..22d45ef 100644 --- a/tests-daily/Dockerfile +++ b/tests-daily/Dockerfile @@ -1,5 +1,5 @@ # Let's extract kiwix-tools as usual on alpine temporary build container -FROM alpine:3.18 as kiwix-serve +FROM alpine:3.21 as kiwix-serve LABEL org.opencontainers.image.source 
https://github.com/openzim/kiwix-tools # TARGETPLATFORM is injected by docker build @@ -30,7 +30,7 @@ RUN set -e && \ curl -k -L $url | tar -xz -C /kiwix-serve --strip-components 1 # Build real "workload" container -FROM python:3.12-slim-bookworm +FROM python:3.13-slim-bookworm # Add kiwix-serve COPY --from=kiwix-serve /kiwix-serve /usr/local/bin @@ -70,6 +70,6 @@ RUN rm /tmp/chrome-linux64.zip /tmp/chromedriver-linux64.zip /tmp/versions.json RUN \ python -m pip install --no-cache-dir -U \ pip \ - selenium==4.23.0 \ - pytest==8.2.2 \ + selenium==4.28.1 \ + pytest==8.3.4 \ && mkdir -p /work diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 9d37b0f..55fcecb 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -17,7 +17,7 @@ def test_zim_main_page(): was a redirect to https Ensure main page is the redirected page""" - main_entry = Archive("/output/tests_en_onepage.zim").main_entry + main_entry = Archive(Path("/output/tests_en_onepage.zim")).main_entry assert main_entry.is_redirect assert ( main_entry.get_redirect_entry().path @@ -28,7 +28,7 @@ def test_zim_main_page(): def test_zim_scraper(): """Check content of scraper metadata""" - zim_fh = Archive("/output/tests_en_onepage.zim") + zim_fh = Archive(Path("/output/tests_en_onepage.zim")) scraper = zim_fh.get_text_metadata("Scraper") assert "zimit " in scraper assert "warc2zim " in scraper @@ -37,7 +37,7 @@ def test_zim_scraper(): def test_files_list(): """Check that expected files are present in the ZIM at proper path""" - zim_fh = Archive("/output/tests_en_onepage.zim") + zim_fh = Archive(Path("/output/tests_en_onepage.zim")) for expected_entry in [ "_zim_static/__wb_module_decl.js", "_zim_static/wombat.js", From 9396cf1ca05ab73a1c6ee001328a6420c4fd9385 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 6 Feb 2025 13:38:10 +0000 Subject: [PATCH 11/65] Alter crawl statistics following 1.5.0 release --- tests-integration/integration.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 55fcecb..b757e3d 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -87,11 +87,12 @@ def test_user_agent(): def test_stats_output(): assert json.loads(Path("/output/crawl.json").read_bytes()) == { - "crawled": 35, + "crawled": 17, "pending": 0, "pendingPages": [], - "total": 35, - "failed": 18, + "total": 17, + "failed": 1, + "failedWillRetry": 17, "limit": {"max": 0, "hit": False}, } From 4ef9a0d380ae992448151baeafa888befeb2f446 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 6 Feb 2025 21:11:40 +0000 Subject: [PATCH 12/65] Remove support for ARM64, this is not working anymore and was painfully slow --- .github/workflows/Publish.yml | 1 - .github/workflows/PublishDockerDevImage.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/.github/workflows/Publish.yml b/.github/workflows/Publish.yml index b6660d0..6fabc32 100644 --- a/.github/workflows/Publish.yml +++ b/.github/workflows/Publish.yml @@ -26,4 +26,3 @@ jobs: repo_overview: auto platforms: | linux/amd64 - linux/arm64 diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml index 5e2431e..d893882 100644 --- a/.github/workflows/PublishDockerDevImage.yaml +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -28,4 +28,3 @@ jobs: repo_overview: auto platforms: | linux/amd64 - linux/arm64 From cea10bd3b5cb3866f1c18680fbe0ce307391e367 Mon Sep 17 00:00:00 
2001 From: benoit74 Date: Thu, 6 Feb 2025 21:17:46 +0000 Subject: [PATCH 13/65] Add second build job on native arch for ARM64 --- .github/workflows/Publish.yml | 28 ++++++++++++++++++-- .github/workflows/PublishDockerDevImage.yaml | 28 ++++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/.github/workflows/Publish.yml b/.github/workflows/Publish.yml index 6fabc32..0b22e1c 100644 --- a/.github/workflows/Publish.yml +++ b/.github/workflows/Publish.yml @@ -5,8 +5,9 @@ on: types: [published] jobs: - publish: - runs-on: ubuntu-22.04 + publish-amd64: + runs-on: ubuntu-24.04 + name: "Publish for AMD64" steps: - uses: actions/checkout@v4 @@ -26,3 +27,26 @@ jobs: repo_overview: auto platforms: | linux/amd64 + + publish-arm64: + runs-on: ubuntu-24.04-arm + name: "Publish for ARM64" + + steps: + - uses: actions/checkout@v4 + + - name: Build and push Docker image + uses: openzim/docker-publish-action@v10 + with: + image-name: openzim/zimit + tag-pattern: /^v([0-9.]+)$/ + latest-on-tag: true + restrict-to: openzim/zimit + registries: ghcr.io + credentials: + GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + repo_description: auto + repo_overview: auto + platforms: | + linux/arm64 diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml index d893882..61c9140 100644 --- a/.github/workflows/PublishDockerDevImage.yaml +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -7,8 +7,9 @@ on: workflow_dispatch: jobs: - publish: - runs-on: ubuntu-22.04 + publish-amd64: + runs-on: ubuntu-24.04 + name: "Publish for AMD64" steps: - uses: actions/checkout@v4 @@ -28,3 +29,26 @@ jobs: repo_overview: auto platforms: | linux/amd64 + + publish-arm64: + runs-on: ubuntu-24.04-arm64 + name: "Publish for ARM64" + + steps: + - uses: actions/checkout@v4 + + - name: Build and push Docker image + uses: openzim/docker-publish-action@v10 + with: + image-name: openzim/zimit + manual-tag: dev + latest-on-tag: false + restrict-to: openzim/zimit + registries: ghcr.io + credentials: + GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + repo_description: auto + repo_overview: auto + platforms: | + linux/arm64 From b4c0495f48a325544a59c03fd239a83c14c8c02c Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 6 Feb 2025 21:19:08 +0000 Subject: [PATCH 14/65] Fix arm runner selector --- .github/workflows/Publish.yml | 2 +- .github/workflows/PublishDockerDevImage.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/Publish.yml b/.github/workflows/Publish.yml index 0b22e1c..7121cb7 100644 --- a/.github/workflows/Publish.yml +++ b/.github/workflows/Publish.yml @@ -29,7 +29,7 @@ jobs: linux/amd64 publish-arm64: - runs-on: ubuntu-24.04-arm + runs-on: ubuntu-24.04 name: "Publish for ARM64" steps: diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml index 61c9140..05a8ae1 100644 --- a/.github/workflows/PublishDockerDevImage.yaml +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -31,7 +31,7 @@ jobs: linux/amd64 publish-arm64: - runs-on: ubuntu-24.04-arm64 + runs-on: ubuntu-24.04-arm name: "Publish for ARM64" steps: From 5af981c01c6f7f65b43fc13204884ebfc58fdaa8 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 7 Feb 2025 08:07:23 +0000 Subject: [PATCH 15/65] Remove ARM64 job temporarily, still not working --- .github/workflows/Publish.yml | 49 ++++++++++---------- .github/workflows/PublishDockerDevImage.yaml 
| 47 ++++++++++--------- 2 files changed, 49 insertions(+), 47 deletions(-) diff --git a/.github/workflows/Publish.yml b/.github/workflows/Publish.yml index 7121cb7..1ddb343 100644 --- a/.github/workflows/Publish.yml +++ b/.github/workflows/Publish.yml @@ -5,7 +5,7 @@ on: types: [published] jobs: - publish-amd64: + publish-amd64: runs-on: ubuntu-24.04 name: "Publish for AMD64" @@ -20,7 +20,7 @@ jobs: latest-on-tag: true restrict-to: openzim/zimit registries: ghcr.io - credentials: + credentials: | GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} repo_description: auto @@ -28,25 +28,26 @@ jobs: platforms: | linux/amd64 - publish-arm64: - runs-on: ubuntu-24.04 - name: "Publish for ARM64" - - steps: - - uses: actions/checkout@v4 - - - name: Build and push Docker image - uses: openzim/docker-publish-action@v10 - with: - image-name: openzim/zimit - tag-pattern: /^v([0-9.]+)$/ - latest-on-tag: true - restrict-to: openzim/zimit - registries: ghcr.io - credentials: - GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} - GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} - repo_description: auto - repo_overview: auto - platforms: | - linux/arm64 + # Disabled for now, see https://github.com/openzim/zimit/issues/463 + # publish-arm64: + # runs-on: ubuntu-24.04 + # name: "Publish for ARM64" + # + # steps: + # - uses: actions/checkout@v4 + # + # - name: Build and push Docker image + # uses: openzim/docker-publish-action@v10 + # with: + # image-name: openzim/zimit + # tag-pattern: /^v([0-9.]+)$/ + # latest-on-tag: true + # restrict-to: openzim/zimit + # registries: ghcr.io + # credentials: | + # GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + # GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + # repo_description: auto + # repo_overview: auto + # platforms: | + # linux/arm64 diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml index 05a8ae1..1cbecea 100644 --- a/.github/workflows/PublishDockerDevImage.yaml +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -22,7 +22,7 @@ jobs: latest-on-tag: false restrict-to: openzim/zimit registries: ghcr.io - credentials: + credentials: | GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} repo_description: auto @@ -30,25 +30,26 @@ jobs: platforms: | linux/amd64 - publish-arm64: - runs-on: ubuntu-24.04-arm - name: "Publish for ARM64" - - steps: - - uses: actions/checkout@v4 - - - name: Build and push Docker image - uses: openzim/docker-publish-action@v10 - with: - image-name: openzim/zimit - manual-tag: dev - latest-on-tag: false - restrict-to: openzim/zimit - registries: ghcr.io - credentials: - GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} - GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} - repo_description: auto - repo_overview: auto - platforms: | - linux/arm64 + # Disabled for now, see https://github.com/openzim/zimit/issues/463 + # publish-arm64: + # runs-on: ubuntu-24.04-arm + # name: "Publish for ARM64" + # + # steps: + # - uses: actions/checkout@v4 + # + # - name: Build and push Docker image + # uses: openzim/docker-publish-action@v10 + # with: + # image-name: openzim/zimit + # manual-tag: dev + # latest-on-tag: false + # restrict-to: openzim/zimit + # registries: ghcr.io + # credentials: | + # GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }} + # GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }} + # repo_description: auto + # repo_overview: auto + # platforms: | + # linux/arm64 From 6ec53f774f009e436beab98fadfbe7620a1f61fc Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 7 Feb 2025 08:24:02 
+0000 Subject: [PATCH 16/65] Upgrade to Browsertrix Crawler 1.5.1 --- CHANGELOG.md | 2 +- Dockerfile | 2 +- tests-integration/integration.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4033a33..3ac740a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Upgrade to browsertrix crawler 1.5.0, Python 3.13 and others (#462) +- Upgrade to browsertrix crawler 1.5.1, Python 3.13 and others (#462) ## [2.1.7] - 2024-01-10 diff --git a/Dockerfile b/Dockerfile index d2854dc..6a14b59 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.5.0 +FROM webrecorder/browsertrix-crawler:1.5.1 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit # add deadsnakes ppa for latest Python on Ubuntu diff --git a/tests-integration/integration.py b/tests-integration/integration.py index b757e3d..50cfa00 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -91,8 +91,7 @@ def test_stats_output(): "pending": 0, "pendingPages": [], "total": 17, - "failed": 1, - "failedWillRetry": 17, + "failed": 18, "limit": {"max": 0, "hit": False}, } From a7e1026b2ec17069056aaacf048a5b11a52ce6f1 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 7 Feb 2025 08:38:20 +0000 Subject: [PATCH 17/65] Pin warc2zim for release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e4e7696..50298f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim @ git+https://github.com/openzim/warc2zim@main", + "warc2zim==2.2.1", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] From d228e9f346625bc87ac1c9e48d5a4f5616d60ad7 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 7 Feb 2025 08:57:09 +0000 Subject: [PATCH 18/65] Release 2.1.8 --- CHANGELOG.md | 4 ++-- src/zimit/__about__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ac740a..2c1d1f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,11 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). -## [Unreleased] +## [2.1.8] - 2024-02-07 ### Changed -- Upgrade to browsertrix crawler 1.5.1, Python 3.13 and others (#462) +- Upgrade to browsertrix crawler 1.5.1, Python 3.13 and others (#462 + #464) ## [2.1.7] - 2024-01-10 diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 72f5b42..c377db3 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "2.1.8-dev0" +__version__ = "2.1.8" From 8b4b18bfb79002fc4b4bc475d0b2683868e4a285 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 7 Feb 2025 08:59:54 +0000 Subject: [PATCH 19/65] Prepare for 2.1.9 --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- src/zimit/__about__.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c1d1f5..6387cbd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project are documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). +## [Unreleased] + ## [2.1.8] - 2024-02-07 ### Changed diff --git a/pyproject.toml b/pyproject.toml index 50298f0..e4e7696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim==2.2.1", + "warc2zim @ git+https://github.com/openzim/warc2zim@main", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index c377db3..4b55b6b 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "2.1.8" +__version__ = "2.1.9-dev0" From 3a7f583a96ed70e79f85c53c24567edcb6e37190 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 7 Feb 2025 08:59:54 +0000 Subject: [PATCH 20/65] Upgrade to Browsertrix Crawler 1.5.3 Include restore of total number of pages, following upstream fix. --- Dockerfile | 2 +- tests-integration/integration.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6a14b59..c37dfcc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.5.1 +FROM webrecorder/browsertrix-crawler:1.5.3 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit # add deadsnakes ppa for latest Python on Ubuntu diff --git a/tests-integration/integration.py b/tests-integration/integration.py index 50cfa00..d9bfc94 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -90,7 +90,7 @@ def test_stats_output(): "crawled": 17, "pending": 0, "pendingPages": [], - "total": 17, + "total": 35, "failed": 18, "limit": {"max": 0, "hit": False}, } From 101fb71a0bdd0cc308de060f59c83a32ad5cdbaf Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 11 Feb 2025 16:57:19 +0000 Subject: [PATCH 21/65] Better processing of crawler exit codes with soft/hard limits --- .github/workflows/Tests.yaml | 22 ++++-- CHANGELOG.md | 8 +++ src/zimit/constants.py | 3 +- src/zimit/zimit.py | 115 +++++++++++++++++++++---------- tests-integration/integration.py | 46 +++++++++++-- 5 files changed, 149 insertions(+), 45 deletions(-) diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml index 592a5aa..601f9ba 100644 --- a/.github/workflows/Tests.yaml +++ b/.github/workflows/Tests.yaml @@ -57,13 +57,25 @@ jobs: uses: actions/checkout@v4 - name: build image - run: docker build -t zimit . + run: docker build -t local-zimit . 
- name: ensure help display without issue - run: docker run -v $PWD/output:/output zimit zimit --help + run: docker run -v $PWD/output:/output local-zimit zimit --help - - name: run crawl - run: docker run -v $PWD/output:/output zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep + - name: run crawl with soft size limit + run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizesoftlimit.json + + - name: run crawl with hard size limit + run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizehardlimit.json || true + + - name: run crawl with soft time limit + run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timesoftlimit.json + + - name: run crawl with hard time limit + run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timehardlimit.json || true + + - name: run standard crawl + run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep - name: run integration test suite - run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" + run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" diff --git a/CHANGELOG.md b/CHANGELOG.md index 6387cbd..f6d7044 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Change solution to report partial ZIM to the Zimfarm and other clients (#304) + +### Fixed + +- Do not create the ZIM when crawl is incomplete (#444) + ## [2.1.8] - 2024-02-07 ### Changed diff --git a/src/zimit/constants.py b/src/zimit/constants.py index f81905a..35baeb9 100644 --- a/src/zimit/constants.py +++ b/src/zimit/constants.py @@ -3,7 +3,8 @@ import logging from zimscraperlib.logging import getLogger EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2 -EXIT_CODE_CRAWLER_LIMIT_HIT = 11 +EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT = 14 +EXIT_CODE_CRAWLER_TIME_LIMIT_HIT = 15 NORMAL_WARC2ZIM_EXIT_CODE = 100 REQUESTS_TIMEOUT = 10 diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index ec989f1..416bec9 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -25,7 +25,8 @@ from 
zimscraperlib.uri import rebuild_uri
 
 from zimit.__about__ import __version__
 from zimit.constants import (
-    EXIT_CODE_CRAWLER_LIMIT_HIT,
+    EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT,
+    EXIT_CODE_CRAWLER_TIME_LIMIT_HIT,
     EXIT_CODE_WARC2ZIM_CHECK_FAILED,
     NORMAL_WARC2ZIM_EXIT_CODE,
     logger,
@@ -61,35 +62,19 @@ class ProgressFileWatcher:
         self.process.daemon = True
         self.process.start()
 
-    @staticmethod
-    def inotify_watcher(crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
+    def inotify_watcher(self, crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
         ino = inotify.adapters.Inotify()
         ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)  # pyright: ignore
         ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)  # pyright: ignore
 
-        class Limit:
-            def __init__(self):
-                self.max = self.hit = None
-
-            @property
-            def as_dict(self):
-                return {"max": self.max, "hit": self.hit}
-
-        # limit is only reported by crawl but needs to be reported up
-        limit = Limit()
-
-        def crawl_conv(data, limit):
+        def crawl_conv(data):
             # we consider crawl to be 90% of the workload so total = crawl_total * 90%
-            # limit = {"max": data["limit"]["max"], "hit": data["limit"]["hit"]}
-            limit.max = data["limit"]["max"]
-            limit.hit = data["limit"]["hit"]
             return {
                 "done": data["crawled"],
                 "total": int(data["total"] / 0.9),
-                "limit": limit.as_dict,
             }
 
-        def warc2zim_conv(data, limit):
+        def warc2zim_conv(data):
             # we consider warc2zim to be 10% of the workload so
             # warc2zim_total = 10% and total = 90 + warc2zim_total * 10%
             return {
@@ -98,7 +83,6 @@ class ProgressFileWatcher:
                     * (0.9 + (float(data["written"]) / data["total"]) / 10)
                 ),
                 "total": data["total"],
-                "limit": limit.as_dict,
             }
 
         for _, _, fpath, _ in ino.event_gen(yield_nones=False):  # pyright: ignore
@@ -108,7 +92,7 @@
             # open input and output separately as to not clear output on error
             with open(fpath) as ifh:
                 try:
-                    out = func(json.load(ifh), limit)
+                    out = func(json.load(ifh))
                 except Exception:  # nosec # noqa: S112
                     # simply ignore progress update should an error arise
                     # might be malformed input for instance
@@ -278,9 +262,17 @@ def run(raw_args):
         "directory",
     )
 
-    parser.add_argument(
-        "--sizeLimit",
-        help="If set, save state and exit if size limit exceeds this value",
+    size_group = parser.add_mutually_exclusive_group()
+    size_group.add_argument(
+        "--sizeSoftLimit",
+        help="If set, save crawl state and stop crawl if WARC size exceeds this value. "
+        "ZIM will still be created.",
+        type=int,
+    )
+    size_group.add_argument(
+        "--sizeHardLimit",
+        help="If set, exit crawler and fail the scraper immediately if WARC size "
+        "exceeds this value",
         type=int,
     )
 
@@ -292,9 +284,17 @@ def run(raw_args):
         default=90,
     )
 
-    parser.add_argument(
-        "--timeLimit",
-        help="If set, save state and exit after time limit, in seconds",
+    time_group = parser.add_mutually_exclusive_group()
+    time_group.add_argument(
+        "--timeSoftLimit",
+        help="If set, save crawl state and stop crawl if WARC(s) creation takes "
+        "longer than this value, in seconds. ZIM will still be created.",
+        type=int,
+    )
+    time_group.add_argument(
+        "--timeHardLimit",
+        help="If set, exit crawler and fail the scraper immediately if WARC(s) creation"
+        " takes longer than this value, in seconds",
         type=int,
     )
 
@@ -369,6 +369,13 @@ def run(raw_args):
         "path/URLs separated by comma",
     )
 
+    parser.add_argument(
+        "--acceptable-crawler-exit-codes",
+        help="Non-zero crawler exit codes to consider as acceptable to continue with "
+        "conversion of WARC to ZIM. 
Flag partialZim will be set in statsFilename (if " + " used). Single value with individual error codes separated by comma", + ) + zimit_args, warc2zim_args = parser.parse_known_args(raw_args) # pass a scraper suffix to warc2zim so that both zimit and warc2zim versions are @@ -504,6 +511,8 @@ def run(raw_args): f"{'will keep' if zimit_args.keep else 'will delete'}" ) + partial_zim = False + # if warc files are passed, do not run browsertrix crawler but fetch the files if # they are provided as an HTTP URL + extract the archive if it is a tar.gz warc_files: list[Path] = [] @@ -568,10 +577,29 @@ def run(raw_args): logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") crawl = subprocess.run(cmd_args, check=False) - if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT: - logger.info("crawl interupted by a limit") + if ( + crawl.returncode == EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT + and zimit_args.sizeSoftLimit + ): + logger.info( + "Crawl size soft limit hit. Continuing with warc2zim conversion." + ) + if zimit_args.statsFilename: + partial_zim = True + elif ( + crawl.returncode == EXIT_CODE_CRAWLER_TIME_LIMIT_HIT + and zimit_args.timeSoftLimit + ): + logger.info( + "Crawl time soft limit hit. Continuing with warc2zim conversion." + ) + if zimit_args.statsFilename: + partial_zim = True elif crawl.returncode != 0: - raise subprocess.CalledProcessError(crawl.returncode, cmd_args) + logger.error( + f"Crawl returned an error: {crawl.returncode}, scraper exiting" + ) + return crawl.returncode if zimit_args.collection: warc_files = [ @@ -606,7 +634,15 @@ def run(raw_args): logger.info(f"Calling warc2zim with these args: {warc2zim_args}") - return warc2zim(warc2zim_args) + warc2zim_exit_code = warc2zim(warc2zim_args) + + if zimit_args.statsFilename: + stats = Path(zimit_args.statsFilename) + stats_content = json.loads(stats.read_bytes()) + stats_content["partialZim"] = partial_zim + stats.write_text(json.dumps(stats_content)) + + return warc2zim_exit_code def get_cleaned_url(url: str): @@ -646,9 +682,11 @@ def get_node_cmd_line(args): "behaviorTimeout", "delay", "profile", - "sizeLimit", + "sizeSoftLimit", + "sizeHardLimit", "diskUtilization", - "timeLimit", + "timeSoftLimit", + "timeHardLimit", "healthCheckPort", "overwrite", "config", @@ -668,7 +706,14 @@ def get_node_cmd_line(args): continue if value is None or (isinstance(value, bool) and value is False): continue - node_cmd.append("--" + arg) + node_cmd.append( + "--" + + ( + "sizeLimit" + if arg in ["sizeSoftLimit", "sizeHardLimit"] + else "timeLimit" if arg in ["timeSoftLimit", "timeHardLimit"] else arg + ) + ) if not isinstance(value, bool): node_cmd.append(str(value)) diff --git a/tests-integration/integration.py b/tests-integration/integration.py index d9bfc94..7e79f52 100644 --- a/tests-integration/integration.py +++ b/tests-integration/integration.py @@ -3,13 +3,34 @@ import json import os from pathlib import Path +import pytest from warcio import ArchiveIterator from zimscraperlib.zim import Archive -def test_is_file(): +@pytest.mark.parametrize( + "filename", + [ + pytest.param("/output/tests_en_onepage.zim", id="onepage"), + pytest.param("/output/tests_en_sizesoftlimit.zim", id="sizesoftlimit"), + pytest.param("/output/tests_en_timesoftlimit.zim", id="timesoftlimit"), + ], +) +def test_zim_created(filename): """Ensure ZIM file exists""" - assert os.path.isfile("/output/tests_en_onepage.zim") + assert os.path.isfile(filename) + + +@pytest.mark.parametrize( + "filename", + [ + pytest.param("/output/tests_en_sizehardlimit.zim", 
id="sizehardlimit"), + pytest.param("/output/tests_en_timehardlimit.zim", id="timehardlimit"), + ], +) +def test_zim_not_created(filename): + """Ensure ZIM file does not exists""" + assert not os.path.exists(filename) def test_zim_main_page(): @@ -85,7 +106,7 @@ def test_user_agent(): assert found -def test_stats_output(): +def test_stats_output_standard(): assert json.loads(Path("/output/crawl.json").read_bytes()) == { "crawled": 17, "pending": 0, @@ -103,5 +124,22 @@ def test_stats_output(): assert json.loads(Path("/output/stats.json").read_bytes()) == { "done": 8, "total": 8, - "limit": {"max": 0, "hit": False}, + "partialZim": False, } + + +@pytest.mark.parametrize( + "filename", + [ + pytest.param("/output/stats_sizesoftlimit.json", id="sizesoftlimit"), + pytest.param("/output/stats_timesoftlimit.json", id="timesoftlimit"), + ], +) +def test_stats_output_softlimit(filename): + file = Path(filename) + assert file.exists + content = json.loads(file.read_bytes()) + assert "done" in content + assert "total" in content + assert "partialZim" in content + assert content["partialZim"] From ee82837aaa12de19c6bcc3fdd1ea641f8bdb559b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 13 Feb 2025 13:18:06 +0000 Subject: [PATCH 22/65] Keep temporary folder when crawler or warc2zim fails, even if not asked for --- CHANGELOG.md | 1 + src/zimit/zimit.py | 34 +++++++++++++++++++++++++++------- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f6d7044..308058b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Change solution to report partial ZIM to the Zimfarm and other clients (#304) +- Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468) ### Fixed diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 416bec9..70dfdbd 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -33,6 +33,8 @@ from zimit.constants import ( ) from zimit.utils import download_file +temp_root_dir: Path | None = None + class ProgressFileWatcher: def __init__(self, output_dir: Path, stats_path: Path): @@ -103,6 +105,24 @@ class ProgressFileWatcher: json.dump(out, ofh) +def cleanup(): + if not temp_root_dir: + logger.warning("Temporary root dir not already set, cannot clean this up") + return + logger.info("") + logger.info("----------") + logger.info(f"Cleanup, removing temp dir: {temp_root_dir}") + shutil.rmtree(temp_root_dir) + + +def cancel_cleanup(): + logger.info( + f"Temporary files have been kept in {temp_root_dir}, please clean them" + " up manually once you don't need them anymore" + ) + atexit.unregister(cleanup) + + def run(raw_args): parser = ArgumentParser( description="Run a browser-based crawl on the specified URL and convert to ZIM" @@ -427,19 +447,13 @@ def run(raw_args): return EXIT_CODE_WARC2ZIM_CHECK_FAILED # make temp dir for this crawl + global temp_root_dir # noqa: PLW0603 if zimit_args.build: temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp")) else: temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) if not zimit_args.keep: - - def cleanup(): - logger.info("") - logger.info("----------") - logger.info(f"Cleanup, removing temp dir: {temp_root_dir}") - shutil.rmtree(temp_root_dir) - atexit.register(cleanup) # copy / download custom behaviors to one single folder and configure crawler @@ -599,6 +613,7 @@ def run(raw_args): logger.error( f"Crawl returned an error: 
{crawl.returncode}, scraper exiting" ) + cancel_cleanup() return crawl.returncode if zimit_args.collection: @@ -642,6 +657,11 @@ def run(raw_args): stats_content["partialZim"] = partial_zim stats.write_text(json.dumps(stats_content)) + # also call cancel_cleanup when --keep, even if it is not supposed to be registered, + # so that we will display temporary files location just like in other situations + if warc2zim_exit_code or zimit_args.keep: + cancel_cleanup() + return warc2zim_exit_code From b4ec60f31663e97ad3c4dc508e8632d6ce0a1472 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 13 Feb 2025 15:31:51 +0000 Subject: [PATCH 23/65] fixup! Keep temporary folder when crawler or warc2zim fails, even if not asked for --- README.md | 2 +- src/zimit/zimit.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9bfba9b..1598ead 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ The image accepts the following parameters, **as well as any of the [warc2zim](h - `--exclude ` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded. - `--workers N` - number of crawl workers to be run in parallel - `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example). -- `--keep` - if set, keep the WARC files in a temp directory inside the output directory +- `--keep` - in case of failure, WARC files and other temporary files (which are stored as a subfolder of output directory) are always kept, otherwise they are automatically deleted. Use this flag to always keep WARC files, even in case of success. Example command: diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 70dfdbd..49ead05 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -334,7 +334,10 @@ def run(raw_args): parser.add_argument( "--keep", - help="If set, keep WARC files after crawl, don't delete", + help="In case of failure, WARC files and other temporary files (which are " + "stored as a subfolder of output directory) are always kept, otherwise " + "they are automatically deleted. 
Use this flag to always keep WARC files, "
+        "even in case of success.",
         action="store_true",
     )

From dc6b5aafb70f61771bf209746bce9679e985743f Mon Sep 17 00:00:00 2001
From: benoit74
Date: Thu, 13 Feb 2025 15:14:53 +0000
Subject: [PATCH 24/65] Enhance support of Browsertrix Crawler arguments

---
 CHANGELOG.md       |   2 +
 src/zimit/zimit.py | 522 +++++++++++++++++++++++++++++++++++++++------
 2 files changed, 462 insertions(+), 62 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 308058b..3d20f59 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Change solution to report partial ZIM to the Zimfarm and other clients (#304)
 - Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
+- Add many missing Browsertrix Crawler arguments; drop default overrides by zimit; drop `--noMobileDevice` setting (not needed anymore) (#433)
+- Document all Browsertrix Crawler default argument values (#416)
 
 ### Fixed
 
diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index 49ead05..8634b71 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -129,6 +129,7 @@ def run(raw_args):
     )
 
     parser.add_argument("-u", "--url", help="The URL to start crawling from")
+
     parser.add_argument("--title", help="ZIM title")
     parser.add_argument("--description", help="ZIM description")
     parser.add_argument("--long-description", help="ZIM long description metadata")
@@ -138,52 +139,66 @@ def run(raw_args):
         help="If set, read a list of seed urls, one per line, from the specified",
     )
 
-    parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
+    parser.add_argument(
+        "-w", "--workers", type=int, help="Number of parallel workers. Default is 1."
+    )
+
+    parser.add_argument(
+        "--crawlId",
+        help="A user provided ID for this crawl or crawl configuration (can also be "
+        "set via CRAWL_ID env var, defaults to hostname)",
+    )
 
     parser.add_argument(
         "--waitUntil",
         help="Puppeteer page.goto() condition to wait for before continuing. One of "
         "load, domcontentloaded, networkidle0 or networkidle2, or a "
-        "comma-separated combination of those.",
-        default="load",
+        "comma-separated combination of those. Default is load,networkidle2",
     )
 
     parser.add_argument(
-        "--depth", help="The depth of the crawl for all seeds", type=int, default=-1
+        "--depth",
+        help="The depth of the crawl for all seeds. Default is -1.",
+        type=int,
     )
 
     parser.add_argument(
         "--extraHops",
-        help="Number of extra 'hops' to follow, beyond the current scope",
+        help="Number of extra 'hops' to follow, beyond the current scope. "
+        "Default is 0.",
         type=int,
     )
 
-    parser.add_argument("--limit", help="Limit crawl to this number of pages", type=int)
+    parser.add_argument(
+        "--limit",
+        help="Limit crawl to this number of pages. Default is 0 (no limit).",
+        type=int,
+    )
 
     parser.add_argument(
         "--maxPageLimit",
-        help="Maximum pages to crawl, overriding pageLimit if both are set",
+        help="Maximum pages to crawl, overriding pageLimit if both are set. Default is "
+        "0 (no limit)",
         type=int,
     )
 
     parser.add_argument(
         "--timeout",
-        help="Timeout for each page to load (in seconds)",
+        help="Timeout for each page to load (in seconds). Default is 90 secs.",
         type=int,
-        default=90,
     )
 
     parser.add_argument(
         "--scopeType",
         help="A predefined scope of the crawl. For more customization, "
-        "use 'custom' and set scopeIncludeRx regexes",
+        "use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom 
Default is custom" + "if scopeIncludeRx is set, prefix otherwise.", choices=["page", "page-spa", "prefix", "host", "domain", "any", "custom"], ) parser.add_argument( "--include", - help="Regex of page URLs that should be " - "included in the crawl (defaults to " + help="Regex of page URLs that should be included in the crawl (defaults to " "the immediate directory of URL)", ) @@ -192,48 +207,185 @@ def run(raw_args): help="Regex of page URLs that should be excluded from the crawl", ) - parser.add_argument( - "--collection", - help="Collection name to crawl to (replay will be accessible " - "under this name in pywb preview) instead of crawl-@ts", - ) - parser.add_argument( "--allowHashUrls", - help="Allow Hashtag URLs, useful for " - "single-page-application crawling or " - "when different hashtags load dynamic " - "content", + help="Allow Hashtag URLs, useful for single-page-application crawling or " + "when different hashtags load dynamic content", action="store_true", ) parser.add_argument( - "--lang", - help="if set, sets the language used by the browser, should be ISO 639 " - "language[-country] code", + "--selectLinks", + help="One or more selectors for extracting links, in the format " + "[css selector]->[property to use],[css selector]->@[attribute to use]", ) parser.add_argument( - "--zim-lang", - help="Language metadata of ZIM " - "(warc2zim --lang param). ISO-639-3 code. " - "Retrieved from homepage if found, fallback to `eng`", + "--clickSelector", + help="Selector for elements to click when using the autoclick behavior. Default" + " is 'a'", ) + parser.add_argument( + "--blockRules", + help="Additional rules for blocking certain URLs from being loaded, by URL " + "regex and optionally via text match in an iframe", + ) + + parser.add_argument( + "--blockMessage", + help="If specified, when a URL is blocked, a record with this error message is" + " added instead", + ) + + parser.add_argument( + "--blockAds", + help="If set, block advertisements from being loaded (based on Stephen Black's" + " blocklist). Note that some bad domains are also blocked by zimit" + " configuration even if this option is not set.", + ) + + parser.add_argument( + "--adBlockMessage", + help="If specified, when an ad is blocked, a record with this error message is" + " added instead", + ) + + parser.add_argument( + "--collection", + help="Collection name to crawl to (replay will be accessible " + "under this name in pywb preview). Default is crawl-@ts.", + ) + + parser.add_argument( + "--headless", + help="Run in headless mode, otherwise start xvfb", + action="store_true", + ) + + parser.add_argument( + "--driver", + help="Custom driver for the crawler, if any", + ) + + parser.add_argument( + "--generateCDX", + help="If set, generate index (CDXJ) for use with pywb after crawl is done", + action="store_true", + ) + + parser.add_argument( + "--combineWARC", + help="If set, combine the warcs", + action="store_true", + ) + + parser.add_argument( + "--rolloverSize", + help="If set, declare the rollover size. 
Default is 1000000000.", + type=int, + ) + + parser.add_argument( + "--generateWACZ", + help="If set, generate WACZ on disk", + action="store_true", + ) + + parser.add_argument( + "--logging", + help="Crawler logging configuration", + ) + + parser.add_argument( + "--logLevel", + help="Comma-separated list of log levels to include in logs", + ) + + parser.add_argument( + "--logContext", + help="Comma-separated list of contexts to include in logs", + choices=[ + "general", + "worker", + "recorder", + "recorderNetwork", + "writer", + "state", + "redis", + "storage", + "text", + "exclusion", + "screenshots", + "screencast", + "originOverride", + "healthcheck", + "browser", + "blocking", + "behavior", + "behaviorScript", + "jsError", + "fetch", + "pageStatus", + "memoryStatus", + "crawlStatus", + "links", + "sitemap", + "wacz", + "replay", + "proxy", + ], + ) + + parser.add_argument( + "--logExcludeContext", + help="Comma-separated list of contexts to NOT include in logs. Default is " + "recorderNetwork,jsError,screencast", + choices=[ + "general", + "worker", + "recorder", + "recorderNetwork", + "writer", + "state", + "redis", + "storage", + "text", + "exclusion", + "screenshots", + "screencast", + "originOverride", + "healthcheck", + "browser", + "blocking", + "behavior", + "behaviorScript", + "jsError", + "fetch", + "pageStatus", + "memoryStatus", + "crawlStatus", + "links", + "sitemap", + "wacz", + "replay", + "proxy", + ], + ) + + parser.add_argument( + "--text", + help="Extract initial (default) or final text to pages.jsonl or WARC resource" + " record(s)", + ) + + # cwd is not manipulable + parser.add_argument( "--mobileDevice", help="Emulate mobile device by name from " "https://github.com/puppeteer/puppeteer/blob/" "main/packages/puppeteer-core/src/common/Device.ts", - default="Pixel 2", - ) - - parser.add_argument( - "--noMobileDevice", - help="Do not emulate a mobile device (use at your own risk, behavior is" - "uncertain)", - action="store_true", - default=False, ) parser.add_argument( @@ -255,33 +407,108 @@ def run(raw_args): "(usually /sitemap.xml)", ) + parser.add_argument( + "--sitemapFromDate", + help="If set, filter URLs from sitemaps to those greater than or equal to (>=)" + " provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)", + ) + + parser.add_argument( + "--sitemapToDate", + help="If set, filter URLs from sitemaps to those less than or equal to (<=) " + "provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)", + ) + + parser.add_argument( + "--statsFilename", + help="If set, output stats as JSON to this file. (Relative filename resolves " + "to crawl working directory)", + ) + parser.add_argument( "--behaviors", - help="Which background behaviors to enable on each page", - default="autoplay,autofetch,siteSpecific", + help="Which background behaviors to enable on each page. Default is autoplay," + "autofetch,autoscroll,siteSpecific", ) parser.add_argument( "--behaviorTimeout", help="If >0, timeout (in seconds) for in-page behavior will run on each page. " - "If 0, a behavior can run until finish", + "If 0, a behavior can run until finish. Default is 90.", + type=int, + ) + + parser.add_argument( + "--postLoadDelay", + help="If >0, amount of time to sleep (in seconds) after page has loaded, before" + " taking screenshots / getting text / running behaviors. 
Default is 0.", type=int, - default=90, ) parser.add_argument( "--delay", help="If >0, amount of time to sleep (in seconds) after behaviors " - "before moving on to next page", + "before moving on to next page. Default is 0.", type=int, ) + parser.add_argument( + "--dedupPolicy", + help="Deduplication policy. Default is skip", + choices=["skip", "revisit", "keep"], + ) + parser.add_argument( "--profile", help="Path or HTTP(S) URL to tar.gz file which contains the browser profile " "directory", ) + parser.add_argument( + "--screenshot", + help="Screenshot options for crawler. One of view, thumbnail, fullPage, " + "fullPageFinal or a comma-separated combination of those.", + ) + + parser.add_argument( + "--screencastPort", + help="If set to a non-zero value, starts an HTTP server with screencast " + "accessible on this port.", + type=int, + ) + + parser.add_argument( + "--screencastRedis", + help="If set, will use the state store redis pubsub for screencasting", + action="store_true", + ) + + parser.add_argument( + "--warcInfo", + help="Optional fields added to the warcinfo record in combined WARCs", + ) + + parser.add_argument( + "--saveState", + help="If the crawl state should be serialized to the crawls/ directory. " + "Defaults to 'partial', only saved when crawl is interrupted", + choices=["never", "partial", "always"], + ) + + parser.add_argument( + "--saveStateInterval", + help="If save state is set to 'always', also save state during the crawl at " + "this interval (in seconds). Default to 300.", + type=int, + ) + + parser.add_argument( + "--saveStateHistory", + help="Number of save states to keep during the duration of a crawl. " + "Default to 5.", + type=int, + ) + size_group = parser.add_mutually_exclusive_group() size_group.add_argument( "--sizeSoftLimit", @@ -329,7 +556,134 @@ def run(raw_args): help="overwrite current crawl data: if set, existing collection directory " "will be deleted before crawl is started", action="store_true", - default=False, + ) + + parser.add_argument( + "--waitOnDone", + help="if set, wait for interrupt signal when finished instead of exiting", + action="store_true", + ) + + parser.add_argument( + "--restartsOnError", + help="if set, assume will be restarted if interrupted, don't run post-crawl " + "processes on interrupt", + action="store_true", + ) + + parser.add_argument( + "--netIdleWait", + help="If set, wait for network idle after page load and after behaviors are " + "done (in seconds). if -1 (default), determine based on scope.", + type=int, + ) + + parser.add_argument( + "--lang", + help="if set, sets the language used by the browser, should be ISO 639 " + "language[-country] code", + ) + + parser.add_argument( + "--originOverride", + help="if set, will redirect requests from each origin in key to origin in the " + "value, eg. --originOverride https://host:port=http://alt-host:alt-port", + ) + + parser.add_argument( + "--logErrorsToRedis", + help="If set, write error messages to redis", + action="store_true", + ) + + parser.add_argument( + "--writePagesToRedis", + help="If set, write page objects to redis", + action="store_true", + ) + + parser.add_argument( + "--maxPageRetries", + help="If set, number of times to retry a page that failed to load before page" + " is considered to have failed. Default is 2.", + type=int, + ) + + parser.add_argument( + "--failOnFailedSeed", + help="If set, crawler will fail with exit code 1 if any seed fails. 
When " + "combined with --failOnInvalidStatus, will result in crawl failing with exit " + "code 1 if any seed has a 4xx/5xx response", + action="store_true", + ) + + parser.add_argument( + "--failOnFailedLimit", + help="If set, save state and exit if number of failed pages exceeds this value", + action="store_true", + ) + + parser.add_argument( + "--failOnInvalidStatus", + help="If set, will treat pages with 4xx or 5xx response as failures. When " + "combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl " + "failing due to non-200 responses", + action="store_true", + ) + + # customBehaviors not included because it has special handling + # debugAccessRedis not included due to custom redis engine in zimit + + parser.add_argument( + "--debugAccessBrowser", + help="if set, allow debugging browser on port 9222 via CDP", + action="store_true", + ) + + parser.add_argument( + "--warcPrefix", + help="prefix for WARC files generated, including WARCs added to WACZ", + ) + + parser.add_argument( + "--serviceWorker", + help="service worker handling: disabled, enabled or disabled-if-profile. " + "Default: disabled.", + ) + + parser.add_argument( + "--proxyServer", + help="if set, will use specified proxy server. Takes precedence over any env " + "var proxy settings", + ) + + parser.add_argument( + "--dryRun", + help="If true, no archive data is written to disk, only pages and logs (and " + "optionally saved state).", + action="store_true", + ) + + parser.add_argument( + "--qaSource", + help="Required for QA mode. Source (WACZ or multi WACZ) for QA", + ) + + parser.add_argument( + "--qaDebugImageDiff", + help="if specified, will write crawl.png, replay.png and diff.png for each " + "page where they're different", + action="store_true", + ) + + parser.add_argument( + "--sshProxyPrivateKeyFile", + help="path to SSH private key for SOCKS5 over SSH proxy connection", + ) + + parser.add_argument( + "--sshProxyKnownHostsFile", + help="path to SSH known hosts file for SOCKS5 over SSH proxy connection", ) parser.add_argument( @@ -355,11 +709,6 @@ def run(raw_args): help="[warc2zim] Custom CSS file URL/path to inject into all articles", ) - parser.add_argument( - "--statsFilename", - help="If set, output stats as JSON to this file", - ) - parser.add_argument( "--config", help="Path to YAML config file. If set, browsertrix-crawler will use this file" @@ -374,8 +723,10 @@ def run(raw_args): ) parser.add_argument( - "--logging", - help="Crawler logging configuration", + "--zim-lang", + help="Language metadata of ZIM " + "(warc2zim --lang param). ISO-639-3 code. 
" + "Retrieved from homepage if found, fallback to `eng`", ) parser.add_argument( @@ -497,10 +848,6 @@ def run(raw_args): cmd_args.append("--userAgentSuffix") cmd_args.append(user_agent_suffix) - if not zimit_args.noMobileDevice: - cmd_args.append("--mobileDevice") - cmd_args.append(zimit_args.mobileDevice) - cmd_args.append("--cwd") cmd_args.append(str(temp_root_dir)) @@ -681,13 +1028,14 @@ def get_cleaned_url(url: str): def get_node_cmd_line(args): - node_cmd = ["crawl", "--failOnFailedSeed"] + node_cmd = ["crawl"] for arg in [ - "workers", - "waitUntil", - "urlFile", "title", "description", + "urlFile", + "workers", + "crawlId", + "waitUntil", "depth", "extraHops", "limit", @@ -698,13 +1046,44 @@ def get_node_cmd_line(args): "exclude", "collection", "allowHashUrls", - "lang", + "selectLinks", + "clickSelector", + "blockRules", + "blockMessage", + "blockAds", + "adBlockMessage", + "collection", + "headless", + "driver", + "generateCDX", + "combineWARC", + "rolloverSize", + "generateWACZ", + "logging", + "logLevel", + "logContext", + "logExcludeContext", + "text", + "mobileDevice", "userAgent", + # userAgentSuffix (manipulated), "useSitemap", + "sitemapFromDate", + "sitemapToDate", + # statsFilename (manipulated), "behaviors", "behaviorTimeout", + "postLoadDelay", "delay", + "dedupPolicy", "profile", + "screenshot", + "screencastPort", + "screencastRedis", + "warcInfo", + "saveState", + "saveStateInterval", + "saveStateHistory", "sizeSoftLimit", "sizeHardLimit", "diskUtilization", @@ -712,9 +1091,28 @@ def get_node_cmd_line(args): "timeHardLimit", "healthCheckPort", "overwrite", - "config", - "logging", + "waitOnDone", + "restartsOnError", + "netIdleWait", + "lang", + "originOverride", + "logErrorsToRedis", + "writePagesToRedis", + "maxPageRetries", + "failOnFailedSeed", + "failOnFailedLimit", + "failOnInvalidStatus", + "debugAccessBrowser", + "warcPrefix", + "serviceWorker", + "proxyServer", + "dryRun", + "qaSource", + "qaDebugImageDiff", + "sshProxyPrivateKeyFile", + "sshProxyKnownHostsFile", "customBehaviors", + "config", ]: value = getattr(args, arg) if arg == "userAgent": From ed1a8a0aa9a1ef28b0d81ab20ed3d4c61d59a01b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 13 Feb 2025 15:30:30 +0000 Subject: [PATCH 25/65] Use preferred Browsertrix Crawler arguments and fix multiple/file seeds support --- .github/workflows/DailyTests.yaml | 2 +- .github/workflows/Tests.yaml | 10 ++-- CHANGELOG.md | 8 +++ README.md | 13 +++-- src/zimit/__about__.py | 2 +- src/zimit/zimit.py | 86 +++++++++++++++++++------------ 6 files changed, 75 insertions(+), 46 deletions(-) diff --git a/.github/workflows/DailyTests.yaml b/.github/workflows/DailyTests.yaml index 0585721..2bc9bc5 100644 --- a/.github/workflows/DailyTests.yaml +++ b/.github/workflows/DailyTests.yaml @@ -18,7 +18,7 @@ jobs: run: docker build -t local-zimit . 
- name: run crawl of test website - run: docker run -v $PWD/output:/output local-zimit zimit --url https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim + run: docker run -v $PWD/output:/output local-zimit zimit --seeds https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim - name: archive ZIM uses: actions/upload-artifact@v4 diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml index 601f9ba..afdc18b 100644 --- a/.github/workflows/Tests.yaml +++ b/.github/workflows/Tests.yaml @@ -63,19 +63,19 @@ jobs: run: docker run -v $PWD/output:/output local-zimit zimit --help - name: run crawl with soft size limit - run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizesoftlimit.json + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizesoftlimit.json - name: run crawl with hard size limit - run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizehardlimit.json || true + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizehardlimit.json || true - name: run crawl with soft time limit - run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timesoftlimit.json + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timesoftlimit.json - name: run crawl with hard time limit - run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timehardlimit.json || true + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timehardlimit.json || true - name: run standard crawl - run: docker run -v $PWD/output:/output local-zimit zimit --url http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep + run: docker run -v $PWD/output:/output local-zimit 
zimit --seeds http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep - name: run integration test suite run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d20f59..ae0d548 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468) - Add many missing Browsertrix Crawler arguments ; drop default overrides by zimit ; drop `--noMobileDevice` setting (not needed anymore) (#433) - Document all Browsertrix Crawler default arguments values (#416) +- Use preferred Browsertrix Crawler arguments names: + - `--seeds` instead of `--url` + - `--seedFile` instead of `--urlFile` + - `--pageLimit` instead of `--limit` + - `--pageLoadTimeout` instead of `--timeout` + - `--scopeIncludeRx` instead of `--include` + - `--scopeExcludeRx` instead of `--exclude` + - `--pageExtraDelay` instead of `--delay` ### Fixed diff --git a/README.md b/README.md index 1598ead..bc18dc6 100644 --- a/README.md +++ b/README.md @@ -38,16 +38,15 @@ Usage `zimit` is intended to be run in Docker. Docker image is published at https://github.com/orgs/openzim/packages/container/package/zimit. -The image accepts the following parameters, **as well as any of the [warc2zim](https://github.com/openzim/warc2zim) ones**; useful for setting metadata, for instance: +The image accepts the following parameters, **as well as any of the [Browsertrix crawler](https://crawler.docs.browsertrix.com/user-guide/cli-options/) and [warc2zim](https://github.com/openzim/warc2zim) ones**: -- Required: `--url URL` - the url to be crawled +- Required: `--seeds URL` - the url to start crawling from ; multiple URLs can be separated by a comma (even if **usually not needed**, these are just the **seeds** of the crawl) ; first seed URL is used as ZIM homepage - Required: `--name` - Name of ZIM file - `--output` - output directory (defaults to `/output`) -- `--limit U` - Limit capture to at most U URLs -- `--behaviors` - Control which browsertrix behaviors are ran (defaults to `autoplay,autofetch,siteSpecific`, adding `autoscroll` to the list is possible to automatically scroll the pages and fetch resources which are lazy loaded) -- `--exclude ` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded. +- `--pageLimit U` - Limit capture to at most U URLs +- `--scopeExcludeRx ` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--scopeExcludeRx="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded. - `--workers N` - number of crawl workers to be run in parallel -- `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). 
The default is `load`, but for static sites, `--waitUntil domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
- `--keep` - in case of failure, WARC files and other temporary files (which are stored as a subfolder of output directory) are always kept, otherwise they are automatically deleted. Use this flag to always keep WARC files, even in case of success.

Example command:

```bash
docker run ghcr.io/openzim/zimit zimit --help
docker run ghcr.io/openzim/zimit warc2zim --help
-docker run -v /output:/output ghcr.io/openzim/zimit zimit --url URL --name myzimfile
+docker run -v /output:/output ghcr.io/openzim/zimit zimit --seeds URL --name myzimfile
```

**Note**: Image automatically filters out a large number of ads by using the 3 blocklists from [anudeepND](https://github.com/anudeepND/blacklist). If you don't want this filtering, disable the image's entrypoint in your container (`docker run --entrypoint="" ghcr.io/openzim/zimit ...`).
diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index 4b55b6b..d733cff 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "2.1.9-dev0"
+__version__ = "3.0.0-dev0"
diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index 8634b71..2337c00 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -128,15 +128,21 @@ def run(raw_args):
         description="Run a browser-based crawl on the specified URL and convert to ZIM"
     )
 
-    parser.add_argument("-u", "--url", help="The URL to start crawling from")
+    parser.add_argument(
+        "--seeds",
+        help="The seed URL(s) to start crawling from. Multiple seed URLs must be "
+        "separated by a comma (usually not needed, these are just the crawl seeds). "
+        "First seed URL is used as ZIM homepage",
+    )
 
-    parser.add_argument("--title", help="ZIM title")
-    parser.add_argument("--description", help="ZIM description")
+    parser.add_argument("--title", help="WARC and ZIM title")
+    parser.add_argument("--description", help="WARC and ZIM description")
     parser.add_argument("--long-description", help="ZIM long description metadata")
 
     parser.add_argument(
-        "--urlFile",
-        help="If set, read a list of seed urls, one per line, from the specified",
+        "--seedFile",
+        help="If set, read a list of seed urls, one per line. Can be a local file or "
+        "the HTTP(s) URL to an online file.",
     )
 
     parser.add_argument(
@@ -170,7 +176,7 @@ def run(raw_args):
     )
 
     parser.add_argument(
-        "--limit",
+        "--pageLimit",
         help="Limit crawl to this number of pages. Default is 0 (no limit).",
         type=int,
     )
@@ -183,7 +189,7 @@ def run(raw_args):
     )
 
     parser.add_argument(
-        "--timeout",
+        "--pageLoadTimeout",
        help="Timeout for each page to load (in seconds). 
Default is 90 secs.", type=int, ) @@ -197,13 +203,13 @@ def run(raw_args): ) parser.add_argument( - "--include", + "--scopeIncludeRx", help="Regex of page URLs that should be included in the crawl (defaults to " "the immediate directory of URL)", ) parser.add_argument( - "--exclude", + "--scopeExcludeRx", help="Regex of page URLs that should be excluded from the crawl", ) @@ -446,7 +452,7 @@ def run(raw_args): ) parser.add_argument( - "--delay", + "--pageExtraDelay", help="If >0, amount of time to sleep (in seconds) after behaviors " "before moving on to next page. Default is 0.", type=int, @@ -762,16 +768,40 @@ def run(raw_args): warc2zim_args.append("--output") warc2zim_args.append(zimit_args.output) - url = zimit_args.url - user_agent_suffix = zimit_args.userAgentSuffix if zimit_args.adminEmail: user_agent_suffix += f" {zimit_args.adminEmail}" - if url: - url = get_cleaned_url(url) - warc2zim_args.append("--url") - warc2zim_args.append(url) + # make temp dir for this crawl + global temp_root_dir # noqa: PLW0603 + if zimit_args.build: + temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp")) + else: + temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) + + seeds = [] + if zimit_args.seeds: + seeds += [get_cleaned_url(url) for url in zimit_args.seeds.split(",")] + if zimit_args.seedFile: + if re.match(r"^https?\://", zimit_args.seedFile): + with tempfile.NamedTemporaryFile( + dir=temp_root_dir, + prefix="seeds_", + suffix=".txt", + delete_on_close=True, + ) as filename: + seed_file = Path(filename.name) + download_file(zimit_args.seedFile, seed_file) + seeds += [ + get_cleaned_url(url) for url in seed_file.read_text().splitlines() + ] + else: + seeds += [ + get_cleaned_url(url) + for url in Path(zimit_args.seedFile).read_text().splitlines() + ] + warc2zim_args.append("--url") + warc2zim_args.append(seeds[0]) if zimit_args.custom_css: warc2zim_args += ["--custom-css", zimit_args.custom_css] @@ -800,13 +830,6 @@ def run(raw_args): logger.info("Exiting, invalid warc2zim params") return EXIT_CODE_WARC2ZIM_CHECK_FAILED - # make temp dir for this crawl - global temp_root_dir # noqa: PLW0603 - if zimit_args.build: - temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp")) - else: - temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) - if not zimit_args.keep: atexit.register(cleanup) @@ -841,9 +864,9 @@ def run(raw_args): zimit_args.customBehaviors = None cmd_args = get_node_cmd_line(zimit_args) - if url: - cmd_args.append("--url") - cmd_args.append(url) + for seed in seeds: + cmd_args.append("--seeds") + cmd_args.append(seed) cmd_args.append("--userAgentSuffix") cmd_args.append(user_agent_suffix) @@ -1032,18 +1055,17 @@ def get_node_cmd_line(args): for arg in [ "title", "description", - "urlFile", "workers", "crawlId", "waitUntil", "depth", "extraHops", - "limit", + "pageLimit", "maxPageLimit", - "timeout", + "pageLoadTimeout", "scopeType", - "include", - "exclude", + "scopeIncludeRx", + "scopeExcludeRx", "collection", "allowHashUrls", "selectLinks", @@ -1074,7 +1096,7 @@ def get_node_cmd_line(args): "behaviors", "behaviorTimeout", "postLoadDelay", - "delay", + "pageExtraDelay", "dedupPolicy", "profile", "screenshot", From 7bfb4b25f0e390e191002345268013f243b0d53b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 13 Feb 2025 17:08:22 +0000 Subject: [PATCH 26/65] Remove confusion between zimit, warc2zim and crawler stats filenames --- .github/workflows/Tests.yaml | 10 +-- CHANGELOG.md | 7 +- 
src/zimit/zimit.py | 121 +++++++++++++++++++++++++++-------- 3 files changed, 105 insertions(+), 33 deletions(-) diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml index afdc18b..8c74b21 100644 --- a/.github/workflows/Tests.yaml +++ b/.github/workflows/Tests.yaml @@ -63,19 +63,19 @@ jobs: run: docker run -v $PWD/output:/output local-zimit zimit --help - name: run crawl with soft size limit - run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizesoftlimit.json + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizesoftlimit.json - name: run crawl with hard size limit - run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_sizehardlimit.json || true + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizehardlimit.json || true - name: run crawl with soft time limit - run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timesoftlimit.json + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timesoftlimit.json - name: run crawl with hard time limit - run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats_timehardlimit.json || true + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timehardlimit.json || true - name: run standard crawl - run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep + run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats.json --statsFilename /output/crawl.json 
--warc2zim-progress-file /output/warc2zim.json --keep - name: run integration test suite run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py" diff --git a/CHANGELOG.md b/CHANGELOG.md index ae0d548..de71c25 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,7 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468) - Add many missing Browsertrix Crawler arguments ; drop default overrides by zimit ; drop `--noMobileDevice` setting (not needed anymore) (#433) - Document all Browsertrix Crawler default arguments values (#416) -- Use preferred Browsertrix Crawler arguments names: +- Use preferred Browsertrix Crawler arguments names: (part of #471) - `--seeds` instead of `--url` - `--seedFile` instead of `--urlFile` - `--pageLimit` instead of `--limit` @@ -21,6 +21,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `--scopeIncludeRx` instead of `--include` - `--scopeExcludeRx` instead of `--exclude` - `--pageExtraDelay` instead of `--delay` +- Remove confusion between zimit, warc2zim and crawler stats filenames (part of #471) + - `--statsFilename` is now the crawler stats file (since it is the same name, just like other arguments) + - `--zimit-progress-file` is now the zimit stats location + - `--warc2zim-progress-file` is the warc2zim stats location + - all are optional values, if not set and needed temporary files are used ### Fixed diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index 2337c00..abd2978 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -37,17 +37,16 @@ temp_root_dir: Path | None = None class ProgressFileWatcher: - def __init__(self, output_dir: Path, stats_path: Path): - self.crawl_path = output_dir / "crawl.json" - self.warc2zim_path = output_dir / "warc2zim.json" - self.stats_path = stats_path - - if not self.stats_path.is_absolute(): - self.stats_path = output_dir / self.stats_path + def __init__( + self, crawl_stats_path: Path, warc2zim_stats_path, zimit_stats_path: Path + ): + self.crawl_stats_path = crawl_stats_path + self.warc2zim_stats_path = warc2zim_stats_path + self.zimit_stats_path = zimit_stats_path # touch them all so inotify is not unhappy on add_watch - self.crawl_path.touch() - self.warc2zim_path.touch() + self.crawl_stats_path.touch() + self.warc2zim_stats_path.touch() self.process = None def stop(self): @@ -59,12 +58,16 @@ class ProgressFileWatcher: def watch(self): self.process = Process( target=self.inotify_watcher, - args=(str(self.crawl_path), str(self.warc2zim_path), str(self.stats_path)), + args=( + str(self.crawl_stats_path), + str(self.warc2zim_stats_path), + str(self.zimit_stats_path), + ), ) self.process.daemon = True self.process.start() - def inotify_watcher(self, crawl_fpath: str, warc2zim_fpath: str, output_fpath: str): + def inotify_watcher(self, crawl_fpath: str, warc2zim_fpath: str, zimit_fpath: str): ino = inotify.adapters.Inotify() ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) # pyright: ignore ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) # pyright: ignore @@ -101,7 +104,7 @@ class ProgressFileWatcher: continue if not out: continue - with open(output_fpath, "w") as ofh: + with open(zimit_fpath, "w") as ofh: json.dump(out, ofh) @@ -427,8 +430,22 @@ def run(raw_args): parser.add_argument( 
"--statsFilename", - help="If set, output stats as JSON to this file. (Relative filename resolves " - "to crawl working directory)", + help="If set, output crawl stats as JSON to this file. Relative filename " + "resolves to output directory, see --output.", + ) + + parser.add_argument( + "--zimit-progress-file", + help="If set, output zimit stats as JSON to this file. Forces the creation of" + "crawler and warc2zim stats as well. If --statsFilename and/or " + "--warc2zim-progress-file are not set, default temporary files will be used. " + "Relative filename resolves to output directory, see --output.", + ) + + parser.add_argument( + "--warc2zim-progress-file", + help="If set, output warc2zim stats as JSON to this file. Relative filename " + "resolves to output directory, see --output.", ) parser.add_argument( @@ -701,7 +718,11 @@ def run(raw_args): action="store_true", ) - parser.add_argument("--output", help="Output directory for ZIM", default="/output") + parser.add_argument( + "--output", + help="Output directory for ZIM. Default to /output.", + default="/output", + ) parser.add_argument( "--build", @@ -874,20 +895,67 @@ def run(raw_args): cmd_args.append("--cwd") cmd_args.append(str(temp_root_dir)) - # setup inotify crawler progress watcher - if zimit_args.statsFilename: + output_dir = Path(zimit_args.output) + warc2zim_stats_file = ( + Path(zimit_args.warc2zim_progress_file) + if zimit_args.warc2zim_progress_file + else temp_root_dir / "warc2zim.json" + ) + if not warc2zim_stats_file.is_absolute(): + warc2zim_stats_file = output_dir / warc2zim_stats_file + warc2zim_stats_file.parent.mkdir(parents=True, exist_ok=True) + warc2zim_stats_file.unlink(missing_ok=True) + + crawler_stats_file = ( + Path(zimit_args.statsFilename) + if zimit_args.statsFilename + else temp_root_dir / "crawl.json" + ) + if not crawler_stats_file.is_absolute(): + crawler_stats_file = output_dir / crawler_stats_file + crawler_stats_file.parent.mkdir(parents=True, exist_ok=True) + crawler_stats_file.unlink(missing_ok=True) + + zimit_stats_file = ( + Path(zimit_args.zimit_progress_file) + if zimit_args.zimit_progress_file + else temp_root_dir / "stats.json" + ) + if not zimit_stats_file.is_absolute(): + zimit_stats_file = output_dir / zimit_stats_file + zimit_stats_file.parent.mkdir(parents=True, exist_ok=True) + zimit_stats_file.unlink(missing_ok=True) + + if zimit_args.zimit_progress_file: + # setup inotify crawler progress watcher watcher = ProgressFileWatcher( - Path(zimit_args.output), Path(zimit_args.statsFilename) + zimit_stats_path=zimit_stats_file, + crawl_stats_path=crawler_stats_file, + warc2zim_stats_path=warc2zim_stats_file, + ) + logger.info( + f"Writing zimit progress to {watcher.zimit_stats_path}, crawler progress to" + f" {watcher.crawl_stats_path} and warc2zim progress to " + f"{watcher.warc2zim_stats_path}" ) - logger.info(f"Writing progress to {watcher.stats_path}") # update crawler command cmd_args.append("--statsFilename") - cmd_args.append(str(watcher.crawl_path)) + cmd_args.append(str(crawler_stats_file)) # update warc2zim command warc2zim_args.append("-v") warc2zim_args.append("--progress-file") - warc2zim_args.append(str(watcher.warc2zim_path)) + warc2zim_args.append(str(warc2zim_stats_file)) watcher.watch() + else: + if zimit_args.statsFilename: + logger.info(f"Writing crawler progress to {crawler_stats_file}") + cmd_args.append("--statsFilename") + cmd_args.append(str(crawler_stats_file)) + if zimit_args.warc2zim_progress_file: + logger.info(f"Writing warc2zim progress to 
{warc2zim_stats_file}") + warc2zim_args.append("-v") + warc2zim_args.append("--progress-file") + warc2zim_args.append(str(warc2zim_stats_file)) cmd_line = " ".join(cmd_args) @@ -971,7 +1039,7 @@ def run(raw_args): logger.info( "Crawl size soft limit hit. Continuing with warc2zim conversion." ) - if zimit_args.statsFilename: + if zimit_args.zimit_progress_file: partial_zim = True elif ( crawl.returncode == EXIT_CODE_CRAWLER_TIME_LIMIT_HIT @@ -980,7 +1048,7 @@ def run(raw_args): logger.info( "Crawl time soft limit hit. Continuing with warc2zim conversion." ) - if zimit_args.statsFilename: + if zimit_args.zimit_progress_file: partial_zim = True elif crawl.returncode != 0: logger.error( @@ -1024,11 +1092,10 @@ def run(raw_args): warc2zim_exit_code = warc2zim(warc2zim_args) - if zimit_args.statsFilename: - stats = Path(zimit_args.statsFilename) - stats_content = json.loads(stats.read_bytes()) + if zimit_args.zimit_progress_file: + stats_content = json.loads(zimit_stats_file.read_bytes()) stats_content["partialZim"] = partial_zim - stats.write_text(json.dumps(stats_content)) + zimit_stats_file.write_text(json.dumps(stats_content)) # also call cancel_cleanup when --keep, even if it is not supposed to be registered, # so that we will display temporary files location just like in other situations From 96c4c3bdfd5af5a62cf458effe9b84dfb7caa7bd Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 13 Feb 2025 17:11:32 +0000 Subject: [PATCH 27/65] Clarify args variables/functions names --- src/zimit/zimit.py | 133 +++++++++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 65 deletions(-) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index abd2978..d298909 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -777,7 +777,9 @@ def run(raw_args): " used). 
Single value with individual error codes separated by comma", ) - zimit_args, warc2zim_args = parser.parse_known_args(raw_args) + # by design, all unknown args are for warc2zim ; known one are either for crawler + # or shared + known_args, warc2zim_args = parser.parse_known_args(raw_args) # pass a scraper suffix to warc2zim so that both zimit and warc2zim versions are # associated with the ZIM ; make it a CSV for easier parsing @@ -785,26 +787,26 @@ def run(raw_args): warc2zim_args.append(f"zimit {__version__}") # pass url and output to warc2zim also - if zimit_args.output: + if known_args.output: warc2zim_args.append("--output") - warc2zim_args.append(zimit_args.output) + warc2zim_args.append(known_args.output) - user_agent_suffix = zimit_args.userAgentSuffix - if zimit_args.adminEmail: - user_agent_suffix += f" {zimit_args.adminEmail}" + user_agent_suffix = known_args.userAgentSuffix + if known_args.adminEmail: + user_agent_suffix += f" {known_args.adminEmail}" # make temp dir for this crawl global temp_root_dir # noqa: PLW0603 - if zimit_args.build: - temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp")) + if known_args.build: + temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.build, prefix=".tmp")) else: - temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) + temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp")) seeds = [] - if zimit_args.seeds: - seeds += [get_cleaned_url(url) for url in zimit_args.seeds.split(",")] - if zimit_args.seedFile: - if re.match(r"^https?\://", zimit_args.seedFile): + if known_args.seeds: + seeds += [get_cleaned_url(url) for url in known_args.seeds.split(",")] + if known_args.seedFile: + if re.match(r"^https?\://", known_args.seedFile): with tempfile.NamedTemporaryFile( dir=temp_root_dir, prefix="seeds_", @@ -812,36 +814,36 @@ def run(raw_args): delete_on_close=True, ) as filename: seed_file = Path(filename.name) - download_file(zimit_args.seedFile, seed_file) + download_file(known_args.seedFile, seed_file) seeds += [ get_cleaned_url(url) for url in seed_file.read_text().splitlines() ] else: seeds += [ get_cleaned_url(url) - for url in Path(zimit_args.seedFile).read_text().splitlines() + for url in Path(known_args.seedFile).read_text().splitlines() ] warc2zim_args.append("--url") warc2zim_args.append(seeds[0]) - if zimit_args.custom_css: - warc2zim_args += ["--custom-css", zimit_args.custom_css] + if known_args.custom_css: + warc2zim_args += ["--custom-css", known_args.custom_css] - if zimit_args.title: + if known_args.title: warc2zim_args.append("--title") - warc2zim_args.append(zimit_args.title) + warc2zim_args.append(known_args.title) - if zimit_args.description: + if known_args.description: warc2zim_args.append("--description") - warc2zim_args.append(zimit_args.description) + warc2zim_args.append(known_args.description) - if zimit_args.long_description: + if known_args.long_description: warc2zim_args.append("--long-description") - warc2zim_args.append(zimit_args.long_description) + warc2zim_args.append(known_args.long_description) - if zimit_args.zim_lang: + if known_args.zim_lang: warc2zim_args.append("--lang") - warc2zim_args.append(zimit_args.zim_lang) + warc2zim_args.append(known_args.zim_lang) logger.info("----------") logger.info("Testing warc2zim args") @@ -851,16 +853,16 @@ def run(raw_args): logger.info("Exiting, invalid warc2zim params") return EXIT_CODE_WARC2ZIM_CHECK_FAILED - if not zimit_args.keep: + if not known_args.keep: atexit.register(cleanup) # copy / download 
custom behaviors to one single folder and configure crawler - if zimit_args.custom_behaviors: + if known_args.custom_behaviors: behaviors_dir = temp_root_dir / "custom-behaviors" behaviors_dir.mkdir() for custom_behavior in [ custom_behavior.strip() - for custom_behavior in zimit_args.custom_behaviors.split(",") + for custom_behavior in known_args.custom_behaviors.split(",") ]: behaviors_file = tempfile.NamedTemporaryFile( dir=behaviors_dir, @@ -880,25 +882,25 @@ def run(raw_args): f"to {behaviors_file.name}" ) shutil.copy(custom_behavior, behaviors_file.name) - zimit_args.customBehaviors = str(behaviors_dir) + known_args.customBehaviors = str(behaviors_dir) else: - zimit_args.customBehaviors = None + known_args.customBehaviors = None - cmd_args = get_node_cmd_line(zimit_args) + crawler_args = get_crawler_cmd_line(known_args) for seed in seeds: - cmd_args.append("--seeds") - cmd_args.append(seed) + crawler_args.append("--seeds") + crawler_args.append(seed) - cmd_args.append("--userAgentSuffix") - cmd_args.append(user_agent_suffix) + crawler_args.append("--userAgentSuffix") + crawler_args.append(user_agent_suffix) - cmd_args.append("--cwd") - cmd_args.append(str(temp_root_dir)) + crawler_args.append("--cwd") + crawler_args.append(str(temp_root_dir)) - output_dir = Path(zimit_args.output) + output_dir = Path(known_args.output) warc2zim_stats_file = ( - Path(zimit_args.warc2zim_progress_file) - if zimit_args.warc2zim_progress_file + Path(known_args.warc2zim_progress_file) + if known_args.warc2zim_progress_file else temp_root_dir / "warc2zim.json" ) if not warc2zim_stats_file.is_absolute(): @@ -907,8 +909,8 @@ def run(raw_args): warc2zim_stats_file.unlink(missing_ok=True) crawler_stats_file = ( - Path(zimit_args.statsFilename) - if zimit_args.statsFilename + Path(known_args.statsFilename) + if known_args.statsFilename else temp_root_dir / "crawl.json" ) if not crawler_stats_file.is_absolute(): @@ -917,8 +919,8 @@ def run(raw_args): crawler_stats_file.unlink(missing_ok=True) zimit_stats_file = ( - Path(zimit_args.zimit_progress_file) - if zimit_args.zimit_progress_file + Path(known_args.zimit_progress_file) + if known_args.zimit_progress_file else temp_root_dir / "stats.json" ) if not zimit_stats_file.is_absolute(): @@ -926,7 +928,7 @@ def run(raw_args): zimit_stats_file.parent.mkdir(parents=True, exist_ok=True) zimit_stats_file.unlink(missing_ok=True) - if zimit_args.zimit_progress_file: + if known_args.zimit_progress_file: # setup inotify crawler progress watcher watcher = ProgressFileWatcher( zimit_stats_path=zimit_stats_file, @@ -939,31 +941,31 @@ def run(raw_args): f"{watcher.warc2zim_stats_path}" ) # update crawler command - cmd_args.append("--statsFilename") - cmd_args.append(str(crawler_stats_file)) + crawler_args.append("--statsFilename") + crawler_args.append(str(crawler_stats_file)) # update warc2zim command warc2zim_args.append("-v") warc2zim_args.append("--progress-file") warc2zim_args.append(str(warc2zim_stats_file)) watcher.watch() else: - if zimit_args.statsFilename: + if known_args.statsFilename: logger.info(f"Writing crawler progress to {crawler_stats_file}") - cmd_args.append("--statsFilename") - cmd_args.append(str(crawler_stats_file)) - if zimit_args.warc2zim_progress_file: + crawler_args.append("--statsFilename") + crawler_args.append(str(crawler_stats_file)) + if known_args.warc2zim_progress_file: logger.info(f"Writing warc2zim progress to {warc2zim_stats_file}") warc2zim_args.append("-v") warc2zim_args.append("--progress-file") 
warc2zim_args.append(str(warc2zim_stats_file)) - cmd_line = " ".join(cmd_args) + cmd_line = " ".join(crawler_args) logger.info("") logger.info("----------") logger.info( f"Output to tempdir: {temp_root_dir} - " - f"{'will keep' if zimit_args.keep else 'will delete'}" + f"{'will keep' if known_args.keep else 'will delete'}" ) partial_zim = False @@ -971,9 +973,9 @@ def run(raw_args): # if warc files are passed, do not run browsertrix crawler but fetch the files if # they are provided as an HTTP URL + extract the archive if it is a tar.gz warc_files: list[Path] = [] - if zimit_args.warcs: + if known_args.warcs: for warc_location in [ - warc_location.strip() for warc_location in zimit_args.warcs.split(",") + warc_location.strip() for warc_location in known_args.warcs.split(",") ]: suffix = "".join(Path(urllib.parse.urlparse(warc_location).path).suffixes) if suffix not in {".tar", ".tar.gz", ".warc", ".warc.gz"}: @@ -1031,24 +1033,24 @@ def run(raw_args): else: logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") - crawl = subprocess.run(cmd_args, check=False) + crawl = subprocess.run(crawler_args, check=False) if ( crawl.returncode == EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT - and zimit_args.sizeSoftLimit + and known_args.sizeSoftLimit ): logger.info( "Crawl size soft limit hit. Continuing with warc2zim conversion." ) - if zimit_args.zimit_progress_file: + if known_args.zimit_progress_file: partial_zim = True elif ( crawl.returncode == EXIT_CODE_CRAWLER_TIME_LIMIT_HIT - and zimit_args.timeSoftLimit + and known_args.timeSoftLimit ): logger.info( "Crawl time soft limit hit. Continuing with warc2zim conversion." ) - if zimit_args.zimit_progress_file: + if known_args.zimit_progress_file: partial_zim = True elif crawl.returncode != 0: logger.error( @@ -1057,9 +1059,9 @@ def run(raw_args): cancel_cleanup() return crawl.returncode - if zimit_args.collection: + if known_args.collection: warc_files = [ - temp_root_dir.joinpath(f"collections/{zimit_args.collection}/archive/") + temp_root_dir.joinpath(f"collections/{known_args.collection}/archive/") ] else: @@ -1092,14 +1094,14 @@ def run(raw_args): warc2zim_exit_code = warc2zim(warc2zim_args) - if zimit_args.zimit_progress_file: + if known_args.zimit_progress_file: stats_content = json.loads(zimit_stats_file.read_bytes()) stats_content["partialZim"] = partial_zim zimit_stats_file.write_text(json.dumps(stats_content)) # also call cancel_cleanup when --keep, even if it is not supposed to be registered, # so that we will display temporary files location just like in other situations - if warc2zim_exit_code or zimit_args.keep: + if warc2zim_exit_code or known_args.keep: cancel_cleanup() return warc2zim_exit_code @@ -1117,7 +1119,8 @@ def get_cleaned_url(url: str): return parsed_url.geturl() -def get_node_cmd_line(args): +def get_crawler_cmd_line(args): + """Build the command line for Browsertrix crawler""" node_cmd = ["crawl"] for arg in [ "title", From 2f7a83e1872c3c81859d8e5157ffe5ff200cf0c9 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 14 Feb 2025 14:22:30 +0000 Subject: [PATCH 28/65] Fixes following review --- src/zimit/zimit.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index d298909..02b167d 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -155,7 +155,7 @@ def run(raw_args): parser.add_argument( "--crawlId", help="A user provided ID for this crawl or crawl configuration (can also be " - "set via CRAWL_ID env var, defaults to hostname)", + "set via CRAWL_ID env 
var, defaults to machine hostname)", ) parser.add_argument( @@ -167,7 +167,7 @@ def run(raw_args): parser.add_argument( "--depth", - help="The depth of the crawl for all seeds. Default is -1.", + help="The depth of the crawl for all seeds. Default is -1 (infinite).", type=int, ) @@ -388,7 +388,8 @@ def run(raw_args): " record(s)", ) - # cwd is not manipulable + # cwd is manipulated directly by zimit, based on --output / --build, we do not want + # to expose this setting parser.add_argument( "--mobileDevice", @@ -689,7 +690,7 @@ def run(raw_args): parser.add_argument( "--qaSource", - help="Required for QA mode. Source (WACZ or multi WACZ) for QA", + help="Required for QA mode. Path to the source WACZ or multi WACZ file for QA", ) parser.add_argument( From 3eb6c090465f38cbbcf078df81ce44717b54efa5 Mon Sep 17 00:00:00 2001 From: clach04 Date: Fri, 14 Feb 2025 22:02:17 -0800 Subject: [PATCH 29/65] Correct link in README.md Signed-off-by: clach04 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bc18dc6..894f523 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Zimit is a scraper allowing to create ZIM file from any Web site. Zimit adheres to openZIM's [Contribution Guidelines](https://github.com/openzim/overview/wiki/Contributing). -Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/docs/Policy.md) **v1.0.1**. +Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/blob/main/docs/Policy.md) **v1.0.1**. Capabilities and known limitations -------------------- From ee0f4c6cec3007760365ab619916624499596a03 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 17 Feb 2025 09:52:55 +0000 Subject: [PATCH 30/65] Use released warc2zim 2.2.2 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e4e7696..9aa830a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim @ git+https://github.com/openzim/warc2zim@main", + "warc2zim==2.2.2", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] From e3cd12b0d1ba19853f527f1edff5acc045c8fc40 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 17 Feb 2025 10:02:43 +0000 Subject: [PATCH 31/65] Release 3.0.0 --- CHANGELOG.md | 2 +- src/zimit/__about__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de71c25..0105b54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). 
-## [Unreleased] +## [3.0.0] - 2024-02-17 ### Changed diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index d733cff..528787c 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "3.0.0-dev0" +__version__ = "3.0.0" From bce22ceac11804144e223ed3fb63b5a4415838c6 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 17 Feb 2025 10:08:49 +0000 Subject: [PATCH 32/65] Prepare for 3.0.1 --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- src/zimit/__about__.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0105b54..dd9d4b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). +## [Unreleased] + ## [3.0.0] - 2024-02-17 ### Changed diff --git a/pyproject.toml b/pyproject.toml index 9aa830a..e4e7696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim==2.2.2", + "warc2zim @ git+https://github.com/openzim/warc2zim@main", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 528787c..038a5da 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "3.0.0" +__version__ = "3.0.1-dev0" From 1b5b9bb80b6a11aa26ae11f8d3fe5863e49a26b9 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Feb 2025 09:21:19 +0000 Subject: [PATCH 33/65] Upgrade to browsertrix crawler 1.5.4 --- CHANGELOG.md | 4 ++++ Dockerfile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd9d4b9..0335b39 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Upgrade to browsertrix crawler 1.5.4 (#476) + ## [3.0.0] - 2024-02-17 ### Changed diff --git a/Dockerfile b/Dockerfile index c37dfcc..672fbd8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.5.3 +FROM webrecorder/browsertrix-crawler:1.5.4 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit # add deadsnakes ppa for latest Python on Ubuntu From 5e53be6fa4cfa827acd05bcbfee376dd74d173c7 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Feb 2025 09:22:40 +0000 Subject: [PATCH 34/65] Pin warc2zim version in preparation for 3.0.1 release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e4e7696..9aa830a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim @ git+https://github.com/openzim/warc2zim@main", + "warc2zim==2.2.2", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] From dd659025562e4ce976c593e18254861f43f4bbfe Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Feb 2025 09:37:40 +0000 Subject: [PATCH 35/65] Release 3.0.1 --- CHANGELOG.md | 2 +- src/zimit/__about__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0335b39..c0c7d3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this 
project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). -## [Unreleased] +## [3.0.1] - 2024-02-24 ### Changed diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 038a5da..0552768 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "3.0.1-dev0" +__version__ = "3.0.1" From 363ff4076711e9b507698736599fed4b2bd79761 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Mon, 24 Feb 2025 09:40:04 +0000 Subject: [PATCH 36/65] Prepare for 3.0.2 --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- src/zimit/__about__.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c0c7d3d..399ef49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to this project are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). +## [Unreleased] + ## [3.0.1] - 2024-02-24 ### Changed diff --git a/pyproject.toml b/pyproject.toml index 9aa830a..e4e7696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim==2.2.2", + "warc2zim @ git+https://github.com/openzim/warc2zim@main", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py index 0552768..f50125d 100644 --- a/src/zimit/__about__.py +++ b/src/zimit/__about__.py @@ -1 +1 @@ -__version__ = "3.0.1" +__version__ = "3.0.2-dev0" From 00f0e475ae903076d20788190262be2180c64c70 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 27 Feb 2025 07:31:28 +0000 Subject: [PATCH 37/65] Upgrade to browsertrix crawler 1.5.5 --- CHANGELOG.md | 4 ++++ Dockerfile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 399ef49..347200f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Upgrade to browsertrix crawler 1.5.5 (#480) + ## [3.0.1] - 2024-02-24 ### Changed diff --git a/Dockerfile b/Dockerfile index 672fbd8..dc09fe8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.5.4 +FROM webrecorder/browsertrix-crawler:1.5.5 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit # add deadsnakes ppa for latest Python on Ubuntu From eebc75f868af9f8d156442548b19630e42c0977f Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 27 Feb 2025 07:32:25 +0000 Subject: [PATCH 38/65] Pin warc2zim version in preparation for 3.0.2 release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e4e7696..9aa830a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "requests==2.32.3", "inotify==0.2.10", "tld==0.13", - "warc2zim @ git+https://github.com/openzim/warc2zim@main", + "warc2zim==2.2.2", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] From 1287351c1d587b35b8810ae5edcd6fb7cb0e3309 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 27 Feb 2025 19:36:11 +0000 Subject: [PATCH 39/65] Upgrade to browsertrix crawler 1.5.6 --- 
CHANGELOG.md | 2 +-
 Dockerfile   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 347200f..1734ccc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Changed

-- Upgrade to browsertrix crawler 1.5.5 (#480)
+- Upgrade to browsertrix crawler 1.5.6 (#482)

 ## [3.0.1] - 2025-02-24

diff --git a/Dockerfile b/Dockerfile
index dc09fe8..923d499 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.5.5
+FROM webrecorder/browsertrix-crawler:1.5.6
 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit

 # add deadsnakes ppa for latest Python on Ubuntu
From 6ee053af5f6cfa1998647106ff72f6905337cb84 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Thu, 27 Feb 2025 19:58:51 +0000
Subject: [PATCH 40/65] Release 3.0.2

---
 CHANGELOG.md           | 2 +-
 src/zimit/__about__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1734ccc..d7f6c9b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).

-## [Unreleased]
+## [3.0.2] - 2025-02-27

 ### Changed

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index f50125d..131942e 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.2-dev0"
+__version__ = "3.0.2"

From 9fc66a95b7c905a1f5467a8d16b4add95ca83716 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Thu, 27 Feb 2025 20:03:37 +0000
Subject: [PATCH 41/65] Prepare for 3.0.3

---
 CHANGELOG.md           | 2 ++
 pyproject.toml         | 2 +-
 src/zimit/__about__.py | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d7f6c9b..0b36f38 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
+## [Unreleased]
+
 ## [3.0.2] - 2025-02-27

 ### Changed
diff --git a/pyproject.toml b/pyproject.toml
index 9aa830a..e4e7696 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
   "requests==2.32.3",
   "inotify==0.2.10",
   "tld==0.13",
-  "warc2zim==2.2.2",
+  "warc2zim @ git+https://github.com/openzim/warc2zim@main",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index 131942e..a1e7aaa 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.2"
+__version__ = "3.0.3-dev0"

From 88b85311e0be0c2180d927139ad3e6100e5fe96f Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 28 Feb 2025 06:13:45 +0000
Subject: [PATCH 42/65] Upgrade to browsertrix crawler 1.5.7

---
 CHANGELOG.md   | 4 ++++
 Dockerfile     | 2 +-
 pyproject.toml | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0b36f38..143c9d3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+### Changed
+
+- Upgrade to browsertrix crawler 1.5.7 (#483)
+
 ## [3.0.2] - 2025-02-27

 ### Changed
diff --git a/Dockerfile b/Dockerfile
index 923d499..c6e9bbb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.5.6
+FROM webrecorder/browsertrix-crawler:1.5.7
 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit

 # add deadsnakes ppa for latest Python on Ubuntu
diff --git a/pyproject.toml b/pyproject.toml
index e4e7696..9aa830a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
   "requests==2.32.3",
   "inotify==0.2.10",
   "tld==0.13",
-  "warc2zim @ git+https://github.com/openzim/warc2zim@main",
+  "warc2zim==2.2.2",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

From 1e6748ab69e1716953b95ae166570974079278ab Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 28 Feb 2025 06:21:27 +0000
Subject: [PATCH 43/65] Release 3.0.3

---
 CHANGELOG.md           | 2 +-
 src/zimit/__about__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 143c9d3..abd27a2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).

-## [Unreleased]
+## [3.0.3] - 2025-02-28

 ### Changed

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index a1e7aaa..8d1c862 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.3-dev0"
+__version__ = "3.0.3"

From 4e0174836d36aa2b1061fff179bdf032ee587d54 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 28 Feb 2025 06:38:52 +0000
Subject: [PATCH 44/65] Prepare for 3.0.4

---
 CHANGELOG.md           | 2 ++
 pyproject.toml         | 2 +-
 src/zimit/__about__.py | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index abd27a2..109319d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
+## [Unreleased]
+
 ## [3.0.3] - 2025-02-28

 ### Changed
diff --git a/pyproject.toml b/pyproject.toml
index 9aa830a..e4e7696 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
   "requests==2.32.3",
   "inotify==0.2.10",
   "tld==0.13",
-  "warc2zim==2.2.2",
+  "warc2zim @ git+https://github.com/openzim/warc2zim@main",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index 8d1c862..31a915c 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.3"
+__version__ = "3.0.4-dev0"

From 146af5de0ad22d1aa479a80786bc075b124122f2 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Wed, 2 Apr 2025 08:04:32 +0000
Subject: [PATCH 45/65] Upgrade to browsertrix crawler 1.5.10

---
 CHANGELOG.md | 4 ++++
 Dockerfile   | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 109319d..94e47e0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+### Changed
+
+- Upgrade to browsertrix crawler 1.5.10 (#491)
+
 ## [3.0.3] - 2025-02-28

 ### Changed
diff --git a/Dockerfile b/Dockerfile
index c6e9bbb..9860ccb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.5.7
+FROM webrecorder/browsertrix-crawler:1.5.9
 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit

 # add deadsnakes ppa for latest Python on Ubuntu
From 12fde3af9810d09425441df8f870965e16513034 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 4 Apr 2025 11:00:29 +0000
Subject: [PATCH 46/65] Release 3.0.4

---
 CHANGELOG.md           | 2 +-
 pyproject.toml         | 2 +-
 src/zimit/__about__.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 94e47e0..e188a13 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).

-## [Unreleased]
+## [3.0.4] - 2025-04-04

 ### Changed

diff --git a/pyproject.toml b/pyproject.toml
index e4e7696..9aa830a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
   "requests==2.32.3",
   "inotify==0.2.10",
   "tld==0.13",
-  "warc2zim @ git+https://github.com/openzim/warc2zim@main",
+  "warc2zim==2.2.2",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index 31a915c..8e10cb4 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.4-dev0"
+__version__ = "3.0.4"

From 3421ca02127aa6a09ff0ff5d4fc874e027a3a910 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 4 Apr 2025 11:09:50 +0000
Subject: [PATCH 47/65] Prepare for 3.0.5

---
 CHANGELOG.md           | 2 ++
 pyproject.toml         | 2 +-
 src/zimit/__about__.py | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e188a13..2464bbc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
+## [Unreleased]
+
 ## [3.0.4] - 2025-04-04

 ### Changed
diff --git a/pyproject.toml b/pyproject.toml
index 9aa830a..e4e7696 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
   "requests==2.32.3",
   "inotify==0.2.10",
   "tld==0.13",
-  "warc2zim==2.2.2",
+  "warc2zim @ git+https://github.com/openzim/warc2zim@main",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index 8e10cb4..dd14b28 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.4"
+__version__ = "3.0.5-dev0"

From 511c3a5021823e588408b280940f51f9f9ba6b4e Mon Sep 17 00:00:00 2001
From: orangetin
Date: Thu, 10 Apr 2025 17:52:19 -0700
Subject: [PATCH 48/65] Upgrade browsertrix-crawler to version 1.6.0 in Dockerfile

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 9860ccb..9666c0b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM webrecorder/browsertrix-crawler:1.5.9
+FROM webrecorder/browsertrix-crawler:1.6.0
 LABEL org.opencontainers.image.source=https://github.com/openzim/zimit

 # add deadsnakes ppa for latest Python on Ubuntu
From b5d87198d85746988a23fdf3def66ca96982eb0c Mon Sep 17 00:00:00 2001
From: orangetin
Date: Thu, 10 Apr 2025 17:54:34 -0700
Subject: [PATCH 49/65] update changelog

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2464bbc..2512f06 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+### Changed
+
+- Upgrade to browsertrix crawler 1.6.0 (#493)
+
 ## [3.0.4] - 2025-04-04

 ### Changed
From 009b8b4bd66e9520e0797c01fd0406a8259eb66d Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 11 Apr 2025 07:18:18 +0000
Subject: [PATCH 50/65] Release 3.0.5

---
 CHANGELOG.md           | 2 +-
 pyproject.toml         | 2 +-
 src/zimit/__about__.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2512f06..bc99b8f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
-## [Unreleased]
+## [3.0.5] - 2025-04-11

 ### Changed

diff --git a/pyproject.toml b/pyproject.toml
index e4e7696..9aa830a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
   "requests==2.32.3",
   "inotify==0.2.10",
   "tld==0.13",
-  "warc2zim @ git+https://github.com/openzim/warc2zim@main",
+  "warc2zim==2.2.2",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index dd14b28..e94f36f 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.5-dev0"
+__version__ = "3.0.5"

From 8c471d9ee2269f22d63cfef383b4fba02241319b Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 11 Apr 2025 07:46:42 +0000
Subject: [PATCH 51/65] Prepare for 3.0.6

---
 CHANGELOG.md           | 2 ++
 pyproject.toml         | 2 +-
 src/zimit/__about__.py | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bc99b8f..5e06e20 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@ All notable changes to this project are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).

+## [Unreleased]
+
 ## [3.0.5] - 2025-04-11

 ### Changed
diff --git a/pyproject.toml b/pyproject.toml
index 9aa830a..e4e7696 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
   "requests==2.32.3",
   "inotify==0.2.10",
   "tld==0.13",
-  "warc2zim==2.2.2",
+  "warc2zim @ git+https://github.com/openzim/warc2zim@main",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

diff --git a/src/zimit/__about__.py b/src/zimit/__about__.py
index e94f36f..281b1bb 100644
--- a/src/zimit/__about__.py
+++ b/src/zimit/__about__.py
@@ -1 +1 @@
-__version__ = "3.0.5"
+__version__ = "3.0.6-dev0"

From 5624cbf08142b321996bfd85ac9c12f1e52d2dae Mon Sep 17 00:00:00 2001
From: Uchechukwu Orji
Date: Tue, 7 Oct 2025 04:08:14 +0100
Subject: [PATCH 52/65] set up offliner definitions

---
 .../update-zim-offliner-definition.yaml       |  38 +
 offliner-definition.json                      | 973 ++++++++++++++++++
 2 files changed, 1011 insertions(+)
 create mode 100644 .github/workflows/update-zim-offliner-definition.yaml
 create mode 100644 offliner-definition.json

diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml
new file mode 100644
index 0000000..4662e62
--- /dev/null
+++ b/.github/workflows/update-zim-offliner-definition.yaml
@@ -0,0 +1,38 @@
+name: Update ZIMFarm Definitions
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - "offliner-definition.json"
+  release:
+    types: [published]
+
+jobs:
+  prepare-json:
+    runs-on: ubuntu-24.04
+    outputs:
+      offliner_definition: ${{ steps.read-json.outputs.offliner_definition }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - id: read-json
+        run: |
+          if [ ! -f "offliner-definition.json" ]; then
+            echo "File not found!" >&2
+            exit 1
+          fi
+          json=$(jq -c . 
offliner-definition.json)
          echo "offliner_definition=$json" >> $GITHUB_OUTPUT
  call-workflow:
    needs: prepare-json
    uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main
    with:
      version: ${{ github.event_name == 'release' && github.event.release.tag_name || 'dev' }}
      offliner: zimit
      offliner_definition: ${{ needs.prepare-json.outputs.offliner_definition }}
    secrets:
      zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }}
diff --git a/offliner-definition.json b/offliner-definition.json
new file mode 100644
index 0000000..c7fed57
--- /dev/null
+++ b/offliner-definition.json
@@ -0,0 +1,973 @@
+{
+  "offliner_id": "zimit",
+  "stdOutput": true,
+  "stdStats": "zimit-progress-file",
+  "flags": {
+    "seeds": {
+      "type": "string",
+      "required": false,
+      "title": "Seeds",
+      "description": "The seed URL(s) to start crawling from. Multiple seed URLs must be separated by a comma (usually not needed, these are just the crawl seeds). First seed URL is used as ZIM homepage"
+    },
+    "seed_file": {
+      "type": "string",
+      "required": false,
+      "title": "Seed File",
+      "description": "If set, read a list of seed urls, one per line. HTTPS URL to an online file."
+    },
+    "lang": {
+      "type": "string",
+      "required": false,
+      "title": "Browser Language",
+      "description": "If set, sets the language used by the browser, should be ISO 639 language[-country] code, e.g. `en` or `en-GB`"
+    },
+    "title": {
+      "type": "string",
+      "required": false,
+      "title": "Title",
+      "description": "Custom title for your ZIM. Defaults to title of main page",
+      "minLength": 1,
+      "maxLength": 30
+    },
+    "description": {
+      "type": "string",
+      "required": false,
+      "title": "Description",
+      "description": "Description for ZIM",
+      "minLength": 1,
+      "maxLength": 80
+    },
+    "favicon": {
+      "type": "url",
+      "required": false,
+      "title": "Illustration",
+      "description": "URL for Illustration. "
+    },
+    "tags": {
+      "type": "string",
+      "required": false,
+      "title": "ZIM Tags",
+      "description": "Single string with individual tags separated by a semicolon."
+    },
+    "creator": {
+      "type": "string",
+      "required": false,
+      "title": "Creator",
+      "description": "Name of content creator"
+    },
+    "publisher": {
+      "type": "string",
+      "required": false,
+      "title": "Publisher",
+      "isPublisher": true,
+      "description": "Custom publisher name (ZIM metadata). openZIM otherwise"
+    },
+    "source": {
+      "type": "string",
+      "required": false,
+      "title": "Source",
+      "description": "Source name/URL of content"
+    },
+    "workers": {
+      "type": "integer",
+      "required": false,
+      "title": "Workers",
+      "description": "The number of workers to run in parallel. Defaults to 1",
+      "min": 1
+    },
+    "wait_until": {
+      "type": "string",
+      "required": false,
+      "title": "WaitUntil",
+      "description": "Puppeteer page.goto() condition to wait for before continuing. One of load, domcontentloaded, networkidle0 or networkidle2, or a comma-separated combination of those. Default is load,networkidle2"
+    },
+    "extra_hops": {
+      "type": "integer",
+      "required": false,
+      "title": "Extra Hops",
+      "description": "Number of extra 'hops' to follow, beyond the current scope. Default is 0",
+      "min": 0
+    },
+    "page_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Page Limit",
+      "description": "Limit crawl to this number of pages. Default is 0 (no-limit).",
+      "min": 0
+    },
+    "max_page_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Max Page Limit",
+      "description": "Maximum pages to crawl, overriding pageLimit if both are set. 
Default is 0 (no-limit)",
+      "min": 0
+    },
+    "page_load_timeout": {
+      "type": "integer",
+      "required": false,
+      "title": "Page Load Timeout",
+      "description": "Timeout for each page to load (in seconds). Default is 90",
+      "min": 0
+    },
+    "scope_type": {
+      "type": "string-enum",
+      "required": false,
+      "title": "Scope Type",
+      "description": "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom if scopeIncludeRx is set, prefix otherwise.",
+      "choices": [
+        {
+          "title": "Page",
+          "value": "page"
+        },
+        {
+          "title": "Page SPA",
+          "value": "page-spa"
+        },
+        {
+          "title": "Prefix",
+          "value": "prefix"
+        },
+        {
+          "title": "Host",
+          "value": "host"
+        },
+        {
+          "title": "Domain",
+          "value": "domain"
+        },
+        {
+          "title": "Any",
+          "value": "any"
+        },
+        {
+          "title": "Custom",
+          "value": "custom"
+        }
+      ]
+    },
+    "scope_include_rx": {
+      "type": "string",
+      "required": false,
+      "title": "Scope Include Regex",
+      "description": "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of seed)"
+    },
+    "scope_exclude_rx": {
+      "type": "string",
+      "required": false,
+      "title": "Scope Exclude Regex",
+      "description": "Regex of page URLs that should be excluded from the crawl"
+    },
+    "allow_hash_urls": {
+      "type": "boolean",
+      "required": false,
+      "title": "Allow Hashtag URLs",
+      "description": "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content"
+    },
+    "mobile_device": {
+      "type": "string-enum",
+      "required": false,
+      "title": "As device",
+      "description": "Device to crawl as. See Puppeteer's Device.ts for a list",
+      "choices": [
+        {
+          "title": "Blackberry Playbook",
+          "value": "Blackberry PlayBook"
+        },
+        {
+          "title": "Blackberry Playbook Landscape",
+          "value": "Blackberry PlayBook landscape"
+        },
+        {
+          "title": "Blackberry Z30",
+          "value": "BlackBerry Z30"
+        },
+        {
+          "title": "Blackberry Z30 Landscape",
+          "value": "BlackBerry Z30 landscape"
+        },
+        {
+          "title": "Galaxy Note 3",
+          "value": "Galaxy Note 3"
+        },
+        {
+          "title": "Galaxy Note 3 Landscape",
+          "value": "Galaxy Note 3 landscape"
+        },
+        {
+          "title": "Galaxy Note II",
+          "value": "Galaxy Note II"
+        },
+        {
+          "title": "Galaxy Note II Landscape",
+          "value": "Galaxy Note II landscape"
+        },
+        {
+          "title": "Galaxy S III",
+          "value": "Galaxy S III"
+        },
+        {
+          "title": "Galaxy S III Landscape",
+          "value": "Galaxy S III landscape"
+        },
+        {
+          "title": "Galaxy S5",
+          "value": "Galaxy S5"
+        },
+        {
+          "title": "Galaxy S5 Landscape",
+          "value": "Galaxy S5 landscape"
+        },
+        {
+          "title": "Galaxy S8",
+          "value": "Galaxy S8"
+        },
+        {
+          "title": "Galaxy S8 Landscape",
+          "value": "Galaxy S8 landscape"
+        },
+        {
+          "title": "Galaxy S9 Plus",
+          "value": "Galaxy S9+"
+        },
+        {
+          "title": "Galaxy S9 Plus Landscape",
+          "value": "Galaxy S9+ landscape"
+        },
+        {
+          "title": "Galaxy Tab S4",
+          "value": "Galaxy Tab S4"
+        },
+        {
+          "title": "Galaxy Tab S4 Landscape",
+          "value": "Galaxy Tab S4 landscape"
+        },
+        {
+          "title": "iPad",
+          "value": "iPad"
+        },
+        {
+          "title": "iPad Landscape",
+          "value": "iPad landscape"
+        },
+        {
+          "title": "iPad Gen 6",
+          "value": "iPad (gen 6)"
+        },
+        {
+          "title": "iPad Gen 6 Landscape",
+          "value": "iPad (gen 6) landscape"
+        },
+        {
+          "title": "iPad Gen 7",
+          "value": "iPad (gen 7)"
+        },
+        {
+          "title": "iPad Gen 7 Landscape",
+          "value": "iPad (gen 7) landscape"
+        },
+        {
+          "title": "iPad Mini",
+          "value": "iPad Mini"
+        },
+        {
+          "title": "iPad Mini Landscape",
+          "value": "iPad Mini landscape"
+        },
+ { + "title": "iPad Pro", + "value": "iPad Pro" + }, + { + "title": "iPad Pro Landscape", + "value": "iPad Pro landscape" + }, + { + "title": "iPad Pro 11", + "value": "iPad Pro 11" + }, + { + "title": "iPad Pro 11 Landscape", + "value": "iPad Pro 11 landscape" + }, + { + "title": "iPhone 4", + "value": "iPhone 4" + }, + { + "title": "iPhone 4 Landscape", + "value": "iPhone 4 landscape" + }, + { + "title": "iPhone 5", + "value": "iPhone 5" + }, + { + "title": "iPhone 5 Landscape", + "value": "iPhone 5 landscape" + }, + { + "title": "iPhone 6", + "value": "iPhone 6" + }, + { + "title": "iPhone 6 Landscape", + "value": "iPhone 6 landscape" + }, + { + "title": "iPhone 6 Plus", + "value": "iPhone 6 Plus" + }, + { + "title": "iPhone 6 Plus Landscape", + "value": "iPhone 6 Plus landscape" + }, + { + "title": "iPhone 7", + "value": "iPhone 7" + }, + { + "title": "iPhone 7 Landscape", + "value": "iPhone 7 landscape" + }, + { + "title": "iPhone 7 Plus", + "value": "iPhone 7 Plus" + }, + { + "title": "iPhone 7 Plus Landscape", + "value": "iPhone 7 Plus landscape" + }, + { + "title": "iPhone 8", + "value": "iPhone 8" + }, + { + "title": "iPhone 8 Landscape", + "value": "iPhone 8 landscape" + }, + { + "title": "iPhone 8 Plus", + "value": "iPhone 8 Plus" + }, + { + "title": "iPhone 8 Plus Landscape", + "value": "iPhone 8 Plus landscape" + }, + { + "title": "iPhone SE", + "value": "iPhone SE" + }, + { + "title": "iPhone SE Landscape", + "value": "iPhone SE landscape" + }, + { + "title": "iPhone X", + "value": "iPhone X" + }, + { + "title": "iPhone X Landscape", + "value": "iPhone X landscape" + }, + { + "title": "iPhone XR", + "value": "iPhone XR" + }, + { + "title": "iPhone XR Landscape", + "value": "iPhone XR landscape" + }, + { + "title": "iPhone 11", + "value": "iPhone 11" + }, + { + "title": "iPhone 11 Landscape", + "value": "iPhone 11 landscape" + }, + { + "title": "iPhone 11 Pro", + "value": "iPhone 11 Pro" + }, + { + "title": "iPhone 11 Pro Landscape", + "value": "iPhone 11 Pro landscape" + }, + { + "title": "iPhone 11 Pro Max", + "value": "iPhone 11 Pro Max" + }, + { + "title": "iPhone 11 Pro Max Landscape", + "value": "iPhone 11 Pro Max landscape" + }, + { + "title": "iPhone 12", + "value": "iPhone 12" + }, + { + "title": "iPhone 12 Landscape", + "value": "iPhone 12 landscape" + }, + { + "title": "iPhone 12 Pro", + "value": "iPhone 12 Pro" + }, + { + "title": "iPhone 12 Pro Landscape", + "value": "iPhone 12 Pro landscape" + }, + { + "title": "iPhone 12 Pro Max", + "value": "iPhone 12 Pro Max" + }, + { + "title": "iPhone 12 Pro Max Landscape", + "value": "iPhone 12 Pro Max landscape" + }, + { + "title": "iPhone 12 Mini", + "value": "iPhone 12 Mini" + }, + { + "title": "iPhone 12 Mini Landscape", + "value": "iPhone 12 Mini landscape" + }, + { + "title": "iPhone 13", + "value": "iPhone 13" + }, + { + "title": "iPhone 13 Landscape", + "value": "iPhone 13 landscape" + }, + { + "title": "iPhone 13 Pro", + "value": "iPhone 13 Pro" + }, + { + "title": "iPhone 13 Pro Landscape", + "value": "iPhone 13 Pro landscape" + }, + { + "title": "iPhone 13 Pro Max", + "value": "iPhone 13 Pro Max" + }, + { + "title": "iPhone 13 Pro Max Landscape", + "value": "iPhone 13 Pro Max landscape" + }, + { + "title": "iPhone 13 Mini", + "value": "iPhone 13 Mini" + }, + { + "title": "iPhone 13 Mini Landscape", + "value": "iPhone 13 Mini landscape" + }, + { + "title": "Jio Phone 2", + "value": "JioPhone 2" + }, + { + "title": "Jio Phone 2 Landscape", + "value": "JioPhone 2 landscape" + }, + { + "title": "Kindle Fire HDX", + 
"value": "Kindle Fire HDX" + }, + { + "title": "Kindle Fire HDX Landscape", + "value": "Kindle Fire HDX landscape" + }, + { + "title": "LG Optimus L70", + "value": "LG Optimus L70" + }, + { + "title": "LG Optimus L70 Landscape", + "value": "LG Optimus L70 landscape" + }, + { + "title": "Microsoft Lumia 550", + "value": "Microsoft Lumia 550" + }, + { + "title": "Microsoft Lumia 950", + "value": "Microsoft Lumia 950" + }, + { + "title": "Microsoft Lumia 950 Landscape", + "value": "Microsoft Lumia 950 landscape" + }, + { + "title": "Nexus 10", + "value": "Nexus 10" + }, + { + "title": "Nexus 10 Landscape", + "value": "Nexus 10 landscape" + }, + { + "title": "Nexus 4", + "value": "Nexus 4" + }, + { + "title": "Nexus 4 Landscape", + "value": "Nexus 4 landscape" + }, + { + "title": "Nexus 5", + "value": "Nexus 5" + }, + { + "title": "Nexus 5 Landscape", + "value": "Nexus 5 landscape" + }, + { + "title": "Nexus 5X", + "value": "Nexus 5X" + }, + { + "title": "Nexus 5X Landscape", + "value": "Nexus 5X landscape" + }, + { + "title": "Nexus 6", + "value": "Nexus 6" + }, + { + "title": "Nexus 6 Landscape", + "value": "Nexus 6 landscape" + }, + { + "title": "Nexus 6P", + "value": "Nexus 6P" + }, + { + "title": "Nexus 6P Landscape", + "value": "Nexus 6P landscape" + }, + { + "title": "Nexus 7", + "value": "Nexus 7" + }, + { + "title": "Nexus 7 Landscape", + "value": "Nexus 7 landscape" + }, + { + "title": "Nokia Lumia 520", + "value": "Nokia Lumia 520" + }, + { + "title": "Nokia Lumia 520 Landscape", + "value": "Nokia Lumia 520 landscape" + }, + { + "title": "Nokia N9", + "value": "Nokia N9" + }, + { + "title": "Nokia N9 Landscape", + "value": "Nokia N9 landscape" + }, + { + "title": "Pixel 2", + "value": "Pixel 2" + }, + { + "title": "Pixel 2 Landscape", + "value": "Pixel 2 landscape" + }, + { + "title": "Pixel 2 XL", + "value": "Pixel 2 XL" + }, + { + "title": "Pixel 2 XL Landscape", + "value": "Pixel 2 XL landscape" + }, + { + "title": "Pixel 3", + "value": "Pixel 3" + }, + { + "title": "Pixel 3 Landscape", + "value": "Pixel 3 landscape" + }, + { + "title": "Pixel 4", + "value": "Pixel 4" + }, + { + "title": "Pixel 4 Landscape", + "value": "Pixel 4 landscape" + }, + { + "title": "Pixel 4A 5G", + "value": "Pixel 4a (5G)" + }, + { + "title": "Pixel 4A 5G Landscape", + "value": "Pixel 4a (5G) landscape" + }, + { + "title": "Pixel 5", + "value": "Pixel 5" + }, + { + "title": "Pixel 5 Landscape", + "value": "Pixel 5 landscape" + }, + { + "title": "Moto G4", + "value": "Moto G4" + }, + { + "title": "Moto G4 Landscape", + "value": "Moto G4 landscape" + } + ] + }, + "select_links": { + "type": "string", + "required": false, + "title": "Select Links", + "description": "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]" + }, + "click_selector": { + "type": "string", + "required": false, + "title": "Click Selector", + "description": "Selector for elements to click when using the autoclick behavior. 
Default is 'a'" + }, + "block_rules": { + "type": "string", + "required": false, + "title": "Block Rules", + "description": "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe" + }, + "block_message": { + "type": "string", + "required": false, + "title": "Block Message", + "description": "If specified, when a URL is blocked, a record with this error message is added instead" + }, + "block_ads": { + "type": "boolean", + "required": false, + "title": "Block Ads", + "description": "If set, block advertisements from being loaded (based on Stephen Black's blocklist). Note that some bad domains are also blocked by zimit configuration even if this option is not set." + }, + "ad_block_message": { + "type": "string", + "required": false, + "title": "Ads Block Message", + "description": "If specified, when an ad is blocked, a record with this error message is added instead" + }, + "user_agent": { + "type": "string", + "required": false, + "title": "User Agent", + "description": "Override user-agent with specified" + }, + "user_agent_suffix": { + "type": "string", + "required": false, + "title": "User Agent Suffix", + "description": "Append suffix to existing browser user-agent. Defaults to +Zimit" + }, + "use_sitemap": { + "type": "string", + "required": false, + "title": "Sitemap URL", + "description": "Use as sitemap to get additional URLs for the crawl (usually at /sitemap.xml)" + }, + "sitemap_from_date": { + "type": "string", + "required": false, + "title": "Sitemap From Date", + "description": "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)" + }, + "sitemap_to_date": { + "type": "string", + "required": false, + "title": "Sitemap To Date", + "description": "If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)" + }, + "behavior_timeout": { + "type": "integer", + "required": false, + "title": "Behavior Timeout", + "description": "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish. Default is 90.", + "min": 0 + }, + "post_load_delay": { + "type": "integer", + "required": false, + "title": "Post Load Delay", + "description": "If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors. Default is 0.", + "min": 0 + }, + "page_extra_delay": { + "type": "integer", + "required": false, + "title": "Page Extra Delay", + "description": "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page. Default is 0.", + "min": 0 + }, + "dedup_policy": { + "type": "string-enum", + "required": false, + "title": "Dedup Policy", + "description": "Deduplication policy. One of skip, revisit or keep. Default is skip", + "choices": [ + { + "title": "Skip", + "value": "skip" + }, + { + "title": "Revisit", + "value": "revisit" + }, + { + "title": "Keep", + "value": "keep" + } + ] + }, + "screenshot": { + "type": "string", + "required": false, + "title": "Screenshot", + "description": "Screenshot options for crawler. One of view, thumbnail, fullPage, fullPageFinal or a comma-separated combination of those." + }, + "size_soft_limit": { + "type": "integer", + "required": false, + "title": "Size Soft Limit", + "description": "If set, save crawl state and stop crawl if WARC size exceeds this value. 
ZIM will still be created.",
+      "min": 0
+    },
+    "size_hard_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Size Hard Limit",
+      "description": "If set, exit crawler and fail the scraper immediately if WARC size exceeds this value",
+      "min": 0
+    },
+    "disk_utilization": {
+      "type": "integer",
+      "required": false,
+      "title": "Disk Utilization",
+      "description": "Save state and exit if disk utilization exceeds this percentage value. Default (if not set) is 90%. Set to 0 to disable disk utilization check.",
+      "min": 0
+    },
+    "time_soft_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Time Soft Limit",
+      "description": "If set, save crawl state and stop crawl if WARC(s) creation takes longer than this value, in seconds. ZIM will still be created.",
+      "min": 0
+    },
+    "time_hard_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Time Hard Limit",
+      "description": "If set, exit crawler and fail the scraper immediately if WARC(s) creation takes longer than this value, in seconds",
+      "min": 0
+    },
+    "net_idle_wait": {
+      "type": "integer",
+      "required": false,
+      "title": "Net Idle Wait",
+      "description": "If set, wait for network idle after page load and after behaviors are done (in seconds). If -1 (default), determine based on scope."
+    },
+    "origin_override": {
+      "type": "string",
+      "required": false,
+      "title": "Origin Override",
+      "description": "If set, will redirect requests from each origin in key to origin in the value, eg. https://host:port=http://alt-host:alt-port."
+    },
+    "max_page_retries": {
+      "type": "integer",
+      "required": false,
+      "title": "Max Page Retries",
+      "description": "If set, number of times to retry a page that failed to load before page is considered to have failed. Default is 2.",
+      "min": 0
+    },
+    "fail_on_failed_seed": {
+      "type": "boolean",
+      "required": false,
+      "title": "Fail on failed seed",
+      "description": "If set, mark the crawl as failed if any seed fails to load"
+    },
+    "fail_on_invalid_status": {
+      "type": "boolean",
+      "required": false,
+      "title": "Fail on invalid status",
+      "description": "If set, will treat pages with 4xx or 5xx response as failures. When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses"
+    },
+    "fail_on_failed_limit": {
+      "type": "integer",
+      "required": false,
+      "title": "Fail on failed - Limit",
+      "description": "If set, save state and exit if number of failed pages exceeds this value.",
+      "min": 0
+    },
+    "warcs": {
+      "type": "string",
+      "required": false,
+      "title": "WARC files",
+      "description": "Comma-separated list of WARC files to use as input."
+    },
+    "verbose": {
+      "type": "boolean",
+      "required": false,
+      "title": "Verbose mode",
+      "description": "Whether to display additional logs"
+    },
+    "keep": {
+      "type": "boolean",
+      "required": false,
+      "title": "Keep",
+      "description": "Should be True. Developer option: must be True if we want to keep the WARC files for artifacts archiving.",
+      "default": true
+    },
+    "output": {
+      "type": "string",
+      "required": false,
+      "title": "Output folder",
+      "description": "Output folder for ZIM file(s). 
Leave it as `/output`", + "pattern": "^/output$" + }, + "admin_email": { + "type": "email", + "required": false, + "title": "Admin Email", + "description": "Admin Email for crawler: used in UserAgent so website admin can contact us", + "default": "contact+zimfarm@kiwix.org" + }, + "profile": { + "type": "string", + "required": false, + "title": "Browser profile", + "description": "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory for Browsertrix crawler." + }, + "behaviors": { + "type": "string", + "required": false, + "title": "Behaviors", + "description": "Which background behaviors to enable on each page. Defaults to autoplay,autofetch,siteSpecific." + }, + "depth": { + "type": "integer", + "required": false, + "title": "Depth", + "description": "The depth of the crawl for all seeds. Default is -1 (infinite).", + "min": -1 + }, + "zim_lang": { + "type": "string", + "required": false, + "title": "ZIM Language", + "description": "Language metadata of ZIM (warc2zim --lang param). ISO-639-3 code. Retrieved from homepage if found, fallback to `eng`", + "alias": "zim-lang", + "customValidator": "language_code" + }, + "long_description": { + "type": "string", + "required": false, + "title": "Long description", + "description": "Optional long description for your ZIM", + "minLength": 1, + "maxLength": 4000, + "alias": "long-description" + }, + "custom_css": { + "type": "url", + "required": false, + "title": "Custom CSS", + "description": "URL to a CSS file to inject into pages", + "alias": "custom-css" + }, + "charsets_to_try": { + "type": "string", + "required": false, + "title": "Charsets to try", + "description": "List of charsets to try decode content when charset is not found", + "alias": "charsets-to-try" + }, + "ignore_content_header_charsets": { + "type": "boolean", + "required": false, + "title": "Ignore Content Header Charsets", + "description": "Ignore the charsets specified in content headers - first bytes - typically because they are wrong.", + "alias": "ignore-content-header-charsets" + }, + "content_header_bytes_length": { + "type": "integer", + "required": false, + "title": "Content Header Bytes Length", + "description": "How many bytes to consider when searching for content charsets in header (default is 1024).", + "alias": "content-header-bytes-length", + "min": 0 + }, + "ignore_http_header_charsets": { + "type": "boolean", + "required": false, + "title": "Ignore HTTP Header Charsets", + "description": "Ignore the charsets specified in HTTP `Content-Type` headers, typically because they are wrong.", + "alias": "ignore-http-header-charsets" + }, + "encoding_aliases": { + "type": "string", + "required": false, + "title": "Encoding Aliases", + "description": "List of encoding/charset aliases to decode WARC content. Aliases are used when the encoding specified in upstream server exists in Python under a different name. This parameter is single string, multiple values are separated by a comma, like in alias1=encoding1,alias2=encoding2.", + "alias": "encoding-aliases" + }, + "custom_behaviors": { + "type": "string", + "required": false, + "title": "Custom Behaviors", + "description": "JS code for custom behaviors to customize crawler. Single string with individual JS files URL/path separated by a comma.", + "alias": "custom-behaviours" + }, + "zimit_progress_file": { + "type": "string", + "required": false, + "title": "Zimit Progress File", + "description": "Scraping progress file. 
Leave it as `/output/task_progress.json`", + "alias": "zimit-progress-file", + "pattern": "^/output/task_progress\\.json$" + }, + "replay_viewer_source": { + "type": "url", + "required": false, + "title": "Replay Viewer Source", + "description": "URL from which to load the ReplayWeb.page replay viewer from", + "alias": "replay-viewer-source" + }, + "zim_file": { + "type": "string", + "required": false, + "title": "ZIM filename", + "description": "ZIM file name (based on --name if not provided). Include {period} to insert date period dynamically", + "alias": "zim-file", + "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+_)([a-z0-9\\-\\.]+_|)([\\d]{4}-[\\d]{2}|\\{period\\}).zim$", + "relaxedPattern": "^[A-Za-z0-9._-]+$" + }, + "name": { + "type": "string", + "required": true, + "title": "ZIM name", + "description": "Name of the ZIM.", + "alias": "name", + "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$", + "relaxedPattern": "^[A-Za-z0-9._-]+$" + } + } +} From 4ec47cd6dd7c8c69fa390f69c488fbc5ff9d1966 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Wed, 8 Oct 2025 04:25:12 +0100 Subject: [PATCH 53/65] use base64 string as argument to workflow call --- .github/workflows/update-zim-offliner-definition.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml index 4662e62..ee26474 100644 --- a/.github/workflows/update-zim-offliner-definition.yaml +++ b/.github/workflows/update-zim-offliner-definition.yaml @@ -25,14 +25,14 @@ jobs: echo "File not found!" >&2 exit 1 fi - json=$(jq -c . offliner-definition.json) - echo "offliner_definition=$json" >> $GITHUB_OUTPUT + json_b64=$(base64 -w0 <<< "$(jq -c . 
offliner-definition.json)") + echo "offliner_definition_b64=$json_b64" >> $GITHUB_OUTPUT call-workflow: needs: prepare-json uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main with: version: ${{ github.event_name == 'release' && github.event.release.tag_name || 'dev' }} offliner: zimit - offliner_definition: ${{ needs.prepare-json.outputs.offliner_definition }} + offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }} secrets: zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }} From ad09665c4a93a503b0394f50a3835f69e6b6c6e5 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Fri, 10 Oct 2025 10:22:29 +0100 Subject: [PATCH 54/65] add workflow dispatch to update-offliner ci --- .github/workflows/update-zim-offliner-definition.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml index ee26474..982fe03 100644 --- a/.github/workflows/update-zim-offliner-definition.yaml +++ b/.github/workflows/update-zim-offliner-definition.yaml @@ -8,6 +8,13 @@ on: release: types: [published] + workflow_dispatch: + inputs: + version: + description: "Version to publish (leave blank to use 'dev')" + required: false + default: "dev" + jobs: prepare-json: runs-on: ubuntu-24.04 @@ -31,7 +38,7 @@ jobs: needs: prepare-json uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main with: - version: ${{ github.event_name == 'release' && github.event.release.tag_name || 'dev' }} + version: ${{ github.event_name == 'release' && github.event.release.tag_name || (github.event.inputs.version || 'dev') }} offliner: zimit offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }} secrets: From a9805c84c284fc23f0e6497b79cb42e53e4adb28 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Fri, 10 Oct 2025 10:34:26 +0100 Subject: [PATCH 55/65] set proper outputs name --- .github/workflows/update-zim-offliner-definition.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/update-zim-offliner-definition.yaml b/.github/workflows/update-zim-offliner-definition.yaml index 982fe03..f481354 100644 --- a/.github/workflows/update-zim-offliner-definition.yaml +++ b/.github/workflows/update-zim-offliner-definition.yaml @@ -11,7 +11,7 @@ on: workflow_dispatch: inputs: version: - description: "Version to publish (leave blank to use 'dev')" + description: "Version to publish" required: false default: "dev" @@ -19,7 +19,7 @@ jobs: prepare-json: runs-on: ubuntu-24.04 outputs: - offliner_definition: ${{ steps.read-json.outputs.offliner_definition }} + offliner_definition_b64: ${{ steps.read-json.outputs.offliner_definition_b64 }} steps: - name: Checkout repository uses: actions/checkout@v4 From 44cf4218cb1940b4fd0cfa45032da1b8d3fdf130 Mon Sep 17 00:00:00 2001 From: Vitaly Zdanevich Date: Mon, 20 Oct 2025 01:22:31 +0400 Subject: [PATCH 56/65] README.md: add link to https://en.wikipedia.org/wiki/ZIM_(file_format) Signed-off-by: Vitaly Zdanevich --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 894f523..188615f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ Zimit ===== -Zimit is a scraper allowing to create ZIM file from any Web site. +Zimit is a scraper allowing to create [ZIM file](https://en.wikipedia.org/wiki/ZIM_(file_format)) from any Web site. 
[![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit)
 [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)

From 611d2033f7500a117aed9069fcad0abd7384b14c Mon Sep 17 00:00:00 2001
From: Chris Routh
Date: Thu, 6 Nov 2025 09:29:15 -0800
Subject: [PATCH 57/65] Issue #499 - Use build dir rather than random tmp dir
 when passed.

---
 src/zimit/zimit.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index 02b167d..30c5de0 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -796,11 +796,13 @@ def run(raw_args):
     if known_args.adminEmail:
         user_agent_suffix += f" {known_args.adminEmail}"

-    # make temp dir for this crawl
+    # set temp dir to use for this crawl
     global temp_root_dir  # noqa: PLW0603
     if known_args.build:
-        temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.build, prefix=".tmp"))
+        # use build dir argument if passed
+        temp_root_dir = Path(known_args.build)
     else:
+        # make new randomized temp dir
         temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp"))

     seeds = []
From b5d87198d85746988a23fdf3def66ca96982eb0c Mon Sep 17 00:00:00 2001
From: Chris Routh
Date: Thu, 6 Nov 2025 09:36:47 -0800
Subject: [PATCH 58/65] Issue #499 - Only register cleanup if neither build or
 keep arguments have been passed.

---
 src/zimit/zimit.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index 30c5de0..9ed8a20 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -856,7 +856,8 @@ def run(raw_args):
         logger.info("Exiting, invalid warc2zim params")
         return EXIT_CODE_WARC2ZIM_CHECK_FAILED

-    if not known_args.keep:
+    # only trigger cleanup when neither the build nor the keep argument is passed.
+    if not known_args.build and not known_args.keep:
         atexit.register(cleanup)

     # copy / download custom behaviors to one single folder and configure crawler
From 57a88434e22517f6cffb63070d7852b11ad2d7b8 Mon Sep 17 00:00:00 2001
From: Chris Routh
Date: Thu, 6 Nov 2025 11:49:58 -0800
Subject: [PATCH 59/65] Issue #499 - Use all warc_directories found when no
 specific collection has been passed.

---
 src/zimit/zimit.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index 9ed8a20..a91c4e4 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -1079,12 +1079,11 @@ def run(raw_args):
             )
         elif len(warc_dirs) > 1:
             logger.info(
-                "Found many WARC files directories, only most recently modified one"
-                " will be used"
+                "Found many WARC files directories, combining pages from all of them"
             )
             for directory in warc_dirs:
                 logger.info(f"- {directory}")
-            warc_files = [warc_dirs[-1]]
+            warc_files = warc_dirs

     logger.info("")
     logger.info("----------")
From 6db73a0a83f6d3b028175ba3a918a75493340f70 Mon Sep 17 00:00:00 2001
From: Chris Routh
Date: Thu, 6 Nov 2025 12:19:28 -0800
Subject: [PATCH 60/65] Issue #499 - Ensure build directory exists when passed.
 
---
 src/zimit/zimit.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index a91c4e4..fb070a0 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -801,6 +801,7 @@ def run(raw_args):
     if known_args.build:
         # use build dir argument if passed
         temp_root_dir = Path(known_args.build)
+        temp_root_dir.mkdir(parents=True, exist_ok=True)
     else:
         # make new randomized temp dir
         temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp"))

From ef004f38635a2b1db1d1385212f913f04ee659e4 Mon Sep 17 00:00:00 2001
From: Chris Routh
Date: Fri, 7 Nov 2025 11:33:01 -0800
Subject: [PATCH 61/65] Issue #499 Record changes in CHANGELOG

---
 CHANGELOG.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5e06e20..58fb40a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+### Changed
+- Fix issues preventing interrupted crawls from being resumed. (#499)
+  - Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist.
+  - Use all warc_dirs found instead of just the latest so interrupted crawls use all collected pages across runs when an explicit collections directory is not passed.
+  - Don't cleanup an explicitly passed build directory.
+
 ## [3.0.5] - 2025-04-11

 ### Changed
From e30a82a91c4e75de290e04b6d9b56aa9d5832799 Mon Sep 17 00:00:00 2001
From: Chris Routh
Date: Fri, 7 Nov 2025 12:59:25 -0800
Subject: [PATCH 62/65] PR #524 Fix line length.

---
 src/zimit/zimit.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py
index fb070a0..e982cbd 100755
--- a/src/zimit/zimit.py
+++ b/src/zimit/zimit.py
@@ -1080,7 +1080,8 @@ def run(raw_args):
             )
         elif len(warc_dirs) > 1:
             logger.info(
-                "Found many WARC files directories, combining pages from all of them"
+                "Found many WARC files directories, combining pages from all "
+                "of them"
             )
             for directory in warc_dirs:
                 logger.info(f"- {directory}")
From aec19d95d2257f72445746f92759e9b88574a31a Mon Sep 17 00:00:00 2001
From: Uchechukwu Orji
Date: Mon, 15 Dec 2025 14:25:24 +0100
Subject: [PATCH 63/65] migrate custom_css and favicon flags to blob types

---
 offliner-definition.json | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/offliner-definition.json b/offliner-definition.json
index c7fed57..89bdd51 100644
--- a/offliner-definition.json
+++ b/offliner-definition.json
@@ -38,7 +38,8 @@
       "maxLength": 80
     },
     "favicon": {
-      "type": "url",
+      "type": "blob",
+      "kind": "image",
       "required": false,
       "title": "Illustration",
       "description": "URL for Illustration. 
" @@ -887,7 +888,8 @@ "alias": "long-description" }, "custom_css": { - "type": "url", + "type": "blob", + "kind": "image", "required": false, "title": "Custom CSS", "description": "URL to a CSS file to inject into pages", From 34ce7eb98dc7a35dac7f3824a856e5e1e23587bf Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 16 Dec 2025 16:33:53 +0000 Subject: [PATCH 64/65] Fix offliner definition --- offliner-definition.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/offliner-definition.json b/offliner-definition.json index 89bdd51..4000466 100644 --- a/offliner-definition.json +++ b/offliner-definition.json @@ -889,7 +889,7 @@ }, "custom_css": { "type": "blob", - "kind": "image", + "kind": "css", "required": false, "title": "Custom CSS", "description": "URL to a CSS file to inject into pages", From 81018f06fa15517917c4c6e52d0212ca669b35dc Mon Sep 17 00:00:00 2001 From: Aaryan Kumar Sinha Date: Sat, 13 Dec 2025 01:30:33 +0530 Subject: [PATCH 65/65] Added --overwrite flag to zimit --- CHANGELOG.md | 3 ++ offliner-definition.json | 6 +++ src/zimit/zimit.py | 6 ++- tests/conftest.py | 14 ++++++ tests/data/example-response.warc | Bin 0 -> 2272 bytes tests/test_overwrite.py | 83 +++++++++++++++++++++++++++++++ 6 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 tests/conftest.py create mode 100644 tests/data/example-response.warc create mode 100644 tests/test_overwrite.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 58fb40a..2a99b30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Added `--overwrite` flag to overwrite existing ZIM file if it exists (#399) + ### Changed - Fix issues preventing interrupted crawls from being resumed. (#499) - Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist. 
diff --git a/offliner-definition.json b/offliner-definition.json index 4000466..4bb68b5 100644 --- a/offliner-definition.json +++ b/offliner-definition.json @@ -970,6 +970,12 @@ "alias": "name", "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$", "relaxedPattern": "^[A-Za-z0-9._-]+$" + }, + "overwrite": { + "type": "boolean", + "required": false, + "title": "Overwrite", + "description": "Whether to overwrite existing ZIM file if it exists" } } } diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index e982cbd..b205007 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -849,6 +849,9 @@ def run(raw_args): warc2zim_args.append("--lang") warc2zim_args.append(known_args.zim_lang) + if known_args.overwrite: + warc2zim_args.append("--overwrite") + logger.info("----------") logger.info("Testing warc2zim args") logger.info("Running: warc2zim " + " ".join(warc2zim_args)) @@ -1036,7 +1039,6 @@ def run(raw_args): warc_files.append(Path(extract_path)) else: - logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") crawl = subprocess.run(crawler_args, check=False) if ( @@ -1091,7 +1093,7 @@ def run(raw_args): logger.info("----------") logger.info( f"Processing WARC files in/at " - f'{" ".join(str(warc_file) for warc_file in warc_files)}' + f"{' '.join(str(warc_file) for warc_file in warc_files)}" ) warc2zim_args.extend(str(warc_file) for warc_file in warc_files) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..d51650d --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,14 @@ +import pytest + +from zimit import zimit as app + +""" + cleanup disabled because atexit hooks run at the very end of the Python process + shutdown. By the time cleanup() is called, the logging module has already closed its + file streams. 
+""" + + +@pytest.fixture(autouse=True) +def disable_zimit_cleanup(monkeypatch): + monkeypatch.setattr(app, "cleanup", lambda: None) diff --git a/tests/data/example-response.warc b/tests/data/example-response.warc new file mode 100644 index 0000000000000000000000000000000000000000..143b947d121e61b479cf9cae596b3853a9a60633 GIT binary patch literal 2272 zcmb`I`9Bkk1IJ13mB}8VF(i@kY%^vP#gc21YjZ|(tKqG9zOV21Klpw>KYxDuyxy<(2SO1MX-n+EA32QhCSDKm^z*;tt%-oa zbzumE4h)IVMxfD1Jx$oZqp70_gTvu^a6R;OZ!|(H;-5YrE|U`=pae?&C9rW^Ya*p1 zbjT~mR!ua~n4kol@A;&6KjUFCyxAaDN7gUAWNv-;u#?O2K?`&&1yHTSX2yZ~do0BF z1v%l+=Di-I;tyLt)05F<6`6pGKI@`J(9hMq1{(0}2gk8{zb>Pdn ztf2h@%$|g4W_!Wub6PgVBlKrY&?+_I1R=Rwg_rSHm)Ep2@w(m?{$*hGDov_TiqXL} z0tcpoDm71%@65h^YyJA;--K40^~0$xbQB6a&<_M!xeX7W*;F6p>U2!VxOy1^y$a5Q z-pjO~%c9tE?}km)g1?1MA5sn`*w z$lKUUV=a7qe20eud{_YdRm1V!X&#}%_2<; zXY$Dd(Rrsz$A=p4{*@O+`xQl->mGC&Kt>|U@w4i`4>?aKf|ZRw&fGDFxi((>swq|x zEJ-enG&wo9g*lq8WCo_E$T*v2AT1<;SSM-eXF!jHexp|6`4&aoOOY-yjl?Oy14%U_ z7x;tIX6YVO19|aFr*itiluYb0eR&EgH!r|eIE(wiWj}%a%kT15Qt`cPy}M_!gXME^ z-*5=?N(cWyD!?to@xoeU!EuGI<4jAb=T8aHF|S$QzZ%#@x?aXqtJfMaNU0)v-@=d4 zVhUJS?IT!ujK6MK%=`r3gMMQ{Q`-za=tu1&>2F7R1@bzSQU+6QK5he86a*^%nVmee zZlYH*D|*?IPYec9Bf?!h)a3;k$h*MgqrHm>&Sm&Ams+1QAlFv_EKas}NY1s%CFspf zFi|5sG6(Z*1$gY>LEu?3NB|;sGsrO61_RhkBY$gXypsfgb@9Hoh>$adRqN~4Qg zl_)0UBp=DsOvUiy`T2mO49{tc%+KMV`7?C|dMl|0gd$8u(Qm&XJ+++rn_k~PGgMojC`zD!i@ z6PGKwA7;NuQ2=@C$oRj}X)uO2txn)@yLoe1YAGD@@j;}z) zb<{ILaFg#+BRTs7{n=Ng7}lyrUF>w~f#n2oN$klv((};%fk97HV_I}_!G=+MwVqac zXinZrw95%0>+%D}#`Mwxwx#h`z5a+FBG%+0A{ff1h zKR>m03#_$;o-QX4m(;D+HryntdXKkob^StIv-Z|`I^t2mx5lfc79i2WC#rOFYs{dI zg|<)8ih=r}UOoqvsUxlv)w0U;xHvmDIzI6k{LKK0bd4?1y_;Q9i%t|4)6CQtB6gU?})Q-i!JMs0gDfL|e|fCnnTq)tcbTzjc4<%Y+EE z#ni*By(?Y{UA8f=x(Iv?)b53Q5t8Yr?andDyRP*7EgFyQH42e zuacpvQqzv4Y={{0&aenFiM`WqwXlHV`sOAP(2mztg{DvB`LyC!M@VAyM7 zS14cud+eaoBLn0xaRwvtMs`zsRAdHj3)HX^$@~-ZAOq9ancd24=pmnCrrvma*d4A#_O6AbcZTEp5`+$D98#-XBzbn{k>%2>;U*#I@X_s6KHew>eu=87 RH3|9e>Fz|Z3l0B4?{AQ;Bs>5B literal 0 HcmV?d00001 diff --git a/tests/test_overwrite.py b/tests/test_overwrite.py new file mode 100644 index 0000000..e41baca --- /dev/null +++ b/tests/test_overwrite.py @@ -0,0 +1,83 @@ +import pathlib + +import pytest + +from zimit.zimit import run + +TEST_DATA_DIR = pathlib.Path(__file__).parent / "data" + + +def test_overwrite_flag_behaviour(tmp_path): + zim_output = "overwrite-test.zim" + output_path = tmp_path / zim_output + + # 1st run → creates file + result = run( + [ + "--seeds", + "https://example.com", + "--warcs", + str(TEST_DATA_DIR / "example-response.warc"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + ] + ) + assert result in (None, 100) + assert output_path.exists() + + # 2nd run, no overwrite → should fail + with pytest.raises(SystemExit) as exc: + run( + [ + "--seeds", + "https://example.com", + "--warcs", + str(TEST_DATA_DIR / "example-response.warc"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + ] + ) + assert exc.value.code == 2 + + # 2nd run, no overwrite → should fail + with pytest.raises(SystemExit) as exc: + run( + [ + "--seeds", + "https://example.com", + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + ] + ) + assert exc.value.code == 2 + + # 3rd run, with overwrite → should succeed + result = run( + [ + "--seeds", + "https://example.com", + "--warcs", + str(TEST_DATA_DIR / "example-response.warc"), + "--output", + str(tmp_path), + "--zim-file", + zim_output, + "--name", + "overwrite-test", + 
"--overwrite", + ] + ) + assert result in (None, 100) + assert output_path.exists()