Merge pull request #272 from openzim/adopt_bootstrap

This commit is contained in:
benoit74 2024-01-22 10:41:29 +01:00 committed by GitHub
commit 343d0040cf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 663 additions and 143 deletions

View file

@ -1,2 +0,0 @@
output/
node_modules/

View file

@ -1,26 +1,20 @@
name: Docker
name: Publish released version
on:
push:
branches:
- main
tags:
- v*
release:
types: [published]
jobs:
build-and-push:
name: Deploy Docker Image
publish:
runs-on: ubuntu-22.04
steps:
- name: Retrieve source code
uses: actions/checkout@v3
- uses: actions/checkout@v3
- name: Build and push
- name: Build and push Docker image
uses: openzim/docker-publish-action@v10
with:
image-name: openzim/zimit
on-master: dev
tag-pattern: /^v([0-9.]+)$/
latest-on-tag: true
restrict-to: openzim/zimit

View file

@ -0,0 +1,30 @@
# Builds the Docker image and publishes it under the `dev` tag on every push to `main`
name: Publish Docker dev image

on:
  push:
    branches:
      - main

jobs:
  publish:
    runs-on: ubuntu-22.04

    steps:
      - uses: actions/checkout@v3

      - name: Build and push Docker image
        uses: openzim/docker-publish-action@v10
        with:
          image-name: openzim/zimit
          # fixed tag for this workflow; `latest` is not moved by dev builds
          manual-tag: dev
          latest-on-tag: false
          restrict-to: openzim/zimit
          registries: ghcr.io
          # multi-line plain scalar: one KEY=value pair per line
          credentials:
            GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
            GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
          repo_description: auto
          repo_overview: auto
          platforms: |
            linux/amd64
            linux/arm64

View file

@ -1,4 +1,4 @@
name: Docker Zimit2
name: Publish Docker zimit2 image
on:
push:
@ -6,19 +6,18 @@ on:
- zimit2
jobs:
build-and-push:
name: Deploy Docker Image
publish:
runs-on: ubuntu-22.04
steps:
- name: Retrieve source code
uses: actions/checkout@v3
- uses: actions/checkout@v3
- name: Build and push
- name: Build and push Docker image
uses: openzim/docker-publish-action@v10
with:
image-name: openzim/zimit
manual-tag: zimit2
latest-on-tag: false
restrict-to: openzim/zimit
registries: ghcr.io
credentials:

34
.github/workflows/QA.yaml vendored Normal file
View file

@ -0,0 +1,34 @@
# Static quality checks (formatting, linting, typing) on pull requests and pushes to `main`
name: QA

on:
  pull_request:
  push:
    branches:
      - main

jobs:
  check-qa:
    runs-on: ubuntu-22.04

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Python version is taken from the project metadata file
          python-version-file: pyproject.toml
          architecture: x64

      - name: Install dependencies (and project)
        run: |
          pip install -U pip
          pip install -e .[lint,scripts,test,check]

      # each check is an invoke task defined in tasks.py
      - name: Check black formatting
        run: inv lint-black

      - name: Check ruff
        run: inv lint-ruff

      - name: Check pyright
        run: inv check-pyright

66
.github/workflows/Tests.yaml vendored Normal file
View file

@ -0,0 +1,66 @@
# Unit tests with coverage, Python packaging check and Docker-based integration tests
name: Tests

on:
  pull_request:
  push:
    branches:
      - main

jobs:
  run-tests:
    runs-on: ubuntu-22.04

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version-file: pyproject.toml
          architecture: x64

      - name: Install dependencies (and project)
        run: |
          pip install -U pip
          pip install -e .[test,scripts]

      - name: Run the tests
        run: inv coverage --args "-vvv"

      - name: Upload coverage report to codecov
        uses: codecov/codecov-action@v3
        with:
          token: ${{ secrets.CODECOV_TOKEN }}

  build_python:
    runs-on: ubuntu-22.04

    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version-file: pyproject.toml
          architecture: x64

      - name: Ensure we can build Python targets
        run: |
          pip install -U pip build
          python3 -m build --sdist --wheel

  # this job replaces the standard "build_docker" job since it builds the docker image
  run-integration-tests:
    runs-on: ubuntu-22.04

    steps:
      - name: checkout
        uses: actions/checkout@v3

      - name: build image
        run: docker build -t zimit .

      # `--keep` leaves the WARC files in the output directory so the test suite can read them
      - name: run crawl
        run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep

      # pytest is installed into the image's venv at test time
      - name: run integration test suite
        run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"

View file

@ -1,20 +0,0 @@
name: CI
on: push
jobs:
integration-tests:
runs-on: ubuntu-22.04
steps:
- name: checkout
uses: actions/checkout@v3
- name: build image
run: docker build -t zimit .
- name: run crawl
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
- name: run integration test suite
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v ./integration.py"

27
.pre-commit-config.yaml Normal file
View file

@ -0,0 +1,27 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
# Tool versions below mirror the pins in pyproject.toml (black, ruff, pyright)
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
  - repo: https://github.com/psf/black
    rev: "23.12.1"
    hooks:
      - id: black
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.1.3
    hooks:
      - id: ruff
  - repo: https://github.com/RobertCraigie/pyright-python
    rev: v1.1.347
    hooks:
      # `language: system` with `entry: pyright` runs the pyright found on PATH
      - id: pyright
        name: pyright (system)
        description: 'pyright static type checker'
        entry: pyright
        language: system
        'types_or': [python, pyi]
        require_serial: true
        minimum_pre_commit_version: '2.9.2'

View file

@ -7,12 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
- New `--version` flag to display Zimit version
### Changed
- Use `warc2zim` version 2, which no longer requires a Service Worker
- Using warc2zim from its `warc2zim2` branch ⚠️ change before releasing!
- Build temporary `zimit2` Docker image for testing ⚠️ remove before releasing!
- Adopt Python bootstrap conventions
## [1.6.3] - 2024-01-18
@ -156,7 +160,7 @@ if `--keep` is set.
### Changed
- using browsertrix-crawler `0.6.0` and warc2zim `1.4.2`
- default WARC location after crawl changed
- default WARC location after crawl changed
from `collections/capture-*/archive/` to `collections/crawl-*/archive/`
### Removed

View file

@ -2,35 +2,44 @@ FROM webrecorder/browsertrix-crawler:0.12.4
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
RUN apt-get update \
&& apt-get install -qqy --no-install-recommends \
libmagic1 \
python3.10-venv \
&& rm -rf /var/lib/apt/lists/* \
# python setup (in venv not to conflict with browsertrix)
&& python3 -m venv /app/zimit \
&& /app/zimit/bin/python -m pip install --no-cache-dir 'requests==2.31.0' 'inotify==0.2.10' 'tld==0.13' \
'git+https://github.com/openzim/warc2zim@warc2zim2#egg_name=warc2zim' \
# placeholder (default output location)
&& mkdir -p /output \
# disable chrome upgrade
&& printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
# download list of bad domains to filter-out. intentionnaly ran post-install \
# so it's not cached in earlier layers (url stays same but content updated) \
mkdir -p /tmp/ads && cd /tmp/ads && \
curl -L -O https://hosts.anudeep.me/mirror/adservers.txt && \
curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt && \
curl -L -O https://hosts.anudeep.me/mirror/facebook.txt && \
cat ./*.txt > /etc/blocklist.txt \
&& rm ./*.txt \
&& printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
chmod +x /usr/local/bin/entrypoint.sh
&& apt-get install -qqy --no-install-recommends \
libmagic1 \
python3.11-venv \
&& rm -rf /var/lib/apt/lists/* \
# python setup (in venv not to conflict with browsertrix)
&& python3.11 -m venv /app/zimit \
# placeholder (default output location)
&& mkdir -p /output \
# disable chrome upgrade
&& printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
# download list of bad domains to filter out. intentionally run post-install \
# so it's not cached in earlier layers (url stays same but content updated) \
&& mkdir -p /tmp/ads \
&& cd /tmp/ads \
&& curl -L -O https://hosts.anudeep.me/mirror/adservers.txt \
&& curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt \
&& curl -L -O https://hosts.anudeep.me/mirror/facebook.txt \
&& cat ./*.txt > /etc/blocklist.txt \
&& rm ./*.txt \
&& printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh \
&& chmod +x /usr/local/bin/entrypoint.sh
WORKDIR /app
ADD zimit.py /app/
# fix shebang on zimit to use in-venv python
RUN sed -i.bak "1 s/.*/#!\/app\/zimit\/bin\/python3/" /app/zimit.py \
&& ln -s /app/zimit.py /usr/bin/zimit \
&& chmod +x /usr/bin/zimit
# Copy pyproject.toml and its dependencies
COPY pyproject.toml README.md /src/
COPY src/zimit/__about__.py /src/src/zimit/__about__.py
# Install Python dependencies
RUN /app/zimit/bin/python -m pip install --no-cache-dir /src
# Copy code + associated artifacts
COPY src /src/src
COPY *.md /src/
# Install + cleanup
RUN /app/zimit/bin/python -m pip install --no-cache-dir /src \
&& ln -s /app/zimit/bin/zimit /usr/bin/zimit \
&& chmod +x /usr/bin/zimit \
&& rm -rf /src
ENTRYPOINT ["entrypoint.sh"]
CMD ["zimit"]
CMD ["zimit", "--help"]

View file

@ -3,12 +3,9 @@ Zimit
Zimit is a scraper that creates a ZIM file from any website.
[![Docker](https://ghcr-badge.deta.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit)
[![Build](https://github.com/openzim/zimit/workflows/CI/badge.svg?query=branch%3Amain)](https://github.com/openzim/zimit/actions?query=branch%3Amain)
[![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit)
[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
⚠️ **Important**: this tool uses [warc2zim](https://github.com/openzim/warc2zim) to create Zim files and thus require the Zim reader to support *Service Workers*. At the time of `zimit:1.0`, that's mostly kiwix-android and kiwix-serve. Note that service workers have protocol restrictions as well so you'll need to run it either from `localhost` or over HTTPS.
[![Docker](https://ghcr-badge.deta.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit)
Technical background
--------------------
@ -45,7 +42,7 @@ The image accepts the following parameters, **as well as any of the [warc2zim](h
- `--name` - Name of ZIM file (defaults to the hostname of the URL)
- `--output` - output directory (defaults to `/output`)
- `--limit U` - Limit capture to at most U URLs
- `--exclude <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded.
- `--exclude <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--exclude="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded.
- `--scroll [N]` - if set, will activate a simple auto-scroll behavior on each page to scroll for up to N seconds
- `--keep` - if set, keep the WARC files in a temp directory inside the output directory
@ -68,7 +65,10 @@ default and prints the crawl status to the Docker log.
Nota bene
---------
A first version of a generic HTTP scraper was created in 2016 during
While Zimit 1.x relied on a Service Worker to display the ZIM content, this is no longer the case
since Zimit 2.x, which has no special requirements.
It should also be noted that a first version of a generic HTTP scraper was created in 2016 during
the [Wikimania Esino Lario
Hackathon](https://wikimania2016.wikimedia.org/wiki/Programme/Kiwix-dedicated_Hackathon).

233
pyproject.toml Normal file
View file

@ -0,0 +1,233 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
# Core package metadata (PEP 621)
[project]
name = "zimit"
authors = [
  { name = "Kiwix", email = "dev@kiwix.org" },
]
# Search keywords published with the package (replaces the "some" template placeholder)
keywords = ["kiwix", "zim", "offline", "scraper", "crawler"]
# Restricted to the Python 3.11 series
requires-python = ">=3.11,<3.12"
description = "Make ZIM file from any website through crawling"
readme = "README.md"
license = {text = "GPL-3.0-or-later"}
classifiers = [
  "Programming Language :: Python :: 3",
  "Programming Language :: Python :: 3.11",
  "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
]
dependencies = [
  "requests==2.31.0",
  "inotify==0.2.10",
  "tld==0.13",
  # direct git reference; relies on `allow-direct-references` in [tool.hatch.metadata]
  "warc2zim @ git+https://github.com/openzim/warc2zim@warc2zim2",
]
# version is read by hatch from the path set in [tool.hatch.version]
dynamic = ["version"]
[tool.hatch.metadata]
allow-direct-references = true # to be removed once we use a released warc2zim version
[project.optional-dependencies]
scripts = [
"invoke==2.2.0",
]
lint = [
"black==23.12.1",
"ruff==0.1.3",
]
check = [
"pyright==1.1.347",
]
test = [
"pytest==7.4.4",
"coverage==7.4.0",
]
dev = [
"pre-commit==3.6.0",
"debugpy==1.8.0",
"zimit[scripts]",
"zimit[lint]",
"zimit[test]",
"zimit[check]",
]
[project.urls]
Homepage = "https://github.com/openzim/zimit"
Donate = "https://www.kiwix.org/en/support-us/"
[project.scripts]
zimit = "zimit:zimit.zimit"
[tool.hatch.version]
path = "src/zimit/__about__.py"
[tool.hatch.build]
exclude = [
"/.github",
]
[tool.hatch.build.targets.wheel]
packages = ["src/zimit"]
[tool.hatch.envs.default]
features = ["dev"]
[tool.hatch.envs.test]
features = ["scripts", "test"]
[tool.hatch.envs.test.scripts]
run = "inv test --args '{args}'"
run-cov = "inv test-cov --args '{args}'"
report-cov = "inv report-cov"
coverage = "inv coverage --args '{args}'"
html = "inv coverage --html --args '{args}'"
[tool.hatch.envs.lint]
template = "lint"
skip-install = false
features = ["scripts", "lint"]
[tool.hatch.envs.lint.scripts]
black = "inv lint-black --args '{args}'"
ruff = "inv lint-ruff --args '{args}'"
all = "inv lintall --args '{args}'"
fix-black = "inv fix-black --args '{args}'"
fix-ruff = "inv fix-ruff --args '{args}'"
fixall = "inv fixall --args '{args}'"
[tool.hatch.envs.check]
features = ["scripts", "check"]
[tool.hatch.envs.check.scripts]
pyright = "inv check-pyright --args '{args}'"
all = "inv checkall --args '{args}'"
[tool.black]
line-length = 88
target-version = ['py311']
[tool.ruff]
target-version = "py311"
line-length = 88
src = ["src"]
select = [
"A", # flake8-builtins
# "ANN", # flake8-annotations
"ARG", # flake8-unused-arguments
# "ASYNC", # flake8-async
"B", # flake8-bugbear
# "BLE", # flake8-blind-except
"C4", # flake8-comprehensions
"C90", # mccabe
# "COM", # flake8-commas
# "D", # pydocstyle
# "DJ", # flake8-django
"DTZ", # flake8-datetimez
"E", # pycodestyle (default)
"EM", # flake8-errmsg
# "ERA", # eradicate
# "EXE", # flake8-executable
"F", # Pyflakes (default)
# "FA", # flake8-future-annotations
"FBT", # flake8-boolean-trap
# "FLY", # flynt
# "G", # flake8-logging-format
"I", # isort
"ICN", # flake8-import-conventions
# "INP", # flake8-no-pep420
# "INT", # flake8-gettext
"ISC", # flake8-implicit-str-concat
"N", # pep8-naming
# "NPY", # NumPy-specific rules
# "PD", # pandas-vet
# "PGH", # pygrep-hooks
# "PIE", # flake8-pie
# "PL", # Pylint
"PLC", # Pylint: Convention
"PLE", # Pylint: Error
"PLR", # Pylint: Refactor
"PLW", # Pylint: Warning
# "PT", # flake8-pytest-style
# "PTH", # flake8-use-pathlib
# "PYI", # flake8-pyi
"Q", # flake8-quotes
# "RET", # flake8-return
# "RSE", # flake8-raise
"RUF", # Ruff-specific rules
"S", # flake8-bandit
# "SIM", # flake8-simplify
# "SLF", # flake8-self
"T10", # flake8-debugger
"T20", # flake8-print
# "TCH", # flake8-type-checking
# "TD", # flake8-todos
"TID", # flake8-tidy-imports
# "TRY", # tryceratops
"UP", # pyupgrade
"W", # pycodestyle
"YTT", # flake8-2020
]
ignore = [
# Allow non-abstract empty methods in abstract base classes
"B027",
# Remove flake8-errmsg since we consider they bloat the code and provide limited value
"EM",
# Allow boolean positional values in function calls, like `dict.get(... True)`
"FBT003",
# Ignore checks for possible passwords
"S105", "S106", "S107",
# Ignore warnings on subprocess.run / popen
"S603",
# Ignore complexity
"C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
]
unfixable = [
# Don't touch unused imports
"F401",
]
[tool.ruff.isort]
known-first-party = ["zimit"]
[tool.ruff.flake8-bugbear]
# add exceptions to B008 for fastapi.
extend-immutable-calls = ["fastapi.Depends", "fastapi.Query"]
[tool.ruff.flake8-tidy-imports]
ban-relative-imports = "all"
[tool.ruff.per-file-ignores]
# Tests can use magic values, assertions, and relative imports
"tests**/**/*" = ["PLR2004", "S101", "TID252"]
[tool.pytest.ini_options]
minversion = "7.3"
testpaths = ["tests"]
pythonpath = [".", "src"]
[tool.coverage.paths]
zimit = ["src/zimit"]
tests = ["tests"]
[tool.coverage.run]
source_pkgs = ["zimit"]
branch = true
parallel = true
omit = [
"src/zimit/__about__.py",
]
[tool.coverage.report]
exclude_lines = [
"no cov",
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
]
[tool.pyright]
include = ["src", "tests", "tasks.py"]
exclude = [".env/**", ".venv/**"]
extraPaths = ["src"]
pythonVersion = "3.11"
typeCheckingMode="basic"

1
src/zimit/__about__.py Normal file
View file

@ -0,0 +1 @@
__version__ = "2.0.0-dev0"

View file

@ -1,7 +1,3 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu
"""
Main zimit run script
This script validates arguments with warc2zim, checks permissions
@ -11,6 +7,7 @@ and then calls the Node based driver
import atexit
import itertools
import json
import logging
import shutil
import signal
import subprocess
@ -26,16 +23,28 @@ import inotify.adapters
import requests
from tld import get_fld
from warc2zim.main import main as warc2zim
from zimscraperlib.logging import getLogger
from zimscraperlib.uri import rebuild_uri
DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"
from zimit.__about__ import __version__
DEFAULT_USER_AGENT = (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
"(KHTML, like Gecko) Version/17.0 Safari/605.1.15"
)
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
EXIT_CODE_CRAWLER_LIMIT_HIT = 11
NORMAL_WARC2ZIM_EXIT_CODE = 100
logger = getLogger(name="zimit", level=logging.INFO)
class ProgressFileWatcher:
def __init__(self, output_dir, stats_path):
def __init__(self, output_dir: Path, stats_path: Path):
self.crawl_path = output_dir / "crawl.json"
self.warc2zim_path = output_dir / "warc2zim.json"
self.stats_path = Path(stats_path)
self.stats_path = stats_path
if not self.stats_path.is_absolute():
self.stats_path = output_dir / self.stats_path
@ -46,6 +55,8 @@ class ProgressFileWatcher:
self.process = None
def stop(self):
if not self.process:
return
self.process.join(0.1)
self.process.terminate()
@ -58,10 +69,10 @@ class ProgressFileWatcher:
self.process.start()
@staticmethod
def inotify_watcher(crawl_fpath, warc2zim_fpath, output_fpath):
def inotify_watcher(crawl_fpath: str, warc2zim_fpath: str, output_fpath: str):
ino = inotify.adapters.Inotify()
ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY)
ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY)
ino.add_watch(crawl_fpath, inotify.constants.IN_MODIFY) # pyright: ignore
ino.add_watch(warc2zim_fpath, inotify.constants.IN_MODIFY) # pyright: ignore
class Limit:
def __init__(self):
@ -97,15 +108,15 @@ class ProgressFileWatcher:
"limit": limit.as_dict,
}
for _, _, fpath, _ in ino.event_gen(yield_nones=False):
for _, _, fpath, _ in ino.event_gen(yield_nones=False): # pyright: ignore
func = {crawl_fpath: crawl_conv, warc2zim_fpath: warc2zim_conv}.get(fpath)
if not func:
continue
# open input and output separately so as not to clear output on error
with open(fpath, "r") as ifh:
with open(fpath) as ifh:
try:
out = func(json.load(ifh), limit)
except Exception: # nosec
except Exception: # nosec # noqa: S112
# simply ignore progress update should an error arise
# might be malformed input for instance
continue
@ -115,7 +126,7 @@ class ProgressFileWatcher:
json.dump(out, ofh)
def zimit(args=None):
def run(raw_args):
wait_until_options = ["load", "domcontentloaded", "networkidle"]
wait_until_all = wait_until_options + [
f"{a},{b}" for a, b in itertools.combinations(wait_until_options, 2)
@ -131,7 +142,7 @@ def zimit(args=None):
parser.add_argument(
"--urlFile",
help="If set, read a list of seed urls, " "one per line, from the specified",
help="If set, read a list of seed urls, one per line, from the specified",
)
parser.add_argument("-w", "--workers", type=int, help="Number of parallel workers")
@ -205,7 +216,8 @@ def zimit(args=None):
parser.add_argument(
"--lang",
help="if set, sets the language used by the browser, should be ISO 639 language[-country] code",
help="if set, sets the language used by the browser, should be ISO 639 "
"language[-country] code",
)
parser.add_argument(
@ -224,7 +236,8 @@ def zimit(args=None):
parser.add_argument(
"--userAgent",
help="Override default user-agent with specified value ; --userAgentSuffix is still applied",
help="Override default user-agent with specified value ; --userAgentSuffix is "
"still applied",
default=DEFAULT_USER_AGENT,
)
@ -333,7 +346,14 @@ def zimit(args=None):
"to configure the crawling behaviour if not set via argument.",
)
zimit_args, warc2zim_args = parser.parse_known_args(args)
parser.add_argument(
"--version",
help="Display scraper version and exit",
action="version",
version=f"Zimit {__version__}",
)
zimit_args, warc2zim_args = parser.parse_known_args(raw_args)
# pass url and output to warc2zim also
if zimit_args.output:
@ -372,13 +392,13 @@ def zimit(args=None):
warc2zim_args.append("--lang")
warc2zim_args.append(zimit_args.zim_lang)
print("----------")
print("Testing warc2zim args")
print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
logger.info("----------")
logger.info("Testing warc2zim args")
logger.info("Running: warc2zim " + " ".join(warc2zim_args))
res = warc2zim(warc2zim_args)
if res != 100:
print("Exiting, invalid warc2zim params")
return 2
if res != NORMAL_WARC2ZIM_EXIT_CODE:
logger.info("Exiting, invalid warc2zim params")
return EXIT_CODE_WARC2ZIM_CHECK_FAILED
# make temp dir for this crawl
if zimit_args.build:
@ -389,9 +409,9 @@ def zimit(args=None):
if not zimit_args.keep:
def cleanup():
print("")
print("----------")
print(f"Cleanup, removing temp dir: {temp_root_dir}", flush=True)
logger.info("")
logger.info("----------")
logger.info(f"Cleanup, removing temp dir: {temp_root_dir}")
shutil.rmtree(temp_root_dir)
atexit.register(cleanup)
@ -412,7 +432,7 @@ def zimit(args=None):
watcher = ProgressFileWatcher(
Path(zimit_args.output), Path(zimit_args.statsFilename)
)
print(f"Writing progress to {watcher.stats_path}")
logger.info(f"Writing progress to {watcher.stats_path}")
# update crawler command
cmd_args.append("--statsFilename")
cmd_args.append(str(watcher.crawl_path))
@ -424,15 +444,16 @@ def zimit(args=None):
cmd_line = " ".join(cmd_args)
print("")
print("----------")
print(
f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
logger.info("")
logger.info("----------")
logger.info(
f"Output to tempdir: {temp_root_dir} - "
f"{'will keep' if zimit_args.keep else 'will delete'}"
)
print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
crawl = subprocess.run(cmd_args)
if crawl.returncode == 11:
print("crawl interupted by a limit")
logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
crawl = subprocess.run(cmd_args, check=False)
if crawl.returncode == EXIT_CODE_CRAWLER_LIMIT_HIT:
logger.info("crawl interupted by a limit")
elif crawl.returncode != 0:
raise subprocess.CalledProcessError(crawl.returncode, cmd_args)
@ -447,28 +468,28 @@ def zimit(args=None):
"Failed to find directory where WARC files have been created"
)
elif len(warc_dirs) > 1:
print("Found many WARC files directories, only last one will be used")
logger.info("Found many WARC files directories, only last one will be used")
for directory in warc_dirs:
print(f"- {directory}")
logger.info(f"- {directory}")
warc_directory = warc_dirs[-1]
print("")
print("----------")
print(f"Processing WARC files in {warc_directory}")
logger.info("")
logger.info("----------")
logger.info(f"Processing WARC files in {warc_directory}")
warc2zim_args.append(str(warc_directory))
num_files = sum(1 for _ in warc_directory.iterdir())
print(f"{num_files} WARC files found", flush=True)
print(f"Calling warc2zim with these args: {warc2zim_args}", flush=True)
logger.info(f"{num_files} WARC files found")
logger.info(f"Calling warc2zim with these args: {warc2zim_args}")
return warc2zim(warc2zim_args)
def check_url(url, user_agent, scope=None):
url = urllib.parse.urlparse(url)
def check_url(url: str, user_agent: str, scope: str | None = None):
parsed_url = urllib.parse.urlparse(url)
try:
with requests.get(
url.geturl(),
parsed_url.geturl(),
stream=True,
allow_redirects=True,
timeout=(12.2, 27),
@ -476,28 +497,28 @@ def check_url(url, user_agent, scope=None):
) as resp:
resp.raise_for_status()
except requests.exceptions.RequestException as exc:
print(f"failed to connect to {url.geturl()}: {exc}", flush=True)
raise SystemExit(1)
logger.info(f"failed to connect to {parsed_url.geturl()}: {exc}")
raise SystemExit(1) from None
actual_url = urllib.parse.urlparse(resp.url)
# remove explicit port in URI when it is the default for the scheme, as browsers do
if actual_url.scheme == "https" and actual_url.port == 443:
if actual_url.scheme == "https" and actual_url.port == 443: # noqa: PLR2004
actual_url = rebuild_uri(actual_url, port="")
if actual_url.scheme == "http" and actual_url.port == 80:
if actual_url.scheme == "http" and actual_url.port == 80: # noqa: PLR2004
actual_url = rebuild_uri(actual_url, port="")
if actual_url.geturl() != url.geturl():
if actual_url.geturl() != parsed_url.geturl():
if scope in (None, "any"):
return actual_url.geturl()
print(
"[WARN] Your URL ({0}) redirects to {1} which {2} on same "
"first-level domain. Depending on your scopeType ({3}), "
logger.info(
"[WARN] Your URL ({}) redirects to {} which {} on same "
"first-level domain. Depending on your scopeType ({}), "
"your homepage might be out-of-scope. Please check!".format(
url.geturl(),
parsed_url.geturl(),
actual_url.geturl(),
"is"
if get_fld(url.geturl()) == get_fld(actual_url.geturl())
if get_fld(parsed_url.geturl()) == get_fld(actual_url.geturl())
else "is not",
scope,
)
@ -505,7 +526,7 @@ def check_url(url, user_agent, scope=None):
return actual_url.geturl()
return url.geturl()
return parsed_url.geturl()
def get_node_cmd_line(args):
@ -541,7 +562,7 @@ def get_node_cmd_line(args):
"config",
]:
value = getattr(args, arg)
if value == None or (isinstance(value, bool) and value == False):
if value is None or (isinstance(value, bool) and value is False):
continue
node_cmd.append("--" + arg)
if not isinstance(value, bool):
@ -550,17 +571,22 @@ def get_node_cmd_line(args):
return node_cmd
def sigint_handler(*args):
print("")
print("")
print("SIGINT/SIGTERM received, stopping zimit")
print("")
print("", flush=True)
def sigint_handler(*args): # noqa: ARG001
logger.info("")
logger.info("")
logger.info("SIGINT/SIGTERM received, stopping zimit")
logger.info("")
logger.info("")
sys.exit(3)
def zimit():
run(sys.argv[1:])
signal.signal(signal.SIGINT, sigint_handler)
signal.signal(signal.SIGTERM, sigint_handler)
if __name__ == "__main__":
zimit()

109
tasks.py Normal file
View file

@ -0,0 +1,109 @@
# pyright: strict, reportUntypedFunctionDecorator=false
import os
from invoke.context import Context
from invoke.tasks import task # pyright: ignore [reportUnknownVariableType]
use_pty = not os.getenv("CI", "")
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test(ctx: Context, args: str = ""):
    """run tests (without coverage)

    `args` is appended verbatim to the `pytest` command line.
    """
    command = f"pytest {args}"
    ctx.run(command, pty=use_pty)
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test_cov(ctx: Context, args: str = ""):
    """run tests with coverage

    Runs pytest under `coverage run`; `args` is appended verbatim to the
    pytest command line. Reporting is a separate step.
    """
    ctx.run(f"coverage run -m pytest {args}", pty=use_pty)
@task(optional=["html"], help={"html": "flag to export html report"})
def report_cov(ctx: Context, *, html: bool = False):
    """report coverage

    Combines coverage data files, prints the terminal report with missing
    lines and, when `html` is set, also writes the HTML report.
    """
    # warn=True: a non-zero exit from `coverage combine` does not abort the task
    ctx.run("coverage combine", warn=True, pty=use_pty)
    report_commands = ["coverage report --show-missing"]
    if html:
        report_commands.append("coverage html")
    for command in report_commands:
        ctx.run(command, pty=use_pty)
@task(
    optional=["args", "html"],
    help={
        "args": "pytest additional arguments",
        "html": "flag to export html report",
    },
)
def coverage(ctx: Context, args: str = "", *, html: bool = False):
    """run tests and report coverage

    Chains `test_cov` (forwarding `args`) and `report_cov` (forwarding `html`).
    """
    # step 1: collect coverage data while running the test suite
    test_cov(ctx, args)
    # step 2: combine and display the collected data
    report_cov(ctx, html=html)
@task(optional=["args"], help={"args": "black additional arguments"})
def lint_black(ctx: Context, args: str = "."):
    """check black formatting

    Prints the black version, then runs black in `--check --diff` mode on
    `args` (defaults to the current directory).
    """
    # hatch scripts always pass `--args`, possibly as an empty string
    args = args or "."  # needed for hatch script
    ctx.run("black --version", pty=use_pty)
    ctx.run(f"black --check --diff {args}", pty=use_pty)
@task(optional=["args"], help={"args": "ruff additional arguments"})
def lint_ruff(ctx: Context, args: str = "."):
    """check ruff rules

    Prints the ruff version, then runs `ruff check` on `args` (defaults to
    the current directory).
    """
    # hatch scripts always pass `--args`, possibly as an empty string
    args = args or "."  # needed for hatch script
    ctx.run("ruff --version", pty=use_pty)
    ctx.run(f"ruff check {args}", pty=use_pty)
@task(
    optional=["args"],
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
)
def lintall(ctx: Context, args: str = "."):
    """Check linting

    Runs the black check, then the ruff check, with the same arguments.
    """
    target = args or "."  # needed for hatch script
    for linter in (lint_black, lint_ruff):
        linter(ctx, target)
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def check_pyright(ctx: Context, args: str = ""):
    """check static types with pyright

    Prints the pyright version, then runs pyright with `args` appended verbatim.
    """
    # NOTE(review): unlike every other command in this file, the version call
    # does not pass `pty=use_pty` - confirm whether that is intentional
    ctx.run("pyright --version")
    command = f"pyright {args}"
    ctx.run(command, pty=use_pty)
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def checkall(ctx: Context, args: str = ""):
    """check static types

    Currently delegates to `check_pyright`, the only check tool in use.
    """
    check_pyright(ctx, args=args)
@task(optional=["args"], help={"args": "black additional arguments"})
def fix_black(ctx: Context, args: str = "."):
    """fix black formatting

    Runs black on `args` (defaults to the current directory).
    """
    target = args or "."  # needed for hatch script
    command = f"black {target}"
    ctx.run(command, pty=use_pty)
@task(optional=["args"], help={"args": "ruff additional arguments"})
def fix_ruff(ctx: Context, args: str = "."):
    """fix all ruff rules

    Runs `ruff check --fix` on `args` (defaults to the current directory).
    """
    # hatch scripts always pass `--args`, possibly as an empty string
    args = args or "."  # needed for hatch script
    # use the explicit `check` subcommand, as lint_ruff does: the bare
    # `ruff --fix <path>` form relies on a deprecated implicit default command
    ctx.run(f"ruff check --fix {args}", pty=use_pty)
@task(
    optional=["args"],
    help={
        "args": "linting tools (black, ruff) additional arguments, typically a path",
    },
)
def fixall(ctx: Context, args: str = "."):
    """Fix everything automatically

    Applies the black and ruff fixers, then re-runs the lint checks on the
    same arguments.
    """
    target = args or "."  # needed for hatch script
    for step in (fix_black, fix_ruff, lintall):
        step(ctx, target)

View file

@ -0,0 +1 @@
These are integration tests, meant to be run in CI (because we first need to perform a zimit run on a given website and then check its output).

View file

@ -1,6 +1,6 @@
import os
import glob
import json
import os
import libzim.reader
from warcio import ArchiveIterator
@ -26,14 +26,17 @@ def test_zim_main_page():
def test_user_agent():
"""Test that mobile user agent was used in WARC request records with custom Zimit and email suffix"""
"""Test that mobile user agent was used
Check is done in WARC request records with custom Zimit and email suffix
"""
found = False
for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"):
with open(warc, "rb") as fh:
for record in ArchiveIterator(fh):
if record.rec_type == "request":
print(record.http_headers)
print(record.http_headers) # noqa: T201
ua = record.http_headers.get_header("User-Agent")
if ua:
assert "Mozilla" in ua

6
tests/test_dummy.py Normal file
View file

@ -0,0 +1,6 @@
from zimit.zimit import DEFAULT_USER_AGENT
# placeholder test, present only so that a coverage report can be produced
def test_default_user_agent():
    """The default user agent constant is truthy."""
    user_agent = DEFAULT_USER_AGENT
    assert user_agent