Compare commits

...

447 commits

Author SHA1 Message Date
benoit74
a7e236f0d7
Merge pull request #528 from aaryansinhaa/feature/overwrite
Added --overwrite flag to zimit
2025-12-22 11:56:16 +01:00
Aaryan Kumar Sinha
81018f06fa Added --overwrite flag to zimit 2025-12-22 16:08:02 +05:30
benoit74
34ce7eb98d
Fix offliner definition 2025-12-16 16:33:53 +00:00
benoit74
5bb068ffea
Merge pull request #529 from openzim/blob-types
migrate custom_css and favicon flags to blob types
2025-12-16 17:32:44 +01:00
Uchechukwu Orji
aec19d95d2 migrate custom_css and favicon flags to blob types 2025-12-15 14:25:24 +01:00
benoit74
277473884e
Merge pull request #524 from Routhinator/issue-490-resume-crawl-from-interrupt
Fixes:  #499 - Resolve issues preventing graceful crawl resumption after interrupt
2025-11-08 10:55:07 +01:00
Chris Routh
e30a82a91c PR #524 Fix line length. 2025-11-07 12:59:25 -08:00
Chris Routh
ef004f3863 Issue #499 Record changes in CHANGELOG 2025-11-07 11:33:01 -08:00
Chris Routh
6db73a0a83 Issue #499 - Ensure build directory exists when passed. 2025-11-06 13:44:26 -08:00
Chris Routh
57a88434e2 Issue #499 - Use all warc_directories found when no specific collection has been passed. 2025-11-06 13:44:18 -08:00
Chris Routh
4595d2a302 Issue #499 - Only register cleanup if neither build or keep arguments have been passed. 2025-11-06 13:44:10 -08:00
Chris Routh
611d2033f7 Issue #499 - Use build dir rather than random tmp dir when passed. 2025-11-06 13:43:52 -08:00
benoit74
00845293d6
Merge pull request #522 from vitaly-zdanevich/patch-3
README.md: add link to https://en.wikipedia.org/wiki/ZIM_(file_format)
2025-10-20 07:53:34 +02:00
Vitaly Zdanevich
44cf4218cb
README.md: add link to https://en.wikipedia.org/wiki/ZIM_(file_format)
Signed-off-by: Vitaly Zdanevich <zdanevich.vitaly@ya.ru>
2025-10-20 01:22:31 +04:00
benoit74
6b520318a2
Merge pull request #521 from openzim/prepare-json-output
set proper outputs name
2025-10-10 11:52:08 +02:00
Uchechukwu Orji
a9805c84c2 set proper outputs name 2025-10-10 10:38:38 +01:00
benoit74
8630b87a1f
Merge pull request #520 from openzim/offliner-definition-workflow-dispatch
add workflow dispatch to update-offliner ci
2025-10-10 11:25:51 +02:00
Uchechukwu Orji
ad09665c4a add workflow dispatch to update-offliner ci 2025-10-10 10:22:29 +01:00
benoit74
1d2069a66b
Merge pull request #519 from openzim/offliner-definitions
use base64 string as argument to workflow call
2025-10-09 10:21:30 +02:00
Uchechukwu Orji
4ec47cd6dd use base64 string as argument to workflow call 2025-10-08 04:25:12 +01:00
benoit74
b60dd388e7
Merge pull request #518 from openzim/offliner-definitions
set up offliner definitions
2025-10-07 21:50:57 +02:00
Uchechukwu Orji
5624cbf081 set up offliner definitions 2025-10-07 04:08:14 +01:00
benoit74
8c471d9ee2
Prepare for 3.0.6 2025-04-11 07:46:42 +00:00
benoit74
009b8b4bd6
Release 3.0.5 2025-04-11 07:18:18 +00:00
benoit74
0c795b0051
Merge pull request #493 from orangetin/update-browsertrix
Upgrade browsertrix-crawler to version 1.6.0 in Dockerfile
2025-04-11 09:14:34 +02:00
orangetin
b5d87198d8 update changelog 2025-04-10 17:54:34 -07:00
orangetin
511c3a5021 Upgrade browsertrix-crawler to version 1.6.0 in Dockerfile 2025-04-10 17:52:19 -07:00
benoit74
3421ca0212
Prepare for 3.0.5 2025-04-04 11:09:50 +00:00
benoit74
12fde3af98
Release 3.0.4 2025-04-04 11:01:25 +00:00
benoit74
6c006d9a4d
Merge pull request #491 from openzim/upgrade_crawler
Upgrade to browsertrix crawler 1.5.10
2025-04-04 12:56:46 +02:00
benoit74
146af5de0a
Upgrade to browsertrix crawler 1.5.10 2025-04-04 09:35:48 +00:00
benoit74
4e0174836d
Prepare for 3.0.4 2025-02-28 06:38:52 +00:00
benoit74
1e6748ab69
Release 3.0.3 2025-02-28 06:21:27 +00:00
benoit74
5d319cdc09
Merge pull request #483 from openzim/update
Upgrade to browsertrix crawler 1.5.7
2025-02-28 07:19:00 +01:00
benoit74
88b85311e0
Upgrade to browsertrix crawler 1.5.7 2025-02-28 06:14:21 +00:00
benoit74
9fc66a95b7
Prepare for 3.0.3 2025-02-27 20:03:37 +00:00
benoit74
6ee053af5f
Release 3.0.2 2025-02-27 19:58:51 +00:00
benoit74
ac4ba0b01e
Merge pull request #482 from openzim/upgrade
Upgrade to browsertrix crawler 1.5.6
2025-02-27 20:50:50 +01:00
benoit74
1287351c1d
Upgrade to browsertrix crawler 1.5.6 2025-02-27 19:37:21 +00:00
benoit74
b85a6b7e4e
Merge pull request #480 from openzim/upgrade
Upgrade to Browsertrix Crawler 1.5.5
2025-02-27 08:50:05 +01:00
benoit74
eebc75f868
Pin warc2zim version in preparation for 3.0.2 release 2025-02-27 07:33:34 +00:00
benoit74
00f0e475ae
Upgrade to browsertrix crawler 1.5.5 2025-02-27 07:33:33 +00:00
benoit74
363ff40767
Prepare for 3.0.2 2025-02-24 09:40:04 +00:00
benoit74
dd65902556
Release 3.0.1 2025-02-24 09:37:40 +00:00
benoit74
3e2ad5fede
Merge pull request #476 from openzim/upgrade
Upgrade to browsertrix crawler 1.5.4
2025-02-24 10:34:08 +01:00
benoit74
5e53be6fa4
Pin warc2zim version in preparation for 3.0.1 release 2025-02-24 09:33:44 +00:00
benoit74
1b5b9bb80b
Upgrade to browsertrix crawler 1.5.4 2025-02-24 09:33:44 +00:00
benoit74
bce22ceac1
Prepare for 3.0.1 2025-02-17 10:08:49 +00:00
benoit74
e3cd12b0d1
Release 3.0.0 2025-02-17 10:02:43 +00:00
benoit74
ee0f4c6cec
Use released warc2zim 2.2.2 2025-02-17 09:52:55 +00:00
benoit74
91d5edda4a
Merge pull request #472 from clach04/patch-1
Correct link in README.md
2025-02-15 09:42:49 +01:00
clach04
3eb6c09046
Correct link in README.md
Signed-off-by: clach04 <clach04@gmail.com>
2025-02-14 22:02:17 -08:00
benoit74
a9efec4797
Merge pull request #471 from openzim/fix_browsertrix_args
Enhance support of Browsertrix Crawler arguments
2025-02-14 15:35:32 +01:00
benoit74
2f7a83e187
Fixes following review 2025-02-14 14:28:40 +00:00
benoit74
96c4c3bdfd
Clarify args variables/functions names 2025-02-14 14:28:39 +00:00
benoit74
7bfb4b25f0
Remove confusion between zimit, warc2zim and crawler stats filenames 2025-02-14 14:27:28 +00:00
benoit74
ed1a8a0aa9
Use preferred Browsertrix Crawler arguments and fix multiple/file seeds support 2025-02-14 14:27:26 +00:00
benoit74
dc6b5aafb7
Enhance support of Browsertrix Crawler arguments 2025-02-14 14:23:19 +00:00
benoit74
4f9085b10e
Merge pull request #470 from openzim/keep_tmp_folder
Keep temporary folder when crawler or warc2zim fails, even if not asked for
2025-02-14 09:46:12 +01:00
benoit74
b4ec60f316
fixup! Keep temporary folder when crawler or warc2zim fails, even if not asked for 2025-02-13 15:31:51 +00:00
benoit74
ee82837aaa
Keep temporary folder when crawler or warc2zim fails, even if not asked for 2025-02-13 13:19:23 +00:00
benoit74
bc73193ce0
Merge pull request #469 from openzim/crawler_1_5_3
Upgrade to crawler 1.5.3 and better indicate/handle interruptions
2025-02-13 13:03:27 +01:00
benoit74
101fb71a0b
Better processing of crawler exit codes with soft/hard limits 2025-02-13 10:51:14 +00:00
benoit74
3a7f583a96
Upgrade to Browsertrix Crawler 1.5.3
Include restore of total number of pages, following upstream fix.
2025-02-13 10:44:20 +00:00
benoit74
8b4b18bfb7
Prepare for 2.1.9 2025-02-07 08:59:54 +00:00
benoit74
d228e9f346
Release 2.1.8 2025-02-07 08:57:14 +00:00
benoit74
2e48ea1af6
Merge pull request #466 from openzim/prepare_release
Pin warc2zim for release
2025-02-07 09:50:49 +01:00
benoit74
a7e1026b2e
Pin warc2zim for release 2025-02-07 08:38:20 +00:00
benoit74
cc84848c32
Merge pull request #464 from openzim/upgrade
Upgrade to Browsertrix Crawler 1.5.1
2025-02-07 09:36:24 +01:00
benoit74
6ec53f774f
Upgrade to Browsertrix Crawler 1.5.1 2025-02-07 08:24:27 +00:00
benoit74
5af981c01c
Remove ARM64 job temporarily, still not working 2025-02-07 08:07:23 +00:00
benoit74
b4c0495f48
Fix arm runner selector 2025-02-06 21:19:08 +00:00
benoit74
cea10bd3b5
Add second build job on native arch for ARM64 2025-02-06 21:17:46 +00:00
benoit74
4ef9a0d380
Remove support for ARM64, this is not working anymore and was painfully slow 2025-02-06 21:11:40 +00:00
benoit74
bf0dcd2ffc
Merge pull request #462 from openzim/upgrade_py
Upgrade Python 3.13, Crawler 1.5.0 and others
2025-02-06 14:45:03 +01:00
benoit74
9396cf1ca0
Alter crawl statistics following 1.5.0 release 2025-02-06 13:39:33 +00:00
benoit74
0f136d2f2f
Upgrade Python 3.13, Crawler 1.5.0 and others 2025-02-06 13:39:32 +00:00
benoit74
0cb84f2126
Prepare for 2.1.8 2025-01-10 12:46:51 +00:00
benoit74
4835adbdd7
Prepare for 2.1.8 2025-01-10 12:41:01 +00:00
benoit74
14670d4c69
Release 2.1.7 2025-01-10 10:24:47 +00:00
benoit74
8cddcf0666
Merge pull request #450 from openzim/upgrade_crawler
Upgrade to browsertrix crawler 1.4.2, fix integration tests and fix docker label
2025-01-09 13:45:22 +01:00
benoit74
97ea6dfd7b
Fix Docker label to follow new convention 2025-01-09 10:41:22 +00:00
benoit74
8d42a8dd93
Move integration tests to test website 2025-01-09 10:41:05 +00:00
benoit74
00d2433383
Upgrade to browsertrix crawler 1.4.2 2025-01-09 09:06:08 +00:00
benoit74
b5dac3c309
Merge pull request #434 from openzim/upgrade_crawler
Upgrade to browsertrix crawler 1.4.0-beta.0
2024-11-15 16:49:26 +01:00
benoit74
16a4f8d4d8
Upgrade to browsertrix crawler 1.4.0-beta.0 2024-11-15 15:46:40 +00:00
benoit74
e9adc38856
Merge pull request #430 from openzim/set_return_code
Properly exit with code
2024-11-08 15:40:50 +01:00
benoit74
bfa226bf81
Properly exit with code 2024-11-08 14:22:35 +00:00
benoit74
15b72022ce
Prepare for 2.1.7 2024-11-07 10:03:03 +00:00
benoit74
ac3975550d
Release 2.1.6 2024-11-07 09:48:46 +00:00
benoit74
800142d66f
Merge pull request #427 from openzim/upgrade_crawler
Upgrade to browsertrix crawler 1.3.5
2024-11-07 10:47:42 +01:00
benoit74
5869a6a1c9
Prepare for release 2024-11-07 09:43:47 +00:00
benoit74
1307285641
Upgrade to browsertrix crawler 1.3.5 2024-11-07 09:39:54 +00:00
benoit74
c81c148207
Fix Docker badge
Signed-off-by: benoit74 <benoit74@users.noreply.github.com>
2024-11-01 21:40:35 +01:00
benoit74
4c1007cb94
Prepare for 2.1.6 2024-11-01 14:21:24 +00:00
benoit74
06693c9d86
Release 2.1.5 2024-11-01 13:33:03 +00:00
benoit74
1de457a478
Merge pull request #424 from openzim/upgrade_crawler
Upgrade to browsertrix crawler 1.3.4 and warc2zim 2.1.3
2024-11-01 14:30:20 +01:00
benoit74
68c2379b39
Upgrade to warc2zim 2.1.3 2024-11-01 13:25:08 +00:00
benoit74
a95dabba69
Upgrade to browsertrix crawler 1.3.4 2024-11-01 08:43:56 +00:00
benoit74
9f2101e541
Prepare for 2.1.5 2024-10-11 08:50:30 +00:00
benoit74
2a3b4f62fd
Release 2.1.4 2024-10-11 07:42:46 +00:00
benoit74
02d62b30ee
Merge pull request #411 from openzim/upgrade_deps
Upgrade to browsertrix crawler 1.3.3
2024-10-11 09:42:14 +02:00
benoit74
ef8023a1a9
Upgrade to browsertrix crawler 1.3.3 2024-10-11 07:37:53 +00:00
benoit74
ea320a48c6
Prepare for 2.1.4 2024-10-08 12:40:38 +00:00
benoit74
d50bcb0b0d
Release 2.1.3 2024-10-08 12:33:46 +00:00
benoit74
e8faebbb24
Merge pull request #407 from openzim/fix_help
Fix help and add test case to check this important feature
2024-10-08 14:00:33 +02:00
benoit74
0e6919300b
Fix help and add test case to check this important feature 2024-10-08 11:57:31 +00:00
benoit74
9bf3ea4ea3
Merge pull request #406 from openzim/upgrade_deps
Upgrade to browsertrix crawler 1.3.2, warc2zim 2.1.2 and other dependencies
2024-10-08 13:55:15 +02:00
benoit74
fa6c128379
Upgrade to browsertrix crawler 1.3.2, warc2zim 2.1.2 and other dependencies 2024-10-08 11:49:19 +00:00
benoit74
f17e15287e
Merge pull request #401 from openzim/skip_youtube_test
Add logic to skip Youtube test on-demand
2024-09-27 11:27:06 +02:00
benoit74
b056c6dc4f
Add logic to skip Youtube test on-demand 2024-09-27 09:09:01 +00:00
benoit74
5fe6539201
Archive ZIM built as artifact 2024-09-23 06:59:46 +00:00
benoit74
01187be0f3
Delete .github/dependabot.yml
Remove dependabot, this is creating too much noise / risks for the scraper stability

Signed-off-by: benoit74 <benoit74@users.noreply.github.com>
2024-09-12 21:19:39 +02:00
benoit74
eeac5014f1
Merge pull request #392 from openzim/dependabot/pip/production-dependencies-f5e3b51c9a
Bump the production-dependencies group across 1 directory with 2 updates
2024-09-12 08:30:54 +02:00
dependabot[bot]
1c30abd39d
Bump the production-dependencies group across 1 directory with 2 updates
Bumps the production-dependencies group with 2 updates in the / directory: [pyright](https://github.com/RobertCraigie/pyright-python) and [pytest](https://github.com/pytest-dev/pytest).


Updates `pyright` from 1.1.379 to 1.1.380
- [Release notes](https://github.com/RobertCraigie/pyright-python/releases)
- [Commits](https://github.com/RobertCraigie/pyright-python/compare/v1.1.379...v1.1.380)

Updates `pytest` from 8.3.2 to 8.3.3
- [Release notes](https://github.com/pytest-dev/pytest/releases)
- [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pytest-dev/pytest/compare/8.3.2...8.3.3)

---
updated-dependencies:
- dependency-name: pyright
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: production-dependencies
- dependency-name: pytest
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: production-dependencies
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-11 22:22:13 +00:00
benoit74
3e2ddd1708
Prepare for 2.1.3 2024-09-10 07:59:50 +00:00
benoit74
6d5fc0bed0
Release 2.1.2 2024-09-09 14:38:21 +00:00
benoit74
ca86c8c7cd
Merge pull request #386 from openzim/dependabot/pip/production-dependencies-8a1364bdbe
Bump ruff from 0.6.3 to 0.6.4 in the production-dependencies group
2024-09-09 16:28:57 +02:00
dependabot[bot]
1c9d927438 Bump ruff from 0.6.3 to 0.6.4 in the production-dependencies group
Bumps the production-dependencies group with 1 update: [ruff](https://github.com/astral-sh/ruff).


Updates `ruff` from 0.6.3 to 0.6.4
- [Release notes](https://github.com/astral-sh/ruff/releases)
- [Changelog](https://github.com/astral-sh/ruff/blob/main/CHANGELOG.md)
- [Commits](https://github.com/astral-sh/ruff/compare/0.6.3...0.6.4)

---
updated-dependencies:
- dependency-name: ruff
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: production-dependencies
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-09 09:20:37 +02:00
benoit74
5e3c731fb2
Merge pull request #387 from openzim/update
Browsertrix crawler 1.3.0-beta.1
2024-09-09 09:19:22 +02:00
benoit74
113eeebf9c
Browsertrix crawler 1.3.0-beta.1 2024-09-09 07:14:41 +00:00
benoit74
6a804e9a8e
Prepare for 2.1.2 2024-09-05 08:25:44 +00:00
benoit74
501520d07f
Release 2.1.1 2024-09-05 07:45:42 +00:00
benoit74
e0d1adf676
Merge pull request #384 from openzim/dependabot/pip/production-dependencies-e14423a93f
Bump pyright from 1.1.378 to 1.1.379 in the production-dependencies group across 1 directory
2024-09-05 09:11:08 +02:00
dependabot[bot]
7873667434
Bump pyright in the production-dependencies group across 1 directory
Bumps the production-dependencies group with 1 update in the / directory: [pyright](https://github.com/RobertCraigie/pyright-python).


Updates `pyright` from 1.1.378 to 1.1.379
- [Release notes](https://github.com/RobertCraigie/pyright-python/releases)
- [Commits](https://github.com/RobertCraigie/pyright-python/compare/v1.1.378...v1.1.379)

---
updated-dependencies:
- dependency-name: pyright
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: production-dependencies
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-05 07:08:03 +00:00
benoit74
a1329974a1
Merge pull request #382 from openzim/upgrade
Upgrade to crawler 1.3.0-beta.0, Ubuntu Noble, and fix documentation
2024-09-05 09:06:48 +02:00
benoit74
6b3c725eeb
More precise usage on diskUtilization setting 2024-09-03 18:06:07 +00:00
benoit74
7f76415710
Upgrade to browsertrix crawler 1.3.0-beta.0
Among other changes, it includes the upgrade to Ubuntu Noble, so we no
longer need the additional deadsnakes ppa in Dockerfile.
2024-09-03 18:06:06 +00:00
dependabot[bot]
37c4beda6a
Bump the production-dependencies group with 3 updates
Bumps the production-dependencies group with 3 updates: [ruff](https://github.com/astral-sh/ruff), [pyright](https://github.com/RobertCraigie/pyright-python) and [selenium](https://github.com/SeleniumHQ/Selenium).


Updates `ruff` from 0.5.7 to 0.6.3
- [Release notes](https://github.com/astral-sh/ruff/releases)
- [Changelog](https://github.com/astral-sh/ruff/blob/main/CHANGELOG.md)
- [Commits](https://github.com/astral-sh/ruff/compare/0.5.7...0.6.3)

Updates `pyright` from 1.1.375 to 1.1.378
- [Release notes](https://github.com/RobertCraigie/pyright-python/releases)
- [Commits](https://github.com/RobertCraigie/pyright-python/compare/v1.1.375...v1.1.378)

Updates `selenium` from 4.23.0 to 4.24.0
- [Release notes](https://github.com/SeleniumHQ/Selenium/releases)
- [Commits](https://github.com/SeleniumHQ/Selenium/compare/selenium-4.23.0...selenium-4.24.0)

---
updated-dependencies:
- dependency-name: ruff
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: production-dependencies
- dependency-name: pyright
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: production-dependencies
- dependency-name: selenium
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: production-dependencies
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-03 13:31:28 +00:00
benoit74
ef12d01958
Configure dependabot.yml
Signed-off-by: benoit74 <benoit74@users.noreply.github.com>
2024-09-03 15:30:16 +02:00
benoit74
d814c23178
Merge pull request #373 from openzim/stream_dl
Stream files downloads to not exhaust memory
2024-08-12 22:23:17 +02:00
benoit74
efdf7804c0
Stream files downloads to not exhaust memory 2024-08-12 19:56:05 +00:00
benoit74
d0d0c6e6e6
Merge pull request #370 from openzim/add_warc_tar
Add support for tar files in --warcs
2024-08-12 14:35:23 +02:00
benoit74
be1e2d6745
Better wording for capabilities
Signed-off-by: benoit74 <benoit74@users.noreply.github.com>
2024-08-11 20:42:02 +02:00
benoit74
f7df467eab
Document capabilities and known limitations
Signed-off-by: benoit74 <benoit74@users.noreply.github.com>
2024-08-11 20:40:59 +02:00
benoit74
af48be8f82
Add support for tar files in --warcs 2024-08-09 09:27:57 +00:00
benoit74
7e69d8ab75
Prepare for 2.1.1 2024-08-09 08:14:10 +00:00
benoit74
2e082c41a9
Release 2.1.0 2024-08-09 08:02:16 +00:00
benoit74
ad5adcd096
Merge pull request #368 from openzim/release
Upgrade dependencies
2024-08-09 09:56:40 +02:00
benoit74
bc06e85ced
Upgrade dependencies 2024-08-09 07:53:11 +00:00
benoit74
a0f802099a
Merge pull request #367 from openzim/sort_folder_mtime
Sort WARC directories passed to zimit by modification time
2024-08-09 09:47:56 +02:00
benoit74
eb32adfea7
Sort WARC directories passed to zimit by modification time 2024-08-07 12:16:08 +00:00
benoit74
0d5a08c912
Merge pull request #356 from openzim/only_warc2zim
Process WARC files directly and do not pass browsertrix version to warc2zim
2024-08-07 14:15:13 +02:00
benoit74
8cd1db6eef
Add option to directly process WARC files 2024-08-07 12:06:44 +00:00
benoit74
459a30a226
Do not log number of WARC files found 2024-08-07 12:06:43 +00:00
benoit74
861751a7ed
Stop fetching and passing browsertrix crawler version as scraperSuffix to warc2zim 2024-08-07 12:06:43 +00:00
benoit74
1ea533c75f
Merge pull request #351 from openzim/automated_daily_tests
Automate daily tests of ZIM behavior - Youtube only for now
2024-08-07 12:37:59 +02:00
benoit74
6d078c4dcf
Automate daily tests of ZIM behavior - Youtube only for now 2024-08-07 10:34:19 +00:00
benoit74
751e10473a
Merge pull request #348 from openzim/assert_zim_entries
Add test checking that expected entries are present
2024-08-07 12:31:29 +02:00
benoit74
f756c2c652
Fix CHANGELOG 2024-08-07 09:38:15 +00:00
benoit74
097613de29
Add test checking that expected entries are present 2024-08-07 09:38:08 +00:00
benoit74
4c35836395
Merge pull request #347 from openzim/fix_readme
Fix README and Dockerfile for imprecisions
2024-08-07 11:35:42 +02:00
benoit74
6e3951dfa7
Fix README and Dockerfile for imprecisions (#314) 2024-08-07 09:32:37 +00:00
benoit74
ea7653ef37
Merge pull request #346 from openzim/custom_behaviors
Add support for custom behaviors configuration
2024-08-07 11:31:57 +02:00
benoit74
80b6b26782
Add support for custom behaviors configuration 2024-08-07 09:28:07 +00:00
benoit74
6ab3401fa2
Merge pull request #345 from openzim/profile_is_url_doc
Make it clear that --profile argument can be an HTTP(S) URL
2024-08-07 11:26:55 +02:00
benoit74
a1efe8dccf
Make it clear that --profile argument can be an HTTP(S) URL (and not only a path) 2024-08-07 09:16:19 +00:00
benoit74
526019e095
Prepare for 2.0.7 2024-08-02 08:46:59 +00:00
benoit74
2452e60d9d
Release 2.0.6 2024-08-02 08:17:58 +00:00
benoit74
dee57a8dd8
Merge pull request #363 from openzim/browsertrix_1_2_6
Upgrade to Browsertrix Crawler 1.2.6
2024-08-02 10:15:47 +02:00
benoit74
c92782bea0
Upgrade to Browsertrix Crawler 1.2.6 2024-08-02 08:07:46 +00:00
benoit74
7305f70300
Prepare for 2.0.6 2024-07-24 06:39:21 +00:00
benoit74
021654e6b3
Release 2.0.5 2024-07-24 06:37:27 +00:00
benoit74
7357b1f2ce
Merge pull request #358 from openzim/prepare_release
Upgrade to Browsertrix Crawler 1.2.5 and warc2zim 2.0.3
2024-07-24 07:41:17 +02:00
benoit74
8a64216ac0
Upgrade to warc2zim 2.0.3 2024-07-24 05:35:55 +00:00
benoit74
9d43636559
Upgrade to Browsertrix Crawler 1.2.5 2024-07-24 05:34:25 +00:00
benoit74
52e225619e
Merge pull request #350 from openzim/faq
Add link to the FAQ in README
2024-07-22 09:21:30 +02:00
Emmanuel Engelhart
3dc7327fb0
Add link to the FAQ in README 2024-07-20 12:12:50 +02:00
benoit74
dcd6427b8a
Prepare for 2.0.5 2024-07-15 08:58:03 +00:00
benoit74
fbd01a77ce
Release 2.0.4 2024-07-15 08:52:48 +00:00
benoit74
24fdbe19d9
Merge pull request #341 from openzim/crawler_1_2_4
Upgrade to Browsertrix Crawler 1.2.4
2024-07-15 09:51:07 +02:00
benoit74
636a6a6d28
Upgrade to Browsertrix Crawler 1.2.4 2024-07-15 05:42:28 +00:00
benoit74
91a53f70ec
Prepare for 2.0.4 2024-06-24 07:56:35 +00:00
benoit74
e8995a9f59
Release 2.0.3 2024-06-24 07:50:13 +00:00
benoit74
4265effe91
Merge pull request #326 from openzim/fix_youtube
Upgrade to crawler 1.2.0
2024-06-24 09:04:36 +02:00
benoit74
2be5650a8c
Upgrade to crawler 1.2.0 2024-06-24 06:48:38 +00:00
benoit74
de0720e301
Prepare for 2.0.3 2024-06-18 14:05:47 +00:00
benoit74
b73a3e04d0
Release 2.0.2 2024-06-18 13:44:13 +00:00
benoit74
2f50db874d
Upgrade dependencies 2024-06-18 13:42:05 +00:00
benoit74
68f2ed14d6
Upgrade to warc2zim 2.0.2 2024-06-18 13:40:23 +00:00
benoit74
baa0d9ecc7
Prepare for next release 2024-06-13 11:42:17 +00:00
benoit74
2835c7b078
Release 2.0.1 2024-06-13 11:32:13 +00:00
benoit74
e6a6560b85
Merge pull request #318 from openzim/upgrade_deps
Upgrade dependencies
2024-06-13 12:28:45 +02:00
benoit74
77747ec1d3
Upgrade dependencies 2024-06-13 10:26:04 +00:00
benoit74
c67ccb9528
Allow to run dev image update manually + use main warc2zim branch for zimit dev versions 2024-06-04 15:17:33 +00:00
benoit74
83690f410d
Prepare for 2.1.0 2024-06-04 15:14:43 +00:00
benoit74
d8e6d55f87
Release 2.0.0 2024-06-03 19:59:04 +00:00
benoit74
ae6e5ffaf6
Merge pull request #309 from openzim/wait_until_choices
Fix `--waitUntil` crawler options
2024-06-03 17:17:34 +02:00
benoit74
59057bdbb1
Fix documentation about --waitUntil allowed values and drop choices checks
- add networkidle0, networkidle2 and drop networkidle to reflect crawler
  changes
- drop choices check since this is checked anyway right at scraper start
  in crawler startup (this ensures we remain more permissive should one want
  to use a different crawler version than the one supported in the Docker
  image)
2024-06-03 15:11:48 +00:00
benoit74
7806aeba63
Merge pull request #310 from openzim/invalid_user_agent
Strip user-agent whitespaces and ignore empty user agents
2024-06-03 17:11:16 +02:00
benoit74
936666917c
Strip user-agent leading whitespaces and ignore empty user agents 2024-06-03 15:06:39 +00:00
benoit74
957e52c57f
Rebuild with warc2zim 2.0.0-dev9 2024-05-30 09:29:48 +00:00
benoit74
36f2fa5f2b
Rebuild with warc2zim 2.0.0-dev8 2024-05-27 08:56:32 +00:00
benoit74
8fdad5954e
Bump Github CI Actions versions 2024-05-24 14:16:53 +00:00
benoit74
9e6c998816
Bump zimit to 2.0.0-dev5 + use warc2zim2 branch + remove zimit2 image workflow 2024-05-24 14:10:19 +00:00
benoit74
4cf6e01669
Bump browsertrix crawler to 1.1.3 2024-05-24 14:07:40 +00:00
benoit74
ce49a5d4e9
Merge branch 'zimit2' 2024-05-24 14:07:05 +00:00
benoit74
1d54b20873
Upgrade to warc2zim 2.0.0-dev6 2024-05-06 09:55:38 +00:00
benoit74
9a7415a402
Upgrade to Browsertrix Crawler 1.1.1
Continue to use warc2zim 2.0.0-dev5 for now, Docker build issue with new
stuff in warc2zim 2.0.0-dev6, will be fixed later on
2024-05-06 06:00:14 +00:00
benoit74
d54aa22bb2
Upgrade to Browsertrix Crawler 1.1.0 2024-04-19 12:30:53 +00:00
rgaudin
f637c3fccc
Merge pull request #292 from openzim/ua_not_mandatory
Change crawler default settings around userAgent and mobileDevice
2024-03-27 15:51:14 +00:00
benoit74
728784d6bf
Upgrade Browsertrix Crawler to 1.0.3 2024-03-27 15:08:59 +00:00
benoit74
e24479945f
Remove trailing characters when retrieving Browsertrix Crawler version 2024-03-27 15:08:58 +00:00
benoit74
3070fe9724
Rollback previous changes around the presence of a default user-agent
- Remove default userAgent value
- Set a default mobileDevice
- Add back comments explaining that userAgent overrides other settings
- Add back logic around the computation of the userAgentSuffix instead
  of the userAgent
- Add new noMobileDevice argument to not set the default mobileDevice
2024-03-27 15:08:58 +00:00
benoit74
54732692ac
Bump dev version 2024-03-07 12:47:38 +00:00
benoit74
867d14fd00
Merge pull request #285 from openzim/crawler_beta5
Upgrade browsertrix crawler and remove redirect handling
2024-03-07 11:25:02 +01:00
benoit74
5c716747b4
Add CHANGELOG 2024-03-07 10:16:57 +00:00
benoit74
456219deb3
Fix tests, there are in fact only 7 items to be pushed to the ZIM
7 entries are expected:
https://isago.rskg.org/
https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css
https://isago.rskg.org/static/favicon256.png
https://isago.rskg.org/conseils
https://isago.rskg.org/faq
https://isago.rskg.org/a-propos
https://isago.rskg.org/static/tarifs-isago.pdf

1 unexpected entry is not produced anymore by Browsertrix crawler:
https://dict.brave.com/edgedl/chrome/dict/en-us-10-1.bdic

This was a technical artifact
2024-03-07 10:16:51 +00:00
benoit74
a9769b2871
Upgrade to crawler 1.0.0-beta6 2024-03-07 08:00:31 +00:00
benoit74
a4cb27a793
Fix clean_url method name 2024-03-07 07:59:41 +00:00
benoit74
4d31f8eabb
Remove handling of redirects which are now done by browsertrix crawler 2024-03-07 07:59:40 +00:00
benoit74
b69f3d610f
Upgrade to crawler 1.0.0-beta5 2024-03-07 07:59:40 +00:00
benoit74
c2dc8c5ccc
Merge pull request #286 from openzim/upgrade_deps
Upgrade to Python 3.12, upgrade Python dependencies and add hatch-openzim plugin
2024-03-04 11:23:42 +01:00
benoit74
857ae5674d
Upgrade to Python 3.12 2024-03-01 14:03:25 +00:00
benoit74
89aea6b41e
Adopt hatch-openzim plugin 2024-03-01 14:03:24 +00:00
benoit74
a44c1a7c7f
Upgrade dependencies 2024-03-01 14:03:24 +00:00
benoit74
6ca9be48c7
Empty commit to release warc2zim2 commit 3c00da0 2024-02-16 10:03:04 +01:00
benoit74
01c5833c29
Empty commit to release warc2zim2 commit f837179 2024-02-09 11:10:57 +01:00
rgaudin
7caa355c31
Merge pull request #277 from openzim/scraper_suffix
Pass scraper suffix to warc2zim
2024-02-05 13:45:13 +00:00
benoit74
49da57c5b6
fixup! Set zimit and browsertrix crawler versions in final ZIM 'Scraper' metadata 2024-02-05 14:33:38 +01:00
benoit74
9244f2e69c
Set zimit and browsertrix crawler versions in final ZIM 'Scraper' metadata 2024-01-31 15:10:08 +01:00
benoit74
ef462b5024
Empty commit to release warc2zim2 commit ae18aed 2024-01-26 16:34:26 +01:00
benoit74
f4359022b2
Merge pull request #274 from openzim/add_logging 2024-01-25 08:38:35 +01:00
benoit74
a505df9fe0
Add support for --logging parameter of browsertrix crawler 2024-01-23 17:28:56 +01:00
benoit74
343d0040cf
Merge pull request #272 from openzim/adopt_bootstrap 2024-01-22 10:41:29 +01:00
benoit74
c7fdc1d11e
Simplify logger name code 2024-01-22 10:38:25 +01:00
benoit74
c0ffb74d8c
Adopt Python bootstrap conventions 2024-01-18 13:31:00 +01:00
benoit74
343fb7e770
Replace warning about service workers with a nota bene about their removal since 2.x 2024-01-18 13:28:11 +01:00
benoit74
909b6e3da8
Merge branch 'main' into zimit2 2024-01-18 09:27:00 +01:00
benoit74
f46f2568ff
Prepare for next release 2024-01-18 09:16:18 +01:00
benoit74
19b4898326
Release 1.6.3 2024-01-18 09:12:36 +01:00
benoit74
10471c1ea9
Merge pull request #269 from openzim/prepare_1_6_3 2024-01-18 09:10:04 +01:00
benoit74
eebf26f7cb
Upgrade to browsertrix crawler 0.12.4 and warc2zim 1.5.5 2024-01-18 09:05:06 +01:00
benoit74
27f9dcc53f
Empty commit to release warc2zim2 commit aca2db3 2024-01-15 17:45:56 +01:00
benoit74
22551388e0
Merge pull request #264 from openzim/use_warc2zim2 2024-01-15 08:30:32 +01:00
benoit74
a352c0c402
Add temporary Github Actions workflow to build zimit2 image 2024-01-15 08:06:50 +01:00
benoit74
e034b08852
Update CHANGELOG 2024-01-15 08:06:50 +01:00
Matthieu Gautier
1c58bbe303
Adapt to warc2zim2 branch of warc2zim.
`warc2zim2` branch creates zim files without service workers.
2024-01-15 08:00:05 +01:00
benoit74
eab3d1f189
Merge pull request #262 from openzim/warc2zim_update 2024-01-15 07:59:05 +01:00
benoit74
bbc8a48bc9
Update CHANGELOG 2024-01-15 07:55:53 +01:00
Matthieu Gautier
7bc0ed9c02 Use main branch of warc2zim in dockerfile instead of released version.
This PR adapts to API changes made in the main branch of warc2zim, so we must
use it instead of the released version.
2024-01-14 10:32:52 +01:00
Matthieu Gautier
af0c93f1df Update to new organization of warc2zim.
Older `warc2zim` method is now named `main`.
2024-01-12 12:17:35 +01:00
benoit74
cd6a55b179
Merge pull request #263 from openzim/cleanup 2024-01-08 17:13:26 +01:00
Matthieu Gautier
f80dbd11d9 Remove unwanted file.
Sounds like a vim mis-manipulation.
2024-01-08 16:42:28 +01:00
rgaudin
a62f31ed0d
Merge pull request #254 from openzim/collections_param
Collections and temporary directory parameters
2023-11-23 14:50:35 +00:00
benoit74
d6c0c6ce63
Fixes following review + we need to create one subdir per run to not mix data / clean up correctly after run 2023-11-23 13:08:45 +01:00
benoit74
a2b4c71ec9
Display warc2zim call args 2023-11-23 09:02:33 +01:00
benoit74
b98e8f7027
Fix handling of '--collection' parameter + add '--tmp' + enhance logging 2023-11-23 09:02:08 +01:00
benoit74
79d5f8bc7b
Tidy code automatically 2023-11-23 08:50:59 +01:00
benoit74
216ac09d8c
Enhance .gitignore with toptal generated one 2023-11-23 08:48:00 +01:00
benoit74
51ef841836
Prepare next release 2023-11-17 11:30:37 +01:00
benoit74
6e6c0e8b39
Release 1.6.2 2023-11-17 11:25:09 +01:00
benoit74
7ca08791e7
Upgrade to browsertrix crawler 0.12.3 2023-11-17 11:17:41 +01:00
rgaudin
5512e814c7
Merge pull request #249 from openzim/fix_readme
Enhance README by removing Chrome and headless reference
2023-11-16 12:56:46 +00:00
benoit74
60b970f844
Enhance README by removing Chrome and headless reference 2023-11-16 13:14:11 +01:00
rgaudin
99ca5ca901
Merge pull request #246 from openzim/fix_zero_arg
Fix zero arg + crawler 0.12.2
2023-11-16 09:15:25 +00:00
benoit74
51d0409128
Add venv to gitignore 2023-11-16 08:22:23 +01:00
benoit74
4ad41a7d54
Upgrade to browsertrix crawler 0.12.2 2023-11-15 15:26:49 +01:00
benoit74
d24775d70c
Fix logic passing args to crawler
- do not set arg only if value is None or False
- remove default value 0 from args (this was not passed but would be
  with new corrected code and would induce a different crawler behavior in fact)
2023-11-15 15:26:18 +01:00
benoit74
a73114d140
Release Browsertrix 0.12.1 2023-11-06 10:00:03 +01:00
benoit74
c98e4505a8
Prepare next release 2023-11-02 21:10:28 +01:00
benoit74
9e9140690d
Revert temporary fix in tests now that UA has been fixed 2023-11-02 21:05:23 +01:00
benoit74
31a652c8dd
Fix again number of items 2023-11-02 20:59:05 +01:00
benoit74
36ba61b0a5
Release v1.6.0 2023-11-02 20:54:07 +01:00
benoit74
24a250f0ee
Fix number of items for warc2zim since move to Brave changed this 2023-10-30 11:56:06 +01:00
benoit74
56fb86e531
Update to browsertrix crawler 0.12.0-beta2 2023-10-30 11:25:58 +01:00
rgaudin
e0a4d3ffef
Merge pull request #229 from openzim/user_agent
Revisit check-url behavior and provide User-Agent a custom default value
2023-10-26 09:02:09 +00:00
benoit74
b89f57b843
Fix line length 2023-10-26 08:33:06 +02:00
benoit74
18ed6ca540
Always pass UserAgent even when mobileDevice is set 2023-10-24 20:44:06 +02:00
benoit74
d487d658a4
Use GET instead of HEAD for greater compatibility + close the connection automatically 2023-10-23 14:13:07 +02:00
benoit74
2a317c91e4
User-Agent has a default and is used for check_url 2023-10-23 13:45:26 +02:00
benoit74
f22bb9218c
Merge pull request #226 from openzim/check_url_fail
Fail on all HTTP error codes in check_url
2023-10-23 11:14:41 +02:00
benoit74
d8f6cef7f3
Fail on all HTTP error codes in check_url 2023-10-23 11:09:16 +02:00
renaud gaudin
00051453e1
releasing 1.5.3 with crawler 0.11.2 2023-10-02 10:51:06 +00:00
renaud gaudin
3769c77cd4
releasing with crawler 0.11.1 2023-09-19 09:04:23 +00:00
benoit74
df2403c6dd
Update CHANGELOG.md 2023-09-18 16:16:12 +02:00
renaud gaudin
2be5562553
releasing 1.5.1 with updated crawler and warc2zim 2023-09-18 08:28:09 +00:00
renaud gaudin
ea210bcd10
Using main warc2zim 2023-09-11 10:43:28 +00:00
rgaudin
da055a828d
Merge pull request #212 from openzim/no_empty_stats_file
Do not create empty stats file
2023-08-30 10:13:42 +00:00
benoit74
7e24388820
Do not create empty stats file 2023-08-28 13:10:07 +02:00
renaud gaudin
12dab25e61
v1.5.0 with --long-description 2023-08-23 16:33:46 +00:00
renaud gaudin
951241d8bf
releasing 1.4.1 with crawler 0.10.4 and warc2zim 1.5.3 2023-08-23 12:16:19 +00:00
renaud gaudin
df0fa9bbaf
releasing 1.4.1 with crawler 0.10.4 2023-08-23 12:15:01 +00:00
renaud gaudin
c9c7e7a26f
Fixed #178: publish images for arm64 2023-08-23 12:14:12 +00:00
renaud gaudin
1224476b41
crawler 0.10.3 and main warc2zim 2023-08-10 18:51:19 +00:00
renaud gaudin
e590e851be
fixed device list source 2023-08-07 10:12:39 +00:00
renaud gaudin
906161ea51
fixed changelog (for 1.4.0) 2023-08-02 14:47:23 +00:00
renaud gaudin
cbaaa77a1f
releasing 1.4.0 2023-08-02 14:42:10 +00:00
rgaudin
7cb118eaeb
Merge pull request #201 from openzim/lang
crawler 0.10.2
2023-08-02 14:35:52 +00:00
renaud gaudin
722306d3bf
Using a dedicated venv for zimit in image
zimit dependencies conflict with crawler's python ones 2023-08-02 14:31:42 +00:00
2023-08-02 14:31:42 +00:00
renaud gaudin
61dc792653
Fixed #191: --lang to crawler, --zim-lang to warc2zim 2023-08-02 11:26:47 +00:00
renaud gaudin
941db5fdfc
using crawler 0.10.2 2023-08-02 11:26:42 +00:00
rgaudin
47ede96f91
Merge pull request #198 from f0sh/add-config
added crawler --config option to arguments
2023-07-17 09:34:41 +00:00
f0sh
95c27bad08 added crawler config option to arguments
according to https://github.com/webrecorder/browsertrix-crawler#yaml-crawl-config the crawler can be configured with a YAML config file,
which gives more options to configure the crawler to your needs without implementing all the options into zimit.py.
2023-07-17 09:29:45 +00:00
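For illustration only, a crawl config passed to the crawler's --config option (as described in the commit above) might look roughly like the following sketch; the seed URL is the openzim test site used elsewhere in this repository, while the depth and workers keys are assumed examples of crawler options rather than a guaranteed schema:

# hypothetical crawl config sketch, not an authoritative schema
seeds:
  - https://website.test.openzim.org/  # start URL(s) to crawl
depth: 2                               # assumed option: how deep to follow links
workers: 4                             # assumed option: number of parallel browser workers

Such a file gives access to crawler options that zimit itself does not expose as flags.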
Popo le Chien
57e2f41439
Merge pull request #199 from yukiqt/patch-1
minor spelling mistake
2023-07-13 15:18:22 +02:00
yuki
b568848a98
minor spelling mistake
i win
2023-07-13 12:49:34 +00:00
renaud gaudin
af8196095d
using 0.10.0-beta.4 2023-05-23 08:10:03 +00:00
renaud gaudin
70a80681a6
use bet3 and --failOnFailedSeed 2023-05-22 11:23:46 +00:00
renaud gaudin
8d287466bd
Make sure to rebuild warc2zim from main to use unminified 2023-05-22 09:57:51 +00:00
renaud gaudin
fc9ad3759e account for new failed field in crawl.json 2023-04-27 11:56:14 +00:00
renaud gaudin
c31e80608e Using browsertrix-crawler 0.10.0-beta.0 2023-04-27 11:35:14 +00:00
renaud gaudin
8b4ea950a8 Using browsertrix-crawler 0.9.1 2023-04-25 08:53:52 +00:00
renaud gaudin
8ecd0a3210 upgraded to browsertrix-crawler 0.9.0 2023-04-10 13:08:12 +00:00
renaud gaudin
4f676e37c7 Using browsertrix-crawler 0.9.0-beta.2 2023-04-04 08:49:25 +00:00
renaud gaudin
b7265b49b6 updated to crawler 0.9 (b1) 2023-03-24 07:26:33 +00:00
renaud gaudin
b8714d1260 removed references to docker.io 2023-03-22 13:55:07 +00:00
renaud gaudin
6324b7c7c5 Fixed #172: Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM 2023-03-10 12:10:06 +00:00
renaud gaudin
238d1a6016 using crawler 0.8.1 and warc2zim's main 2023-02-27 09:57:36 +00:00
Emmanuel Engelhart
79d444e7ea
Update GitHub workflow actions 2023-02-07 14:24:29 +01:00
renaud gaudin
64bc8bf09f releasing 1.3.1 2023-02-06 11:48:44 +01:00
renaud gaudin
459778d472 released v1.3.0 2023-02-02 16:31:45 +00:00
renaud gaudin
af9a3d24d9 removed obsolete ref to cap-add in README 2023-02-02 16:30:15 +00:00
renaud gaudin
4b7e504d99 Updated test and stats to new crawl.json format 2023-01-31 11:12:36 +00:00
renaud gaudin
554fff5c87 Using browsertrix-crawler 0.8.0-beta.1 2023-01-31 10:34:32 +00:00
renaud gaudin
8fd9462e25 triggering a rebuild with updated (still main) warc2zim 2023-01-16 11:39:05 +00:00
renaud gaudin
0172c53c50 warc2zim is now at main branch, not master 2023-01-13 10:02:29 +00:00
renaud gaudin
3756c6612f Using browsertrix-crawler 0.8.0-beta.0 2023-01-13 09:59:07 +00:00
Kelson
511fccdc56
"main" is the new default branch 2022-12-21 11:07:37 +01:00
Kelson
859e79c165
"main" is the new default branch 2022-12-21 11:06:50 +01:00
renaud gaudin
cf26f8c33a Using browsertrix-crawler 0.7.1 2022-11-16 11:20:39 +00:00
renaud gaudin
0624c50121 Using browsertrix-crawler 0.7.0 (release) 2022-10-12 14:57:01 +00:00
renaud gaudin
fab4ff6bf5 using crawler 0.7.0-beta.5 2022-09-21 08:29:59 +00:00
renaud gaudin
a9cf1cd9c3 using crawler 0.7.0-beta.4 2022-09-09 07:26:03 +00:00
renaud gaudin
2d4375fd0a use crawler 0.7.0-beta.3 2022-09-03 18:44:48 +00:00
renaud gaudin
472c4cf41a trigger build for warc2zim update 2022-08-30 10:53:03 +00:00
renaud gaudin
ce68493087 increased check_url timeouts 2022-07-25 08:41:08 +00:00
renaud gaudin
857e044c84 Fixed --allowHashUrls incorrectly requiring a value 2022-07-18 10:23:16 +00:00
renaud gaudin
8c6d2bfb45 using browsertrix-crawler 0.7 beta 2022-07-04 15:08:49 +00:00
renaud gaudin
b79ad1b138 use master warc2zim in-between releases 2022-06-30 09:42:50 +00:00
renaud gaudin
142970bc0a Fixed #137: normalizes homepage redirects to standard ports 2022-06-22 09:57:01 +00:00
renaud gaudin
b29aeb08e6 back to dev 2022-06-21 17:20:30 +00:00
renaud gaudin
0eeb2ad9e3 Releasing 1.2.0 2022-06-21 17:08:38 +00:00
renaud gaudin
dffc81860e updated docker publish action 2022-06-21 17:06:40 +00:00
renaud gaudin
e32aac3ec0 code styling 2022-06-21 17:05:08 +00:00
rgaudin
b2bb77cd65
Merge pull request #108 from openzim/crawler-with-video
update to latest browsertrix-crawler + warc2zim
2022-06-21 16:59:15 +00:00
renaud gaudin
932f97c999 updated tests for crawler and warc2zim 2022-06-21 16:55:32 +00:00
renaud gaudin
1f490ace8f Updated to browsertrix-crawler 0.6 and warc2zim 1.4 2022-06-21 12:04:56 +00:00
renaud gaudin
8b5eeb31c7 using crawler 0.6beta1 2022-06-14 14:58:33 +00:00
Ilya Kreymer
acf0aaf552 update to latest browsertrix-crawler
test with dev build of warc2zim 1.4.0 release
2022-06-14 14:58:33 +00:00
rgaudin
823e6bbb01
Merge pull request #132 from openzim/ci
updated CI test website URL
2022-06-13 10:05:25 +00:00
renaud gaudin
e29b6f3ad6 CI on push is sufficient 2022-06-13 10:02:35 +00:00
renaud gaudin
885e1763a1 updated CI test website URL 2022-06-13 09:57:37 +00:00
Kelson
80f3d3293f
Merge pull request #129 from openzim/release-badge
Release badge
2022-06-11 20:06:20 +02:00
Emmanuel Engelhart
0025901959
Replace Docker Hub build badge with CI badge 2022-06-11 11:56:18 +02:00
Emmanuel Engelhart
99f8fbafe1
Movebot does not exist anymore 2022-06-11 11:53:35 +02:00
Emmanuel Engelhart
3d3f4fb121
Add release tag 2022-06-11 11:52:48 +02:00
rgaudin
8bcd692462
Merge pull request #125 from JensKorte/patch-1
Update README.md
2022-05-30 22:07:10 +02:00
JensKorte
1f31d6c1a5
Update README.md
relative link didn't work and was replaced by https://github.com/openzim/warc2zim
2022-05-30 21:45:18 +02:00
renaud gaudin
98587045b4 Updated readme: warc2zim params can be passed 2022-05-03 10:31:34 +00:00
renaud gaudin
efd8ca53b4 updating crawler and warc2zim 2021-06-10 14:14:11 +00:00
renaud gaudin
14ced5c481 fixed tests for new folder structure 2021-05-12 17:15:19 +00:00
renaud gaudin
2e9c129523 new crawler folder structure 2021-05-12 17:03:48 +00:00
renaud gaudin
03abf6050a updated warc2zim and browsertrix-crawler 2021-05-12 16:28:34 +00:00
renaud gaudin
f746f7b020 use same waitUntil defaults as current crawler 2021-03-04 10:40:12 +00:00
renaud gaudin
14fc8ffe0f released v1.1.3 2021-03-01 09:59:34 +00:00
rgaudin
ae820472de
Merge pull request #85 from openzim/limit-hit
capture and incorporates limit info from crawl
2021-02-15 17:23:42 +00:00
renaud gaudin
cfa4b0e7f8 capture and incorporates limit info from crawl 2021-02-15 17:20:43 +00:00
renaud gaudin
964746481f using crawler 0.2.0 2021-02-15 17:15:54 +00:00
rgaudin
69892a215f
Merge pull request #84 from myt00seven/master
Update README.md with a --exclude example
2021-01-26 08:12:09 +00:00
lakesidethinks
6da4714cff Update README.md 2021-01-25 12:31:09 -06:00
renaud gaudin
d0d51539fe updated CHANGELOG 2021-01-15 12:59:00 +00:00
rgaudin
c3a7a02121
Merge pull request #80 from openzim/issue76
more flexible url redirects acceptance
2021-01-15 12:55:14 +00:00
renaud gaudin
76c92bdb4c Fixed #76: more flexible url redirects acceptance
- accepts redirects to same first-level domain
- accepts redirects matching scope
2021-01-15 12:50:53 +00:00
renaud gaudin
610ecc7e5c using docker publish v5 2021-01-14 18:27:07 +00:00
rgaudin
a60f7a392f
Merge pull request #79 from openzim/custom-css
Add custom-css option support (warc2zim)
2021-01-14 18:24:26 +00:00
renaud gaudin
871f7ab58d Add custom-css option support (warc2zim) 2021-01-14 18:11:22 +00:00
rgaudin
e91cd7921e
Added domains blocklist (#77)
All domains from the 3 [anudeepND](https://github.com/anudeepND/blacklist) lists
are now blocked at local resolver level by updating /etc/hosts in entrypoint.

- this saves network and CPU resources by failing early.
- this is wanted in almost all cases
- can be bypassed by setting a blank entrypoint
2021-01-12 07:31:16 +01:00
renaud gaudin
f4c11dc948 using published version of action 2020-12-22 15:48:12 +00:00
renaud gaudin
01302d3885 added package assignment 2020-12-22 11:15:51 +00:00
renaud gaudin
f72caad35c added Docker publish GA 2020-12-22 11:10:53 +00:00
renaud gaudin
71603f8a15 fixed version number in changelog 2020-12-22 11:09:41 +00:00
rgaudin
ff5c6b3dc9
Merge pull request #68 from openzim/github-bots
GitHub bots
2020-12-15 11:23:28 +00:00
Emmanuel Engelhart
0cb3db6f16 Add move/stale bots configuration 2020-12-15 12:19:21 +01:00
Ilya Kreymer
508286ef78
Update to latest version of browsertrix-crawler (0.1.4) (#66)
to add autofetch support for srcset (and also stylesheets)
should fix (#63)
2020-12-14 09:36:41 +01:00
renaud gaudin
56d319ce3f added changelog 2020-12-14 08:13:54 +00:00
rgaudin
f6d44314cd
Fixed #58: updated README with limitations 2020-12-12 13:58:32 +00:00
rgaudin
eb5ca99bfb
Merge pull request #62 from openzim/progres
Enhanced --statsFilename support
2020-12-10 10:50:18 +00:00
renaud gaudin
85fad62b61 Updated test to new stats files
- verify output of crawl, warc2zim and zimit file
- using a simpler tag for CI test image as to not confuse it with public image
2020-12-10 10:44:49 +00:00
renaud gaudin
3ffa34d46e Enhanced --statsFilename support
- `--statsFilename` to now represent overall zimit progress and not just crawling
- Exposing a simpler (`done`, `total`) json format for progress
- Live converting individual step's progress into this file
- using warc2zim 1.3.3 for its `--progress-file` support
- Currently arbitrarily assigning 90% to crawl and 10% to warc2zim
2020-12-10 10:44:39 +00:00
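As a concrete illustration of the progress format described in the commit above, the stats file holds a simple done/total pair; the field names come from the commit message and the values here are made up:

{"done": 90, "total": 100}

zimit live-updates this file as the crawl and then warc2zim report their own progress, with the split currently weighted 90% crawl / 10% warc2zim as noted above.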
rgaudin
b9ed1d00a2
Merge pull request #60 from openzim/stats
stats: add support for stats output after every page crawled, fixes #39
2020-12-04 11:21:44 +00:00
Ilya Kreymer
5084c54af6 stats: add support for stats output after every page crawled, fixes #39
tests: integration test checks for stats.json
2020-12-02 16:28:25 +00:00
rgaudin
9422defe86
Merge pull request #54 from openzim/mobile-useragent
Mobile Device + User Agent Support
2020-11-16 11:14:52 +00:00
Ilya Kreymer
c0bb0503b8 add support for --useSitemap <url> flag to load additional URLs, potentially fixing #34!
reformat
2020-11-14 22:01:36 +00:00
Ilya Kreymer
a801a1eef6 ci: improve tests, validate all UA, and check for at least one found 2020-11-14 20:50:03 +00:00
Ilya Kreymer
4723376ebc ci: add --keep to keep warc files 2020-11-14 20:33:36 +00:00
Ilya Kreymer
5e4b3d80b3 ci: path fix 2020-11-14 20:30:15 +00:00
Ilya Kreymer
82f0fae959 update to warc2zim 1.3.2
fix ci test command
2020-11-14 20:27:43 +00:00
Ilya Kreymer
a930542af8 mobile + user agent support:
- add support for custom user agent suffix +Zimit with email address specifyable via --adminEmail cmd arg #38
- add ability to crawl as mobile device with --mobileDevice flag (default to iPhone X)
add integration tests runnable in docker via github actions
logging: print temp dir, flush print statements for immediate logging
2020-11-14 20:10:16 +00:00
rgaudin
0e3af5124b
Merge pull request #46 from openzim/crawler-split
Split zimit from webrecorder/browsertrix-crawler
2020-11-10 09:16:46 +00:00
renaud gaudin
0082d313ae Code formatting
- Added requests as a dependency (although currently brought in by warc2zim)
- removed unused imports
- black code formatting and some cleanup
- revamped actual_url fetching
2020-11-10 09:12:34 +00:00
renaud gaudin
568068ecfc WARC2zim version update
- updated to latest warc2zim release
- fixed param name typo in README
- added creation of `/output` so container can run on default params even if /output
is not a mounted volume
2020-11-10 08:26:56 +00:00
Ilya Kreymer
989567e05e README: fix typos in example command 2020-11-10 06:10:12 +00:00
Ilya Kreymer
5b640f2f8b main page redirect check: check if specified URL is a redirect, and use final URL if it is. Reject if redirect goes to a different domain, as suggested in #42 2020-11-10 06:07:27 +00:00
Ilya Kreymer
88a280bc58 ci: add simple github action for building image, running crawl, verifying zim exists 2020-11-10 03:55:33 +00:00
Ilya Kreymer
6b5dbd20cb base image: use latest base image, warc2zim 2020-11-10 03:32:26 +00:00
Ilya Kreymer
c228c8300c split zimit from core browsertrix-crawler, which has been moved to https://github.com/webrecorder/browsertrix-crawler
use versioned browsertrix-crawler:0.1.0 image
part of #45
2020-11-03 17:21:54 +00:00
rgaudin
f6282dbf14
Merge pull request #36 from openzim/video-capture-work
work on automated capture of video (#9)
2020-10-28 19:12:41 +00:00
Ilya Kreymer
ae9aba7a00 set default newContext to page as before 2020-10-28 18:19:27 +00:00
Ilya Kreymer
8ceabce0e9 update to warc2zim 1.3.0 2020-10-28 18:15:30 +00:00
Ilya Kreymer
a425cd6956 - add 'newContext' command line option to specify the context for each new url: new page, new session, or new browser
- convert the scope option to be a regex instead of just prefix
- remove custom wabac.js, now using released version in warc2zim
2020-10-27 18:00:44 +00:00
Ilya Kreymer
91fe76c56e work on automated capture of video (#9)
- add autoplay behavior to reload known video sites to autoplay
- for video/audio on page, queue directly for loading if video.src or audio.src set to valid url, otherwise load through play in browser (may be slower)
- add extra wait if reloading for autoplay
- timeouts: set timeout for puppeteer-cluster double to timeout of page to avoid hitting that timeout during regular operation
- use browser from oldwebtoday/chrome:84 and puppeteer-core instead of puppeteer browser for consistent results
- temp testing: use custom wabac.js sw for testing (will use default from warc2zim), using warc2zim fuzzy-match branch for now
2020-10-21 06:09:10 +00:00
rgaudin
c6f27f3bf6
Merge pull request #30 from openzim/fix-typo
cleanup: fix typo in print msg
2020-10-20 16:32:29 +00:00
Ilya Kreymer
ab4e2e1a14 cleanup: fix typo in print msg 2020-10-20 15:55:25 +00:00
rgaudin
d8e313c492
Merge pull request #29 from openzim/python-runner
Replace shell script with python zimit.py + crawl dedup improvements
2020-10-20 08:08:44 +00:00
Ilya Kreymer
904c95963c update to warc2zim 1.2.0, fixes from code review:
- pass warc directory to warc2zim, supported in 1.2.0
- use Path for temp_root_dir
- use seconds instead of millis for page timeout, update help text
- fix help text for --scope
- restrict waitUntil to valid choices
2020-10-19 19:44:01 +00:00
renaud gaudin
fb2232d8b1 not using partial paths 2020-10-19 15:14:46 +00:00
Ilya Kreymer
2e2db2f352 simplification: remove zimit user, su, and run chrome as root with --no-sandbox
log exclusion regex
2020-10-16 21:04:10 +00:00
Ilya Kreymer
5b3101f2d8 add missing crawler.js! 2020-10-16 20:29:51 +00:00
Ilya Kreymer
d9ba7e246f add .bandit exclusion for codefactor 2020-10-16 19:21:37 +00:00
Ilya Kreymer
65b3b533b7 add .dockerignore to speed up docker build 2020-10-16 19:12:15 +00:00
Ilya Kreymer
2c1b401e93 fix codefactor complaints 2020-10-16 19:11:31 +00:00
Ilya Kreymer
c26fe5d4cd replace run.sh with python runner zimit.py, as suggested in #28
should fix arg parsing issues in #28,#18
warc2zim now called directly from zimit.py, both for arg check and for actual zim creation
crawler renamed to crawler.js, no longer handles zim creation, only crawling
add signal handling to both zimit and crawler.js for smooth shutdown, should fix #25
pywb: update to latest dev version with dedup support, add redis for deduplication
2020-10-16 18:54:04 +00:00
rgaudin
9f62f52a02
Merge pull request #27 from openzim/opts-tweak
more options tweaking:
2020-10-09 15:52:17 +00:00
Ilya Kreymer
9046f21f53 quote repeated params, add space 2020-10-09 15:29:07 +00:00
Ilya Kreymer
1198d1b6b9 correct parsing of repeated params, fixes #26, #18
switch to 'zimit' CMD instead of ENTRYPOINT  (#21)
2020-10-09 15:19:20 +00:00
rgaudin
0a494ee168
Merge pull request #24 from openzim/run-script-cleanup
Run Script + Param Validation Cleanup
2020-10-09 09:29:59 +00:00
Ilya Kreymer
bd8d90efa5 quote params passed to warc2zim, should fix #18 2020-10-09 05:37:30 +00:00
Ilya Kreymer
e608cbd71e bump timeout to 90s, per suggestions in #20 2020-10-09 05:24:11 +00:00
Ilya Kreymer
9e9ec82ad1 param validation for warc2zim:
- ensure trailing slash is added #19
- better handling of boolean params #18
2020-10-09 04:42:46 +00:00
Ilya Kreymer
2b2f96d983 run script improvements:
- set permissions on volume dir to address #22
- propagate return code #21
- catch SIGTERM and SIGINT signals to clean up temp dir
2020-10-09 04:23:11 +00:00
renaud gaudin
736deccbb5 using tagged version for base image
so we have a known python version
so there are wheels suitable for the docker image (cp38)
2020-10-08 12:01:05 +00:00
renaud gaudin
901729a069 added link to /dev/shm info on readme 2020-10-07 13:56:21 +00:00
rgaudin
8842bf7048
Merge pull request #17 from openzim/config-opts
Config option improvements + warc2zim passing + SSL check disabled
2020-10-07 10:30:54 +00:00
renaud gaudin
1eee9ab633 added locales to image 2020-10-07 10:29:38 +00:00
Ilya Kreymer
880165ce91 fix Dockerfile per codefactor recommends 2020-10-06 16:01:55 +00:00
Ilya Kreymer
94f0b7362d Merge README changes from master 2020-10-06 15:53:22 +00:00
Ilya Kreymer
3519d32ba6 disable https checking for fetch() head check (pywb already ignores https certs for capture), should fix #10 2020-10-06 15:49:45 +00:00
Ilya Kreymer
24c843c4af update to latest warc2zim (1.1.0) 2020-10-06 15:36:45 +00:00
Ilya Kreymer
daa2492655 config work: pass remaining config opts to warc2zim, fixes #13
warc2zim check: add runWarc2Zim() to test warc2zim opts before running for validity
run script: create temp dir in output dir to ensure all data is on the volume
run script: add --keep option to keep temp dir, otherwise delete
2020-10-06 06:25:40 +00:00
Ilya Kreymer
e4128c8183 add help text/validation for all config options, url now must be passed in with --url
add --scroll boolean option, which activates simple autoscroll behavior
use chrome user-agent for manual fetch
reenable pywb option
cleanup Dockerfile: update to warc2zim 1.0.1, install fonts-stix for math science sites
update README
2020-09-29 05:22:33 +00:00
Kelson
bb5b7e48c1
Additional README.md changes (#16) 2020-09-25 12:02:43 +02:00
rgaudin
252516e38c
Merge pull request #14 from openzim/kelson42-patch-1
Update README.md
2020-09-25 09:47:29 +00:00
Kelson
ac650bff05
Update README.md 2020-09-25 11:36:30 +02:00
rgaudin
01f2471ab8
Merge pull request #11 from openzim/develop
Initial prototype
2020-09-23 08:44:34 +00:00
renaud gaudin
71e94914aa Added gevent update to prevent segfault in uwsgi 2020-09-23 08:42:08 +00:00
Ilya Kreymer
6a925748d5 excludes: fix no exclude default 2020-09-22 18:12:15 +00:00
Ilya Kreymer
f25b390f15 add regex exclusions 2020-09-22 17:48:09 +00:00
Ilya Kreymer
f252245983 try using regular puppeteer, only copy deps from chrome image
pywb: increase uwsgi processes, disable autoindex/autofetch for better perf
2020-09-22 06:09:33 +00:00
Ilya Kreymer
b00c4262a7 add --limit param for max URLs to be captured
add 'html check', only load HTML in browsers, load other content-types directly via pywb, esp for PDFs (work on #8)
improved error handling
2020-09-21 07:16:26 +00:00
Ilya Kreymer
ff2773677c crawling: move checking logic to shouldCrawl, remove hashtag before checking seen list 2020-09-19 23:19:21 +00:00
Ilya Kreymer
9b23de828b
Update README.md 2020-09-19 15:53:23 -07:00
Ilya Kreymer
4e04645e6b move warc2zim to be launched by node process 2020-09-19 22:47:19 +00:00
Ilya Kreymer
1de577bd78 use puppeteer-cluster for parallel crawling
use yargs to parse command-line args
2020-09-19 22:19:20 +00:00
Ilya Kreymer
7346527a81 initial setup - single url capture with existing browser image, pywb, puppeteer and warc2zim 2020-09-19 17:38:52 +00:00
rgaudin
bdfd9be399
Added LICENSE document 2020-09-01 10:22:32 +02:00
renaud gaudin
15cf636ff3 reset master branch for 2020 codebase 2020-08-19 09:36:48 +02:00
Kelson
d178431e20
Github Kiwix Sponsoring page link 2020-02-01 18:14:09 +01:00
44 changed files with 4983 additions and 8310 deletions

.github/FUNDING.yml (4 changed lines)

@@ -1,6 +1,6 @@
# These are supported funding model platforms
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
github: kiwix # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
@@ -9,4 +9,4 @@ community_bridge: # Replace with a single Community Bridge project-name e.g., cl
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
custom: https://kiwix.org/support-us/
custom: # https://kiwix.org/support-us/

.github/stale.yml (new file, 15 lines)

@@ -0,0 +1,15 @@
daysUntilClose: false
staleLabel: stale
issues:
daysUntilStale: 60
markComment: >
This issue has been automatically marked as stale because it has not had
recent activity. It will be now be reviewed manually. Thank you
for your contributions.
pulls:
daysUntilStale: 7
markComment: >
This pull request has been automatically marked as stale because it has not had
recent activity. It will be now be reviewed manually. Thank you
for your contributions.

.github/workflows/DailyTests.yaml (new file, 34 lines)

@@ -0,0 +1,34 @@
name: DailyTests
on:
schedule:
- cron: "0 4 * * *"
workflow_dispatch:
jobs:
run-daily-tests:
runs-on: ubuntu-22.04
steps:
- name: checkout
uses: actions/checkout@v4
- name: build zimit image
run: docker build -t local-zimit .
- name: run crawl of test website
run: docker run -v $PWD/output:/output local-zimit zimit --seeds https://website.test.openzim.org/ --name tests_eng_test-website --zim-file tests_eng_test-website.zim
- name: archive ZIM
uses: actions/upload-artifact@v4
with:
name: tests_eng_test-website.zim
path: output/tests_eng_test-website.zim
retention-days: 30
- name: build tests-daily Docker image
run: docker build -t local-tests-daily tests-daily
- name: run integration test suite
run: docker run -e SKIP_YOUTUBE_TEST="True" -v $PWD/tests-daily/daily.py:/app/daily.py -v $PWD/output:/output local-tests-daily bash -c "cd /app && pytest -v --log-level=INFO --log-format='%(levelname)s - %(message)s' daily.py"

53
.github/workflows/Publish.yml vendored Normal file
View file

@@ -0,0 +1,53 @@
name: Publish released version
on:
release:
types: [published]
jobs:
publish-amd64:
runs-on: ubuntu-24.04
name: "Publish for AMD64"
steps:
- uses: actions/checkout@v4
- name: Build and push Docker image
uses: openzim/docker-publish-action@v10
with:
image-name: openzim/zimit
tag-pattern: /^v([0-9.]+)$/
latest-on-tag: true
restrict-to: openzim/zimit
registries: ghcr.io
credentials: |
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
repo_description: auto
repo_overview: auto
platforms: |
linux/amd64
# Disabled for now, see https://github.com/openzim/zimit/issues/463
# publish-arm64:
# runs-on: ubuntu-24.04
# name: "Publish for ARM64"
#
# steps:
# - uses: actions/checkout@v4
#
# - name: Build and push Docker image
# uses: openzim/docker-publish-action@v10
# with:
# image-name: openzim/zimit
# tag-pattern: /^v([0-9.]+)$/
# latest-on-tag: true
# restrict-to: openzim/zimit
# registries: ghcr.io
# credentials: |
# GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
# GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
# repo_description: auto
# repo_overview: auto
# platforms: |
# linux/arm64
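
The tag-pattern above decides which release tags trigger a publish; its capture group is presumably what the action turns into the published image tag. A quick local sketch in Python (hypothetical tag names; the action's own matching may differ in detail):

#!/usr/bin/env python3
# Sketch: evaluate the tag-pattern from the Publish workflow against sample tags.
import re

TAG_PATTERN = re.compile(r"^v([0-9.]+)$")  # copied from the workflow above

for tag in ("v3.0.5", "v3.0.6-rc1", "dev", "3.0.5"):
    match = TAG_PATTERN.match(tag)
    print(f"{tag}: {'matches, group ' + match.group(1) if match else 'ignored'}")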

View file

@@ -0,0 +1,55 @@
name: Publish Docker dev image
on:
push:
branches:
- main
workflow_dispatch:
jobs:
publish-amd64:
runs-on: ubuntu-24.04
name: "Publish for AMD64"
steps:
- uses: actions/checkout@v4
- name: Build and push Docker image
uses: openzim/docker-publish-action@v10
with:
image-name: openzim/zimit
manual-tag: dev
latest-on-tag: false
restrict-to: openzim/zimit
registries: ghcr.io
credentials: |
GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
repo_description: auto
repo_overview: auto
platforms: |
linux/amd64
# Disabled for now, see https://github.com/openzim/zimit/issues/463
# publish-arm64:
# runs-on: ubuntu-24.04-arm
# name: "Publish for ARM64"
#
# steps:
# - uses: actions/checkout@v4
#
# - name: Build and push Docker image
# uses: openzim/docker-publish-action@v10
# with:
# image-name: openzim/zimit
# manual-tag: dev
# latest-on-tag: false
# restrict-to: openzim/zimit
# registries: ghcr.io
# credentials: |
# GHCRIO_USERNAME=${{ secrets.GHCR_USERNAME }}
# GHCRIO_TOKEN=${{ secrets.GHCR_TOKEN }}
# repo_description: auto
# repo_overview: auto
# platforms: |
# linux/arm64

34
.github/workflows/QA.yaml vendored Normal file
View file

@@ -0,0 +1,34 @@
name: QA
on:
pull_request:
push:
branches:
- main
jobs:
check-qa:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version-file: pyproject.toml
architecture: x64
- name: Install dependencies (and project)
run: |
pip install -U pip
pip install -e .[lint,scripts,test,check]
- name: Check black formatting
run: inv lint-black
- name: Check ruff
run: inv lint-ruff
- name: Check pyright
run: inv check-pyright

81
.github/workflows/Tests.yaml vendored Normal file
View file

@@ -0,0 +1,81 @@
name: Tests
on:
pull_request:
push:
branches:
- main
jobs:
run-tests:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version-file: pyproject.toml
architecture: x64
- name: Install dependencies (and project)
run: |
pip install -U pip
pip install -e .[test,scripts]
- name: Run the tests
run: inv coverage --args "-vvv"
- name: Upload coverage report to codecov
uses: codecov/codecov-action@v4
with:
token: ${{ secrets.CODECOV_TOKEN }}
build_python:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version-file: pyproject.toml
architecture: x64
- name: Ensure we can build Python targets
run: |
pip install -U pip build
python3 -m build --sdist --wheel
# this job replaces the standard "build_docker" job since it builds the docker image
run-integration-tests:
runs-on: ubuntu-22.04
steps:
- name: checkout
uses: actions/checkout@v4
- name: build image
run: docker build -t local-zimit .
- name: ensure help display without issue
run: docker run -v $PWD/output:/output local-zimit zimit --help
- name: run crawl with soft size limit
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeSoftLimit 8192 --name tests_en_sizesoftlimit --zim-file tests_en_sizesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizesoftlimit.json
- name: run crawl with hard size limit
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --sizeHardLimit 8192 --name tests_en_sizehardlimit --zim-file tests_en_sizehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_sizehardlimit.json || true
- name: run crawl with soft time limit
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeSoftLimit 1 --name tests_en_timesoftlimit --zim-file tests_en_timesoftlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timesoftlimit.json
- name: run crawl with hard time limit
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/ --timeHardLimit 1 --name tests_en_timehardlimit --zim-file tests_en_timehardlimit.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats_timehardlimit.json || true
- name: run standard crawl
run: docker run -v $PWD/output:/output local-zimit zimit --seeds http://website.test.openzim.org/http-return-codes.html --name tests_en_onepage --zim-file tests_en_onepage.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --zimit-progress-file /output/stats.json --statsFilename /output/crawl.json --warc2zim-progress-file /output/warc2zim.json --keep
- name: run integration test suite
run: docker run -v $PWD/tests-integration/integration.py:/app/integration.py -v $PWD/output:/output local-zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v /app/integration.py"
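
The crawl steps above route statistics to separate JSON files under /output (`--zimit-progress-file`, `--statsFilename`, `--warc2zim-progress-file`). A minimal inspection sketch, assuming those files exist after running the commands above and making no assumption about their schema:

#!/usr/bin/env python3
# Sketch: list the top-level keys of the stats/progress JSON files written by
# the standard crawl step above (file names taken from that command).
import json
from pathlib import Path

for name in ("stats.json", "crawl.json", "warc2zim.json"):
    path = Path("output") / name
    if not path.exists():
        print(f"{name}: not found (run the crawl steps first)")
        continue
    data = json.loads(path.read_text())
    summary = ", ".join(sorted(data)) if isinstance(data, dict) else type(data).__name__
    print(f"{name}: {summary}")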

View file

@@ -0,0 +1,45 @@
name: Update ZIMFarm Definitions
on:
push:
branches: [main]
paths:
- "offliner-definition.json"
release:
types: [published]
workflow_dispatch:
inputs:
version:
description: "Version to publish"
required: false
default: "dev"
jobs:
prepare-json:
runs-on: ubuntu-24.04
outputs:
offliner_definition_b64: ${{ steps.read-json.outputs.offliner_definition_b64 }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0
- id: read-json
run: |
if [ ! -f "offliner-definition.json" ]; then
echo "File not found!" >&2
exit 1
fi
json_b64=$(base64 -w0 <<< "$(jq -c . offliner-definition.json)")
echo "offliner_definition_b64=$json_b64" >> $GITHUB_OUTPUT
call-workflow:
needs: prepare-json
uses: openzim/overview/.github/workflows/update-zimfarm-offliner-definition.yaml@main
with:
version: ${{ github.event_name == 'release' && github.event.release.tag_name || (github.event.inputs.version || 'dev') }}
offliner: zimit
offliner_definition_b64: ${{ needs.prepare-json.outputs.offliner_definition_b64 }}
secrets:
zimfarm_ci_secret: ${{ secrets.ZIMFARM_CI_SECRET }}
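
The prepare-json job compacts offliner-definition.json with jq and base64-encodes it so the definition can travel as a single workflow output. A local sanity-check sketch in Python (not byte-identical to the jq/base64 pipeline above, which also picks up a trailing newline from the here-string, but enough to confirm the payload round-trips):

#!/usr/bin/env python3
# Sketch: reproduce the prepare-json encoding locally and verify it decodes
# back to the same JSON document (assumes offliner-definition.json is present).
import base64
import json
from pathlib import Path

definition = json.loads(Path("offliner-definition.json").read_text())
compact = json.dumps(definition, separators=(",", ":"))
payload = base64.b64encode(compact.encode()).decode()

print(f"offliner_definition_b64={payload[:60]}...")
assert json.loads(base64.b64decode(payload)) == definition  # round-trips cleanly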

364
.gitignore vendored
View file

@@ -1,4 +1,362 @@
*.pyc
__pycache__
*.zim
# Created by https://www.toptal.com/developers/gitignore/api/linux,macos,python,visualstudiocode,intellij
# Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,python,visualstudiocode,intellij
### Intellij ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# AWS User-specific
.idea/**/aws.xml
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# SonarLint plugin
.idea/sonarlint/
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### Intellij Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr
# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/
# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml
# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/
# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$
# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml
# Azure Toolkit for IntelliJ plugin
# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
.idea/**/azureSettings.xml
### Linux ###
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### macOS Patch ###
# iCloud generated files
*.icloud
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
# LSP config files
pyrightconfig.json
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
# End of https://www.toptal.com/developers/gitignore/api/linux,macos,python,visualstudiocode,intellij
# output dir
output
# ignore all vscode, this editor specific, not maintained by openzim
.vscode

27
.pre-commit-config.yaml Normal file
View file

@@ -0,0 +1,27 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- repo: https://github.com/psf/black
rev: "25.1.0"
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.4
hooks:
- id: ruff
- repo: https://github.com/RobertCraigie/pyright-python
rev: v1.1.393
hooks:
- id: pyright
name: pyright (system)
description: 'pyright static type checker'
entry: pyright
language: system
'types_or': [python, pyi]
require_serial: true
minimum_pre_commit_version: '2.9.2'

409
CHANGELOG.md Normal file
View file

@@ -0,0 +1,409 @@
## Changelog
All notable changes to this project are documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0).
## [Unreleased]
### Added
- Added `--overwrite` flag to overwrite existing ZIM file if it exists (#399)
### Changed
- Fix issues preventing interrupted crawls from being resumed. (#499)
- Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist.
- Use all warc_dirs found instead of just the latest so interrupted crawls use all collected pages across runs when an explicit collections directory is not passed.
- Don't cleanup an explicitly passed build directory.
## [3.0.5] - 2025-04-11
### Changed
- Upgrade to browsertrix crawler 1.6.0 (#493)
## [3.0.4] - 2025-04-04
### Changed
- Upgrade to browsertrix crawler 1.5.10 (#491)
## [3.0.3] - 2025-02-28
### Changed
- Upgrade to browsertrix crawler 1.5.7 (#483)
## [3.0.2] - 2025-02-27
### Changed
- Upgrade to browsertrix crawler 1.5.6 (#482)
## [3.0.1] - 2025-02-24
### Changed
- Upgrade to browsertrix crawler 1.5.4 (#476)
## [3.0.0] - 2025-02-17
### Changed
- Change solution to report partial ZIM to the Zimfarm and other clients (#304)
- Keep temporary folder when crawler or warc2zim fails, even if not asked for (#468)
- Add many missing Browsertrix Crawler arguments; drop default overrides by zimit; drop `--noMobileDevice` setting (not needed anymore) (#433)
- Document all Browsertrix Crawler default arguments values (#416)
- Use preferred Browsertrix Crawler arguments names: (part of #471)
- `--seeds` instead of `--url`
- `--seedFile` instead of `--urlFile`
- `--pageLimit` instead of `--limit`
- `--pageLoadTimeout` instead of `--timeout`
- `--scopeIncludeRx` instead of `--include`
- `--scopeExcludeRx` instead of `--exclude`
- `--pageExtraDelay` instead of `--delay`
- Remove confusion between zimit, warc2zim and crawler stats filenames (part of #471)
- `--statsFilename` is now the crawler stats file (since it is the same name, just like other arguments)
- `--zimit-progress-file` is now the zimit stats location
- `--warc2zim-progress-file` is the warc2zim stats location
- all are optional values; if not set and needed, temporary files are used
### Fixed
- Do not create the ZIM when crawl is incomplete (#444)
## [2.1.8] - 2025-02-07
### Changed
- Upgrade to browsertrix crawler 1.5.1, Python 3.13 and others (#462 + #464)
## [2.1.7] - 2025-01-10
### Changed
- Upgrade to browsertrix crawler 1.4.2 (#450)
- Upgrade to warc2zim 2.2.0
## [2.1.6] - 2024-11-07
### Changed
- Upgrade to browsertrix crawler 1.3.5 (#426)
## [2.1.5] - 2024-11-01
### Changed
- Upgrade to browsertrix crawler 1.3.4 and warc2zim 2.1.3 (#424)
## [2.1.4] - 2024-10-11
### Changed
- Upgrade to browsertrix crawler 1.3.3 (#411)
## [2.1.3] - 2024-10-08
### Changed
- Upgrade to browsertrix crawler 1.3.2, warc2zim 2.1.2 and other dependencies (#406)
### Fixed
- Fix help (#393)
## [2.1.2] - 2024-09-09
### Changed
- Upgrade to browsertrix crawler 1.3.0-beta.1 (#387) (fixes "Ziming a website with huge assets (e.g. PDFs) is failing to proceed" - #380)
## [2.1.1] - 2024-09-05
### Added
- Add support for uncompressed tar archive in --warcs (#369)
### Changed
- Upgrade to browsertrix crawler 1.3.0-beta.0 (#379), including upgrade to Ubuntu Noble (#307)
### Fixed
- Stream files downloads to not exhaust memory (#373)
- Fix documentation on `--diskUtilization` setting (#375)
## [2.1.0] - 2024-08-09
### Added
- Add `--custom-behaviors` argument to support path/HTTP(S) URL custom behaviors to pass to the crawler (#313)
- Add daily automated end-to-end tests of a page with Youtube player (#330)
- Add `--warcs` option to directly process WARC files (#301)
### Changed
- Make it clear that `--profile` argument can be an HTTP(S) URL (and not only a path) (#288)
- Fix README imprecisions + add back warc2zim availability in docker image (#314)
- Enhance integration test to assert final content of the ZIM (#287)
- Stop fetching and passing browsertrix crawler version as scraperSuffix to warc2zim (#354)
- Do not log number of WARC files found (#357)
- Upgrade dependencies (warc2zim 2.1.0)
### Fixed
- Sort WARC directories found by modification time (#366)
## [2.0.6] - 2024-08-02
### Changed
- Upgraded Browsertrix Crawler to 1.2.6
## [2.0.5] - 2024-07-24
### Changed
- Upgraded Browsertrix Crawler to 1.2.5
- Upgraded warc2zim to 2.0.3
## [2.0.4] - 2024-07-15
### Changed
- Upgraded Browsertrix Crawler to 1.2.4 (fixes "retrieve automatically the assets present in a data-xxx tag" #316)
## [2.0.3] - 2024-06-24
### Changed
- Upgraded Browsertrix Crawler to 1.2.0 (fixes Youtube videos issue #323)
## [2.0.2] - 2024-06-18
### Changed
- Upgrade dependencies (mainly warc2zim 2.0.2)
## [2.0.1] - 2024-06-13
### Changed
- Upgrade dependencies (especially warc2zim 2.0.1 and browsertrix crawler 1.2.0-beta.0) (#318)
### Fixed
- Crawler is not correctly checking disk size / usage (#305)
## [2.0.0] - 2024-06-04
### Added
- New `--version` flag to display Zimit version (#234)
- New `--logging` flag to adjust Browsertrix Crawler logging (#273)
- Use new `--scraper-suffix` flag of warc2zim to enhance ZIM "Scraper" metadata (#275)
- New `--noMobileDevice` CLI argument
- Publish Docker image for `linux/arm64` (in addition to `linux/amd64`) (#178)
### Changed
- **Use `warc2zim` version 2**, which works without Service Worker anymore (#193)
- Upgraded Browsertrix Crawler to 1.1.3
- Adopt Python bootstrap conventions
- Upgrade to Python 3.12 + upgrade dependencies
- Removed handling of redirects by zimit; they are handled by browsertrix crawler and detected properly by warc2zim (#284)
- Drop initial check of URL in Python (#256)
- `--userAgent` CLI argument overrides again the `--userAgentSuffix` and `--adminEmail` values
- `--userAgent` CLI argument is not mandatory anymore
### Fixed
- Fix support for Youtube videos (#291)
- Fix crawler `--waitUntil` values (#289)
## [1.6.3] - 2024-01-18
### Changed
- Adapt to new `warc2zim` code structure
- Using browsertrix-crawler 0.12.4
- Using warc2zim 1.5.5
### Added
- New `--build` parameter (optional) to specify the directory holding Browsertrix files; if not set, `--output`
directory is used; zimit creates one subdir of this folder per invocation to isolate datasets; subdir is kept only
if `--keep` is set.
### Fixed
- `--collection` parameter was not working (#252)
## [1.6.2] - 2023-11-17
### Changed
- Using browsertrix-crawler 0.12.3
### Fixed
- Fix logic passing args to crawler to support value '0' (#245)
- Fix documentation about Chrome and headless (#248)
## [1.6.1] - 2023-11-06
### Changed
- Using browsertrix-crawler 0.12.1
## [1.6.0] - 2023-11-02
### Changed
- Scraper fails for all HTTP error codes returned when checking URL at startup (#223)
- User-Agent now has a default value (#228)
- Manipulation of spaces with UA suffix and adminEmail has been modified
- Same User-Agent is used for check_url (Python) and Browsertrix crawler (#227)
- Using browsertrix-crawler 0.12.0
## [1.5.3] - 2023-10-02
### Changed
- Using browsertrix-crawler 0.11.2
## [1.5.2] - 2023-09-19
### Changed
- Using browsertrix-crawler 0.11.1
## [1.5.1] - 2023-09-18
### Changed
- Using browsertrix-crawler 0.11.0
- Scraper stat file is not created empty (#211)
- Crawler statistics are not available anymore (#213)
- Using warc2zim 1.5.4
## [1.5.0] - 2023-08-23
### Added
- `--long-description` param
## [1.4.1] - 2023-08-23
### Changed
- Using browsertrix-crawler 0.10.4
- Using warc2zim 1.5.3
## [1.4.0] - 2023-08-02
### Added
- `--title` to set ZIM title
- `--description` to set ZIM description
- New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization`
- `--zim-lang` param to set warc2zim's `--lang` (ISO-639-3)
### Changed
- Using browsertrix-crawler 0.10.2
- Default and accepted values for `--waitUntil` from crawler's update
- Using warc2zim 1.5.2
- Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172)
- `--failOnFailedSeed` used unconditionally
- `--lang` now passed to crawler (ISO-639-1)
### Removed
- `--newContext` from crawler's update
## [1.3.1] - 2023-02-06
### Changed
- Using browsertrix-crawler 0.8.0
- Using warc2zim version 1.5.1 with wabac.js 2.15.2
## [1.3.0] - 2023-02-02
### Added
- Initial URL check normalizes homepage redirects to standard ports – 80/443 (#137)
### Changed
- Using warc2zim version 1.5.0 with scope conflict fix and videos fix
- Using browsertrix-crawler 0.8.0-beta.1
- Fixed `--allowHashUrls` being a boolean param
- Increased `check_url` timeout (12s to connect, 27s to read) instead of 10s
## [1.2.0] - 2022-06-21
### Added
- `--urlFile` browsertrix crawler parameter
- `--depth` browsertrix crawler parameter
- `--extraHops` parameter
- `--collection` browsertrix crawler parameter
- `--allowHashUrls` browsertrix crawler parameter
- `--userAgentSuffix` browsertrix crawler parameter
- `--behaviors` parameter
- `--behaviorTimeout` browsertrix crawler parameter
- `--profile` browsertrix crawler parameter
- `--sizeLimit` browsertrix crawler parameter
- `--timeLimit` browsertrix crawler parameter
- `--healthCheckPort` parameter
- `--overwrite` parameter
### Changed
- using browsertrix-crawler `0.6.0` and warc2zim `1.4.2`
- default WARC location after crawl changed
from `collections/capture-*/archive/` to `collections/crawl-*/archive/`
### Removed
- `--scroll` browsertrix crawler parameter (see `--behaviors`)
- `--scope` browsertrix crawler parameter (see `--scopeType`, `--include` and `--exclude`)
## [1.1.5]
- using crawler 0.3.2 and warc2zim 1.3.6
## [1.1.4]
- Defaults to `load,networkidle0` for waitUntil param (same as crawler)
- Allows setting combinations of values for waitUntil param
- Updated warc2zim to 1.3.5
- Updated browsertrix-crawler to 0.3.1
- Warc to zim now written to `{temp_root_dir}/collections/capture-*/archive/` where
`capture-*` is dynamic and includes the datetime. (from browsertrix-crawler)
## [1.1.3]
- allows same first-level-domain redirects
- fixed redirects to URL in scope
- updated crawler to 0.2.0
- `statsFilename` now informs whether limit was hit or not
## [1.1.2]
- added support for --custom-css
- added domains block list (default)
## [1.1.1]
- updated browsertrix-crawler to 0.1.4
- autofetcher script to be injected by defaultDriver to capture srcsets + URLs in dynamically added stylesheets
## [1.0]
- initial version using browsertrix-crawler:0.1.3 and warc2zim:1.3.3

49
Dockerfile Normal file
View file

@@ -0,0 +1,49 @@
FROM webrecorder/browsertrix-crawler:1.6.0
LABEL org.opencontainers.image.source=https://github.com/openzim/zimit
# add deadsnakes ppa for latest Python on Ubuntu
RUN add-apt-repository ppa:deadsnakes/ppa -y
RUN apt-get update \
&& apt-get install -qqy --no-install-recommends \
libmagic1 \
python3.13-venv \
&& rm -rf /var/lib/apt/lists/* \
# python setup (in venv not to conflict with browsertrix)
&& python3.13 -m venv /app/zimit \
# placeholder (default output location)
&& mkdir -p /output \
# disable chrome upgrade
&& printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
# download list of bad domains to filter out. intentionally run post-install \
# so it's not cached in earlier layers (url stays same but content updated) \
&& mkdir -p /tmp/ads \
&& cd /tmp/ads \
&& curl -L -O https://hosts.anudeep.me/mirror/adservers.txt \
&& curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt \
&& curl -L -O https://hosts.anudeep.me/mirror/facebook.txt \
&& cat ./*.txt > /etc/blocklist.txt \
&& rm ./*.txt \
&& printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh \
&& chmod +x /usr/local/bin/entrypoint.sh
# Copy pyproject.toml and its dependencies
COPY pyproject.toml README.md /src/
COPY src/zimit/__about__.py /src/src/zimit/__about__.py
# Install Python dependencies
RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src
# Copy code + associated artifacts
COPY src /src/src
COPY *.md /src/
# Install + cleanup
RUN . /app/zimit/bin/activate && python -m pip install --no-cache-dir /src \
&& ln -s /app/zimit/bin/zimit /usr/bin/zimit \
&& ln -s /app/zimit/bin/warc2zim /usr/bin/warc2zim \
&& chmod +x /usr/bin/zimit \
&& rm -rf /src
ENTRYPOINT ["entrypoint.sh"]
CMD ["zimit", "--help"]

674
LICENSE Normal file
View file

@@ -0,0 +1,674 @@
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<https://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<https://www.gnu.org/licenses/why-not-lgpl.html>.

90
README.md Normal file

@ -0,0 +1,90 @@
Zimit
=====
Zimit is a scraper that creates a [ZIM file](https://en.wikipedia.org/wiki/ZIM_(file_format)) from any website.
[![CodeFactor](https://www.codefactor.io/repository/github/openzim/zimit/badge)](https://www.codefactor.io/repository/github/openzim/zimit)
[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0)
[![Docker](https://ghcr-badge.egpl.dev/openzim/zimit/latest_tag?label=docker)](https://ghcr.io/openzim/zimit)
Zimit adheres to openZIM's [Contribution Guidelines](https://github.com/openzim/overview/wiki/Contributing).
Zimit has implemented openZIM's [Python bootstrap, conventions and policies](https://github.com/openzim/_python-bootstrap/blob/main/docs/Policy.md) **v1.0.1**.
Capabilities and known limitations
--------------------
While we would like to support as many websites as possible, making an offline archive of any website with a versatile tool obviously has some limitations.
Most capabilities and known limitations are documented in [warc2zim README](https://github.com/openzim/warc2zim/blob/main/README.md). There are also some limitations in Browsertrix Crawler (used to fetch the website) and wombat (used to properly replay dynamic web requests), but these are not (yet?) clearly documented.
Technical background
--------------------
Zimit runs a fully automated browser-based crawl of a website property and produces a ZIM of the crawled content. Zimit runs in a Docker container.
The system:
- runs a website crawl with [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler), which produces WARC files
- converts the crawled WARC files to a single ZIM using [warc2zim](https://github.com/openzim/warc2zim)
`zimit.py` is the entrypoint for the system.
After the crawl is done, warc2zim is used to write a ZIM to the `/output` directory, which should be mounted as a volume so the created ZIM is not lost when the container stops.
With the `--keep` flag, the crawled WARCs and a few other artifacts are also kept in a temporary directory inside `/output`.
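For example, a minimal sketch of such a run, assuming a host folder `./output` and a placeholder URL and name, could be:
```bash
# mount a host folder as /output so the ZIM (and, with --keep, the WARCs) survive the container
docker run -v $(pwd)/output:/output ghcr.io/openzim/zimit \
  zimit --seeds https://example.com --name example --keep
```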
Usage
-----
`zimit` is intended to be run in Docker. The Docker image is published at https://github.com/orgs/openzim/packages/container/package/zimit.
The image accepts the following parameters, **as well as any of the [Browsertrix crawler](https://crawler.docs.browsertrix.com/user-guide/cli-options/) and [warc2zim](https://github.com/openzim/warc2zim) ones**:
- Required: `--seeds URL` - the URL to start crawling from; multiple URLs can be separated by a comma (even if **usually not needed**, these are just the **seeds** of the crawl); the first seed URL is used as the ZIM homepage
- Required: `--name` - Name of ZIM file
- `--output` - output directory (defaults to `/output`)
- `--pageLimit U` - Limit capture to at most U URLs
- `--scopeExcludeRx <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times. An example is `--scopeExcludeRx="(\?q=|signup-landing\?|\?cid=)"`, where URLs that contain either `?q=` or `signup-landing?` or `?cid=` will be excluded.
- `--workers N` - number of crawl workers to be run in parallel
- `--waitUntil` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--waitUntil domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
- `--keep` - in case of failure, WARC files and other temporary files (which are stored as a subfolder of the output directory) are always kept, otherwise they are automatically deleted. Use this flag to always keep WARC files, even in case of success.
Example command:
```bash
docker run ghcr.io/openzim/zimit zimit --help
docker run ghcr.io/openzim/zimit warc2zim --help
docker run -v /output:/output ghcr.io/openzim/zimit zimit --seeds URL --name myzimfile
```
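For reference, a fuller invocation combining several of the flags above might look like the following (the URL, name and limits are illustrative placeholders):
```bash
docker run -v $(pwd)/output:/output ghcr.io/openzim/zimit zimit \
  --seeds https://www.example.com \
  --name example \
  --workers 2 \
  --pageLimit 500 \
  --scopeExcludeRx="(\?q=|signup-landing\?|\?cid=)" \
  --waitUntil domcontentloaded
```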
**Note**: the image automatically filters out a large number of ads by using the 3 blocklists from [anudeepND](https://github.com/anudeepND/blacklist). If you don't want this filtering, disable the image's entrypoint in your container (`docker run --entrypoint="" ghcr.io/openzim/zimit ...`).
To rebuild the Docker image locally, run:
```bash
docker build -t ghcr.io/openzim/zimit .
```
FAQ
---
The Zimit contributors' team maintains [a page with the most Frequently Asked Questions](https://github.com/openzim/zimit/wiki/Frequently-Asked-Questions).
Nota bene
---------
While Zimit 1.x relied on a Service Worker to display the ZIM content, this is no longer the case
since Zimit 2.x, which has no special requirements anymore.
It should also be noted that a first version of a generic HTTP scraper was created in 2016 during
the [Wikimania Esino Lario
Hackathon](https://wikimania2016.wikimedia.org/wiki/Programme/Kiwix-dedicated_Hackathon).
That version is now considered outdated and is [archived in the `2016`
branch](https://github.com/openzim/zimit/tree/2016).
License
-------
[GPLv3](https://www.gnu.org/licenses/gpl-3.0) or later, see
[LICENSE](LICENSE) for more details.


@ -1,246 +0,0 @@
#####################################
Create ZIM files out of HTTP websites
#####################################
This project provides an API and a user interface to convert any
website into a Zim file.
Exposed API
###########
All APIs talk JSON over HTTP. As such, all parameters should be sent as
stringified JSON and the Content-Type header should be set to "application/json".
POST /website-zim
=================
By posting to this endpoint, you ask the system to start a new download
of a website and its conversion into the Zim format.
Required parameters
-------------------
- **url**: URL of the website to be crawled
- **title**: Title that will be used in the created Zim file
- **email**: Email address that will be notified when the creation of the file is complete
Optional parameters
-------------------
- **language**: An `ISO 639-3 <https://en.wikipedia.org/wiki/ISO_639-3>`_ code
representing the language
- **welcome**: the page that will be shown first in the Zim file
- **description**: The description that will be embedded in the Zim file
- **author**: The author of the content
Return values
-------------
- **job_id**: The job id is returned in JSON format. It can be used to know the
status of the process.
Status codes
------------
- `400 Bad Request` will be returned if you do not respect the
expected inputs. In case of error, have a look at the body of the response:
it contains information about what is missing.
- `201 Created` will be returned if the process started.
Example
-------
::
$ http POST http://0.0.0.0:6543/website-zim url="https://refugeeinfo.eu/" title="Refugee Info" email="alexis@notmyidea.org"
HTTP/1.1 201 Created
{
"job": "5012abe3-bee2-4dd7-be87-39a88d76035d"
}
GET /status/{jobid}
===================
Retrieves the status of a job and displays the associated logs.
Return values
-------------
- **status**: The status of the job; one of 'queued', 'finished',
'failed', 'started' and 'deferred'.
- **log**: The logs of the job.
Status codes
------------
- `404 Not Found` will be returned in case the requested job does not exist.
- `200 OK` will be returned in any other case.
Example
-------
::
http GET http://0.0.0.0:6543/status/5012abe3-bee2-4dd7-be87-39a88d76035d
HTTP/1.1 200 OK
{
"log": "<snip>",
"status": "finished"
}
Okay, so how do I install it on my server?
##########################################
Currently, the best way to install it is by retrieving the sources from github
::
$ git clone https://github.com/almet/zimit.git
$ cd zimit
Create a virtual environment and install the project in it::
$ virtualenv venv
$ venv/bin/pip install -e .
Then, run it how you want, for instance with pserve::
$ venv/bin/pserve zimit.ini
In a separate process, you also need to run the worker::
$ venv/bin/rqworker
And you're ready to go. To test it::
$ http POST http://0.0.0.0:6543/website-zim url="https://refugeeinfo.eu/" title="Refugee Info" email="alexis@notmyidea.org"
Debian dependencies
####################
Installing the dependencies
===========================
::
sudo apt-get install httrack libzim-dev libmagic-dev liblzma-dev libz-dev build-essential libtool libgumbo-dev redis-server automake pkg-config
Installing zimwriterfs
======================
::
git clone https://github.com/wikimedia/openzim.git
cd openzim/zimwriterfs
./autogen.sh
./configure
make
Then update the path to the zimwriterfs executable in zimit.ini
::
$ rqworker & pserve zimit.ini
How to deploy?
##############
There are multiple ways to deploy such a service, so I'll describe how I do it
with my own best practices.
First of all, get all the dependencies and the code. I like to have everything
available in /home/www, so let's assume this is the case here::
$ mkdir /home/www/zimit.notmyidea.org
$ cd /home/www/zimit.notmyidea.org
$ git clone https://github.com/almet/zimit.git
Then, you can change the configuration by creating a new configuration file::
$ cd zimit
$ cp zimit.ini local.ini
From there, you need to update the configuration to point to the correct
binaries and locations.
Nginx configuration
===================
::
# the upstream component nginx needs to connect to
upstream zimit_upstream {
server unix:///tmp/zimit.sock;
}
# configuration of the server
server {
listen 80;
listen [::]:80;
server_name zimit.ideascube.org;
charset utf-8;
client_max_body_size 200M;
location /zims {
alias /home/ideascube/zimit.ideascube.org/zims/;
autoindex on;
}
# Finally, send all non-media requests to the Pyramid server.
location / {
uwsgi_pass zimit_upstream;
include /var/ideascube/uwsgi_params;
}
}
UWSGI configuration
===================
::
[uwsgi]
uid = ideascube
gid = ideascube
chdir = /home/ideascube/zimit.ideascube.org/zimit/
ini = /home/ideascube/zimit.ideascube.org/zimit/local.ini
# the virtualenv (full path)
home = /home/ideascube/zimit.ideascube.org/venv/
# process-related settings
# master
master = true
# maximum number of worker processes
processes = 4
# the socket (use the full path to be safe)
socket = /tmp/zimit.sock
# ... with appropriate permissions - may be needed
chmod-socket = 666
# stats = /tmp/ideascube.stats.sock
# clear environment on exit
vacuum = true
plugins = python
supervisord configuration
=========================
::
[program:zimit-worker]
command=/home/ideascube/zimit.ideascube.org/venv/bin/rqworker
directory=/home/ideascube/zimit.ideascube.org/zimit/
user=www-data
autostart=true
autorestart=true
redirect_stderr=true
That's it!


@ -1,24 +0,0 @@
try:
import ConfigParser as configparser
except ImportError:
import configparser
import logging.config
import os
from zimit import main
here = os.path.dirname(__file__)
ini_path = os.environ.get('ZIMIT_INI')
if ini_path is None:
ini_path = os.path.join(here, 'local.ini')
# Set up logging
logging.config.fileConfig(ini_path)
# Parse config and create WSGI app
config = configparser.ConfigParser()
config.read(ini_path)
application = main(config.items('DEFAULT'), **dict(config.items('app:main')))


@ -1 +0,0 @@
.alertify-logs>*{padding:12px 24px;color:#fff;box-shadow:0 2px 5px 0 rgba(0,0,0,.2);border-radius:1px}.alertify-logs>*,.alertify-logs>.default{background:rgba(0,0,0,.8)}.alertify-logs>.error{background:rgba(244,67,54,.8)}.alertify-logs>.success{background:rgba(76,175,80,.9)}.alertify{position:fixed;background-color:rgba(0,0,0,.3);left:0;right:0;top:0;bottom:0;width:100%;height:100%;z-index:1}.alertify.hide{opacity:0;pointer-events:none}.alertify,.alertify.show{box-sizing:border-box;transition:all .33s cubic-bezier(.25,.8,.25,1)}.alertify,.alertify *{box-sizing:border-box}.alertify .dialog{padding:12px}.alertify .alert,.alertify .dialog{width:100%;margin:0 auto;position:relative;top:50%;transform:translateY(-50%)}.alertify .alert>*,.alertify .dialog>*{width:400px;max-width:95%;margin:0 auto;text-align:center;padding:12px;background:#fff;box-shadow:0 2px 4px -1px rgba(0,0,0,.14),0 4px 5px 0 rgba(0,0,0,.098),0 1px 10px 0 rgba(0,0,0,.084)}.alertify .alert .msg,.alertify .dialog .msg{padding:12px;margin-bottom:12px;margin:0;text-align:left}.alertify .alert input:not(.form-control),.alertify .dialog input:not(.form-control){margin-bottom:15px;width:100%;font-size:100%;padding:12px}.alertify .alert input:not(.form-control):focus,.alertify .dialog input:not(.form-control):focus{outline-offset:-2px}.alertify .alert nav,.alertify .dialog nav{text-align:right}.alertify .alert nav button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button),.alertify .dialog nav button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button){background:transparent;box-sizing:border-box;color:rgba(0,0,0,.87);position:relative;outline:0;border:0;display:inline-block;-ms-flex-align:center;-ms-grid-row-align:center;align-items:center;padding:0 6px;margin:6px 8px;line-height:36px;min-height:36px;white-space:nowrap;min-width:88px;text-align:center;text-transform:uppercase;font-size:14px;text-decoration:none;cursor:pointer;border:1px solid transparent;border-radius:2px}.alertify .alert nav button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button):active,.alertify .alert nav button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button):hover,.alertify .dialog nav button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button):active,.alertify .dialog nav button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button):hover{background-color:rgba(0,0,0,.05)}.alertify .alert nav button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button):focus,.alertify .dialog nav button:not(.btn):not(.pure-button):not(.md-button):not(.mdl-button):focus{border:1px solid rgba(0,0,0,.1)}.alertify .alert nav button.btn,.alertify .dialog nav button.btn{margin:6px 4px}.alertify-logs{position:fixed;z-index:1}.alertify-logs.bottom,.alertify-logs:not(.top){bottom:16px}.alertify-logs.left,.alertify-logs:not(.right){left:16px}.alertify-logs.left>*,.alertify-logs:not(.right)>*{float:left;transform:translateZ(0);height:auto}.alertify-logs.left>.show,.alertify-logs:not(.right)>.show{left:0}.alertify-logs.left>*,.alertify-logs.left>.hide,.alertify-logs:not(.right)>*,.alertify-logs:not(.right)>.hide{left:-110%}.alertify-logs.right{right:16px}.alertify-logs.right>*{float:right;transform:translateZ(0)}.alertify-logs.right>.show{right:0;opacity:1}.alertify-logs.right>*,.alertify-logs.right>.hide{right:-110%;opacity:0}.alertify-logs.top{top:0}.alertify-logs>*{box-sizing:border-box;transition:all .4s 
cubic-bezier(.25,.8,.25,1);position:relative;clear:both;backface-visibility:hidden;perspective:1000;max-height:0;margin:0;padding:0;overflow:hidden;opacity:0;pointer-events:none}.alertify-logs>.show{margin-top:12px;opacity:1;max-height:1000px;padding:12px;pointer-events:auto}

File diff suppressed because one or more lines are too long

7523
app/assets/bootstrap.css vendored

File diff suppressed because it is too large


@ -1,84 +0,0 @@
<!DOCTYPE html>
<head>
</head>
<link rel="stylesheet" href="./assets/bootstrap.css">
<link rel="stylesheet" href="./assets/alertify.css">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<title>Zimit — Create a zim archive out of a website URL</title>
<meta charset="utf-8" />
<body>
<div class="navbar navbar-default navbar-static-top">
<div class="container">
<div class="navbar-header">
<a class="navbar-brand" href="#">Zim it!</a>
</div>
<div class="navbar-collapse collapse">
<ul class="nav navbar-nav navbar-right">
<li><a href="http://www.openzim.org/wiki/Mission">Our values</a></li>
</ul>
</div>
</div>
</div>
<div class="container">
<form action="#" id="zimcreator" onSubmit="submitForm()">
<div class="form-group field field-object">
<fieldset>
<div class="form-group field field-string">
<label class="control-label" for="url">Website URL</label>
<input id="url" label="Website URL" placeholder="https://google.com" class="form-control" type="url">
</div>
<div class="form-group field field-string">
<label class="control-label" for="url">Zim Title</label>
<input id="title" label="Website URL" placeholder="A great website" class="form-control" type="text">
</div>
<div class="form-group field field-string">
<label class="control-label" for="url">Enter an email to be notified when this is finished</label>
<input id="email" label="Email" placeholder="john@doe.com" class="form-control" type="email">
</div>
</fieldset>
</div>
<p>
<button type="submit" class="btn btn-info">Create the Zim file!</button>
</p>
</form>
<p>
This is a <a href="http://www.openzim.org/wiki/OpenZIM">Zim</a> creator. Enter the <em>url</em> of the website you want to turn into a zim file, a <em>title</em> and click on <em>Create zim File</em>
</p>
<p>Enjoy !</p>
</div>
<script src="./assets/alertify.js"></script>
<script type="text/javascript">
function getField(field) {
return document.forms['zimcreator'].elements[field].value;
}
function submitForm() {
var content = {
url: getField('url'),
title: getField('title'),
email: getField('email'),
}
fetch("/website-zim", {
method: "POST",
body: JSON.stringify(content),
headers: {'Content-Type': 'application/json'}
}).then(function (result) {
if (result.status >= 400) {
alertify.error("The server wasn't able to start the job, please check your inputs.");
} else {
alertify.success("The job has been submitted! You'll receive an email when it's finished.");
}
})
.catch(function (error) {
alertify.error("Sorry, we weren't able to join the server. This is usually due to connectivity issues.");
});
return false;
}
</script>
</body>

Binary file not shown.


981
offliner-definition.json Normal file

@ -0,0 +1,981 @@
{
"offliner_id": "zimit",
"stdOutput": true,
"stdStats": "zimit-progress-file",
"flags": {
"seeds": {
"type": "string",
"required": false,
"title": "Seeds",
"description": "The seed URL(s) to start crawling from. Multile seed URL must be separated by a comma (usually not needed, these are just the crawl seeds). First seed URL is used as ZIM homepage"
},
"seed_file": {
"type": "string",
"required": false,
"title": "Seed File",
"description": "If set, read a list of seed urls, one per line. HTTPS URL to an online file."
},
"lang": {
"type": "string",
"required": false,
"title": "Browser Language",
"description": "If set, sets the language used by the browser, should be ISO 639 language[-country] code, e.g. `en` or `en-GB`"
},
"title": {
"type": "string",
"required": false,
"title": "Title",
"description": "Custom title for your ZIM. Defaults to title of main page",
"minLength": 1,
"maxLength": 30
},
"description": {
"type": "string",
"required": false,
"title": "Description",
"description": "Description for ZIM",
"minLength": 1,
"maxLength": 80
},
"favicon": {
"type": "blob",
"kind": "image",
"required": false,
"title": "Illustration",
"description": "URL for Illustration. "
},
"tags": {
"type": "string",
"required": false,
"title": "ZIM Tags",
"description": "Single string with individual tags separated by a semicolon."
},
"creator": {
"type": "string",
"required": false,
"title": "Creator",
"description": "Name of content creator"
},
"publisher": {
"type": "string",
"required": false,
"title": "Publisher",
"isPublisher": true,
"description": "Custom publisher name (ZIM metadata). openZIM otherwise"
},
"source": {
"type": "string",
"required": false,
"title": "Source",
"description": "Source name/URL of content"
},
"workers": {
"type": "integer",
"required": false,
"title": "Workers",
"description": "The number of workers to run in parallel. Defaults to 1",
"min": 1
},
"wait_until": {
"type": "string",
"required": false,
"title": "WaitUntil",
"description": "Puppeteer page.goto() condition to wait for before continuing. One of load, domcontentloaded, networkidle0 or networkidle2, or a comma-separated combination of those. Default is load,networkidle2"
},
"extra_hops": {
"type": "integer",
"required": false,
"title": "Extra Hops",
"description": "Number of extra 'hops' to follow, beyond the current scope. Default is 0",
"min": 0
},
"page_limit": {
"type": "integer",
"required": false,
"title": "Page Limit",
"description": "Limit crawl to this number of pages. Default is 0 (no-limit).",
"min": 0
},
"max_page_limit": {
"type": "integer",
"required": false,
"title": "Max Page Limit",
"description": "Maximum pages to crawl, overriding pageLimit if both are set. Default is 0 (no-limit)",
"min": 0
},
"page_load_timeout": {
"type": "integer",
"required": false,
"title": "Page Load Timeout",
"description": "Timeout for each page to load (in seconds). Default is 90",
"min": 0
},
"scope_type": {
"type": "string-enum",
"required": false,
"title": "Scope Type",
"description": "A predfined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx/scopeExcludeRx regexes. Default is custom if scopeIncludeRx is set, prefix otherwise.",
"choices": [
{
"title": "Page",
"value": "page"
},
{
"title": "Page SPA",
"value": "page-spa"
},
{
"title": "Prefix",
"value": "prefix"
},
{
"title": "Host",
"value": "host"
},
{
"title": "Domain",
"value": "domain"
},
{
"title": "Any",
"value": "any"
},
{
"title": "Custom",
"value": "custom"
}
]
},
"scope_include_rx": {
"type": "string",
"required": false,
"title": "Scope Include Regex",
"description": "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of seed)"
},
"scope_exclude_rx": {
"type": "string",
"required": false,
"title": "Scope Exclude Regex",
"description": "Regex of page URLs that should be excluded from the crawl"
},
"allow_hash_urls": {
"type": "boolean",
"required": false,
"title": "Allow Hashtag URLs",
"description": "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content"
},
"mobile_device": {
"type": "string-enum",
"required": false,
"title": "As device",
"description": "Device to crawl as. See Pupeeter's Device.ts for a list",
"choices": [
{
"title": "Blackberry Playbook",
"value": "Blackberry PlayBook"
},
{
"title": "Blackberry Playbook Landscape",
"value": "Blackberry PlayBook landscape"
},
{
"title": "Blackberry Z30",
"value": "BlackBerry Z30"
},
{
"title": "Blackberry Z30 Landscape",
"value": "BlackBerry Z30 landscape"
},
{
"title": "Galaxy Note 3",
"value": "Galaxy Note 3"
},
{
"title": "Galaxy Note 3 Landscape",
"value": "Galaxy Note 3 landscape"
},
{
"title": "Galaxy Note II",
"value": "Galaxy Note II"
},
{
"title": "Galaxy Note II Landscape",
"value": "Galaxy Note II landscape"
},
{
"title": "Galaxy S III",
"value": "Galaxy S III"
},
{
"title": "Galaxy S III Landscape",
"value": "Galaxy S III landscape"
},
{
"title": "Galaxy S5",
"value": "Galaxy S5"
},
{
"title": "Galaxy S5 Landscape",
"value": "Galaxy S5 landscape"
},
{
"title": "Galaxy S8",
"value": "Galaxy S8"
},
{
"title": "Galaxy S8 Landscape",
"value": "Galaxy S8 landscape"
},
{
"title": "Galaxy S9 Plus",
"value": "Galaxy S9+"
},
{
"title": "Galaxy S9 Plus Landscape",
"value": "Galaxy S9+ landscape"
},
{
"title": "Galaxy Tab S4",
"value": "Galaxy Tab S4"
},
{
"title": "Galaxy Tab S4 Landscape",
"value": "Galaxy Tab S4 landscape"
},
{
"title": "iPad",
"value": "iPad"
},
{
"title": "iPad Landscape",
"value": "iPad landscape"
},
{
"title": "iPad Gen 6",
"value": "iPad (gen 6)"
},
{
"title": "iPad Gen 6 Landscape",
"value": "iPad (gen 6) landscape"
},
{
"title": "iPad Gen 7",
"value": "iPad (gen 7)"
},
{
"title": "iPad Gen 7 Landscape",
"value": "iPad (gen 7) landscape"
},
{
"title": "iPad Mini",
"value": "iPad Mini"
},
{
"title": "iPad Mini Landscape",
"value": "iPad Mini landscape"
},
{
"title": "iPad Pro",
"value": "iPad Pro"
},
{
"title": "iPad Pro Landscape",
"value": "iPad Pro landscape"
},
{
"title": "iPad Pro 11",
"value": "iPad Pro 11"
},
{
"title": "iPad Pro 11 Landscape",
"value": "iPad Pro 11 landscape"
},
{
"title": "iPhone 4",
"value": "iPhone 4"
},
{
"title": "iPhone 4 Landscape",
"value": "iPhone 4 landscape"
},
{
"title": "iPhone 5",
"value": "iPhone 5"
},
{
"title": "iPhone 5 Landscape",
"value": "iPhone 5 landscape"
},
{
"title": "iPhone 6",
"value": "iPhone 6"
},
{
"title": "iPhone 6 Landscape",
"value": "iPhone 6 landscape"
},
{
"title": "iPhone 6 Plus",
"value": "iPhone 6 Plus"
},
{
"title": "iPhone 6 Plus Landscape",
"value": "iPhone 6 Plus landscape"
},
{
"title": "iPhone 7",
"value": "iPhone 7"
},
{
"title": "iPhone 7 Landscape",
"value": "iPhone 7 landscape"
},
{
"title": "iPhone 7 Plus",
"value": "iPhone 7 Plus"
},
{
"title": "iPhone 7 Plus Landscape",
"value": "iPhone 7 Plus landscape"
},
{
"title": "iPhone 8",
"value": "iPhone 8"
},
{
"title": "iPhone 8 Landscape",
"value": "iPhone 8 landscape"
},
{
"title": "iPhone 8 Plus",
"value": "iPhone 8 Plus"
},
{
"title": "iPhone 8 Plus Landscape",
"value": "iPhone 8 Plus landscape"
},
{
"title": "iPhone SE",
"value": "iPhone SE"
},
{
"title": "iPhone SE Landscape",
"value": "iPhone SE landscape"
},
{
"title": "iPhone X",
"value": "iPhone X"
},
{
"title": "iPhone X Landscape",
"value": "iPhone X landscape"
},
{
"title": "iPhone XR",
"value": "iPhone XR"
},
{
"title": "iPhone XR Landscape",
"value": "iPhone XR landscape"
},
{
"title": "iPhone 11",
"value": "iPhone 11"
},
{
"title": "iPhone 11 Landscape",
"value": "iPhone 11 landscape"
},
{
"title": "iPhone 11 Pro",
"value": "iPhone 11 Pro"
},
{
"title": "iPhone 11 Pro Landscape",
"value": "iPhone 11 Pro landscape"
},
{
"title": "iPhone 11 Pro Max",
"value": "iPhone 11 Pro Max"
},
{
"title": "iPhone 11 Pro Max Landscape",
"value": "iPhone 11 Pro Max landscape"
},
{
"title": "iPhone 12",
"value": "iPhone 12"
},
{
"title": "iPhone 12 Landscape",
"value": "iPhone 12 landscape"
},
{
"title": "iPhone 12 Pro",
"value": "iPhone 12 Pro"
},
{
"title": "iPhone 12 Pro Landscape",
"value": "iPhone 12 Pro landscape"
},
{
"title": "iPhone 12 Pro Max",
"value": "iPhone 12 Pro Max"
},
{
"title": "iPhone 12 Pro Max Landscape",
"value": "iPhone 12 Pro Max landscape"
},
{
"title": "iPhone 12 Mini",
"value": "iPhone 12 Mini"
},
{
"title": "iPhone 12 Mini Landscape",
"value": "iPhone 12 Mini landscape"
},
{
"title": "iPhone 13",
"value": "iPhone 13"
},
{
"title": "iPhone 13 Landscape",
"value": "iPhone 13 landscape"
},
{
"title": "iPhone 13 Pro",
"value": "iPhone 13 Pro"
},
{
"title": "iPhone 13 Pro Landscape",
"value": "iPhone 13 Pro landscape"
},
{
"title": "iPhone 13 Pro Max",
"value": "iPhone 13 Pro Max"
},
{
"title": "iPhone 13 Pro Max Landscape",
"value": "iPhone 13 Pro Max landscape"
},
{
"title": "iPhone 13 Mini",
"value": "iPhone 13 Mini"
},
{
"title": "iPhone 13 Mini Landscape",
"value": "iPhone 13 Mini landscape"
},
{
"title": "Jio Phone 2",
"value": "JioPhone 2"
},
{
"title": "Jio Phone 2 Landscape",
"value": "JioPhone 2 landscape"
},
{
"title": "Kindle Fire HDX",
"value": "Kindle Fire HDX"
},
{
"title": "Kindle Fire HDX Landscape",
"value": "Kindle Fire HDX landscape"
},
{
"title": "LG Optimus L70",
"value": "LG Optimus L70"
},
{
"title": "LG Optimus L70 Landscape",
"value": "LG Optimus L70 landscape"
},
{
"title": "Microsoft Lumia 550",
"value": "Microsoft Lumia 550"
},
{
"title": "Microsoft Lumia 950",
"value": "Microsoft Lumia 950"
},
{
"title": "Microsoft Lumia 950 Landscape",
"value": "Microsoft Lumia 950 landscape"
},
{
"title": "Nexus 10",
"value": "Nexus 10"
},
{
"title": "Nexus 10 Landscape",
"value": "Nexus 10 landscape"
},
{
"title": "Nexus 4",
"value": "Nexus 4"
},
{
"title": "Nexus 4 Landscape",
"value": "Nexus 4 landscape"
},
{
"title": "Nexus 5",
"value": "Nexus 5"
},
{
"title": "Nexus 5 Landscape",
"value": "Nexus 5 landscape"
},
{
"title": "Nexus 5X",
"value": "Nexus 5X"
},
{
"title": "Nexus 5X Landscape",
"value": "Nexus 5X landscape"
},
{
"title": "Nexus 6",
"value": "Nexus 6"
},
{
"title": "Nexus 6 Landscape",
"value": "Nexus 6 landscape"
},
{
"title": "Nexus 6P",
"value": "Nexus 6P"
},
{
"title": "Nexus 6P Landscape",
"value": "Nexus 6P landscape"
},
{
"title": "Nexus 7",
"value": "Nexus 7"
},
{
"title": "Nexus 7 Landscape",
"value": "Nexus 7 landscape"
},
{
"title": "Nokia Lumia 520",
"value": "Nokia Lumia 520"
},
{
"title": "Nokia Lumia 520 Landscape",
"value": "Nokia Lumia 520 landscape"
},
{
"title": "Nokia N9",
"value": "Nokia N9"
},
{
"title": "Nokia N9 Landscape",
"value": "Nokia N9 landscape"
},
{
"title": "Pixel 2",
"value": "Pixel 2"
},
{
"title": "Pixel 2 Landscape",
"value": "Pixel 2 landscape"
},
{
"title": "Pixel 2 XL",
"value": "Pixel 2 XL"
},
{
"title": "Pixel 2 XL Landscape",
"value": "Pixel 2 XL landscape"
},
{
"title": "Pixel 3",
"value": "Pixel 3"
},
{
"title": "Pixel 3 Landscape",
"value": "Pixel 3 landscape"
},
{
"title": "Pixel 4",
"value": "Pixel 4"
},
{
"title": "Pixel 4 Landscape",
"value": "Pixel 4 landscape"
},
{
"title": "Pixel 4A 5G",
"value": "Pixel 4a (5G)"
},
{
"title": "Pixel 4A 5G Landscape",
"value": "Pixel 4a (5G) landscape"
},
{
"title": "Pixel 5",
"value": "Pixel 5"
},
{
"title": "Pixel 5 Landscape",
"value": "Pixel 5 landscape"
},
{
"title": "Moto G4",
"value": "Moto G4"
},
{
"title": "Moto G4 Landscape",
"value": "Moto G4 landscape"
}
]
},
"select_links": {
"type": "string",
"required": false,
"title": "Select Links",
"description": "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]"
},
"click_selector": {
"type": "string",
"required": false,
"title": "Click Selector",
"description": "Selector for elements to click when using the autoclick behavior. Default is 'a'"
},
"block_rules": {
"type": "string",
"required": false,
"title": "Block Rules",
"description": "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe"
},
"block_message": {
"type": "string",
"required": false,
"title": "Block Message",
"description": "If specified, when a URL is blocked, a record with this error message is added instead"
},
"block_ads": {
"type": "boolean",
"required": false,
"title": "Block Ads",
"description": "If set, block advertisements from being loaded (based on Stephen Black's blocklist). Note that some bad domains are also blocked by zimit configuration even if this option is not set."
},
"ad_block_message": {
"type": "string",
"required": false,
"title": "Ads Block Message",
"description": "If specified, when an ad is blocked, a record with this error message is added instead"
},
"user_agent": {
"type": "string",
"required": false,
"title": "User Agent",
"description": "Override user-agent with specified"
},
"user_agent_suffix": {
"type": "string",
"required": false,
"title": "User Agent Suffix",
"description": "Append suffix to existing browser user-agent. Defaults to +Zimit"
},
"use_sitemap": {
"type": "string",
"required": false,
"title": "Sitemap URL",
"description": "Use as sitemap to get additional URLs for the crawl (usually at /sitemap.xml)"
},
"sitemap_from_date": {
"type": "string",
"required": false,
"title": "Sitemap From Date",
"description": "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)"
},
"sitemap_to_date": {
"type": "string",
"required": false,
"title": "Sitemap To Date",
"description": "If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)"
},
"behavior_timeout": {
"type": "integer",
"required": false,
"title": "Behavior Timeout",
"description": "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish. Default is 90.",
"min": 0
},
"post_load_delay": {
"type": "integer",
"required": false,
"title": "Post Load Delay",
"description": "If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors. Default is 0.",
"min": 0
},
"page_extra_delay": {
"type": "integer",
"required": false,
"title": "Page Extra Delay",
"description": "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page. Default is 0.",
"min": 0
},
"dedup_policy": {
"type": "string-enum",
"required": false,
"title": "Dedup Policy",
"description": "Deduplication policy. One of skip, revisit or keep. Default is skip",
"choices": [
{
"title": "Skip",
"value": "skip"
},
{
"title": "Revisit",
"value": "revisit"
},
{
"title": "Keep",
"value": "keep"
}
]
},
"screenshot": {
"type": "string",
"required": false,
"title": "Screenshot",
"description": "Screenshot options for crawler. One of view, thumbnail, fullPage, fullPageFinal or a comma-separated combination of those."
},
"size_soft_limit": {
"type": "integer",
"required": false,
"title": "Size Soft Limit",
"description": "If set, save crawl state and stop crawl if WARC size exceeds this value. ZIM will still be created.",
"min": 0
},
"size_hard_limit": {
"type": "integer",
"required": false,
"title": "Size Hard Limit",
"description": "If set, exit crawler and fail the scraper immediately if WARC size exceeds this value",
"min": 0
},
"disk_utilization": {
"type": "integer",
"required": false,
"title": "Disk Utilization",
"description": "Save state and exit if disk utilization exceeds this percentage value. Default (if not set) is 90%. Set to 0 to disable disk utilization check.",
"min": 0
},
"time_soft_limit": {
"type": "integer",
"required": false,
"title": "Time Soft Limit",
"description": "If set, save crawl state and stop crawl if WARC(s) creation takes longer than this value, in seconds. ZIM will still be created.",
"min": 0
},
"time_hard_limit": {
"type": "integer",
"required": false,
"title": "Time Hard Limit",
"description": "If set, exit crawler and fail the scraper immediately if WARC(s) creation takes longer than this value, in seconds",
"min": 0
},
"net_idle_wait": {
"type": "integer",
"required": false,
"title": "Net Idle Wait",
"description": "If set, wait for network idle after page load and after behaviors are done (in seconds). If -1 (default), determine based on scope."
},
"origin_override": {
"type": "string",
"required": false,
"title": "Origin Override",
"description": "If set, will redirect requests from each origin in key to origin in the value, eg. https://host:port=http://alt-host:alt-port."
},
"max_page_retries": {
"type": "integer",
"required": false,
"title": "Max Page Retries",
"description": "If set, number of times to retry a page that failed to load before page is considered to have failed. Default is 2.",
"min": 0
},
"fail_on_failed_seed": {
"type": "boolean",
"required": false,
"title": "Fail on failed seed",
"description": "Whether to display additional logs"
},
"fail_on_invalid_status": {
"type": "boolean",
"required": false,
"title": "Fail on invalid status",
"description": "If set, will treat pages with 4xx or 5xx response as failures. When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses"
},
"fail_on_failed_limit": {
"type": "integer",
"required": false,
"title": "Fail on failed - Limit",
"description": "If set, save state and exit if number of failed pages exceeds this value.",
"min": 0
},
"warcs": {
"type": "string",
"required": false,
"title": "WARC files",
"description": "Comma-separated list of WARC files to use as input."
},
"verbose": {
"type": "boolean",
"required": false,
"title": "Verbose mode",
"description": "Whether to display additional logs"
},
"keep": {
"type": "boolean",
"required": false,
"title": "Keep",
"description": "Should be True. Developer option: must be True if we want to keep the WARC files for artifacts archiving.",
"default": true
},
"output": {
"type": "string",
"required": false,
"title": "Output folder",
"description": "Output folder for ZIM file(s). Leave it as `/output`",
"pattern": "^/output$"
},
"admin_email": {
"type": "email",
"required": false,
"title": "Admin Email",
"description": "Admin Email for crawler: used in UserAgent so website admin can contact us",
"default": "contact+zimfarm@kiwix.org"
},
"profile": {
"type": "string",
"required": false,
"title": "Browser profile",
"description": "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory for Browsertrix crawler."
},
"behaviors": {
"type": "string",
"required": false,
"title": "Behaviors",
"description": "Which background behaviors to enable on each page. Defaults to autoplay,autofetch,siteSpecific."
},
"depth": {
"type": "integer",
"required": false,
"title": "Depth",
"description": "The depth of the crawl for all seeds. Default is -1 (infinite).",
"min": -1
},
"zim_lang": {
"type": "string",
"required": false,
"title": "ZIM Language",
"description": "Language metadata of ZIM (warc2zim --lang param). ISO-639-3 code. Retrieved from homepage if found, fallback to `eng`",
"alias": "zim-lang",
"customValidator": "language_code"
},
"long_description": {
"type": "string",
"required": false,
"title": "Long description",
"description": "Optional long description for your ZIM",
"minLength": 1,
"maxLength": 4000,
"alias": "long-description"
},
"custom_css": {
"type": "blob",
"kind": "css",
"required": false,
"title": "Custom CSS",
"description": "URL to a CSS file to inject into pages",
"alias": "custom-css"
},
"charsets_to_try": {
"type": "string",
"required": false,
"title": "Charsets to try",
"description": "List of charsets to try decode content when charset is not found",
"alias": "charsets-to-try"
},
"ignore_content_header_charsets": {
"type": "boolean",
"required": false,
"title": "Ignore Content Header Charsets",
"description": "Ignore the charsets specified in content headers - first bytes - typically because they are wrong.",
"alias": "ignore-content-header-charsets"
},
"content_header_bytes_length": {
"type": "integer",
"required": false,
"title": "Content Header Bytes Length",
"description": "How many bytes to consider when searching for content charsets in header (default is 1024).",
"alias": "content-header-bytes-length",
"min": 0
},
"ignore_http_header_charsets": {
"type": "boolean",
"required": false,
"title": "Ignore HTTP Header Charsets",
"description": "Ignore the charsets specified in HTTP `Content-Type` headers, typically because they are wrong.",
"alias": "ignore-http-header-charsets"
},
"encoding_aliases": {
"type": "string",
"required": false,
"title": "Encoding Aliases",
"description": "List of encoding/charset aliases to decode WARC content. Aliases are used when the encoding specified in upstream server exists in Python under a different name. This parameter is single string, multiple values are separated by a comma, like in alias1=encoding1,alias2=encoding2.",
"alias": "encoding-aliases"
},
"custom_behaviors": {
"type": "string",
"required": false,
"title": "Custom Behaviors",
"description": "JS code for custom behaviors to customize crawler. Single string with individual JS files URL/path separated by a comma.",
"alias": "custom-behaviours"
},
"zimit_progress_file": {
"type": "string",
"required": false,
"title": "Zimit Progress File",
"description": "Scraping progress file. Leave it as `/output/task_progress.json`",
"alias": "zimit-progress-file",
"pattern": "^/output/task_progress\\.json$"
},
"replay_viewer_source": {
"type": "url",
"required": false,
"title": "Replay Viewer Source",
"description": "URL from which to load the ReplayWeb.page replay viewer from",
"alias": "replay-viewer-source"
},
"zim_file": {
"type": "string",
"required": false,
"title": "ZIM filename",
"description": "ZIM file name (based on --name if not provided). Include {period} to insert date period dynamically",
"alias": "zim-file",
"pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+_)([a-z0-9\\-\\.]+_|)([\\d]{4}-[\\d]{2}|\\{period\\}).zim$",
"relaxedPattern": "^[A-Za-z0-9._-]+$"
},
"name": {
"type": "string",
"required": true,
"title": "ZIM name",
"description": "Name of the ZIM.",
"alias": "name",
"pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$",
"relaxedPattern": "^[A-Za-z0-9._-]+$"
},
"overwrite": {
"type": "boolean",
"required": false,
"title": "Overwrite",
"description": "Whether to overwrite existing ZIM file if it exists"
}
}
}

225
pyproject.toml Normal file

@ -0,0 +1,225 @@
[build-system]
requires = ["hatchling", "hatch-openzim"]
build-backend = "hatchling.build"
[project]
name = "zimit"
requires-python = ">=3.13,<3.14"
description = "Make ZIM file from any website through crawling"
readme = "README.md"
dependencies = [
"requests==2.32.3",
"inotify==0.2.10",
"tld==0.13",
"warc2zim @ git+https://github.com/openzim/warc2zim@main",
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
[tool.hatch.metadata.hooks.openzim-metadata]
kind = "scraper"
[tool.hatch.metadata]
allow-direct-references = true # to be removed once we use a released warc2zim version
[project.optional-dependencies]
scripts = [
"invoke==2.2.0",
]
lint = [
"black==25.1.0",
"ruff==0.9.4",
]
check = [
"pyright==1.1.393",
]
test = [
"pytest==8.3.4",
"coverage==7.6.10",
]
dev = [
"pre-commit==4.1.0",
"debugpy==1.8.12",
"selenium==4.28.1", # used in daily tests, convenient for dev purpose (autocompletion)
"zimit[scripts]",
"zimit[lint]",
"zimit[test]",
"zimit[check]",
]
[project.scripts]
zimit = "zimit:zimit.zimit"
[tool.hatch.version]
path = "src/zimit/__about__.py"
[tool.hatch.build]
exclude = [
"/.github",
]
[tool.hatch.build.targets.wheel]
packages = ["src/zimit"]
[tool.hatch.envs.default]
features = ["dev"]
[tool.hatch.envs.test]
features = ["scripts", "test"]
[tool.hatch.envs.test.scripts]
run = "inv test --args '{args}'"
run-cov = "inv test-cov --args '{args}'"
report-cov = "inv report-cov"
coverage = "inv coverage --args '{args}'"
html = "inv coverage --html --args '{args}'"
[tool.hatch.envs.lint]
template = "lint"
skip-install = false
features = ["scripts", "lint"]
[tool.hatch.envs.lint.scripts]
black = "inv lint-black --args '{args}'"
ruff = "inv lint-ruff --args '{args}'"
all = "inv lintall --args '{args}'"
fix-black = "inv fix-black --args '{args}'"
fix-ruff = "inv fix-ruff --args '{args}'"
fixall = "inv fixall --args '{args}'"
[tool.hatch.envs.check]
features = ["scripts", "check"]
[tool.hatch.envs.check.scripts]
pyright = "inv check-pyright --args '{args}'"
all = "inv checkall --args '{args}'"
[tool.black]
line-length = 88
target-version = ['py313']
[tool.ruff]
target-version = "py313"
line-length = 88
src = ["src"]
[tool.ruff.lint]
select = [
"A", # flake8-builtins
# "ANN", # flake8-annotations
"ARG", # flake8-unused-arguments
# "ASYNC", # flake8-async
"B", # flake8-bugbear
# "BLE", # flake8-blind-except
"C4", # flake8-comprehensions
"C90", # mccabe
# "COM", # flake8-commas
# "D", # pydocstyle
# "DJ", # flake8-django
"DTZ", # flake8-datetimez
"E", # pycodestyle (default)
"EM", # flake8-errmsg
# "ERA", # eradicate
# "EXE", # flake8-executable
"F", # Pyflakes (default)
# "FA", # flake8-future-annotations
"FBT", # flake8-boolean-trap
# "FLY", # flynt
# "G", # flake8-logging-format
"I", # isort
"ICN", # flake8-import-conventions
# "INP", # flake8-no-pep420
# "INT", # flake8-gettext
"ISC", # flake8-implicit-str-concat
"N", # pep8-naming
# "NPY", # NumPy-specific rules
# "PD", # pandas-vet
# "PGH", # pygrep-hooks
# "PIE", # flake8-pie
# "PL", # Pylint
"PLC", # Pylint: Convention
"PLE", # Pylint: Error
"PLR", # Pylint: Refactor
"PLW", # Pylint: Warning
# "PT", # flake8-pytest-style
# "PTH", # flake8-use-pathlib
# "PYI", # flake8-pyi
"Q", # flake8-quotes
# "RET", # flake8-return
# "RSE", # flake8-raise
"RUF", # Ruff-specific rules
"S", # flake8-bandit
# "SIM", # flake8-simplify
# "SLF", # flake8-self
"T10", # flake8-debugger
"T20", # flake8-print
# "TCH", # flake8-type-checking
# "TD", # flake8-todos
"TID", # flake8-tidy-imports
# "TRY", # tryceratops
"UP", # pyupgrade
"W", # pycodestyle
"YTT", # flake8-2020
]
ignore = [
# Allow non-abstract empty methods in abstract base classes
"B027",
# Remove flake8-errmsg since we consider they bloat the code and provide limited value
"EM",
# Allow boolean positional values in function calls, like `dict.get(... True)`
"FBT003",
# Ignore checks for possible passwords
"S105", "S106", "S107",
# Ignore warnings on subprocess.run / popen
"S603",
# Ignore complexity
"C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
]
unfixable = [
# Don't touch unused imports
"F401",
]
[tool.ruff.lint.isort]
known-first-party = ["zimit"]
[tool.ruff.lint.flake8-bugbear]
# add exceptions to B008 for fastapi.
extend-immutable-calls = ["fastapi.Depends", "fastapi.Query"]
[tool.ruff.lint.flake8-tidy-imports]
ban-relative-imports = "all"
[tool.ruff.lint.per-file-ignores]
# Tests can use magic values, assertions, and relative imports
"tests**/**/*" = ["PLR2004", "S101", "TID252"]
[tool.pytest.ini_options]
minversion = "7.3"
testpaths = ["tests"]
pythonpath = [".", "src"]
[tool.coverage.paths]
zimit = ["src/zimit"]
tests = ["tests"]
[tool.coverage.run]
source_pkgs = ["zimit"]
branch = true
parallel = true
omit = [
"src/zimit/__about__.py",
]
[tool.coverage.report]
exclude_lines = [
"no cov",
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
]
[tool.pyright]
include = ["src", "tests", "tasks.py"]
exclude = [".env/**", ".venv/**"]
extraPaths = ["src"]
pythonVersion = "3.13"
typeCheckingMode="basic"
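Assuming hatch is installed, the environments and scripts declared above can be invoked as follows (a sketch of typical commands, not an exhaustive list):
```bash
# run the test suite with coverage, then lint and type-check
hatch run test:coverage
hatch run lint:all
hatch run check:all
```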


@ -1,33 +0,0 @@
import os
from setuptools import setup, find_packages
here = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(here, 'README.rst')) as f:
README = f.read()
setup(name='zimit',
version=0.1,
description='zimit',
long_description=README,
classifiers=[
"Programming Language :: Python",
"Framework :: Pylons",
"Topic :: Internet :: WWW/HTTP",
"Topic :: Internet :: WWW/HTTP :: WSGI :: Application"
],
keywords="web services",
author='',
author_email='',
url='',
packages=find_packages(),
include_package_data=True,
zip_safe=False,
install_requires=['cornice', 'waitress', 'rq', 'colander',
'python-slugify', 'pyramid_mailer'],
entry_points="""\
[paste.app_factory]
main=zimit:main
""",
paster_plugins=['pyramid'])

1
src/zimit/__about__.py Normal file

@ -0,0 +1 @@
__version__ = "3.0.6-dev0"

11
src/zimit/constants.py Normal file

@ -0,0 +1,11 @@
import logging
from zimscraperlib.logging import getLogger
EXIT_CODE_WARC2ZIM_CHECK_FAILED = 2
EXIT_CODE_CRAWLER_SIZE_LIMIT_HIT = 14
EXIT_CODE_CRAWLER_TIME_LIMIT_HIT = 15
NORMAL_WARC2ZIM_EXIT_CODE = 100
REQUESTS_TIMEOUT = 10
logger = getLogger(name="zimit", level=logging.INFO)

14
src/zimit/utils.py Normal file

@ -0,0 +1,14 @@
from pathlib import Path
import requests
from zimit.constants import REQUESTS_TIMEOUT
def download_file(url: str, fpath: Path):
"""Download file from url to fpath with streaming"""
with requests.get(url, timeout=REQUESTS_TIMEOUT, stream=True) as resp:
resp.raise_for_status()
with open(fpath, "wb") as f:
for chunk in resp.iter_content(chunk_size=8192):
f.write(chunk)

1261
src/zimit/zimit.py Executable file

File diff suppressed because it is too large

109
tasks.py Normal file

@ -0,0 +1,109 @@
# pyright: strict, reportUntypedFunctionDecorator=false
import os
from invoke.context import Context
from invoke.tasks import task # pyright: ignore [reportUnknownVariableType]
use_pty = not os.getenv("CI", "")
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test(ctx: Context, args: str = ""):
"""run tests (without coverage)"""
ctx.run(f"pytest {args}", pty=use_pty)
@task(optional=["args"], help={"args": "pytest additional arguments"})
def test_cov(ctx: Context, args: str = ""):
"""run test vith coverage"""
ctx.run(f"coverage run -m pytest {args}", pty=use_pty)
@task(optional=["html"], help={"html": "flag to export html report"})
def report_cov(ctx: Context, *, html: bool = False):
"""report coverage"""
ctx.run("coverage combine", warn=True, pty=use_pty)
ctx.run("coverage report --show-missing", pty=use_pty)
if html:
ctx.run("coverage html", pty=use_pty)
@task(
optional=["args", "html"],
help={
"args": "pytest additional arguments",
"html": "flag to export html report",
},
)
def coverage(ctx: Context, args: str = "", *, html: bool = False):
"""run tests and report coverage"""
test_cov(ctx, args=args)
report_cov(ctx, html=html)
@task(optional=["args"], help={"args": "black additional arguments"})
def lint_black(ctx: Context, args: str = "."):
args = args or "." # needed for hatch script
ctx.run("black --version", pty=use_pty)
ctx.run(f"black --check --diff {args}", pty=use_pty)
@task(optional=["args"], help={"args": "ruff additional arguments"})
def lint_ruff(ctx: Context, args: str = "."):
args = args or "." # needed for hatch script
ctx.run("ruff --version", pty=use_pty)
ctx.run(f"ruff check {args}", pty=use_pty)
@task(
optional=["args"],
help={
"args": "linting tools (black, ruff) additional arguments, typically a path",
},
)
def lintall(ctx: Context, args: str = "."):
"""Check linting"""
args = args or "." # needed for hatch script
lint_black(ctx, args)
lint_ruff(ctx, args)
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def check_pyright(ctx: Context, args: str = ""):
"""check static types with pyright"""
ctx.run("pyright --version")
ctx.run(f"pyright {args}", pty=use_pty)
@task(optional=["args"], help={"args": "check tools (pyright) additional arguments"})
def checkall(ctx: Context, args: str = ""):
"""check static types"""
check_pyright(ctx, args)
@task(optional=["args"], help={"args": "black additional arguments"})
def fix_black(ctx: Context, args: str = "."):
"""fix black formatting"""
args = args or "." # needed for hatch script
ctx.run(f"black {args}", pty=use_pty)
@task(optional=["args"], help={"args": "ruff additional arguments"})
def fix_ruff(ctx: Context, args: str = "."):
"""fix all ruff rules"""
args = args or "." # needed for hatch script
ctx.run(f"ruff check --fix {args}", pty=use_pty)
@task(
optional=["args"],
help={
"args": "linting tools (black, ruff) additional arguments, typically a path",
},
)
def fixall(ctx: Context, args: str = "."):
"""Fix everything automatically"""
args = args or "." # needed for hatch script
fix_black(ctx, args)
fix_ruff(ctx, args)
lintall(ctx, args)
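These invoke tasks can also be run directly from an environment with the dev dependencies installed, for example:
```bash
invoke lintall          # check black formatting and ruff rules
invoke coverage --html  # run the tests and produce an HTML coverage report
```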

75
tests-daily/Dockerfile Normal file

@ -0,0 +1,75 @@
# Let's extract kiwix-tools as usual in a temporary Alpine build container
FROM alpine:3.21 as kiwix-serve
LABEL org.opencontainers.image.source https://github.com/openzim/kiwix-tools
# TARGETPLATFORM is injected by docker build
ARG TARGETPLATFORM
ARG KIWIX_TOOLS_VERSION
RUN set -e && \
# default (no KIWIX_TOOLS_VERSION set) to today's nightly
if [ -z "$KIWIX_TOOLS_VERSION" ] ; then KIWIX_TOOLS_VERSION=$(date +"%Y-%m-%d") ; fi && \
apk --no-cache add dumb-init curl && \
echo "TARGETPLATFORM: $TARGETPLATFORM" && \
if [ "$TARGETPLATFORM" = "linux/386" ]; then ARCH="i586"; \
# linux/arm64/v8 points to linux/arm64
elif [ "$TARGETPLATFORM" = "linux/arm64/v8" \
-o "$TARGETPLATFORM" = "linux/arm64" ]; then ARCH="aarch64"; \
# linux/arm translates to linux/arm/v7
elif [ "$TARGETPLATFORM" = "linux/arm/v7" ]; then ARCH="armv8"; \
elif [ "$TARGETPLATFORM" = "linux/arm/v6" ]; then ARCH="armv6"; \
elif [ "$TARGETPLATFORM" = "linux/amd64/v3" \
-o "$TARGETPLATFORM" = "linux/amd64/v2" \
-o "$TARGETPLATFORM" = "linux/amd64" ]; then ARCH="x86_64"; \
# we don't support any other arch so let it fail
else ARCH="unknown"; fi && \
# download requested kiwix-tools version
url="http://mirror.download.kiwix.org/nightly/$KIWIX_TOOLS_VERSION/kiwix-tools_linux-$ARCH-$KIWIX_TOOLS_VERSION.tar.gz" && \
echo "URL: $url" && \
mkdir /kiwix-serve && \
curl -k -L $url | tar -xz -C /kiwix-serve --strip-components 1
# Build real "workload" container
FROM python:3.13-slim-bookworm
# Add kiwix-serve
COPY --from=kiwix-serve /kiwix-serve /usr/local/bin
# Update apt + install dependencies + install Google Chrome dependencies + clean up apt lists
RUN apt-get update -y && \
apt-get install -qqy wget xvfb unzip jq && \
apt-get install -qqy libxss1 libappindicator1 libgconf-2-4 \
fonts-liberation libasound2 libnspr4 libnss3 libx11-xcb1 libxtst6 lsb-release xdg-utils \
libgbm1 libnss3 libatk-bridge2.0-0 libgtk-3-0 libx11-xcb1 libxcb-dri3-0 && \
rm -rf /var/lib/apt/lists/*
# Fetch the latest version numbers and URLs for Chrome and ChromeDriver
RUN wget -q -O /tmp/versions.json https://googlechromelabs.github.io/chrome-for-testing/last-known-good-versions-with-downloads.json
# Install chrome
RUN CHROME_URL=$(jq -r '.channels.Stable.downloads.chrome[] | select(.platform=="linux64") | .url' /tmp/versions.json) && \
wget -q --continue -O /tmp/chrome-linux64.zip $CHROME_URL && \
unzip /tmp/chrome-linux64.zip -d /opt/chrome
RUN chmod +x /opt/chrome/chrome-linux64/chrome
# Install chromedriver
RUN CHROMEDRIVER_URL=$(jq -r '.channels.Stable.downloads.chromedriver[] | select(.platform=="linux64") | .url' /tmp/versions.json) && \
wget -q --continue -O /tmp/chromedriver-linux64.zip $CHROMEDRIVER_URL && \
unzip /tmp/chromedriver-linux64.zip -d /opt/chromedriver && \
chmod +x /opt/chromedriver/chromedriver-linux64/chromedriver
# Set up Chromedriver Environment variables
ENV CHROMEDRIVER_DIR /opt/chromedriver
ENV PATH $CHROMEDRIVER_DIR:$PATH
# Clean up
RUN rm /tmp/chrome-linux64.zip /tmp/chromedriver-linux64.zip /tmp/versions.json
# Update pip, install selenium, create work directory
RUN \
python -m pip install --no-cache-dir -U \
pip \
selenium==4.28.1 \
pytest==8.3.4 \
&& mkdir -p /work

128
tests-daily/daily.py Normal file
View file

@@ -0,0 +1,128 @@
import logging
import os
import subprocess
from time import sleep
import pytest
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
KIWIX_SERVE_START_SLEEP = 1
ZIM_NAME = "tests_eng_test-website"
YOUTUBE_VIDEO_PATH = "youtube.fuzzy.replayweb.page/embed/g5skcrNXdDM"
SKIP_YOUTUBE_TEST = os.getenv("SKIP_YOUTUBE_TEST", "False").lower() == "true"
CHECK_VIDEO_IS_PLAYING_AFTER_SECS = 30
logger = logging.getLogger(__name__)
@pytest.fixture(scope="module")
def chrome_driver():
"""Start chrome and setup chrome driver / selenium"""
logger.info("Starting Chrome")
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
# Other options of interest:
# --disable-dev-shm-usage (not needed anymore with recent chrome versions)
# --disable-gpu (important for some versions of Chrome)
# --remote-debugging-port=9222 (should you need to remote debug)
# Set path to Chrome binary
chrome_options.binary_location = "/opt/chrome/chrome-linux64/chrome"
# Set path to ChromeDriver
chrome_service = ChromeService(
executable_path="/opt/chromedriver/chromedriver-linux64/chromedriver"
)
# Set up driver
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
yield driver
# Cleanup
logger.info("Quitting Chrome")
driver.quit()
@pytest.fixture(scope="module")
def kiwix_serve():
"""Start kiwix-serve with given ZIM"""
logger.info("Starting kiwix-serve")
process = subprocess.Popen(
[
"/usr/bin/env",
"/usr/local/bin/kiwix-serve",
f"/output/{ZIM_NAME}.zim",
]
)
logger.info(
f"Waiting {KIWIX_SERVE_START_SLEEP} secs to be 'sure' that kiwix-serve is ready"
)
sleep(KIWIX_SERVE_START_SLEEP)
if process.poll() is not None:
raise Exception("kiwix-serve has terminated too early")
yield process
# Cleanup
logger.info("Quitting kiwix-serve")
process.terminate()
@pytest.mark.skipif(SKIP_YOUTUBE_TEST, reason="Youtube test disabled by environment")
def test_youtube_video(chrome_driver, kiwix_serve): # noqa: ARG001
"""Test that youtube video loads, and still plays after a while"""
chrome_driver.get(f"http://localhost:80/content/{ZIM_NAME}/{YOUTUBE_VIDEO_PATH}")
if chrome_driver.title == "Content not found":
raise Exception("Wrong URL, kiwix-serve said that content is not found")
button = WebDriverWait(chrome_driver, 1).until(
expected_conditions.presence_of_element_located(
(By.XPATH, "//button[@title='Play']")
)
)
logger.info("Play button found in page")
button.click()
video = WebDriverWait(chrome_driver, 1).until(
expected_conditions.presence_of_element_located((By.TAG_NAME, "video"))
)
logger.info("Video found in page")
# arguments[0] is the video tag passed to execute_script
if not chrome_driver.execute_script("return arguments[0].paused === false", video):
raise Exception("Video is not playing, failed to start probably")
logger.info("Video is playing")
logger.info(
f"Waiting {CHECK_VIDEO_IS_PLAYING_AFTER_SECS} secs to check video is still "
"playing"
)
sleep(CHECK_VIDEO_IS_PLAYING_AFTER_SECS)
# arguments[0] is the video tag passed to execute_script
if not chrome_driver.execute_script("return arguments[0].paused === false", video):
raise Exception(
"Video is not playing anymore after "
f"{CHECK_VIDEO_IS_PLAYING_AFTER_SECS} secs"
)
logger.info("Video is still playing")

View file

@@ -0,0 +1 @@
These are integration tests, meant to be run inside the CI (because we first need to perform a zimit run on a given website and then check its output)
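Since the checks below read ZIMs and stats files that only exist after the CI's zimit run, a local invocation without that output fails on the first assertion. A guard that skips instead could be sketched as follows (require_output is a hypothetical helper, not part of the repository):

import pathlib
import pytest

OUTPUT_DIR = pathlib.Path("/output")  # where the CI zimit run writes its artifacts

def require_output(name: str) -> pathlib.Path:
    """Return the expected artifact path, or skip when the CI run has not produced it."""
    path = OUTPUT_DIR / name
    if not path.is_file():
        pytest.skip(f"{path} missing; run zimit first (normally done by the CI)")
    return path

A test would then call, e.g., require_output("tests_en_onepage.zim") before opening the archive.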

View file

@@ -0,0 +1,145 @@
import glob
import json
import os
from pathlib import Path
import pytest
from warcio import ArchiveIterator
from zimscraperlib.zim import Archive
@pytest.mark.parametrize(
"filename",
[
pytest.param("/output/tests_en_onepage.zim", id="onepage"),
pytest.param("/output/tests_en_sizesoftlimit.zim", id="sizesoftlimit"),
pytest.param("/output/tests_en_timesoftlimit.zim", id="timesoftlimit"),
],
)
def test_zim_created(filename):
"""Ensure ZIM file exists"""
assert os.path.isfile(filename)
@pytest.mark.parametrize(
"filename",
[
pytest.param("/output/tests_en_sizehardlimit.zim", id="sizehardlimit"),
pytest.param("/output/tests_en_timehardlimit.zim", id="timehardlimit"),
],
)
def test_zim_not_created(filename):
"""Ensure ZIM file does not exists"""
assert not os.path.exists(filename)
def test_zim_main_page():
"""Main page specified, http://website.test.openzim.org/http-return-codes.html,
was a redirect to https
Ensure main page is the redirected page"""
main_entry = Archive(Path("/output/tests_en_onepage.zim")).main_entry
assert main_entry.is_redirect
assert (
main_entry.get_redirect_entry().path
== "website.test.openzim.org/http-return-codes.html"
)
def test_zim_scraper():
"""Check content of scraper metadata"""
zim_fh = Archive(Path("/output/tests_en_onepage.zim"))
scraper = zim_fh.get_text_metadata("Scraper")
assert "zimit " in scraper
assert "warc2zim " in scraper
assert "Browsertrix-Crawler " in scraper
def test_files_list():
"""Check that expected files are present in the ZIM at proper path"""
zim_fh = Archive(Path("/output/tests_en_onepage.zim"))
for expected_entry in [
"_zim_static/__wb_module_decl.js",
"_zim_static/wombat.js",
"_zim_static/wombatSetup.js",
"website.test.openzim.org/http-return-codes.html",
"website.test.openzim.org/200-response",
"website.test.openzim.org/201-response",
"website.test.openzim.org/202-response",
"website.test.openzim.org/301-external-redirect-ok",
"website.test.openzim.org/301-internal-redirect-ok",
"website.test.openzim.org/302-external-redirect-ok",
"website.test.openzim.org/302-internal-redirect-ok",
"website.test.openzim.org/307-external-redirect-ok",
"website.test.openzim.org/307-internal-redirect-ok",
"website.test.openzim.org/308-external-redirect-ok",
"website.test.openzim.org/308-internal-redirect-ok",
"website.test.openzim.org/http-return-codes.html",
"website.test.openzim.org/icons/favicon.ico",
"website.test.openzim.org/icons/site.webmanifest",
"website.test.openzim.org/internal_redirect_target.html",
"www.example.com/",
]:
assert zim_fh.get_content(expected_entry)
def test_user_agent():
"""Test that mobile user agent was used
Check is done in WARC request records with custom Zimit and email suffix
"""
found = False
for warc in glob.glob("/output/.tmp*/collections/crawl-*/archive/*.warc.gz"):
with open(warc, "rb") as fh:
for record in ArchiveIterator(fh):
if record.rec_type == "request":
print(record.http_headers) # noqa: T201
ua = record.http_headers.get_header("User-Agent")
if ua:
assert "Mozilla" in ua
assert ua.endswith(" +Zimit test@example.com")
found = True
# should find at least one
assert found
def test_stats_output_standard():
assert json.loads(Path("/output/crawl.json").read_bytes()) == {
"crawled": 17,
"pending": 0,
"pendingPages": [],
"total": 35,
"failed": 18,
"limit": {"max": 0, "hit": False},
}
assert json.loads(Path("/output/warc2zim.json").read_bytes()) == {
"written": 8,
"total": 8,
}
assert json.loads(Path("/output/stats.json").read_bytes()) == {
"done": 8,
"total": 8,
"partialZim": False,
}
@pytest.mark.parametrize(
"filename",
[
pytest.param("/output/stats_sizesoftlimit.json", id="sizesoftlimit"),
pytest.param("/output/stats_timesoftlimit.json", id="timesoftlimit"),
],
)
def test_stats_output_softlimit(filename):
file = Path(filename)
assert file.exists()
content = json.loads(file.read_bytes())
assert "done" in content
assert "total" in content
assert "partialZim" in content
assert content["partialZim"]

14
tests/conftest.py Normal file
View file

@@ -0,0 +1,14 @@
import pytest
from zimit import zimit as app
"""
cleanup disabled because atexit hooks run at the very end of the Python process
shutdown. By the time cleanup() is called, the logging module has already closed its
file streams.
"""
@pytest.fixture(autouse=True)
def disable_zimit_cleanup(monkeypatch):
monkeypatch.setattr(app, "cleanup", lambda: None)

Binary file not shown.

6
tests/test_dummy.py Normal file
View file

@@ -0,0 +1,6 @@
from zimit.zimit import NORMAL_WARC2ZIM_EXIT_CODE
# dummy test, just to have a coverage report generated
def test_something_exists():
assert NORMAL_WARC2ZIM_EXIT_CODE

83
tests/test_overwrite.py Normal file
View file

@@ -0,0 +1,83 @@
import pathlib
import pytest
from zimit.zimit import run
TEST_DATA_DIR = pathlib.Path(__file__).parent / "data"
def test_overwrite_flag_behaviour(tmp_path):
zim_output = "overwrite-test.zim"
output_path = tmp_path / zim_output
# 1st run → creates file
result = run(
[
"--seeds",
"https://example.com",
"--warcs",
str(TEST_DATA_DIR / "example-response.warc"),
"--output",
str(tmp_path),
"--zim-file",
zim_output,
"--name",
"overwrite-test",
]
)
assert result in (None, 100)
assert output_path.exists()
# 2nd run, no overwrite → should fail
with pytest.raises(SystemExit) as exc:
run(
[
"--seeds",
"https://example.com",
"--warcs",
str(TEST_DATA_DIR / "example-response.warc"),
"--output",
str(tmp_path),
"--zim-file",
zim_output,
"--name",
"overwrite-test",
]
)
assert exc.value.code == 2
# 3rd run, no overwrite and no --warcs → should also fail
with pytest.raises(SystemExit) as exc:
run(
[
"--seeds",
"https://example.com",
"--output",
str(tmp_path),
"--zim-file",
zim_output,
"--name",
"overwrite-test",
]
)
assert exc.value.code == 2
# 4th run, with overwrite → should succeed
result = run(
[
"--seeds",
"https://example.com",
"--warcs",
str(TEST_DATA_DIR / "example-response.warc"),
"--output",
str(tmp_path),
"--zim-file",
zim_output,
"--name",
"overwrite-test",
"--overwrite",
]
)
assert result in (None, 100)
assert output_path.exists()
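test_overwrite_flag_behaviour above pins the expected behaviour: an existing ZIM plus no --overwrite exits with code 2, while --overwrite lets the run proceed. Purely as an illustration (this is not the actual zimit implementation), the kind of pre-flight guard such a flag implies looks like:

import sys
from pathlib import Path

def ensure_output_writable(zim_path: Path, overwrite: bool) -> None:
    """Refuse to clobber an existing ZIM unless --overwrite was passed."""
    if zim_path.exists() and not overwrite:
        print(f"{zim_path} already exists; pass --overwrite to replace it", file=sys.stderr)
        sys.exit(2)  # the exit code asserted in the test above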

View file

@@ -1,62 +0,0 @@
[app:main]
use = egg:zimit
zimit.zimwriterfs_bin = /home/alexis/dev/openzim/zimwriterfs/zimwriterfs
zimit.httrack_bin = /usr/bin/httrack
zimit.output_location = /home/alexis/dev/zimit/zims
zimit.output_url = http://zimit.notmyidea.org/zims
mail.host = localhost
mail.port = 2525
mail.default_sender = zimit@notmyidea.org
pyramid.includes =
pyramid_mailer
[server:main]
use = egg:waitress#main
host = 0.0.0.0
port = 6543
# Begin logging configuration
[uwsgi]
wsgi-file = app.wsgi
http-socket = :8000
enable-threads = true
master = true
processes = 1
virtualenv = .
module = zimit
lazy = true
lazy-apps = true
[loggers]
keys = root, gplayproxy
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = INFO
handlers = console
[logger_gplayproxy]
level = DEBUG
handlers =
qualname = gplayproxy
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(asctime)s %(levelname)-5.5s [%(name)s][%(threadName)s] %(message)s
# End logging configuration

View file

@@ -1,25 +0,0 @@
from pyramid.config import Configurator
from pyramid.events import NewRequest
from pyramid.static import static_view
from redis import Redis
from rq import Queue
def main(global_config, **settings):
config = Configurator(settings=settings)
config.registry.queue = Queue(connection=Redis())
def attach_objects_to_request(event):
event.request.queue = config.registry.queue
config.add_subscriber(attach_objects_to_request, NewRequest)
config.include("cornice")
config.include('pyramid_mailer')
config.scan("zimit.views")
static = static_view('../app', use_subpath=True, index='index.html')
config.add_route('catchall_static', '/app/*subpath')
config.add_view(static, route_name="catchall_static")
return config.make_wsgi_app()

View file

@@ -1,146 +0,0 @@
import os
import os.path
import shutil
import tempfile
import urlparse
from slugify import slugify
from zimit import utils
HTTRACK_BIN = "/usr/bin/httrack"
DEFAULT_AUTHOR = "ZimIt"
class ZimCreator(object):
"""A synchronous zim creator, using HTTrack to spider websites and
zimwriterfs to create the zim files.
Please note that every operation blocks the interpreter. As such, it
is recommended to run this operation in a worker if invoked from a website
view / controller.
"""
def __init__(self, zimwriterfs_bin, output_location,
author=DEFAULT_AUTHOR, httrack_bin=HTTRACK_BIN,
log_file=None, max_download_speed=25000):
self.output_location = output_location
self.author = author
self.zimwriterfs_bin = zimwriterfs_bin
self.httrack_bin = httrack_bin
self.log_file = log_file
self.max_download_speed = max_download_speed
utils.ensure_paths_exists(
self.zimwriterfs_bin,
self.httrack_bin,
self.output_location)
def _spawn(self, cmd):
return utils.spawn(cmd, self.log_file)
def download_website(self, url, destination_path):
"""Downloads the website using HTTrack and wait for the results to
be available before returning.
:param url:
The entry URL of the website to retrieve.
:param destination_path:
The absolute location of a folder where the files will be written.
"""
options = {
"path": destination_path,
"max-rate": self.max_download_speed,
"keep-alive": None,
"robots": 0,
"near": None,
}
self._spawn(utils.get_command(self.httrack_bin, url, **options))
def prepare_website_folder(self, url, input_location):
"""Prepare the website files to make them ready to be embedded in a zim
file.
:returns:
the absolute location of the website folder, ready to be embedded.
"""
netloc = urlparse.urlparse(url).netloc.replace(":", "_")
website_folder = os.path.join(input_location, netloc)
if not os.path.isdir(website_folder):
message = "Unable to find the website folder! %s" % website_folder
raise Exception(message)
shutil.copy('./favicon.ico', website_folder)
return website_folder
def create_zim(self, input_location, output_name, zim_options):
"""Create a zim file out of an existing folder on disk.
:param input_location:
The absolute location of the files to be bundled in the zim file.
:param output_name:
The name to use to create the zim file.
:param options:
Options to pass to the zim creator.
"""
zim_options.update({
'bin': self.zimwriterfs_bin,
'location': input_location,
'output': os.path.join(self.output_location, output_name),
'icon': 'favicon.ico',
'publisher': self.author,
})
# Spawn zimwriterfs with the correct options.
options = (
'{bin} -w "{welcome}" -l "{language}" -t "{title}"'
' -d "{description}" -f {icon} -c "{author}"'
' -p "{publisher}" {location} {output}'
).format(**zim_options)
self._spawn(options)
return output_name
def create_zim_from_website(self, url, zim_options):
"""Create a zim file from a website. It might take some time.
The name of the generated zim file is a slugified version of its URL.
:param url:
the URL of the website to download.
:param zim_options:
A dictionary of options to use when generating the Zim file. They
are title, language, welcome and description.
:returns:
the name of the generated zim_file (relative to the output_folder)
"""
temporary_location = tempfile.mkdtemp("zimit")
self.download_website(url, temporary_location)
website_folder = self.prepare_website_folder(url, temporary_location)
output_name = "{slug}.zim".format(slug=slugify(url))
zim_file = self.create_zim(website_folder, output_name, zim_options)
return zim_file
def load_from_settings(settings, log_file=None):
"""Load the ZimCreator object from the given pyramid settings, converting
them to actual parameters.
This is a convenience function for people wanting to create a ZimCreator
out of a ini file compatible with the pyramid framework.
:param settings: the dictionary of settings.
"""
if 'zimit.zimwriterfs_bin' not in settings:
raise ValueError('Please define zimit.zimwriterfs_bin config.')
return ZimCreator(
zimwriterfs_bin=settings['zimit.zimwriterfs_bin'],
httrack_bin=settings.get('zimit.httrack_bin'),
output_location=settings.get('zimit.output_location'),
author=settings.get('zimit.default_author'),
log_file=log_file
)

View file

@@ -1,42 +0,0 @@
from pyramid_mailer.message import Attachment, Message
from pyramid_mailer import Mailer
def send_zim_url(settings, email, zim_url):
"""Send an email with a link to one zim file.
:param settings:
A pyramid settings object, used by pyramid_mailer.
:param email:
The email of the recipient.
:param zim_url:
The URL of the zim file.
"""
mailer = Mailer.from_settings(settings)
msg = ZimReadyMessage(email, zim_url)
mailer.send_immediately(msg)
class ZimReadyMessage(Message):
def __init__(self, email, zim_link):
subject = "[ZimIt!] Your zimfile is ready!"
bdata = """
Hi,
You have asked for the creation of a zim file, and it is now ready !
You can access it at the following URL:
{zim_link}
Cheers,
ZimIt.
""".format(zim_link=zim_link)
hdata = bdata
body = Attachment(data=bdata, transfer_encoding="quoted-printable")
html = Attachment(data=hdata, transfer_encoding="quoted-printable")
super(ZimReadyMessage, self).__init__(
subject=subject, body=body, html=html, recipients=[email])

View file

@@ -1,35 +0,0 @@
import os
import shlex
import subprocess
def spawn(cmd, logfile=None):
"""Quick shortcut to spawn a command on the filesystem"""
if logfile is not None:
with open(logfile, "a+") as f:
prepared_cmd = shlex.split("stdbuf -o0 %s" % cmd)
process = subprocess.Popen(prepared_cmd, stdout=f)
else:
prepared_cmd = shlex.split(cmd)
process = subprocess.Popen(prepared_cmd)
process.wait()
return process
def ensure_paths_exists(*paths):
for path in paths:
if not os.path.exists(path):
msg = '%s does not exist.' % path
raise OSError(msg)
def get_command(cmd, *params, **options):
prepared_options = []
for key, value in options.items():
if value is None:
opt = "--%s" % key
else:
opt = "--%s=%s" % (key, value)
prepared_options.append(opt)
return " ".join((cmd, " ".join(params), " ".join(prepared_options)))

View file

@@ -1,63 +0,0 @@
import os
from cornice import Service
from colander import MappingSchema, SchemaNode, String
from pyramid.httpexceptions import HTTPTemporaryRedirect, HTTPNotFound
from zimit.worker import create_zim
website = Service(name='website', path='/website-zim')
home = Service(name='home', path='/')
status = Service(name='status', path='/status/{id}')
@home.get()
def redirect_to_app(request):
raise HTTPTemporaryRedirect("/app/index.html")
class WebSiteSchema(MappingSchema):
url = SchemaNode(String(), location="body", type='str')
title = SchemaNode(String(), location="body", type='str')
email = SchemaNode(String(), location="body", type='str')
description = SchemaNode(String(), default="-",
location="body", type='str')
author = SchemaNode(String(), default=None,
location="body", type='str')
welcome = SchemaNode(String(), default="index.html",
location="body", type='str')
language = SchemaNode(String(), default="eng",
location="body", type='str')
@website.post(schema=WebSiteSchema)
def crawl_new_website(request):
job = request.queue.enqueue(
create_zim,
request.registry.settings,
request.validated,
timeout=1800)
request.response.status_code = 201
return {
'job_id': job.id
}
@status.get()
def display_status(request):
job = request.queue.fetch_job(request.matchdict["id"])
if job is None:
raise HTTPNotFound()
log_dir = request.registry.settings.get('zimit.logdir', '/tmp')
log_file = os.path.join(log_dir, "%s.log" % job.id)
log_content = None
if os.path.exists(log_file):
with open(log_file) as f:
log_content = f.read()
return {
"status": job.status,
"log": log_content
}

View file

@@ -1,20 +0,0 @@
import os
import urlparse
from rq import get_current_job
from zimit.mailer import send_zim_url
from zimit.creator import load_from_settings
def create_zim(settings, options):
"""Call the zim creator and the mailer when it is finished.
"""
job = get_current_job()
log_dir = settings.get('zimit.logdir', '/tmp')
log_file = os.path.join(log_dir, "%s.log" % job.id)
zim_creator = load_from_settings(settings, log_file)
zim_file = zim_creator.create_zim_from_website(options['url'], options)
output_url = settings.get('zimit.output_url')
zim_url = urlparse.urljoin(output_url, zim_file)
send_zim_url(settings, options['email'], zim_url)