diff --git a/.gitignore b/.gitignore index ddea4cf..0a2d877 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,362 @@ -*.pyc -__pycache__ -*.zim +# Created by https://www.toptal.com/developers/gitignore/api/linux,macos,python,visualstudiocode,intellij +# Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,python,visualstudiocode,intellij + +### Intellij ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### Intellij Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +# https://plugins.jetbrains.com/plugin/7973-sonarlint +.idea/**/sonarlint/ + +# SonarQube Plugin +# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin +.idea/**/sonarIssues.xml + +# Markdown Navigator plugin +# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced +.idea/**/markdown-navigator.xml +.idea/**/markdown-navigator-enh.xml +.idea/**/markdown-navigator/ + +# Cache file creation bug +# See https://youtrack.jetbrains.com/issue/JBR-2257 +.idea/$CACHE_FILE$ + +# CodeStream plugin +# https://plugins.jetbrains.com/plugin/12206-codestream +.idea/codestream.xml + +# Azure Toolkit for IntelliJ plugin +# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij +.idea/**/azureSettings.xml + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file 
is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ *.egg-info/ -collections/ -node_modules/ -output/ -venv \ No newline at end of file +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/linux,macos,python,visualstudiocode,intellij + +# output dir +output + +# ignore all vscode, this editor specific, not maintained by openzim +.vscode diff --git a/CHANGELOG.md b/CHANGELOG.md index 45136ac..a3d2894 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,17 @@ All notable changes to this project are documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) (as of version 1.2.0). -## Unreleased +## [Unreleased] + +### Added + +- New `--build` parameter (optional) to specify the directory holding Browsertrix files; if not set, the `--output` +directory is used; zimit creates one subdirectory of this folder per invocation to isolate datasets; the subdirectory is kept only +if `--keep` is set. + +### Fixed + +- `--collection` parameter was not working (#252) ## [1.6.2] - 2023-11-17 diff --git a/zimit.py b/zimit.py index cf8b970..cd5d66a 100755 --- a/zimit.py +++ b/zimit.py @@ -30,6 +30,7 @@ from zimscraperlib.uri import rebuild_uri DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15" + class ProgressFileWatcher: def __init__(self, output_dir, stats_path): self.crawl_path = output_dir / "crawl.json" @@ -153,9 +154,7 @@ def zimit(args=None): type=int, ) - parser.add_argument( - "--limit", help="Limit crawl to this number of pages", type=int - ) + parser.add_argument("--limit", help="Limit crawl to this number of pages", type=int) parser.add_argument( "--maxPageLimit", @@ -226,7 +225,7 @@ def zimit(args=None): parser.add_argument( "--userAgent", help="Override default user-agent with specified value ; --userAgentSuffix is still applied", - default=DEFAULT_USER_AGENT + default=DEFAULT_USER_AGENT, ) parser.add_argument( @@ -309,8 +308,11 @@ def zimit(args=None): action="store_true", ) + parser.add_argument("--output", help="Output directory for ZIM", default="/output") + parser.add_argument( - "--output", help="Output directory for ZIM and WARC files", default="/output" + "--build", + help="Build directory for WARC files (if not set, output directory is used)", ) parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler") @@ -379,7 +381,10 @@ def zimit(args=None): return 2 # 
make temp dir for this crawl - temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) + if zimit_args.build: + temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.build, prefix=".tmp")) + else: + temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp")) if not zimit_args.keep: @@ -431,14 +436,30 @@ def zimit(args=None): elif crawl.returncode != 0: raise subprocess.CalledProcessError(crawl.returncode, cmd_args) - warc_files = list(temp_root_dir.rglob("collections/crawl-*/archive/"))[-1] - warc2zim_args.append(str(warc_files)) - - num_files = sum(1 for e in warc_files.iterdir()) + if zimit_args.collection: + warc_directory = temp_root_dir.joinpath( + f"collections/{zimit_args.collection}/archive/" + ) + else: + warc_dirs = list(temp_root_dir.rglob("collections/crawl-*/archive/")) + if len(warc_dirs) == 0: + raise RuntimeError( + "Failed to find directory where WARC files have been created" + ) + elif len(warc_dirs) > 1: + print("Found many WARC files directories, only last one will be used") + for directory in warc_dirs: + print(f"- {directory}") + warc_directory = warc_dirs[-1] print("") print("----------") - print(f"Processing {num_files} WARC files to ZIM", flush=True) + print(f"Processing WARC files in {warc_directory}") + warc2zim_args.append(str(warc_directory)) + + num_files = sum(1 for _ in warc_directory.iterdir()) + print(f"{num_files} WARC files found", flush=True) + print(f"Calling warc2zim with these args: {warc2zim_args}", flush=True) return warc2zim(warc2zim_args) @@ -447,7 +468,11 @@ def check_url(url, user_agent, scope=None): url = urllib.parse.urlparse(url) try: with requests.get( - url.geturl(), stream=True, allow_redirects=True, timeout=(12.2, 27), headers={"User-Agent": user_agent} + url.geturl(), + stream=True, + allow_redirects=True, + timeout=(12.2, 27), + headers={"User-Agent": user_agent}, ) as resp: resp.raise_for_status() except requests.exceptions.RequestException as exc: