Merge pull request #528 from aaryansinhaa/feature/overwrite

Added --overwrite flag to zimit
This commit is contained in:
benoit74 2025-12-22 11:56:16 +01:00 committed by GitHub
commit a7e236f0d7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 110 additions and 2 deletions

View file

@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ## [Unreleased]
### Added
- Add `--overwrite` flag to replace the output ZIM file if it already exists (#399)
### Changed ### Changed
- Fix issues preventing interrupted crawls from being resumed. (#499) - Fix issues preventing interrupted crawls from being resumed. (#499)
- Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist. - Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist.

View file

@ -970,6 +970,12 @@
"alias": "name", "alias": "name",
"pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$", "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$",
"relaxedPattern": "^[A-Za-z0-9._-]+$" "relaxedPattern": "^[A-Za-z0-9._-]+$"
},
"overwrite": {
"type": "boolean",
"required": false,
"title": "Overwrite",
"description": "Whether to overwrite existing ZIM file if it exists"
} }
} }
} }

View file

@ -849,6 +849,9 @@ def run(raw_args):
warc2zim_args.append("--lang") warc2zim_args.append("--lang")
warc2zim_args.append(known_args.zim_lang) warc2zim_args.append(known_args.zim_lang)
if known_args.overwrite:
warc2zim_args.append("--overwrite")
logger.info("----------") logger.info("----------")
logger.info("Testing warc2zim args") logger.info("Testing warc2zim args")
logger.info("Running: warc2zim " + " ".join(warc2zim_args)) logger.info("Running: warc2zim " + " ".join(warc2zim_args))
@ -1036,7 +1039,6 @@ def run(raw_args):
warc_files.append(Path(extract_path)) warc_files.append(Path(extract_path))
else: else:
logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
crawl = subprocess.run(crawler_args, check=False) crawl = subprocess.run(crawler_args, check=False)
if ( if (
@ -1091,7 +1093,7 @@ def run(raw_args):
logger.info("----------") logger.info("----------")
logger.info( logger.info(
f"Processing WARC files in/at " f"Processing WARC files in/at "
f'{" ".join(str(warc_file) for warc_file in warc_files)}' f"{' '.join(str(warc_file) for warc_file in warc_files)}"
) )
warc2zim_args.extend(str(warc_file) for warc_file in warc_files) warc2zim_args.extend(str(warc_file) for warc_file in warc_files)

14
tests/conftest.py Normal file
View file

@ -0,0 +1,14 @@
import pytest
from zimit import zimit as app
"""
Zimit's cleanup() is disabled during tests: atexit hooks run at the very end of
Python process shutdown, and by the time cleanup() is called the logging module
has already closed its file streams.
"""
@pytest.fixture(autouse=True)
def disable_zimit_cleanup(monkeypatch):
    """Swap the zimit module's ``cleanup`` attribute for a no-op in every test.

    The fixture is ``autouse`` so it applies without being requested; the
    patch is applied through ``monkeypatch``.
    """

    def _noop():
        return None

    monkeypatch.setattr(app, "cleanup", _noop)

Binary file not shown.

83
tests/test_overwrite.py Normal file
View file

@ -0,0 +1,83 @@
import pathlib
import pytest
from zimit.zimit import run
# "data" directory located next to this test module; holds the files the
# tests read (e.g. the sample WARC passed to --warcs).
TEST_DATA_DIR = pathlib.Path(__file__).parent / "data"
def test_overwrite_flag_behaviour(tmp_path):
    """Check that an existing ZIM file is only replaced when ``--overwrite`` is set.

    The test calls ``run`` four times against the same output file:

    1. initial run: expected to return ``None`` or ``100`` and create the file;
    2. same arguments again, without ``--overwrite``: expected to raise
       ``SystemExit`` with code 2;
    3. without ``--warcs`` and without ``--overwrite``: expected to raise
       ``SystemExit`` with code 2 as well;
    4. with ``--overwrite``: expected to return ``None`` or ``100`` and leave
       the file in place.
    """
    zim_output = "overwrite-test.zim"
    output_path = tmp_path / zim_output

    def build_args(*extra, with_warcs=True):
        """Return the CLI arguments shared by all runs, in their original order."""
        args = ["--seeds", "https://example.com"]
        if with_warcs:
            args += ["--warcs", str(TEST_DATA_DIR / "example-response.warc")]
        args += [
            "--output",
            str(tmp_path),
            "--zim-file",
            zim_output,
            "--name",
            "overwrite-test",
        ]
        args.extend(extra)
        return args

    # 1st run → creates file
    result = run(build_args())
    assert result in (None, 100)
    assert output_path.exists()

    # 2nd run, same arguments, no overwrite → should fail
    with pytest.raises(SystemExit) as exc:
        run(build_args())
    # Checked after the `with` block: statements following the raising call
    # inside the block would never execute.
    assert exc.value.code == 2

    # 3rd run, no --warcs and no overwrite → should fail too
    with pytest.raises(SystemExit) as exc:
        run(build_args(with_warcs=False))
    assert exc.value.code == 2

    # 4th run, with overwrite → should succeed
    result = run(build_args("--overwrite"))
    assert result in (None, 100)
    assert output_path.exists()