Added --overwrite flag to zimit

This commit is contained in:
Aaryan Kumar Sinha 2025-12-13 01:30:33 +05:30 committed by Aaryan Kumar Sinha
parent 34ce7eb98d
commit 81018f06fa
6 changed files with 110 additions and 2 deletions

View file

@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ## [Unreleased]
### Added
- Added `--overwrite` flag to overwrite the output ZIM file if it already exists (#399)
### Changed ### Changed
- Fix issues preventing interrupted crawls from being resumed. (#499) - Fix issues preventing interrupted crawls from being resumed. (#499)
- Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist. - Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist.

View file

@ -970,6 +970,12 @@
"alias": "name", "alias": "name",
"pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$", "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$",
"relaxedPattern": "^[A-Za-z0-9._-]+$" "relaxedPattern": "^[A-Za-z0-9._-]+$"
},
"overwrite": {
"type": "boolean",
"required": false,
"title": "Overwrite",
"description": "Whether to overwrite existing ZIM file if it exists"
} }
} }
} }

View file

@ -849,6 +849,9 @@ def run(raw_args):
warc2zim_args.append("--lang") warc2zim_args.append("--lang")
warc2zim_args.append(known_args.zim_lang) warc2zim_args.append(known_args.zim_lang)
if known_args.overwrite:
warc2zim_args.append("--overwrite")
logger.info("----------") logger.info("----------")
logger.info("Testing warc2zim args") logger.info("Testing warc2zim args")
logger.info("Running: warc2zim " + " ".join(warc2zim_args)) logger.info("Running: warc2zim " + " ".join(warc2zim_args))
@ -1036,7 +1039,6 @@ def run(raw_args):
warc_files.append(Path(extract_path)) warc_files.append(Path(extract_path))
else: else:
logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
crawl = subprocess.run(crawler_args, check=False) crawl = subprocess.run(crawler_args, check=False)
if ( if (
@ -1091,7 +1093,7 @@ def run(raw_args):
logger.info("----------") logger.info("----------")
logger.info( logger.info(
f"Processing WARC files in/at " f"Processing WARC files in/at "
f'{" ".join(str(warc_file) for warc_file in warc_files)}' f"{' '.join(str(warc_file) for warc_file in warc_files)}"
) )
warc2zim_args.extend(str(warc_file) for warc_file in warc_files) warc2zim_args.extend(str(warc_file) for warc_file in warc_files)

14
tests/conftest.py Normal file
View file

@ -0,0 +1,14 @@
import pytest
from zimit import zimit as app
@pytest.fixture(autouse=True)
def disable_zimit_cleanup(monkeypatch):
    """Swap zimit's ``cleanup`` for a no-op while a test runs.

    The fixture is ``autouse``, so it applies without being requested
    explicitly.

    Cleanup is disabled because atexit hooks run at the very end of the Python
    process shutdown: by the time ``cleanup()`` is called, the logging module
    has already closed its file streams.
    """
    # monkeypatch restores the original attribute automatically on teardown.
    monkeypatch.setattr(app, "cleanup", lambda: None)

Binary file not shown.

83
tests/test_overwrite.py Normal file
View file

@ -0,0 +1,83 @@
import pathlib
import pytest
from zimit.zimit import run
TEST_DATA_DIR = pathlib.Path(__file__).parent / "data"
def test_overwrite_flag_behaviour(tmp_path):
    """Exercise ``run`` against an output ZIM that already exists.

    Sequence checked:
    1. a first run creates the ZIM file in ``tmp_path``;
    2. re-running with the same arguments exits with code 2;
    3. re-running without ``--warcs`` also exits with code 2;
    4. re-running with ``--overwrite`` completes and the file is present.
    """
    zim_output = "overwrite-test.zim"
    output_path = tmp_path / zim_output

    # Argument groups, kept in the order they appear on the command line.
    seed_args = ["--seeds", "https://example.com"]
    warc_args = ["--warcs", str(TEST_DATA_DIR / "example-response.warc")]
    target_args = [
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_output,
        "--name",
        "overwrite-test",
    ]

    # 1st run → creates file
    first_result = run([*seed_args, *warc_args, *target_args])
    assert first_result in (None, 100)
    assert output_path.exists()

    # 2nd run, same arguments, no overwrite → should fail
    with pytest.raises(SystemExit) as same_args_exit:
        run([*seed_args, *warc_args, *target_args])
    assert same_args_exit.value.code == 2

    # again without overwrite, this time without --warcs → should fail too
    with pytest.raises(SystemExit) as no_warcs_exit:
        run([*seed_args, *target_args])
    assert no_warcs_exit.value.code == 2

    # final run, with overwrite → should succeed
    overwrite_result = run(
        [*seed_args, *warc_args, *target_args, "--overwrite"]
    )
    assert overwrite_result in (None, 100)
    assert output_path.exists()