mirror of
https://github.com/openzim/zimit.git
synced 2025-12-31 04:23:15 +00:00
Added --overwrite flag to zimit
This commit is contained in:
parent
34ce7eb98d
commit
81018f06fa
6 changed files with 110 additions and 2 deletions
|
|
@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- Add `--overwrite` flag to overwrite an existing ZIM file (#399)
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- Fix issues preventing interrupted crawls from being resumed. (#499)
|
- Fix issues preventing interrupted crawls from being resumed. (#499)
|
||||||
- Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist.
|
- Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist.
|
||||||
|
|
|
||||||
|
|
@ -970,6 +970,12 @@
|
||||||
"alias": "name",
|
"alias": "name",
|
||||||
"pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$",
|
"pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$",
|
||||||
"relaxedPattern": "^[A-Za-z0-9._-]+$"
|
"relaxedPattern": "^[A-Za-z0-9._-]+$"
|
||||||
|
},
|
||||||
|
"overwrite": {
|
||||||
|
"type": "boolean",
|
||||||
|
"required": false,
|
||||||
|
"title": "Overwrite",
|
||||||
|
"description": "Whether to overwrite existing ZIM file if it exists"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -849,6 +849,9 @@ def run(raw_args):
|
||||||
warc2zim_args.append("--lang")
|
warc2zim_args.append("--lang")
|
||||||
warc2zim_args.append(known_args.zim_lang)
|
warc2zim_args.append(known_args.zim_lang)
|
||||||
|
|
||||||
|
if known_args.overwrite:
|
||||||
|
warc2zim_args.append("--overwrite")
|
||||||
|
|
||||||
logger.info("----------")
|
logger.info("----------")
|
||||||
logger.info("Testing warc2zim args")
|
logger.info("Testing warc2zim args")
|
||||||
logger.info("Running: warc2zim " + " ".join(warc2zim_args))
|
logger.info("Running: warc2zim " + " ".join(warc2zim_args))
|
||||||
|
|
@ -1036,7 +1039,6 @@ def run(raw_args):
|
||||||
warc_files.append(Path(extract_path))
|
warc_files.append(Path(extract_path))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|
||||||
logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
|
logger.info(f"Running browsertrix-crawler crawl: {cmd_line}")
|
||||||
crawl = subprocess.run(crawler_args, check=False)
|
crawl = subprocess.run(crawler_args, check=False)
|
||||||
if (
|
if (
|
||||||
|
|
@ -1091,7 +1093,7 @@ def run(raw_args):
|
||||||
logger.info("----------")
|
logger.info("----------")
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Processing WARC files in/at "
|
f"Processing WARC files in/at "
|
||||||
f'{" ".join(str(warc_file) for warc_file in warc_files)}'
|
f"{' '.join(str(warc_file) for warc_file in warc_files)}"
|
||||||
)
|
)
|
||||||
warc2zim_args.extend(str(warc_file) for warc_file in warc_files)
|
warc2zim_args.extend(str(warc_file) for warc_file in warc_files)
|
||||||
|
|
||||||
|
|
|
||||||
14
tests/conftest.py
Normal file
14
tests/conftest.py
Normal file
|
|
@ -0,0 +1,14 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from zimit import zimit as app
|
||||||
|
|
||||||
|
"""
|
||||||
|
cleanup() is disabled because atexit hooks run at the very end of the Python process
|
||||||
|
shutdown. By the time cleanup() is called, the logging module has already closed its
|
||||||
|
file streams.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def disable_zimit_cleanup(monkeypatch):
    """Swap the zimit module's ``cleanup`` for a no-op around every test.

    The fixture is ``autouse``, so it applies to each test without being
    requested; ``monkeypatch`` restores the original attribute afterwards.
    """

    def _noop():
        return None

    monkeypatch.setattr(app, "cleanup", _noop)
|
||||||
BIN
tests/data/example-response.warc
Normal file
BIN
tests/data/example-response.warc
Normal file
Binary file not shown.
83
tests/test_overwrite.py
Normal file
83
tests/test_overwrite.py
Normal file
|
|
@ -0,0 +1,83 @@
|
||||||
|
import pathlib
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from zimit.zimit import run
|
||||||
|
|
||||||
|
TEST_DATA_DIR = pathlib.Path(__file__).parent / "data"
|
||||||
|
|
||||||
|
|
||||||
|
def test_overwrite_flag_behaviour(tmp_path):
    """Check that an existing ZIM file is only replaced when ``--overwrite`` is set.

    Four consecutive invocations of ``run`` target the same output file:

    1. first run, from the sample WARC: expected to create the ZIM file;
    2. same arguments again: expected to exit with code 2;
    3. same again but without ``--warcs``: expected to exit with code 2;
    4. with ``--overwrite``: expected to succeed, the file still being present.
    """
    zim_output = "overwrite-test.zim"
    output_path = tmp_path / zim_output

    def build_args(*, with_warcs=True, overwrite=False):
        """Return a fresh argument list for ``run`` targeting ``output_path``."""
        args = ["--seeds", "https://example.com"]
        if with_warcs:
            args += ["--warcs", str(TEST_DATA_DIR / "example-response.warc")]
        args += [
            "--output",
            str(tmp_path),
            "--zim-file",
            zim_output,
            "--name",
            "overwrite-test",
        ]
        if overwrite:
            args.append("--overwrite")
        return args

    # 1st run → creates file
    result = run(build_args())
    assert result in (None, 100)
    assert output_path.exists()

    # 2nd run, no overwrite → should fail
    with pytest.raises(SystemExit) as exc:
        run(build_args())
    assert exc.value.code == 2

    # 3rd run, no overwrite and no --warcs → should fail as well
    with pytest.raises(SystemExit) as exc:
        run(build_args(with_warcs=False))
    assert exc.value.code == 2

    # 4th run, with overwrite → should succeed
    result = run(build_args(overwrite=True))
    assert result in (None, 100)
    assert output_path.exists()
|
||||||
Loading…
Add table
Add a link
Reference in a new issue