mirror of
https://github.com/openzim/zimit.git
synced 2025-12-31 04:23:15 +00:00
Merge pull request #524 from Routhinator/issue-490-resume-crawl-from-interrupt
Fixes: #499 - Resolve issues preventing graceful crawl resumption after interrupt
This commit is contained in:
commit
277473884e
2 changed files with 16 additions and 6 deletions
|
|
@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- Fix issues preventing interrupted crawls from being resumed. (#499)
|
||||||
|
- When a build directory is passed, use it directly instead of a randomized subdirectory, and create it if it does not exist.
|
||||||
|
- When an explicit collections directory is not passed, use all warc_dirs found instead of just the latest, so that resumed crawls include the pages collected across all runs.
|
||||||
|
- Don't clean up an explicitly passed build directory.
|
||||||
|
|
||||||
## [3.0.5] - 2024-04-11
|
## [3.0.5] - 2024-04-11
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
|
|
|
||||||
|
|
@ -796,11 +796,14 @@ def run(raw_args):
|
||||||
if known_args.adminEmail:
|
if known_args.adminEmail:
|
||||||
user_agent_suffix += f" {known_args.adminEmail}"
|
user_agent_suffix += f" {known_args.adminEmail}"
|
||||||
|
|
||||||
# make temp dir for this crawl
|
# set temp dir to use for this crawl
|
||||||
global temp_root_dir # noqa: PLW0603
|
global temp_root_dir # noqa: PLW0603
|
||||||
if known_args.build:
|
if known_args.build:
|
||||||
temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.build, prefix=".tmp"))
|
# use build dir argument if passed
|
||||||
|
temp_root_dir = Path(known_args.build)
|
||||||
|
temp_root_dir.mkdir(parents=True, exist_ok=True)
|
||||||
else:
|
else:
|
||||||
|
# make new randomized temp dir
|
||||||
temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp"))
|
temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp"))
|
||||||
|
|
||||||
seeds = []
|
seeds = []
|
||||||
|
|
@ -854,7 +857,8 @@ def run(raw_args):
|
||||||
logger.info("Exiting, invalid warc2zim params")
|
logger.info("Exiting, invalid warc2zim params")
|
||||||
return EXIT_CODE_WARC2ZIM_CHECK_FAILED
|
return EXIT_CODE_WARC2ZIM_CHECK_FAILED
|
||||||
|
|
||||||
if not known_args.keep:
|
# only trigger cleanup when neither a custom build dir nor the keep argument is passed.
|
||||||
|
if not known_args.build and not known_args.keep:
|
||||||
atexit.register(cleanup)
|
atexit.register(cleanup)
|
||||||
|
|
||||||
# copy / download custom behaviors to one single folder and configure crawler
|
# copy / download custom behaviors to one single folder and configure crawler
|
||||||
|
|
@ -1076,12 +1080,12 @@ def run(raw_args):
|
||||||
)
|
)
|
||||||
elif len(warc_dirs) > 1:
|
elif len(warc_dirs) > 1:
|
||||||
logger.info(
|
logger.info(
|
||||||
"Found many WARC files directories, only most recently modified one"
|
"Found many WARC files directories, combining pages from all "
|
||||||
" will be used"
|
"of them"
|
||||||
)
|
)
|
||||||
for directory in warc_dirs:
|
for directory in warc_dirs:
|
||||||
logger.info(f"- {directory}")
|
logger.info(f"- {directory}")
|
||||||
warc_files = [warc_dirs[-1]]
|
warc_files = warc_dirs
|
||||||
|
|
||||||
logger.info("")
|
logger.info("")
|
||||||
logger.info("----------")
|
logger.info("----------")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue