Merge pull request #524 from Routhinator/issue-490-resume-crawl-from-interrupt

Fixes: #499 - Resolve issues preventing graceful crawl resumption after interrupt
This commit is contained in:
benoit74 2025-11-08 10:55:07 +01:00 committed by GitHub
commit 277473884e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 16 additions and 6 deletions

View file

@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ## [Unreleased]
### Changed
- Fix issues preventing interrupted crawls from being resumed. (#499)
- Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist.
- Use all warc_dirs found instead of just the latest so interrupted crawls use all collected pages across runs when an explicit collections directory is not passed.
- Don't clean up an explicitly passed build directory.
## [3.0.5] - 2024-04-11 ## [3.0.5] - 2024-04-11
### Changed ### Changed

View file

@ -796,11 +796,14 @@ def run(raw_args):
if known_args.adminEmail: if known_args.adminEmail:
user_agent_suffix += f" {known_args.adminEmail}" user_agent_suffix += f" {known_args.adminEmail}"
# make temp dir for this crawl # set temp dir to use for this crawl
global temp_root_dir # noqa: PLW0603 global temp_root_dir # noqa: PLW0603
if known_args.build: if known_args.build:
temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.build, prefix=".tmp")) # use build dir argument if passed
temp_root_dir = Path(known_args.build)
temp_root_dir.mkdir(parents=True, exist_ok=True)
else: else:
# make new randomized temp dir
temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp")) temp_root_dir = Path(tempfile.mkdtemp(dir=known_args.output, prefix=".tmp"))
seeds = [] seeds = []
@ -854,7 +857,8 @@ def run(raw_args):
logger.info("Exiting, invalid warc2zim params") logger.info("Exiting, invalid warc2zim params")
return EXIT_CODE_WARC2ZIM_CHECK_FAILED return EXIT_CODE_WARC2ZIM_CHECK_FAILED
if not known_args.keep: # only trigger cleanup when neither a custom build dir nor the keep argument is passed.
if not known_args.build and not known_args.keep:
atexit.register(cleanup) atexit.register(cleanup)
# copy / download custom behaviors to one single folder and configure crawler # copy / download custom behaviors to one single folder and configure crawler
@ -1076,12 +1080,12 @@ def run(raw_args):
) )
elif len(warc_dirs) > 1: elif len(warc_dirs) > 1:
logger.info( logger.info(
"Found many WARC files directories, only most recently modified one" "Found many WARC files directories, combining pages from all "
" will be used" "of them"
) )
for directory in warc_dirs: for directory in warc_dirs:
logger.info(f"- {directory}") logger.info(f"- {directory}")
warc_files = [warc_dirs[-1]] warc_files = warc_dirs
logger.info("") logger.info("")
logger.info("----------") logger.info("----------")