Sort WARC directories passed to zimit by modification time

This commit is contained in:
benoit74 2024-08-07 11:59:46 +00:00
parent 0d5a08c912
commit eb32adfea7
No known key found for this signature in database
GPG key ID: B89606434FC7B530
2 changed files with 10 additions and 2 deletions

View file

@ -21,6 +21,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Stop fetching and passing browsertrix crawler version as scraperSuffix to warc2zim (#354)
- Do not log number of WARC files found (#357)
### Fixed
- Sort the WARC directories found by their modification time, so that the most recently modified one is used (#366)
## [2.0.6] - 2024-08-02
### Changed

View file

@ -586,14 +586,18 @@ def run(raw_args):
]
else:
warc_dirs = list(temp_root_dir.rglob("collections/crawl-*/archive/"))
warc_dirs = sorted(
temp_root_dir.rglob("collections/crawl-*/archive/"),
key=lambda path: path.lstat().st_mtime,
)
if len(warc_dirs) == 0:
raise RuntimeError(
"Failed to find directory where WARC files have been created"
)
elif len(warc_dirs) > 1:
logger.info(
"Found many WARC files directories, only last one will be used"
"Found many WARC files directories, only most recently modified one"
" will be used"
)
for directory in warc_dirs:
logger.info(f"- {directory}")