mirror of
https://github.com/openzim/zimit.git
synced 2025-12-31 04:23:15 +00:00
Sort WARC directories passed to zimit by modification time
This commit is contained in:
parent
0d5a08c912
commit
eb32adfea7
2 changed files with 10 additions and 2 deletions
|
|
@@ -21,6 +21,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
- Stop fetching and passing browsertrix crawler version as scraperSuffix to warc2zim (#354)
|
||||
- Do not log number of WARC files found (#357)
|
||||
|
||||
### Fixed
|
||||
|
||||
- Sort WARC directories found by modification time (#366)
|
||||
|
||||
## [2.0.6] - 2024-08-02
|
||||
|
||||
### Changed
|
||||
|
|
|
|||
|
|
@@ -586,14 +586,18 @@ def run(raw_args):
|
|||
]
|
||||
|
||||
else:
|
||||
warc_dirs = list(temp_root_dir.rglob("collections/crawl-*/archive/"))
|
||||
warc_dirs = sorted(
|
||||
temp_root_dir.rglob("collections/crawl-*/archive/"),
|
||||
key=lambda path: path.lstat().st_mtime,
|
||||
)
|
||||
if len(warc_dirs) == 0:
|
||||
raise RuntimeError(
|
||||
"Failed to find directory where WARC files have been created"
|
||||
)
|
||||
elif len(warc_dirs) > 1:
|
||||
logger.info(
|
||||
"Found many WARC files directories, only last one will be used"
|
||||
"Found many WARC files directories, only most recently modified one"
|
||||
" will be used"
|
||||
)
|
||||
for directory in warc_dirs:
|
||||
logger.info(f"- {directory}")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue