diff --git a/CHANGELOG.md b/CHANGELOG.md index c64f9ab..2ab1684 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Handle case where the redirect target is bad / unsupported (#332 and #356) +- Fixed WARC files handling order to follow creation order (#366) ## [2.0.3] - 2024-07-24 diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 5fd0483..bf666f3 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -188,6 +188,15 @@ class Converter: self.failed_content_path.mkdir(parents=True, exist_ok=True) self.inputs = args.inputs + + # sort by filename (not full path) alphabetically to process WARC by crawl time + # in general (at least when browsertrix crawler is used with zimit, not sure for + # **pure** warc2zim scenarios) + self.warc_files = sorted( + iter_file_or_dir(self.inputs), key=lambda filename: Path(filename).name + ) + logger.debug(f"{len(self.warc_files)} WARC files found") + self.include_domains = args.include_domains self.custom_css = args.custom_css @@ -336,7 +345,7 @@ class Converter: if self.custom_css: self.add_custom_css_item() - for record in iter_warc_records(self.inputs): + for record in iter_warc_records(self.warc_files): try: self.add_items_for_warc_record(record) except Exception as exc: @@ -390,7 +399,7 @@ class Converter: def gather_information_from_warc(self): main_page_found = False - for record in iter_warc_records(self.inputs): + for record in iter_warc_records(self.warc_files): # only response records can be considered as main_path and as existing ZIM # path @@ -662,7 +671,7 @@ class Converter: if self.favicon_url or self.favicon_path: # look into WARC records - for record in iter_warc_records(self.inputs): + for record in iter_warc_records(self.warc_files): if record.rec_type != "response": continue url = get_record_url(record) @@ -817,9 +826,9 @@ class Converter: ) -def iter_warc_records(inputs): 
+def iter_warc_records(warc_files): """iter warc records, including appending request data to matching response""" - for filename in iter_file_or_dir(inputs): + for filename in warc_files: with open(filename, "rb") as fh: for record in buffering_record_iter(ArchiveIterator(fh), post_append=True): if record and record.rec_type in ("resource", "response", "revisit"): diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py index 7d3a055..7487bf5 100644 --- a/src/warc2zim/main.py +++ b/src/warc2zim/main.py @@ -8,7 +8,7 @@ from warc2zim.converter import Converter from warc2zim.utils import get_version -def main(raw_args=None): +def _create_arguments_parser() -> ArgumentParser: parser = ArgumentParser(description="Create ZIM files from WARC files") parser.add_argument("-V", "--version", action="version", version=get_version()) @@ -141,6 +141,11 @@ def main(raw_args=None): default=False, ) + return parser + + +def main(raw_args=None): + parser = _create_arguments_parser() args = parser.parse_args(args=raw_args) converter = Converter(args) return converter.run() diff --git a/tests/test_converter.py b/tests/test_converter.py new file mode 100644 index 0000000..b38c331 --- /dev/null +++ b/tests/test_converter.py @@ -0,0 +1,52 @@ +import tempfile + +import pytest + +from warc2zim.converter import Converter +from warc2zim.main import _create_arguments_parser + + +@pytest.mark.parametrize( + "inputs, warc_files", + [ + pytest.param([], [], id="empty_array"), + pytest.param(["foo.warc.gz"], ["foo.warc.gz"], id="one_file"), + pytest.param( + [ + "rec-f9c30d949953-20240724035746176-0.warc.gz", + "rec-f9c30d949953-20240724045846176-0.warc.gz", + ], + None, # None: the assert below falls back to comparing against inputs + id="two_already_sorted", + ), + pytest.param( + [ + "rec-f9c30d949953-20240724045846176-0.warc.gz", + "rec-f9c30d949953-20240724035746176-0.warc.gz", + ], + [ + "rec-f9c30d949953-20240724035746176-0.warc.gz", + "rec-f9c30d949953-20240724045846176-0.warc.gz", + ], + id="two_not_sorted", + ), + pytest.param( + [ 
"aaaa/rec-f9c30d949953-20240724045846176-0.warc.gz", + "bbb/rec-f9c30d949953-20240724035746176-0.warc.gz", + ], + [ + "bbb/rec-f9c30d949953-20240724035746176-0.warc.gz", + "aaaa/rec-f9c30d949953-20240724045846176-0.warc.gz", + ], + id="two_not_sorted_in_random_unsorted_dirs", + ), + ], +) +def test_sort_warc_files(inputs, warc_files): + parser = _create_arguments_parser() + tmpdir = tempfile.mkdtemp() + args = parser.parse_args(["--name", "foo", "--output", tmpdir]) + args.inputs = inputs + conv = Converter(args) + assert conv.warc_files == (warc_files if warc_files else inputs)