mirror of
https://github.com/openzim/warc2zim.git
synced 2025-10-19 06:23:16 +00:00
Properly sort WARC files and log total number of WARCs found
This commit is contained in:
parent
b2deb60e8c
commit
f08bfc61cc
4 changed files with 73 additions and 6 deletions
|
@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
### Fixed
|
||||
|
||||
- Handle case where the redirect target is bad / unsupported (#332 and #356)
|
||||
- Fixed WARC files handling order to follow creation order (#366)
|
||||
|
||||
## [2.0.3] - 2024-07-24
|
||||
|
||||
|
|
|
@ -188,6 +188,15 @@ class Converter:
|
|||
self.failed_content_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
self.inputs = args.inputs
|
||||
|
||||
# sort by filename (not full path) alphabetically to process WARC by crawl time
|
||||
# in general (at least when browsertrix crawler is used with zimit, not sure for
|
||||
# **pure** warc2zim scenarios)
|
||||
self.warc_files = sorted(
|
||||
iter_file_or_dir(self.inputs), key=lambda filename: Path(filename).name
|
||||
)
|
||||
logger.debug(f"{len(self.warc_files)} WARC files found")
|
||||
|
||||
self.include_domains = args.include_domains
|
||||
|
||||
self.custom_css = args.custom_css
|
||||
|
@ -336,7 +345,7 @@ class Converter:
|
|||
if self.custom_css:
|
||||
self.add_custom_css_item()
|
||||
|
||||
for record in iter_warc_records(self.inputs):
|
||||
for record in iter_warc_records(self.warc_files):
|
||||
try:
|
||||
self.add_items_for_warc_record(record)
|
||||
except Exception as exc:
|
||||
|
@ -390,7 +399,7 @@ class Converter:
|
|||
|
||||
def gather_information_from_warc(self):
|
||||
main_page_found = False
|
||||
for record in iter_warc_records(self.inputs):
|
||||
for record in iter_warc_records(self.warc_files):
|
||||
|
||||
# only response records can be considered as main_path and as existing ZIM
|
||||
# path
|
||||
|
@ -662,7 +671,7 @@ class Converter:
|
|||
|
||||
if self.favicon_url or self.favicon_path:
|
||||
# look into WARC records
|
||||
for record in iter_warc_records(self.inputs):
|
||||
for record in iter_warc_records(self.warc_files):
|
||||
if record.rec_type != "response":
|
||||
continue
|
||||
url = get_record_url(record)
|
||||
|
@ -817,9 +826,9 @@ class Converter:
|
|||
)
|
||||
|
||||
|
||||
def iter_warc_records(inputs):
|
||||
def iter_warc_records(warc_files):
|
||||
"""iter warc records, including appending request data to matching response"""
|
||||
for filename in iter_file_or_dir(inputs):
|
||||
for filename in warc_files:
|
||||
with open(filename, "rb") as fh:
|
||||
for record in buffering_record_iter(ArchiveIterator(fh), post_append=True):
|
||||
if record and record.rec_type in ("resource", "response", "revisit"):
|
||||
|
|
|
@ -8,7 +8,7 @@ from warc2zim.converter import Converter
|
|||
from warc2zim.utils import get_version
|
||||
|
||||
|
||||
def main(raw_args=None):
|
||||
def _create_arguments_parser() -> ArgumentParser:
|
||||
parser = ArgumentParser(description="Create ZIM files from WARC files")
|
||||
|
||||
parser.add_argument("-V", "--version", action="version", version=get_version())
|
||||
|
@ -141,6 +141,11 @@ def main(raw_args=None):
|
|||
default=False,
|
||||
)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def main(raw_args=None):
    """Command-line entry point: parse arguments and run the conversion.

    ``raw_args`` is handed to ``ArgumentParser.parse_args``; when it is
    ``None``, argparse falls back to the process command line.

    Returns whatever ``Converter.run()`` returns.
    """
    args = _create_arguments_parser().parse_args(args=raw_args)
    return Converter(args).run()
|
||||
|
|
52
tests/test_converter.py
Normal file
52
tests/test_converter.py
Normal file
|
@ -0,0 +1,52 @@
|
|||
import tempfile
|
||||
|
||||
import pytest
|
||||
|
||||
from warc2zim.converter import Converter
|
||||
from warc2zim.main import _create_arguments_parser
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "inputs, warc_files",
    [
        pytest.param([], [], id="empty_array"),
        pytest.param(["foo.warc.gz"], ["foo.warc.gz"], id="one_file"),
        pytest.param(
            [
                "rec-f9c30d949953-20240724035746176-0.warc.gz",
                "rec-f9c30d949953-20240724045846176-0.warc.gz",
            ],
            None,  # no change
            id="two_already_sorted",
        ),
        pytest.param(
            [
                "rec-f9c30d949953-20240724045846176-0.warc.gz",
                "rec-f9c30d949953-20240724035746176-0.warc.gz",
            ],
            [
                "rec-f9c30d949953-20240724035746176-0.warc.gz",
                "rec-f9c30d949953-20240724045846176-0.warc.gz",
            ],
            id="two_not_sorted",
        ),
        pytest.param(
            [
                "aaaa/rec-f9c30d949953-20240724045846176-0.warc.gz",
                "bbb/rec-f9c30d949953-20240724035746176-0.warc.gz",
            ],
            [
                "bbb/rec-f9c30d949953-20240724035746176-0.warc.gz",
                "aaaa/rec-f9c30d949953-20240724045846176-0.warc.gz",
            ],
            id="two_not_sorted_in_random_unsorted_dirs",
        ),
    ],
)
def test_sort_warc_files(inputs, warc_files):
    """Check the order of ``Converter.warc_files`` for a given list of inputs.

    ``inputs`` is assigned to ``args.inputs`` before building the converter.
    ``warc_files`` is the expected resulting order; ``None`` means the order
    is expected to be identical to ``inputs``.
    """
    parser = _create_arguments_parser()
    # Use a self-cleaning temporary directory so that each parametrized case
    # does not leave an output folder behind on disk.
    with tempfile.TemporaryDirectory() as tmpdir:
        args = parser.parse_args(["--name", "foo", "--output", tmpdir])
        args.inputs = inputs
        conv = Converter(args)
        # Compare against None explicitly: an empty expected list is a real
        # expectation, not the "no change" marker.
        expected = inputs if warc_files is None else warc_files
        assert conv.warc_files == expected
|
Loading…
Add table
Add a link
Reference in a new issue