diff --git a/contrib/cleanup_log.py b/contrib/cleanup_log.py index da0b75d..336a15c 100644 --- a/contrib/cleanup_log.py +++ b/contrib/cleanup_log.py @@ -2,8 +2,9 @@ import re import sys from pathlib import Path +from zimscraperlib.rewriting.url_rewriting import ZimPath + from warc2zim.constants import logger -from warc2zim.url_rewriting import ZimPath def notify(_: ZimPath): diff --git a/contrib/html_convert.py b/contrib/html_convert.py index af3d43b..30aac00 100644 --- a/contrib/html_convert.py +++ b/contrib/html_convert.py @@ -1,4 +1,4 @@ -""" html rewrite test utility +"""html rewrite test utility This utility takes a given HTML content as input, base64 encoded, its original URL, and rewrites its content. @@ -17,9 +17,10 @@ import logging import sys from pathlib import Path +from zimscraperlib.rewriting.html import HtmlRewriter +from zimscraperlib.rewriting.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath + from warc2zim.constants import logger -from warc2zim.content_rewriting.html import HtmlRewriter -from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath from warc2zim.utils import to_string @@ -37,7 +38,9 @@ def main(path_to_content: str, article_url: str, encoding: str | None = None): content = Path(path_to_content) url_rewriter = ArticleUrlRewriter( - HttpUrl(article_url), existing_zim_paths=set(), missing_zim_paths=set() + article_url=HttpUrl(article_url), + existing_zim_paths=set(), + missing_zim_paths=set(), ) html_rewriter = HtmlRewriter(url_rewriter, "", None, notify) diff --git a/contrib/marxists.org.py b/contrib/marxists.org.py index ccd18db..b20540f 100644 --- a/contrib/marxists.org.py +++ b/contrib/marxists.org.py @@ -1,4 +1,4 @@ -""" MIA English exclude list +"""MIA English exclude list This utility computes the list of all subpages/languages that must be ignored for the English ZIM of The Marxists Internet Archive (MIA) at www.marxists.org. 
@@ -23,9 +23,9 @@ soup = BeautifulSoup(resp.text, "html.parser") subfolders = set() REGEX = re.compile(r"\.\.\/(?P<subfolder>.*?)\/") for anchor in soup.find_all("a"): - if not anchor.has_attr("href"): + if not anchor.has_attr("href"): # pyright: ignore continue - if match := REGEX.match(anchor["href"]): + if match := REGEX.match(anchor["href"]): # pyright: ignore subfolders.add(match.group("subfolder")) print("|".join(sorted(subfolders))) # noqa: T201 diff --git a/src/warc2zim/constants.py b/src/warc2zim/constants.py index 591db5b..8755109 100644 --- a/src/warc2zim/constants.py +++ b/src/warc2zim/constants.py @@ -1,4 +1,4 @@ -from zimscraperlib import getLogger +from zimscraperlib.logging import getLogger # Shared logger with default log level at this stage logger = getLogger("warc2zim") diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py index 04bfa46..a476b79 100644 --- a/src/warc2zim/converter.py +++ b/src/warc2zim/converter.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # vim: ai ts=4 sts=4 et sw=4 nu -""" warc2zim conversion utility +"""warc2zim conversion utility This utility provides a conversion from WARC records to ZIM files. 
WARC record are directly stored in a zim file as: @@ -739,7 +739,7 @@ class Converter: lang_elem = soup.find("html", attrs={"lang": True}) if lang_elem: self.language = parse_language( - lang_elem.attrs[ # pyright: ignore[reportGeneralTypeIssues ,reportAttributeAccessIssue] + lang_elem.attrs[ # pyright: ignore[reportArgumentType, reportAttributeAccessIssue] "lang" ] ) @@ -751,7 +751,7 @@ class Converter: ) if lang_elem: self.language = parse_language( - lang_elem.attrs[ # pyright: ignore[reportGeneralTypeIssues ,reportAttributeAccessIssue] + lang_elem.attrs[ # pyright: ignore[reportArgumentType ,reportAttributeAccessIssue] "content" ] ) @@ -761,7 +761,7 @@ class Converter: lang_elem = soup.find("meta", {"name": "language", "content": True}) if lang_elem: self.language = parse_language( - lang_elem.attrs[ # pyright: ignore[reportGeneralTypeIssues ,reportAttributeAccessIssue] + lang_elem.attrs[ # pyright: ignore[reportArgumentType ,reportAttributeAccessIssue] "content" ] ) diff --git a/src/warc2zim/items.py b/src/warc2zim/items.py index 07d7547..a446e07 100644 --- a/src/warc2zim/items.py +++ b/src/warc2zim/items.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # vim: ai ts=4 sts=4 et sw=4 nu -""" warc2zim's item classes +"""warc2zim's item classes This module contains the differents Item we may want to add to a Zim archive. """ diff --git a/src/warc2zim/rewriting.py b/src/warc2zim/rewriting.py index 11568cc..d0e1d68 100644 --- a/src/warc2zim/rewriting.py +++ b/src/warc2zim/rewriting.py @@ -27,7 +27,7 @@ JSONP_CALLBACK_REGEX = re.compile(r"[?].*(?:callback|jsonp)=([^&]+)", re.I) def no_title( - function: Callable[..., str | bytes] + function: Callable[..., str | bytes], ) -> Callable[..., tuple[str, str | bytes]]: """Decorator for methods transforming content without extracting a title. 
diff --git a/tests/test_utils.py b/tests/test_utils.py index f214cf9..efb259c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -258,7 +258,7 @@ class CharsetsTestData: expected_strings: list[str] -def get_testdata() -> Generator[CharsetsTestData, None, None]: +def get_testdata() -> Generator[CharsetsTestData]: data = json.loads( (Path(__file__).parent / "encodings" / "definition.json").read_bytes() ) diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index 73c891d..ce7fcfd 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -10,8 +10,9 @@ from urllib.parse import unquote import pytest import requests -from zimscraperlib.image.conversion import convert_image, convert_svg2png, resize_image +from zimscraperlib.image.conversion import convert_image, convert_svg2png from zimscraperlib.image.probing import format_for +from zimscraperlib.image.transformation import resize_image from zimscraperlib.zim import Archive from warc2zim.__about__ import __version__