mirror of
https://github.com/openzim/warc2zim.git
synced 2025-10-19 14:33:17 +00:00
Fix linter / type checker issues
This commit is contained in:
parent
4c584cab75
commit
cd3251b978
9 changed files with 22 additions and 17 deletions
|
@ -2,8 +2,9 @@ import re
|
|||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from zimscraperlib.rewriting.url_rewriting import ZimPath
|
||||
|
||||
from warc2zim.constants import logger
|
||||
from warc2zim.url_rewriting import ZimPath
|
||||
|
||||
|
||||
def notify(_: ZimPath):
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
""" html rewrite test utility
|
||||
"""html rewrite test utility
|
||||
|
||||
This utility takes a given HTML content as input, base64 encoded, its original URL, and
|
||||
rewrites its content.
|
||||
|
@ -17,9 +17,10 @@ import logging
|
|||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from zimscraperlib.rewriting.html import HtmlRewriter
|
||||
from zimscraperlib.rewriting.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
|
||||
|
||||
from warc2zim.constants import logger
|
||||
from warc2zim.content_rewriting.html import HtmlRewriter
|
||||
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
|
||||
from warc2zim.utils import to_string
|
||||
|
||||
|
||||
|
@ -37,7 +38,9 @@ def main(path_to_content: str, article_url: str, encoding: str | None = None):
|
|||
content = Path(path_to_content)
|
||||
|
||||
url_rewriter = ArticleUrlRewriter(
|
||||
HttpUrl(article_url), existing_zim_paths=set(), missing_zim_paths=set()
|
||||
article_url=HttpUrl(article_url),
|
||||
existing_zim_paths=set(),
|
||||
missing_zim_paths=set(),
|
||||
)
|
||||
|
||||
html_rewriter = HtmlRewriter(url_rewriter, "", None, notify)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
""" MIA English exclude list
|
||||
"""MIA English exclude list
|
||||
|
||||
This utility computes the list of all subpages/languages that must be ignored for the
|
||||
English ZIM of The Marxists Internet Archive (MIA) at www.marxists.org.
|
||||
|
@ -23,9 +23,9 @@ soup = BeautifulSoup(resp.text, "html.parser")
|
|||
subfolders = set()
|
||||
REGEX = re.compile(r"\.\.\/(?P<subfolder>.*?)\/")
|
||||
for anchor in soup.find_all("a"):
|
||||
if not anchor.has_attr("href"):
|
||||
if not anchor.has_attr("href"): # pyright: ignore
|
||||
continue
|
||||
if match := REGEX.match(anchor["href"]):
|
||||
if match := REGEX.match(anchor["href"]): # pyright: ignore
|
||||
subfolders.add(match.group("subfolder"))
|
||||
|
||||
print("|".join(sorted(subfolders))) # noqa: T201
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from zimscraperlib import getLogger
|
||||
from zimscraperlib.logging import getLogger
|
||||
|
||||
# Shared logger with default log level at this stage
|
||||
logger = getLogger("warc2zim")
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/env python
|
||||
# vim: ai ts=4 sts=4 et sw=4 nu
|
||||
|
||||
""" warc2zim conversion utility
|
||||
"""warc2zim conversion utility
|
||||
|
||||
This utility provides a conversion from WARC records to ZIM files.
|
||||
WARC records are directly stored in a zim file as:
|
||||
|
@ -739,7 +739,7 @@ class Converter:
|
|||
lang_elem = soup.find("html", attrs={"lang": True})
|
||||
if lang_elem:
|
||||
self.language = parse_language(
|
||||
lang_elem.attrs[ # pyright: ignore[reportGeneralTypeIssues ,reportAttributeAccessIssue]
|
||||
lang_elem.attrs[ # pyright: ignore[reportArgumentType, reportAttributeAccessIssue]
|
||||
"lang"
|
||||
]
|
||||
)
|
||||
|
@ -751,7 +751,7 @@ class Converter:
|
|||
)
|
||||
if lang_elem:
|
||||
self.language = parse_language(
|
||||
lang_elem.attrs[ # pyright: ignore[reportGeneralTypeIssues ,reportAttributeAccessIssue]
|
||||
lang_elem.attrs[ # pyright: ignore[reportArgumentType ,reportAttributeAccessIssue]
|
||||
"content"
|
||||
]
|
||||
)
|
||||
|
@ -761,7 +761,7 @@ class Converter:
|
|||
lang_elem = soup.find("meta", {"name": "language", "content": True})
|
||||
if lang_elem:
|
||||
self.language = parse_language(
|
||||
lang_elem.attrs[ # pyright: ignore[reportGeneralTypeIssues ,reportAttributeAccessIssue]
|
||||
lang_elem.attrs[ # pyright: ignore[reportArgumentType ,reportAttributeAccessIssue]
|
||||
"content"
|
||||
]
|
||||
)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/env python
|
||||
# vim: ai ts=4 sts=4 et sw=4 nu
|
||||
|
||||
""" warc2zim's item classes
|
||||
"""warc2zim's item classes
|
||||
|
||||
This module contains the different Items we may want to add to a Zim archive.
|
||||
"""
|
||||
|
|
|
@ -27,7 +27,7 @@ JSONP_CALLBACK_REGEX = re.compile(r"[?].*(?:callback|jsonp)=([^&]+)", re.I)
|
|||
|
||||
|
||||
def no_title(
|
||||
function: Callable[..., str | bytes]
|
||||
function: Callable[..., str | bytes],
|
||||
) -> Callable[..., tuple[str, str | bytes]]:
|
||||
"""Decorator for methods transforming content without extracting a title.
|
||||
|
||||
|
|
|
@ -258,7 +258,7 @@ class CharsetsTestData:
|
|||
expected_strings: list[str]
|
||||
|
||||
|
||||
def get_testdata() -> Generator[CharsetsTestData, None, None]:
|
||||
def get_testdata() -> Generator[CharsetsTestData]:
|
||||
data = json.loads(
|
||||
(Path(__file__).parent / "encodings" / "definition.json").read_bytes()
|
||||
)
|
||||
|
|
|
@ -10,8 +10,9 @@ from urllib.parse import unquote
|
|||
|
||||
import pytest
|
||||
import requests
|
||||
from zimscraperlib.image.conversion import convert_image, convert_svg2png, resize_image
|
||||
from zimscraperlib.image.conversion import convert_image, convert_svg2png
|
||||
from zimscraperlib.image.probing import format_for
|
||||
from zimscraperlib.image.transformation import resize_image
|
||||
from zimscraperlib.zim import Archive
|
||||
|
||||
from warc2zim.__about__ import __version__
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue