Merge pull request #285 from openzim/crawler_beta5

Upgrade browsertrix crawler and remove redirect handling
This commit is contained in:
benoit74 2024-03-07 11:25:02 +01:00 committed by GitHub
commit 867d14fd00
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 13 additions and 47 deletions

View file

@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Using `warc2zim2` warc2zim ⚠️ change before releasing!
- Build temporary `zimit2` Docker image for testing ⚠️ remove before releasing!
- Adopt Python bootstrap conventions
- Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim
- Upgrade to Python 3.12 + upgrade dependencies
## [1.6.3] - 2024-01-18

View file

@ -1,4 +1,4 @@
FROM webrecorder/browsertrix-crawler:0.12.4
FROM webrecorder/browsertrix-crawler:1.0.0-beta.6
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
# add deadsnakes ppa for Python 3.12 on Ubuntu Jammy

View file

@ -20,8 +20,6 @@ from pathlib import Path
import inotify
import inotify.adapters
import requests
from tld import get_fld
from warc2zim.main import main as warc2zim
from zimscraperlib.logging import getLogger
from zimscraperlib.uri import rebuild_uri
@ -393,7 +391,7 @@ def run(raw_args):
user_agent += f" {zimit_args.adminEmail}"
if url:
url = check_url(url, user_agent, zimit_args.scopeType)
url = get_cleaned_url(url)
warc2zim_args.append("--url")
warc2zim_args.append(url)
@ -509,48 +507,14 @@ def run(raw_args):
return warc2zim(warc2zim_args)
def get_cleaned_url(url: str):
    """Return *url* normalized the way browsers display it.

    Strips an explicit default-for-scheme port (443 for https, 80 for
    http) so the URL handed to warc2zim matches what the crawler records.

    Redirect checking that used to live here (the old ``check_url``) is
    intentionally gone: redirects are now handled by browsertrix crawler
    and detected by warc2zim, so no network request is made here.
    """
    parsed_url = urllib.parse.urlparse(url)

    # remove explicit port in URI for default-for-scheme as browsers do it
    if parsed_url.scheme == "https" and parsed_url.port == 443:  # noqa: PLR2004
        parsed_url = rebuild_uri(parsed_url, port="")
    if parsed_url.scheme == "http" and parsed_url.port == 80:  # noqa: PLR2004
        parsed_url = rebuild_uri(parsed_url, port="")

    return parsed_url.geturl()

View file

@ -65,12 +65,12 @@ def test_stats_output():
}
with open("/output/warc2zim.json") as fh:
assert json.loads(fh.read()) == {
"written": 8,
"total": 8,
"written": 7,
"total": 7,
}
with open("/output/stats.json") as fh:
assert json.loads(fh.read()) == {
"done": 8,
"total": 8,
"done": 7,
"total": 7,
"limit": {"max": 0, "hit": False},
}