Merge pull request #285 from openzim/crawler_beta5

Upgrade browsertrix crawler and remove redirect handling
This commit is contained in:
benoit74 2024-03-07 11:25:02 +01:00 committed by GitHub
commit 867d14fd00
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 13 additions and 47 deletions

View file

@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Using `warc2zim2` warc2zim ⚠️ change before releasing!
- Build temporary `zimit2` Docker image for testing ⚠️ remove before releasing!
- Adopt Python bootstrap conventions
- Removed handling of redirects by zimit, they are handled by browsertrix crawler and detected properly by warc2zim
- Upgrade to Python 3.12 + upgrade dependencies
## [1.6.3] - 2024-01-18

View file

@ -1,4 +1,4 @@
FROM webrecorder/browsertrix-crawler:0.12.4
FROM webrecorder/browsertrix-crawler:1.0.0-beta.6
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
# add deadsnakes ppa for Python 3.12 on Ubuntu Jammy

View file

@ -20,8 +20,6 @@ from pathlib import Path
import inotify
import inotify.adapters
import requests
from tld import get_fld
from warc2zim.main import main as warc2zim
from zimscraperlib.logging import getLogger
from zimscraperlib.uri import rebuild_uri
@ -393,7 +391,7 @@ def run(raw_args):
user_agent += f" {zimit_args.adminEmail}"
if url:
url = check_url(url, user_agent, zimit_args.scopeType)
url = get_cleaned_url(url)
warc2zim_args.append("--url")
warc2zim_args.append(url)
@ -509,48 +507,14 @@ def run(raw_args):
return warc2zim(warc2zim_args)
def get_cleaned_url(url: str):
    """Return *url* normalized the way browsers display it.

    Strips an explicit default-for-scheme port (443 for https, 80 for
    http) so the URL handed to warc2zim matches what the crawler records.

    Redirect checking that used to live here (the old ``check_url``) is
    intentionally gone: redirects are now handled by browsertrix crawler
    and detected by warc2zim, so no network request is made here.
    """
    parsed_url = urllib.parse.urlparse(url)

    # remove explicit port in URI for default-for-scheme as browsers do it
    if parsed_url.scheme == "https" and parsed_url.port == 443:  # noqa: PLR2004
        parsed_url = rebuild_uri(parsed_url, port="")
    if parsed_url.scheme == "http" and parsed_url.port == 80:  # noqa: PLR2004
        parsed_url = rebuild_uri(parsed_url, port="")

    return parsed_url.geturl()

View file

@ -65,12 +65,12 @@ def test_stats_output():
}
with open("/output/warc2zim.json") as fh:
assert json.loads(fh.read()) == {
"written": 8,
"total": 8,
"written": 7,
"total": 7,
}
with open("/output/stats.json") as fh:
assert json.loads(fh.read()) == {
"done": 8,
"total": 8,
"done": 7,
"total": 7,
"limit": {"max": 0, "hit": False},
}