mirror of
https://github.com/openzim/zimit.git
synced 2025-12-31 04:23:15 +00:00
Merge pull request #285 from openzim/crawler_beta5
Upgrade browsertrix crawler and remove redirect handling
This commit is contained in:
commit
867d14fd00
4 changed files with 13 additions and 47 deletions
|
|
@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
- Using `warc2zim2` warc2zim ⚠️ change before releasing!
|
||||
- Build temporary `zimit2` Docker image for testing ⚠️ remove before releasing!
|
||||
- Adopt Python bootstrap conventions
|
||||
- Removed handling of redirects by zimit; they are now handled by browsertrix crawler and detected properly by warc2zim
|
||||
- Upgrade to Python 3.12 + upgrade dependencies
|
||||
|
||||
## [1.6.3] - 2024-01-18
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
FROM webrecorder/browsertrix-crawler:0.12.4
|
||||
FROM webrecorder/browsertrix-crawler:1.0.0-beta.6
|
||||
LABEL org.opencontainers.image.source https://github.com/openzim/zimit
|
||||
|
||||
# add deadsnakes ppa for Python 3.12 on Ubuntu Jammy
|
||||
|
|
|
|||
|
|
@ -20,8 +20,6 @@ from pathlib import Path
|
|||
|
||||
import inotify
|
||||
import inotify.adapters
|
||||
import requests
|
||||
from tld import get_fld
|
||||
from warc2zim.main import main as warc2zim
|
||||
from zimscraperlib.logging import getLogger
|
||||
from zimscraperlib.uri import rebuild_uri
|
||||
|
|
@ -393,7 +391,7 @@ def run(raw_args):
|
|||
user_agent += f" {zimit_args.adminEmail}"
|
||||
|
||||
if url:
|
||||
url = check_url(url, user_agent, zimit_args.scopeType)
|
||||
url = get_cleaned_url(url)
|
||||
warc2zim_args.append("--url")
|
||||
warc2zim_args.append(url)
|
||||
|
||||
|
|
@ -509,48 +507,14 @@ def run(raw_args):
|
|||
return warc2zim(warc2zim_args)
|
||||
|
||||
|
||||
def check_url(url: str, user_agent: str, scope: str | None = None):
|
||||
def get_cleaned_url(url: str):
|
||||
parsed_url = urllib.parse.urlparse(url)
|
||||
try:
|
||||
with requests.get(
|
||||
parsed_url.geturl(),
|
||||
stream=True,
|
||||
allow_redirects=True,
|
||||
timeout=(12.2, 27),
|
||||
headers={"User-Agent": user_agent},
|
||||
) as resp:
|
||||
resp.raise_for_status()
|
||||
except requests.exceptions.RequestException as exc:
|
||||
logger.info(f"failed to connect to {parsed_url.geturl()}: {exc}")
|
||||
raise SystemExit(1) from None
|
||||
actual_url = urllib.parse.urlparse(resp.url)
|
||||
|
||||
# remove the explicit port from the URI when it is the default for the scheme, as browsers do
|
||||
if actual_url.scheme == "https" and actual_url.port == 443: # noqa: PLR2004
|
||||
actual_url = rebuild_uri(actual_url, port="")
|
||||
if actual_url.scheme == "http" and actual_url.port == 80: # noqa: PLR2004
|
||||
actual_url = rebuild_uri(actual_url, port="")
|
||||
|
||||
if actual_url.geturl() != parsed_url.geturl():
|
||||
if scope in (None, "any"):
|
||||
return actual_url.geturl()
|
||||
|
||||
logger.info(
|
||||
"[WARN] Your URL ({}) redirects to {} which {} on same "
|
||||
"first-level domain. Depending on your scopeType ({}), "
|
||||
"your homepage might be out-of-scope. Please check!".format(
|
||||
parsed_url.geturl(),
|
||||
actual_url.geturl(),
|
||||
(
|
||||
"is"
|
||||
if get_fld(parsed_url.geturl()) == get_fld(actual_url.geturl())
|
||||
else "is not"
|
||||
),
|
||||
scope,
|
||||
)
|
||||
)
|
||||
|
||||
return actual_url.geturl()
|
||||
if parsed_url.scheme == "https" and parsed_url.port == 443: # noqa: PLR2004
|
||||
parsed_url = rebuild_uri(parsed_url, port="")
|
||||
if parsed_url.scheme == "http" and parsed_url.port == 80: # noqa: PLR2004
|
||||
parsed_url = rebuild_uri(parsed_url, port="")
|
||||
|
||||
return parsed_url.geturl()
|
||||
|
||||
|
|
|
|||
|
|
@ -65,12 +65,12 @@ def test_stats_output():
|
|||
}
|
||||
with open("/output/warc2zim.json") as fh:
|
||||
assert json.loads(fh.read()) == {
|
||||
"written": 8,
|
||||
"total": 8,
|
||||
"written": 7,
|
||||
"total": 7,
|
||||
}
|
||||
with open("/output/stats.json") as fh:
|
||||
assert json.loads(fh.read()) == {
|
||||
"done": 8,
|
||||
"total": 8,
|
||||
"done": 7,
|
||||
"total": 7,
|
||||
"limit": {"max": 0, "hit": False},
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue