From 61dc7926536e2f9589864e1ec97733a9890e6cd1 Mon Sep 17 00:00:00 2001 From: renaud gaudin Date: Wed, 2 Aug 2023 11:21:43 +0000 Subject: [PATCH] Fixed #191: --lang to crawler, --zim-lang to warc2zim --- CHANGELOG.md | 2 ++ zimit.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2e9117..497d0cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `--title` to set ZIM title - `--description` to set ZIM description - New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization` +- `--zim-lang` param to set warc2zim's `--lang` (ISO-639-3) ### Changed @@ -20,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Using `main` warc2zim ⚠️ change before releasing! - Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172) - `--failOnFailedSeed` used inconditionally +- `--lang` now passed to crawler (ISO-639-1) ### Removed diff --git a/zimit.py b/zimit.py index 7fbc6e3..3fd8032 100755 --- a/zimit.py +++ b/zimit.py @@ -205,6 +205,18 @@ def zimit(args=None): action="store_true", ) + parser.add_argument( + "--lang", + help="if set, sets the language used by the browser, should be ISO 639 language[-country] code", + ) + + parser.add_argument( + "--zim-lang", + help="Language metadata of ZIM " + "(warc2zim --lang param). ISO-639-3 code. " + "Retrieved from homepage if found, fallback to `eng`", + ) + parser.add_argument( "--mobileDevice", help="Emulate mobile device by name from " @@ -348,6 +360,10 @@ def zimit(args=None): warc2zim_args.append("--description") warc2zim_args.append(zimit_args.description) + if zimit_args.zim_lang: + warc2zim_args.append("--lang") + warc2zim_args.append(zimit_args.zim_lang) + print("----------") print("Testing warc2zim args") print("Running: warc2zim " + " ".join(warc2zim_args), flush=True) @@ -482,6 +498,7 @@ def get_node_cmd_line(args): "exclude", "collection", "allowHashUrls", + "lang", "mobileDevice", "userAgent", "useSitemap",