From 9bf93258e8ddee42d118a3f488959b7991176ca3 Mon Sep 17 00:00:00 2001
From: Uchechukwu Orji
Date: Mon, 17 Mar 2025 15:13:27 +0100
Subject: [PATCH] make encoding aliases case-insensitive

---
 CHANGELOG.md          |  6 +++++-
 src/warc2zim/main.py  |  2 +-
 src/warc2zim/utils.py | 13 ++++++-------
 tests/test_utils.py   | 38 ++++++++++++++++++++++++++++++++++++--
 4 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1d0d600..91113cd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,8 +8,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ### Added
+
 - Provide default encoding aliases (#416)
-- Add `get_encoding_by_alias` method to retrieve an encoding by alias
+
+### Changed
+
+- Convert aliases given in `--encoding-aliases` to lower case (#412)
 
 ## [2.2.2] - 2024-02-17
 
diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py
index 60fc1cb..015ff53 100644
--- a/src/warc2zim/main.py
+++ b/src/warc2zim/main.py
@@ -149,7 +149,7 @@ def _create_arguments_parser() -> ArgumentParser:
         " This parameter is single string, multiple values are separated by a comma, "
         " like in alias1=encoding1,alias2=encoding2.",
         type=lambda argument_value: {
-            alias_encoding.strip(): python_encoding.strip()
+            alias_encoding.lower().strip(): python_encoding.lower().strip()
             for alias_encoding, python_encoding in (
                 encoding.split("=") for encoding in argument_value.split(",")
             )
diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py
index 2a2e008..829862a 100644
--- a/src/warc2zim/utils.py
+++ b/src/warc2zim/utils.py
@@ -39,9 +39,10 @@ def set_encoding_aliases(aliases: dict[str, str]):
     ENCODING_ALIASES.update({**DEFAULT_ENCODING_ALIASES, **aliases})
 
 
-def get_encoding_by_alias(alias: str, default: str = "") -> str:
+def get_encoding_by_alias(alias: str) -> str:
     """Get the encoding method for alias."""
-    return ENCODING_ALIASES.get(alias, default)
+    key = alias.lower().strip()
+    return ENCODING_ALIASES.get(key, key)
 
 
 def get_version():
@@ -200,20 +201,18 @@ def to_string(
         if m := ENCODING_RE.search(content_start):
             head_encoding = m.group("encoding")
             return input_.decode(
-                get_encoding_by_alias(head_encoding, head_encoding),
+                get_encoding_by_alias(head_encoding),
                 errors="replace",
             )
 
     # Search for encofing in HTTP `Content-Type` header
     if not ignore_http_header_charsets and http_encoding:
-        return input_.decode(
-            get_encoding_by_alias(http_encoding, http_encoding), errors="replace"
-        )
+        return input_.decode(get_encoding_by_alias(http_encoding), errors="replace")
 
     # Try all charsets_to_try passed
     for charset_to_try in charsets_to_try:
         try:
-            return input_.decode(get_encoding_by_alias(charset_to_try, charset_to_try))
+            return input_.decode(get_encoding_by_alias(charset_to_try))
         except (ValueError, LookupError):
             pass
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 6816454..3d00e1b 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -363,6 +363,40 @@ def test_decode_charset_too_far_away_without_proper_alias():
     )
 
 
-def test_override_default_encoding_alias():
+@pytest.mark.parametrize(
+    "alias, expected",
+    [
+        ("ansi", "windows-1252"),
+        ("65001", "utf-8"),
+        ("iso-utf-8", "utf-8"),
+        ("u", "utf-8"),
+        ("unicode", "utf-8"),
+        ("utf-8", "utf-8"),
+        ("utf-08", "utf-8"),
+        ("utf-f", "utf-8"),
+        ("utp-8", "utf-8"),
+        ("windows-8859-1", "iso-8859-1"),
+        ("iso88591", "iso-8859-1"),
+        (" uNiCoDe ", "utf-8"),
+        (" U ", "utf-8"),
+        ("UNICODE", "utf-8"),
+    ],
+)
+def test_default_encoding_aliases(alias, expected):
+    assert get_encoding_by_alias(alias) == expected
+
+
+def test_get_unknown_encoding():
+    assert get_encoding_by_alias("unKnown") == "unknown"
+
+
+@pytest.mark.parametrize(
+    "alias, expected",
+    [
+        ("Unicode", "latin1"),
+        ("unicode", "latin1"),
+    ],
+)
+def test_override_default_encoding_alias(alias, expected):
     set_encoding_aliases({"unicode": "latin1"})
-    assert get_encoding_by_alias("unicode") == "latin1"
+    assert get_encoding_by_alias(alias) == expected