From 67e74d1a23459f301d5f9dd117cb69b43b313666 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Mon, 17 Mar 2025 14:39:53 +0100 Subject: [PATCH 1/2] provide default encoding aliases --- CHANGELOG.md | 4 ++++ src/warc2zim/utils.py | 28 ++++++++++++++++++++++++---- tests/test_utils.py | 7 ++++++- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b7a16f9..1d0d600 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Provide default encoding aliases (#416) +- Add `get_encoding_by_alias` method to retrieve an encoding by alias + ## [2.2.2] - 2024-02-17 ### Changed diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py index 8ffd854..2a2e008 100644 --- a/src/warc2zim/utils.py +++ b/src/warc2zim/utils.py @@ -16,13 +16,32 @@ ENCODING_RE = re.compile( re.ASCII, ) +DEFAULT_ENCODING_ALIASES = { + "ansi": "windows-1252", + "65001": "utf-8", + "iso-utf-8": "utf-8", + "u": "utf-8", + "unicode": "utf-8", + "utf-8": "utf-8", + "utf-08": "utf-8", + "utf-f": "utf-8", + "utp-8": "utf-8", + "windows-8859-1": "iso-8859-1", + "iso88591": "iso-8859-1", +} + ENCODING_ALIASES = {} def set_encoding_aliases(aliases: dict[str, str]): """Set the encoding aliases to use to decode""" ENCODING_ALIASES.clear() - ENCODING_ALIASES.update(aliases) + ENCODING_ALIASES.update({**DEFAULT_ENCODING_ALIASES, **aliases}) + + +def get_encoding_by_alias(alias: str, default: str = "") -> str: + """Get the encoding method for alias.""" + return ENCODING_ALIASES.get(alias, default) def get_version(): @@ -181,19 +200,20 @@ def to_string( if m := ENCODING_RE.search(content_start): head_encoding = m.group("encoding") return input_.decode( - ENCODING_ALIASES.get(head_encoding, head_encoding), errors="replace" + get_encoding_by_alias(head_encoding, head_encoding), + errors="replace", ) # Search for encofing in HTTP `Content-Type` header if not ignore_http_header_charsets and http_encoding: return input_.decode( - ENCODING_ALIASES.get(http_encoding, http_encoding), errors="replace" + get_encoding_by_alias(http_encoding, http_encoding), errors="replace" ) # Try all charsets_to_try passed for charset_to_try in charsets_to_try: try: - return input_.decode(ENCODING_ALIASES.get(charset_to_try, charset_to_try)) + return input_.decode(get_encoding_by_alias(charset_to_try, charset_to_try)) except (ValueError, LookupError): pass diff --git a/tests/test_utils.py b/tests/test_utils.py index efb259c..6816454 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,7 +5,7 @@ from pathlib import Path import pytest -from warc2zim.utils import set_encoding_aliases, to_string +from warc2zim.utils import get_encoding_by_alias, set_encoding_aliases, to_string @dataclass @@ -361,3 +361,8 @@ def test_decode_charset_too_far_away_without_proper_alias(): ignore_http_header_charsets=False, ignore_content_header_charsets=False, ) + + +def test_override_default_encoding_alias(): + set_encoding_aliases({"unicode": "latin1"}) + assert get_encoding_by_alias("unicode") == "latin1" From 9bf93258e8ddee42d118a3f488959b7991176ca3 Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Mon, 17 Mar 2025 15:13:27 +0100 Subject: [PATCH 2/2] make encoding aliases case-insensitive --- CHANGELOG.md | 6 +++++- src/warc2zim/main.py | 2 +- src/warc2zim/utils.py | 13 ++++++------- tests/test_utils.py | 38 ++++++++++++++++++++++++++++++++++++-- 4 files changed, 48 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d0d600..91113cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added + - Provide default encoding aliases (#416) -- Add `get_encoding_by_alias` method to retrieve an encoding by alias + +### Changed + +- Convert aliases given in `--encoding-aliases` to lower case (#412) ## [2.2.2] - 2024-02-17 diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py index 60fc1cb..015ff53 100644 --- a/src/warc2zim/main.py +++ b/src/warc2zim/main.py @@ -149,7 +149,7 @@ def _create_arguments_parser() -> ArgumentParser: " This parameter is single string, multiple values are separated by a comma, " " like in alias1=encoding1,alias2=encoding2.", type=lambda argument_value: { - alias_encoding.strip(): python_encoding.strip() + alias_encoding.lower().strip(): python_encoding.lower().strip() for alias_encoding, python_encoding in ( encoding.split("=") for encoding in argument_value.split(",") ) diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py index 2a2e008..829862a 100644 --- a/src/warc2zim/utils.py +++ b/src/warc2zim/utils.py @@ -39,9 +39,10 @@ def set_encoding_aliases(aliases: dict[str, str]): ENCODING_ALIASES.update({**DEFAULT_ENCODING_ALIASES, **aliases}) -def get_encoding_by_alias(alias: str, default: str = "") -> str: +def get_encoding_by_alias(alias: str) -> str: """Get the encoding method for alias.""" - return ENCODING_ALIASES.get(alias, default) + key = alias.lower().strip() + return ENCODING_ALIASES.get(key, key) def get_version(): @@ -200,20 +201,18 @@ def to_string( if m := ENCODING_RE.search(content_start): head_encoding = m.group("encoding") return input_.decode( - get_encoding_by_alias(head_encoding, head_encoding), + get_encoding_by_alias(head_encoding), errors="replace", ) # Search for encofing in HTTP `Content-Type` header if not ignore_http_header_charsets and http_encoding: - return input_.decode( - get_encoding_by_alias(http_encoding, http_encoding), errors="replace" - ) + return input_.decode(get_encoding_by_alias(http_encoding), errors="replace") # Try all charsets_to_try passed for charset_to_try in charsets_to_try: try: - return input_.decode(get_encoding_by_alias(charset_to_try, charset_to_try)) + return input_.decode(get_encoding_by_alias(charset_to_try)) except (ValueError, LookupError): pass diff --git a/tests/test_utils.py b/tests/test_utils.py index 6816454..3d00e1b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -363,6 +363,40 @@ def test_decode_charset_too_far_away_without_proper_alias(): ) -def test_override_default_encoding_alias(): +@pytest.mark.parametrize( + "alias, expected", + [ + ("ansi", "windows-1252"), + ("65001", "utf-8"), + ("iso-utf-8", "utf-8"), + ("u", "utf-8"), + ("unicode", "utf-8"), + ("utf-8", "utf-8"), + ("utf-08", "utf-8"), + ("utf-f", "utf-8"), + ("utp-8", "utf-8"), + ("windows-8859-1", "iso-8859-1"), + ("iso88591", "iso-8859-1"), + (" uNiCoDe ", "utf-8"), + (" U ", "utf-8"), + ("UNICODE", "utf-8"), + ], +) +def test_default_encoding_aliases(alias, expected): + assert get_encoding_by_alias(alias) == expected + + +def test_get_unknown_encoding(): + assert get_encoding_by_alias("unKnown") == "unknown" + + +@pytest.mark.parametrize( + "alias, expected", + [ + ("Unicode", "latin1"), + ("unicode", "latin1"), + ], +) +def test_override_default_encoding_alias(alias, expected): set_encoding_aliases({"unicode": "latin1"}) - assert get_encoding_by_alias("unicode") == "latin1" + assert get_encoding_by_alias(alias) == expected