Merge pull request #446 from elfkuzco/encoding-aliases

provide default encoding aliases
2025-12-08 06:09:54 +00:00 · 2025-03-20 16:28:32 +01:00 · 2025-03-20 16:28:32 +01:00 · 270f5dbaae
commit 270f5dbaae
parent b66d6a2692 9bf93258e8
4 changed files with 74 additions and 8 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+### Added
+
+- Provide default encoding aliases (#416)
+
+### Changed
+
+- Convert aliases given in `--encoding-aliases` to lower case (#412)
+
 ## [2.2.2] - 2024-02-17

 ### Changed
--- a/src/warc2zim/main.py
+++ b/src/warc2zim/main.py
@@ -149,7 +149,7 @@ def _create_arguments_parser() -> ArgumentParser:
        " This parameter is single string, multiple values are separated by a comma, "
        " like in alias1=encoding1,alias2=encoding2.",
        type=lambda argument_value: {
-            alias_encoding.strip(): python_encoding.strip()
+            alias_encoding.lower().strip(): python_encoding.lower().strip()
            for alias_encoding, python_encoding in (
                encoding.split("=") for encoding in argument_value.split(",")
            )
--- a/src/warc2zim/utils.py
+++ b/src/warc2zim/utils.py
@@ -16,13 +16,33 @@ ENCODING_RE = re.compile(
    re.ASCII,
 )

+DEFAULT_ENCODING_ALIASES = {
+    "ansi": "windows-1252",
+    "65001": "utf-8",
+    "iso-utf-8": "utf-8",
+    "u": "utf-8",
+    "unicode": "utf-8",
+    "utf-8": "utf-8",
+    "utf-08": "utf-8",
+    "utf-f": "utf-8",
+    "utp-8": "utf-8",
+    "windows-8859-1": "iso-8859-1",
+    "iso88591": "iso-8859-1",
+}
+
 ENCODING_ALIASES = {}


 def set_encoding_aliases(aliases: dict[str, str]):
    """Set the encoding aliases to use to decode"""
    ENCODING_ALIASES.clear()
-    ENCODING_ALIASES.update(aliases)
+    ENCODING_ALIASES.update({**DEFAULT_ENCODING_ALIASES, **aliases})
+
+
+def get_encoding_by_alias(alias: str) -> str:
+    """Get the encoding method for alias."""
+    key = alias.lower().strip()
+    return ENCODING_ALIASES.get(key, key)


 def get_version():
@@ -181,19 +201,18 @@ def to_string(
            if m := ENCODING_RE.search(content_start):
                head_encoding = m.group("encoding")
                return input_.decode(
-                    ENCODING_ALIASES.get(head_encoding, head_encoding), errors="replace"
+                    get_encoding_by_alias(head_encoding),
+                    errors="replace",
                )

    # Search for encoding in HTTP `Content-Type` header
    if not ignore_http_header_charsets and http_encoding:
-        return input_.decode(
-            ENCODING_ALIASES.get(http_encoding, http_encoding), errors="replace"
-        )
+        return input_.decode(get_encoding_by_alias(http_encoding), errors="replace")

    # Try all charsets_to_try passed
    for charset_to_try in charsets_to_try:
        try:
-            return input_.decode(ENCODING_ALIASES.get(charset_to_try, charset_to_try))
+            return input_.decode(get_encoding_by_alias(charset_to_try))
        except (ValueError, LookupError):
            pass

--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@ -5,7 +5,7 @@ from pathlib import Path

 import pytest

-from warc2zim.utils import set_encoding_aliases, to_string
+from warc2zim.utils import get_encoding_by_alias, set_encoding_aliases, to_string


@dataclass
@@ -361,3 +361,42 @@ def test_decode_charset_too_far_away_without_proper_alias():
            ignore_http_header_charsets=False,
            ignore_content_header_charsets=False,
        )
+
+
+@pytest.mark.parametrize(
+    "alias, expected",
+    [
+        ("ansi", "windows-1252"),
+        ("65001", "utf-8"),
+        ("iso-utf-8", "utf-8"),
+        ("u", "utf-8"),
+        ("unicode", "utf-8"),
+        ("utf-8", "utf-8"),
+        ("utf-08", "utf-8"),
+        ("utf-f", "utf-8"),
+        ("utp-8", "utf-8"),
+        ("windows-8859-1", "iso-8859-1"),
+        ("iso88591", "iso-8859-1"),
+        ("   uNiCoDe    ", "utf-8"),
+        ("   U    ", "utf-8"),
+        ("UNICODE", "utf-8"),
+    ],
+)
+def test_default_encoding_aliases(alias, expected):
+    assert get_encoding_by_alias(alias) == expected
+
+
+def test_get_unknown_encoding():
+    assert get_encoding_by_alias("unKnown") == "unknown"
+
+
+@pytest.mark.parametrize(
+    "alias, expected",
+    [
+        ("Unicode", "latin1"),
+        ("unicode", "latin1"),
+    ],
+)
+def test_override_default_encoding_alias(alias, expected):
+    set_encoding_aliases({"unicode": "latin1"})
+    assert get_encoding_by_alias(alias) == expected