From 67e74d1a23459f301d5f9dd117cb69b43b313666 Mon Sep 17 00:00:00 2001
From: Uchechukwu Orji <orjiuchechukwu52@yahoo.com>
Date: Mon, 17 Mar 2025 14:39:53 +0100
Subject: [PATCH 1/2] provide default encoding aliases

---
 CHANGELOG.md          |  4 ++++
 src/warc2zim/utils.py | 28 ++++++++++++++++++++++++----
 tests/test_utils.py   |  7 ++++++-
 3 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b7a16f9..1d0d600 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+- Provide default encoding aliases (#416)
+- Add `get_encoding_by_alias` method to retrieve an encoding by alias
+
 ## [2.2.2] - 2024-02-17
 
 ### Changed
diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py
index 8ffd854..2a2e008 100644
--- a/src/warc2zim/utils.py
+++ b/src/warc2zim/utils.py
@@ -16,13 +16,32 @@ ENCODING_RE = re.compile(
     re.ASCII,
 )
 
+DEFAULT_ENCODING_ALIASES = {
+    "ansi": "windows-1252",
+    "65001": "utf-8",
+    "iso-utf-8": "utf-8",
+    "u": "utf-8",
+    "unicode": "utf-8",
+    "utf-8": "utf-8",
+    "utf-08": "utf-8",
+    "utf-f": "utf-8",
+    "utp-8": "utf-8",
+    "windows-8859-1": "iso-8859-1",
+    "iso88591": "iso-8859-1",
+}
+
 ENCODING_ALIASES = {}
 
 
 def set_encoding_aliases(aliases: dict[str, str]):
     """Set the encoding aliases to use to decode"""
     ENCODING_ALIASES.clear()
-    ENCODING_ALIASES.update(aliases)
+    ENCODING_ALIASES.update({**DEFAULT_ENCODING_ALIASES, **aliases})
+
+
+def get_encoding_by_alias(alias: str, default: str = "") -> str:
+    """Get the encoding method for alias."""
+    return ENCODING_ALIASES.get(alias, default)
 
 
 def get_version():
@@ -181,19 +200,20 @@ def to_string(
             if m := ENCODING_RE.search(content_start):
                 head_encoding = m.group("encoding")
                 return input_.decode(
-                    ENCODING_ALIASES.get(head_encoding, head_encoding), errors="replace"
+                    get_encoding_by_alias(head_encoding, head_encoding),
+                    errors="replace",
                 )
 
     # Search for encofing in HTTP `Content-Type` header
     if not ignore_http_header_charsets and http_encoding:
         return input_.decode(
-            ENCODING_ALIASES.get(http_encoding, http_encoding), errors="replace"
+            get_encoding_by_alias(http_encoding, http_encoding), errors="replace"
         )
 
     # Try all charsets_to_try passed
     for charset_to_try in charsets_to_try:
         try:
-            return input_.decode(ENCODING_ALIASES.get(charset_to_try, charset_to_try))
+            return input_.decode(get_encoding_by_alias(charset_to_try, charset_to_try))
         except (ValueError, LookupError):
             pass
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index efb259c..6816454 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -5,7 +5,7 @@ from pathlib import Path
 
 import pytest
 
-from warc2zim.utils import set_encoding_aliases, to_string
+from warc2zim.utils import get_encoding_by_alias, set_encoding_aliases, to_string
 
 
 @dataclass
@@ -361,3 +361,8 @@ def test_decode_charset_too_far_away_without_proper_alias():
             ignore_http_header_charsets=False,
             ignore_content_header_charsets=False,
         )
+
+
+def test_override_default_encoding_alias():
+    set_encoding_aliases({"unicode": "latin1"})
+    assert get_encoding_by_alias("unicode") == "latin1"

From 9bf93258e8ddee42d118a3f488959b7991176ca3 Mon Sep 17 00:00:00 2001
From: Uchechukwu Orji <orjiuchechukwu52@yahoo.com>
Date: Mon, 17 Mar 2025 15:13:27 +0100
Subject: [PATCH 2/2] make encoding aliases case-insensitive

---
 CHANGELOG.md          |  6 +++++-
 src/warc2zim/main.py  |  2 +-
 src/warc2zim/utils.py | 13 ++++++-------
 tests/test_utils.py   | 38 ++++++++++++++++++++++++++++++++++++--
 4 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1d0d600..91113cd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,8 +8,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ### Added
+
 - Provide default encoding aliases (#416)
-- Add `get_encoding_by_alias` method to retrieve an encoding by alias
+
+### Changed
+
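+- Make encoding aliases case-insensitive: aliases given in `--encoding-aliases` and charsets being looked up are lower-cased and stripped of surrounding whitespace (#412)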
 
 ## [2.2.2] - 2024-02-17
 
diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py
index 60fc1cb..015ff53 100644
--- a/src/warc2zim/main.py
+++ b/src/warc2zim/main.py
@@ -149,7 +149,7 @@ def _create_arguments_parser() -> ArgumentParser:
         " This parameter is single string, multiple values are separated by a comma, "
         " like in alias1=encoding1,alias2=encoding2.",
         type=lambda argument_value: {
-            alias_encoding.strip(): python_encoding.strip()
+            alias_encoding.lower().strip(): python_encoding.lower().strip()
             for alias_encoding, python_encoding in (
                 encoding.split("=") for encoding in argument_value.split(",")
             )
diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py
index 2a2e008..829862a 100644
--- a/src/warc2zim/utils.py
+++ b/src/warc2zim/utils.py
@@ -39,9 +39,10 @@ def set_encoding_aliases(aliases: dict[str, str]):
     ENCODING_ALIASES.update({**DEFAULT_ENCODING_ALIASES, **aliases})
 
 
-def get_encoding_by_alias(alias: str, default: str = "") -> str:
+def get_encoding_by_alias(alias: str) -> str:
     """Get the encoding method for alias."""
-    return ENCODING_ALIASES.get(alias, default)
+    key = alias.lower().strip()  # aliases are matched case-insensitively
+    return ENCODING_ALIASES.get(key, DEFAULT_ENCODING_ALIASES.get(key, key))
 
 
 def get_version():
@@ -200,20 +201,18 @@ def to_string(
             if m := ENCODING_RE.search(content_start):
                 head_encoding = m.group("encoding")
                 return input_.decode(
-                    get_encoding_by_alias(head_encoding, head_encoding),
+                    get_encoding_by_alias(head_encoding),
                     errors="replace",
                 )
 
     # Search for encofing in HTTP `Content-Type` header
     if not ignore_http_header_charsets and http_encoding:
-        return input_.decode(
-            get_encoding_by_alias(http_encoding, http_encoding), errors="replace"
-        )
+        return input_.decode(get_encoding_by_alias(http_encoding), errors="replace")
 
     # Try all charsets_to_try passed
     for charset_to_try in charsets_to_try:
         try:
-            return input_.decode(get_encoding_by_alias(charset_to_try, charset_to_try))
+            return input_.decode(get_encoding_by_alias(charset_to_try))
         except (ValueError, LookupError):
             pass
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 6816454..3d00e1b 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -363,6 +363,40 @@ def test_decode_charset_too_far_away_without_proper_alias():
         )
 
 
-def test_override_default_encoding_alias():
+@pytest.mark.parametrize(
+    "alias, expected",
+    [
+        ("ansi", "windows-1252"),
+        ("65001", "utf-8"),
+        ("iso-utf-8", "utf-8"),
+        ("u", "utf-8"),
+        ("unicode", "utf-8"),
+        ("utf-8", "utf-8"),
+        ("utf-08", "utf-8"),
+        ("utf-f", "utf-8"),
+        ("utp-8", "utf-8"),
+        ("windows-8859-1", "iso-8859-1"),
+        ("iso88591", "iso-8859-1"),
+        ("   uNiCoDe    ", "utf-8"),
+        ("   U    ", "utf-8"),
+        ("UNICODE", "utf-8"),
+    ],
+)
+def test_default_encoding_aliases(alias, expected):
+    assert get_encoding_by_alias(alias) == expected
+
+
+def test_get_unknown_encoding():
+    assert get_encoding_by_alias("unKnown") == "unknown"
+
+
+@pytest.mark.parametrize(
+    "alias, expected",
+    [
+        ("Unicode", "latin1"),
+        ("unicode", "latin1"),
+    ],
+)
+def test_override_default_encoding_alias(alias, expected):
     set_encoding_aliases({"unicode": "latin1"})
-    assert get_encoding_by_alias("unicode") == "latin1"
+    assert get_encoding_by_alias(alias) == expected