diff --git a/CHANGELOG.md b/CHANGELOG.md index b7a16f9..91113cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Provide default encoding aliases (#416) + +### Changed + +- Convert aliases given in `--encoding-aliases` to lower case (#412) + ## [2.2.2] - 2024-02-17 ### Changed diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py index 60fc1cb..015ff53 100644 --- a/src/warc2zim/main.py +++ b/src/warc2zim/main.py @@ -149,7 +149,7 @@ def _create_arguments_parser() -> ArgumentParser: " This parameter is single string, multiple values are separated by a comma, " " like in alias1=encoding1,alias2=encoding2.", type=lambda argument_value: { - alias_encoding.strip(): python_encoding.strip() + alias_encoding.lower().strip(): python_encoding.lower().strip() for alias_encoding, python_encoding in ( encoding.split("=") for encoding in argument_value.split(",") ) diff --git a/src/warc2zim/utils.py b/src/warc2zim/utils.py index 8ffd854..829862a 100644 --- a/src/warc2zim/utils.py +++ b/src/warc2zim/utils.py @@ -16,13 +16,33 @@ ENCODING_RE = re.compile( re.ASCII, ) +DEFAULT_ENCODING_ALIASES = { + "ansi": "windows-1252", + "65001": "utf-8", + "iso-utf-8": "utf-8", + "u": "utf-8", + "unicode": "utf-8", + "utf-8": "utf-8", + "utf-08": "utf-8", + "utf-f": "utf-8", + "utp-8": "utf-8", + "windows-8859-1": "iso-8859-1", + "iso88591": "iso-8859-1", +} + ENCODING_ALIASES = {} def set_encoding_aliases(aliases: dict[str, str]): """Set the encoding aliases to use to decode""" ENCODING_ALIASES.clear() - ENCODING_ALIASES.update(aliases) + ENCODING_ALIASES.update({**DEFAULT_ENCODING_ALIASES, **aliases}) + + +def get_encoding_by_alias(alias: str) -> str: + """Get the encoding method for alias.""" + key = alias.lower().strip() + return ENCODING_ALIASES.get(key, key) def get_version(): @@ -181,19 +201,18 @@ def to_string( if m := ENCODING_RE.search(content_start): head_encoding = m.group("encoding") return input_.decode( - ENCODING_ALIASES.get(head_encoding, head_encoding), errors="replace" + get_encoding_by_alias(head_encoding), + errors="replace", ) # Search for encofing in HTTP `Content-Type` header if not ignore_http_header_charsets and http_encoding: - return input_.decode( - ENCODING_ALIASES.get(http_encoding, http_encoding), errors="replace" - ) + return input_.decode(get_encoding_by_alias(http_encoding), errors="replace") # Try all charsets_to_try passed for charset_to_try in charsets_to_try: try: - return input_.decode(ENCODING_ALIASES.get(charset_to_try, charset_to_try)) + return input_.decode(get_encoding_by_alias(charset_to_try)) except (ValueError, LookupError): pass diff --git a/tests/test_utils.py b/tests/test_utils.py index efb259c..3d00e1b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,7 +5,7 @@ from pathlib import Path import pytest -from warc2zim.utils import set_encoding_aliases, to_string +from warc2zim.utils import get_encoding_by_alias, set_encoding_aliases, to_string @dataclass @@ -361,3 +361,42 @@ def test_decode_charset_too_far_away_without_proper_alias(): ignore_http_header_charsets=False, ignore_content_header_charsets=False, ) + + +@pytest.mark.parametrize( + "alias, expected", + [ + ("ansi", "windows-1252"), + ("65001", "utf-8"), + ("iso-utf-8", "utf-8"), + ("u", "utf-8"), + ("unicode", "utf-8"), + ("utf-8", "utf-8"), + ("utf-08", "utf-8"), + ("utf-f", "utf-8"), + ("utp-8", "utf-8"), + ("windows-8859-1", "iso-8859-1"), + ("iso88591", "iso-8859-1"), + (" uNiCoDe ", "utf-8"), + (" U ", "utf-8"), + ("UNICODE", "utf-8"), + ], +) +def test_default_encoding_aliases(alias, expected): + assert get_encoding_by_alias(alias) == expected + + +def test_get_unknown_encoding(): + assert get_encoding_by_alias("unKnown") == "unknown" + + +@pytest.mark.parametrize( + "alias, expected", + [ + ("Unicode", "latin1"), + ("unicode", "latin1"), + ], +) +def test_override_default_encoding_alias(alias, expected): + set_encoding_aliases({"unicode": "latin1"}) + assert get_encoding_by_alias(alias) == expected