make encoding aliases case-insensitive

This commit is contained in:
Uchechukwu Orji 2025-03-17 15:13:27 +01:00
parent 67e74d1a23
commit 9bf93258e8
4 changed files with 48 additions and 11 deletions

View file

@ -8,8 +8,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
- Provide default encoding aliases (#416)
- Add `get_encoding_by_alias` function to retrieve an encoding by its alias
### Changed
- Make encoding alias handling case-insensitive: aliases and encodings given in `--encoding-aliases` are converted to lower case (#412)
## [2.2.2] - 2024-02-17

View file

@ -149,7 +149,7 @@ def _create_arguments_parser() -> ArgumentParser:
" This parameter is single string, multiple values are separated by a comma, "
" like in alias1=encoding1,alias2=encoding2.",
type=lambda argument_value: {
alias_encoding.strip(): python_encoding.strip()
alias_encoding.lower().strip(): python_encoding.lower().strip()
for alias_encoding, python_encoding in (
encoding.split("=") for encoding in argument_value.split(",")
)

View file

@ -39,9 +39,10 @@ def set_encoding_aliases(aliases: dict[str, str]):
ENCODING_ALIASES.update({**DEFAULT_ENCODING_ALIASES, **aliases})
def get_encoding_by_alias(alias: str, default: str = "") -> str:
def get_encoding_by_alias(alias: str) -> str:
"""Get the encoding method for alias."""
return ENCODING_ALIASES.get(alias, default)
key = alias.lower().strip()
return ENCODING_ALIASES.get(key, key)
def get_version():
@ -200,20 +201,18 @@ def to_string(
if m := ENCODING_RE.search(content_start):
head_encoding = m.group("encoding")
return input_.decode(
get_encoding_by_alias(head_encoding, head_encoding),
get_encoding_by_alias(head_encoding),
errors="replace",
)
# Search for encoding in HTTP `Content-Type` header
if not ignore_http_header_charsets and http_encoding:
return input_.decode(
get_encoding_by_alias(http_encoding, http_encoding), errors="replace"
)
return input_.decode(get_encoding_by_alias(http_encoding), errors="replace")
# Try all charsets_to_try passed
for charset_to_try in charsets_to_try:
try:
return input_.decode(get_encoding_by_alias(charset_to_try, charset_to_try))
return input_.decode(get_encoding_by_alias(charset_to_try))
except (ValueError, LookupError):
pass

View file

@ -363,6 +363,40 @@ def test_decode_charset_too_far_away_without_proper_alias():
)
def test_override_default_encoding_alias():
@pytest.mark.parametrize(
"alias, expected",
[
("ansi", "windows-1252"),
("65001", "utf-8"),
("iso-utf-8", "utf-8"),
("u", "utf-8"),
("unicode", "utf-8"),
("utf-8", "utf-8"),
("utf-08", "utf-8"),
("utf-f", "utf-8"),
("utp-8", "utf-8"),
("windows-8859-1", "iso-8859-1"),
("iso88591", "iso-8859-1"),
(" uNiCoDe ", "utf-8"),
(" U ", "utf-8"),
("UNICODE", "utf-8"),
],
)
def test_default_encoding_aliases(alias, expected):
assert get_encoding_by_alias(alias) == expected
def test_get_unknown_encoding():
assert get_encoding_by_alias("unKnown") == "unknown"
@pytest.mark.parametrize(
"alias, expected",
[
("Unicode", "latin1"),
("unicode", "latin1"),
],
)
def test_override_default_encoding_alias(alias, expected):
set_encoding_aliases({"unicode": "latin1"})
assert get_encoding_by_alias("unicode") == "latin1"
assert get_encoding_by_alias(alias) == expected