mirror of
https://github.com/openzim/warc2zim.git
synced 2025-10-19 06:23:16 +00:00
Merge pull request #446 from elfkuzco/encoding-aliases
provide default encoding aliases
This commit is contained in:
commit
270f5dbaae
4 changed files with 74 additions and 8 deletions
|
@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
|
||||
- Provide default encoding aliases (#416)
|
||||
|
||||
### Changed
|
||||
|
||||
- Convert aliases given in `--encoding-aliases` to lower case (#412)
|
||||
|
||||
## [2.2.2] - 2024-02-17
|
||||
|
||||
### Changed
|
||||
|
|
|
@ -149,7 +149,7 @@ def _create_arguments_parser() -> ArgumentParser:
|
|||
" This parameter is single string, multiple values are separated by a comma, "
|
||||
" like in alias1=encoding1,alias2=encoding2.",
|
||||
type=lambda argument_value: {
|
||||
alias_encoding.strip(): python_encoding.strip()
|
||||
alias_encoding.lower().strip(): python_encoding.lower().strip()
|
||||
for alias_encoding, python_encoding in (
|
||||
encoding.split("=") for encoding in argument_value.split(",")
|
||||
)
|
||||
|
|
|
@ -16,13 +16,33 @@ ENCODING_RE = re.compile(
|
|||
re.ASCII,
|
||||
)
|
||||
|
||||
# Default alias -> Python encoding name table. Keys are written lower-case,
# matching the lower-cased/stripped key that get_encoding_by_alias() looks up.
DEFAULT_ENCODING_ALIASES = {
    "ansi": "windows-1252",
    "65001": "utf-8",
    "iso-utf-8": "utf-8",
    "u": "utf-8",
    "unicode": "utf-8",
    "utf-8": "utf-8",
    "utf-08": "utf-8",
    "utf-f": "utf-8",
    "utp-8": "utf-8",
    "windows-8859-1": "iso-8859-1",
    "iso88591": "iso-8859-1",
}

# Active alias table consulted when resolving an encoding. It is seeded with a
# copy of the defaults so they apply even when set_encoding_aliases() is never
# called; a copy (not the same object) keeps the in-place clear()/update() done
# on this table from altering DEFAULT_ENCODING_ALIASES.
ENCODING_ALIASES = dict(DEFAULT_ENCODING_ALIASES)
|
||||
|
||||
|
||||
def set_encoding_aliases(aliases: dict[str, str]):
    """Replace the active encoding aliases with the defaults plus `aliases`.

    Entries of `aliases` take precedence over DEFAULT_ENCODING_ALIASES when
    both define the same alias. ENCODING_ALIASES is updated in place (the
    module-level name is not rebound).
    """
    # Lookups lower-case and strip the alias before searching the table, so
    # store caller-supplied keys the same way; otherwise a key such as
    # "Unicode" could never be matched.
    normalized = {
        alias.lower().strip(): encoding for alias, encoding in aliases.items()
    }
    ENCODING_ALIASES.clear()
    # Single merge: defaults first, then caller overrides on top.
    ENCODING_ALIASES.update({**DEFAULT_ENCODING_ALIASES, **normalized})
|
||||
|
||||
|
||||
def get_encoding_by_alias(alias: str) -> str:
    """Resolve `alias` to the encoding name registered for it.

    The alias is lower-cased and stripped of surrounding whitespace before the
    lookup. When ENCODING_ALIASES has no entry for it, that normalized alias is
    returned unchanged.
    """
    normalized = alias.lower().strip()
    try:
        return ENCODING_ALIASES[normalized]
    except KeyError:
        return normalized
|
||||
|
||||
|
||||
def get_version():
|
||||
|
@ -181,19 +201,18 @@ def to_string(
|
|||
if m := ENCODING_RE.search(content_start):
|
||||
head_encoding = m.group("encoding")
|
||||
return input_.decode(
|
||||
ENCODING_ALIASES.get(head_encoding, head_encoding), errors="replace"
|
||||
get_encoding_by_alias(head_encoding),
|
||||
errors="replace",
|
||||
)
|
||||
|
||||
# Search for encoding in HTTP `Content-Type` header
|
||||
if not ignore_http_header_charsets and http_encoding:
|
||||
return input_.decode(
|
||||
ENCODING_ALIASES.get(http_encoding, http_encoding), errors="replace"
|
||||
)
|
||||
return input_.decode(get_encoding_by_alias(http_encoding), errors="replace")
|
||||
|
||||
# Try all charsets_to_try passed
|
||||
for charset_to_try in charsets_to_try:
|
||||
try:
|
||||
return input_.decode(ENCODING_ALIASES.get(charset_to_try, charset_to_try))
|
||||
return input_.decode(get_encoding_by_alias(charset_to_try))
|
||||
except (ValueError, LookupError):
|
||||
pass
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ from pathlib import Path
|
|||
|
||||
import pytest
|
||||
|
||||
from warc2zim.utils import set_encoding_aliases, to_string
|
||||
from warc2zim.utils import get_encoding_by_alias, set_encoding_aliases, to_string
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -361,3 +361,42 @@ def test_decode_charset_too_far_away_without_proper_alias():
|
|||
ignore_http_header_charsets=False,
|
||||
ignore_content_header_charsets=False,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "alias, expected",
    [
        ("ansi", "windows-1252"),
        ("65001", "utf-8"),
        ("iso-utf-8", "utf-8"),
        ("u", "utf-8"),
        ("unicode", "utf-8"),
        ("utf-8", "utf-8"),
        ("utf-08", "utf-8"),
        ("utf-f", "utf-8"),
        ("utp-8", "utf-8"),
        ("windows-8859-1", "iso-8859-1"),
        ("iso88591", "iso-8859-1"),
        (" uNiCoDe ", "utf-8"),
        (" U ", "utf-8"),
        ("UNICODE", "utf-8"),
    ],
)
def test_default_encoding_aliases(alias, expected):
    """Default aliases resolve regardless of case and surrounding whitespace."""
    # The alias table is module-level state that other tests modify; reset it
    # to the defaults only so this test does not depend on execution order.
    set_encoding_aliases({})
    assert get_encoding_by_alias(alias) == expected
|
||||
|
||||
|
||||
def test_get_unknown_encoding():
    """An alias without a registered mapping comes back lower-cased."""
    resolved = get_encoding_by_alias("unKnown")
    assert resolved == "unknown"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "alias, expected",
    [
        ("Unicode", "latin1"),
        ("unicode", "latin1"),
    ],
)
def test_override_default_encoding_alias(alias, expected):
    """A caller-supplied alias takes precedence over the default one."""
    set_encoding_aliases({"unicode": "latin1"})
    try:
        assert get_encoding_by_alias(alias) == expected
    finally:
        # Drop the override so tests running afterwards see only the default
        # aliases instead of the "unicode" -> "latin1" mapping set above.
        set_encoding_aliases({})
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue