2024-06-14 13:21:18 +00:00
|
|
|
|
import json
|
|
|
|
|
from collections.abc import Generator
|
2024-02-14 10:33:19 +01:00
|
|
|
|
from dataclasses import dataclass
|
2024-06-14 13:21:18 +00:00
|
|
|
|
from pathlib import Path
|
2024-02-14 10:33:19 +01:00
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
2025-03-17 14:39:53 +01:00
|
|
|
|
from warc2zim.utils import get_encoding_by_alias, set_encoding_aliases, to_string
|
2024-02-14 10:33:19 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class EncodedForTest:
    """A test payload: some text plus its encoded form in a given charset.

    `valid` is False when `content` cannot be represented in `encoding`
    (e.g. chinese text encoded as ascii); `encoded` is then left empty.
    """

    content: str
    encoding: str
    encoded: bytes
    valid: bool

    def __init__(self, content: str, encoding: str):
        self.content = content
        self.encoding = encoding
        # Fix: ensure the attribute always exists — the dataclass field
        # annotation alone does not create it, so reading `.encoded` on an
        # invalid combination used to raise AttributeError.
        self.encoded = b""
        try:
            self.encoded = content.encode(encoding)
            self.valid = True
        except ValueError:
            # UnicodeEncodeError (a ValueError): content cannot be
            # expressed in this encoding.
            self.valid = False


@pytest.fixture(
    params=[
        "Simple ascii content",
        "A content with non ascii chars éœo€ð",
        "Latin1 contént",
        "Latin2 conteňt",
        "这是中文文本",  # "This is a chinese text" (in chinese)
    ]
)
def content(request):
    """Sample texts covering plain ascii, latin1/latin2 accents and CJK."""
    yield request.param


@pytest.fixture(
    params=[
        "ascii",
        "utf-8",
        "utf-16",
        "utf-32",
        "latin1",
        "latin2",
        "gb2312",
        "gbk",
    ]
)
def encoding(request):
    """Charsets to exercise, from ascii up to chinese encodings."""
    yield request.param


@pytest.fixture
def simple_encoded_content(content, encoding):
    """Every (content, encoding) combination, encoded for the test."""
    return EncodedForTest(content, encoding)


def test_decode_http_header(simple_encoded_content):
    """Decoding succeeds when the HTTP header declares the real charset."""
    case = simple_encoded_content
    if not case.valid:
        # combination cannot be encoded, nothing to check
        return
    result = to_string(
        case.encoded,
        case.encoding,
        [],
        1024,
        ignore_http_header_charsets=False,
        ignore_content_header_charsets=False,
    )
    assert result == case.content


def test_decode_bad_http_header(simple_encoded_content):
    """A lying HTTP charset is recovered through the try-charsets list."""
    case = simple_encoded_content
    if not case.valid:
        # combination cannot be encoded, nothing to check
        return
    result = to_string(
        case.encoded,
        # HTTP header always pretend it has been encoded with latin1
        "latin1",
        # but we luckily have the proper "try-charset"
        [case.encoding],
        1024,
        # and we've disabled the use of HTTP header
        ignore_http_header_charsets=True,
        ignore_content_header_charsets=False,
    )
    assert result == case.content


@dataclass
class DeclaredHtmlEncodedForTest(EncodedForTest):
    """HTML page whose <meta charset> honestly matches its real encoding."""

    def __init__(self, content: str, encoding: str):
        html_content = f'<html><meta charset="{encoding}"><body>{content}</body></html>'
        super().__init__(html_content, encoding)


@pytest.fixture
def declared_html_encoded_content(content, encoding):
    """HTML documents whose declared charset is the one really used."""
    return DeclaredHtmlEncodedForTest(content, encoding)


def test_decode_html_header(declared_html_encoded_content):
    """The charset declared in the HTML <meta> tag drives decoding."""
    case = declared_html_encoded_content
    if not case.valid:
        return
    decoded = to_string(
        case.encoded,
        None,
        [],
        1024,
        ignore_http_header_charsets=False,
        ignore_content_header_charsets=False,
    )
    assert decoded == case.content


@dataclass
class BadlyDeclaredHtmlEncodedForTest(EncodedForTest):
    """HTML page whose <meta charset> lies about the real encoding."""

    def __init__(self, content: str, encoding: str):
        # pretend to be encoded with `encoding`
        html_content = f"<html><meta charset={encoding}><body>{content}</body></html>"
        # but in fact you are encoded with ISO-8859-1
        super().__init__(html_content, "ISO-8859-1")


@pytest.fixture
def badly_declared_html_encoded_content(content, encoding):
    """HTML documents whose declared charset is NOT the one really used."""
    return BadlyDeclaredHtmlEncodedForTest(content, encoding)


def test_decode_bad_html_header(badly_declared_html_encoded_content):
    """A lying <meta> charset is recovered via try-charsets when ignored."""
    case = badly_declared_html_encoded_content
    if not case.valid:
        return
    decoded = to_string(
        case.encoded,
        None,
        # Indicate proper charset to use in try-charsets
        ["ISO-8859-1"],
        1024,
        ignore_http_header_charsets=False,
        # Disable charset defined in content first bytes
        ignore_content_header_charsets=True,
    )
    assert decoded == case.content


def test_decode_str(content, encoding):
    """A value that is already a str comes back unchanged."""
    assert (
        to_string(
            content,
            encoding,
            [],
            1024,
            ignore_http_header_charsets=False,
            ignore_content_header_charsets=False,
        )
        == content
    )


def test_binary_content():
    """Content mislabeled as utf-8 still yields a string (lossy decode)."""
    payload = "Hello, 你好".encode("utf-32")
    # Prepend the utf-8 BOM ([0xEF, 0xBB, 0xBF]): it will trick chardet
    # into being really confident this is utf-8.  The bytes cannot be
    # properly decoded as utf-8, but a value is still returned since the
    # upstream server promised this is utf-8.
    payload = bytes([0xEF, 0xBB, 0xBF]) + payload
    assert to_string(
        payload,
        "UTF-8",
        [],
        1024,
        ignore_http_header_charsets=False,
        ignore_content_header_charsets=False,
    )


def test_single_bad_character():
    """A lone undecodable byte is replaced instead of failing the decode."""
    # [0xEF, 0xBB, 0xBF] is a BOM marker for utf-8-sig;
    # 0xC3 is a bad character (nothing in utf-8-sig at this position)
    raw = bytes([0xEF, 0xBB, 0xBF]) + b"prem" + bytes([0xC3]) + "ière".encode()
    decoded = to_string(
        raw,
        "utf-8-sig",
        [],
        1024,
        ignore_http_header_charsets=False,
        ignore_content_header_charsets=False,
    )
    # NOTE(review): "<EFBFBD>" below looks like a mangled U+FFFD replacement
    # character from a copy/paste — confirm against the original file.
    assert decoded == "prem<EFBFBD>ière"


def test_decode_charset_to_try(simple_encoded_content):
    """With no declared charset, the try-charsets list drives decoding."""
    case = simple_encoded_content
    if not case.valid:
        # combination cannot be encoded, nothing to check
        return
    result = to_string(
        case.encoded,
        None,
        [case.encoding],
        1024,
        ignore_http_header_charsets=False,
        ignore_content_header_charsets=False,
    )
    assert result == case.content


def test_decode_weird_encoding_not_declared_not_in_try_list():
    """Decoding fails when the real charset is neither declared nor tried."""
    with pytest.raises(ValueError):
        to_string(
            "Latin1 contént".encode("latin1"),
            None,
            ["UTF-8"],
            1024,
            ignore_http_header_charsets=False,
            ignore_content_header_charsets=False,
        )


def test_decode_weird_encoding_not_declared_in_try_list():
    """Decoding succeeds when the real charset appears in the try list."""
    text = "Latin1 contént"
    decoded = to_string(
        text.encode("latin1"),
        None,
        ["UTF-8", "latin1"],
        1024,
        ignore_http_header_charsets=False,
        ignore_content_header_charsets=False,
    )
    assert decoded == text


@dataclass
|
|
|
|
|
class CharsetsTestData:
|
|
|
|
|
filename: str
|
|
|
|
|
probable_charset: str | None # probable charset to use
|
|
|
|
|
known_charset: str | None # charset we know is being used (fake file typically)
|
|
|
|
|
http_charset: (
|
|
|
|
|
str | None
|
|
|
|
|
) # encoding to pass as http header because file is missing details and encoding is
|
|
|
|
|
# not standard
|
|
|
|
|
expected_strings: list[str]
|
|
|
|
|
|
|
|
|
|
|
2025-02-03 14:50:58 +00:00
|
|
|
|
def get_testdata() -> Generator[CharsetsTestData]:
    """Yield one CharsetsTestData per file listed in definition.json."""
    definition = json.loads(
        (Path(__file__).parent / "encodings" / "definition.json").read_bytes()
    )
    for entry in definition["files"]:
        yield CharsetsTestData(
            filename=entry["filename"],
            probable_charset=entry.get("probable_charset", None),
            known_charset=entry.get("known_charset", None),
            http_charset=entry.get("http_charset", None),
            expected_strings=entry.get("expected_strings", []),
        )


def get_testdata_id(test_data: CharsetsTestData) -> str:
    """Use the sample filename as the pytest parametrize id."""
    return test_data.filename


@pytest.mark.parametrize("testdata", get_testdata(), ids=get_testdata_id)
def test_decode_files(testdata: CharsetsTestData):
    """Real-world sample files decode to the strings we expect to find."""
    decoded = to_string(
        (Path(__file__).parent / "encodings" / testdata.filename).read_bytes(),
        testdata.http_charset,
        ["UTF-8", "latin1"],
        1024,
        ignore_http_header_charsets=False,
        ignore_content_header_charsets=False,
    )
    for expected in testdata.expected_strings:
        assert expected in decoded


def test_decode_charset_too_far_away_without_fallback():
    """Charset declared past the scanned prefix, no fallback: decode fails."""
    content = '<html><meta charset="latin1"><body>content</body></html>'
    # 24 presumably limits how many leading bytes are scanned for the
    # declared charset — TODO confirm against to_string's signature.
    with pytest.raises(ValueError, match="No suitable charset"):
        to_string(
            content.encode("latin1"),
            None,
            [],
            24,
            ignore_http_header_charsets=False,
            ignore_content_header_charsets=False,
        )


def test_decode_charset_too_far_away_with_fallback():
    """Charset declared past the scanned prefix, but try-charsets saves us."""
    content = '<html><meta charset="latin1"><body>content</body></html>'
    decoded = to_string(
        content.encode("latin1"),
        None,
        ["latin1"],
        24,
        ignore_http_header_charsets=False,
        ignore_content_header_charsets=False,
    )
    assert decoded == content


def test_decode_charset_far_away():
    """A <meta> charset deep in the document is still found within limit."""
    padding = "-" * 1024
    content = f'<html>{padding}<meta charset="latin1"><body>content</body></html>'
    decoded = to_string(
        content.encode("latin1"),
        None,
        [],
        1200,
        ignore_http_header_charsets=False,
        ignore_content_header_charsets=False,
    )
    assert decoded == content


def test_decode_charset_too_far_away_with_alias():
    """An unknown declared charset is resolved through a configured alias."""
    content = '<html><meta charset="foo"><body>content</body></html>'
    # NOTE(review): set_encoding_aliases mutates module-level state and is
    # not reset afterwards — presumably harmless here; confirm.
    set_encoding_aliases({"foo": "latin1"})
    result = to_string(
        content.encode("latin1"),
        None,
        [],
        1024,
        ignore_http_header_charsets=False,
        ignore_content_header_charsets=False,
    )
    # Fix: the original test never checked the decoded value, so a wrong
    # (but non-raising) decode would have passed unnoticed.
    assert result == content


def test_decode_charset_too_far_away_without_proper_alias():
    """A declared charset with no matching alias raises LookupError."""
    content = '<html><meta charset="foo"><body>content</body></html>'
    set_encoding_aliases({"bar": "latin1"})
    with pytest.raises(LookupError, match="unknown encoding: foo"):
        to_string(
            content.encode("latin1"),
            None,
            [],
            1024,
            ignore_http_header_charsets=False,
            ignore_content_header_charsets=False,
        )


@pytest.mark.parametrize(
    "alias, expected",
    [
        ("ansi", "windows-1252"),
        ("65001", "utf-8"),
        ("iso-utf-8", "utf-8"),
        ("u", "utf-8"),
        ("unicode", "utf-8"),
        ("utf-8", "utf-8"),
        ("utf-08", "utf-8"),
        ("utf-f", "utf-8"),
        ("utp-8", "utf-8"),
        ("windows-8859-1", "iso-8859-1"),
        ("iso88591", "iso-8859-1"),
        (" uNiCoDe ", "utf-8"),
        (" U ", "utf-8"),
        ("UNICODE", "utf-8"),
    ],
)
def test_default_encoding_aliases(alias, expected):
    """Built-in alias table maps common mislabels (any case, padded or not)
    to real encodings."""
    assert get_encoding_by_alias(alias) == expected


def test_get_unknown_encoding():
    """An alias without a mapping comes back normalized (lowercased)."""
    assert get_encoding_by_alias("unKnown") == "unknown"


@pytest.mark.parametrize(
    "alias, expected",
    [
        ("Unicode", "latin1"),
        ("unicode", "latin1"),
    ],
)
def test_override_default_encoding_alias(alias, expected):
    """A user-supplied alias replaces the built-in "unicode" -> utf-8 one."""
    set_encoding_aliases({"unicode": "latin1"})
    assert get_encoding_by_alias(alias) == expected