mirror of
https://github.com/openzim/warc2zim.git
synced 2025-10-19 06:23:16 +00:00

- detect base href in HTML pages head - use it to properly rewrite URLs found in the HTML page - rewrite the base to remove the href (for simplicity) but keep the target
319 lines
9.8 KiB
Python
319 lines
9.8 KiB
Python
import pytest
|
|
|
|
from warc2zim.content_rewriting.js import JsRewriter
|
|
from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, ZimPath
|
|
|
|
from .utils import ContentForTests
|
|
|
|
|
|
@pytest.fixture
def simple_js_rewriter(simple_url_rewriter, no_js_notify) -> JsRewriter:
    """Build the JsRewriter shared by the non-module tests.

    The URL rewriter is obtained by calling the `simple_url_rewriter` fixture
    with `http://www.example.com`; no base href is set and `no_js_notify` is
    used as the JS module notification callback.
    """
    url_rewriter = simple_url_rewriter("http://www.example.com")
    return JsRewriter(
        url_rewriter=url_rewriter,
        base_href=None,
        notify_js_module=no_js_notify,
    )
@pytest.fixture(
    params=[
        "a = this;",
        "return this.location",
        'func(Function("return this"));',
        "'a||this||that",
        "(a,b,Q.contains(i[t], this))",
        "a = this.location.href; exports.Foo = Foo; /* export className */",
    ]
)
def rewrite_this_js_content(request):
    """Yield one JS snippet paired with its expected rewrite.

    The expected text is the snippet itself with every occurrence of `this`
    replaced by the wombat `this`-check call.
    """
    snippet = request.param
    expected = snippet.replace(
        "this", "_____WB$wombat$check$this$function_____(this)"
    )
    yield ContentForTests(snippet, expected)
def test_this_js_rewrite(simple_js_rewriter: JsRewriter, rewrite_this_js_content):
    """Check that the rewriter output matches the expected `this` rewrite."""
    rewritten = simple_js_rewriter.rewrite(rewrite_this_js_content.input_str)
    assert rewritten == rewrite_this_js_content.expected_str
class WrappedTestContent(ContentForTests):
    """Test content whose expected text is wrapped in the local JS definitions."""

    @staticmethod
    def wrap_script(text: str) -> str:
        """
        Return `text` surrounded by the local definitions expected around JS code.

        JsRewriter must add these local definitions around all JS code (when the
        code accesses one of the local variables).
        """
        # Names re-declared with `let` inside the wrapping block, in output order.
        local_names = (
            "window",
            "globalThis",
            "self",
            "document",
            "location",
            "top",
            "parent",
            "frames",
            "opener",
        )
        prelude = (
            "var _____WB$wombat$assign$function_____ = function(name) {return (self."
            "_wb_wombat && self._wb_wombat.local_init && self._wb_wombat.local_init"
            "(name)) || self[name]; };\n"
            "if (!self.__WB_pmw) { self.__WB_pmw = function(obj) { this.__WB_source ="
            " obj; return this; } }\n"
        )
        declarations = "".join(
            f'let {name} = _____WB$wombat$assign$function_____("{name}");\n'
            for name in local_names
        )
        # Doubled braces are literal `{` / `}` opening and closing the JS block.
        return f"{prelude}{{\n{declarations}let arguments;\n\n{text}\n}}"

    def __post_init__(self):
        """Run the parent initialisation, then wrap the expected content."""
        super().__post_init__()
        unwrapped = self.expected_str
        self.expected = self.wrap_script(unwrapped)
@pytest.fixture(
    params=[
        # Each case is (input, expected before wrapping); WrappedTestContent adds
        # the surrounding local definitions to the expected text itself.
        WrappedTestContent(
            "location = http://example.com/",
            "location = ((self.__WB_check_loc && self.__WB_check_loc(location, "
            "arguments)) || {}).href = http://example.com/",
        ),
        WrappedTestContent(
            " location = http://example.com/2",
            " location = ((self.__WB_check_loc && self.__WB_check_loc(location, "
            "arguments)) || {}).href = http://example.com/2",
        ),
        WrappedTestContent("func(location = 0)", "func(location = 0)"),
        # NOTE(review): identical to the second case as displayed; whitespace
        # differences may have been lost - confirm against upstream.
        WrappedTestContent(
            " location = http://example.com/2",
            " location = ((self.__WB_check_loc && self.__WB_check_loc(location, "
            "arguments)) || {}).href = http://example.com/2",
        ),
        WrappedTestContent("window.eval(a)", "window.eval(a)"),
        WrappedTestContent("x = window.eval; x(a);", "x = window.eval; x(a);"),
        WrappedTestContent(
            "this. location = 'http://example.com/'",
            "this. location = 'http://example.com/'",
        ),
        WrappedTestContent(
            "if (self.foo) { console.log('blah') }",
            "if (self.foo) { console.log('blah') }",
        ),
        WrappedTestContent("window.x = 5", "window.x = 5"),
    ]
)
def rewrite_wrapped_content(request):
    """Yield each WrappedTestContent case in turn."""
    yield request.param
def test_wrapped_rewrite(simple_js_rewriter: JsRewriter, rewrite_wrapped_content):
    """Check that the rewriter output matches the wrapped expected content."""
    rewritten = simple_js_rewriter.rewrite(rewrite_wrapped_content.input_str)
    assert rewritten == rewrite_wrapped_content.expected_str
class ImportTestContent(ContentForTests):
    """Test content for JS modules: expected text starts with the module import."""

    @staticmethod
    def wrap_import(text: str) -> str:
        """
        Return `text` prefixed with the import line expected in rewritten modules.

        JsRewriter must add this import line at the beginning of module code (when
        the code contains `import` or `export`).
        """
        # The JS is stored in the zim file as `_zim_static/__wb_module_decl.js`
        # and imported from `https://exemple.com/some/path/`, so the path is
        # `../../../_zim_static/__wb_module_decl.js`.
        import_line = (
            "import { window, globalThis, self, document, location, top, parent, "
            'frames, opener } from "../../../_zim_static/__wb_module_decl.js";\n'
        )
        return f"{import_line}{text}"

    def __post_init__(self):
        """Run the parent initialisation, set the article URL, wrap the expected."""
        super().__post_init__()
        self.article_url = "https://exemple.com/some/path/"
        unwrapped = self.expected_str
        self.expected = self.wrap_import(unwrapped)
# NOTE(review): the multi-line JS literals below are spelled line by line with the
# content exactly as displayed in the source; confirm leading whitespace inside
# them against upstream.
@pytest.fixture(
    params=[
        # import rewrite
        ImportTestContent(
            'import "foo";\n'
            "\n"
            "a = this.location",
            'import "foo";\n'
            "\n"
            "a = _____WB$wombat$check$this$function_____(this).location",
        ),
        # import/export module rewrite
        ImportTestContent(
            "a = this.location\n"
            "\n"
            "export { a };\n",
            "a = _____WB$wombat$check$this$function_____(this).location\n"
            "\n"
            "export { a };\n",
        ),
        # rewrite ESM module import
        ImportTestContent(
            'import "https://example.com/file.js"',
            'import "../../../example.com/file.js"',
        ),
        ImportTestContent(
            "\n"
            "import {A, B}\n"
            "from\n"
            '"https://example.com/file.js"',
            "\n"
            "import {A, B}\n"
            "from\n"
            '"../../../example.com/file.js"',
        ),
        ImportTestContent(
            "\n"
            'import * from "https://example.com/file.js"\n'
            'import A from "http://example.com/path/file2.js";\n'
            "\n"
            'import {C, D} from "./abc.js";\n'
            'import {X, Y} from "../parent.js";\n'
            'import {E, F, G} from "/path.js";\n'
            'import { Z } from "../../../path.js";\n'
            "\n"
            "B = await import(somefile);\n",
            "\n"
            'import * from "../../../example.com/file.js"\n'
            'import A from "../../../example.com/path/file2.js";\n'
            "\n"
            'import {C, D} from "./abc.js";\n'
            'import {X, Y} from "../parent.js";\n'
            'import {E, F, G} from "../../path.js";\n'
            'import { Z } from "../../path.js";\n'
            "\n"
            "B = await ____wb_rewrite_import__(import.meta.url, somefile);\n",
        ),
        ImportTestContent(
            'import"import.js";import{A, B, C} from"test.js";'
            '(function() => { frames[0].href = "/abc"; })',
            'import"import.js";import{A, B, C} from"test.js";'
            '(function() => { frames[0].href = "/abc"; })',
        ),
        ImportTestContent(
            "a = location\n"
            "\n"
            "export{ a, $ as b};\n",
            "a = location\n"
            "\n"
            "export{ a, $ as b};\n",
        ),
    ]
)
def rewrite_import_content(request):
    """Yield each ImportTestContent case in turn."""
    yield request.param
def test_import_rewrite(no_js_notify, rewrite_import_content):
    """Rewrite the content as a JS module and compare with the expected text."""
    article_url = HttpUrl(rewrite_import_content.article_url)
    url_rewriter = ArticleUrlRewriter(article_url, set())
    js_rewriter = JsRewriter(
        url_rewriter=url_rewriter,
        base_href=None,
        notify_js_module=no_js_notify,
    )
    rewritten = js_rewriter.rewrite(
        rewrite_import_content.input_str, opts={"isModule": True}
    )
    assert rewritten == rewrite_import_content.expected_str
# NOTE(review): literals are reproduced exactly as displayed in the source; confirm
# whitespace inside them (e.g. after `this.`) against upstream.
@pytest.fixture(
    params=[
        "return this.abc",
        "return this object",
        "a = 'some, this object'",
        "{foo: bar, this: other}",
        "this.$location = http://example.com/",
        "this. $location = http://example.com/",
        "this. _location = http://example.com/",
        "this. alocation = http://example.com/",
        "this.location = http://example.com/",
        ",eval(a)",
        "this.$eval(a)",
        "x = $eval; x(a);",
        "obj = { eval : 1 }",
        "x = obj.eval",
        "x = obj.eval(a)",
        "x = obj._eval(a)",
        "x = obj.$eval(a)",
        "if (a.self.foo) { console.log('blah') }",
        "a.window.x = 5",
        " postMessage({'a': 'b'})",
        "simport(5);",
        "a.import(5);",
        "$import(5);",
        "async import(val) { ... }",
        # `import` appearing inside a JS string literal.
        (
            "function blah() {\n"
            'const text = "text: import a from B.js";\n'
            "}\n"
        ),
        # `import` appearing inside a JS template literal.
        (
            "function blah() {\n"
            "const text = `\n"
            'import a from "https://example.com/B.js"\n'
            "`;\n"
            "}\n"
            "\n"
        ),
        "let a = 7; var b = 5; const foo = 4;\n\n",
    ]
)
def no_rewrite_js_content(request):
    """Yield each JS snippet that is expected to come out of the rewriter as-is."""
    yield request.param
def test_no_rewrite(simple_js_rewriter: JsRewriter, no_rewrite_js_content):
    """Check that the rewriter returns the snippet unchanged."""
    rewritten = simple_js_rewriter.rewrite(no_rewrite_js_content)
    assert rewritten == no_rewrite_js_content
@pytest.mark.parametrize(
    "js_src,expected_js_module_path",
    [
        ("./my-module-script.js", "kiwix.org/my_folder/my-module-script.js"),
        ("../my-module-script.js", "kiwix.org/my-module-script.js"),
        ("../../../my-module-script.js", "kiwix.org/my-module-script.js"),
        ("/my-module-script.js", "kiwix.org/my-module-script.js"),
        ("//myserver.com/my-module-script.js", "myserver.com/my-module-script.js"),
        (
            "https://myserver.com/my-module-script.js",
            "myserver.com/my-module-script.js",
        ),
    ],
)
def test_js_rewrite_nested_module_detected(js_src, expected_js_module_path):
    """Check that an imported module is notified exactly once with its ZIM path."""
    notified_paths = []

    def record_module(zim_path: ZimPath):
        # Collect every path passed to the notification callback.
        notified_paths.append(zim_path)

    article_url = HttpUrl("http://kiwix.org/my_folder/my_article.html")
    js_rewriter = JsRewriter(
        url_rewriter=ArticleUrlRewriter(article_url, set()),
        base_href=None,
        notify_js_module=record_module,
    )
    js_rewriter.rewrite(f'import * from "{js_src}"', opts={"isModule": True})

    assert len(notified_paths) == 1
    assert notified_paths[0].value == expected_js_module_path