2022-10-24 15:30:10 +02:00
|
|
|
import child_process from "child_process";
|
|
|
|
import fs from "fs";
|
|
|
|
import yaml from "js-yaml";
|
2021-07-19 15:49:43 -07:00
|
|
|
|
|
|
|
/**
 * Runs one browsertrix-crawler crawl in Docker, passing the config as YAML on stdin.
 *
 * The settings these tests rely on are layered on top of the supplied config:
 * `generateCDX: true`, `depth: 0` and `collection: name`. They are applied to a
 * shallow copy, so the caller's object is not modified.
 *
 * The command's stdout is logged on success. If the command fails, the error is
 * logged and NOT rethrown (best effort).
 *
 * @param {string} name - Value used for the crawl's `collection` setting.
 * @param {object} config - Crawl settings to serialize; left untouched.
 * @param {string} [commandExtra=""] - Extra text appended verbatim to the shell
 *   command line, so it must come from trusted test code only.
 * @returns {void}
 */
function runCrawl(name, config, commandExtra = "") {
  // Existing keys keep their position and new keys are appended, so the
  // generated YAML matches what in-place assignment would have produced.
  const crawlConfig = {
    ...config,
    generateCDX: true,
    depth: 0,
    collection: name,
  };

  const configYaml = yaml.dump(crawlConfig);

  try {
    const proc = child_process.execSync(
      `docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`,
      // `input` is what gets piped to the child's stdin. The former
      // `stdin: "inherit"` entry was not a recognized execSync option
      // (the real option is `stdio`), so it has been dropped.
      { input: configYaml, encoding: "utf8" },
    );

    console.log(proc);
  } catch (error) {
    console.log(error);
  }
}
|
|
|
|
|
|
|
|
/**
 * Reports whether a collection's CDXJ index file contains the given value.
 *
 * The file read is `test-crawls/collections/<coll>/indexes/index.cdxj`,
 * resolved against the current working directory. It is read as raw bytes
 * and searched as-is.
 *
 * @param {string} coll - Collection name used to build the index path.
 * @param {string} value - Text to look for in the index.
 * @returns {boolean} `true` if the value occurs anywhere in the file.
 * @throws If the index file cannot be read.
 */
function doesCDXContain(coll, value) {
  const indexPath = `test-crawls/collections/${coll}/indexes/index.cdxj`;
  const indexBytes = fs.readFileSync(indexPath);
  return indexBytes.includes(value);
}
|
|
|
|
|
2024-06-13 12:12:46 -07:00
|
|
|
/**
 * Reports whether a collection's CDX index mentions a video response.
 *
 * Looks for the quoted string `"video/mp4"` first and only falls back to
 * `"application/vnd.yt-ump"` when the first lookup finds nothing.
 *
 * @param {string} coll - Collection name passed through to `doesCDXContain`.
 * @returns {boolean} `true` if either string is present in the index.
 */
function checkVideo(coll) {
  if (doesCDXContain(coll, '"video/mp4"')) {
    return true;
  }

  return doesCDXContain(coll, '"application/vnd.yt-ump"');
}
|
|
|
|
|
2023-10-09 09:41:50 -07:00
|
|
|
// Test disabled for Brave: the URL should always be blocked there, but CI behavior has been inconsistent
|
|
|
|
/*
|
2021-07-19 15:49:43 -07:00
|
|
|
test("test crawl without block for specific URL", () => {
|
|
|
|
const config = {
|
|
|
|
"url": "https://www.iana.org/",
|
2023-10-02 14:30:44 -07:00
|
|
|
"pageExtraDelay": 10
|
2021-07-19 15:49:43 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
runCrawl("block-1-no-block", config);
|
|
|
|
|
|
|
|
// without block rules, the AdSense URL is included
|
|
|
|
expect(doesCDXContain("block-1-no-block", "https://cse.google.com/adsense/search/async-ads.js")).toBe(true);
|
|
|
|
});
|
2023-10-09 09:41:50 -07:00
|
|
|
*/
|
2021-07-19 15:49:43 -07:00
|
|
|
|
|
|
|
test("test block rule on specific URL", () => {
  const collection = "block-1";
  // URL that must be absent from the index once the "adsense" rule is applied.
  const adsenseScriptUrl = "https://cse.google.com/adsense/search/async-ads.js";

  runCrawl(collection, {
    url: "https://www.iana.org/",
    blockRules: [{ url: "adsense" }],
  });

  const wasCaptured = doesCDXContain(collection, adsenseScriptUrl);
  expect(wasCaptured).toBe(false);
});
|
|
|
|
|
|
|
|
test("test block rule based on iframe text, content included due to match", () => {
  const collection = "block-2";

  // allowOnly rule for YouTube embed frames, keyed on a channelId text pattern.
  const embedRule = {
    url: "https://www.youtube.com/embed/",
    frameTextMatch:
      '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
    type: "allowOnly",
  };

  runCrawl(collection, {
    url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
    blockRules: [embedRule],
  });

  const hasVideo = checkVideo(collection);
  expect(hasVideo).toBe(true);
});
|
|
|
|
|
|
|
|
test("test block rule based on iframe text, wrong text, content should be excluded", () => {
  const collection = "block-3";

  // Same allowOnly rule shape as the matching case, but the channelId text
  // ends in "_R", which the test title describes as the wrong text.
  const embedRule = {
    url: "https://www.youtube.com/embed/",
    frameTextMatch:
      '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_R\\\\"',
    type: "allowOnly",
  };

  runCrawl(collection, {
    url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
    blockRules: [embedRule],
  });

  const hasVideo = checkVideo(collection);
  expect(hasVideo).toBe(false);
});
|
|
|
|
|
|
|
|
test("test block rule based on iframe text, block matched", () => {
  const collection = "block-4";

  // No explicit `type` on this rule; the test expects the video to be absent.
  const embedRule = {
    url: "https://www.youtube.com/embed/",
    frameTextMatch:
      '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
  };

  runCrawl(collection, {
    url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
    blockRules: [embedRule],
  });

  const hasVideo = checkVideo(collection);
  expect(hasVideo).toBe(false);
});
|
|
|
|
|
|
|
|
test("test rule based on iframe text not matching, plus allowOnly iframe", () => {
  const collection = "non-block-5";

  // Text-based block rule whose URL pattern targets example.com embeds.
  const textBlockRule = {
    url: "example.com/embed/",
    frameTextMatch:
      '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
    type: "block",
  };

  // allowOnly rule for youtube.com / example.com embeds inside oembed.link frames.
  const embedAllowRule = {
    url: "(youtube.com|example.com)/embed/",
    type: "allowOnly",
    inFrameUrl: "oembed.link/",
  };

  runCrawl(collection, {
    url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
    blockRules: [textBlockRule, embedAllowRule],
  });

  const hasVideo = checkVideo(collection);
  expect(hasVideo).toBe(true);
});
|
|
|
|
|
|
|
|
test("test block url in frame url", () => {
  const collection = "block-6";
  // Quoted thumbnail URL that must not show up in the index.
  const thumbnailEntry = '"https://i.ytimg.com/vi/aT-Up5Y4uRI/maxresdefault.jpg"';

  // Block rule for the thumbnail, scoped to YouTube embed frames via inFrameUrl.
  const thumbnailRule = {
    url: "maxresdefault.jpg",
    type: "block",
    inFrameUrl: "youtube.com/embed",
  };

  runCrawl(collection, {
    url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
    blockRules: [thumbnailRule],
  });

  const wasCaptured = doesCDXContain(collection, thumbnailEntry);
  expect(wasCaptured).toBe(false);
});
|
|
|
|
|
2021-07-27 09:41:21 -07:00
|
|
|
test("test block rules complex example, block external urls on main frame, but not on youtube", () => {
  const collection = "block-7";
  // Quoted script URL that the second rule below names explicitly.
  const lunrEntry = '"https://archiveweb.page/assets/js/vendor/lunr.min.js"';

  // allowOnly rule for archiveweb.page / www.youtube.com URLs inside archiveweb.page frames.
  const mainFrameAllowRule = {
    url: "(archiveweb.page|www.youtube.com)",
    type: "allowOnly",
    inFrameUrl: "archiveweb.page",
  };

  // Rule naming one specific script inside archiveweb.page frames (no explicit type).
  const lunrRule = {
    url: "https://archiveweb.page/assets/js/vendor/lunr.min.js",
    inFrameUrl: "archiveweb.page",
  };

  // allowOnly rule for YouTube embeds, keyed on a channelId text pattern.
  const youtubeEmbedRule = {
    url: "https://www.youtube.com/embed/",
    type: "allowOnly",
    frameTextMatch:
      '(\\\\"channelId\\\\":\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\")',
  };

  runCrawl(collection, {
    seeds: ["https://archiveweb.page/en/troubleshooting/errors/"],
    depth: "0",
    blockRules: [mainFrameAllowRule, lunrRule, youtubeEmbedRule],

    combineWARC: true,

    logging: "stats,debug",
  });

  const lunrCaptured = doesCDXContain(collection, lunrEntry);
  expect(lunrCaptured).toBe(false);

  const hasVideo = checkVideo(collection);
  expect(hasVideo).toBe(true);
});
|