Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 06:23:16 +00:00

Fixes #841. Crawler work toward supporting long URL lists in Browsertrix. This PR moves seed handling from the arg parser's validation step to the crawler's bootstrap step so that the seed file can be fetched asynchronously from a URL.
---------
Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
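At bootstrap, resolving the seed list might look roughly like the sketch below. This is a simplified illustration, not the crawler's actual code: loadSeeds is a hypothetical helper name, and Node 18+ (global fetch) is assumed. The point it shows is that a --urlFile value may be either a local path or an http(s) URL, and a remote list is fetched asynchronously rather than read synchronously during argument validation.

import { readFile } from "fs/promises";

// Hypothetical helper: resolve a seed list from either a local path or a URL.
async function loadSeeds(urlFile) {
  let text;
  if (urlFile.startsWith("http://") || urlFile.startsWith("https://")) {
    // Remote seed list: fetch it asynchronously (global fetch, Node 18+).
    const resp = await fetch(urlFile);
    if (!resp.ok) {
      throw new Error(`seed file fetch failed: ${resp.status}`);
    }
    text = await resp.text();
  } else {
    // Local seed list: read it from disk.
    text = await readFile(urlFile, "utf8");
  }
  // One URL per line; ignore blank lines.
  return text
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
}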
76 lines · 2.1 KiB · JavaScript
import util from "util";
import { exec as execCallback } from "child_process";
import fs from "fs";

const exec = util.promisify(execCallback);

test("check that URLs in seed-list are crawled", async () => {
  try {
    // Run the crawler against a local seed-list file mounted into the container.
    await exec(
      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000",
    );
  } catch (error) {
    console.log(error);
  }

  // Pages recorded by the crawl, one JSON object per line.
  let crawled_pages = fs.readFileSync(
    "test-crawls/collections/filelisttest/pages/pages.jsonl",
    "utf8",
  );
  let seed_file = fs
    .readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")
    .split("\n")
    .sort();

  // Collect the seed URLs, skipping blank lines such as the trailing newline.
  let seed_file_list = [];
  for (let j = 0; j < seed_file.length; j++) {
    if (seed_file[j]) {
      seed_file_list.push(seed_file[j]);
    }
  }

  // Flips to false if any seed URL is missing from the crawled pages.
  let foundSeedUrl = true;

  for (let i = 0; i < seed_file_list.length; i++) {
    if (crawled_pages.indexOf(seed_file_list[i]) == -1) {
      foundSeedUrl = false;
    }
  }
  expect(foundSeedUrl).toBe(true);
});

test("check that URLs in seed-list hosted at URL are crawled", async () => {
|
|
try {
|
|
await exec(
|
|
'docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/fixtures/urlSeedFile.txt" --timeout 90000',
|
|
);
|
|
} catch (error) {
|
|
console.log(error);
|
|
}
|
|
|
|
let crawled_pages = fs.readFileSync(
|
|
"test-crawls/collections/onlinefilelisttest/pages/pages.jsonl",
|
|
"utf8",
|
|
);
|
|
let seed_file = fs
|
|
.readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")
|
|
.split("\n")
|
|
.sort();
|
|
|
|
let seed_file_list = [];
|
|
for (var j = 0; j < seed_file.length; j++) {
|
|
if (seed_file[j] != undefined) {
|
|
seed_file_list.push(seed_file[j]);
|
|
}
|
|
}
|
|
|
|
let foundSeedUrl = true;
|
|
|
|
for (var i = 1; i < seed_file_list.length; i++) {
|
|
if (crawled_pages.indexOf(seed_file_list[i]) == -1) {
|
|
foundSeedUrl = false;
|
|
}
|
|
}
|
|
expect(foundSeedUrl).toBe(true);
|
|
});
|