browsertrix-crawler/tests/url_file_list.test.js
Support downloading seed file from URL (#852)
Fixes #841 

This is crawler-side work toward supporting long URL lists in Browsertrix. The PR moves seed
handling from the arg parser's validation step to the crawler's bootstrap step so that a seed
file can be fetched asynchronously from a URL.
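
As a rough illustration of what that bootstrap step enables, here is a minimal sketch of resolving a --urlFile value that may be either a local path or an http(s) URL. The resolveSeedFile helper name is hypothetical, not the crawler's actual code, and the sketch assumes Node 18+ for the global fetch API:

import fs from "fs/promises";

// Hypothetical helper (not the crawler's actual API): resolve a --urlFile
// argument at bootstrap, fetching it when it is an http(s) URL and reading
// it from disk otherwise.
async function resolveSeedFile(urlFile) {
  let text;
  if (urlFile.startsWith("http://") || urlFile.startsWith("https://")) {
    const resp = await fetch(urlFile);
    if (!resp.ok) {
      throw new Error(`Error fetching seed file: ${resp.status}`);
    }
    text = await resp.text();
  } else {
    text = await fs.readFile(urlFile, "utf8");
  }
  // One seed URL per line; ignore blank lines.
  return text
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
}

The tests below exercise both forms by pointing --urlFile at a local fixture and at the same fixture hosted on raw.githubusercontent.com.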

import util from "util";
import { exec as execCallback } from "child_process";
import fs from "fs";

const exec = util.promisify(execCallback);

test("check that URLs in seed-list are crawled", async () => {
  // Run the crawler in Docker against a seed file mounted into the container.
  try {
    await exec(
      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000",
    );
  } catch (error) {
    console.log(error);
  }

  const crawledPages = fs.readFileSync(
    "test-crawls/collections/filelisttest/pages/pages.jsonl",
    "utf8",
  );

  // Read the seed file and keep only non-empty lines.
  const seedUrls = fs
    .readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")
    .split("\n")
    .filter((line) => line.trim().length > 0);

  // Every seed URL must appear in the crawl's pages.jsonl output.
  let foundAllSeedUrls = true;
  for (const url of seedUrls) {
    if (!crawledPages.includes(url)) {
      foundAllSeedUrls = false;
    }
  }
  expect(foundAllSeedUrls).toBe(true);
});

test("check that URLs in seed-list hosted at URL are crawled", async () => {
  // Run the crawler in Docker, passing the seed file as a remote URL.
  try {
    await exec(
      'docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/fixtures/urlSeedFile.txt" --timeout 90000',
    );
  } catch (error) {
    console.log(error);
  }

  const crawledPages = fs.readFileSync(
    "test-crawls/collections/onlinefilelisttest/pages/pages.jsonl",
    "utf8",
  );

  // The remote seed file matches the local fixture, so compare against that.
  const seedUrls = fs
    .readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")
    .split("\n")
    .filter((line) => line.trim().length > 0);

  // Every seed URL must appear in the crawl's pages.jsonl output.
  let foundAllSeedUrls = true;
  for (const url of seedUrls) {
    if (!crawledPages.includes(url)) {
      foundAllSeedUrls = false;
    }
  }
  expect(foundAllSeedUrls).toBe(true);
});