browsertrix-crawler/tests/multi-instance-crawl.test.js

import { exec, execSync } from "child_process";
import fs from "fs";
import { Redis } from "ioredis";
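
// Integration test: two crawler containers join the same crawl ("testcrawl")
// through an external Redis store and split the pages between them.
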
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
let redisId;
let crawler1, crawler2;
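
// Set up: clean old collections, create a Docker network, start an external
// Redis on it, then launch two crawler instances that share the same crawl.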
beforeAll(() => {
  fs.rmSync("./test-crawls/collections/shared-crawler-1", { recursive: true, force: true });
  fs.rmSync("./test-crawls/collections/shared-crawler-2", { recursive: true, force: true });

  execSync("docker network create crawl");

  // keep only the container id, without the trailing newline from execSync
  redisId = execSync("docker run --rm --network=crawl -p 37379:6379 --name redis -d redis").toString().trim();

  crawler1 = runCrawl("crawler-1");
  crawler2 = runCrawl("crawler-2");
});
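
// Tear down: stop Redis, wait for both crawler containers to exit, then
// remove the network.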
afterAll(async () => {
  execSync(`docker kill ${redisId}`);

  await sleep(3000);

  await Promise.allSettled([crawler1, crawler2]);

  execSync("docker network rm crawl");
});
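
// Start one crawler container on the shared network. Both instances use the
// same --crawlId and --redisStoreUrl, so they work off shared crawl state in
// the external Redis. Resolves with the container's exit code.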
function runCrawl(name) {
  const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=crawl --hostname=${name} webrecorder/browsertrix-crawler crawl --url https://www.webrecorder.net/ --limit 4 --collection shared-${name} --crawlId testcrawl --redisStoreUrl redis://redis:6379`);

  return new Promise((resolve) => {
    crawler.on("exit", (code) => {
      resolve(code);
    });
  });
}
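
// While the crawl is running, both instances should report "running" in the
// shared crawl's status hash. Retry a few times to allow for startup time.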
test("run crawlers with external redis", async () => {
const redis = new Redis("redis://127.0.0.1:37379/0", { lazyConnect: true, retryStrategy: () => null });
await sleep(3000);
await redis.connect({ maxRetriesPerRequest: 50 });
let count = 0;
while (true) {
try {
const values = await redis.hgetall("testcrawl:status");
expect(values["crawler-1"]).toBe("running");
expect(values["crawler-2"]).toBe("running");
break;
} catch (e) {
if (count++ < 5) {
await sleep(1000);
continue;
}
throw e;
}
}
});
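
// Both crawler containers should exit cleanly (exit code 0).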
test("finish crawls successfully", async () => {
const res = await Promise.allSettled([crawler1, crawler2]);
expect(res[0].value).toBe(0);
expect(res[1].value).toBe(0);
}, 180000);
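
// The single seed page should appear in exactly one of the two crawlers'
// pages.jsonl files; only the combined count is checked.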
test("ensure correct number of pages", () => {
expect(
fs.existsSync("test-crawls/collections/shared-crawler-1/pages/pages.jsonl"),
).toBe(true);
expect(
fs.existsSync("test-crawls/collections/shared-crawler-2/pages/pages.jsonl"),
).toBe(true);
const pages_1 = fs
.readFileSync(
"test-crawls/collections/shared-crawler-1/pages/pages.jsonl",
"utf8",
)
.trim()
.split("\n");
const pages_2 = fs
.readFileSync(
"test-crawls/collections/shared-crawler-2/pages/pages.jsonl",
"utf8",
)
.trim()
.split("\n");
// add 2 for heading in each file
expect(pages_1.length + pages_2.length).toBe(1 + 2);
});
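
// The remaining non-seed pages (4 page limit minus the seed) should be split
// across the two extraPages.jsonl files.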
test("ensure correct number of extraPages", () => {
expect(
fs.existsSync("test-crawls/collections/shared-crawler-1/pages/extraPages.jsonl"),
).toBe(true);
expect(
fs.existsSync("test-crawls/collections/shared-crawler-2/pages/extraPages.jsonl"),
).toBe(true);
const pages_1 = fs
.readFileSync(
"test-crawls/collections/shared-crawler-1/pages/extraPages.jsonl",
"utf8",
)
.trim()
.split("\n");
const pages_2 = fs
.readFileSync(
"test-crawls/collections/shared-crawler-2/pages/extraPages.jsonl",
"utf8",
)
.trim()
.split("\n");
// add 2 for heading in each file
expect(pages_1.length + pages_2.length).toBe(3 + 2);
});