browsertrix-crawler/tests/multi-instance-crawl.test.ts
Emma Segal-Grossman 64fdaf0d11
Convert tests from JS to TS (#1003)
- sets up ts-jest for typescript tests
- various type improvements, some shared functions added in utils.ts
- fixes sitemap test to not check sitemapDone, as it does not mean all URLs have been queued, uses sitemap path to avoid redirect
---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
2026-04-02 17:05:41 -07:00

148 lines
3.5 KiB
TypeScript

import { exec, execSync } from "child_process";
import fs from "fs";
import { Redis } from "ioredis";
import { sleep } from "./utils";
// Container id from `docker run -d` — raw stdout Buffer, includes a trailing newline.
let redisId: NonSharedBuffer;
// Resolve with each crawler container's exit code (null if killed before exiting).
let crawler1: Promise<number | null>, crawler2: Promise<number | null>;
beforeAll(() => {
  // Remove any leftover output from previous runs of this test.
  for (const coll of ["shared-crawler-1", "shared-crawler-2"]) {
    fs.rmSync(`./test-crawls/collections/${coll}`, {
      recursive: true,
      force: true,
    });
  }

  // Dedicated docker network so crawlers and redis can reach each other by hostname.
  execSync("docker network create crawl");

  // Shared redis instance; port 37379 is exposed so the test itself can inspect state.
  redisId = execSync(
    "docker run --rm --network=crawl -p 37379:6379 --name redis -d redis",
  );

  // Start both crawler containers concurrently against the shared redis.
  crawler1 = runCrawl("crawler-1");
  crawler2 = runCrawl("crawler-2");
});
afterAll(async () => {
  // redisId is the raw stdout Buffer from `docker run -d` and includes a
  // trailing newline — trim it so the kill command is a single clean line.
  execSync(`docker kill ${redisId.toString().trim()}`);

  // Give the crawlers a moment to notice redis is gone, then wait for both
  // to terminate (allSettled: we don't care how they exited here).
  await sleep(3000);
  await Promise.allSettled([crawler1, crawler2]);

  // Network can only be removed once all attached containers are gone.
  execSync("docker network rm crawl");
});
/**
 * Launch one crawler container on the shared docker network.
 *
 * @param name - hostname for the container; also used to name its collection
 * @returns promise resolving with the container's exit code, or null if the
 *          process was killed by a signal or failed to spawn
 */
function runCrawl(name: string) {
  const crawler = exec(
    `docker run --rm -v $PWD/test-crawls:/crawls --network=crawl --hostname=${name} webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 4 --exclude community --collection shared-${name} --crawlId testcrawl --redisStoreUrl redis://redis:6379`,
  );

  return new Promise<number | null>((resolve) => {
    crawler.on("exit", (code) => {
      resolve(code);
    });
    // If the child process fails to spawn, "exit" never fires — resolve with
    // null instead of leaving the promise (and the test suite) hanging.
    crawler.on("error", () => {
      resolve(null);
    });
  });
}
test("run crawlers with external redis", async () => {
  // Connect lazily so the connection attempt happens only after the
  // redis container has had time to start.
  const redis = new Redis("redis://127.0.0.1:37379/0", {
    lazyConnect: true,
    retryStrategy: () => null,
  });

  await sleep(3000);

  // Once connected, allow commands to retry while the crawlers spin up.
  redis.options.maxRetriesPerRequest = 50;

  await redis.connect();

  try {
    let count = 0;
    // Poll until both crawler instances report "running" (up to ~5s);
    // jest's expect() throws on mismatch, which drives the retry loop.
    while (true) {
      try {
        const values = await redis.hgetall("testcrawl:status");
        expect(values["crawler-1"]).toBe("running");
        expect(values["crawler-2"]).toBe("running");
        break;
      } catch (e) {
        if (count++ < 5) {
          await sleep(1000);
          continue;
        }
        throw e;
      }
    }
  } finally {
    // Always close the connection — a leaked ioredis handle keeps the
    // jest process alive after the test run finishes.
    redis.disconnect();
  }
});
test("finish crawls successfully", async () => {
  // Wait for both crawler containers; each must have exited with code 0.
  const results = await Promise.allSettled([crawler1, crawler2]);
  for (const result of results) {
    expect(result.status === "fulfilled" ? result.value : null).toBe(0);
  }
}, 180000);
test("ensure correct number of pages", () => {
  const path1 = "test-crawls/collections/shared-crawler-1/pages/pages.jsonl";
  const path2 = "test-crawls/collections/shared-crawler-2/pages/pages.jsonl";

  // Both crawlers must have produced a pages file.
  expect(fs.existsSync(path1)).toBe(true);
  expect(fs.existsSync(path2)).toBe(true);

  // One JSONL entry per line after trimming the trailing newline.
  const readLines = (path: string) =>
    fs.readFileSync(path, "utf8").trim().split("\n");

  const pages_1 = readLines(path1);
  const pages_2 = readLines(path2);

  // 1 seed page shared across the crawl, plus 1 heading line in each file.
  expect(pages_1.length + pages_2.length).toBe(1 + 2);
});
test("ensure correct number of extraPages", () => {
  const path1 =
    "test-crawls/collections/shared-crawler-1/pages/extraPages.jsonl";
  const path2 =
    "test-crawls/collections/shared-crawler-2/pages/extraPages.jsonl";

  // Both crawlers must have produced an extraPages file.
  expect(fs.existsSync(path1)).toBe(true);
  expect(fs.existsSync(path2)).toBe(true);

  // One JSONL entry per line after trimming the trailing newline.
  const readLines = (path: string) =>
    fs.readFileSync(path, "utf8").trim().split("\n");

  const pages_1 = readLines(path1);
  const pages_2 = readLines(path2);

  // 3 non-seed pages split across the two crawlers, plus 1 heading line
  // in each file.
  expect(pages_1.length + pages_2.length).toBe(3 + 2);
});