// Note: the crawls target https://old.webrecorder.net/ rather than webrecorder.net
// for now, since these tests rely on the old website.
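// Test: two browsertrix-crawler containers share a single crawl ("testcrawl")
// through an external Redis instance on a dedicated Docker network, each
// writing pages to its own collection.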
import { exec, execSync } from "child_process";
import fs from "fs";
import { Redis } from "ioredis";

function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

// container id of the shared Redis instance
let redisId;

// promises resolving to each crawler container's exit code
let crawler1, crawler2;

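// Set up: remove any previous test collections, create a dedicated Docker
// network, start a Redis container on it, then launch both crawlers.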
beforeAll(() => {
  fs.rmSync("./test-crawls/collections/shared-crawler-1", { recursive: true, force: true });
  fs.rmSync("./test-crawls/collections/shared-crawler-2", { recursive: true, force: true });

  execSync("docker network create crawl");

  redisId = execSync("docker run --rm --network=crawl -p 37379:6379 --name redis -d redis");

  crawler1 = runCrawl("crawler-1");
  crawler2 = runCrawl("crawler-2");
});

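// Tear down: stop Redis, wait for both crawlers to finish, then remove the network.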
afterAll(async () => {
  execSync(`docker kill ${redisId}`);

  await sleep(3000);

  await Promise.allSettled([crawler1, crawler2]);

  execSync("docker network rm crawl");
});

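// Launch a single crawler container on the shared network. Both crawlers use the
// same --crawlId and --redisStoreUrl, so they coordinate through the shared Redis
// instance while writing to separate collections.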
function runCrawl(name) {
  const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=crawl --hostname=${name} webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 4 --exclude community --collection shared-${name} --crawlId testcrawl --redisStoreUrl redis://redis:6379`);

  return new Promise((resolve) => {
    crawler.on("exit", (code) => {
      resolve(code);
    });
  });
}

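// Connect to Redis through the port published to the host and poll the crawl's
// status hash until both crawlers report "running" (retrying up to 5 times).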
test("run crawlers with external redis", async () => {
|
|
const redis = new Redis("redis://127.0.0.1:37379/0", { lazyConnect: true, retryStrategy: () => null });
|
|
|
|
await sleep(3000);
|
|
|
|
await redis.connect({ maxRetriesPerRequest: 50 });
|
|
|
|
let count = 0;
|
|
|
|
while (true) {
|
|
try {
|
|
const values = await redis.hgetall("testcrawl:status");
|
|
expect(values["crawler-1"]).toBe("running");
|
|
expect(values["crawler-2"]).toBe("running");
|
|
break;
|
|
} catch (e) {
|
|
if (count++ < 5) {
|
|
await sleep(1000);
|
|
continue;
|
|
}
|
|
|
|
throw e;
|
|
}
|
|
}
|
|
|
|
});
|
|
|
|
|
|
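// Both crawler containers should eventually exit with code 0.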
test("finish crawls successfully", async () => {
|
|
const res = await Promise.allSettled([crawler1, crawler2]);
|
|
expect(res[0].value).toBe(0);
|
|
expect(res[1].value).toBe(0);
|
|
}, 180000);
|
|
|
|
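// Across the two collections, the pages.jsonl files should contain one page
// entry in total, plus a header line in each file.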
test("ensure correct number of pages", () => {
|
|
|
|
expect(
|
|
fs.existsSync("test-crawls/collections/shared-crawler-1/pages/pages.jsonl"),
|
|
).toBe(true);
|
|
|
|
expect(
|
|
fs.existsSync("test-crawls/collections/shared-crawler-2/pages/pages.jsonl"),
|
|
).toBe(true);
|
|
|
|
const pages_1 = fs
|
|
.readFileSync(
|
|
"test-crawls/collections/shared-crawler-1/pages/pages.jsonl",
|
|
"utf8",
|
|
)
|
|
.trim()
|
|
.split("\n");
|
|
|
|
const pages_2 = fs
|
|
.readFileSync(
|
|
"test-crawls/collections/shared-crawler-2/pages/pages.jsonl",
|
|
"utf8",
|
|
)
|
|
.trim()
|
|
.split("\n");
|
|
|
|
// add 2 for heading in each file
|
|
expect(pages_1.length + pages_2.length).toBe(1 + 2);
|
|
});
|
|
|
|
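// The other three crawled pages are recorded in extraPages.jsonl, split between
// the two collections, plus a header line in each file.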
test("ensure correct number of extraPages", () => {
|
|
|
|
expect(
|
|
fs.existsSync("test-crawls/collections/shared-crawler-1/pages/extraPages.jsonl"),
|
|
).toBe(true);
|
|
|
|
expect(
|
|
fs.existsSync("test-crawls/collections/shared-crawler-2/pages/extraPages.jsonl"),
|
|
).toBe(true);
|
|
|
|
const pages_1 = fs
|
|
.readFileSync(
|
|
"test-crawls/collections/shared-crawler-1/pages/extraPages.jsonl",
|
|
"utf8",
|
|
)
|
|
.trim()
|
|
.split("\n");
|
|
|
|
const pages_2 = fs
|
|
.readFileSync(
|
|
"test-crawls/collections/shared-crawler-2/pages/extraPages.jsonl",
|
|
"utf8",
|
|
)
|
|
.trim()
|
|
.split("\n");
|
|
|
|
// add 2 for heading in each file
|
|
expect(pages_1.length + pages_2.length).toBe(3 + 2);
|
|
});
|