mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2026-04-17 23:00:17 +00:00
- sets up ts-jest for typescript tests - various type improvements, some shared functions added in utils.ts - fixes sitemap test to not check sitemapDone, as it does not mean all URLs have been queued, uses sitemap path to avoid redirect --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
148 lines
3.5 KiB
TypeScript
148 lines
3.5 KiB
TypeScript
import { exec, execSync } from "child_process";
|
|
import fs from "fs";
|
|
import { Redis } from "ioredis";
|
|
import { sleep } from "./utils";
|
|
|
|
// Raw stdout captured when the Redis container is started in `beforeAll`;
// interpolated into the `docker kill` command during teardown.
let redisId: NonSharedBuffer;

// Each promise resolves with the exit code of one crawler process
// (see `runCrawl`); both are assigned in `beforeAll`.
let crawler1: Promise<number | null>;
let crawler2: Promise<number | null>;
|
|
|
|
/**
 * Suite setup: clears output left by earlier runs, creates the Docker network,
 * starts a Redis container on it and launches both crawler instances.
 */
beforeAll(() => {
  // Remove any collections left over from a previous run (`force` makes a
  // missing directory a no-op).
  for (const collection of ["shared-crawler-1", "shared-crawler-2"]) {
    fs.rmSync(`./test-crawls/collections/${collection}`, {
      recursive: true,
      force: true,
    });
  }

  // Network shared by the Redis container and both crawlers.
  execSync("docker network create crawl");

  // Redis is also published on host port 37379 so the test process can
  // connect to it directly; the command's stdout is kept for teardown.
  const startRedisCommand =
    "docker run --rm --network=crawl -p 37379:6379 --name redis -d redis";
  redisId = execSync(startRedisCommand);

  // Both crawlers are started without being awaited here; later tests wait
  // on the stored promises.
  crawler1 = runCrawl("crawler-1");
  crawler2 = runCrawl("crawler-2");
});
|
|
|
|
/**
 * Suite teardown: stops the Redis container, waits for both crawler processes
 * to exit, then removes the Docker network.
 *
 * The remaining cleanup runs in a `finally` block so that a failing
 * `docker kill` (for example when the container has already exited) does not
 * leave the `crawl` network behind, which would make the next run's
 * `docker network create crawl` fail. The original error still propagates.
 */
afterAll(async () => {
  try {
    // The captured stdout ends with a newline; strip it before building the
    // shell command.
    const containerId = redisId.toString("utf8").trim();
    execSync(`docker kill ${containerId}`);
  } finally {
    await sleep(3000);

    // Wait for both crawler processes to exit before removing the network,
    // presumably so that no container is still attached to it.
    await Promise.allSettled([crawler1, crawler2]);

    execSync("docker network rm crawl");
  }
});
|
|
|
|
/**
 * Launches one crawler container on the `crawl` network.
 *
 * @param name Used as the container hostname and as the suffix of the
 *   collection name (`shared-<name>`).
 * @returns A promise that resolves with the value passed to the child
 *   process's "exit" event (the exit code, or `null`). It never rejects.
 */
function runCrawl(name: string): Promise<number | null> {
  // Every instance uses the same crawl id and the same Redis store URL.
  const command = [
    "docker run --rm",
    "-v $PWD/test-crawls:/crawls",
    "--network=crawl",
    `--hostname=${name}`,
    "webrecorder/browsertrix-crawler crawl",
    "--url https://old.webrecorder.net/",
    "--limit 4",
    "--exclude community",
    `--collection shared-${name}`,
    "--crawlId testcrawl",
    "--redisStoreUrl redis://redis:6379",
  ].join(" ");

  const child = exec(command);

  // `resolve` only consumes the first listener argument (the exit code).
  return new Promise<number | null>((resolve) => {
    child.on("exit", resolve);
  });
}
|
|
|
|
/**
 * Verifies that both crawler instances report the "running" state in the
 * `testcrawl:status` hash of the external Redis instance.
 *
 * The Redis client is always disconnected in a `finally` block so the test
 * does not leave an open socket behind, whether it passes or fails.
 */
test("run crawlers with external redis", async () => {
  const redis = new Redis("redis://127.0.0.1:37379/0", {
    // Connect explicitly below, after Redis has had time to start.
    lazyConnect: true,
    // Never reconnect automatically once the connection is lost.
    retryStrategy: () => null,
    maxRetriesPerRequest: 50,
  });

  try {
    await sleep(3000);

    await redis.connect();

    // The crawlers may not have registered their status yet, so a failed
    // check is retried up to 5 times (6 attempts in total), one second apart.
    const maxRetries = 5;

    for (let attempt = 0; ; attempt++) {
      try {
        const values = await redis.hgetall("testcrawl:status");
        expect(values["crawler-1"]).toBe("running");
        expect(values["crawler-2"]).toBe("running");
        break;
      } catch (e) {
        if (attempt >= maxRetries) {
          throw e;
        }

        await sleep(1000);
      }
    }
  } finally {
    redis.disconnect();
  }
});
|
|
|
|
/**
 * Waits for both crawler processes to exit and expects exit code 0 from each.
 * Uses a 180-second timeout.
 */
test("finish crawls successfully", async () => {
  // A rejected outcome is mapped to null, which fails the `toBe(0)` check.
  const exitCodeOf = (outcome: PromiseSettledResult<number | null>) =>
    outcome.status === "fulfilled" ? outcome.value : null;

  const [first, second] = await Promise.allSettled([crawler1, crawler2]);

  expect(exitCodeOf(first)).toBe(0);
  expect(exitCodeOf(second)).toBe(0);
}, 180000);
|
|
|
|
/**
 * Checks that each crawler wrote a `pages.jsonl` file and that the two files
 * together contain the expected number of lines.
 */
test("ensure correct number of pages", () => {
  const pagesFiles = [
    "test-crawls/collections/shared-crawler-1/pages/pages.jsonl",
    "test-crawls/collections/shared-crawler-2/pages/pages.jsonl",
  ];

  // Both files must exist before either one is read.
  for (const file of pagesFiles) {
    expect(fs.existsSync(file)).toBe(true);
  }

  // Count the lines of each file, ignoring leading/trailing whitespace.
  let totalLines = 0;
  for (const file of pagesFiles) {
    const contents = fs.readFileSync(file, "utf8");
    totalLines += contents.trim().split("\n").length;
  }

  // 1 page entry overall, plus 2 for the heading line in each file
  expect(totalLines).toBe(1 + 2);
});
|
|
|
|
/**
 * Checks that each crawler wrote an `extraPages.jsonl` file and that the two
 * files together contain the expected number of lines.
 */
test("ensure correct number of extraPages", () => {
  const extraPagesFiles = [
    "test-crawls/collections/shared-crawler-1/pages/extraPages.jsonl",
    "test-crawls/collections/shared-crawler-2/pages/extraPages.jsonl",
  ];

  // Both files must exist before either one is read.
  extraPagesFiles.forEach((file) => {
    expect(fs.existsSync(file)).toBe(true);
  });

  // Line count per file, ignoring leading/trailing whitespace.
  const lineCounts = extraPagesFiles.map(
    (file) => fs.readFileSync(file, "utf8").trim().split("\n").length,
  );
  const totalLines = lineCounts.reduce((sum, count) => sum + count, 0);

  // 3 extra page entries overall, plus 2 for the heading line in each file
  expect(totalLines).toBe(3 + 2);
});
|