browsertrix-crawler/tests/saved-state.test.js

import { execSync } from "child_process";
import fs from "fs";
import path from "path";
import yaml from "js-yaml";
import Redis from "ioredis";
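
// resolve after ms milliseconds, used by the polling loops below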
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
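
// send SIGINT to the container, then poll `docker ps` until it has exited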
async function waitContainer(containerId) {
  try {
    execSync(`docker kill -s SIGINT ${containerId}`);
  } catch (e) {
    return;
  }

  // containerId is initially the full id, but docker ps
  // only prints the short id (first 12 characters)
  containerId = containerId.slice(0, 12);

  while (true) {
    try {
      const res = execSync("docker ps -q", { encoding: "utf-8" });
      if (res.indexOf(containerId) < 0) {
        return;
      }
    } catch (e) {
      console.error(e);
    }

    await sleep(500);
  }
}
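
// crawl state shared across the sequential tests below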
let savedStateFile;
let state;
let numDone;
let numQueued;
let finished;
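
// start a crawl in docker, let it crawl a few pages, interrupt it,
// and check that a saved-state file was written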
test("check crawl interrupted + saved state written", async () => {
let containerId = null;
try {
containerId = execSync(
"docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10",
{ encoding: "utf-8" },
    );
  } catch (error) {
    console.log(error);
  }

  const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";

  // remove existing pagesFile to support reentrancy
  try {
    fs.unlinkSync(pagesFile);
  } catch (e) {
    // ignore
  }
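
  // poll pages.jsonl until at least two pages have been written, so the
  // interrupted crawl will have both finished and still-queued URLs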
  while (true) {
    try {
      const pages = fs
        .readFileSync(pagesFile, { encoding: "utf-8" })
        .trim()
        .split("\n");

      if (pages.length >= 2) {
        break;
      }
    } catch (e) {
      // ignore
    }

    await sleep(500);
  }
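
  // interrupt the crawl and wait for the container to exit; the crawler
  // should write its saved state to disk before exiting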
  await waitContainer(containerId);

  const savedStates = fs.readdirSync(
    "test-crawls/collections/int-state-test/crawls",
  );
  expect(savedStates.length > 0).toEqual(true);

  savedStateFile = savedStates[savedStates.length - 1];
});
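
// parse the saved-state YAML and verify it captured a partial crawl:
// some pages finished, some still queued, plus the extra seeds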
test("check parsing saved state + page done + queue present", () => {
expect(savedStateFile).toBeTruthy();
const savedState = fs.readFileSync(
path.join("test-crawls/collections/int-state-test/crawls", savedStateFile),
"utf-8",
);
const saved = yaml.load(savedState);
expect(!!saved.state).toBe(true);
state = saved.state;
numDone = state.finished.length;
numQueued = state.queued.length;
expect(numDone > 0).toEqual(true);
expect(numQueued > 0).toEqual(true);
expect(numDone + numQueued).toEqual(10);
// ensure extra seeds also set
expect(state.extraSeeds).toEqual([
`{"origSeedId":0,"newUrl":"https://webrecorder.net/"}`,
]);
finished = state.finished;
});
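
// restart the crawl from the saved state and inspect the crawler's
// Redis state to verify previously finished pages are carried over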
test("check crawl restarted with saved state", async () => {
let containerId = null;
const port = 36379;
try {
containerId = execSync(
`docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`,
{ encoding: "utf-8" },
);
} catch (error) {
console.log(error);
}
await sleep(2000);
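
  // connect to the crawler's internal Redis, exposed on the mapped port
  // via --debugAccessRedis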
  const redis = new Redis(`redis://127.0.0.1:${port}/0`, {
    lazyConnect: true,
    maxRetriesPerRequest: 100,
    retryStrategy: () => null,
  });

  try {
    await redis.connect();

    await sleep(2000);
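
    // the done count ("test:d" for CRAWL_ID=test) should match the number
    // of pages finished in the interrupted crawl, restored from saved state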
    expect(await redis.get("test:d")).toBe(numDone + "");
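
    // every URL finished before the interrupt should already be present
    // in the crawler's "test:s" URL set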
    for (const url of finished) {
      const res = await redis.sismember("test:s", url);
      expect(res).toBe(1);
    }
  } catch (e) {
    console.log(e);
  } finally {
    await waitContainer(containerId);
  }
});