browsertrix-crawler/tests/saved-state.test.js

121 lines
3 KiB
JavaScript
Raw Normal View History

import { exec } from "child_process";
import fs from "fs";
import path from "path";
import yaml from "js-yaml";
import Redis from "ioredis";
/**
 * Build a completion handle for a child process started with `exec`.
 *
 * @returns {{p: Promise<number>, callback: Function}} `callback` is meant to
 *   be passed as the `exec` completion callback; `p` resolves with 0 as soon
 *   as `callback` has been invoked, whatever arguments it receives.
 */
function waitForProcess() {
  let onExit = null;
  const exited = new Promise((resolve) => {
    // The exec-style arguments (error, stdout, stderr) are deliberately
    // ignored: the promise only signals that the process has finished.
    onExit = () => {
      resolve(0);
    };
  });
  return { p: exited, callback: onExit };
}
// State shared between the tests in this file. Each test builds on values
// recorded by an earlier one, so the tests rely on running in order.

// File name of the saved-state YAML chosen after the interrupted crawl.
let savedStateFile;
// `state` object parsed out of the saved-state file.
let state;
// Value of `state.done` from the saved state; compared against Redis after restart.
let numDone;
// ioredis client used to inspect the restarted crawler's Redis.
let redis;
// Promise that resolves once the restarted crawl process has exited.
let finishProcess;
// Starts a crawl in Docker, interrupts it with SIGINT once the pages file has
// at least two lines, and checks that a saved-state file was written.
test("check crawl interrupted + saved state written", async () => {
  const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";

  // Remove a pages file left over from a previous run *before* launching the
  // crawl, so the cleanup can never race with output written by this run.
  try {
    fs.unlinkSync(pagesFile);
  } catch (e) {
    // ignore: best-effort cleanup, the file usually does not exist
  }

  const wait = waitForProcess();

  // Track process exit so the polling loop below cannot spin forever when the
  // crawler terminates without producing any pages.
  let exited = false;
  wait.p.then(() => {
    exited = true;
  });

  // A failure to launch is not swallowed: without a process handle nothing
  // below can work, so let the error fail the test directly.
  const proc = exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --limit 20", {"shell": "/bin/bash"}, wait.callback);

  // Poll every 500ms until the pages file contains at least two lines.
  while (true) {
    let lineCount = 0;
    try {
      lineCount = fs.readFileSync(pagesFile, {encoding: "utf-8"}).trim().split("\n").length;
    } catch (e) {
      // ignore: the file has not been created yet
    }

    if (lineCount >= 2) {
      break;
    }

    if (exited) {
      throw new Error(`crawl process exited before ${pagesFile} had at least 2 lines`);
    }

    await new Promise((resolve) => setTimeout(resolve, 500));
  }

  proc.kill("SIGINT");

  await wait.p;

  // readdirSync documents no ordering; sort so that "last" is deterministic.
  // NOTE(review): presumably the lexicographically last name is the newest
  // saved state - confirm against the crawler's file naming.
  const savedStates = fs.readdirSync("test-crawls/collections/int-state-test/crawls").sort();
  expect(savedStates.length).toBeGreaterThan(0);

  savedStateFile = savedStates[savedStates.length - 1];
});
// Parses the saved-state YAML recorded by the interrupted crawl and checks
// that it reports finished pages and a non-empty queue. Also stores `state`
// and `numDone` for the restart test.
test("check parsing saved state + page done + queue present", () => {
  expect(savedStateFile).toBeTruthy();

  const savedStatePath = path.join("test-crawls/collections/int-state-test/crawls", savedStateFile);

  // NOTE(review): yaml.load is applied to a file read from the local
  // test-crawls directory; if this path could ever hold externally supplied
  // YAML, confirm the installed js-yaml's load() uses a safe schema.
  const saved = yaml.load(fs.readFileSync(savedStatePath, "utf-8"));

  expect(saved.state).toBeTruthy();
  state = saved.state;
  numDone = state.done;

  // Value matchers (rather than `expect(x > 0).toEqual(true)`) so a failure
  // reports the actual numbers.
  expect(state.done).toBeGreaterThan(0);
  expect(state.queued.length).toBeGreaterThan(0);
});
// Restarts the crawl from the saved-state file with the crawler's Redis
// published on localhost:36379, and checks that the "test:d" key matches the
// `done` value read from the saved state.
test("check crawl restarted with saved state", async () => {
  const wait = waitForProcess();

  // A failure to launch is not swallowed: without a process handle nothing
  // below can work, so let the error fail the test directly.
  const proc = exec(`docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`, {shell: "/bin/bash"}, wait.callback);

  // Publish the exit promise right away so the following test can wait for
  // the process even when an assertion below fails.
  finishProcess = wait.p;

  try {
    // Give the container a moment to start before connecting to its Redis.
    await new Promise((resolve) => setTimeout(resolve, 2000));

    // maxRetriesPerRequest is a client option; connect() takes no options
    // object, so it has to be supplied to the constructor to take effect.
    redis = new Redis("redis://127.0.0.1:36379/0", {lazyConnect: true, maxRetriesPerRequest: 50});

    await redis.connect();

    // Redis returns strings, so compare against the stringified counter.
    expect(await redis.get("test:d")).toBe(String(numDone));
  } finally {
    // Always interrupt the crawl, even on failure, so the container is not
    // left running after the test.
    proc.kill("SIGINT");
  }
});
// Waits for the restarted crawl process to finish and for the Redis client
// to quit, then checks that the process-exit promise settled with 0.
test("interrupt crawl and exit", async () => {
  const pending = [finishProcess, redis.quit()];
  const [processOutcome] = await Promise.allSettled(pending);

  expect(processOutcome.value).toBe(0);
});