import { execSync } from "child_process";
import fs from "fs";
import path from "path";
import yaml from "js-yaml";
import Redis from "ioredis";

const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";

const extraPagesFile =
  "test-crawls/collections/int-state-test/pages/extraPages.jsonl";

function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
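
// Poll `docker ps` until the given container no longer appears, i.e. the crawl has exited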
async function waitContainerDone(containerId) {
  // containerId is initially the full id, but docker ps
  // only prints the short id (first 12 characters)
  containerId = containerId.slice(0, 12);

  while (true) {
    try {
      const res = execSync("docker ps -q", { encoding: "utf-8" });
      if (res.indexOf(containerId) < 0) {
        return;
      }
    } catch (e) {
      console.error(e);
    }
    await sleep(500);
  }
}
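
// Interrupt the crawl with SIGINT (triggering a graceful shutdown and state save),
// then wait for the container to exit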
async function killContainer(containerId) {
  try {
    execSync(`docker kill -s SIGINT ${containerId}`);
  } catch (e) {
    return;
  }

  await waitContainerDone(containerId);
}
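
// State shared across tests: the saved-state filename from the interrupted crawl
// and the queue/finished data parsed back out of it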
let savedStateFile;
let state;
let numDone;
let numQueued;
let finished;
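
// Start a crawl in Docker, wait until at least one page has been written to
// pages.jsonl, then interrupt it and verify a saved-state file was created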
test("check crawl interrupted + saved state written", async () => {
  let containerId = null;

  try {
    containerId = execSync(
      "docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10 --behaviors \"\" --exclude community",
      { encoding: "utf-8" },
      //wait.callback,
    );
  } catch (error) {
    console.log(error);
  }

  // remove existing pagesFile to support reentrancy
  try {
    fs.unlinkSync(pagesFile);
  } catch (e) {
    // ignore
  }

  while (true) {
    try {
      const pages = fs
        .readFileSync(pagesFile, { encoding: "utf-8" })
        .trim()
        .split("\n");

      if (pages.length >= 2) {
        break;
      }
    } catch (e) {
      // ignore
    }

    await sleep(500);
  }

  await killContainer(containerId);

  const savedStates = fs.readdirSync(
    "test-crawls/collections/int-state-test/crawls",
  );
  expect(savedStates.length > 0).toEqual(true);

  savedStateFile = savedStates[savedStates.length - 1];
});
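
// Load the saved-state YAML and verify it records both finished pages and a
// non-empty queue that together add up to the crawl --limit of 10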
test("check parsing saved state + page done + queue present", () => {
  expect(savedStateFile).toBeTruthy();

  const savedState = fs.readFileSync(
    path.join("test-crawls/collections/int-state-test/crawls", savedStateFile),
    "utf-8",
  );

  const saved = yaml.load(savedState);

  state = saved.state;
  finished = state.finished;

  numDone = finished.length;
  numQueued = state.queued.length;

  expect(!!state).toBe(true);
  expect(numDone > 0).toEqual(true);
  expect(numQueued > 0).toEqual(true);
  expect(numDone + numQueued).toEqual(10);

  // ensure extra seeds also set
  expect(state.extraSeeds).toEqual([
    `{"origSeedId":0,"newUrl":"https://webrecorder.net/"}`,
  ]);
});
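
// Restart the crawl from the saved state via --config, exposing Redis on a host
// port (--debugAccessRedis) so the test can check the restored "done" count and
// finished-page set directly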
test("check crawl restarted with saved state", async () => {
  let containerId = null;

  const port = 36379;

  try {
    containerId = execSync(
      `docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors "" --exclude community`,
      { encoding: "utf-8" },
    );
  } catch (error) {
    console.log(error);
  }

  await sleep(2000);

  const redis = new Redis(`redis://127.0.0.1:${port}/0`, {
    lazyConnect: true,
    retryStrategy: () => null,
  });

  try {
    await redis.connect({
      maxRetriesPerRequest: 100,
    });

    await sleep(2000);

    expect(await redis.get("test:d")).toBe(numDone + "");

    for (const url of finished) {
      const res = await redis.sismember("test:s", url);
      expect(res).toBe(1);
    }
  } catch (e) {
    console.log(e);
  } finally {
    await waitContainerDone(containerId);
  }
});
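
// Verify the page lists written across both runs: pages.jsonl should contain a
// header line plus the seed page, and extraPages.jsonl a header line plus the
// remaining crawled pages, together covering the full --limit of 10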
test("ensure correct number of pages was written to pages + extraPages", () => {
  const pages = fs
    .readFileSync(pagesFile, { encoding: "utf-8" })
    .trim()
    .split("\n");

  // first line is the header
  expect(pages.length).toBe(2);

  const extraPages = fs
    .readFileSync(extraPagesFile, { encoding: "utf-8" })
    .trim()
    .split("\n");

  // first line is the header
  expect(extraPages.length).toBe(10);
});