browsertrix-crawler/tests/add-exclusion.test.js
Ilya Kreymer bb9c82493b
QA Crawl Support (Beta) (#469)
Initial (beta) support for QA/replay crawling!
- Supports running a crawl over a given WACZ / list of WACZ (multi WACZ) input, hosted in ReplayWeb.page
- Runs local http server with full-page, ui-less ReplayWeb.page embed
- ReplayWeb.page release version configured in the Dockerfile, pinned ui.js and sw.js fetched directly from cdnjs

Can be deployed with `webrecorder/browsertrix-crawler qa` entrypoint.
- Requires `--qaSource`, pointing to a WACZ or multi-WACZ JSON file that will be replayed and QA'd
- Also supports `--qaRedisKey` where QA comparison data will be pushed, if specified.
- Supports `--qaDebugImageDiff` for outputting crawl / replay / diff images.
- If using --writePagesToRedis, a `comparison` key is added to existing page data where:
```
  comparison: {
    screenshotMatch?: number;
    textMatch?: number;
    resourceCounts: {
      crawlGood?: number;
      crawlBad?: number;
      replayGood?: number;
      replayBad?: number;
    };
  };
  ```
- bump version to 1.1.0-beta.2
2024-03-22 17:32:42 -07:00

56 lines
1.5 KiB
JavaScript

import { exec } from "child_process";
import Redis from "ioredis";
/**
 * Pause for a given duration.
 *
 * @param {number} ms - Delay, in milliseconds, passed to `setTimeout`.
 * @returns {Promise<void>} Promise that resolves (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Integration test: launches a crawl in Docker with its Redis port mapped to
 * the host, waits until the crawl queue holds more than one entry, pushes an
 * `addExclusion` message, and then checks the crawler's stdout for the log
 * lines showing the exclusion was added and queued URLs were removed.
 */
test("dynamically add exclusion while crawl is running", async () => {
  // `callback` is handed to exec(); it settles `p` with the process result.
  let callback = null;

  const p = new Promise((resolve) => {
    callback = (error, stdout, stderr) => {
      resolve({ error, stdout, stderr });
    };
  });

  try {
    exec(
      "docker run -p 36382:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis",
      { shell: "/bin/bash" },
      callback,
    );
  } catch (error) {
    console.log(error);
  }

  // Pause before the first connection attempt (presumably to let the
  // container and its Redis start up).
  await sleep(3000);

  // A retryStrategy returning null disables reconnects (per ioredis docs), so
  // commands fail instead of retrying once the connection is gone.
  const redis = new Redis("redis://127.0.0.1:36382/0", {
    lazyConnect: true,
    retryStrategy: () => null,
  });

  try {
    await redis.connect();

    // Poll until more than one entry is in the "test:q" sorted set.
    while (Number(await redis.zcard("test:q")) <= 1) {
      await sleep(500);
    }

    const uids = await redis.hkeys("test:status");

    // Fail fast with a clear message instead of pushing to "undefined:msg".
    expect(uids.length).toBeGreaterThan(0);

    // exclude all pages containing 'webrecorder', should clear out the queue and end the crawl
    await redis.rpush(
      `${uids[0]}:msg`,
      JSON.stringify({ type: "addExclusion", regex: "webrecorder" }),
    );

    // ensure 'Add Exclusion' is contained in the debug logs
    const { stdout } = await p;

    // includes() also accepts a match at index 0, unlike `indexOf(...) > 0`.
    expect(stdout.includes("Add Exclusion")).toBe(true);
    expect(stdout.includes("Removing excluded URL")).toBe(true);
  } finally {
    // Always release the client so a failed run does not leave a socket open.
    redis.disconnect();
  }
});