mirror of https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-08 06:09:48 +00:00
tests: add dedup-basic.test for simple dedup, ensure number of revisit records === number of response records
This commit is contained in:
parent c447428450
commit 0cadf371d0
1 changed file with 97 additions and 0 deletions
tests/dedup-basic.test.js (new file, 97 additions)
@@ -0,0 +1,97 @@
import { exec, execSync } from "child_process";
import fs from "fs";
import path from "path";
import { Redis } from "ioredis";
import { WARCParser } from "warcio";

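// Two identical crawls share one Redis dedup index: the first should write
// "response" records, while the second should write a matching "revisit"
// record for each URL it deduplicates against the first.
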
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

let redisId;
//let crawler1, crawler2;

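// Setup: create an isolated docker network and start a Redis container on it;
// `docker run -d` prints the container id, which is captured for teardown.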
beforeAll(() => {
  execSync("docker network create dedup");

  redisId = execSync("docker run --rm --network=dedup -p 37379:6379 --name dedup-redis -d redis");
});

afterAll(async () => {
  execSync(`docker kill ${redisId}`);

  await sleep(3000);

  //await Promise.allSettled([crawler1, crawler2]);

  execSync("docker network rm dedup");
});

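// Run one containerized crawl into its own collection, pointing the crawler's
// --redisDedupUrl at the shared dedup-redis container, and resolve with the
// container's exit code (0 on success).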
function runCrawl(name) {
  fs.rmSync(`./test-crawls/collections/${name}`, { recursive: true, force: true });

  const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 4 --exclude community --collection ${name} --redisDedupUrl redis://dedup-redis:6379`);

  return new Promise((resolve) => {
    crawler.on("exit", (code) => {
      resolve(code);
    });
  });
}

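// Open the first WARC written to a collection's archive directory and return
// a WARCParser over it; records can then be iterated with `for await`.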
function loadFirstWARC(name) {
  const archiveWarcLists = fs.readdirSync(
    `test-crawls/collections/${name}/archive`,
  );

  const warcName = path.join(`test-crawls/collections/${name}/archive`, archiveWarcLists[0]);

  const nodeStream = fs.createReadStream(warcName);

  const parser = new WARCParser(nodeStream);

  return parser;
}

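// Both crawls fetch the same pages (--limit 4), so after skipping urn:
// records, every response in the first WARC should be matched by exactly
// one revisit in the second.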
test("check revisit records written on duplicate crawl", async () => {
  expect(await runCrawl("dedup-test-orig")).toBe(0);
  expect(await runCrawl("dedup-test-dupe")).toBe(0);

  let response = 0;
  let revisit = 0;

  const parserOrig = loadFirstWARC("dedup-test-orig");

  for await (const record of parserOrig) {
    // skip crawler-generated metadata records (eg. urn:pageinfo:...)
    if (record.warcTargetURI && record.warcTargetURI.startsWith("urn:")) {
      continue;
    }

    if (record.warcType === "response") {
      response++;
    }
  }

  const parserDupe = loadFirstWARC("dedup-test-dupe");

  for await (const record of parserDupe) {
    if (record.warcTargetURI && record.warcTargetURI.startsWith("urn:")) {
      continue;
    }

    if (record.warcType === "revisit") {
      revisit++;
    }
  }

  expect(response).toBeGreaterThan(0);

  // revisits should match number of responses for non urn: records
  expect(response).toBe(revisit);
});
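
Since the Redis container publishes port 6379 as 37379 on the host, the dedup index can also be inspected directly while the containers are running. A minimal sketch using ioredis (which the test already imports); the connection URL follows from the -p 37379:6379 mapping above, and listing keys with a wildcard is only an illustration, not an interface the crawler defines:

import { Redis } from "ioredis";

// connect to the test Redis through the host-mapped port (-p 37379:6379)
const redis = new Redis("redis://127.0.0.1:37379");

// dump whatever keys the crawler's dedup index has created
console.log(await redis.keys("*"));

redis.disconnect();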