browsertrix-crawler/tests/dedupe-basic.test.ts

338 lines
8.9 KiB
TypeScript

import { exec, execSync } from "child_process";
import fs from "fs";
import path from "path";
import Redis from "ioredis";
import { WARCParser } from "warcio";
import { sleep } from "./utils";
let redisId: NonSharedBuffer;
let numResponses = 0;
let sizeSaved = 0;
beforeAll(() => {
execSync("docker network create dedupe");
redisId = execSync(
"docker run --rm --network=dedupe -p 37379:6379 --name dedupe-redis -d redis",
);
});
afterAll(async () => {
execSync(`docker kill ${redisId}`);
await sleep(3000);
execSync("docker network rm dedupe");
});
function runCrawl(name: string, { db = 0, limit = 4, wacz = true } = {}) {
fs.rmSync(`./test-crawls/collections/${name}`, {
recursive: true,
force: true,
});
const crawler = exec(
`docker run -v $PWD/test-crawls:/crawls --network=dedupe -e CRAWL_ID=${name} webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit ${limit} --exclude community --collection ${name} --redisDedupeUrl redis://dedupe-redis:6379/${db} ${
wacz ? "--generateWACZ" : ""
}`,
);
return new Promise((resolve) => {
crawler.on("exit", (code) => {
resolve(code);
});
});
}
function loadFirstWARC(name: string) {
const archiveWarcLists = fs.readdirSync(
`test-crawls/collections/${name}/archive`,
);
const warcName = path.join(
`test-crawls/collections/${name}/archive`,
archiveWarcLists[0],
);
const nodeStream = fs.createReadStream(warcName);
const parser = new WARCParser(nodeStream);
return parser;
}
function deleteFirstWARC(name: string) {
const archiveWarcLists = fs.readdirSync(
`test-crawls/collections/${name}/archive`,
);
const warcName = path.join(
`test-crawls/collections/${name}/archive`,
archiveWarcLists[0],
);
fs.unlinkSync(warcName);
}
function loadDataPackageRelated(name: string) {
execSync(
`unzip test-crawls/collections/${name}/${name}.wacz -d test-crawls/collections/${name}/wacz`,
);
const data = fs.readFileSync(
`test-crawls/collections/${name}/wacz/datapackage.json`,
"utf8",
);
const dataPackageJSON = JSON.parse(data);
return dataPackageJSON.relation;
}
async function redisGetHash(key: string, db = 0) {
const redis = new Redis(`redis://127.0.0.1:37379/${db}`, {
lazyConnect: true,
retryStrategy: () => null,
});
redis.options.maxRetriesPerRequest = 50;
await redis.connect();
return await redis.hgetall(key);
}
async function checkSizeStats(
numUniq: number,
key: string,
db: number,
minSize: number,
) {
const result = await redisGetHash(key, db);
console.log(result);
expect(numUniq).toBeLessThan(Number(result.totalUrls));
const conservedSize = Number(result.conservedSize);
expect(conservedSize).toBeGreaterThan(minSize);
return conservedSize;
}
test("check revisit records written on duplicate crawl, same collection, no wacz", async () => {
const collName = "dedupe-test-same-coll";
expect(await runCrawl(collName, { limit: 1, wacz: false })).toBe(0);
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const statusCode = -1;
let response = 0;
let revisit = 0;
const parserOrig = loadFirstWARC(collName);
for await (const record of parserOrig) {
if (record.warcTargetURI && record.warcTargetURI.startsWith("urn:")) {
continue;
}
if (record.warcType === "response") {
response++;
}
}
deleteFirstWARC(collName);
expect(await runCrawl(collName, { limit: 1, wacz: false })).toBe(0);
const dupeOrig = loadFirstWARC(collName);
for await (const record of dupeOrig) {
if (record.warcTargetURI && record.warcTargetURI.startsWith("urn:")) {
continue;
}
if (record.warcType === "revisit") {
revisit++;
}
}
expect(response).toBeGreaterThan(0);
// revisits should match number of responses for non urn:
expect(response).toBe(revisit);
numResponses = response;
await checkSizeStats(numResponses, "allcounts", 0, 77000);
});
test("dedupe same collection, with wacz, no external waczs referenced", async () => {
const collName = "dedupe-test-same-coll";
expect(await runCrawl(collName, { limit: 1, wacz: true })).toBe(0);
const related = loadDataPackageRelated(collName);
expect(related).toBe(undefined);
});
test("check revisit records written on duplicate crawl, different collections, with wacz", async () => {
expect(await runCrawl("dedupe-test-orig", { db: 1 })).toBe(0);
expect(await runCrawl("dedupe-test-dupe", { db: 1 })).toBe(0);
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const statusCode = -1;
let response = 0;
let revisit = 0;
const parserOrig = loadFirstWARC("dedupe-test-orig");
for await (const record of parserOrig) {
if (record.warcTargetURI && record.warcTargetURI.startsWith("urn:")) {
continue;
}
if (record.warcType === "response") {
response++;
}
}
const dupeOrig = loadFirstWARC("dedupe-test-dupe");
for await (const record of dupeOrig) {
if (record.warcTargetURI && record.warcTargetURI.startsWith("urn:")) {
continue;
}
if (record.warcType === "revisit") {
revisit++;
expect(record.warcHeader("WARC-Refers-To-Container")).toBe(
"file://dedupe-test-orig.wacz",
);
}
}
expect(response).toBeGreaterThan(0);
// revisits should match number of responses for non urn:
expect(response).toBe(revisit);
numResponses = response;
sizeSaved = await checkSizeStats(numResponses, "allcounts", 1, 48400000);
});
test("import dupe index, orig then revisits, from single wacz", async () => {
execSync(
`docker run -v $PWD/test-crawls:/crawls --network=dedupe webrecorder/browsertrix-crawler indexer --sourceUrl /crawls/collections/dedupe-test-orig/dedupe-test-orig.wacz --sourceCrawlId dedupe-test-orig --redisDedupeUrl redis://dedupe-redis:6379/2`,
);
const redis = new Redis("redis://127.0.0.1:37379/2", {
lazyConnect: true,
retryStrategy: () => null,
});
redis.options.maxRetriesPerRequest = 50;
await redis.connect();
});
test("verify new crawl against imported dupe index has same dupes as dedupe against original", async () => {
expect(await runCrawl("dedupe-test-dupe-2", { db: 2 })).toBe(0);
const dupeOrig = loadFirstWARC("dedupe-test-dupe-2");
let revisit = 0;
for await (const record of dupeOrig) {
if (record.warcTargetURI && record.warcTargetURI.startsWith("urn:")) {
continue;
}
if (record.warcType === "revisit") {
revisit++;
expect(record.warcHeader("WARC-Refers-To-Container")).toBe(
"file://dedupe-test-orig.wacz",
);
}
}
// matches same number of revisits as original
expect(revisit).toBe(numResponses);
await checkSizeStats(numResponses, "allcounts", 2, 48400000);
});
test("import dupe index from json, reverse, revisits than orig, from wacz", async () => {
const importJson = {
resources: [
{
name: "dedupe-test-dupe",
path: "/crawls/collections/dedupe-test-dupe/dedupe-test-dupe.wacz",
crawlId: "dedupe-test-dupe",
},
{
name: "invalid-file",
path: "/crawls/invalid-file",
crawlId: "dedupe-test-dupe",
},
{
name: "dedupe-test-orig",
path: "/crawls/collections/dedupe-test-orig/dedupe-test-orig.wacz",
crawlId: "dedupe-test-orig",
},
],
};
fs.writeFileSync(
"./test-crawls/collections/dedupe-test-dupe/import-1.json",
JSON.stringify(importJson),
"utf-8",
);
execSync(
`docker run -v $PWD/test-crawls:/crawls --network=dedupe webrecorder/browsertrix-crawler indexer --sourceUrl /crawls/collections/dedupe-test-dupe/import-1.json --redisDedupeUrl redis://dedupe-redis:6379/3`,
);
const redis = new Redis("redis://127.0.0.1:37379/3", {
lazyConnect: true,
retryStrategy: () => null,
});
redis.options.maxRetriesPerRequest = 50;
await redis.connect();
expect(await redis.hlen("alldupes")).toBe(numResponses);
const sizeSavedImport = await checkSizeStats(
numResponses,
"allcounts",
3,
48400000,
);
expect(sizeSavedImport).toBe(sizeSaved);
});
test("test requires in datapackage.json of wacz deduped against previous crawl", () => {
const res1 = loadDataPackageRelated("dedupe-test-dupe");
expect(res1.requires.length).toBe(1);
const entry = res1.requires[0];
expect(entry.crawlId).toBe("dedupe-test-orig");
expect(entry.filename).toBe("dedupe-test-orig.wacz");
expect(entry.size).toBeDefined();
expect(entry.hash).toBeDefined();
});
test("test requires in datapackage.json of wacz deduped against import from wacz", () => {
const res2 = loadDataPackageRelated("dedupe-test-dupe-2");
expect(res2.requires.length).toBe(1);
const entry2 = res2.requires[0];
expect(entry2.crawlId).toBe("dedupe-test-orig");
expect(entry2.filename).toBe("dedupe-test-orig.wacz");
// undefined as importing from single WACZ and not computing
expect(entry2.size).toBeUndefined();
expect(entry2.hash).toBeUndefined();
});