Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-12-08 06:09:48 +00:00
dedup indexing: strip hash prefix from digest, as cdx does not have it
tests: add index import + dedup crawl to ensure digests match fully
This commit is contained in:
parent db4393c2a1
commit ca02f09b5d

2 changed files with 42 additions and 4 deletions
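
For context on the source change: WARC records carry payload digests with an algorithm prefix (for example "sha1:<base32>"), while CDX/CDXJ indexes store only the bare digest value, so entries imported from an index would never match freshly computed WARC digests unless the prefix is dropped on both lookup and insert. A minimal TypeScript sketch of that normalization (the helper name is illustrative, not part of this commit):

// Illustrative only: make WARC-style and CDX-style digests comparable by
// keeping the part after the last ":", the same split(":").at(-1) step
// that the hunks below add to RedisDedupIndex.
function normalizeDigest(digest: string): string {
  // "sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ" -> "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"
  // "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"      -> unchanged (already bare)
  return digest.split(":").at(-1)!;
}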
@@ -202,6 +202,7 @@ export class RedisDedupIndex {
     key = HASH_DUPE_KEY,
     //url: string,
   ): Promise<{ origDate?: string; origUrl?: string }> {
+    hash = hash.split(":").at(-1)!;
     const value = await this.dedupRedis.hget(key, hash);
     if (!value) {
       return {};
@@ -217,6 +218,7 @@ export class RedisDedupIndex {
     key = HASH_DUPE_KEY,
   ) {
     const val = date.replace(/[^\d]/g, "") + "|" + url;
+    hash = hash.split(":").at(-1)!;
     await this.dedupRedis.hsetnx(key, hash, val);
   }
 }
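
The second hunk above also shows the value format stored for each digest: a digits-only timestamp and the original URL joined with "|", written with HSETNX so the first capture of a payload wins. A rough sketch of reading one entry back, assuming ioredis, the 127.0.0.1:37379 port mapping used by the tests, and that HASH_DUPE_KEY is the "dupe" hash the tests check; the digest shown is just the well-known SHA-1 of an empty payload:

import Redis from "ioredis";

// Sketch only: look up one dedup entry. The hash field is the bare digest
// (prefix already stripped); the value is "<YYYYMMDDHHMMSS>|<url>".
const redis = new Redis("redis://127.0.0.1:37379/0");
const value = await redis.hget("dupe", "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ");
if (value) {
  const sep = value.indexOf("|");
  const origDate = value.slice(0, sep);  // e.g. "20251208060948"
  const origUrl = value.slice(sep + 1);
  console.log(origDate, origUrl);
}
await redis.quit();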
@@ -1,7 +1,7 @@
 import {exec, execSync} from "child_process";
 import fs from "fs";
 import path from "path";
-import { Redis } from "ioredis";
+import Redis from "ioredis";
 import { WARCParser } from "warcio";
 
 function sleep(ms) {
@@ -10,7 +10,7 @@ function sleep(ms) {
 
 
 let redisId;
-//let crawler1, crawler2;
+let numResponses = 0;
 
 beforeAll(() => {
   execSync("docker network create dedup");
@@ -28,10 +28,10 @@ afterAll(async () => {
   execSync("docker network rm dedup");
 });
 
-function runCrawl(name) {
+function runCrawl(name, db="0") {
   fs.rmSync(`./test-crawls/collections/${name}`, { recursive: true, force: true });
 
-  const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 4 --exclude community --collection ${name} --redisDedupUrl redis://dedup-redis:6379`);
+  const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 4 --exclude community --collection ${name} --redisDedupUrl redis://dedup-redis:6379/${db} --generateWACZ`);
 
   return new Promise((resolve) => {
     crawler.on("exit", (code) => {
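
The runCrawl changes above add two things the new tests rely on: a db argument so each crawl can dedupe against a specific Redis database, and --generateWACZ so a crawl leaves behind a WACZ that the indexer can later import. A sketch of the flow this enables (collection names are the ones appearing in this diff; runCrawl resolves with the container exit code):

// Sketch, not the literal test bodies:
await runCrawl("dedup-test-orig");       // presumably run earlier in the file: dedupes against redis db 0 and writes dedup-test-orig.wacz
await runCrawl("dedup-test-dupe-2", 1);  // new test below: dedupes against db 1, populated only by the indexer import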
@@ -92,6 +92,42 @@ test("check revisit records written on duplicate crawl", async () => {
 
   // revisits should match number of responses for non urn:
   expect(response).toBe(revisit);
+
+  numResponses = response;
+});
+
+
+test("import index and crawl dupe", async () => {
+
+  execSync(`docker run --rm -v $PWD/test-crawls:/crawls --network=dedup webrecorder/browsertrix-crawler indexer --sourceUrl /crawls/collections/dedup-test-orig/dedup-test-orig.wacz --redisDedupUrl redis://dedup-redis:6379/1`);
+
+  const redis = new Redis("redis://127.0.0.1:37379/1", { lazyConnect: true, retryStrategy: () => null });
+
+  await redis.connect({maxRetriesPerRequest: 50});
+
+  expect(await redis.hlen("dupe")).toBe(numResponses);
+});
+
+
+test("imported crawl dupe matches previous dupe count", async () => {
+  expect(await runCrawl("dedup-test-dupe-2", 1)).toBe(0);
+
+  const dupeOrig = loadFirstWARC("dedup-test-dupe-2");
+
+  let revisit = 0;
+
+  for await (const record of dupeOrig) {
+    if (record.warcTargetURI && record.warcTargetURI.startsWith("urn:")) {
+      continue;
+    }
+
+    if (record.warcType === "revisit") {
+      revisit++;
+    }
+  }
+
+  // matches same number of revisits as original
+  expect(revisit).toBe(numResponses);
 });
 
 