mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-12-08 06:09:48 +00:00)

add indexer entrypoint:
- populate dedup index from remote WACZ / multi-WACZ / multi-WACZ JSON

refactor:
- move WACZLoader to wacz to be shared with the indexer
- state: move hash-based dedup to RedisDedupIndex

cli args:
- add --minPageDedupDepth to indicate when duplicate pages may be skipped
- skip same URLs with the same hash within the same crawl

This commit is contained in: commit 2ecf290d38 (parent eb6b87fbaf)
7 changed files with 292 additions and 72 deletions
Dockerfile
@@ -44,11 +44,12 @@ ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rw
 ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
 ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz

-RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js && chmod a+r /app/html/rwp/*
+RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js /app/dist/indexer.js && chmod a+r /app/html/rwp/*

 RUN ln -s /app/dist/main.js /usr/bin/crawl; \
     ln -s /app/dist/main.js /usr/bin/qa; \
-    ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile
+    ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile; \
+    ln -s /app/dist/indexer.js /usr/bin/indexer;

 RUN mkdir -p /app/behaviors
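Usage note (illustrative, not part of the commit): with the extra symlink above, `indexer` is available on the image's PATH alongside `crawl` and `qa`, so the new entrypoint could be invoked roughly as `docker run <image> indexer --sourceUrl <wacz-or-json-url> --dedupStoreUrl <redis-url>`; the flag names come from the yargs options in src/indexer.ts below, while the image name and URLs are placeholders.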
src/indexer.ts (new file, 180 lines)
@@ -0,0 +1,180 @@
#!/usr/bin/env node

import yargs from "yargs";
import { logger } from "./util/logger.js";
import { getInfoString } from "./util/file_reader.js";
import { openAsBlob } from "node:fs";
import { WACZLoader } from "./util/wacz.js";
import { ExitCodes } from "./util/constants.js";
import { initRedisWaitForSuccess } from "./util/redis.js";
import { AsyncIterReader } from "warcio";
import { RedisDedupIndex } from "./util/state.js";

export class CrawlIndexer {
  constructor() {}

  initArgs() {
    return yargs(process.argv)
      .usage("indexer [options]")
      .options({
        dedupStoreUrl: {
          describe: "URL for remote redis instance to index into",
          type: "string",
          required: true,
        },

        sourceUrl: {
          describe: "Source WACZ or Multi WACZ or Multi WACZ JSON to index",
          type: "string",
          required: true,
        },
      })
      .parseSync();
  }

  async run() {
    logger.setDebugLogging(true);

    process.on("SIGINT", () => this.handleTerminate("SIGINT"));

    process.on("SIGTERM", () => this.handleTerminate("SIGTERM"));

    logger.info(await getInfoString());

    const params = this.initArgs();

    const redis = await initRedisWaitForSuccess(params.dedupStoreUrl);
    const dedupIndex = new RedisDedupIndex(redis);

    const allFiles = [];

    for await (const waczfile of this.iterWACZ(params.sourceUrl)) {
      allFiles.push(waczfile);
    }

    let count = 0;
    const total = allFiles.length;

    for (const waczfile of allFiles) {
      count += 1;
      const loader = new WACZLoader(waczfile);
      logger.debug(`Processing WACZ ${count} of ${total}`, { waczfile });
      for await (const file of loader.iterFiles("indexes/")) {
        const filename = file.filename;
        if (filename.endsWith(".cdx.gz")) {
          logger.debug("Processing CDX GZ Index", { filename });
          await this.ingestCDXJ(dedupIndex, loader, filename, "gzip");
        } else if (filename.endsWith(".cdx") || filename.endsWith(".cdxj")) {
          logger.debug("Processing CDX Index", { filename });
          await this.ingestCDXJ(dedupIndex, loader, filename);
        }
      }
    }

    logger.info("Done!");
    process.exit(ExitCodes.Success);
  }

  async ingestCDXJ(
    dedupIndex: RedisDedupIndex,
    loader: WACZLoader,
    filename: string,
    compression?: string,
  ) {
    let reader = await loader.loadFile(filename);

    if (!reader) {
      logger.error("File not found, skipping!");
      return;
    }

    if (compression === "gzip") {
      reader = new AsyncIterReader(reader, "gzip", false);
    }

    let count = 0;

    for await (const line of reader.iterLines()) {
      const inx = line.indexOf(" {");
      if (inx < 0) {
        logger.error("Skipping invalid CDXJ, no JSON", { line });
        continue;
      }

      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      let cdx: Record<string, any>;

      try {
        cdx = JSON.parse(line.slice(inx));
      } catch (e) {
        logger.error("Skipping invalid CDXJ, JSON invalid", { line });
        continue;
      }

      const date = line.split(" ", 2)[1];
      const url = cdx.url;
      const hash = cdx.digest;

      if (url.startsWith("urn:")) {
        continue;
      }

      // only adding originals to dedup against, don't want to dedup against existing revisits
      if (cdx.mime === "warc/revisit") {
        continue;
      }

      if (url && date && hash) {
        await dedupIndex.addHashDupe(hash, url, date);
      } else {
        logger.warn("Skipping invalid CDXJ, data missing", {
          url,
          date,
          digest: hash,
        });
        continue;
      }

      count += 1;
    }

    logger.debug("Processed", { count });
  }

  async *iterWACZ(url: string): AsyncIterable<string> {
    let path: string = url;

    try {
      path = new URL(url).pathname;
    } catch (e) {
      // ignore
    }

    if (path.endsWith(".wacz")) {
      yield url;
    } else if (path.endsWith(".json")) {
      if (!url.startsWith("http://") && !url.startsWith("https://")) {
        const blob = await openAsBlob(url);
        url = URL.createObjectURL(blob);
      }

      const resp = await fetch(url);
      const json = await resp.json();

      for (const entry of json.resources) {
        if (entry.path) {
          yield* this.iterWACZ(entry.path);
        }
      }
    } else {
      logger.warn("Unknown source", { url }, "replay");
    }
  }

  handleTerminate(signame: string) {
    logger.info(`Got signal ${signame}, exiting`);
    process.exit(ExitCodes.SignalInterrupted);
  }
}

await new CrawlIndexer().run();
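Below is a minimal sketch (not from the commit; the sample line and all values are hypothetical) of the CDXJ line shape that ingestCDXJ expects inside a WACZ "indexes/" entry, and the same extraction steps it applies:

// Hypothetical CDXJ line: SURT key, 14-digit timestamp, then a JSON block
const line =
  'com,example)/ 20240101123456 {"url":"https://example.com/","digest":"sha256:abcd","mime":"text/html","status":"200"}';

const inx = line.indexOf(" {"); // locate the start of the JSON block
const cdx = JSON.parse(line.slice(inx)) as Record<string, string>;
const date = line.split(" ", 2)[1]; // "20240101123456"

// These are the three values ingestCDXJ passes to dedupIndex.addHashDupe(hash, url, date)
console.log(cdx.digest, cdx.url, date);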
src/util/argParser.ts
@@ -449,7 +449,13 @@ class ArgParser {
       describe:
         "If set, url for remote redis server to store state. Otherwise, using local redis instance",
       type: "string",
       default: "redis://localhost:6379/0",
     },

+    minPageDedupDepth: {
+      describe:
+        "If set >= 0, minimum depth at which duplicate pages can be skipped. -1 means never skip duplicate pages",
+      type: "number",
+      default: -1,
+    },
+
     saveState: {
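Usage note (illustrative): the default of -1 preserves the existing behavior of never skipping duplicate pages; passing e.g. `--minPageDedupDepth 1` allows pages at depth 1 or deeper to be skipped when their payload hash is already present in the dedup index (see the recorder changes below).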
src/util/constants.ts
@@ -22,6 +22,8 @@ export const DETECT_SITEMAP = "<detect>";

 export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];

+export const HASH_DUPE_KEY = "dupe";
+
 export enum BxFunctionBindings {
   BehaviorLogFunc = "__bx_log",
   AddLinkFunc = "__bx_addLink",
src/util/recorder.ts
@@ -15,12 +15,19 @@ import {
   removeRangeAsQuery,
   rewriteDASH,
   rewriteHLS,
+  tsToDate,
 } from "@webrecorder/wabac";

 import { WARCRecord, multiValueHeader } from "warcio";
 import { TempFileBuffer, WARCSerializer } from "warcio/node";
 import { WARCWriter } from "./warcwriter.js";
-import { LoadState, PageState, RedisCrawlState, WorkerId } from "./state.js";
+import {
+  LoadState,
+  normalizeDedupStatus,
+  PageState,
+  RedisCrawlState,
+  WorkerId,
+} from "./state.js";
 import { CDPSession, Protocol } from "puppeteer-core";
 import { Crawler } from "../crawler.js";
 import { getProxyDispatcher } from "./proxy.js";
@@ -38,7 +45,7 @@ const TAKE_STREAM_BUFF_SIZE = 1024 * 64;

 const ASYNC_FETCH_DUPE_KEY = "s:fetchdupe";

-const WRITE_DUPE_KEY = "dupe";
+const WRITE_DUPE_KEY = "s:writedupe";

 const MIME_EVENT_STREAM = "text/event-stream";

@@ -142,6 +149,7 @@ export class Recorder extends EventEmitter {

   pageSeed?: ScopedSeed;
   pageSeedDepth = 0;
+  minPageDedupDepth = -1;

   frameIdToExecId: Map<string, number> | null;

@@ -165,6 +173,8 @@ export class Recorder extends EventEmitter {

     this.shouldSaveStorage = !!crawler.params.saveStorage;

+    this.minPageDedupDepth = crawler.params.minPageDedupDepth;
+
     this.writer = writer;

     this.fetcherQ = new PQueue({ concurrency: 1 });
@@ -821,11 +831,16 @@ export class Recorder extends EventEmitter {

     const rewritten = await this.rewriteResponse(reqresp, mimeType);

-    if (url === this.pageUrl && reqresp.payload && this.pageSeedDepth >= 1) {
+    if (
+      url === this.pageUrl &&
+      reqresp.payload &&
+      this.minPageDedupDepth >= 0 &&
+      this.pageSeedDepth >= this.minPageDedupDepth
+    ) {
       const hash =
         "sha256:" + createHash("sha256").update(reqresp.payload).digest("hex");
-      const res = await this.crawlState.getHashDupe(WRITE_DUPE_KEY, hash, url);
-      if (res && res.dupe) {
+      const { origUrl } = await this.crawlState.getHashDupe(hash);
+      if (origUrl) {
         const errorReason = "BlockedByResponse";
         await cdp.send("Fetch.failRequest", {
           requestId,
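Design note: rather than recording a duplicate page body, the page request itself is failed with BlockedByResponse when its sha256 payload digest is already in the dedup index, and only for pages at or beyond --minPageDedupDepth; with the default of -1 this path is disabled entirely.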
@@ -1497,7 +1512,11 @@ export class Recorder extends EventEmitter {
     if (
       method === "GET" &&
       url &&
-      !(await this.crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url, status))
+      !(await this.crawlState.addIfNoDupe(
+        ASYNC_FETCH_DUPE_KEY,
+        url,
+        normalizeDedupStatus(status),
+      ))
     ) {
       reqresp.asyncLoading = false;
       return true;
@@ -1608,7 +1627,7 @@ export class Recorder extends EventEmitter {
     // !isRedirectStatus(status) &&
     // !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status))
     // ) {
-    //   logNetwork("Skipping dupe", { url, status, ...this.logDetails });
+    //   logNetwork("Skipping exact URL dupe in this crawl", { url, status, ...this.logDetails });
     //   return false;
     // }

@@ -1625,7 +1644,11 @@ export class Recorder extends EventEmitter {
       !(await this.checkStreamingRecordPayload(reqresp, serializer, false))
     ) {
       serializer.externalBuffer?.purge();
-      await this.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status);
+      await this.crawlState.removeDupe(
+        ASYNC_FETCH_DUPE_KEY,
+        url,
+        normalizeDedupStatus(status),
+      );
       //await this.crawlState.removeDupe(WRITE_DUPE_KEY, url, status);
       return false;
     }
@@ -1659,29 +1682,29 @@ export class Recorder extends EventEmitter {
     }

     const hash = responseRecord.warcPayloadDigest || "";
+
+    if (!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, hash))) {
+      serializer.externalBuffer?.purge();
+      return false;
+    }
+
     const date = responseRecord.warcDate || "";

     const isEmpty = reqresp.readSize === 0;

     if (!isEmpty && url && method === "GET" && !isRedirectStatus(status)) {
-      const { dupe, origUrl, origDate } = await this.crawlState.getHashDupe(
-        WRITE_DUPE_KEY,
-        hash,
-        url,
-      );
+      const { origUrl, origDate } = await this.crawlState.getHashDupe(hash);

-      if (dupe) {
-        // duplicate url at origTs
-        // skip, no need for revisit
-        logNetwork("Skipping dupe", { url, status, ...this.logDetails });
-        return false;
-      } else if (origUrl && origDate) {
+      if (hash && origUrl && origDate) {
+        const date = tsToDate(origDate).toISOString();
+        // always write revisit here
+        // duplicate URLs in same crawl filtered out separately
         serializer.externalBuffer?.purge();
         ({ responseRecord, serializer } = await createRevisitForResponse(
           responseRecord,
           serializer,
           origUrl,
-          origDate,
+          date,
         ));
       } else {
         // no dupe, continue
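A small sketch (assumed behavior; the timestamp is hypothetical) of why tsToDate is now imported from @webrecorder/wabac: the dedup index stores a digits-only timestamp, which is expanded back into an ISO date for the revisit record.

import { tsToDate } from "@webrecorder/wabac";

// A 14-digit timestamp as stored by addHashDupe
const isoDate = tsToDate("20240101123456").toISOString();
// roughly "2024-01-01T12:34:56.000Z"
console.log(isoDate);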
@@ -1717,7 +1740,7 @@ export class Recorder extends EventEmitter {
     this.addPageRecord(reqresp);

     if (!isEmpty) {
-      await this.crawlState.addHashDupe(WRITE_DUPE_KEY, hash, url, date);
+      await this.crawlState.addHashDupe(hash, url, date);
     }

     return true;
@@ -2095,6 +2118,7 @@ async function createRevisitForResponse(

   const warcHeaders: Record<string, string> = {
     "WARC-Page-ID": responseRecord.warcHeaders.headers.get("WARC-Page-ID")!,
+    "WARC-Payload-Digest": origPayloadDigest!,
   };

   const revisitRecord = WARCRecord.create({
@@ -2115,13 +2139,6 @@ async function createRevisitForResponse(

   await serializer.digestRecord();

-  if (origPayloadDigest) {
-    revisitRecord.warcHeaders.headers.set(
-      "WARC-Payload-Digest",
-      origPayloadDigest,
-    );
-  }
-
   return { serializer, responseRecord: revisitRecord };
 }

src/util/state.ts
@@ -7,6 +7,7 @@ import {
   MAX_DEPTH,
   DEFAULT_MAX_RETRIES,
   ROBOTS_CACHE_LIMIT,
+  HASH_DUPE_KEY,
 } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";
@@ -30,11 +31,11 @@ export enum QueueState {

 // ============================================================================
 // treat 0 or 206 as 200 for purposes of dedup
-function normalizeDedupStatus(status: number): number {
+export function normalizeDedupStatus(status: number): string {
   if (status === 0 || status === 206) {
-    return 200;
+    return "200";
   }
-  return status;
+  return status + "";
 }

 // ============================================================================
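For illustration (imported here with a relative path, as recorder.ts does above), the now-exported helper returns strings so results can be concatenated directly into the "<status>|<url>" set members used by addIfNoDupe and removeDupe:

import { normalizeDedupStatus } from "./state.js";

normalizeDedupStatus(206); // "200" — partial content treated as 200 for dedup
normalizeDedupStatus(0);   // "200" — status 0 treated the same way
normalizeDedupStatus(404); // "404"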
@@ -189,10 +190,41 @@ export type SaveState = {
 };

 // ============================================================================
-export class RedisCrawlState {
+export class RedisDedupIndex {
+  dedupRedis: Redis;
+
+  constructor(dedupRedis: Redis) {
+    this.dedupRedis = dedupRedis;
+  }
+
+  async getHashDupe(
+    hash: string,
+    key = HASH_DUPE_KEY,
+    //url: string,
+  ): Promise<{ origDate?: string; origUrl?: string }> {
+    const value = await this.dedupRedis.hget(key, hash);
+    if (!value) {
+      return {};
+    }
+    const val = value.split("|");
+    return { origUrl: val[1], origDate: val[0] };
+  }
+
+  async addHashDupe(
+    hash: string,
+    url: string,
+    date: string,
+    key = HASH_DUPE_KEY,
+  ) {
+    const val = date.replace(/[^\d]/g, "") + "|" + url;
+    await this.dedupRedis.hsetnx(key, hash, val);
+  }
+}
+
+// ============================================================================
+export class RedisCrawlState extends RedisDedupIndex {
   redis: Redis;
   maxRetries: number;
-  dedupRedis: Redis;

   uid: string;
   key: string;
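A minimal sketch (not part of the diff; the digest, URL and date are hypothetical) of the value layout RedisDedupIndex uses, assuming a plain ioredis connection: the HASH_DUPE_KEY ("dupe") hash maps a payload digest to "<digits-only date>|<url>", written with HSETNX so the first occurrence wins.

import Redis from "ioredis";

const redis = new Redis("redis://localhost:6379/0"); // placeholder URL

// what addHashDupe("sha256:abcd", "https://example.com/", "2024-01-01T12:34:56Z") stores
await redis.hsetnx("dupe", "sha256:abcd", "20240101123456|https://example.com/");

// what getHashDupe("sha256:abcd") reads back
const value = await redis.hget("dupe", "sha256:abcd");
if (value) {
  const [origDate, origUrl] = value.split("|");
  console.log({ origUrl, origDate }); // { origUrl: "https://example.com/", origDate: "20240101123456" }
}

await redis.quit();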
@@ -224,8 +256,8 @@ export class RedisCrawlState {
     maxRetries?: number,
     dedupRedis?: Redis,
   ) {
+    super(dedupRedis || redis);
     this.redis = redis;
-    this.dedupRedis = dedupRedis || redis;

     this.uid = uid;
     this.key = key;
@@ -1012,43 +1044,12 @@ return inx;
     return await this.redis.zcard(this.qkey);
   }

-  async addIfNoDupe(key: string, url: string, status: number) {
-    return (
-      (await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) ===
-      1
-    );
+  async addIfNoDupe(key: string, url: string, other_id: string) {
+    return (await this.redis.sadd(key, other_id + "|" + url)) === 1;
   }

-  async removeDupe(key: string, url: string, status: number) {
-    return await this.redis.srem(key, normalizeDedupStatus(status) + "|" + url);
+  async removeDupe(key: string, url: string, other_id: string) {
+    return await this.redis.srem(key, other_id + "|" + url);
   }

-  async getHashDupe(
-    key: string,
-    hash: string,
-    url: string,
-  ): Promise<{ dupe?: boolean; origDate?: string; origUrl?: string }> {
-    const value = await this.dedupRedis.hget(key, hash);
-    if (!value) {
-      return {};
-    }
-    const val = value.split("|");
-    // if matches the first entry, return
-    if (val[1] === url) {
-      return { dupe: true };
-    }
-    // otherwise, check if a revisit entry
-    if (await this.dedupRedis.sismember(`${key}:${hash}`, url)) {
-      return { dupe: true };
-    }
-    return { origUrl: val[1], origDate: val[0] };
-  }
-
-  async addHashDupe(key: string, hash: string, url: string, date: string) {
-    const val = date + "|" + url;
-    if (!(await this.dedupRedis.hsetnx(key, hash, val))) {
-      await this.dedupRedis.sadd(`${key}:${hash}`, url);
-    }
-  }
-
   async isInUserSet(value: string) {
src/util/wacz.ts
@@ -464,4 +464,17 @@ export class WACZLoader {

     return reader;
   }
+
+  async *iterFiles(prefix: string) {
+    if (!this.zipreader) {
+      await this.init();
+    }
+    const entries = await this.zipreader!.load();
+
+    for (const [key, value] of Object.entries(entries)) {
+      if (key.startsWith(prefix)) {
+        yield value;
+      }
+    }
+  }
 }