mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-12-08 06:09:48 +00:00)

add indexer entrypoint:
- populate dedup index from remote WACZ / multi-WACZ / multi-WACZ JSON

refactor:
- move WACZLoader to wacz to be shared with the indexer
- state: move hash-based dedup to RedisDedupIndex

cli args:
- add --minPageDedupDepth to indicate when duplicate pages may be skipped
- skip same URLs with the same hash within the same crawl

This commit is contained in: commit 2ecf290d38 (parent eb6b87fbaf)
7 changed files with 292 additions and 72 deletions
Dockerfile
@@ -44,11 +44,12 @@ ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rw
 ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
 ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz

-RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js && chmod a+r /app/html/rwp/*
+RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js /app/dist/indexer.js && chmod a+r /app/html/rwp/*

 RUN ln -s /app/dist/main.js /usr/bin/crawl; \
     ln -s /app/dist/main.js /usr/bin/qa; \
-    ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile
+    ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile; \
+    ln -s /app/dist/indexer.js /usr/bin/indexer;

 RUN mkdir -p /app/behaviors
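Usage note (illustrative, not part of the commit): with the extra symlink above, `indexer` is available on the image's PATH alongside `crawl` and `qa`, so the new entrypoint could be invoked roughly as `docker run <image> indexer --sourceUrl <wacz-or-json-url> --dedupStoreUrl <redis-url>`; the flag names come from the yargs options in src/indexer.ts below, while the image name and URLs are placeholders.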
src/indexer.ts (new file, 180 lines)
@@ -0,0 +1,180 @@
#!/usr/bin/env node

import yargs from "yargs";
import { logger } from "./util/logger.js";
import { getInfoString } from "./util/file_reader.js";
import { openAsBlob } from "node:fs";
import { WACZLoader } from "./util/wacz.js";
import { ExitCodes } from "./util/constants.js";
import { initRedisWaitForSuccess } from "./util/redis.js";
import { AsyncIterReader } from "warcio";
import { RedisDedupIndex } from "./util/state.js";

export class CrawlIndexer {
  constructor() {}

  initArgs() {
    return yargs(process.argv)
      .usage("indexer [options]")
      .options({
        dedupStoreUrl: {
          describe: "URL for remote redis instance to index into",
          type: "string",
          required: true,
        },

        sourceUrl: {
          describe: "Source WACZ or Multi WACZ or Multi WACZ JSON to index",
          type: "string",
          required: true,
        },
      })
      .parseSync();
  }

  async run() {
    logger.setDebugLogging(true);

    process.on("SIGINT", () => this.handleTerminate("SIGINT"));

    process.on("SIGTERM", () => this.handleTerminate("SIGTERM"));

    logger.info(await getInfoString());

    const params = this.initArgs();

    const redis = await initRedisWaitForSuccess(params.dedupStoreUrl);
    const dedupIndex = new RedisDedupIndex(redis);

    const allFiles = [];

    for await (const waczfile of this.iterWACZ(params.sourceUrl)) {
      allFiles.push(waczfile);
    }

    let count = 0;
    const total = allFiles.length;

    for (const waczfile of allFiles) {
      count += 1;
      const loader = new WACZLoader(waczfile);
      logger.debug(`Processing WACZ ${count} of ${total}`, { waczfile });
      for await (const file of loader.iterFiles("indexes/")) {
        const filename = file.filename;
        if (filename.endsWith(".cdx.gz")) {
          logger.debug("Processing CDX GZ Index", { filename });
          await this.ingestCDXJ(dedupIndex, loader, filename, "gzip");
        } else if (filename.endsWith(".cdx") || filename.endsWith(".cdxj")) {
          logger.debug("Processing CDX Index", { filename });
          await this.ingestCDXJ(dedupIndex, loader, filename);
        }
      }
    }

    logger.info("Done!");
    process.exit(ExitCodes.Success);
  }

  async ingestCDXJ(
    dedupIndex: RedisDedupIndex,
    loader: WACZLoader,
    filename: string,
    compression?: string,
  ) {
    let reader = await loader.loadFile(filename);

    if (!reader) {
      logger.error("File not found, skipping!");
      return;
    }

    if (compression === "gzip") {
      reader = new AsyncIterReader(reader, "gzip", false);
    }

    let count = 0;

    for await (const line of reader.iterLines()) {
      const inx = line.indexOf(" {");
      if (inx < 0) {
        logger.error("Skipping invalid CDXJ, no JSON", { line });
        continue;
      }

      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      let cdx: Record<string, any>;

      try {
        cdx = JSON.parse(line.slice(inx));
      } catch (e) {
        logger.error("Skipping invalid CDXJ, JSON invalid", { line });
        continue;
      }

      const date = line.split(" ", 2)[1];
      const url = cdx.url;
      const hash = cdx.digest;

      if (url.startsWith("urn:")) {
        continue;
      }

      // only adding originals to dedup against, don't want to dedup against existing revisits
      if (cdx.mime === "warc/revisit") {
        continue;
      }

      if (url && date && hash) {
        await dedupIndex.addHashDupe(hash, url, date);
      } else {
        logger.warn("Skipping invalid CDXJ, data missing", {
          url,
          date,
          digest: hash,
        });
        continue;
      }

      count += 1;
    }

    logger.debug("Processed", { count });
  }

  async *iterWACZ(url: string): AsyncIterable<string> {
    let path: string = url;

    try {
      path = new URL(url).pathname;
    } catch (e) {
      // ignore
    }

    if (path.endsWith(".wacz")) {
      yield url;
    } else if (path.endsWith(".json")) {
      if (!url.startsWith("http://") && !url.startsWith("https://")) {
        const blob = await openAsBlob(url);
        url = URL.createObjectURL(blob);
      }

      const resp = await fetch(url);
      const json = await resp.json();

      for (const entry of json.resources) {
        if (entry.path) {
          yield* this.iterWACZ(entry.path);
        }
      }
    } else {
      logger.warn("Unknown source", { url }, "replay");
    }
  }

  handleTerminate(signame: string) {
    logger.info(`Got signal ${signame}, exiting`);
    process.exit(ExitCodes.SignalInterrupted);
  }
}

await new CrawlIndexer().run();
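Below is a minimal sketch (not from the commit; the sample line and all values are hypothetical) of the CDXJ line shape that ingestCDXJ expects inside a WACZ "indexes/" entry, and the same extraction steps it applies:

// Hypothetical CDXJ line: SURT key, 14-digit timestamp, then a JSON block
const line =
  'com,example)/ 20240101123456 {"url":"https://example.com/","digest":"sha256:abcd","mime":"text/html","status":"200"}';

const inx = line.indexOf(" {"); // locate the start of the JSON block
const cdx = JSON.parse(line.slice(inx)) as Record<string, string>;
const date = line.split(" ", 2)[1]; // "20240101123456"

// These are the three values ingestCDXJ passes to dedupIndex.addHashDupe(hash, url, date)
console.log(cdx.digest, cdx.url, date);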
src/util/argParser.ts
@@ -449,7 +449,13 @@ class ArgParser {
       describe:
         "If set, url for remote redis server to store state. Otherwise, using local redis instance",
       type: "string",
       default: "redis://localhost:6379/0",
     },

+    minPageDedupDepth: {
+      describe:
+        "If set >= 0, minimum depth at which duplicate pages can be skipped. -1 means never skip duplicate pages",
+      type: "number",
+      default: -1,
+    },
+
     saveState: {
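Usage note (illustrative): the default of -1 preserves the existing behavior of never skipping duplicate pages; passing e.g. `--minPageDedupDepth 1` allows pages at depth 1 or deeper to be skipped when their payload hash is already present in the dedup index (see the recorder changes below).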
src/util/constants.ts
@@ -22,6 +22,8 @@ export const DETECT_SITEMAP = "<detect>";

 export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];

+export const HASH_DUPE_KEY = "dupe";
+
 export enum BxFunctionBindings {
   BehaviorLogFunc = "__bx_log",
   AddLinkFunc = "__bx_addLink",
src/util/recorder.ts
@@ -15,12 +15,19 @@ import {
   removeRangeAsQuery,
   rewriteDASH,
   rewriteHLS,
+  tsToDate,
 } from "@webrecorder/wabac";

 import { WARCRecord, multiValueHeader } from "warcio";
 import { TempFileBuffer, WARCSerializer } from "warcio/node";
 import { WARCWriter } from "./warcwriter.js";
-import { LoadState, PageState, RedisCrawlState, WorkerId } from "./state.js";
+import {
+  LoadState,
+  normalizeDedupStatus,
+  PageState,
+  RedisCrawlState,
+  WorkerId,
+} from "./state.js";
 import { CDPSession, Protocol } from "puppeteer-core";
 import { Crawler } from "../crawler.js";
 import { getProxyDispatcher } from "./proxy.js";
@@ -38,7 +45,7 @@ const TAKE_STREAM_BUFF_SIZE = 1024 * 64;

 const ASYNC_FETCH_DUPE_KEY = "s:fetchdupe";

-const WRITE_DUPE_KEY = "dupe";
+const WRITE_DUPE_KEY = "s:writedupe";

 const MIME_EVENT_STREAM = "text/event-stream";

@@ -142,6 +149,7 @@ export class Recorder extends EventEmitter {

   pageSeed?: ScopedSeed;
   pageSeedDepth = 0;
+  minPageDedupDepth = -1;

   frameIdToExecId: Map<string, number> | null;

@@ -165,6 +173,8 @@ export class Recorder extends EventEmitter {

     this.shouldSaveStorage = !!crawler.params.saveStorage;

+    this.minPageDedupDepth = crawler.params.minPageDedupDepth;
+
     this.writer = writer;

     this.fetcherQ = new PQueue({ concurrency: 1 });
@@ -821,11 +831,16 @@ export class Recorder extends EventEmitter {

     const rewritten = await this.rewriteResponse(reqresp, mimeType);

-    if (url === this.pageUrl && reqresp.payload && this.pageSeedDepth >= 1) {
+    if (
+      url === this.pageUrl &&
+      reqresp.payload &&
+      this.minPageDedupDepth >= 0 &&
+      this.pageSeedDepth >= this.minPageDedupDepth
+    ) {
       const hash =
         "sha256:" + createHash("sha256").update(reqresp.payload).digest("hex");
-      const res = await this.crawlState.getHashDupe(WRITE_DUPE_KEY, hash, url);
-      if (res && res.dupe) {
+      const { origUrl } = await this.crawlState.getHashDupe(hash);
+      if (origUrl) {
         const errorReason = "BlockedByResponse";
         await cdp.send("Fetch.failRequest", {
           requestId,
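Design note: rather than recording a duplicate page body, the page request itself is failed with BlockedByResponse when its sha256 payload digest is already in the dedup index, and only for pages at or beyond --minPageDedupDepth; with the default of -1 this path is disabled entirely.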
@@ -1497,7 +1512,11 @@ export class Recorder extends EventEmitter {
     if (
       method === "GET" &&
       url &&
-      !(await this.crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url, status))
+      !(await this.crawlState.addIfNoDupe(
+        ASYNC_FETCH_DUPE_KEY,
+        url,
+        normalizeDedupStatus(status),
+      ))
     ) {
       reqresp.asyncLoading = false;
       return true;
@@ -1608,7 +1627,7 @@ export class Recorder extends EventEmitter {
     // !isRedirectStatus(status) &&
     // !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status))
     // ) {
-    //   logNetwork("Skipping dupe", { url, status, ...this.logDetails });
+    //   logNetwork("Skipping exact URL dupe in this crawl", { url, status, ...this.logDetails });
     //   return false;
     // }

@@ -1625,7 +1644,11 @@ export class Recorder extends EventEmitter {
       !(await this.checkStreamingRecordPayload(reqresp, serializer, false))
     ) {
       serializer.externalBuffer?.purge();
-      await this.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status);
+      await this.crawlState.removeDupe(
+        ASYNC_FETCH_DUPE_KEY,
+        url,
+        normalizeDedupStatus(status),
+      );
       //await this.crawlState.removeDupe(WRITE_DUPE_KEY, url, status);
       return false;
     }
@@ -1659,29 +1682,29 @@ export class Recorder extends EventEmitter {
     }

     const hash = responseRecord.warcPayloadDigest || "";
+
+    if (!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, hash))) {
+      serializer.externalBuffer?.purge();
+      return false;
+    }
+
     const date = responseRecord.warcDate || "";

     const isEmpty = reqresp.readSize === 0;

     if (!isEmpty && url && method === "GET" && !isRedirectStatus(status)) {
-      const { dupe, origUrl, origDate } = await this.crawlState.getHashDupe(
-        WRITE_DUPE_KEY,
-        hash,
-        url,
-      );
+      const { origUrl, origDate } = await this.crawlState.getHashDupe(hash);

-      if (dupe) {
-        // duplicate url at origTs
-        // skip, no need for revisit
-        logNetwork("Skipping dupe", { url, status, ...this.logDetails });
-        return false;
-      } else if (origUrl && origDate) {
+      if (hash && origUrl && origDate) {
+        const date = tsToDate(origDate).toISOString();
+        // always write revisit here
+        // duplicate URLs in same crawl filtered out separately
         serializer.externalBuffer?.purge();
         ({ responseRecord, serializer } = await createRevisitForResponse(
           responseRecord,
           serializer,
           origUrl,
-          origDate,
+          date,
         ));
       } else {
         // no dupe, continue
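A small sketch (assumed behavior; the timestamp is hypothetical) of why tsToDate is now imported from @webrecorder/wabac: the dedup index stores a digits-only timestamp, which is expanded back into an ISO date for the revisit record.

import { tsToDate } from "@webrecorder/wabac";

// A 14-digit timestamp as stored by addHashDupe
const isoDate = tsToDate("20240101123456").toISOString();
// roughly "2024-01-01T12:34:56.000Z"
console.log(isoDate);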
@@ -1717,7 +1740,7 @@ export class Recorder extends EventEmitter {
     this.addPageRecord(reqresp);

     if (!isEmpty) {
-      await this.crawlState.addHashDupe(WRITE_DUPE_KEY, hash, url, date);
+      await this.crawlState.addHashDupe(hash, url, date);
     }

     return true;
@@ -2095,6 +2118,7 @@ async function createRevisitForResponse(

   const warcHeaders: Record<string, string> = {
     "WARC-Page-ID": responseRecord.warcHeaders.headers.get("WARC-Page-ID")!,
+    "WARC-Payload-Digest": origPayloadDigest!,
   };

   const revisitRecord = WARCRecord.create({
@@ -2115,13 +2139,6 @@ async function createRevisitForResponse(

   await serializer.digestRecord();

-  if (origPayloadDigest) {
-    revisitRecord.warcHeaders.headers.set(
-      "WARC-Payload-Digest",
-      origPayloadDigest,
-    );
-  }
-
   return { serializer, responseRecord: revisitRecord };
 }

src/util/state.ts
@@ -7,6 +7,7 @@ import {
   MAX_DEPTH,
   DEFAULT_MAX_RETRIES,
   ROBOTS_CACHE_LIMIT,
+  HASH_DUPE_KEY,
 } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";
@@ -30,11 +31,11 @@ export enum QueueState {

 // ============================================================================
 // treat 0 or 206 as 200 for purposes of dedup
-function normalizeDedupStatus(status: number): number {
+export function normalizeDedupStatus(status: number): string {
   if (status === 0 || status === 206) {
-    return 200;
+    return "200";
   }
-  return status;
+  return status + "";
 }

 // ============================================================================
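For illustration (imported here with a relative path, as recorder.ts does above), the now-exported helper returns strings so results can be concatenated directly into the "<status>|<url>" set members used by addIfNoDupe and removeDupe:

import { normalizeDedupStatus } from "./state.js";

normalizeDedupStatus(206); // "200" — partial content treated as 200 for dedup
normalizeDedupStatus(0);   // "200" — status 0 treated the same way
normalizeDedupStatus(404); // "404"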
@@ -189,10 +190,41 @@ export type SaveState = {
 };

 // ============================================================================
-export class RedisCrawlState {
+export class RedisDedupIndex {
+  dedupRedis: Redis;
+
+  constructor(dedupRedis: Redis) {
+    this.dedupRedis = dedupRedis;
+  }
+
+  async getHashDupe(
+    hash: string,
+    key = HASH_DUPE_KEY,
+    //url: string,
+  ): Promise<{ origDate?: string; origUrl?: string }> {
+    const value = await this.dedupRedis.hget(key, hash);
+    if (!value) {
+      return {};
+    }
+    const val = value.split("|");
+    return { origUrl: val[1], origDate: val[0] };
+  }
+
+  async addHashDupe(
+    hash: string,
+    url: string,
+    date: string,
+    key = HASH_DUPE_KEY,
+  ) {
+    const val = date.replace(/[^\d]/g, "") + "|" + url;
+    await this.dedupRedis.hsetnx(key, hash, val);
+  }
+}
+
+// ============================================================================
+export class RedisCrawlState extends RedisDedupIndex {
   redis: Redis;
   maxRetries: number;
-  dedupRedis: Redis;

   uid: string;
   key: string;
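A minimal sketch (not part of the diff; the digest, URL and date are hypothetical) of the value layout RedisDedupIndex uses, assuming a plain ioredis connection: the HASH_DUPE_KEY ("dupe") hash maps a payload digest to "<digits-only date>|<url>", written with HSETNX so the first occurrence wins.

import Redis from "ioredis";

const redis = new Redis("redis://localhost:6379/0"); // placeholder URL

// what addHashDupe("sha256:abcd", "https://example.com/", "2024-01-01T12:34:56Z") stores
await redis.hsetnx("dupe", "sha256:abcd", "20240101123456|https://example.com/");

// what getHashDupe("sha256:abcd") reads back
const value = await redis.hget("dupe", "sha256:abcd");
if (value) {
  const [origDate, origUrl] = value.split("|");
  console.log({ origUrl, origDate }); // { origUrl: "https://example.com/", origDate: "20240101123456" }
}

await redis.quit();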
@@ -224,8 +256,8 @@ export class RedisCrawlState {
     maxRetries?: number,
     dedupRedis?: Redis,
   ) {
+    super(dedupRedis || redis);
     this.redis = redis;
-    this.dedupRedis = dedupRedis || redis;

     this.uid = uid;
     this.key = key;
@@ -1012,43 +1044,12 @@ return inx;
     return await this.redis.zcard(this.qkey);
   }

-  async addIfNoDupe(key: string, url: string, status: number) {
-    return (
-      (await this.redis.sadd(key, normalizeDedupStatus(status) + "|" + url)) ===
-      1
-    );
+  async addIfNoDupe(key: string, url: string, other_id: string) {
+    return (await this.redis.sadd(key, other_id + "|" + url)) === 1;
   }

-  async removeDupe(key: string, url: string, status: number) {
-    return await this.redis.srem(key, normalizeDedupStatus(status) + "|" + url);
+  async removeDupe(key: string, url: string, other_id: string) {
+    return await this.redis.srem(key, other_id + "|" + url);
   }

-  async getHashDupe(
-    key: string,
-    hash: string,
-    url: string,
-  ): Promise<{ dupe?: boolean; origDate?: string; origUrl?: string }> {
-    const value = await this.dedupRedis.hget(key, hash);
-    if (!value) {
-      return {};
-    }
-    const val = value.split("|");
-    // if matches the first entry, return
-    if (val[1] === url) {
-      return { dupe: true };
-    }
-    // otherwise, check if a revisit entry
-    if (await this.dedupRedis.sismember(`${key}:${hash}`, url)) {
-      return { dupe: true };
-    }
-    return { origUrl: val[1], origDate: val[0] };
-  }
-
-  async addHashDupe(key: string, hash: string, url: string, date: string) {
-    const val = date + "|" + url;
-    if (!(await this.dedupRedis.hsetnx(key, hash, val))) {
-      await this.dedupRedis.sadd(`${key}:${hash}`, url);
-    }
-  }
-
   async isInUserSet(value: string) {
src/util/wacz.ts
@@ -464,4 +464,17 @@ export class WACZLoader {

     return reader;
   }
+
+  async *iterFiles(prefix: string) {
+    if (!this.zipreader) {
+      await this.init();
+    }
+    const entries = await this.zipreader!.load();
+
+    for (const [key, value] of Object.entries(entries)) {
+      if (key.startsWith(prefix)) {
+        yield value;
+      }
+    }
+  }
 }