Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-12-08 06:09:48 +00:00)
QA Crawl Support (Beta) (#469)
Initial (beta) support for QA/replay crawling!
- Supports running a crawl over a given WACZ or list of WACZs (multi-WACZ) input, replayed in ReplayWeb.page
- Runs a local HTTP server with a full-page, UI-less ReplayWeb.page embed
- The ReplayWeb.page release version is configured in the Dockerfile; the pinned ui.js and sw.js are fetched directly from cdnjs
Can be deployed with the `webrecorder/browsertrix-crawler qa` entrypoint (see the example after this list).
- Requires `--qaSource`, pointing to the WACZ or multi-WACZ JSON that will be replayed and QA'd
- Also supports `--qaRedisKey`: if specified, QA comparison data will be pushed to that Redis key.
- Supports `--qaDebugImageDiff` for outputting crawl, replay, and diff images.
- If using `--writePagesToRedis`, a `comparison` key is added to the existing page data:
```
comparison: {
  screenshotMatch?: number;
  textMatch?: number;
  resourceCounts: {
    crawlGood?: number;
    crawlBad?: number;
    replayGood?: number;
    replayBad?: number;
  };
};
```
- Bump version to 1.1.0-beta.2
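For reference, a hypothetical invocation of the new entrypoint (the volume mount and WACZ path below are placeholders, not taken from this change; the flags are those listed above). Adding `--qaRedisKey` and `--writePagesToRedis` additionally pushes the per-page `comparison` data to Redis.

```
# Placeholder paths; assumes a WACZ from a previous crawl is mounted into /crawls/
docker run -v $PWD/crawls/:/crawls/ webrecorder/browsertrix-crawler qa \
  --qaSource /crawls/collections/mycrawl/mycrawl.wacz \
  --qaDebugImageDiff
```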
Parent: 22a7351dc7
Commit: bb9c82493b
22 changed files with 2068 additions and 598 deletions
src/crawler.ts (187 changes; excerpt)

@@ -13,6 +13,8 @@ import {
   PageCallbacks,
 } from "./util/state.js";
+import { parseArgs } from "./util/argParser.js";
 import yaml from "js-yaml";
 import * as warcio from "warcio";
@@ -29,7 +31,6 @@ import {
 } from "./util/storage.js";
 import { ScreenCaster, WSTransport } from "./util/screencaster.js";
 import { Screenshots } from "./util/screenshots.js";
-import { parseArgs } from "./util/argParser.js";
 import { initRedis } from "./util/redis.js";
 import { logger, formatErr } from "./util/logger.js";
 import {
@@ -57,6 +58,7 @@ import { OriginOverride } from "./util/originoverride.js";
 import { Agent as HTTPAgent } from "http";
 import { Agent as HTTPSAgent } from "https";
 import { CDPSession, Frame, HTTPRequest, Page } from "puppeteer-core";
 import { Recorder } from "./util/recorder.js";
 import { SitemapReader } from "./util/sitemapper.js";
 import { ScopedSeed } from "./util/seeds.js";
@@ -146,6 +148,8 @@ export class Crawler {
   pagesDir: string;
   pagesFile: string;
 
+  archivesDir: string;
+
   blockRules: BlockRules | null;
   adBlockRules: AdBlockRules | null;
@@ -154,11 +158,15 @@ export class Crawler {
 
   screencaster: ScreenCaster | null = null;
 
+  skipTextDocs = 0;
+
   interrupted = false;
   finalExit = false;
   uploadAndDeleteLocal = false;
   done = false;
 
+  textInPages = false;
+
   customBehaviors = "";
   behaviorsChecked = false;
   behaviorLastLine?: string;
@@ -178,10 +186,12 @@ export class Crawler {
     crawler: Crawler;
   }) => NonNullable<unknown>;
 
+  recording = true;
+
   constructor() {
-    const res = parseArgs();
-    this.params = res.parsed;
-    this.origConfig = res.origConfig;
+    const args = this.parseArgs();
+    this.params = args.parsed;
+    this.origConfig = args.origConfig;
 
     // root collections dir
     this.collDir = path.join(
@@ -259,6 +269,9 @@ export class Crawler {
     // pages file
     this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
 
+    // archives dir
+    this.archivesDir = path.join(this.collDir, "archive");
+
     this.blockRules = null;
     this.adBlockRules = null;
@@ -268,6 +281,8 @@ export class Crawler {
     this.finalExit = false;
     this.uploadAndDeleteLocal = false;
 
+    this.textInPages = this.params.text.includes("to-pages");
+
     this.done = false;
 
     this.customBehaviors = "";
@@ -281,6 +296,10 @@ export class Crawler {
     }
   }
 
+  protected parseArgs() {
+    return parseArgs();
+  }
+
   configureUA() {
     // override userAgent
     if (this.params.userAgent) {
@@ -434,7 +453,9 @@ export class Crawler {
     // logger.info("wb-manager init failed, collection likely already exists");
     //}
 
-    fs.mkdirSync(this.logDir, { recursive: true });
+    await fsp.mkdir(this.logDir, { recursive: true });
+    await fsp.mkdir(this.archivesDir, { recursive: true });
 
     this.logFH = fs.createWriteStream(this.logFilename);
     logger.setExternalLogStream(this.logFH);
@@ -721,10 +742,10 @@ self.__bx_behaviors.selectMainBehavior();
     return "";
   }
 
-  async crawlPage(opts: WorkerState) {
+  async crawlPage(opts: WorkerState): Promise<void> {
     await this.writeStats();
 
-    const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
+    const { page, data, workerid, callbacks, directFetchCapture } = opts;
     data.callbacks = callbacks;
 
     const { url } = data;
@@ -764,7 +785,7 @@ self.__bx_behaviors.selectMainBehavior();
           { url, ...logDetails },
           "fetch",
         );
-        return true;
+        return;
       }
     } catch (e) {
       // filtered out direct fetch
@@ -782,7 +803,14 @@ self.__bx_behaviors.selectMainBehavior();
     data.title = await page.title();
     data.favicon = await this.getFavicon(page, logDetails);
 
-    const archiveDir = path.join(this.collDir, "archive");
+    await this.doPostLoadActions(opts);
+  }
+
+  async doPostLoadActions(opts: WorkerState, saveOutput = false) {
+    const { page, cdp, data, workerid } = opts;
+    const { url } = data;
+
+    const logDetails = { page: url, workerid };
 
     if (this.params.screenshot) {
       if (!data.isHTMLPage) {
@@ -793,10 +821,10 @@ self.__bx_behaviors.selectMainBehavior();
         browser: this.browser,
         page,
         url,
-        directory: archiveDir,
+        directory: this.archivesDir,
       });
       if (this.params.screenshot.includes("view")) {
-        await screenshots.take();
+        await screenshots.take("view", saveOutput ? data : null);
       }
       if (this.params.screenshot.includes("fullPage")) {
        await screenshots.takeFullPage();
@@ -812,15 +840,16 @@ self.__bx_behaviors.selectMainBehavior();
       textextract = new TextExtractViaSnapshot(cdp, {
         warcPrefix: this.warcPrefix,
         url,
-        directory: archiveDir,
+        directory: this.archivesDir,
+        skipDocs: this.skipTextDocs,
       });
-      const { changed, text } = await textextract.extractAndStoreText(
+      const { text } = await textextract.extractAndStoreText(
         "text",
         false,
         this.params.text.includes("to-warc"),
       );
 
-      if (changed && text && this.params.text.includes("to-pages")) {
+      if (text && (this.textInPages || saveOutput)) {
         data.text = text;
       }
     }
@@ -868,8 +897,6 @@ self.__bx_behaviors.selectMainBehavior();
       );
       await sleep(this.params.pageExtraDelay);
     }
-
-    return true;
   }
 
   async pageFinished(data: PageState) {
@@ -1047,8 +1074,7 @@ self.__bx_behaviors.selectMainBehavior();
   async checkLimits() {
     let interrupt = false;
 
-    const dir = path.join(this.collDir, "archive");
-    const size = await getDirSize(dir);
+    const size = await getDirSize(this.archivesDir);
 
     await this.crawlState.setArchiveSize(size);
@@ -1230,28 +1256,11 @@ self.__bx_behaviors.selectMainBehavior();
 
     this.screencaster = this.initScreenCaster();
 
-    if (this.params.originOverride.length) {
+    if (this.params.originOverride && this.params.originOverride.length) {
       this.originOverride = new OriginOverride(this.params.originOverride);
     }
 
-    for (let i = 0; i < this.params.scopedSeeds.length; i++) {
-      const seed = this.params.scopedSeeds[i];
-      if (!(await this.queueUrl(i, seed.url, 0, 0))) {
-        if (this.limitHit) {
-          break;
-        }
-      }
-
-      if (seed.sitemap) {
-        await timedRun(
-          this.parseSitemap(seed, i),
-          SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
-          "Sitemap initial fetch timed out",
-          { sitemap: seed.sitemap, seed: seed.url },
-          "sitemap",
-        );
-      }
-    }
+    await this._addInitialSeeds();
 
     await this.browser.launch({
       profileUrl: this.params.profile,
@@ -1272,12 +1281,14 @@ self.__bx_behaviors.selectMainBehavior();
           "browser",
         );
       },
 
+      recording: this.recording,
       // eslint-disable-next-line @typescript-eslint/no-explicit-any
     } as any);
 
     // --------------
     // Run Crawl Here!
-    await runWorkers(this, this.params.workers, this.maxPageTime, this.collDir);
+    await runWorkers(this, this.params.workers, this.maxPageTime);
     // --------------
 
     await this.serializeConfig(true);
@@ -1297,6 +1308,27 @@ self.__bx_behaviors.selectMainBehavior();
     await this.postCrawl();
   }
 
+  protected async _addInitialSeeds() {
+    for (let i = 0; i < this.params.scopedSeeds.length; i++) {
+      const seed = this.params.scopedSeeds[i];
+      if (!(await this.queueUrl(i, seed.url, 0, 0))) {
+        if (this.limitHit) {
+          break;
+        }
+      }
+
+      if (seed.sitemap) {
+        await timedRun(
+          this.parseSitemap(seed, i),
+          SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
+          "Sitemap initial fetch timed out",
+          { sitemap: seed.sitemap, seed: seed.url },
+          "sitemap",
+        );
+      }
+    }
+  }
+
   async postCrawl() {
     if (this.params.combineWARC) {
       await this.combineWARC();
@@ -1307,9 +1339,9 @@ self.__bx_behaviors.selectMainBehavior();
       await fsp.mkdir(path.join(this.collDir, "indexes"), { recursive: true });
       await this.crawlState.setStatus("generate-cdx");
 
-      const warcList = await fsp.readdir(path.join(this.collDir, "archive"));
+      const warcList = await fsp.readdir(this.archivesDir);
       const warcListFull = warcList.map((filename) =>
-        path.join(this.collDir, "archive", filename),
+        path.join(this.archivesDir, filename),
       );
 
       //const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
@@ -1377,10 +1409,8 @@ self.__bx_behaviors.selectMainBehavior();
     logger.info("Generating WACZ");
     await this.crawlState.setStatus("generate-wacz");
 
-    const archiveDir = path.join(this.collDir, "archive");
-
     // Get a list of the warcs inside
-    const warcFileList = await fsp.readdir(archiveDir);
+    const warcFileList = await fsp.readdir(this.archivesDir);
 
     // is finished (>0 pages and all pages written)
     const isFinished = await this.crawlState.isFinished();
@@ -1440,7 +1470,9 @@ self.__bx_behaviors.selectMainBehavior();
 
     createArgs.push("-f");
 
-    warcFileList.forEach((val) => createArgs.push(path.join(archiveDir, val)));
+    warcFileList.forEach((val) =>
+      createArgs.push(path.join(this.archivesDir, val)),
+    );
 
     // create WACZ
     const waczResult = await this.awaitProcess(
@@ -1900,13 +1932,15 @@ self.__bx_behaviors.selectMainBehavior();
     depth: number,
     extraHops: number,
     logDetails: LogDetails = {},
+    ts = 0,
+    pageid?: string,
   ) {
     if (this.limitHit) {
       return false;
     }
 
     const result = await this.crawlState.addToQueue(
-      { url, seedId, depth, extraHops },
+      { url, seedId, depth, extraHops, ts, pageid },
       this.pageLimit,
     );
@@ -1954,7 +1988,7 @@ self.__bx_behaviors.selectMainBehavior();
       id: "pages",
       title: "All Pages",
     };
-    header["hasText"] = this.params.text.includes("to-pages");
+    header["hasText"] = String(this.textInPages);
     if (this.params.text.length) {
       logger.debug("Text Extraction: " + this.params.text.join(","));
     } else {
@@ -1968,20 +2002,30 @@ self.__bx_behaviors.selectMainBehavior();
     }
   }
 
-  async writePage({
-    pageid,
-    url,
-    depth,
-    title,
-    text,
-    loadState,
-    mime,
-    favicon,
-    ts,
-    status,
-  }: PageState) {
-    const row: PageEntry = { id: pageid!, url, title, loadState };
+  protected pageEntryForRedis(
+    entry: Record<string, string | number | boolean | object>,
+    // eslint-disable-next-line @typescript-eslint/no-unused-vars
+    state: PageState,
+  ) {
+    return entry;
+  }
+
+  async writePage(state: PageState) {
+    const {
+      pageid,
+      url,
+      depth,
+      title,
+      text,
+      loadState,
+      mime,
+      favicon,
+      status,
+    } = state;
+
+    const row: PageEntry = { id: pageid, url, title, loadState };
+
+    let { ts } = state;
     if (!ts) {
       ts = new Date();
       logger.warn("Page date missing, setting to now", { url, ts });
@@ -1998,14 +2042,16 @@ self.__bx_behaviors.selectMainBehavior();
     }
 
     if (this.params.writePagesToRedis) {
-      await this.crawlState.writeToPagesQueue(JSON.stringify(row));
+      await this.crawlState.writeToPagesQueue(
+        JSON.stringify(this.pageEntryForRedis(row, state)),
+      );
     }
 
     if (depth === 0) {
       row.seed = true;
     }
 
-    if (text) {
+    if (text && this.textInPages) {
       row.text = text;
     }
 
@@ -2151,7 +2197,7 @@ self.__bx_behaviors.selectMainBehavior();
     await this.crawlState.setStatus("generate-warc");
 
     // Get the list of created Warcs
-    const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));
+    const warcLists = await fsp.readdir(this.archivesDir);
 
     logger.debug(`Combining ${warcLists.length} WARCs...`);
 
@@ -2159,7 +2205,7 @@ self.__bx_behaviors.selectMainBehavior();
 
     // Go through a list of the created works and create an array sorted by their filesize with the largest file first.
     for (let i = 0; i < warcLists.length; i++) {
-      const fileName = path.join(this.collDir, "archive", warcLists[i]);
+      const fileName = path.join(this.archivesDir, warcLists[i]);
       const fileSize = await getFileSize(fileName);
       fileSizeObjects.push({ fileSize: fileSize, fileName: fileName });
       fileSizeObjects.sort((a, b) => b.fileSize - a.fileSize);
@@ -2316,6 +2362,21 @@ self.__bx_behaviors.selectMainBehavior();
       await this.storage.uploadFile(filename, targetFilename);
     }
   }
+
+  createRecorder(id: number): Recorder | null {
+    if (!this.recording) {
+      return null;
+    }
+
+    const res = new Recorder({
+      workerid: id,
+      collDir: this.collDir,
+      crawler: this,
+    });
+
+    this.browser.recorders.push(res);
+    return res;
+  }
 }
 
 function shouldIgnoreAbort(req: HTTPRequest) {
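The `pageEntryForRedis()` hook added above is a no-op in the base `Crawler`; presumably the QA entrypoint overrides it to attach the `comparison` block described in the commit message. A minimal sketch of such an override, assuming a hypothetical `QACrawler` subclass, an illustrative per-page results map, and the usual import paths (none of which appear in this excerpt):

```
import { Crawler } from "./crawler.js";
import { PageState } from "./util/state.js";

// Hypothetical subclass; only pageEntryForRedis() and its signature come from the diff above.
class QACrawler extends Crawler {
  // Illustrative storage for per-page QA comparison results, keyed by page URL.
  comparisons = new Map<string, object>();

  protected override pageEntryForRedis(
    entry: Record<string, string | number | boolean | object>,
    state: PageState,
  ) {
    // Attach screenshot/text match scores and resource counts, if computed for this page,
    // so they ride along with the entry written via --writePagesToRedis.
    const comparison = this.comparisons.get(state.url);
    if (comparison) {
      entry.comparison = comparison;
    }
    return entry;
  }
}
```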