Add option to write JSONL file with data on skipped pages (#966)

Fixes #965

Add `--reportSkipped` argument, which enables the creation of a `reports/skippedPages.jsonl` file containing one JSON line with the following elements for each URL that was encountered but not queued:

- `url`
- `seedUrl`
- `depth`
- `seed` (`true` when `depth` is 0)
- `reason` (one of `outOfScope`, `pageLimit`, `robotsTxt`, or `redirectToExcluded`)
- `ts`

The `reports/` directory is new and will likely be expanded with other crawl-time reporting moving forward.

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
2026-04-18 07:00:22 +00:00 · 2026-04-09 15:51:41 -04:00 · 2026-04-09 15:51:41 -04:00 · 1c6e814e15
commit 1c6e814e15
parent 64fdaf0d11
7 changed files with 308 additions and 4 deletions
--- a/.eslintrc.cjs
+++ b/.eslintrc.cjs
@ -33,4 +33,12 @@ module.exports = {
    "@typescript-eslint/await-thenable": "error",
  },
  reportUnusedDisableDirectives: true,
+  overrides: [
+    {
+      "files": ["tests/*.ts"],
+      "rules": {
+        "@typescript-eslint/no-floating-promises": "off"
+      }
+    }
+  ]
 };
--- a/docs/docs/user-guide/cli-options.md
+++ b/docs/docs/user-guide/cli-options.md
@ -337,6 +337,10 @@ Options:
      --robotsAgent                         Agent to check in addition to '*' fo
                                            r robots rules
                                           [string] [default: "Browsertrix/1.x"]
+      --reportSkipped                       If set, write information about URLs
+                                             encountered but not queued to repor
+                                            ts/skippedPages.jsonl
+                                                      [boolean] [default: false]
      --config                              Path to YAML config file
 ```

--- a/src/crawler.ts
+++ b/src/crawler.ts
@ -46,6 +46,7 @@ import {
  SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
  ExitCodes,
  InterruptReason,
+  SkippedReason,
  BxFunctionBindings,
  MAX_JS_DIALOG_PER_PAGE,
  CrawlStatus,
@ -119,6 +120,7 @@ export class Crawler {

  pagesFH?: WriteStream | null = null;
  extraPagesFH?: WriteStream | null = null;
+  skippedPagesFH?: WriteStream | null = null;

  crawlId: string;

@ -149,6 +151,9 @@ export class Crawler {
  seedPagesFile: string;
  otherPagesFile: string;

+  reportsDir: string;
+  skippedPagesFile: string;
+
  archivesDir: string;
  warcCdxDir: string;
  indexesDir: string;
@ -284,6 +289,12 @@ export class Crawler {
    this.seedPagesFile = path.join(this.pagesDir, "pages.jsonl");
    this.otherPagesFile = path.join(this.pagesDir, "extraPages.jsonl");

+    // reports directory
+    this.reportsDir = path.join(this.collDir, "reports");
+
+    // reports files
+    this.skippedPagesFile = path.join(this.reportsDir, "skippedPages.jsonl");
+
    // archives dir
    this.archivesDir = path.join(this.collDir, "archive");

@ -787,7 +798,13 @@ export class Crawler {
      seedId,
    );

-    return !!seed.isIncluded(url, depth, extraHops, logDetails);
+    const res = seed.isIncluded(url, depth, extraHops, logDetails);
+
+    if (!res) {
+      this.writeSkippedPage(url, seedId, depth, SkippedReason.OutOfScope);
+    }
+
+    return !!res;
  }

  async setupPage(opts: WorkerState) {
@ -1349,6 +1366,13 @@ self.__bx_behaviors.selectMainBehavior();
    } else {
      if (pageSkipped) {
        await this.crawlState.markExcluded(url);
+
+        this.writeSkippedPage(
+          url,
+          data.seedId,
+          depth,
+          SkippedReason.RedirectToExcluded,
+        );
        this.limitHit = false;
      } else {
        const retry = await this.crawlState.markFailed(url, noRetries);
@ -1822,6 +1846,13 @@ self.__bx_behaviors.selectMainBehavior();
      this.otherPagesFile,
      "Non-Seed Pages",
    );
+    if (this.params.reportSkipped) {
+      this.skippedPagesFH = await this.initPages(
+        this.skippedPagesFile,
+        "Skipped Pages",
+        true,
+      );
+    }

    this.adBlockRules = new AdBlockRules(
      this.captureBasePrefix,
@ -1927,6 +1958,18 @@ self.__bx_behaviors.selectMainBehavior();
        this.extraPagesFH = null;
      }
    }
+
+    if (this.skippedPagesFH) {
+      try {
+        await new Promise<void>((resolve) =>
+          this.skippedPagesFH!.close(() => resolve()),
+        );
+      } catch (e) {
+        // ignore
+      } finally {
+        this.skippedPagesFH = null;
+      }
+    }
  }

  async closeFiles() {
@ -2107,6 +2150,10 @@ self.__bx_behaviors.selectMainBehavior();
      }
    }

+    if (this.params.reportSkipped) {
+      waczOpts.reportsDir = this.reportsDir;
+    }
+
    if (this.params.title) {
      waczOpts.title = this.params.title;
    }
@ -2587,6 +2634,12 @@ self.__bx_behaviors.selectMainBehavior();
        );

        if (!res) {
+          this.writeSkippedPage(
+            possibleUrl,
+            seedId,
+            depth,
+            SkippedReason.OutOfScope,
+          );
          continue;
        }

@ -2642,6 +2695,12 @@ self.__bx_behaviors.selectMainBehavior();
    pageid?: string,
  ) {
    if (this.limitHit) {
+      logger.debug(
+        "Page URL not queued, at page limit",
+        { url, ...logDetails },
+        "links",
+      );
+      this.writeSkippedPage(url, seedId, depth, SkippedReason.PageLimit);
      return false;
    }

@ -2654,6 +2713,7 @@ self.__bx_behaviors.selectMainBehavior();
        { url, ...logDetails },
        "links",
      );
+      this.writeSkippedPage(url, seedId, depth, SkippedReason.RobotsTxt);
      return false;
    }

@ -2679,6 +2739,7 @@ self.__bx_behaviors.selectMainBehavior();
          );
        }
        this.limitHit = true;
+        this.writeSkippedPage(url, seedId, depth, SkippedReason.PageLimit);
        return false;

      case QueueState.DUPE_URL:
@ -2693,11 +2754,13 @@ self.__bx_behaviors.selectMainBehavior();
    return false;
  }

-  async initPages(filename: string, title: string) {
+  async initPages(filename: string, title: string, isReport: boolean = false) {
    let fh = null;

    try {
-      await fsp.mkdir(this.pagesDir, { recursive: true });
+      await fsp.mkdir(isReport ? this.reportsDir : this.pagesDir, {
+        recursive: true,
+      });

      const createNew = !fs.existsSync(filename);

@ -2810,6 +2873,48 @@ self.__bx_behaviors.selectMainBehavior();
    }
  }

+  /**
+   * Append one JSONL row to the skipped pages report for a URL that was
+   * encountered but not queued.
+   *
+   * Does nothing unless the `reportSkipped` param is set. If the report
+   * stream is missing, an error is logged and nothing is written.
+   *
+   * @param url - URL that was not queued
+   * @param seedId - index into `this.seeds`, used to look up `seedUrl`
+   *   (written as "" if there is no seed at that index)
+   * @param depth - depth of the URL; a depth of 0 is reported as `seed: true`
+   * @param reason - why the URL was not queued
+   */
+  writeSkippedPage(
+    url: string,
+    seedId: number,
+    depth: number,
+    reason: SkippedReason,
+  ) {
+    if (!this.params.reportSkipped) {
+      return;
+    }
+
+    // check for the stream first, no point building a row that can't be written
+    if (!this.skippedPagesFH) {
+      logger.error(
+        "Can't write skipped pages, missing stream",
+        {},
+        "pageStatus",
+      );
+      return;
+    }
+
+    const seedUrl = this.seeds[seedId]?.url || "";
+    const seed = depth === 0;
+    const ts = new Date().toISOString();
+
+    const row = { url, seedUrl, depth, seed, reason, ts };
+
+    try {
+      // NOTE(review): stream write failures are normally reported via the
+      // stream's 'error' event, so this only guards against synchronous throws
+      this.skippedPagesFH.write(JSON.stringify(row) + "\n");
+    } catch (err) {
+      logger.warn(
+        "Page append failed",
+        {
+          pagesFile: this.skippedPagesFile,
+          // include the failure detail instead of silently dropping it
+          error: err instanceof Error ? err.message : String(err),
+        },
+        "pageStatus",
+      );
+    }
+  }
+
  async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) {
    if (!sitemap) {
      return;
--- a/src/util/argParser.ts
+++ b/src/util/argParser.ts
@ -736,6 +736,13 @@ class ArgParser {
          type: "string",
          default: "Browsertrix/1.x",
        },
+
+        reportSkipped: {
+          describe:
+            "If set, write information about URLs encountered but not queued to reports/skippedPages.jsonl",
+          type: "boolean",
+          default: false,
+        },
      });
  }

--- a/src/util/constants.ts
+++ b/src/util/constants.ts
@ -117,3 +117,10 @@ export type CrawlStatus =
  | "canceled";

 export const WARC_REFERS_TO_CONTAINER = "WARC-Refers-To-Container";
+
+/**
+ * Reason a URL was encountered but not queued. The string value is what is
+ * written to the `reason` field of each row in reports/skippedPages.jsonl.
+ */
+export enum SkippedReason {
+  // URL was not included by the seed's scope check
+  OutOfScope = "outOfScope",
+  // URL was not queued because the crawl page limit was reached
+  PageLimit = "pageLimit",
+  // NOTE(review): presumably URL was disallowed by robots.txt rules, confirm against caller
+  RobotsTxt = "robotsTxt",
+  // page was skipped during load and marked as excluded
+  // NOTE(review): presumably because it redirected to an excluded URL, confirm
+  RedirectToExcluded = "redirectToExcluded",
+}
--- a/src/util/wacz.ts
+++ b/src/util/wacz.ts
@ -39,6 +39,7 @@ export type WACZInitOpts = {
  warcCdxDir: string;
  indexesDir: string;
  logDirectory: string;
+  reportsDir?: string;

  softwareString: string;

@ -102,6 +103,7 @@ export class WACZ {
  logsDir: string;
  warcCdxDir: string;
  indexesDir: string;
+  reportsDir: string | null;

  datapackage: WACZDataPackage;

@ -119,6 +121,7 @@ export class WACZ {
    this.warcCdxDir = config.warcCdxDir;
    this.collDir = collDir;
    this.indexesDir = config.indexesDir;
+    this.reportsDir = config.reportsDir || null;

    this.datapackage = {
      resources: [],
@ -144,13 +147,17 @@ export class WACZ {
  }

  generate(): Readable {
-    const files = [
+    const baseFiles = [
      ...this.warcs,
      ...addDirFiles(this.indexesDir),
      ...addDirFiles(this.pagesDir),
      ...addDirFiles(this.logsDir),
    ];

+    const files = this.reportsDir
+      ? [...baseFiles, ...addDirFiles(this.reportsDir)]
+      : baseFiles;
+
    const zip = makeZip(
      this.iterDirForZip(files),
    ) as ReadableStream<Uint8Array>;
--- a/tests/skipped_pages.test.ts
+++ b/tests/skipped_pages.test.ts
@ -0,0 +1,166 @@
+import child_process from "child_process";
+import fs from "fs";
+import md5 from "md5";
+
+//const doValidate = process.argv.filter((x) => x.startsWith('-validate'))[0];
+//const testIf = (condition: string, ...args: Parameters<typeof test>) => condition ? test(...args) : test.skip(...args);
+
+/**
+ * Read reports/skippedPages.jsonl for the given collection and check that
+ * every data row has the expected fields and the expected skip reason, and
+ * that at least one data row exists.
+ *
+ * Rows with a `format` field are ignored (presumably the header line of the
+ * JSONL file).
+ *
+ * @param collection - collection name under test-crawls/collections/
+ * @param reason - expected value of the `reason` field in every data row
+ */
+function expectSkippedPagesWithReason(collection: string, reason: string) {
+  const skippedPages = fs.readFileSync(
+    `test-crawls/collections/${collection}/reports/skippedPages.jsonl`,
+    "utf8",
+  );
+
+  let pageCount = 0;
+
+  for (const line of skippedPages.trim().split("\n")) {
+    const data = JSON.parse(line);
+    if (data.format) {
+      continue;
+    }
+
+    pageCount++;
+
+    expect(data).toHaveProperty("url");
+    expect(data).toHaveProperty("seedUrl");
+    expect(data).toHaveProperty("depth");
+    expect(data).toHaveProperty("reason");
+    expect(data.reason).toBe(reason);
+    expect(data).toHaveProperty("ts");
+  }
+
+  // toBeGreaterThan reports the actual count on failure
+  expect(pageCount).toBeGreaterThan(0);
+}
+
+test("ensure basic crawl run with docker run passes with reportSkipped option, out of scope pages", async () => {
+  child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --generateWACZ  --collection skipped-pages --workers 2 --reportSkipped --limit 5",
+  );
+
+  // unpack the WACZ so later tests can inspect its contents
+  child_process.execSync(
+    "unzip test-crawls/collections/skipped-pages/skipped-pages.wacz -d test-crawls/collections/skipped-pages/wacz",
+  );
+});
+
+// testIf(doValidate, "validate wacz with skippedPages.jsonl", () => {
+//   child_process.execSync(
+//     "wacz validate --file ./test-crawls/collections/skipped-pages/skipped-pages.wacz",
+//   );
+// });
+
+test("ensure skippedPages.jsonl was written as expected, contains outOfScope page", () => {
+  expectSkippedPagesWithReason("skipped-pages", "outOfScope");
+});
+
+test("ensure skippedPages.jsonl was written to wacz", () => {
+  // the copy inside the WACZ must be identical to the one in the collection
+  const crawlHash = md5(
+    fs.readFileSync(
+      "test-crawls/collections/skipped-pages/reports/skippedPages.jsonl",
+      "utf8",
+    ),
+  );
+  const waczHash = md5(
+    fs.readFileSync(
+      "test-crawls/collections/skipped-pages/wacz/reports/skippedPages.jsonl",
+      "utf8",
+    ),
+  );
+
+  expect(crawlHash).toEqual(waczHash);
+});
+
+test("check that skippedPages.jsonl file made it into WACZ datapackage.json", () => {
+  expect(
+    fs.existsSync(
+      "test-crawls/collections/skipped-pages/wacz/datapackage.json",
+    ),
+  ).toBe(true);
+
+  const data = fs.readFileSync(
+    "test-crawls/collections/skipped-pages/wacz/datapackage.json",
+    "utf8",
+  );
+
+  const dataPackageJSON = JSON.parse(data);
+  const resources: { path: string; bytes: number }[] =
+    dataPackageJSON.resources;
+
+  // the report must be listed as a non-empty resource
+  const found = resources.some(
+    (res) => res.path === "reports/skippedPages.jsonl" && res.bytes > 0,
+  );
+
+  expect(found).toBe(true);
+});
+
+test("ensure basic crawl run with docker run passes with reportSkipped option, pageLimit report", async () => {
+  child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --extraHops 1 --reportSkipped --collection skipped-pages-2 --limit 1",
+  );
+});
+
+test("ensure skippedPages.jsonl was written as expected, contains pageLimit page", () => {
+  expectSkippedPagesWithReason("skipped-pages-2", "pageLimit");
+});
+
+test("redirect to excluded page, crawl fails as no seeds crawled", () => {
+  let failed = false;
+  try {
+    child_process.execSync(
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://old.webrecorder.net/ --exclude https://old.webrecorder.net/ --reportSkipped --collection skipped-pages-3",
+    );
+  } catch (e) {
+    // execSync throws when the crawler exits with a non-zero status
+    failed = true;
+  }
+
+  expect(failed).toBe(true);
+});
+
+test("ensure skippedPages.jsonl was written as expected, contains redirectToExcluded page", () => {
+  expectSkippedPagesWithReason("skipped-pages-3", "redirectToExcluded");
+});