Better default crawlId (#806)

- set crawl id from collection, not the other way around, to ensure a unique
redis keyspace for different collections
- by default, set crawl id to a unique value based on host and collection,
e.g. '@hostname-@id' (see the sketch below)
- don't include '@id' in collection interpolation; it can only use
hostname or timestamp
- fixes the issue mentioned / workaround provided in #784
- ci: add docker login + caching to work around rate limits
- tests: fix sitemap tests
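
A minimal sketch of the resulting defaulting order (here `interpolate` is a
hypothetical stand-in for `interpolateFilename()` from storage.ts, assumed to
substitute the `@ts`, `@hostname`, and `@id` tokens):

```ts
import os from "os";

// Hypothetical stand-in for interpolateFilename(): assumed to replace
// @ts with a timestamp, @hostname with the container hostname, and
// @id with the supplied id string.
function interpolate(template: string, id: string): string {
  const ts = new Date().toISOString().replace(/[^\d]/g, "").slice(0, 14);
  return template
    .replace("@ts", ts)
    .replace("@hostname", os.hostname())
    .replace("@id", id);
}

const DEFAULT_CRAWL_ID_TEMPLATE = "@hostname-@id";

// The collection is resolved first with an empty @id, so it can
// effectively only use @hostname or @ts.
const collection = interpolate("crawl-@ts", "");
// e.g. "crawl-20250401134003"

// The crawl id is then derived from the resolved collection, giving each
// collection its own redis keyspace by default.
const crawlId = interpolate(
  process.env.CRAWL_ID || DEFAULT_CRAWL_ID_TEMPLATE,
  collection,
);
// e.g. "myhost-crawl-20250401134003"
```

An explicit `--crawlId` / `CRAWL_ID` value still takes precedence over the
default template; under this change it is also run through the same
interpolation.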
Ilya Kreymer 2025-04-01 13:40:03 -07:00 committed by GitHub
parent 5fedde6eee
commit e585b6d194
7 changed files with 52 additions and 32 deletions


@@ -4,6 +4,12 @@ on:
push:
pull_request:
# Cancel in progress workflows on pull_requests.
# https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-a-fallback-value
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
lint:
runs-on: ubuntu-latest
@@ -48,6 +54,17 @@ jobs:
- name: build js
run: yarn run tsc
- name: Cache Docker Images
uses: ScribeMD/docker-cache@0.5.0
with:
key: docker-${{ runner.os }}-${{ hashFiles('Dockerfile') }}
- name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- name: build docker
run: docker compose build


@@ -17,8 +17,10 @@ Options:
llel [number] [default: 1]
--crawlId, --id A user provided ID for this crawl or
crawl configuration (can also be se
t via CRAWL_ID env var, defaults to
Docker container hostname) [string]
t via CRAWL_ID env var), defaults to
combination of Docker container hos
tname and collection
[string] [default: "@hostname-@id"]
--waitUntil Puppeteer page.goto() condition to w
ait for before continuing, can be mu
ltiple separated by ','
@@ -72,17 +74,14 @@ Options:
--adBlockMessage If specified, when an ad is blocked,
a record with this error message is
added instead[string] [default: ""]
-c, --collection Collection name to crawl to (replay
will be accessible under this name i
n pywb preview)
[string] [default: "crawl-@ts"]
-c, --collection Collection name / directory to crawl
into[string] [default: "crawl-@ts"]
--headless Run in headless mode, otherwise star
t xvfb [boolean] [default: false]
--driver Custom driver for the crawler, if an
y [string]
--generateCDX, --generatecdx, --gene If set, generate index (CDXJ) for us
rateCdx e with pywb after crawl is done
[boolean] [default: false]
--generateCDX, --generatecdx, --gene If set, generate merged index in CDX
rateCdx J format [boolean] [default: false]
--combineWARC, --combinewarc, --comb If set, combine the warcs
ineWarc [boolean] [default: false]
--rolloverSize If set, declare the rollover size
@@ -116,9 +115,8 @@ Options:
record(s)
[array] [choices: "to-pages", "to-warc", "final-to-warc"]
--cwd Crawl working directory for captures
(pywb root). If not set, defaults t
o process.cwd()
[string] [default: "/crawls"]
. If not set, defaults to process.cw
d() [string] [default: "/crawls"]
--mobileDevice Emulate mobile device by name from:
https://github.com/puppeteer/puppete
er/blob/main/src/common/DeviceDescri


@@ -204,6 +204,8 @@ export class Crawler {
this.params = args as CrawlerArgs;
this.origConfig = this.params.origConfig;
this.crawlId = this.params.crawlId;
// root collections dir
this.collDir = path.join(
this.params.cwd,
@@ -213,7 +215,7 @@
this.logDir = path.join(this.collDir, "logs");
this.logFilename = path.join(
this.logDir,
`crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`,
`${interpolateFilename("@ts", "")}.log`,
);
const debugLogging = this.params.logging.includes("debug");
@@ -243,8 +245,6 @@
// pages file
this.pagesFH = null;
this.crawlId = this.params.crawlId;
this.startTime = Date.now();
// was the limit hit?
@@ -2739,13 +2739,11 @@ self.__bx_behaviors.selectMainBehavior();
this.lastSaveTime = now.getTime();
const ts = now.toISOString().slice(0, 19).replace(/[T:-]/g, "");
const crawlDir = path.join(this.collDir, "crawls");
await fsp.mkdir(crawlDir, { recursive: true });
const filenameOnly = `crawl-${ts}-${this.crawlId}.yaml`;
const filenameOnly = `${interpolateFilename("@ts-@id", this.crawlId)}.yaml`;
const filename = path.join(crawlDir, filenameOnly);


@@ -1,6 +1,5 @@
import path from "path";
import fs from "fs";
import os from "os";
import yaml from "js-yaml";
import { KnownDevices as devices } from "puppeteer-core";
@@ -18,6 +17,7 @@ import {
ExtractSelector,
DEFAULT_MAX_RETRIES,
BxFunctionBindings,
DEFAULT_CRAWL_ID_TEMPLATE,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
@@ -85,7 +85,7 @@ class ArgParser {
crawlId: {
alias: "id",
describe:
"A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to Docker container hostname)",
"A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var), defaults to combination of Docker container hostname and collection",
type: "string",
},
@ -213,8 +213,7 @@ class ArgParser {
collection: {
alias: "c",
describe:
"Collection name to crawl to (replay will be accessible under this name in pywb preview)",
describe: "Collection name / directory to crawl into",
type: "string",
default: "crawl-@ts",
},
@@ -232,8 +231,7 @@
generateCDX: {
alias: ["generatecdx", "generateCdx"],
describe:
"If set, generate index (CDXJ) for use with pywb after crawl is done",
describe: "If set, generate merged index in CDXJ format",
type: "boolean",
default: false,
},
@@ -309,7 +307,7 @@ class ArgParser {
cwd: {
describe:
"Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
"Crawl working directory for captures. If not set, defaults to process.cwd()",
type: "string",
default: process.cwd(),
},
@@ -697,8 +695,11 @@ class ArgParser {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
validateArgs(argv: any, isQA: boolean) {
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname();
argv.collection = interpolateFilename(argv.collection, argv.crawlId);
argv.collection = interpolateFilename(argv.collection, "");
argv.crawlId = interpolateFilename(
argv.crawlId || process.env.CRAWL_ID || DEFAULT_CRAWL_ID_TEMPLATE,
argv.collection,
);
// css selector parser
const parser = createParser();


@@ -50,6 +50,8 @@ export const DEFAULT_SELECTORS: ExtractSelector[] = [
},
];
export const DEFAULT_CRAWL_ID_TEMPLATE = "@hostname-@id";
export const BEHAVIOR_TYPES = [
"autoplay",
"autofetch",


@@ -26,7 +26,7 @@ test("check that log files exist and were filtered according to options", () =>
const logDir = "test-crawls/collections/wr-specs-logs/logs/";
const logFiles = [];
fs.readdirSync(logDir).forEach((file) => {
if (file.startsWith("crawl-") && file.endsWith(".log")) {
if (file.endsWith(".log")) {
logFiles.push(path.join(logDir, file));
}
});


@@ -30,7 +30,9 @@ async function waitContainer(containerId) {
}
async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") {
const containerId = child_process.execSync(`docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`, {encoding: "utf-8"});
const command = `docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`;
console.log(command);
const containerId = child_process.execSync(command, {encoding: "utf-8"});
await sleep(3000);
@@ -46,6 +48,9 @@ async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessTh
while (true) {
finished = await redis.zcard("test:q");
if (await redis.get("test:sitemapDone")) {
break;
}
if (finished >= numExpected) {
break;
}
@@ -64,7 +69,7 @@ async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessTh
}
test("test sitemap fully finish", async () => {
await runCrawl(7000, "https://www.mozilla.org/", "", 0);
await runCrawl(3500, "https://www.mozilla.org/", "", 0);
});
test("test sitemap with limit", async () => {
@@ -79,7 +84,6 @@ test("test sitemap with application/xml content-type", async () => {
await runCrawl(10, "https://bitarchivist.net/", "", 0);
});
test("test sitemap with narrow scope, extraHops, to ensure out-of-scope sitemap URLs do not count as extraHops", async () => {
await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
await runCrawl(0, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
});