Better default crawlId (#806)

- set crawl id from collection, not the other way around, to ensure a unique
redis keyspace for different collections
- by default, set crawl id to a unique value based on host and collection,
e.g. '@hostname-@id' (see the sketch below)
- don't include '@id' in collection interpolation; it can only use
hostname or timestamp
- fixes the issue mentioned / workaround provided in #784
- ci: add docker login + caching to work around rate limits
- tests: fix sitemap tests
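
A minimal sketch of the resulting defaulting order (here `interpolate` is a
hypothetical stand-in for `interpolateFilename()` from storage.ts, assumed to
substitute the `@ts`, `@hostname`, and `@id` tokens):

```ts
import os from "os";

// Hypothetical stand-in for interpolateFilename(): assumed to replace
// @ts with a timestamp, @hostname with the container hostname, and
// @id with the supplied id string.
function interpolate(template: string, id: string): string {
  const ts = new Date().toISOString().replace(/[^\d]/g, "").slice(0, 14);
  return template
    .replace("@ts", ts)
    .replace("@hostname", os.hostname())
    .replace("@id", id);
}

const DEFAULT_CRAWL_ID_TEMPLATE = "@hostname-@id";

// The collection is resolved first with an empty @id, so it can
// effectively only use @hostname or @ts.
const collection = interpolate("crawl-@ts", "");
// e.g. "crawl-20250401134003"

// The crawl id is then derived from the resolved collection, giving each
// collection its own redis keyspace by default.
const crawlId = interpolate(
  process.env.CRAWL_ID || DEFAULT_CRAWL_ID_TEMPLATE,
  collection,
);
// e.g. "myhost-crawl-20250401134003"
```

An explicit `--crawlId` / `CRAWL_ID` value still takes precedence over the
default template; under this change it is also run through the same
interpolation.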
Ilya Kreymer 2025-04-01 13:40:03 -07:00 committed by GitHub
parent 5fedde6eee
commit e585b6d194
7 changed files with 52 additions and 32 deletions


@@ -4,6 +4,12 @@ on:
push:
pull_request:
# Cancel in progress workflows on pull_requests.
# https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-a-fallback-value
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
lint:
runs-on: ubuntu-latest
@@ -48,6 +54,17 @@ jobs:
- name: build js
run: yarn run tsc
- name: Cache Docker Images
uses: ScribeMD/docker-cache@0.5.0
with:
key: docker-${{ runner.os }}-${{ hashFiles('Dockerfile') }}
- name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- name: build docker
run: docker compose build


@@ -17,8 +17,10 @@ Options:
llel [number] [default: 1]
--crawlId, --id A user provided ID for this crawl or
crawl configuration (can also be se
t via CRAWL_ID env var, defaults to
Docker container hostname) [string]
t via CRAWL_ID env var), defaults to
combination of Docker container hos
tname and collection
[string] [default: "@hostname-@id"]
--waitUntil Puppeteer page.goto() condition to w
ait for before continuing, can be mu
ltiple separated by ','
@@ -72,17 +74,14 @@ Options:
--adBlockMessage If specified, when an ad is blocked,
a record with this error message is
added instead[string] [default: ""]
-c, --collection Collection name to crawl to (replay
will be accessible under this name i
n pywb preview)
[string] [default: "crawl-@ts"]
-c, --collection Collection name / directory to crawl
into[string] [default: "crawl-@ts"]
--headless Run in headless mode, otherwise star
t xvfb [boolean] [default: false]
--driver Custom driver for the crawler, if an
y [string]
--generateCDX, --generatecdx, --gene If set, generate index (CDXJ) for us
rateCdx e with pywb after crawl is done
[boolean] [default: false]
--generateCDX, --generatecdx, --gene If set, generate merged index in CDX
rateCdx J format [boolean] [default: false]
--combineWARC, --combinewarc, --comb If set, combine the warcs
ineWarc [boolean] [default: false]
--rolloverSize If set, declare the rollover size
@@ -116,9 +115,8 @@ Options:
record(s)
[array] [choices: "to-pages", "to-warc", "final-to-warc"]
--cwd Crawl working directory for captures
(pywb root). If not set, defaults t
o process.cwd()
[string] [default: "/crawls"]
. If not set, defaults to process.cw
d() [string] [default: "/crawls"]
--mobileDevice Emulate mobile device by name from:
https://github.com/puppeteer/puppete
er/blob/main/src/common/DeviceDescri


@@ -204,6 +204,8 @@ export class Crawler {
this.params = args as CrawlerArgs;
this.origConfig = this.params.origConfig;
this.crawlId = this.params.crawlId;
// root collections dir
this.collDir = path.join(
this.params.cwd,
@@ -213,7 +215,7 @@
this.logDir = path.join(this.collDir, "logs");
this.logFilename = path.join(
this.logDir,
`crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`,
`${interpolateFilename("@ts", "")}.log`,
);
const debugLogging = this.params.logging.includes("debug");
@@ -243,8 +245,6 @@
// pages file
this.pagesFH = null;
this.crawlId = this.params.crawlId;
this.startTime = Date.now();
// was the limit hit?
@@ -2739,13 +2739,11 @@ self.__bx_behaviors.selectMainBehavior();
this.lastSaveTime = now.getTime();
const ts = now.toISOString().slice(0, 19).replace(/[T:-]/g, "");
const crawlDir = path.join(this.collDir, "crawls");
await fsp.mkdir(crawlDir, { recursive: true });
const filenameOnly = `crawl-${ts}-${this.crawlId}.yaml`;
const filenameOnly = `${interpolateFilename("@ts-@id", this.crawlId)}.yaml`;
const filename = path.join(crawlDir, filenameOnly);


@@ -1,6 +1,5 @@
import path from "path";
import fs from "fs";
import os from "os";
import yaml from "js-yaml";
import { KnownDevices as devices } from "puppeteer-core";
@@ -18,6 +17,7 @@ import {
ExtractSelector,
DEFAULT_MAX_RETRIES,
BxFunctionBindings,
DEFAULT_CRAWL_ID_TEMPLATE,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
@@ -85,7 +85,7 @@ class ArgParser {
crawlId: {
alias: "id",
describe:
"A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to Docker container hostname)",
"A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var), defaults to combination of Docker container hostname and collection",
type: "string",
},
@ -213,8 +213,7 @@ class ArgParser {
collection: {
alias: "c",
describe:
"Collection name to crawl to (replay will be accessible under this name in pywb preview)",
describe: "Collection name / directory to crawl into",
type: "string",
default: "crawl-@ts",
},
@@ -232,8 +231,7 @@
generateCDX: {
alias: ["generatecdx", "generateCdx"],
describe:
"If set, generate index (CDXJ) for use with pywb after crawl is done",
describe: "If set, generate merged index in CDXJ format",
type: "boolean",
default: false,
},
@@ -309,7 +307,7 @@ class ArgParser {
cwd: {
describe:
"Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
"Crawl working directory for captures. If not set, defaults to process.cwd()",
type: "string",
default: process.cwd(),
},
@@ -697,8 +695,11 @@ class ArgParser {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
validateArgs(argv: any, isQA: boolean) {
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname();
argv.collection = interpolateFilename(argv.collection, argv.crawlId);
argv.collection = interpolateFilename(argv.collection, "");
argv.crawlId = interpolateFilename(
argv.crawlId || process.env.CRAWL_ID || DEFAULT_CRAWL_ID_TEMPLATE,
argv.collection,
);
// css selector parser
const parser = createParser();


@@ -50,6 +50,8 @@ export const DEFAULT_SELECTORS: ExtractSelector[] = [
},
];
export const DEFAULT_CRAWL_ID_TEMPLATE = "@hostname-@id";
export const BEHAVIOR_TYPES = [
"autoplay",
"autofetch",


@@ -26,7 +26,7 @@ test("check that log files exist and were filtered according to options", () =>
const logDir = "test-crawls/collections/wr-specs-logs/logs/";
const logFiles = [];
fs.readdirSync(logDir).forEach((file) => {
if (file.startsWith("crawl-") && file.endsWith(".log")) {
if (file.endsWith(".log")) {
logFiles.push(path.join(logDir, file));
}
});


@@ -30,7 +30,9 @@ async function waitContainer(containerId) {
}
async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") {
const containerId = child_process.execSync(`docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`, {encoding: "utf-8"});
const command = `docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`;
console.log(command);
const containerId = child_process.execSync(command, {encoding: "utf-8"});
await sleep(3000);
@@ -46,6 +48,9 @@ async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessTh
while (true) {
finished = await redis.zcard("test:q");
if (await redis.get("test:sitemapDone")) {
break;
}
if (finished >= numExpected) {
break;
}
@@ -64,7 +69,7 @@ async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessTh
}
test("test sitemap fully finish", async () => {
await runCrawl(7000, "https://www.mozilla.org/", "", 0);
await runCrawl(3500, "https://www.mozilla.org/", "", 0);
});
test("test sitemap with limit", async () => {
@@ -79,7 +84,6 @@ test("test sitemap with application/xml content-type", async () => {
await runCrawl(10, "https://bitarchivist.net/", "", 0);
});
test("test sitemap with narrow scope, extraHops, to ensure out-of-scope sitemap URLs do not count as extraHops", async () => {
await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
await runCrawl(0, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
});