Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 06:23:16 +00:00)
Better default crawlId (#806)
- set crawl id from collection, not the other way around, to ensure a unique redis keyspace for different collections
- by default, set crawl id to a unique value based on host and collection, e.g. '@hostname-@id'
- don't include '@id' in collection interpolation; only hostname or timestamp can be used there
- fixes issue mentioned / workaround provided in #784
- ci: add docker login + caching to work around rate limits
- tests: fix sitemap tests
parent 5fedde6eee
commit e585b6d194
7 changed files with 52 additions and 32 deletions
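
Before the per-file diffs, here is a minimal sketch of how the new defaults described in the commit message are meant to resolve. The function interpolateSketch below is a hypothetical stand-in for the crawler's real interpolateFilename() helper (which appears in the diffs); the placeholder handling and the example timestamp/hostname values are assumptions for illustration only.

import os from "os";

// Hypothetical stand-in for interpolateFilename(): fills "@ts", "@hostname"
// and "@id" placeholders. Illustration only, not the crawler's implementation.
function interpolateSketch(template: string, id: string): string {
  const ts = new Date().toISOString().slice(0, 19).replace(/[T:-]/g, "");
  return template
    .replace("@ts", ts)
    .replace("@hostname", os.hostname())
    .replace("@id", id);
}

// The collection is resolved first; "@id" is not supported in the collection template.
const collection = interpolateSketch("crawl-@ts", "");
// e.g. "crawl-20251019062316"

// The crawl id default "@hostname-@id" then pulls in the resolved collection,
// so two collections on the same host get distinct crawl ids (and redis keyspaces).
const crawlId = interpolateSketch("@hostname-@id", collection);
// e.g. "3f2a1b9c0d1e-crawl-20251019062316"
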
.github/workflows/ci.yaml (vendored): 17 changes
@@ -4,6 +4,12 @@ on:
   push:
   pull_request:
 
+# Cancel in progress workflows on pull_requests.
+# https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-a-fallback-value
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
   lint:
     runs-on: ubuntu-latest
@@ -48,6 +54,17 @@ jobs:
       - name: build js
         run: yarn run tsc
 
+      - name: Cache Docker Images
+        uses: ScribeMD/docker-cache@0.5.0
+        with:
+          key: docker-${{ runner.os }}-${{ hashFiles('Dockerfile') }}
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
       - name: build docker
         run: docker compose build

@@ -17,8 +17,10 @@ Options:
                                        llel                [number] [default: 1]
   --crawlId, --id                      A user provided ID for this crawl or
                                        crawl configuration (can also be se
-                                       t via CRAWL_ID env var, defaults to
-                                       Docker container hostname) [string]
+                                       t via CRAWL_ID env var), defaults to
+                                       combination of Docker container hos
+                                       tname and collection
+                                       [string] [default: "@hostname-@id"]
   --waitUntil                          Puppeteer page.goto() condition to w
                                        ait for before continuing, can be mu
                                        ltiple separated by ','
@@ -72,17 +74,14 @@ Options:
   --adBlockMessage                     If specified, when an ad is blocked,
                                        a record with this error message is
                                        added instead[string] [default: ""]
-  -c, --collection                     Collection name to crawl to (replay
-                                       will be accessible under this name i
-                                       n pywb preview)
-                                       [string] [default: "crawl-@ts"]
+  -c, --collection                     Collection name / directory to crawl
+                                       into[string] [default: "crawl-@ts"]
   --headless                           Run in headless mode, otherwise star
                                        t xvfb [boolean] [default: false]
   --driver                             Custom driver for the crawler, if an
                                        y [string]
-  --generateCDX, --generatecdx, --gene If set, generate index (CDXJ) for us
-                                       rateCdx e with pywb after crawl is done
-                                       [boolean] [default: false]
+  --generateCDX, --generatecdx, --gene If set, generate merged index in CDX
+                                       rateCdx J format [boolean] [default: false]
   --combineWARC, --combinewarc, --comb If set, combine the warcs
                                        ineWarc [boolean] [default: false]
   --rolloverSize                       If set, declare the rollover size
@@ -116,9 +115,8 @@ Options:
                                        record(s)
            [array] [choices: "to-pages", "to-warc", "final-to-warc"]
   --cwd                                Crawl working directory for captures
-                                       (pywb root). If not set, defaults t
-                                       o process.cwd()
-                                       [string] [default: "/crawls"]
+                                       . If not set, defaults to process.cw
+                                       d()      [string] [default: "/crawls"]
   --mobileDevice                       Emulate mobile device by name from:
                                        https://github.com/puppeteer/puppete
                                        er/blob/main/src/common/DeviceDescri

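For context on the documented options above, a crawl can be launched the way the test suite does further below. This is a sketch only: the image name and flags come from this repo, while the URL, collection name, and limit are placeholder values.

import child_process from "child_process";

// Sketch: start a crawl with an explicit collection and CDXJ indexing.
// With no --id / CRAWL_ID given, the crawl id now defaults to "@hostname-@id",
// i.e. the container hostname plus the resolved collection name.
const cmd =
  "docker run -d webrecorder/browsertrix-crawler crawl " +
  "--url https://example.com/ --collection my-coll --generateCDX --limit 10";
const containerId = child_process.execSync(cmd, { encoding: "utf-8" }).trim();
console.log("started crawl container:", containerId);
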
@@ -204,6 +204,8 @@ export class Crawler {
     this.params = args as CrawlerArgs;
     this.origConfig = this.params.origConfig;
 
+    this.crawlId = this.params.crawlId;
+
     // root collections dir
     this.collDir = path.join(
       this.params.cwd,
@@ -213,7 +215,7 @@ export class Crawler {
     this.logDir = path.join(this.collDir, "logs");
     this.logFilename = path.join(
       this.logDir,
-      `crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`,
+      `${interpolateFilename("@ts", "")}.log`,
     );
 
     const debugLogging = this.params.logging.includes("debug");
@@ -243,8 +245,6 @@ export class Crawler {
     // pages file
     this.pagesFH = null;
 
-    this.crawlId = this.params.crawlId;
-
     this.startTime = Date.now();
 
     // was the limit hit?
@@ -2739,13 +2739,11 @@ self.__bx_behaviors.selectMainBehavior();
 
     this.lastSaveTime = now.getTime();
 
-    const ts = now.toISOString().slice(0, 19).replace(/[T:-]/g, "");
-
     const crawlDir = path.join(this.collDir, "crawls");
 
     await fsp.mkdir(crawlDir, { recursive: true });
 
-    const filenameOnly = `crawl-${ts}-${this.crawlId}.yaml`;
+    const filenameOnly = `${interpolateFilename("@ts-@id", this.crawlId)}.yaml`;
 
     const filename = path.join(crawlDir, filenameOnly);
 

@@ -1,6 +1,5 @@
 import path from "path";
 import fs from "fs";
-import os from "os";
 
 import yaml from "js-yaml";
 import { KnownDevices as devices } from "puppeteer-core";
@@ -18,6 +17,7 @@ import {
   ExtractSelector,
   DEFAULT_MAX_RETRIES,
   BxFunctionBindings,
+  DEFAULT_CRAWL_ID_TEMPLATE,
 } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { interpolateFilename } from "./storage.js";
@@ -85,7 +85,7 @@ class ArgParser {
       crawlId: {
         alias: "id",
         describe:
-          "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to Docker container hostname)",
+          "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var), defaults to combination of Docker container hostname and collection",
         type: "string",
       },
 
@@ -213,8 +213,7 @@ class ArgParser {
 
       collection: {
         alias: "c",
-        describe:
-          "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
+        describe: "Collection name / directory to crawl into",
         type: "string",
         default: "crawl-@ts",
       },
@@ -232,8 +231,7 @@ class ArgParser {
 
      generateCDX: {
         alias: ["generatecdx", "generateCdx"],
-        describe:
-          "If set, generate index (CDXJ) for use with pywb after crawl is done",
+        describe: "If set, generate merged index in CDXJ format",
         type: "boolean",
         default: false,
       },
@@ -309,7 +307,7 @@ class ArgParser {
 
      cwd: {
         describe:
-          "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
+          "Crawl working directory for captures. If not set, defaults to process.cwd()",
        type: "string",
        default: process.cwd(),
      },
@@ -697,8 +695,11 @@ class ArgParser {
 
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
   validateArgs(argv: any, isQA: boolean) {
-    argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname();
-    argv.collection = interpolateFilename(argv.collection, argv.crawlId);
+    argv.collection = interpolateFilename(argv.collection, "");
+    argv.crawlId = interpolateFilename(
+      argv.crawlId || process.env.CRAWL_ID || DEFAULT_CRAWL_ID_TEMPLATE,
+      argv.collection,
+    );
 
     // css selector parser
     const parser = createParser();

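To make the reordering in validateArgs() above concrete, here is a hedged walk-through. The call signature of interpolateFilename() matches the diff; the placeholder behavior is inferred from the commit message, and the resulting values are illustrative only.

import { interpolateFilename } from "./storage.js"; // helper already used by the crawler

// Step 1: resolve the collection first, with no id available,
// so the collection template may only use "@hostname" / "@ts".
const collection = interpolateFilename("crawl-@ts", "");
// e.g. "crawl-20251019062316"

// Step 2: resolve the crawl id, passing the resolved collection as the "@id" value
// (DEFAULT_CRAWL_ID_TEMPLATE is "@hostname-@id").
const crawlId = interpolateFilename(
  process.env.CRAWL_ID || "@hostname-@id",
  collection,
);
// e.g. "3f2a1b9c0d1e-crawl-20251019062316"
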
@@ -50,6 +50,8 @@ export const DEFAULT_SELECTORS: ExtractSelector[] = [
   },
 ];
 
+export const DEFAULT_CRAWL_ID_TEMPLATE = "@hostname-@id";
+
 export const BEHAVIOR_TYPES = [
   "autoplay",
   "autofetch",

@@ -26,7 +26,7 @@ test("check that log files exist and were filtered according to options", () =>
   const logDir = "test-crawls/collections/wr-specs-logs/logs/";
   const logFiles = [];
   fs.readdirSync(logDir).forEach((file) => {
-    if (file.startsWith("crawl-") && file.endsWith(".log")) {
+    if (file.endsWith(".log")) {
      logFiles.push(path.join(logDir, file));
    }
  });

@@ -30,7 +30,9 @@ async function waitContainer(containerId) {
 }
 
 async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") {
-  const containerId = child_process.execSync(`docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`, {encoding: "utf-8"});
+  const command = `docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`;
+  console.log(command);
+  const containerId = child_process.execSync(command, {encoding: "utf-8"});
 
   await sleep(3000);
 
@@ -46,6 +48,9 @@ async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessTh
   while (true) {
     finished = await redis.zcard("test:q");
 
+    if (await redis.get("test:sitemapDone")) {
+      break;
+    }
     if (finished >= numExpected) {
       break;
     }
@@ -64,7 +69,7 @@ async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessTh
 }
 
 test("test sitemap fully finish", async () => {
-  await runCrawl(7000, "https://www.mozilla.org/", "", 0);
+  await runCrawl(3500, "https://www.mozilla.org/", "", 0);
 });
 
 test("test sitemap with limit", async () => {
@@ -79,7 +84,6 @@ test("test sitemap with application/xml content-type", async () => {
   await runCrawl(10, "https://bitarchivist.net/", "", 0);
 });
 
-
 test("test sitemap with narrow scope, extraHops, to ensure out-of-scope sitemap URLs do not count as extraHops", async () => {
-  await runCrawl(1, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
+  await runCrawl(0, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
 });