crawler args typing (#680)

- Refactors args parsing so that `Crawler.params` is properly typed with the
CLI options plus additions via the `CrawlerArgs` type (pattern sketched below)
- also adds typing to create-login-profile CLI options
- validation still done w/o typing due to yargs limitations
- tests: exclude a slow page from test crawls for faster test runs
Ilya Kreymer 2024-09-05 18:10:27 -07:00 committed by GitHub
parent 802a416c7e
commit 9c9643c24f
14 changed files with 686 additions and 642 deletions
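
The typing pattern applied here can be summarized with a minimal standalone sketch (simplified option set and module layout, not the crawler's actual code; the option names and `crawlId` are borrowed from the real CLI for illustration): declaring options inline in yargs' `.options()` lets `.parseSync()` return an inferred object type, and `CrawlerArgs` extends that inferred type with fields computed during validation, replacing the old `params: any`.

import yargs from "yargs";
import { hideBin } from "yargs/helpers";

// Options declared inline so TypeScript can infer the parsed shape.
function parseArgs(argv: string[]) {
  return yargs(hideBin(argv))
    .usage("crawler [options]")
    .options({
      collection: { type: "string", default: "crawl" },
      workers: { type: "number", default: 1 },
      blockMessage: { type: "string", default: "" },
    })
    .parseSync();
}

// Derive the params type from the parser itself, then extend it with
// fields added later during validation (mirrors CrawlerArgs in argParser.ts).
type CrawlerArgs = ReturnType<typeof parseArgs> & {
  crawlId: string;
};

const params: CrawlerArgs = {
  ...parseArgs(process.argv),
  crawlId: process.env.CRAWL_ID || "example-crawl",
};

// collection is a string, workers a number: no `any` casts needed at call sites.
console.log(params.collection, params.workers, params.blockMessage, params.crawlId);

Validation itself still mutates the parsed argv without type checking (hence `validateArgs(argv: any, ...)` and the handful of `as` casts in the diff), which is the yargs limitation noted in the commit message.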

View file

@ -43,9 +43,9 @@ Options:
tom"]
--scopeIncludeRx, --include Regex of page URLs that should be in
cluded in the crawl (defaults to the
immediate directory of URL)
immediate directory of URL)[string]
--scopeExcludeRx, --exclude Regex of page URLs that should be ex
cluded from the crawl.
cluded from the crawl. [string]
--allowHashUrls Allow Hashtag URLs, useful for singl
e-page-application crawling or when
different hashtags load dynamic cont
@ -56,14 +56,14 @@ Options:
an iframe [array] [default: []]
--blockMessage If specified, when a URL is blocked,
a record with this error message is
added instead [string]
added instead[string] [default: ""]
--blockAds, --blockads If set, block advertisements from be
ing loaded (based on Stephen Black's
blocklist)
[boolean] [default: false]
--adBlockMessage If specified, when an ad is blocked,
a record with this error message is
added instead [string]
added instead[string] [default: ""]
-c, --collection Collection name to crawl to (replay
will be accessible under this name i
n pywb preview)
@ -79,7 +79,7 @@ Options:
ineWarc [boolean] [default: false]
--rolloverSize If set, declare the rollover size
[number] [default: 1000000000]
--generateWACZ, --generatewacz, --ge If set, generate wacz
--generateWACZ, --generatewacz, --ge If set, generate WACZ on disk
nerateWacz [boolean] [default: false]
--logging Logging options for crawler, can inc
lude: stats (enabled by default), js
@ -94,15 +94,15 @@ Options:
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l
inks", "sitemap", "replay", "proxy"] [default: []]
inks", "sitemap", "wacz", "replay", "proxy"] [default: []]
--logExcludeContext Comma-separated list of contexts to
NOT include in logs
[array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l
inks", "sitemap", "replay", "proxy"] [default: ["recorderNetwork","jsError","s
creencast"]]
inks", "sitemap", "wacz", "replay", "proxy"] [default: ["recorderNetwork","jsE
rror","screencast"]]
--text Extract initial (default) or final t
ext to pages.jsonl or WARC resource
record(s)
@ -127,15 +127,15 @@ Options:
those greater than or equal to (>=)
provided ISO Date string (YYYY-MM-D
D or YYYY-MM-DDTHH:MM:SS or partial
date)
date) [string]
--sitemapToDate, --sitemapTo If set, filter URLs from sitemaps to
those less than or equal to (<=) pr
ovided ISO Date string (YYYY-MM-DD o
r YYYY-MM-DDTHH:MM:SS or partial dat
e)
e) [string]
--statsFilename If set, output stats as JSON to this
file. (Relative filename resolves t
o crawl working directory)
o crawl working directory) [string]
--behaviors Which background behaviors to enable
on each page
[array] [choices: "autoplay", "autofetch", "autoscroll", "siteSpecific"] [defa
@ -304,7 +304,7 @@ Options:
--shutdownWait Shutdown browser in interactive after this many seco
nds, if no pings received [number] [default: 0]
--profile Path or HTTP(S) URL to tar.gz file which contains th
e browser profile directory [string]
e browser profile directory [string] [default: ""]
--windowSize Browser window dimensions, specified as: width,heigh
t [string] [default: "1360,1020"]
--cookieDays If >0, set all cookies, including session cookies, t

View file

@ -12,7 +12,7 @@ import {
WorkerId,
} from "./util/state.js";
import { parseArgs } from "./util/argParser.js";
import { CrawlerArgs, parseArgs } from "./util/argParser.js";
import yaml from "js-yaml";
@ -52,7 +52,7 @@ import {
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
} from "./util/constants.js";
import { AdBlockRules, BlockRules } from "./util/blockrules.js";
import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
import { OriginOverride } from "./util/originoverride.js";
import {
@ -107,8 +107,7 @@ type PageEntry = {
// ============================================================================
export class Crawler {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
params: any;
params: CrawlerArgs;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
origConfig: any;
@ -200,8 +199,8 @@ export class Crawler {
constructor() {
const args = this.parseArgs();
this.params = args.parsed;
this.origConfig = args.origConfig;
this.params = args as CrawlerArgs;
this.origConfig = this.params.origConfig;
// root collections dir
this.collDir = path.join(
@ -872,7 +871,7 @@ self.__bx_behaviors.selectMainBehavior();
const result = await timedRun(
directFetchCapture({ url, headers, cdp }),
this.params.timeout,
this.params.pageLoadTimeout,
"Direct fetch of page URL timed out",
logDetails,
"fetch",
@ -1396,7 +1395,7 @@ self.__bx_behaviors.selectMainBehavior();
if (this.params.blockRules && this.params.blockRules.length) {
this.blockRules = new BlockRules(
this.params.blockRules,
this.params.blockRules as BlockRuleDecl[],
this.captureBasePrefix,
this.params.blockMessage,
);
@ -1405,7 +1404,9 @@ self.__bx_behaviors.selectMainBehavior();
this.screencaster = this.initScreenCaster();
if (this.params.originOverride && this.params.originOverride.length) {
this.originOverride = new OriginOverride(this.params.originOverride);
this.originOverride = new OriginOverride(
this.params.originOverride as string[],
);
}
await this._addInitialSeeds();
@ -2183,7 +2184,7 @@ self.__bx_behaviors.selectMainBehavior();
id: "pages",
title,
};
header.hasText = this.params.text.includes("to-pages");
header.hasText = this.params.text.includes("to-pages") + "";
if (this.params.text.length) {
logger.debug("Text Extraction: " + this.params.text.join(","));
} else {
@ -2290,8 +2291,12 @@ self.__bx_behaviors.selectMainBehavior();
return;
}
const fromDate = this.params.sitemapFromDate;
const toDate = this.params.sitemapToDate;
const fromDate = this.params.sitemapFromDate
? new Date(this.params.sitemapFromDate)
: undefined;
const toDate = this.params.sitemapToDate
? new Date(this.params.sitemapToDate)
: undefined;
const headers = this.headers;
logger.info(

View file

@ -7,7 +7,7 @@ import http, { IncomingMessage, ServerResponse } from "http";
import readline from "readline";
import child_process from "child_process";
import yargs, { Options } from "yargs";
import yargs from "yargs";
import { logger } from "./util/logger.js";
@ -35,8 +35,10 @@ const behaviors = fs.readFileSync(
{ encoding: "utf8" },
);
function cliOpts(): { [key: string]: Options } {
return {
function initArgs() {
return yargs(process.argv)
.usage("browsertrix-crawler profile [options]")
.options({
url: {
describe: "The URL of the login page",
type: "string",
@ -46,22 +48,27 @@ function cliOpts(): { [key: string]: Options } {
user: {
describe:
"The username for the login. If not specified, will be prompted",
type: "string",
},
password: {
describe:
"The password for the login. If not specified, will be prompted (recommended)",
type: "string",
},
filename: {
describe:
"The filename for the profile tarball, stored within /crawls/profiles if absolute path not provided",
type: "string",
default: "/crawls/profiles/profile.tar.gz",
},
debugScreenshot: {
describe:
"If specified, take a screenshot after login and save as this filename",
type: "boolean",
default: false,
},
headless: {
@ -93,18 +100,19 @@ function cliOpts(): { [key: string]: Options } {
describe:
"Path or HTTP(S) URL to tar.gz file which contains the browser profile directory",
type: "string",
default: "",
},
windowSize: {
type: "string",
describe: "Browser window dimensions, specified as: width,height",
type: "string",
default: getDefaultWindowSize(),
},
cookieDays: {
type: "number",
describe:
"If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
type: "number",
default: 7,
},
@ -115,7 +123,8 @@ function cliOpts(): { [key: string]: Options } {
},
sshProxyPrivateKeyFile: {
describe: "path to SSH private key for SOCKS5 over SSH proxy connection",
describe:
"path to SSH private key for SOCKS5 over SSH proxy connection",
type: "string",
},
@ -124,7 +133,8 @@ function cliOpts(): { [key: string]: Options } {
"path to SSH known hosts file for SOCKS5 over SSH proxy connection",
type: "string",
},
};
})
.parseSync();
}
function getDefaultWindowSize() {
@ -140,10 +150,7 @@ function handleTerminate(signame: string) {
}
async function main() {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const params: any = yargs(process.argv)
.usage("browsertrix-crawler profile [options]")
.option(cliOpts()).argv;
const params = initArgs();
logger.setDebugLogging(true);

View file

@ -110,8 +110,8 @@ export class ReplayCrawler extends Crawler {
this.infoWriter = null;
this.includeRx = parseRx(this.params.include);
this.excludeRx = parseRx(this.params.include);
this.includeRx = parseRx(this.params.scopeIncludeRx);
this.excludeRx = parseRx(this.params.scopeExcludeRx);
}
async bootstrap(): Promise<void> {

View file

@ -4,7 +4,7 @@ import os from "os";
import yaml from "js-yaml";
import { KnownDevices as devices } from "puppeteer-core";
import yargs, { Options } from "yargs";
import yargs from "yargs";
import { hideBin } from "yargs/helpers";
import {
@ -19,17 +19,38 @@ import { screenshotTypes } from "./screenshots.js";
import {
DEFAULT_EXCLUDE_LOG_CONTEXTS,
LOG_CONTEXT_TYPES,
LogContext,
logger,
} from "./logger.js";
import { SaveState } from "./state.js";
// ============================================================================
export type CrawlerArgs = ReturnType<typeof parseArgs> & {
logContext: LogContext[];
logExcludeContext: LogContext[];
text: string[];
scopedSeeds: ScopedSeed[];
crawlId: string;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
origConfig: Record<string, any>;
state?: SaveState;
warcInfo?: Record<string, string>;
};
// ============================================================================
class ArgParser {
get cliOpts(): { [key: string]: Options } {
const coerce = (array: string[]) => {
initArgs(argv: string[]) {
const coerce = (array: string[]): string[] => {
return array.flatMap((v) => v.split(",")).filter((x) => !!x);
};
return {
return yargs(hideBin(argv))
.usage("crawler [options]")
.options({
seeds: {
alias: "url",
describe: "The URL to start crawling from",
@ -74,7 +95,8 @@ class ArgParser {
},
extraHops: {
describe: "Number of extra 'hops' to follow, beyond the current scope",
describe:
"Number of extra 'hops' to follow, beyond the current scope",
default: 0,
type: "number",
},
@ -119,11 +141,14 @@ class ArgParser {
alias: "include",
describe:
"Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
type: "string",
},
scopeExcludeRx: {
alias: "exclude",
describe: "Regex of page URLs that should be excluded from the crawl.",
describe:
"Regex of page URLs that should be excluded from the crawl.",
type: "string",
},
allowHashUrls: {
@ -142,6 +167,7 @@ class ArgParser {
describe:
"If specified, when a URL is blocked, a record with this error message is added instead",
type: "string",
default: "",
},
blockAds: {
@ -156,6 +182,7 @@ class ArgParser {
describe:
"If specified, when an ad is blocked, a record with this error message is added instead",
type: "string",
default: "",
},
collection: {
@ -289,15 +316,18 @@ class ArgParser {
alias: "sitemapFrom",
describe:
"If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
type: "string",
},
sitemapToDate: {
alias: "sitemapTo",
describe:
"If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
type: "string",
},
statsFilename: {
type: "string",
describe:
"If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)",
},
@ -584,7 +614,7 @@ class ArgParser {
"path to SSH known hosts file for SOCKS5 over SSH proxy connection",
type: "string",
},
};
});
}
parseArgs(argvParams?: string[], isQA = false) {
@ -601,9 +631,7 @@ class ArgParser {
let origConfig = {};
const parsed = yargs(hideBin(argv))
.usage("crawler [options]")
.option(this.cliOpts)
const parsed = this.initArgs(argv)
.config(
"config",
"Path to YAML config file",
@ -616,9 +644,12 @@ class ArgParser {
return origConfig;
},
)
.check((argv) => this.validateArgs(argv, isQA)).argv;
.check((argv) => this.validateArgs(argv, isQA))
.parseSync();
return { parsed, origConfig };
parsed.origConfig = origConfig;
return parsed;
}
splitCrawlArgsQuoteSafe(crawlArgs: string): string[] {
@ -629,8 +660,8 @@ class ArgParser {
}
// eslint-disable-next-line @typescript-eslint/no-explicit-any
validateArgs(argv: Record<string, any>, isQA: boolean) {
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname;
validateArgs(argv: any, isQA: boolean) {
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname();
argv.collection = interpolateFilename(argv.collection, argv.crawlId);
// Check that the collection name is valid.
@ -675,7 +706,8 @@ class ArgParser {
for (const seed of urlSeedFileList) {
if (seed) {
argv.seeds.push(seed);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(argv.seeds as any).push(seed);
}
}
}
@ -689,7 +721,7 @@ class ArgParser {
//logger.debug(`Set netIdleWait to ${argv.netIdleWait} seconds`);
}
argv.scopedSeeds = [];
const scopedSeeds: ScopedSeed[] = [];
if (!isQA) {
const scopeOpts = {
@ -701,24 +733,22 @@ class ArgParser {
extraHops: argv.extraHops,
};
for (let seed of argv.seeds) {
if (typeof seed === "string") {
seed = { url: seed };
}
for (const seed of argv.seeds) {
const newSeed = typeof seed === "string" ? { url: seed } : seed;
try {
argv.scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...seed }));
scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) {
logger.error("Failed to create seed", {
error: e.toString(),
...scopeOpts,
...seed,
...newSeed,
});
if (argv.failOnFailedSeed) {
logger.fatal(
"Invalid seed specified, aborting crawl",
{ url: seed.url },
{ url: newSeed.url },
"general",
1,
);
@ -726,13 +756,15 @@ class ArgParser {
}
}
if (!argv.scopedSeeds.length) {
if (!scopedSeeds.length) {
logger.fatal("No valid seeds specified, aborting crawl");
}
} else if (!argv.qaSource) {
logger.fatal("--qaSource required for QA mode");
}
argv.scopedSeeds = scopedSeeds;
// Resolve statsFilename
if (argv.statsFilename) {
argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);

View file

@ -18,7 +18,7 @@ const BlockState = {
BLOCK_AD: "advertisement",
};
type BlockRuleDecl = {
export type BlockRuleDecl = {
url?: string;
frameTextMatch?: string;
inFrameUrl?: string;

View file

@ -149,7 +149,7 @@ declare module "ioredis" {
}
// ============================================================================
type SaveState = {
export type SaveState = {
done?: number | string[];
finished: string[];
queued: string[];

View file

@ -3,7 +3,7 @@ import fs from "fs";
test("ensure dryRun crawl only writes pages and logs", async () => {
child_process.execSync(
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --collection dry-run-wr-net --combineWARC --rolloverSize 10000 --limit 2 --title "test title" --description "test description" --warcPrefix custom-prefix --dryRun',
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --collection dry-run-wr-net --combineWARC --rolloverSize 10000 --limit 2 --title "test title" --description "test description" --warcPrefix custom-prefix --dryRun --exclude community',
);
const files = fs.readdirSync("test-crawls/collections/dry-run-wr-net").sort();

View file

@ -3,7 +3,7 @@ import fs from "fs";
test("ensure that stats file is modified", async () => {
const child = child_process.exec(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --limit 3 --collection file-stats --statsFilename progress.json",
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --limit 3 --exclude community --collection file-stats --statsFilename progress.json",
);
// detect crawler exit

View file

@ -6,7 +6,7 @@ const exec = util.promisify(execCallback);
test("ensure page limit reached", async () => {
execSync(
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors "" --url https://webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json',
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors "" --url https://webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json --exclude community',
);
});

View file

@ -6,7 +6,7 @@ const testIf = (condition, ...args) => condition ? test(...args) : test.skip(...
test("ensure multi url crawl run with docker run passes", async () => {
child_process.execSync(
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2',
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2 --exclude community',
);
});

View file

@ -3,7 +3,7 @@ import fs from "fs";
test("set rollover to 500K and ensure individual WARCs rollover, including screenshots", async () => {
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --limit 5 --collection rollover-500K --rolloverSize 500000 --screenshot view"
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --limit 5 --exclude community --collection rollover-500K --rolloverSize 500000 --screenshot view"
);
const warcLists = fs.readdirSync("test-crawls/collections/rollover-500K/archive");

View file

@ -53,7 +53,7 @@ test("check crawl interrupted + saved state written", async () => {
try {
containerId = execSync(
"docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10 --behaviors \"\"",
"docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10 --behaviors \"\" --exclude community",
{ encoding: "utf-8" },
//wait.callback,
);
@ -129,7 +129,7 @@ test("check crawl restarted with saved state", async () => {
try {
containerId = execSync(
`docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors ""`,
`docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors "" --exclude community`,
{ encoding: "utf-8" },
);
} catch (error) {

View file

@ -13,7 +13,7 @@ function getSeeds(config) {
};
const res = parseArgs(["node", "crawler", "--config", "stdinconfig"]);
return res.parsed.scopedSeeds;
return res.scopedSeeds;
}
test("default scope", async () => {