mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 06:23:16 +00:00)
crawler args typing (#680)
- Refactors args parsing so that `Crawler.params` is properly typed with CLI options + additions via the `CrawlerArgs` type
- Also adds typing to the create-login-profile CLI options
- Validation is still done without typing due to yargs limitations
- Tests: exclude the slow page from tests for faster test runs
parent 802a416c7e
commit 9c9643c24f
14 changed files with 686 additions and 642 deletions
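
The heart of the change: `Crawler.params` is now declared as `CrawlerArgs` (imported from `./util/argParser.js`) instead of `any`, and the parsed result is cast to that type because yargs cannot carry the option types through the custom validation step. A minimal sketch of the pattern, with an illustrative option subset and a stand-in `CrawlerArgs` (the real type lives in `argParser.ts` and covers every CLI option plus derived additions such as `scopedSeeds`):

import yargs from "yargs";

// Stand-in for the CrawlerArgs type exported by util/argParser.ts; the real
// type covers every CLI option plus derived fields such as scopedSeeds.
type CrawlerArgs = {
  url: string[];
  pageLoadTimeout: number;
  blockMessage: string;
};

export function parseArgs(argv: string[]): CrawlerArgs {
  const parsed = yargs(argv)
    .options({
      // illustrative subset of the crawl options
      url: { type: "string", array: true, default: [] as string[] },
      pageLoadTimeout: { type: "number", default: 90 },
      blockMessage: { type: "string", default: "" },
    })
    .parseSync();
  // Validation and coercion still run on the loosely typed result (the yargs
  // limitation noted above); the object is then cast, as in the constructor:
  //   this.params = args as CrawlerArgs;
  return parsed as unknown as CrawlerArgs;
}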
@@ -43,9 +43,9 @@ Options:
 tom"]
 --scopeIncludeRx, --include Regex of page URLs that should be in
 cluded in the crawl (defaults to the
-immediate directory of URL)
+immediate directory of URL)[string]
 --scopeExcludeRx, --exclude Regex of page URLs that should be ex
-cluded from the crawl.
+cluded from the crawl. [string]
 --allowHashUrls Allow Hashtag URLs, useful for singl
 e-page-application crawling or when
 different hashtags load dynamic cont
@@ -56,14 +56,14 @@ Options:
 an iframe [array] [default: []]
 --blockMessage If specified, when a URL is blocked,
 a record with this error message is
-added instead [string]
+added instead[string] [default: ""]
 --blockAds, --blockads If set, block advertisements from be
 ing loaded (based on Stephen Black's
 blocklist)
 [boolean] [default: false]
 --adBlockMessage If specified, when an ad is blocked,
 a record with this error message is
-added instead [string]
+added instead[string] [default: ""]
 -c, --collection Collection name to crawl to (replay
 will be accessible under this name i
 n pywb preview)
@@ -79,7 +79,7 @@ Options:
 ineWarc [boolean] [default: false]
 --rolloverSize If set, declare the rollover size
 [number] [default: 1000000000]
---generateWACZ, --generatewacz, --ge If set, generate wacz
+--generateWACZ, --generatewacz, --ge If set, generate WACZ on disk
 nerateWacz [boolean] [default: false]
 --logging Logging options for crawler, can inc
 lude: stats (enabled by default), js
@@ -94,15 +94,15 @@ Options:
 , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
 ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
 orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l
-inks", "sitemap", "replay", "proxy"] [default: []]
+inks", "sitemap", "wacz", "replay", "proxy"] [default: []]
 --logExcludeContext Comma-separated list of contexts to
 NOT include in logs
 [array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
 , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
 ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
 orScript", "jsError", "fetch", "pageStatus", "memoryStatus", "crawlStatus", "l
-inks", "sitemap", "replay", "proxy"] [default: ["recorderNetwork","jsError","s
-creencast"]]
+inks", "sitemap", "wacz", "replay", "proxy"] [default: ["recorderNetwork","jsE
+rror","screencast"]]
 --text Extract initial (default) or final t
 ext to pages.jsonl or WARC resource
 record(s)
@@ -127,15 +127,15 @@ Options:
 those greater than or equal to (>=)
 provided ISO Date string (YYYY-MM-D
 D or YYYY-MM-DDTHH:MM:SS or partial
-date)
+date) [string]
 --sitemapToDate, --sitemapTo If set, filter URLs from sitemaps to
 those less than or equal to (<=) pr
 ovided ISO Date string (YYYY-MM-DD o
 r YYYY-MM-DDTHH:MM:SS or partial dat
-e)
+e) [string]
 --statsFilename If set, output stats as JSON to this
 file. (Relative filename resolves t
-o crawl working directory)
+o crawl working directory) [string]
 --behaviors Which background behaviors to enable
 on each page
 [array] [choices: "autoplay", "autofetch", "autoscroll", "siteSpecific"] [defa
@@ -304,7 +304,7 @@ Options:
 --shutdownWait Shutdown browser in interactive after this many seco
 nds, if no pings received [number] [default: 0]
 --profile Path or HTTP(S) URL to tar.gz file which contains th
-e browser profile directory [string]
+e browser profile directory [string] [default: ""]
 --windowSize Browser window dimensions, specified as: width,heigh
 t [string] [default: "1360,1020"]
 --cookieDays If >0, set all cookies, including session cookies, t

@@ -12,7 +12,7 @@ import {
   WorkerId,
 } from "./util/state.js";
 
-import { parseArgs } from "./util/argParser.js";
+import { CrawlerArgs, parseArgs } from "./util/argParser.js";
 
 import yaml from "js-yaml";
 
@@ -52,7 +52,7 @@ import {
   SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
 } from "./util/constants.js";
 
-import { AdBlockRules, BlockRules } from "./util/blockrules.js";
+import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
 import { OriginOverride } from "./util/originoverride.js";
 
 import {
@@ -107,8 +107,7 @@ type PageEntry = {
 
 // ============================================================================
 export class Crawler {
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  params: any;
+  params: CrawlerArgs;
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
   origConfig: any;
 
@@ -200,8 +199,8 @@ export class Crawler {
 
   constructor() {
     const args = this.parseArgs();
-    this.params = args.parsed;
-    this.origConfig = args.origConfig;
+    this.params = args as CrawlerArgs;
+    this.origConfig = this.params.origConfig;
 
     // root collections dir
     this.collDir = path.join(
@@ -872,7 +871,7 @@ self.__bx_behaviors.selectMainBehavior();
 
     const result = await timedRun(
       directFetchCapture({ url, headers, cdp }),
-      this.params.timeout,
+      this.params.pageLoadTimeout,
       "Direct fetch of page URL timed out",
       logDetails,
       "fetch",
@@ -1396,7 +1395,7 @@ self.__bx_behaviors.selectMainBehavior();
 
     if (this.params.blockRules && this.params.blockRules.length) {
       this.blockRules = new BlockRules(
-        this.params.blockRules,
+        this.params.blockRules as BlockRuleDecl[],
         this.captureBasePrefix,
         this.params.blockMessage,
       );
@@ -1405,7 +1404,9 @@ self.__bx_behaviors.selectMainBehavior();
     this.screencaster = this.initScreenCaster();
 
     if (this.params.originOverride && this.params.originOverride.length) {
-      this.originOverride = new OriginOverride(this.params.originOverride);
+      this.originOverride = new OriginOverride(
+        this.params.originOverride as string[],
+      );
     }
 
     await this._addInitialSeeds();
@@ -2183,7 +2184,7 @@ self.__bx_behaviors.selectMainBehavior();
       id: "pages",
       title,
     };
-    header.hasText = this.params.text.includes("to-pages");
+    header.hasText = this.params.text.includes("to-pages") + "";
     if (this.params.text.length) {
       logger.debug("Text Extraction: " + this.params.text.join(","));
     } else {
@@ -2290,8 +2291,12 @@ self.__bx_behaviors.selectMainBehavior();
       return;
     }
 
-    const fromDate = this.params.sitemapFromDate;
-    const toDate = this.params.sitemapToDate;
+    const fromDate = this.params.sitemapFromDate
+      ? new Date(this.params.sitemapFromDate)
+      : undefined;
+    const toDate = this.params.sitemapToDate
+      ? new Date(this.params.sitemapToDate)
+      : undefined;
     const headers = this.headers;
 
     logger.info(

@@ -7,7 +7,7 @@ import http, { IncomingMessage, ServerResponse } from "http";
 import readline from "readline";
 import child_process from "child_process";
 
-import yargs, { Options } from "yargs";
+import yargs from "yargs";
 
 import { logger } from "./util/logger.js";
 
@@ -35,96 +35,106 @@ const behaviors = fs.readFileSync(
   { encoding: "utf8" },
 );
 
-function cliOpts(): { [key: string]: Options } {
-  return {
-    url: {
-      describe: "The URL of the login page",
-      type: "string",
-      demandOption: true,
-    },
+function initArgs() {
+  return yargs(process.argv)
+    .usage("browsertrix-crawler profile [options]")
+    .options({
+      url: {
+        describe: "The URL of the login page",
+        type: "string",
+        demandOption: true,
+      },
 
-    user: {
-      describe:
-        "The username for the login. If not specified, will be prompted",
-    },
+      user: {
+        describe:
+          "The username for the login. If not specified, will be prompted",
+        type: "string",
+      },
 
-    password: {
-      describe:
-        "The password for the login. If not specified, will be prompted (recommended)",
-    },
+      password: {
+        describe:
+          "The password for the login. If not specified, will be prompted (recommended)",
+        type: "string",
+      },
 
-    filename: {
-      describe:
-        "The filename for the profile tarball, stored within /crawls/profiles if absolute path not provided",
-      default: "/crawls/profiles/profile.tar.gz",
-    },
+      filename: {
+        describe:
+          "The filename for the profile tarball, stored within /crawls/profiles if absolute path not provided",
+        type: "string",
+        default: "/crawls/profiles/profile.tar.gz",
+      },
 
-    debugScreenshot: {
-      describe:
-        "If specified, take a screenshot after login and save as this filename",
-    },
+      debugScreenshot: {
+        describe:
+          "If specified, take a screenshot after login and save as this filename",
+        type: "boolean",
+        default: false,
+      },
 
-    headless: {
-      describe: "Run in headless mode, otherwise start xvfb",
-      type: "boolean",
-      default: false,
-    },
+      headless: {
+        describe: "Run in headless mode, otherwise start xvfb",
+        type: "boolean",
+        default: false,
+      },
 
-    automated: {
-      describe: "Start in automated mode, no interactive browser",
-      type: "boolean",
-      default: false,
-    },
+      automated: {
+        describe: "Start in automated mode, no interactive browser",
+        type: "boolean",
+        default: false,
+      },
 
-    interactive: {
-      describe: "Deprecated. Now the default option!",
-      type: "boolean",
-      default: false,
-    },
+      interactive: {
+        describe: "Deprecated. Now the default option!",
+        type: "boolean",
+        default: false,
+      },
 
-    shutdownWait: {
-      describe:
-        "Shutdown browser in interactive after this many seconds, if no pings received",
-      type: "number",
-      default: 0,
-    },
+      shutdownWait: {
+        describe:
+          "Shutdown browser in interactive after this many seconds, if no pings received",
+        type: "number",
+        default: 0,
+      },
 
-    profile: {
-      describe:
-        "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory",
-      type: "string",
-    },
+      profile: {
+        describe:
+          "Path or HTTP(S) URL to tar.gz file which contains the browser profile directory",
+        type: "string",
+        default: "",
+      },
 
-    windowSize: {
-      type: "string",
-      describe: "Browser window dimensions, specified as: width,height",
-      default: getDefaultWindowSize(),
-    },
+      windowSize: {
+        describe: "Browser window dimensions, specified as: width,height",
+        type: "string",
+        default: getDefaultWindowSize(),
+      },
 
-    cookieDays: {
-      type: "number",
-      describe:
-        "If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
-      default: 7,
-    },
+      cookieDays: {
+        describe:
+          "If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
        type: "number",
+        default: 7,
+      },
 
-    proxyServer: {
-      describe:
-        "if set, will use specified proxy server. Takes precedence over any env var proxy settings",
-      type: "string",
-    },
+      proxyServer: {
+        describe:
+          "if set, will use specified proxy server. Takes precedence over any env var proxy settings",
+        type: "string",
+      },
 
-    sshProxyPrivateKeyFile: {
-      describe: "path to SSH private key for SOCKS5 over SSH proxy connection",
-      type: "string",
-    },
+      sshProxyPrivateKeyFile: {
+        describe:
+          "path to SSH private key for SOCKS5 over SSH proxy connection",
+        type: "string",
+      },
 
-    sshProxyKnownHostsFile: {
-      describe:
-        "path to SSH known hosts file for SOCKS5 over SSH proxy connection",
-      type: "string",
-    },
-  };
+      sshProxyKnownHostsFile: {
+        describe:
+          "path to SSH known hosts file for SOCKS5 over SSH proxy connection",
+        type: "string",
+      },
+    })
+    .parseSync();
 }
 
 function getDefaultWindowSize() {
@@ -140,10 +150,7 @@ function handleTerminate(signame: string) {
 }
 
 async function main() {
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  const params: any = yargs(process.argv)
-    .usage("browsertrix-crawler profile [options]")
-    .option(cliOpts()).argv;
+  const params = initArgs();
 
   logger.setDebugLogging(true);
 
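
For create-login-profile, the options are no longer collected in a `{ [key: string]: Options }` map passed to `.option()`; `initArgs()` now declares them inline in `.options({...})` and calls `.parseSync()`, so yargs' TypeScript definitions can infer the shape of the parsed result and `main()` can drop `const params: any`. A reduced sketch of the effect (three options shown, descriptions abbreviated; the real `initArgs()` declares the full set shown in the hunk above):

import yargs from "yargs";

function initArgs() {
  return yargs(process.argv)
    .usage("browsertrix-crawler profile [options]")
    .options({
      url: { describe: "The URL of the login page", type: "string", demandOption: true },
      headless: { describe: "Run in headless mode", type: "boolean", default: false },
      cookieDays: { describe: "Cookie lifetime in days", type: "number", default: 7 },
    })
    .parseSync();
}

const params = initArgs();
// Inferred without a cast: params.url is string (demandOption), params.headless
// is boolean, params.cookieDays is number, so property access is type-checked.
console.log(params.url, params.headless, params.cookieDays);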
@@ -110,8 +110,8 @@ export class ReplayCrawler extends Crawler {
 
     this.infoWriter = null;
 
-    this.includeRx = parseRx(this.params.include);
-    this.excludeRx = parseRx(this.params.include);
+    this.includeRx = parseRx(this.params.scopeIncludeRx);
+    this.excludeRx = parseRx(this.params.scopeExcludeRx);
   }
 
   async bootstrap(): Promise<void> {
File diff suppressed because it is too large
@@ -18,7 +18,7 @@ const BlockState = {
   BLOCK_AD: "advertisement",
 };
 
-type BlockRuleDecl = {
+export type BlockRuleDecl = {
   url?: string;
   frameTextMatch?: string;
   inFrameUrl?: string;
@@ -149,7 +149,7 @@ declare module "ioredis" {
 }
 
 // ============================================================================
-type SaveState = {
+export type SaveState = {
   done?: number | string[];
   finished: string[];
   queued: string[];
@@ -3,7 +3,7 @@ import fs from "fs";
 
 test("ensure dryRun crawl only writes pages and logs", async () => {
   child_process.execSync(
-    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --collection dry-run-wr-net --combineWARC --rolloverSize 10000 --limit 2 --title "test title" --description "test description" --warcPrefix custom-prefix --dryRun',
+    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --collection dry-run-wr-net --combineWARC --rolloverSize 10000 --limit 2 --title "test title" --description "test description" --warcPrefix custom-prefix --dryRun --exclude community',
   );
 
   const files = fs.readdirSync("test-crawls/collections/dry-run-wr-net").sort();
@@ -3,7 +3,7 @@ import fs from "fs";
 
 test("ensure that stats file is modified", async () => {
   const child = child_process.exec(
-    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --limit 3 --collection file-stats --statsFilename progress.json",
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --limit 3 --exclude community --collection file-stats --statsFilename progress.json",
   );
 
   // detect crawler exit
@@ -6,7 +6,7 @@ const exec = util.promisify(execCallback);
 
 test("ensure page limit reached", async () => {
   execSync(
-    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors "" --url https://webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json',
+    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors "" --url https://webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json --exclude community',
   );
 });
 
@@ -6,7 +6,7 @@ const testIf = (condition, ...args) => condition ? test(...args) : test.skip(...
 
 test("ensure multi url crawl run with docker run passes", async () => {
   child_process.execSync(
-    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2',
+    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2 --exclude community',
   );
 });
 
@@ -3,7 +3,7 @@ import fs from "fs";
 
 test("set rollover to 500K and ensure individual WARCs rollover, including screenshots", async () => {
   child_process.execSync(
-    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --limit 5 --collection rollover-500K --rolloverSize 500000 --screenshot view"
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --limit 5 --exclude community --collection rollover-500K --rolloverSize 500000 --screenshot view"
   );
 
   const warcLists = fs.readdirSync("test-crawls/collections/rollover-500K/archive");
@@ -53,7 +53,7 @@ test("check crawl interrupted + saved state written", async () => {
 
   try {
     containerId = execSync(
-      "docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10 --behaviors \"\"",
+      "docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://www.webrecorder.net/ --limit 10 --behaviors \"\" --exclude community",
      { encoding: "utf-8" },
       //wait.callback,
     );
@@ -129,7 +129,7 @@ test("check crawl restarted with saved state", async () => {
 
   try {
     containerId = execSync(
-      `docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors ""`,
+      `docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors "" --exclude community`,
       { encoding: "utf-8" },
     );
   } catch (error) {
@@ -13,7 +13,7 @@ function getSeeds(config) {
   };
 
   const res = parseArgs(["node", "crawler", "--config", "stdinconfig"]);
-  return res.parsed.scopedSeeds;
+  return res.scopedSeeds;
 }
 
 test("default scope", async () => {
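
The last hunk follows from a change in the shape of what `parseArgs()` returns. The `argParser.ts` diff itself is suppressed above, so this is inferred from the call sites only: the old `{ parsed, origConfig }` wrapper appears to be gone, and the typed args object is returned directly, carrying `origConfig` and derived fields such as `scopedSeeds` on itself. A sketch of the assumed before/after shapes:

// Assumed shapes, reconstructed from call sites (not the actual argParser.ts types).
type CrawlOptions = Record<string, unknown>;

// before: a wrapper object
type ParseResultBefore = {
  parsed: CrawlOptions & { scopedSeeds: unknown[] };
  origConfig: Record<string, unknown>;
};

// after: the args object itself, with origConfig carried along
type ParseResultAfter = CrawlOptions & {
  scopedSeeds: unknown[];
  origConfig: Record<string, unknown>;
};

// which is why the test helper changes from res.parsed.scopedSeeds to res.scopedSeeds
// and the Crawler constructor now reads this.params.origConfig.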