Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 14:33:17 +00:00)
add --dryRun flag and mode (#594)
- If set, runs the crawl but doesn't store any archive data (WARCs, WACZ, CDXJ), while logs and pages are still written and saved state can still be generated (per the --saveState options).
- Adds a test to ensure only the 'logs' and 'pages' dirs are generated with --dryRun.
- Screenshots and text extraction are skipped altogether in dryRun mode; a warning is printed that storage- and archiving-related options may be ignored.
- Fixes #593
Parent: 32435bfac7
Commit: b83d1c58da

4 changed files with 70 additions and 22 deletions
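For orientation, a dry-run invocation might look like the sketch below; the URL, volume mount, and collection name are illustrative placeholders rather than anything from this commit. After the crawl, the collection directory should contain only logs and pages.

// Hypothetical dry-run invocation (paths, URL and collection name are examples):
// archive data (WARCs, WACZ, CDXJ) is skipped, logs and pages are still written.
import { execSync } from "child_process";
import { readdirSync } from "fs";

execSync(
  "docker run -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler crawl " +
    "--url https://example.com/ --limit 2 --collection dry-run-demo --dryRun",
  { stdio: "inherit" },
);

console.log(readdirSync("crawls/collections/dry-run-demo").sort()); // [ 'logs', 'pages' ]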
@@ -144,6 +144,11 @@ Options:
      ...age behavior will run on each page. If 0, a behavior can run until finish.  [number] [default: 90]
+  --postLoadDelay            If >0, amount of time to sleep (in seconds) after page has loaded, before taking screenshots / getting text / running behaviors  [number] [default: 0]
   --pageExtraDelay, --delay  If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page
@@ -227,16 +232,19 @@ Options:
   --writePagesToRedis     If set, write page objects to redis  [boolean] [default: false]
-  --failOnFailedSeed      If set, crawler will fail with exit code 1 if any seed fails  [boolean] [default: false]
+  --failOnFailedSeed      If set, crawler will fail with exit code 1 if any seed fails. When combined with --failOnInvalidStatus, will result in crawl failing with exit code 1 if any seed has a 4xx/5xx response  [boolean] [default: false]
   --failOnFailedLimit     If set, save state and exit if number of failed pages exceeds this value  [number] [default: 0]
-  --failOnInvalidStatus   If set, will treat pages with non-200 response as failures. When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses  [boolean] [default: false]
+  --failOnInvalidStatus   If set, will treat pages with 4xx or 5xx response as failures. When combined with --failOnFailedLimit or --failOnFailedSeed may result in crawl failing due to non-200 responses  [boolean] [default: false]
   --customBehaviors       injects a custom behavior file or set of behavior files in a directory
@@ -250,6 +258,8 @@ Options:
      ...nabled, or disabled with custom profile  [choices: "disabled", "disabled-if-profile", "enabled"] [default: "disabled"]
+  --dryRun             If true, no data is written to disk, only logs  [boolean]
   --qaSource           Required for QA mode. Source (WACZ or multi WACZ) for QA  [string]
   --qaDebugImageDiff   if specified, will write crawl.png, ...
@@ -269,7 +279,8 @@ Options:
   --password          The password for the login. If not specified, will be prompted (recommended)
-  --filename          The filename for the profile tarball
+  --filename          The filename for the profile tarball, stored within /crawls/profiles if absolute path not provided
                       [default: "/crawls/profiles/profile.tar.gz"]
   --debugScreenshot   If specified, take a screenshot after login and save as this filename
@@ -177,7 +177,7 @@ export class Crawler {
     crawler: Crawler;
   }) => NonNullable<unknown>;

-  recording = true;
+  recording: boolean;

   constructor() {
     const args = this.parseArgs();
@@ -211,6 +211,13 @@ export class Crawler {

     logger.debug("Writing log to: " + this.logFilename, {}, "general");

+    this.recording = !this.params.dryRun;
+    if (this.params.dryRun) {
+      logger.warn(
+        "Dry run mode: no archived data stored, only pages and logging. Storage and archive creation related options will be ignored.",
+      );
+    }
+
     this.headers = {};

     // pages file
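Restated outside the diff, the wiring above boils down to deriving a recording boolean from the parsed flag once, in the constructor, and warning the operator. The following is a simplified sketch with stand-in types and console logging, not the actual Crawler class:

// Simplified sketch of the constructor wiring; names mirror the diff but the
// class and logging here are stand-ins, not browsertrix-crawler's real code.
type CrawlParams = { dryRun?: boolean };

class MiniCrawler {
  recording: boolean;

  constructor(readonly params: CrawlParams) {
    this.recording = !params.dryRun;
    if (params.dryRun) {
      console.warn(
        "Dry run mode: no archived data stored, only pages and logging. " +
          "Storage and archive creation related options will be ignored.",
      );
    }
  }
}

const crawler = new MiniCrawler({ dryRun: true });
console.log(crawler.recording); // false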
@@ -439,9 +446,12 @@ export class Crawler {
     subprocesses.push(this.launchRedis());

     await fsp.mkdir(this.logDir, { recursive: true });

+    if (!this.params.dryRun) {
       await fsp.mkdir(this.archivesDir, { recursive: true });
       await fsp.mkdir(this.tempdir, { recursive: true });
       await fsp.mkdir(this.tempCdxDir, { recursive: true });
+    }

     this.logFH = fs.createWriteStream(this.logFilename, { flags: "a" });
     logger.setExternalLogStream(this.logFH);
@@ -503,10 +513,10 @@ export class Crawler {
       );
     }

-    if (this.params.screenshot) {
+    if (this.params.screenshot && !this.params.dryRun) {
       this.screenshotWriter = this.createExtraResourceWarcWriter("screenshots");
     }
-    if (this.params.text) {
+    if (this.params.text && !this.params.dryRun) {
       this.textWriter = this.createExtraResourceWarcWriter("text");
     }
   }
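The writer changes above repeat one guard: a feature flag alone is no longer enough, the crawl must also not be a dry run. A generic form of that guard, with a hypothetical factory argument standing in for createExtraResourceWarcWriter, might look like:

// Generic sketch of the guard: create an output writer only when its feature
// flag is set AND the crawl is not a dry run. "makeWriter" is a hypothetical
// factory standing in for createExtraResourceWarcWriter().
type Params = { dryRun?: boolean };

function maybeCreateWriter<T>(
  featureEnabled: boolean,
  params: Params,
  makeWriter: () => T,
): T | null {
  return featureEnabled && !params.dryRun ? makeWriter() : null;
}

// e.g. screenshotWriter = maybeCreateWriter(Boolean(params.screenshot), params,
//        () => createExtraResourceWarcWriter("screenshots"));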
@@ -1089,7 +1099,7 @@ self.__bx_behaviors.selectMainBehavior();
   async checkLimits() {
     let interrupt = false;

-    const size = await getDirSize(this.archivesDir);
+    const size = this.params.dryRun ? 0 : await getDirSize(this.archivesDir);

     await this.crawlState.setArchiveSize(size);
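checkLimits() measures the archives directory for size-based limits; in dry-run mode nothing is written there, so the size is pinned to 0 instead of walking the directory. A recursive helper in the spirit of getDirSize() could look like the following (an assumed implementation for illustration, not the crawler's actual util):

// Hypothetical recursive directory-size helper, summing file sizes in bytes.
import { promises as fsp } from "fs";
import path from "path";

async function dirSize(dir: string): Promise<number> {
  let total = 0;
  for (const entry of await fsp.readdir(dir, { withFileTypes: true })) {
    const full = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      total += await dirSize(full); // recurse into subdirectories
    } else if (entry.isFile()) {
      total += (await fsp.stat(full)).size; // add file size in bytes
    }
  }
  return total;
}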
@@ -1389,11 +1399,11 @@ self.__bx_behaviors.selectMainBehavior();
   }

   async postCrawl() {
-    if (this.params.combineWARC) {
+    if (this.params.combineWARC && !this.params.dryRun) {
       await this.combineWARC();
     }

-    if (this.params.generateCDX) {
+    if (this.params.generateCDX && !this.params.dryRun) {
       logger.info("Generating CDX");
       await fsp.mkdir(path.join(this.collDir, "indexes"), { recursive: true });
       await this.crawlState.setStatus("generate-cdx");
@@ -1425,6 +1435,7 @@ self.__bx_behaviors.selectMainBehavior();

     if (
       this.params.generateWACZ &&
+      !this.params.dryRun &&
       (!this.interrupted || this.finalExit || this.uploadAndDeleteLocal)
     ) {
       const uploaded = await this.generateWACZ();
@@ -2125,12 +2136,14 @@ self.__bx_behaviors.selectMainBehavior();
     let { ts } = state;
     if (!ts) {
       ts = new Date();
+      if (!this.params.dryRun) {
         logger.warn(
           "Page date missing, setting to now",
           { url, ts },
           "pageStatus",
         );
+      }
     }

     row.ts = ts.toISOString();
@@ -545,6 +545,12 @@ class ArgParser {
         default: "disabled",
       },

+      dryRun: {
+        describe:
+          "If true, no archive data is written to disk, only pages and logs (and optionally saved state).",
+        type: "boolean",
+      },
+
       qaSource: {
         describe: "Required for QA mode. Source (WACZ or multi WACZ) for QA",
         type: "string",
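The options table above is consumed by yargs; declared this way, dryRun stays undefined unless the flag is passed. A minimal standalone sketch, assuming a plain yargs v17 setup rather than the crawler's full ArgParser:

// Minimal yargs sketch showing how a boolean --dryRun flag is declared and parsed.
import yargs from "yargs";
import { hideBin } from "yargs/helpers";

const argv = yargs(hideBin(process.argv))
  .option("dryRun", {
    describe:
      "If true, no archive data is written to disk, only pages and logs (and optionally saved state).",
    type: "boolean",
  })
  .parseSync();

console.log(argv.dryRun); // true when run with --dryRun, otherwise undefined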
tests/dryrun.test.js  (new file, 18 lines)

@@ -0,0 +1,18 @@
+import child_process from "child_process";
+import fs from "fs";
+
+test("ensure dryRun crawl only writes pages and logs", async () => {
+  child_process.execSync(
+    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --collection dry-run-wr-net --combineWARC --rolloverSize 10000 --limit 2 --title "test title" --description "test description" --warcPrefix custom-prefix --dryRun',
+  );
+
+  const files = fs.readdirSync("test-crawls/collections/dry-run-wr-net").sort();
+  expect(files.length).toBe(2);
+  expect(files[0]).toBe("logs");
+  expect(files[1]).toBe("pages");
+});
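Note that the test deliberately passes archive-producing options (--generateWACZ, --text, --combineWARC, --rolloverSize, --warcPrefix) alongside --dryRun, so the assertion that only 'logs' and 'pages' exist also confirms those options are ignored in dry-run mode.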