add --dryRun flag and mode (#594)

- if set, runs the crawl but doesn't store any archive data (WARCs,
WACZ, CDXJ); logs and pages are still written, and saved state can still
be generated (per the --saveState options).
- adds test to ensure only 'logs' and 'pages' dirs are generated with --dryRun
- screenshot capture and text extraction are skipped altogether in dryRun
mode; a warning is printed that storage- and archiving-related options
may be ignored
- fixes #593
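
A minimal dry-run invocation, adapted from the test added in this commit
(the collection name and page limit here are arbitrary):

    docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler \
      crawl --url https://webrecorder.net/ --collection dry-run-wr-net \
      --limit 2 --dryRun

Afterwards only logs/ and pages/ exist under
test-crawls/collections/dry-run-wr-net; no WARC, CDXJ, or WACZ data is
written.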
Ilya Kreymer authored 2024-06-07 10:34:19 -07:00 (committed by GitHub)
parent 32435bfac7
commit b83d1c58da
4 changed files with 70 additions and 22 deletions

docs/docs/user-guide/cli-options.md

@@ -144,6 +144,11 @@ Options:
 age behavior will run on each page.
 If 0, a behavior can run until finis
 h. [number] [default: 90]
+--postLoadDelay If >0, amount of time to sleep (in s
+econds) after page has loaded, befor
+e taking screenshots / getting text
+/ running behaviors
+[number] [default: 0]
 --pageExtraDelay, --delay If >0, amount of time to sleep (in s
 econds) after behaviors before movin
 g on to next page
@@ -227,16 +232,19 @@ Options:
 --writePagesToRedis If set, write page objects to redis
 [boolean] [default: false]
 --failOnFailedSeed If set, crawler will fail with exit
-code 1 if any seed fails
-[boolean] [default: false]
+code 1 if any seed fails. When combi
+ned with --failOnInvalidStatus,will
+result in crawl failing with exit co
+de 1 if any seed has a 4xx/5xx respo
+nse [boolean] [default: false]
 --failOnFailedLimit If set, save state and exit if numbe
 r of failed pages exceeds this value
 [number] [default: 0]
---failOnInvalidStatus If set, will treat pages with non-20
-0 response as failures. When combine
-d with --failOnFailedLimit or --fail
-OnFailedSeedmay result in crawl fail
-ing due to non-200 responses
+--failOnInvalidStatus If set, will treat pages with 4xx or
+5xx response as failures. When comb
+ined with --failOnFailedLimit or --f
+ailOnFailedSeed may result in crawl
+failing due to non-200 responses
 [boolean] [default: false]
 --customBehaviors injects a custom behavior file or se
 t of behavior files in a directory
@@ -250,6 +258,8 @@ Options:
 nabled, or disabled with custom prof
 ile
 [choices: "disabled", "disabled-if-profile", "enabled"] [default: "disabled"]
+--dryRun If true, no data is written to disk,
+only logs [boolean]
 --qaSource Required for QA mode. Source (WACZ o
 r multi WACZ) for QA [string]
 --qaDebugImageDiff if specified, will write crawl.png,
@@ -269,7 +279,8 @@ Options:
 ted
 --password The password for the login. If not specified, will be promp
 ted (recommended)
---filename The filename for the profile tarball
+--filename The filename for the profile tarball, stored within /crawls
+/profiles if absolute path not provided
 [default: "/crawls/profiles/profile.tar.gz"]
 --debugScreenshot If specified, take a screenshot after login and save as thi
 s filename

src/crawler.ts

@@ -177,7 +177,7 @@ export class Crawler {
     crawler: Crawler;
   }) => NonNullable<unknown>;

-  recording = true;
+  recording: boolean;

   constructor() {
     const args = this.parseArgs();
@@ -211,6 +211,13 @@ export class Crawler {
     logger.debug("Writing log to: " + this.logFilename, {}, "general");

+    this.recording = !this.params.dryRun;
+    if (this.params.dryRun) {
+      logger.warn(
+        "Dry run mode: no archived data stored, only pages and logging. Storage and archive creation related options will be ignored.",
+      );
+    }
+
     this.headers = {};

     // pages file
@@ -439,9 +446,12 @@
     subprocesses.push(this.launchRedis());

     await fsp.mkdir(this.logDir, { recursive: true });
-    await fsp.mkdir(this.archivesDir, { recursive: true });
-    await fsp.mkdir(this.tempdir, { recursive: true });
-    await fsp.mkdir(this.tempCdxDir, { recursive: true });
+
+    if (!this.params.dryRun) {
+      await fsp.mkdir(this.archivesDir, { recursive: true });
+      await fsp.mkdir(this.tempdir, { recursive: true });
+      await fsp.mkdir(this.tempCdxDir, { recursive: true });
+    }

     this.logFH = fs.createWriteStream(this.logFilename, { flags: "a" });
     logger.setExternalLogStream(this.logFH);
@@ -503,10 +513,10 @@
       );
     }

-    if (this.params.screenshot) {
+    if (this.params.screenshot && !this.params.dryRun) {
       this.screenshotWriter = this.createExtraResourceWarcWriter("screenshots");
     }
-    if (this.params.text) {
+    if (this.params.text && !this.params.dryRun) {
       this.textWriter = this.createExtraResourceWarcWriter("text");
     }
   }
@@ -1089,7 +1099,7 @@ self.__bx_behaviors.selectMainBehavior();
   async checkLimits() {
     let interrupt = false;

-    const size = await getDirSize(this.archivesDir);
+    const size = this.params.dryRun ? 0 : await getDirSize(this.archivesDir);

     await this.crawlState.setArchiveSize(size);
@@ -1389,11 +1399,11 @@ self.__bx_behaviors.selectMainBehavior();
   }

   async postCrawl() {
-    if (this.params.combineWARC) {
+    if (this.params.combineWARC && !this.params.dryRun) {
       await this.combineWARC();
     }

-    if (this.params.generateCDX) {
+    if (this.params.generateCDX && !this.params.dryRun) {
       logger.info("Generating CDX");
       await fsp.mkdir(path.join(this.collDir, "indexes"), { recursive: true });
       await this.crawlState.setStatus("generate-cdx");
@@ -1425,6 +1435,7 @@ self.__bx_behaviors.selectMainBehavior();
     if (
       this.params.generateWACZ &&
+      !this.params.dryRun &&
       (!this.interrupted || this.finalExit || this.uploadAndDeleteLocal)
     ) {
       const uploaded = await this.generateWACZ();
@@ -2125,12 +2136,14 @@ self.__bx_behaviors.selectMainBehavior();
     let { ts } = state;
     if (!ts) {
       ts = new Date();
+      if (!this.params.dryRun) {
         logger.warn(
           "Page date missing, setting to now",
           { url, ts },
           "pageStatus",
         );
+      }
     }

     row.ts = ts.toISOString();

src/util/argParser.ts

@@ -545,6 +545,12 @@ class ArgParser {
         default: "disabled",
       },

+      dryRun: {
+        describe:
+          "If true, no archive data is written to disk, only pages and logs (and optionally saved state).",
+        type: "boolean",
+      },
+
       qaSource: {
         describe: "Required for QA mode. Source (WACZ or multi WACZ) for QA",
         type: "string",

tests/dryrun.test.js (new file, 18 lines)

@@ -0,0 +1,18 @@
+import child_process from "child_process";
+import fs from "fs";
+
+test("ensure dryRun crawl only writes pages and logs", async () => {
+  child_process.execSync(
+    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --collection dry-run-wr-net --combineWARC --rolloverSize 10000 --limit 2 --title "test title" --description "test description" --warcPrefix custom-prefix --dryRun',
+  );
+
+  const files = fs.readdirSync("test-crawls/collections/dry-run-wr-net").sort();
+
+  expect(files.length).toBe(2);
+  expect(files[0]).toBe("logs");
+  expect(files[1]).toBe("pages");
+});