browsertrix-crawler/tests/dryrun.test.js
Ilya Kreymer b83d1c58da
add --dryRun flag and mode (#594)
- if set, runs the crawl but doesn't store any archive data (WARCS,
WACZ, CDXJ) while logs and pages are still written, and saved state can be
generated (per the --saveState options).
- adds test to ensure only 'logs' and 'pages' dirs are generated with --dryRun
- screenshot and text extraction are skipped altogether in dryRun mode;
a warning is printed that storage and archiving-related options may be
ignored
- fixes #593
2024-06-07 10:34:19 -07:00

18 lines
656 B
JavaScript

import child_process from "child_process";
import fs from "fs";
// Runs a short crawl in Docker with --dryRun alongside several
// archive-related options (--generateWACZ, --combineWARC, --warcPrefix, ...)
// and verifies that the collection directory ends up with exactly two
// entries: "logs" and "pages".
test("ensure dryRun crawl only writes pages and logs", async () => {
  // Full crawler invocation; execSync runs it through a shell, which is what
  // expands $PWD for the volume mount.
  const crawlCommand =
    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --collection dry-run-wr-net --combineWARC --rolloverSize 10000 --limit 2 --title "test title" --description "test description" --warcPrefix custom-prefix --dryRun';
  const collectionDir = "test-crawls/collections/dry-run-wr-net";

  // Blocks until the crawl container exits; throws on a non-zero exit code.
  child_process.execSync(crawlCommand);

  // Sort so the assertions do not depend on directory listing order.
  const entries = fs.readdirSync(collectionDir);
  entries.sort();
  const [firstEntry, secondEntry] = entries;

  expect(entries.length).toBe(2);
  expect(firstEntry).toBe("logs");
  expect(secondEntry).toBe("pages");
});