browsertrix-crawler/tests/dryrun.test.js
Ilya Kreymer 30646ca7ba
Add downloads dir to cache external dependency within the crawl (#921)
Fixes #920 
- Downloads profile, custom behavior, and seed list to `/downloads`
directory in the crawl
- Seed File: Downloaded into downloads. Never refetched if already
exists on subsequent crawl restarts.
- Custom Behaviors: Git: Downloaded into dir, then moved to
/downloads/behaviors/<dir name>. If it already exists, failure to download
will reuse the existing directory
- Custom Behaviors: File: Downloaded into temp file, then moved to
/downloads/behaviors/<name.js>. if already exists, failure to download
will reuse existing file.
- Profile: using `/profile` directory to contain the browser profile
- Profile: downloaded to temp file, then placed into
/downloads/profile.tar.gz. If failed to download, but already exists,
existing /profile directory is used
- Also fixes #897
2025-11-26 19:30:27 -08:00

20 lines
754 B
JavaScript

import child_process from "child_process";
import fs from "fs";
test("ensure dryRun crawl only writes pages and logs", async () => {
  // Run a crawl with --dryRun: even though --generateWACZ and --combineWARC
  // are requested, no archive data (warcs, wacz, indexes) should be written.
  child_process.execSync(
    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --generateWACZ --text --collection dry-run-wr-net --combineWARC --rolloverSize 10000 --limit 2 --title "test title" --description "test description" --warcPrefix custom-prefix --dryRun --exclude community',
  );

  // Compare the entire sorted directory listing in one assertion: this both
  // pins the exact expected contents (covering the count implicitly) and
  // yields a full diff on failure, unlike per-index expects.
  const files = fs.readdirSync("test-crawls/collections/dry-run-wr-net").sort();
  expect(files).toEqual(["downloads", "logs", "pages", "profile"]);
});