Profile download support (#126)

* profiles: support loading profiles via a URL.

* add 'request' dependency

* README: mention profile URLs
This commit is contained in:
Ilya Kreymer 2022-03-14 14:44:24 -07:00 committed by GitHub
parent 1fae21b0cf
commit 12d96f22c6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 30 additions and 4 deletions

View file

@ -574,6 +574,8 @@ After running the above command, you can now run a crawl with the profile, as fo
docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl --profile /crawls/profiles/profile.tar.gz --url https://twitter.com/ --generateWACZ --collection test-with-profile
```
Profiles can also be loaded from an HTTP/HTTPS URL, e.g. `--profile https://example.com/path/to/profile.tar.gz`.
## Published Releases / Production Use
When using Browsertrix Crawler in production, it is recommended to use a specific, published version of the image, e.g. `webrecorder/browsertrix-crawler:[VERSION]` instead of `webrecorder/browsertrix-crawler`, where `[VERSION]` corresponds to one of the published release tags.

View file

@ -63,8 +63,6 @@ class Crawler {
this.debugLogging = this.params.logging.includes("debug");
this.profileDir = loadProfile(this.params.profile);
if (this.params.profile) {
this.statusLog("With Browser Profile: " + this.params.profile);
}
@ -372,6 +370,7 @@ class Crawler {
}
async crawl() {
this.profileDir = await loadProfile(this.params.profile);
try {
this.driver = require(this.params.driver);

View file

@ -17,6 +17,7 @@
"node-fetch": "^2.6.1",
"puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
"puppeteer-core": "^13.3.2",
"request": "^2.88.2",
"sitemapper": "^3.1.2",
"uuid": "8.3.2",
"warcio": "^1.5.0",

View file

@ -2,12 +2,36 @@ const child_process = require("child_process");
const fs = require("fs");
const path = require("path");
const os = require("os");
const request = require("request");
const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
module.exports.loadProfile = function(profileFilename) {
module.exports.loadProfile = async function(profileFilename) {
if (profileFilename &&
(profileFilename.startsWith("http:") || profileFilename.startsWith("https:"))) {
const targetFilename = "/tmp/profile.tar.gz";
console.log(`Downloading ${profileFilename} to ${targetFilename}`);
const p = new Promise((resolve, reject) => {
request.get(profileFilename).
on("error", (err) => reject(err)).
pipe(fs.createWriteStream(targetFilename)).
on("finish", () => resolve());
});
await p;
profileFilename = targetFilename;
}
if (profileFilename) {
child_process.execSync("tar xvfz " + profileFilename, {cwd: profileDir});
try {
child_process.execSync("tar xvfz " + profileFilename, {cwd: profileDir});
} catch (e) {
console.error(`Profile filename ${profileFilename} not a valid tar.gz`);
}
}
return profileDir;