mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Profile download support (#126)
* profiles: support loading profiles via a URL. * add 'request' dependency * README: mention profile URLs
This commit is contained in:
parent
1fae21b0cf
commit
12d96f22c6
4 changed files with 30 additions and 4 deletions
|
@ -574,6 +574,8 @@ After running the above command, you can now run a crawl with the profile, as fo
|
|||
docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl --profile /crawls/profiles/profile.tar.gz --url https://twitter.com/ --generateWACZ --collection test-with-profile
|
||||
```
|
||||
|
||||
Profiles can also be loaded from an HTTP/HTTPS URL, e.g. `--profile https://example.com/path/to/profile.tar.gz`.
|
||||
|
||||
## Published Releases / Production Use
|
||||
|
||||
When using Browsertrix Crawler in production, it is recommended to use a specific, published version of the image, e.g. `webrecorder/browsertrix-crawler:[VERSION]` instead of `webrecorder/browsertrix-crawler`, where `[VERSION]` corresponds to one of the published release tags.
|
||||
|
|
|
@ -63,8 +63,6 @@ class Crawler {
|
|||
|
||||
this.debugLogging = this.params.logging.includes("debug");
|
||||
|
||||
this.profileDir = loadProfile(this.params.profile);
|
||||
|
||||
if (this.params.profile) {
|
||||
this.statusLog("With Browser Profile: " + this.params.profile);
|
||||
}
|
||||
|
@ -372,6 +370,7 @@ class Crawler {
|
|||
}
|
||||
|
||||
async crawl() {
|
||||
this.profileDir = await loadProfile(this.params.profile);
|
||||
|
||||
try {
|
||||
this.driver = require(this.params.driver);
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
"node-fetch": "^2.6.1",
|
||||
"puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
|
||||
"puppeteer-core": "^13.3.2",
|
||||
"request": "^2.88.2",
|
||||
"sitemapper": "^3.1.2",
|
||||
"uuid": "8.3.2",
|
||||
"warcio": "^1.5.0",
|
||||
|
|
|
@ -2,12 +2,36 @@ const child_process = require("child_process");
|
|||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
const os = require("os");
|
||||
const request = require("request");
|
||||
|
||||
const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
|
||||
|
||||
module.exports.loadProfile = function(profileFilename) {
|
||||
module.exports.loadProfile = async function(profileFilename) {
|
||||
if (profileFilename &&
|
||||
(profileFilename.startsWith("http:") || profileFilename.startsWith("https:"))) {
|
||||
|
||||
const targetFilename = "/tmp/profile.tar.gz";
|
||||
|
||||
console.log(`Downloading ${profileFilename} to ${targetFilename}`);
|
||||
|
||||
const p = new Promise((resolve, reject) => {
|
||||
request.get(profileFilename).
|
||||
on("error", (err) => reject(err)).
|
||||
pipe(fs.createWriteStream(targetFilename)).
|
||||
on("finish", () => resolve());
|
||||
});
|
||||
|
||||
await p;
|
||||
|
||||
profileFilename = targetFilename;
|
||||
}
|
||||
|
||||
if (profileFilename) {
|
||||
try {
|
||||
child_process.execSync("tar xvfz " + profileFilename, {cwd: profileDir});
|
||||
} catch (e) {
|
||||
console.error(`Profile filename ${profileFilename} not a valid tar.gz`);
|
||||
}
|
||||
}
|
||||
|
||||
return profileDir;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue