browser config settings:

- add support for --userAgent to override user agent
- add support for --mobileDevice to use puppeteer device emulation presets
- add support for --userAgentSuffix to append to default user agent (including device userAgent)
bump to 0.1.2
This commit is contained in:
Ilya Kreymer 2020-11-14 19:32:31 +00:00
parent bfa1fc1618
commit fe406b5f74
4 changed files with 69 additions and 8 deletions

View file

@ -25,11 +25,16 @@ class Crawler {
this.seenList = new Set(); this.seenList = new Set();
this.emulateDevice = null;
// links crawled counter // links crawled counter
this.numLinks = 0; this.numLinks = 0;
this.monitor = true; this.monitor = true;
this.userAgent = "";
this.headers = {};
const params = require("yargs") const params = require("yargs")
.usage("browsertrix-crawler [options]") .usage("browsertrix-crawler [options]")
.option(this.cliOpts) .option(this.cliOpts)
@ -42,9 +47,22 @@ class Crawler {
this.capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record/id_/`; this.capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record/id_/`;
} }
bootstrap() { configureUA() {
const opts = {stdio: "ignore", cwd: this.params.cwd}; // override userAgent
if (this.params.userAgent) {
if (this.emulateDevice) {
this.emulateDevice.userAgent = this.params.userAgent;
}
this.userAgent = this.params.userAgent;
return;
}
// if device set, it overrides the default Chrome UA
if (this.emulateDevice) {
this.userAgent = this.emulateDevice.userAgent;
} else {
let version = "84"; let version = "84";
try { try {
@ -52,6 +70,23 @@ class Crawler {
} catch(e) {} } catch(e) {}
this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`; this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
}
// suffix to append to default userAgent
if (this.params.userAgentSuffix) {
this.userAgent += " " + this.params.userAgentSuffix;
if (this.emulateDevice) {
this.emulateDevice.userAgent += " " + this.params.userAgentSuffix;
}
}
}
bootstrap() {
const opts = {stdio: "ignore", cwd: this.params.cwd};
// configureUA() stores its result on this.userAgent and has no return value,
// so call it for its side effect only; assigning its result would reset
// this.userAgent to undefined before the headers are built.
this.configureUA();
this.headers = {"User-Agent": this.userAgent}; this.headers = {"User-Agent": this.userAgent};
child_process.spawn("redis-server", {...opts, cwd: "/tmp/"}); child_process.spawn("redis-server", {...opts, cwd: "/tmp/"});
@ -159,6 +194,21 @@ class Crawler {
describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd", describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd",
type: "string", type: "string",
default: process.cwd(), default: process.cwd(),
},
"mobileDevice": {
describe: "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
type: "string",
},
"userAgent": {
describe: "Override user-agent with specified string",
type: "string",
},
"userAgentSuffix": {
describe: "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
type: "string",
} }
}; };
} }
@ -215,6 +265,13 @@ class Crawler {
throw new Error("Invalid newContext, must be one of: page, session, browser"); throw new Error("Invalid newContext, must be one of: page, session, browser");
} }
if (argv.mobileDevice) {
this.emulateDevice = puppeteer.devices[argv.mobileDevice];
if (!this.emulateDevice) {
throw new Error("Unknown device: " + argv.mobileDevice);
}
}
// Support one or multiple exclude // Support one or multiple exclude
if (argv.exclude) { if (argv.exclude) {
if (typeof(argv.exclude) === "string") { if (typeof(argv.exclude) === "string") {

View file

@ -15,6 +15,10 @@ module.exports = async ({data, page, crawler}) => {
return; return;
} }
if (crawler.emulateDevice) {
await page.emulate(crawler.emulateDevice);
}
const mediaResults = []; const mediaResults = [];
await page.exposeFunction("__crawler_queueUrls", async (url) => { await page.exposeFunction("__crawler_queueUrls", async (url) => {

View file

@ -2,7 +2,7 @@ version: '3.5'
services: services:
crawler: crawler:
image: webrecorder/browsertrix-crawler:0.1.1 image: webrecorder/browsertrix-crawler:0.1.2
build: build:
context: ./ context: ./

View file

@ -1,6 +1,6 @@
{ {
"name": "browsertrix-crawler", "name": "browsertrix-crawler",
"version": "0.1.1", "version": "0.1.2",
"main": "browsertrix-crawler", "main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler", "repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software", "author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",