From fe406b5f74caae401b920def88709fb13f28d41a Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 14 Nov 2020 19:32:31 +0000
Subject: [PATCH] browser config settings:
 - add support for --userAgent to override user agent
 - add support for --mobileDevice to use puppeteer device emulation presets
 - add support for --userAgentSuffix to append to default user agent (including device userAgent)
 bump to 0.1.2

---
Review note: configureUA() assigns this.userAgent (and, when a device is
emulated, this.emulateDevice.userAgent) in place and returns nothing, so
bootstrap() must call it without assigning the result; otherwise
this.userAgent and the "User-Agent" header end up undefined.

 crawler.js         | 69 ++++++++++++++++++++++++++++++++++++++++++----
 defaultDriver.js   |  4 +++
 docker-compose.yml |  2 +-
 package.json       |  2 +-
 4 files changed, 69 insertions(+), 8 deletions(-)

diff --git a/crawler.js b/crawler.js
index 8a9d64a1..3db3a7b5 100644
--- a/crawler.js
+++ b/crawler.js
@@ -25,11 +25,16 @@ class Crawler {
 
     this.seenList = new Set();
 
+    this.emulateDevice = null;
+
     // links crawled counter
     this.numLinks = 0;
 
     this.monitor = true;
 
+    this.userAgent = "";
+    this.headers = {};
+
     const params = require("yargs")
       .usage("browsertrix-crawler [options]")
       .option(this.cliOpts)
@@ -42,16 +47,46 @@ class Crawler {
     this.capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record/id_/`;
   }
 
+  configureUA() {
+    // override userAgent
+    if (this.params.userAgent) {
+
+      if (this.emulateDevice) {
+        this.emulateDevice.userAgent = this.params.userAgent;
+      }
+
+      this.userAgent = this.params.userAgent;
+      return;
+    }
+
+    // if device set, it overrides the default Chrome UA
+    if (this.emulateDevice) {
+      this.userAgent = this.emulateDevice.userAgent;
+    } else {
+      let version = "84";
+
+      try {
+        version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
+      } catch(e) {}
+
+      this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
+    }
+
+    // suffix to append to default userAgent
+    if (this.params.userAgentSuffix) {
+      this.userAgent += " " + this.params.userAgentSuffix;
+
+      if (this.emulateDevice) {
+        this.emulateDevice.userAgent += " " + this.params.userAgentSuffix;
+      }
+    }
+  }
+
   bootstrap() {
     const opts = {stdio: "ignore", cwd: this.params.cwd};
 
-    let version = "84";
+    this.configureUA();
 
-    try {
-      version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
-    } catch(e) {}
-
-    this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
     this.headers = {"User-Agent": this.userAgent};
 
     child_process.spawn("redis-server", {...opts, cwd: "/tmp/"});
@@ -159,6 +194,21 @@ class Crawler {
         describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd",
         type: "string",
         default: process.cwd(),
+      },
+
+      "mobileDevice": {
+        describe: "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
+        type: "string",
+      },
+
+      "userAgent": {
+        describe: "Override user-agent with specified string",
+        type: "string",
+      },
+
+      "userAgentSuffix": {
+        describe: "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
+        type: "string",
       }
     };
   }
@@ -215,6 +265,13 @@ class Crawler {
       throw new Error("Invalid newContext, must be one of: page, session, browser");
     }
 
+    if (argv.mobileDevice) {
+      this.emulateDevice = puppeteer.devices[argv.mobileDevice];
+      if (!this.emulateDevice) {
+        throw new Error("Unknown device: " + argv.mobileDevice);
+      }
+    }
+
     // Support one or multiple exclude
     if (argv.exclude) {
       if (typeof(argv.exclude) === "string") {
diff --git a/defaultDriver.js b/defaultDriver.js
index c3284935..5fb40461 100644
--- a/defaultDriver.js
+++ b/defaultDriver.js
@@ -15,6 +15,10 @@ module.exports = async ({data, page, crawler}) => {
     return;
   }
 
+  if (crawler.emulateDevice) {
+    await page.emulate(crawler.emulateDevice);
+  }
+
   const mediaResults = [];
 
   await page.exposeFunction("__crawler_queueUrls", async (url) => {
diff --git a/docker-compose.yml b/docker-compose.yml
index e72e5aee..7363b0c8 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,7 +2,7 @@ version: '3.5'
 
 services:
   crawler:
-    image: webrecorder/browsertrix-crawler:0.1.1
+    image: webrecorder/browsertrix-crawler:0.1.2
     build:
       context: ./
 
diff --git a/package.json b/package.json
index 73fe327f..fb684ff0 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.1.1",
+  "version": "0.1.2",
   "main": "browsertrix-crawler",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
   "author": "Ilya Kreymer , Webrecorder Software",