mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
browser config settings:
- add support for --userAgent to override user agent
- add support for --mobileDevice to use puppeteer device emulation presets
- add support for --userAgentSuffix to append to default user agent (including device userAgent)
- bump to 0.1.2
This commit is contained in:
parent
bfa1fc1618
commit
fe406b5f74
4 changed files with 69 additions and 8 deletions
69
crawler.js
69
crawler.js
|
@ -25,11 +25,16 @@ class Crawler {
|
|||
|
||||
this.seenList = new Set();
|
||||
|
||||
this.emulateDevice = null;
|
||||
|
||||
// links crawled counter
|
||||
this.numLinks = 0;
|
||||
|
||||
this.monitor = true;
|
||||
|
||||
this.userAgent = "";
|
||||
this.headers = {};
|
||||
|
||||
const params = require("yargs")
|
||||
.usage("browsertrix-crawler [options]")
|
||||
.option(this.cliOpts)
|
||||
|
@ -42,16 +47,46 @@ class Crawler {
|
|||
this.capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record/id_/`;
|
||||
}
|
||||
|
||||
configureUA() {
|
||||
// override userAgent
|
||||
if (this.params.userAgent) {
|
||||
|
||||
if (this.emulateDevice) {
|
||||
this.emulateDevice.userAgent = this.params.userAgent;
|
||||
}
|
||||
|
||||
this.userAgent = this.params.userAgent;
|
||||
return;
|
||||
}
|
||||
|
||||
// if device set, it overrides the default Chrome UA
|
||||
if (this.emulateDevice) {
|
||||
this.userAgent = this.emulateDevice.userAgent;
|
||||
} else {
|
||||
let version = "84";
|
||||
|
||||
try {
|
||||
version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
|
||||
} catch(e) {}
|
||||
|
||||
this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
|
||||
}
|
||||
|
||||
// suffix to append to default userAgent
|
||||
if (this.params.userAgentSuffix) {
|
||||
this.userAgent += " " + this.params.userAgentSuffix;
|
||||
|
||||
if (this.emulateDevice) {
|
||||
this.emulateDevice.userAgent += " " + this.params.userAgentSuffix;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bootstrap() {
|
||||
const opts = {stdio: "ignore", cwd: this.params.cwd};
|
||||
|
||||
let version = "84";
|
||||
this.userAgent = this.configureUA();
|
||||
|
||||
try {
|
||||
version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
|
||||
} catch(e) {}
|
||||
|
||||
this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
|
||||
this.headers = {"User-Agent": this.userAgent};
|
||||
|
||||
child_process.spawn("redis-server", {...opts, cwd: "/tmp/"});
|
||||
|
@ -159,6 +194,21 @@ class Crawler {
|
|||
describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd",
|
||||
type: "string",
|
||||
default: process.cwd(),
|
||||
},
|
||||
|
||||
"mobileDevice": {
|
||||
describe: "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"userAgent": {
|
||||
describe: "Override user-agent with specified string",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"userAgentSuffix": {
|
||||
describe: "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
|
||||
type: "string",
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@ -215,6 +265,13 @@ class Crawler {
|
|||
throw new Error("Invalid newContext, must be one of: page, session, browser");
|
||||
}
|
||||
|
||||
if (argv.mobileDevice) {
|
||||
this.emulateDevice = puppeteer.devices[argv.mobileDevice];
|
||||
if (!this.emulateDevice) {
|
||||
throw new Error("Unknown device: " + argv.mobileDevice);
|
||||
}
|
||||
}
|
||||
|
||||
// Support one or multiple exclude
|
||||
if (argv.exclude) {
|
||||
if (typeof(argv.exclude) === "string") {
|
||||
|
|
|
@ -15,6 +15,10 @@ module.exports = async ({data, page, crawler}) => {
|
|||
return;
|
||||
}
|
||||
|
||||
if (crawler.emulateDevice) {
|
||||
await page.emulate(crawler.emulateDevice);
|
||||
}
|
||||
|
||||
const mediaResults = [];
|
||||
|
||||
await page.exposeFunction("__crawler_queueUrls", async (url) => {
|
||||
|
|
|
@ -2,7 +2,7 @@ version: '3.5'
|
|||
|
||||
services:
|
||||
crawler:
|
||||
image: webrecorder/browsertrix-crawler:0.1.1
|
||||
image: webrecorder/browsertrix-crawler:0.1.2
|
||||
build:
|
||||
context: ./
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "browsertrix-crawler",
|
||||
"version": "0.1.1",
|
||||
"version": "0.1.2",
|
||||
"main": "browsertrix-crawler",
|
||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue