mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
browser config settings:
- add support for --userAgent to override user agent
- add support for --mobileDevice to use puppeteer device emulation presets
- add support for --userAgentSuffix to append to default user agent (including device userAgent)

bump to 0.1.2
This commit is contained in:
parent
bfa1fc1618
commit
fe406b5f74
4 changed files with 69 additions and 8 deletions
69
crawler.js
69
crawler.js
|
@ -25,11 +25,16 @@ class Crawler {
|
||||||
|
|
||||||
this.seenList = new Set();
|
this.seenList = new Set();
|
||||||
|
|
||||||
|
this.emulateDevice = null;
|
||||||
|
|
||||||
// links crawled counter
|
// links crawled counter
|
||||||
this.numLinks = 0;
|
this.numLinks = 0;
|
||||||
|
|
||||||
this.monitor = true;
|
this.monitor = true;
|
||||||
|
|
||||||
|
this.userAgent = "";
|
||||||
|
this.headers = {};
|
||||||
|
|
||||||
const params = require("yargs")
|
const params = require("yargs")
|
||||||
.usage("browsertrix-crawler [options]")
|
.usage("browsertrix-crawler [options]")
|
||||||
.option(this.cliOpts)
|
.option(this.cliOpts)
|
||||||
|
@ -42,16 +47,46 @@ class Crawler {
|
||||||
this.capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record/id_/`;
|
this.capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record/id_/`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
configureUA() {
|
||||||
|
// override userAgent
|
||||||
|
if (this.params.userAgent) {
|
||||||
|
|
||||||
|
if (this.emulateDevice) {
|
||||||
|
this.emulateDevice.userAgent = this.params.userAgent;
|
||||||
|
}
|
||||||
|
|
||||||
|
this.userAgent = this.params.userAgent;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// if device set, it overrides the default Chrome UA
|
||||||
|
if (this.emulateDevice) {
|
||||||
|
this.userAgent = this.emulateDevice.userAgent;
|
||||||
|
} else {
|
||||||
|
let version = "84";
|
||||||
|
|
||||||
|
try {
|
||||||
|
version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
|
||||||
|
} catch(e) {}
|
||||||
|
|
||||||
|
this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
|
||||||
|
}
|
||||||
|
|
||||||
|
// suffix to append to default userAgent
|
||||||
|
if (this.params.userAgentSuffix) {
|
||||||
|
this.userAgent += " " + this.params.userAgentSuffix;
|
||||||
|
|
||||||
|
if (this.emulateDevice) {
|
||||||
|
this.emulateDevice.userAgent += " " + this.params.userAgentSuffix;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bootstrap() {
|
bootstrap() {
|
||||||
const opts = {stdio: "ignore", cwd: this.params.cwd};
|
const opts = {stdio: "ignore", cwd: this.params.cwd};
|
||||||
|
|
||||||
let version = "84";
|
this.userAgent = this.configureUA();
|
||||||
|
|
||||||
try {
|
|
||||||
version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
|
|
||||||
} catch(e) {}
|
|
||||||
|
|
||||||
this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
|
|
||||||
this.headers = {"User-Agent": this.userAgent};
|
this.headers = {"User-Agent": this.userAgent};
|
||||||
|
|
||||||
child_process.spawn("redis-server", {...opts, cwd: "/tmp/"});
|
child_process.spawn("redis-server", {...opts, cwd: "/tmp/"});
|
||||||
|
@ -159,6 +194,21 @@ class Crawler {
|
||||||
describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd",
|
describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd",
|
||||||
type: "string",
|
type: "string",
|
||||||
default: process.cwd(),
|
default: process.cwd(),
|
||||||
|
},
|
||||||
|
|
||||||
|
"mobileDevice": {
|
||||||
|
describe: "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
|
||||||
|
type: "string",
|
||||||
|
},
|
||||||
|
|
||||||
|
"userAgent": {
|
||||||
|
describe: "Override user-agent with specified string",
|
||||||
|
type: "string",
|
||||||
|
},
|
||||||
|
|
||||||
|
"userAgentSuffix": {
|
||||||
|
describe: "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
|
||||||
|
type: "string",
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -215,6 +265,13 @@ class Crawler {
|
||||||
throw new Error("Invalid newContext, must be one of: page, session, browser");
|
throw new Error("Invalid newContext, must be one of: page, session, browser");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (argv.mobileDevice) {
|
||||||
|
this.emulateDevice = puppeteer.devices[argv.mobileDevice];
|
||||||
|
if (!this.emulateDevice) {
|
||||||
|
throw new Error("Unknown device: " + argv.mobileDevice);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Support one or multiple exclude
|
// Support one or multiple exclude
|
||||||
if (argv.exclude) {
|
if (argv.exclude) {
|
||||||
if (typeof(argv.exclude) === "string") {
|
if (typeof(argv.exclude) === "string") {
|
||||||
|
|
|
@ -15,6 +15,10 @@ module.exports = async ({data, page, crawler}) => {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (crawler.emulateDevice) {
|
||||||
|
await page.emulate(crawler.emulateDevice);
|
||||||
|
}
|
||||||
|
|
||||||
const mediaResults = [];
|
const mediaResults = [];
|
||||||
|
|
||||||
await page.exposeFunction("__crawler_queueUrls", async (url) => {
|
await page.exposeFunction("__crawler_queueUrls", async (url) => {
|
||||||
|
|
|
@ -2,7 +2,7 @@ version: '3.5'
|
||||||
|
|
||||||
services:
|
services:
|
||||||
crawler:
|
crawler:
|
||||||
image: webrecorder/browsertrix-crawler:0.1.1
|
image: webrecorder/browsertrix-crawler:0.1.2
|
||||||
build:
|
build:
|
||||||
context: ./
|
context: ./
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "browsertrix-crawler",
|
"name": "browsertrix-crawler",
|
||||||
"version": "0.1.1",
|
"version": "0.1.2",
|
||||||
"main": "browsertrix-crawler",
|
"main": "browsertrix-crawler",
|
||||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue