mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Profile Creation Fix + Cloudflare Wait Support + UserAgent Fix (#128)
* cloudflare wait improvements (#110 fix)
  - set navigator.webdriver to false to help with cloudflare wait
  - add checkCF() that will detect the cloudflare DDoS page and wait 5 seconds until the original page is loaded
* chrome args refactor:
  - move to utils/browser
  - add LazyFrameLoading disable to fix occasional issues with page.goto() never finishing
  - add userAgent option
* profile creation improvements:
  - fix loadProfile() missing await
  - fix url to support running remotely
  - load shared chromeArgs()
  - add --proxy to support profile creation through pywb proxy
* fix setting custom userAgent (#90)
  - fix typo that resulted in error
  - ensure userAgent is applied separate from emulatedDevice
  - add getDefaultUA() browser util
This commit is contained in:
parent
dedf1cc0ad
commit
5e5efda437
4 changed files with 101 additions and 43 deletions
49
crawler.js
49
crawler.js
|
@ -30,7 +30,7 @@ const { ScreenCaster, WSTransport, RedisPubSubTransport } = require("./util/scre
|
||||||
const { parseArgs } = require("./util/argParser");
|
const { parseArgs } = require("./util/argParser");
|
||||||
const { initRedis } = require("./util/redis");
|
const { initRedis } = require("./util/redis");
|
||||||
|
|
||||||
const { getBrowserExe, loadProfile, evaluateWithCLI } = require("./util/browser");
|
const { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI } = require("./util/browser");
|
||||||
|
|
||||||
const { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } = require("./util/constants");
|
const { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } = require("./util/constants");
|
||||||
|
|
||||||
|
@ -113,22 +113,11 @@ class Crawler {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
this.browserExe = getBrowserExe();
|
|
||||||
|
|
||||||
// if device set, it overrides the default Chrome UA
|
// if device set, it overrides the default Chrome UA
|
||||||
if (this.emulateDevice) {
|
if (this.emulateDevice) {
|
||||||
this.userAgent = this.emulateDevice.userAgent;
|
this.userAgent = this.emulateDevice.userAgent;
|
||||||
} else {
|
} else {
|
||||||
let version = process.env.BROWSER_VERSION;
|
this.userAgent = getDefaultUA();
|
||||||
|
|
||||||
try {
|
|
||||||
version = child_process.execFileSync(this.browserExe, ["--version"], {encoding: "utf8"});
|
|
||||||
version = version.match(/[\d.]+/)[0];
|
|
||||||
} catch(e) {
|
|
||||||
console.error(e);
|
|
||||||
}
|
|
||||||
|
|
||||||
this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// suffix to append to default userAgent
|
// suffix to append to default userAgent
|
||||||
|
@ -202,6 +191,8 @@ class Crawler {
|
||||||
opts = {stdio: "ignore", cwd: this.params.cwd};
|
opts = {stdio: "ignore", cwd: this.params.cwd};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
this.browserExe = getBrowserExe();
|
||||||
|
|
||||||
this.configureUA();
|
this.configureUA();
|
||||||
|
|
||||||
this.headers = {"User-Agent": this.userAgent};
|
this.headers = {"User-Agent": this.userAgent};
|
||||||
|
@ -237,21 +228,6 @@ class Crawler {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
get chromeArgs() {
|
|
||||||
// Chrome Flags, including proxy server
|
|
||||||
return [
|
|
||||||
...(process.env.CHROME_FLAGS ?? "").split(" ").filter(Boolean),
|
|
||||||
"--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
|
|
||||||
`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
|
|
||||||
"--no-sandbox",
|
|
||||||
"--disable-background-media-suspend",
|
|
||||||
"--autoplay-policy=no-user-gesture-required",
|
|
||||||
"--disable-features=IsolateOrigins,site-per-process",
|
|
||||||
"--disable-popup-blocking",
|
|
||||||
"--disable-backgrounding-occluded-windows",
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
get puppeteerArgs() {
|
get puppeteerArgs() {
|
||||||
// Puppeter Options
|
// Puppeter Options
|
||||||
return {
|
return {
|
||||||
|
@ -261,7 +237,7 @@ class Crawler {
|
||||||
handleSIGTERM: false,
|
handleSIGTERM: false,
|
||||||
handleSIGHUP: false,
|
handleSIGHUP: false,
|
||||||
ignoreHTTPSErrors: true,
|
ignoreHTTPSErrors: true,
|
||||||
args: this.chromeArgs,
|
args: chromeArgs(true, this.userAgent),
|
||||||
userDataDir: this.profileDir,
|
userDataDir: this.profileDir,
|
||||||
defaultViewport: null,
|
defaultViewport: null,
|
||||||
};
|
};
|
||||||
|
@ -310,6 +286,8 @@ class Crawler {
|
||||||
await page._client.send("Network.setBypassServiceWorker", {bypass: true});
|
await page._client.send("Network.setBypassServiceWorker", {bypass: true});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});");
|
||||||
|
|
||||||
if (this.params.behaviorOpts && !page.__bx_inited) {
|
if (this.params.behaviorOpts && !page.__bx_inited) {
|
||||||
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata));
|
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata));
|
||||||
await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
|
await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
|
||||||
|
@ -581,6 +559,8 @@ class Crawler {
|
||||||
|
|
||||||
const seed = this.params.scopedSeeds[seedId];
|
const seed = this.params.scopedSeeds[seedId];
|
||||||
|
|
||||||
|
await this.checkCF(page);
|
||||||
|
|
||||||
// skip extraction if at max depth
|
// skip extraction if at max depth
|
||||||
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
|
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
|
||||||
return;
|
return;
|
||||||
|
@ -649,6 +629,17 @@ class Crawler {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async checkCF(page) {
|
||||||
|
try {
|
||||||
|
while (await page.$("div.cf-browser-verification.cf-im-under-attack")) {
|
||||||
|
this.statusLog("Cloudflare Check Detected, waiting for reload...");
|
||||||
|
await this.sleep(5500);
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.warn(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async queueUrl(seedId, url, depth, extraHops = 0) {
|
async queueUrl(seedId, url, depth, extraHops = 0) {
|
||||||
if (this.limitHit) {
|
if (this.limitHit) {
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -6,7 +6,7 @@ const child_process = require("child_process");
|
||||||
const puppeteer = require("puppeteer-core");
|
const puppeteer = require("puppeteer-core");
|
||||||
const yargs = require("yargs");
|
const yargs = require("yargs");
|
||||||
|
|
||||||
const { getBrowserExe, loadProfile, saveProfile } = require("./util/browser");
|
const { getBrowserExe, loadProfile, saveProfile, chromeArgs, sleep } = require("./util/browser");
|
||||||
|
|
||||||
const fs = require("fs");
|
const fs = require("fs");
|
||||||
const path = require("path");
|
const path = require("path");
|
||||||
|
@ -62,6 +62,11 @@ function cliOpts() {
|
||||||
type: "string",
|
type: "string",
|
||||||
describe: "Browser window dimensions, specified as: width,height",
|
describe: "Browser window dimensions, specified as: width,height",
|
||||||
default: "1600,900"
|
default: "1600,900"
|
||||||
|
},
|
||||||
|
|
||||||
|
"proxy": {
|
||||||
|
type: "boolean",
|
||||||
|
default: false
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -89,6 +94,23 @@ async function main() {
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let useProxy = false;
|
||||||
|
|
||||||
|
if (params.proxy) {
|
||||||
|
child_process.spawn("wayback", ["--live", "--proxy", "live"], {stdio: "inherit", cwd: "/tmp"});
|
||||||
|
|
||||||
|
console.log("Running with pywb proxy");
|
||||||
|
|
||||||
|
await sleep(3000);
|
||||||
|
|
||||||
|
useProxy = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
const browserArgs = chromeArgs(useProxy, null, [
|
||||||
|
"--remote-debugging-port=9221",
|
||||||
|
`--window-size=${params.windowSize}`,
|
||||||
|
]);
|
||||||
|
|
||||||
//await new Promise(resolve => setTimeout(resolve, 2000));
|
//await new Promise(resolve => setTimeout(resolve, 2000));
|
||||||
const profileDir = await loadProfile(params.profile);
|
const profileDir = await loadProfile(params.profile);
|
||||||
|
|
||||||
|
@ -96,15 +118,7 @@ async function main() {
|
||||||
headless: !!params.headless,
|
headless: !!params.headless,
|
||||||
executablePath: getBrowserExe(),
|
executablePath: getBrowserExe(),
|
||||||
ignoreHTTPSErrors: true,
|
ignoreHTTPSErrors: true,
|
||||||
args: [
|
args: browserArgs,
|
||||||
"--no-xshm",
|
|
||||||
"--no-sandbox",
|
|
||||||
"--disable-background-media-suspend",
|
|
||||||
"--autoplay-policy=no-user-gesture-required",
|
|
||||||
"--disable-features=IsolateOrigins,site-per-process",
|
|
||||||
"--remote-debugging-port=9221",
|
|
||||||
`--window-size=${params.windowSize}`
|
|
||||||
],
|
|
||||||
userDataDir: profileDir,
|
userDataDir: profileDir,
|
||||||
defaultViewport: null,
|
defaultViewport: null,
|
||||||
};
|
};
|
||||||
|
@ -126,6 +140,7 @@ async function main() {
|
||||||
await page.setCacheEnabled(false);
|
await page.setCacheEnabled(false);
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
|
await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});");
|
||||||
// for testing, inject browsertrix-behaviors
|
// for testing, inject browsertrix-behaviors
|
||||||
await page.evaluateOnNewDocument(behaviors + ";\nself.__bx_behaviors.init();");
|
await page.evaluateOnNewDocument(behaviors + ";\nself.__bx_behaviors.init();");
|
||||||
}
|
}
|
||||||
|
@ -221,7 +236,7 @@ function promptInput(msg, hidden = false) {
|
||||||
|
|
||||||
async function handleInteractive(params, browser, page) {
|
async function handleInteractive(params, browser, page) {
|
||||||
const target = page.target();
|
const target = page.target();
|
||||||
const targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=localhost:9222/devtools/page/${target._targetId}&panel=resources`;
|
const targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${target._targetId}&panel=resources`;
|
||||||
|
|
||||||
console.log("Creating Profile Interactively...");
|
console.log("Creating Profile Interactively...");
|
||||||
child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);
|
child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);
|
||||||
|
@ -231,7 +246,7 @@ async function handleInteractive(params, browser, page) {
|
||||||
const pathname = parsedUrl.pathname;
|
const pathname = parsedUrl.pathname;
|
||||||
if (pathname === "/") {
|
if (pathname === "/") {
|
||||||
res.writeHead(200, {"Content-Type": "text/html"});
|
res.writeHead(200, {"Content-Type": "text/html"});
|
||||||
res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replace("$HOST", parsedUrl.hostname)));
|
res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replaceAll("$HOST", parsedUrl.hostname)));
|
||||||
|
|
||||||
} else if (pathname === "/createProfile" && req.method === "POST") {
|
} else if (pathname === "/createProfile" && req.method === "POST") {
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "browsertrix-crawler",
|
"name": "browsertrix-crawler",
|
||||||
"version": "0.5.0-beta.6",
|
"version": "0.5.0-beta.7",
|
||||||
"main": "browsertrix-crawler",
|
"main": "browsertrix-crawler",
|
||||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||||
|
|
|
@ -41,7 +41,7 @@ module.exports.saveProfile = function(profileFilename) {
|
||||||
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: profileDir});
|
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: profileDir});
|
||||||
};
|
};
|
||||||
|
|
||||||
module.exports.getBrowserExe = function() {
|
function getBrowserExe() {
|
||||||
const files = [process.env.BROWSER_BIN, "/usr/bin/google-chrome", "/usr/bin/chromium-browser"];
|
const files = [process.env.BROWSER_BIN, "/usr/bin/google-chrome", "/usr/bin/chromium-browser"];
|
||||||
for (const file of files) {
|
for (const file of files) {
|
||||||
if (file && fs.existsSync(file)) {
|
if (file && fs.existsSync(file)) {
|
||||||
|
@ -50,6 +50,49 @@ module.exports.getBrowserExe = function() {
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
module.exports.getBrowserExe = getBrowserExe;
|
||||||
|
|
||||||
|
|
||||||
|
function getDefaultUA() {
|
||||||
|
let version = process.env.BROWSER_VERSION;
|
||||||
|
|
||||||
|
try {
|
||||||
|
version = child_process.execFileSync(getBrowserExe(), ["--version"], {encoding: "utf8"});
|
||||||
|
version = version.match(/[\d.]+/)[0];
|
||||||
|
} catch(e) {
|
||||||
|
console.error(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
return `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
module.exports.getDefaultUA = getDefaultUA;
|
||||||
|
|
||||||
|
|
||||||
|
module.exports.chromeArgs = (proxy, userAgent=null, extraArgs=[]) => {
|
||||||
|
// Chrome Flags, including proxy server
|
||||||
|
const args = [
|
||||||
|
...(process.env.CHROME_FLAGS ?? "").split(" ").filter(Boolean),
|
||||||
|
"--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
|
||||||
|
"--no-sandbox",
|
||||||
|
"--disable-background-media-suspend",
|
||||||
|
"--autoplay-policy=no-user-gesture-required",
|
||||||
|
"--disable-features=Translate,LazyFrameLoading,IsolateOrigins,site-per-process",
|
||||||
|
"--disable-popup-blocking",
|
||||||
|
"--disable-backgrounding-occluded-windows",
|
||||||
|
`--user-agent=${userAgent || getDefaultUA()}`,
|
||||||
|
...extraArgs,
|
||||||
|
];
|
||||||
|
|
||||||
|
if (proxy) {
|
||||||
|
args.push(`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return args;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -79,3 +122,12 @@ module.exports.evaluateWithCLI = async (frame, funcString) => {
|
||||||
return remoteObject.value;
|
return remoteObject.value;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
module.exports.sleep = async (time) => {
|
||||||
|
return new Promise(resolve => setTimeout(resolve, time));
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue