Profile Creation Fix + Cloudflare Wait Support + UserAgent Fix (#128)

* cloudflare wait improvements (#110 fix)
- set navigator.webdriver to false to help with cloudflare wait
- add checkCF() that will detect cloudflare ddos page and wait 5 seconds until original page is loaded

* chrome args refactor:
- move to utils/browser
- add LazyFrameLoading disable to fix occasional issues with page.goto() never finishing
- add userAgent option

* profile creation improvements:
- fix loadProfile() missing await
- fix url to support running remotely
- load shared chromeArgs()
- add --proxy to support profile creation through pywb proxy

* fix setting custom userAgent (#90)
- fix typo that resulted in error
- ensure userAgent is applied separate from emulatedDevice
- add getDefaultUA() browser util
This commit is contained in:
Ilya Kreymer 2022-03-18 10:32:59 -07:00 committed by GitHub
parent dedf1cc0ad
commit 5e5efda437
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 101 additions and 43 deletions

View file

@ -30,7 +30,7 @@ const { ScreenCaster, WSTransport, RedisPubSubTransport } = require("./util/scre
const { parseArgs } = require("./util/argParser");
const { initRedis } = require("./util/redis");
const { getBrowserExe, loadProfile, evaluateWithCLI } = require("./util/browser");
const { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI } = require("./util/browser");
const { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } = require("./util/constants");
@ -113,22 +113,11 @@ class Crawler {
return;
}
this.browserExe = getBrowserExe();
// if device set, it overrides the default Chrome UA
if (this.emulateDevice) {
this.userAgent = this.emulateDevice.userAgent;
} else {
let version = process.env.BROWSER_VERSION;
try {
version = child_process.execFileSync(this.browserExe, ["--version"], {encoding: "utf8"});
version = version.match(/[\d.]+/)[0];
} catch(e) {
console.error(e);
}
this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
this.userAgent = getDefaultUA();
}
// suffix to append to default userAgent
@ -202,6 +191,8 @@ class Crawler {
opts = {stdio: "ignore", cwd: this.params.cwd};
}
this.browserExe = getBrowserExe();
this.configureUA();
this.headers = {"User-Agent": this.userAgent};
@ -237,21 +228,6 @@ class Crawler {
}
}
get chromeArgs() {
// Chrome Flags, including proxy server
return [
...(process.env.CHROME_FLAGS ?? "").split(" ").filter(Boolean),
"--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
"--no-sandbox",
"--disable-background-media-suspend",
"--autoplay-policy=no-user-gesture-required",
"--disable-features=IsolateOrigins,site-per-process",
"--disable-popup-blocking",
"--disable-backgrounding-occluded-windows",
];
}
get puppeteerArgs() {
// Puppeter Options
return {
@ -261,7 +237,7 @@ class Crawler {
handleSIGTERM: false,
handleSIGHUP: false,
ignoreHTTPSErrors: true,
args: this.chromeArgs,
args: chromeArgs(true, this.userAgent),
userDataDir: this.profileDir,
defaultViewport: null,
};
@ -310,6 +286,8 @@ class Crawler {
await page._client.send("Network.setBypassServiceWorker", {bypass: true});
}
await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});");
if (this.params.behaviorOpts && !page.__bx_inited) {
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata));
await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
@ -581,6 +559,8 @@ class Crawler {
const seed = this.params.scopedSeeds[seedId];
await this.checkCF(page);
// skip extraction if at max depth
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
return;
@ -649,6 +629,17 @@ class Crawler {
}
}
async checkCF(page) {
try {
while (await page.$("div.cf-browser-verification.cf-im-under-attack")) {
this.statusLog("Cloudflare Check Detected, waiting for reload...");
await this.sleep(5500);
}
} catch (e) {
console.warn(e);
}
}
async queueUrl(seedId, url, depth, extraHops = 0) {
if (this.limitHit) {
return false;

View file

@ -6,7 +6,7 @@ const child_process = require("child_process");
const puppeteer = require("puppeteer-core");
const yargs = require("yargs");
const { getBrowserExe, loadProfile, saveProfile } = require("./util/browser");
const { getBrowserExe, loadProfile, saveProfile, chromeArgs, sleep } = require("./util/browser");
const fs = require("fs");
const path = require("path");
@ -62,6 +62,11 @@ function cliOpts() {
type: "string",
describe: "Browser window dimensions, specified as: width,height",
default: "1600,900"
},
"proxy": {
type: "boolean",
default: false
}
};
}
@ -89,6 +94,23 @@ async function main() {
]);
}
let useProxy = false;
if (params.proxy) {
child_process.spawn("wayback", ["--live", "--proxy", "live"], {stdio: "inherit", cwd: "/tmp"});
console.log("Running with pywb proxy");
await sleep(3000);
useProxy = true;
}
const browserArgs = chromeArgs(useProxy, null, [
"--remote-debugging-port=9221",
`--window-size=${params.windowSize}`,
]);
//await new Promise(resolve => setTimeout(resolve, 2000));
const profileDir = await loadProfile(params.profile);
@ -96,15 +118,7 @@ async function main() {
headless: !!params.headless,
executablePath: getBrowserExe(),
ignoreHTTPSErrors: true,
args: [
"--no-xshm",
"--no-sandbox",
"--disable-background-media-suspend",
"--autoplay-policy=no-user-gesture-required",
"--disable-features=IsolateOrigins,site-per-process",
"--remote-debugging-port=9221",
`--window-size=${params.windowSize}`
],
args: browserArgs,
userDataDir: profileDir,
defaultViewport: null,
};
@ -126,6 +140,7 @@ async function main() {
await page.setCacheEnabled(false);
if (params.interactive) {
await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});");
// for testing, inject browsertrix-behaviors
await page.evaluateOnNewDocument(behaviors + ";\nself.__bx_behaviors.init();");
}
@ -221,7 +236,7 @@ function promptInput(msg, hidden = false) {
async function handleInteractive(params, browser, page) {
const target = page.target();
const targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=localhost:9222/devtools/page/${target._targetId}&panel=resources`;
const targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${target._targetId}&panel=resources`;
console.log("Creating Profile Interactively...");
child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);
@ -231,7 +246,7 @@ async function handleInteractive(params, browser, page) {
const pathname = parsedUrl.pathname;
if (pathname === "/") {
res.writeHead(200, {"Content-Type": "text/html"});
res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replace("$HOST", parsedUrl.hostname)));
res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replaceAll("$HOST", parsedUrl.hostname)));
} else if (pathname === "/createProfile" && req.method === "POST") {

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.5.0-beta.6",
"version": "0.5.0-beta.7",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",

View file

@ -41,7 +41,7 @@ module.exports.saveProfile = function(profileFilename) {
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: profileDir});
};
module.exports.getBrowserExe = function() {
function getBrowserExe() {
const files = [process.env.BROWSER_BIN, "/usr/bin/google-chrome", "/usr/bin/chromium-browser"];
for (const file of files) {
if (file && fs.existsSync(file)) {
@ -50,6 +50,49 @@ module.exports.getBrowserExe = function() {
}
return null;
}
module.exports.getBrowserExe = getBrowserExe;
/**
 * Build the default Chrome-on-Linux User-Agent string.
 *
 * The Chrome version is read from the output of running the browser
 * executable (as located by getBrowserExe()) with `--version`. If the
 * executable cannot be run, or its output contains no version number,
 * the BROWSER_VERSION environment variable is used instead.
 *
 * @returns {string} the User-Agent string
 */
function getDefaultUA() {
  let version = process.env.BROWSER_VERSION;

  try {
    const output = child_process.execFileSync(getBrowserExe(), ["--version"], {encoding: "utf8"});
    const found = output.match(/[\d.]+/);

    // Only replace the fallback once a version was actually extracted, so the
    // raw `--version` output can never leak into the User-Agent.
    if (found) {
      version = found[0];
    } else {
      console.error(`Unable to find browser version in: ${output}`);
    }
  } catch(e) {
    console.error(e);
  }

  return `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
}
module.exports.getDefaultUA = getDefaultUA;
/**
 * Build the list of command-line flags used to launch Chrome.
 *
 * Order of the result: flags from the CHROME_FLAGS environment variable
 * (space-separated, empty entries dropped), the fixed flags below, the
 * `--user-agent` flag, the caller's extraArgs, and finally (only when
 * `proxy` is truthy) a `--proxy-server` flag built from the PROXY_HOST and
 * PROXY_PORT environment variables.
 *
 * @param proxy - when truthy, append the proxy server flag
 * @param userAgent - User-Agent to set; getDefaultUA() is used when falsy
 * @param extraArgs - additional flags appended after the built-in ones
 * @returns {string[]} the Chrome flags
 */
module.exports.chromeArgs = (proxy, userAgent=null, extraArgs=[]) => {
  const flagsFromEnv = (process.env.CHROME_FLAGS ?? "").split(" ").filter(Boolean);
  const effectiveUA = userAgent || getDefaultUA();

  const baseArgs = [
    ...flagsFromEnv,
    "--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
    "--no-sandbox",
    "--disable-background-media-suspend",
    "--autoplay-policy=no-user-gesture-required",
    "--disable-features=Translate,LazyFrameLoading,IsolateOrigins,site-per-process",
    "--disable-popup-blocking",
    "--disable-backgrounding-occluded-windows",
    `--user-agent=${effectiveUA}`,
    ...extraArgs,
  ];

  if (!proxy) {
    return baseArgs;
  }

  return [
    ...baseArgs,
    `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
  ];
};
@ -79,3 +122,12 @@ module.exports.evaluateWithCLI = async (frame, funcString) => {
return remoteObject.value;
};
/**
 * Resolve after a timer set with setTimeout(…, time) fires.
 *
 * @param time - delay handed to setTimeout
 * @returns {Promise<void>} settles with no value once the timer fires
 */
module.exports.sleep = async (time) => {
  await new Promise((wake) => {
    // call with no arguments, so the promise settles with undefined
    setTimeout(() => wake(), time);
  });
};