diff --git a/crawler.js b/crawler.js index e864efe2..d98857ff 100644 --- a/crawler.js +++ b/crawler.js @@ -30,7 +30,7 @@ const { ScreenCaster, WSTransport, RedisPubSubTransport } = require("./util/scre const { parseArgs } = require("./util/argParser"); const { initRedis } = require("./util/redis"); -const { getBrowserExe, loadProfile, evaluateWithCLI } = require("./util/browser"); +const { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI } = require("./util/browser"); const { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } = require("./util/constants"); @@ -113,22 +113,11 @@ class Crawler { return; } - this.browserExe = getBrowserExe(); - // if device set, it overrides the default Chrome UA if (this.emulateDevice) { this.userAgent = this.emulateDevice.userAgent; } else { - let version = process.env.BROWSER_VERSION; - - try { - version = child_process.execFileSync(this.browserExe, ["--version"], {encoding: "utf8"}); - version = version.match(/[\d.]+/)[0]; - } catch(e) { - console.error(e); - } - - this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`; + this.userAgent = getDefaultUA(); } // suffix to append to default userAgent @@ -202,6 +191,8 @@ class Crawler { opts = {stdio: "ignore", cwd: this.params.cwd}; } + this.browserExe = getBrowserExe(); + this.configureUA(); this.headers = {"User-Agent": this.userAgent}; @@ -237,21 +228,6 @@ class Crawler { } } - get chromeArgs() { - // Chrome Flags, including proxy server - return [ - ...(process.env.CHROME_FLAGS ?? "").split(" ").filter(Boolean), - "--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically) - `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`, - "--no-sandbox", - "--disable-background-media-suspend", - "--autoplay-policy=no-user-gesture-required", - "--disable-features=IsolateOrigins,site-per-process", - "--disable-popup-blocking", - "--disable-backgrounding-occluded-windows", - ]; - } - get puppeteerArgs() { // Puppeter Options return { @@ -261,7 +237,7 @@ class Crawler { handleSIGTERM: false, handleSIGHUP: false, ignoreHTTPSErrors: true, - args: this.chromeArgs, + args: chromeArgs(true, this.userAgent), userDataDir: this.profileDir, defaultViewport: null, }; @@ -310,6 +286,8 @@ class Crawler { await page._client.send("Network.setBypassServiceWorker", {bypass: true}); } + await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});"); + if (this.params.behaviorOpts && !page.__bx_inited) { await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata)); await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`); @@ -581,6 +559,8 @@ class Crawler { const seed = this.params.scopedSeeds[seedId]; + await this.checkCF(page); + // skip extraction if at max depth if (seed.isAtMaxDepth(depth) || !selectorOptsList) { return; @@ -649,6 +629,17 @@ class Crawler { } } + async checkCF(page) { + try { + while (await page.$("div.cf-browser-verification.cf-im-under-attack")) { + this.statusLog("Cloudflare Check Detected, waiting for reload..."); + await this.sleep(5500); + } + } catch (e) { + console.warn(e); + } + } + async queueUrl(seedId, url, depth, extraHops = 0) { if (this.limitHit) { return false; diff --git a/create-login-profile.js b/create-login-profile.js index f6144556..6e1f3915 100755 --- a/create-login-profile.js +++ b/create-login-profile.js @@ -6,7 +6,7 @@ const child_process = require("child_process"); const puppeteer = require("puppeteer-core"); const yargs = require("yargs"); -const { getBrowserExe, loadProfile, saveProfile } = require("./util/browser"); +const { getBrowserExe, loadProfile, saveProfile, chromeArgs, sleep } = require("./util/browser"); const fs = require("fs"); const path = require("path"); @@ -62,6 +62,11 @@ function cliOpts() { type: "string", describe: "Browser window dimensions, specified as: width,height", default: "1600,900" + }, + + "proxy": { + type: "boolean", + default: false } }; } @@ -89,6 +94,23 @@ async function main() { ]); } + let useProxy = false; + + if (params.proxy) { + child_process.spawn("wayback", ["--live", "--proxy", "live"], {stdio: "inherit", cwd: "/tmp"}); + + console.log("Running with pywb proxy"); + + await sleep(3000); + + useProxy = true; + } + + const browserArgs = chromeArgs(useProxy, null, [ + "--remote-debugging-port=9221", + `--window-size=${params.windowSize}`, + ]); + //await new Promise(resolve => setTimeout(resolve, 2000)); const profileDir = await loadProfile(params.profile); @@ -96,15 +118,7 @@ async function main() { headless: !!params.headless, executablePath: getBrowserExe(), ignoreHTTPSErrors: true, - args: [ - "--no-xshm", - "--no-sandbox", - "--disable-background-media-suspend", - "--autoplay-policy=no-user-gesture-required", - "--disable-features=IsolateOrigins,site-per-process", - "--remote-debugging-port=9221", - `--window-size=${params.windowSize}` - ], + args: browserArgs, userDataDir: profileDir, defaultViewport: null, }; @@ -126,6 +140,7 @@ async function main() { await page.setCacheEnabled(false); if (params.interactive) { + await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});"); // for testing, inject browsertrix-behaviors await page.evaluateOnNewDocument(behaviors + ";\nself.__bx_behaviors.init();"); } @@ -221,7 +236,7 @@ function promptInput(msg, hidden = false) { async function handleInteractive(params, browser, page) { const target = page.target(); - const targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=localhost:9222/devtools/page/${target._targetId}&panel=resources`; + const targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${target._targetId}&panel=resources`; console.log("Creating Profile Interactively..."); child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]); @@ -231,7 +246,7 @@ async function handleInteractive(params, browser, page) { const pathname = parsedUrl.pathname; if (pathname === "/") { res.writeHead(200, {"Content-Type": "text/html"}); - res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replace("$HOST", parsedUrl.hostname))); + res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replaceAll("$HOST", parsedUrl.hostname))); } else if (pathname === "/createProfile" && req.method === "POST") { diff --git a/package.json b/package.json index b2e7c828..4a83b417 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "0.5.0-beta.6", + "version": "0.5.0-beta.7", "main": "browsertrix-crawler", "repository": "https://github.com/webrecorder/browsertrix-crawler", "author": "Ilya Kreymer , Webrecorder Software", diff --git a/util/browser.js b/util/browser.js index f77ab4d2..68e41a93 100644 --- a/util/browser.js +++ b/util/browser.js @@ -41,7 +41,7 @@ module.exports.saveProfile = function(profileFilename) { child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: profileDir}); }; -module.exports.getBrowserExe = function() { +function getBrowserExe() { const files = [process.env.BROWSER_BIN, "/usr/bin/google-chrome", "/usr/bin/chromium-browser"]; for (const file of files) { if (file && fs.existsSync(file)) { @@ -50,6 +50,49 @@ module.exports.getBrowserExe = function() { } return null; +} + + +module.exports.getBrowserExe = getBrowserExe; + + +function getDefaultUA() { + let version = process.env.BROWSER_VERSION; + + try { + version = child_process.execFileSync(getBrowserExe(), ["--version"], {encoding: "utf8"}); + version = version.match(/[\d.]+/)[0]; + } catch(e) { + console.error(e); + } + + return `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`; +} + + +module.exports.getDefaultUA = getDefaultUA; + + +module.exports.chromeArgs = (proxy, userAgent=null, extraArgs=[]) => { + // Chrome Flags, including proxy server + const args = [ + ...(process.env.CHROME_FLAGS ?? "").split(" ").filter(Boolean), + "--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically) + "--no-sandbox", + "--disable-background-media-suspend", + "--autoplay-policy=no-user-gesture-required", + "--disable-features=Translate,LazyFrameLoading,IsolateOrigins,site-per-process", + "--disable-popup-blocking", + "--disable-backgrounding-occluded-windows", + `--user-agent=${userAgent || getDefaultUA()}`, + ...extraArgs, + ]; + + if (proxy) { + args.push(`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`); + } + + return args; }; @@ -79,3 +122,12 @@ module.exports.evaluateWithCLI = async (frame, funcString) => { return remoteObject.value; }; + +module.exports.sleep = async (time) => { + return new Promise(resolve => setTimeout(resolve, time)); +}; + + + + +