mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Profile Creation Fix + Cloudflare Wait Support + UserAgent Fix (#128)
* Cloudflare wait improvements (#110 fix) - set navigator.webdriver to false to help with Cloudflare wait - add checkCF() that will detect Cloudflare DDoS page and wait 5 seconds until original page is loaded * chrome args refactor: - move to utils/browser - add LazyFrameLoading disable to fix occasional issues with page.goto() never finishing - add userAgent option * profile creation improvements: - fix loadProfile() missing await - fix url to support running remotely - load shared chromeArgs() - add --proxy to support profile creation through pywb proxy * fix setting custom userAgent (#90) - fix typo that resulted in error - ensure userAgent is applied separate from emulatedDevice - add getDefaultUA() browser util
This commit is contained in:
parent
dedf1cc0ad
commit
5e5efda437
4 changed files with 101 additions and 43 deletions
49
crawler.js
49
crawler.js
|
@ -30,7 +30,7 @@ const { ScreenCaster, WSTransport, RedisPubSubTransport } = require("./util/scre
|
|||
const { parseArgs } = require("./util/argParser");
|
||||
const { initRedis } = require("./util/redis");
|
||||
|
||||
const { getBrowserExe, loadProfile, evaluateWithCLI } = require("./util/browser");
|
||||
const { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI } = require("./util/browser");
|
||||
|
||||
const { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } = require("./util/constants");
|
||||
|
||||
|
@ -113,22 +113,11 @@ class Crawler {
|
|||
return;
|
||||
}
|
||||
|
||||
this.browserExe = getBrowserExe();
|
||||
|
||||
// if device set, it overrides the default Chrome UA
|
||||
if (this.emulateDevice) {
|
||||
this.userAgent = this.emulateDevice.userAgent;
|
||||
} else {
|
||||
let version = process.env.BROWSER_VERSION;
|
||||
|
||||
try {
|
||||
version = child_process.execFileSync(this.browserExe, ["--version"], {encoding: "utf8"});
|
||||
version = version.match(/[\d.]+/)[0];
|
||||
} catch(e) {
|
||||
console.error(e);
|
||||
}
|
||||
|
||||
this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
|
||||
this.userAgent = getDefaultUA();
|
||||
}
|
||||
|
||||
// suffix to append to default userAgent
|
||||
|
@ -202,6 +191,8 @@ class Crawler {
|
|||
opts = {stdio: "ignore", cwd: this.params.cwd};
|
||||
}
|
||||
|
||||
this.browserExe = getBrowserExe();
|
||||
|
||||
this.configureUA();
|
||||
|
||||
this.headers = {"User-Agent": this.userAgent};
|
||||
|
@ -237,21 +228,6 @@ class Crawler {
|
|||
}
|
||||
}
|
||||
|
||||
get chromeArgs() {
|
||||
// Chrome Flags, including proxy server
|
||||
return [
|
||||
...(process.env.CHROME_FLAGS ?? "").split(" ").filter(Boolean),
|
||||
"--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
|
||||
`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
|
||||
"--no-sandbox",
|
||||
"--disable-background-media-suspend",
|
||||
"--autoplay-policy=no-user-gesture-required",
|
||||
"--disable-features=IsolateOrigins,site-per-process",
|
||||
"--disable-popup-blocking",
|
||||
"--disable-backgrounding-occluded-windows",
|
||||
];
|
||||
}
|
||||
|
||||
get puppeteerArgs() {
|
||||
// Puppeter Options
|
||||
return {
|
||||
|
@ -261,7 +237,7 @@ class Crawler {
|
|||
handleSIGTERM: false,
|
||||
handleSIGHUP: false,
|
||||
ignoreHTTPSErrors: true,
|
||||
args: this.chromeArgs,
|
||||
args: chromeArgs(true, this.userAgent),
|
||||
userDataDir: this.profileDir,
|
||||
defaultViewport: null,
|
||||
};
|
||||
|
@ -310,6 +286,8 @@ class Crawler {
|
|||
await page._client.send("Network.setBypassServiceWorker", {bypass: true});
|
||||
}
|
||||
|
||||
await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});");
|
||||
|
||||
if (this.params.behaviorOpts && !page.__bx_inited) {
|
||||
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata));
|
||||
await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
|
||||
|
@ -581,6 +559,8 @@ class Crawler {
|
|||
|
||||
const seed = this.params.scopedSeeds[seedId];
|
||||
|
||||
await this.checkCF(page);
|
||||
|
||||
// skip extraction if at max depth
|
||||
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
|
||||
return;
|
||||
|
@ -649,6 +629,17 @@ class Crawler {
|
|||
}
|
||||
}
|
||||
|
||||
async checkCF(page) {
|
||||
try {
|
||||
while (await page.$("div.cf-browser-verification.cf-im-under-attack")) {
|
||||
this.statusLog("Cloudflare Check Detected, waiting for reload...");
|
||||
await this.sleep(5500);
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn(e);
|
||||
}
|
||||
}
|
||||
|
||||
async queueUrl(seedId, url, depth, extraHops = 0) {
|
||||
if (this.limitHit) {
|
||||
return false;
|
||||
|
|
|
@ -6,7 +6,7 @@ const child_process = require("child_process");
|
|||
const puppeteer = require("puppeteer-core");
|
||||
const yargs = require("yargs");
|
||||
|
||||
const { getBrowserExe, loadProfile, saveProfile } = require("./util/browser");
|
||||
const { getBrowserExe, loadProfile, saveProfile, chromeArgs, sleep } = require("./util/browser");
|
||||
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
|
@ -62,6 +62,11 @@ function cliOpts() {
|
|||
type: "string",
|
||||
describe: "Browser window dimensions, specified as: width,height",
|
||||
default: "1600,900"
|
||||
},
|
||||
|
||||
"proxy": {
|
||||
type: "boolean",
|
||||
default: false
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@ -89,6 +94,23 @@ async function main() {
|
|||
]);
|
||||
}
|
||||
|
||||
let useProxy = false;
|
||||
|
||||
if (params.proxy) {
|
||||
child_process.spawn("wayback", ["--live", "--proxy", "live"], {stdio: "inherit", cwd: "/tmp"});
|
||||
|
||||
console.log("Running with pywb proxy");
|
||||
|
||||
await sleep(3000);
|
||||
|
||||
useProxy = true;
|
||||
}
|
||||
|
||||
const browserArgs = chromeArgs(useProxy, null, [
|
||||
"--remote-debugging-port=9221",
|
||||
`--window-size=${params.windowSize}`,
|
||||
]);
|
||||
|
||||
//await new Promise(resolve => setTimeout(resolve, 2000));
|
||||
const profileDir = await loadProfile(params.profile);
|
||||
|
||||
|
@ -96,15 +118,7 @@ async function main() {
|
|||
headless: !!params.headless,
|
||||
executablePath: getBrowserExe(),
|
||||
ignoreHTTPSErrors: true,
|
||||
args: [
|
||||
"--no-xshm",
|
||||
"--no-sandbox",
|
||||
"--disable-background-media-suspend",
|
||||
"--autoplay-policy=no-user-gesture-required",
|
||||
"--disable-features=IsolateOrigins,site-per-process",
|
||||
"--remote-debugging-port=9221",
|
||||
`--window-size=${params.windowSize}`
|
||||
],
|
||||
args: browserArgs,
|
||||
userDataDir: profileDir,
|
||||
defaultViewport: null,
|
||||
};
|
||||
|
@ -126,6 +140,7 @@ async function main() {
|
|||
await page.setCacheEnabled(false);
|
||||
|
||||
if (params.interactive) {
|
||||
await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});");
|
||||
// for testing, inject browsertrix-behaviors
|
||||
await page.evaluateOnNewDocument(behaviors + ";\nself.__bx_behaviors.init();");
|
||||
}
|
||||
|
@ -221,7 +236,7 @@ function promptInput(msg, hidden = false) {
|
|||
|
||||
async function handleInteractive(params, browser, page) {
|
||||
const target = page.target();
|
||||
const targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=localhost:9222/devtools/page/${target._targetId}&panel=resources`;
|
||||
const targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${target._targetId}&panel=resources`;
|
||||
|
||||
console.log("Creating Profile Interactively...");
|
||||
child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);
|
||||
|
@ -231,7 +246,7 @@ async function handleInteractive(params, browser, page) {
|
|||
const pathname = parsedUrl.pathname;
|
||||
if (pathname === "/") {
|
||||
res.writeHead(200, {"Content-Type": "text/html"});
|
||||
res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replace("$HOST", parsedUrl.hostname)));
|
||||
res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replaceAll("$HOST", parsedUrl.hostname)));
|
||||
|
||||
} else if (pathname === "/createProfile" && req.method === "POST") {
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "browsertrix-crawler",
|
||||
"version": "0.5.0-beta.6",
|
||||
"version": "0.5.0-beta.7",
|
||||
"main": "browsertrix-crawler",
|
||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||
|
|
|
@ -41,7 +41,7 @@ module.exports.saveProfile = function(profileFilename) {
|
|||
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: profileDir});
|
||||
};
|
||||
|
||||
module.exports.getBrowserExe = function() {
|
||||
function getBrowserExe() {
|
||||
const files = [process.env.BROWSER_BIN, "/usr/bin/google-chrome", "/usr/bin/chromium-browser"];
|
||||
for (const file of files) {
|
||||
if (file && fs.existsSync(file)) {
|
||||
|
@ -50,6 +50,49 @@ module.exports.getBrowserExe = function() {
|
|||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
module.exports.getBrowserExe = getBrowserExe;
|
||||
|
||||
|
||||
/**
 * Build the default desktop Chrome user agent string.
 *
 * The Chrome version is the first run of digits/dots found in the `--version`
 * output of the executable returned by getBrowserExe(). If the executable
 * cannot be run, or its output contains no version number, the value of the
 * BROWSER_VERSION environment variable is used instead.
 *
 * @returns {string} user agent string of the form
 *   "Mozilla/5.0 (X11; Linux x86_64) ... Chrome/<version> Safari/537.36"
 */
function getDefaultUA() {
  // fallback, only replaced once a version has actually been parsed
  let version = process.env.BROWSER_VERSION;

  try {
    const output = child_process.execFileSync(getBrowserExe(), ["--version"], {encoding: "utf8"});
    const found = output.match(/[\d.]+/);

    // keep the fallback when the output has no version number, rather than
    // leaking the raw --version output into the user agent
    if (found) {
      version = found[0];
    }
  } catch(e) {
    // best effort: log and continue with the environment fallback
    console.error(e);
  }

  return `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
}
|
||||
|
||||
|
||||
module.exports.getDefaultUA = getDefaultUA;
|
||||
|
||||
|
||||
/**
 * Assemble the Chrome command-line flags shared by the crawler and the
 * profile-creation script.
 *
 * Flag order: flags from the CHROME_FLAGS environment variable (split on
 * spaces, empty entries dropped), the fixed flags below, the user agent,
 * `extraArgs`, and finally the proxy server flag when `proxy` is truthy.
 *
 * NOTE(review): the proxy flag is built directly from PROXY_HOST / PROXY_PORT;
 * nothing here checks that they are set -- confirm callers guarantee that.
 *
 * @param {boolean} proxy - when truthy, append `--proxy-server=...`
 * @param {?string} userAgent - user agent to use; getDefaultUA() when falsy
 * @param {Iterable<string>} extraArgs - additional flags to include
 * @returns {string[]} the argument list
 */
module.exports.chromeArgs = (proxy, userAgent=null, extraArgs=[]) => {
  const envFlags = (process.env.CHROME_FLAGS ?? "")
    .split(" ")
    .filter((flag) => Boolean(flag));

  const fixedFlags = [
    "--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
    "--no-sandbox",
    "--disable-background-media-suspend",
    "--autoplay-policy=no-user-gesture-required",
    "--disable-features=Translate,LazyFrameLoading,IsolateOrigins,site-per-process",
    "--disable-popup-blocking",
    "--disable-backgrounding-occluded-windows",
  ];

  const effectiveUA = userAgent ? userAgent : getDefaultUA();

  const proxyFlags = proxy
    ? [`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`]
    : [];

  return [
    ...envFlags,
    ...fixedFlags,
    `--user-agent=${effectiveUA}`,
    ...extraArgs,
    ...proxyFlags,
  ];
};
|
||||
|
||||
|
||||
|
@ -79,3 +122,12 @@ module.exports.evaluateWithCLI = async (frame, funcString) => {
|
|||
return remoteObject.value;
|
||||
};
|
||||
|
||||
|
||||
/**
 * Resolve after a delay.
 *
 * @param {number} time - delay handed to setTimeout (milliseconds)
 * @returns {Promise<void>} promise that resolves once the timer fires
 */
module.exports.sleep = async (time) => {
  const timer = new Promise((wake) => {
    setTimeout(wake, time);
  });

  return timer;
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue