mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Add checkCF(), which detects the Cloudflare DDoS-protection page and waits, re-checking roughly every 5 seconds, until the original page is loaded
This commit is contained in:
parent
0fcc89fdb4
commit
d1e53e6e26
2 changed files with 16 additions and 1 deletions
15
crawler.js
15
crawler.js
|
@ -310,6 +310,8 @@ class Crawler {
|
||||||
await page._client.send("Network.setBypassServiceWorker", {bypass: true});
|
await page._client.send("Network.setBypassServiceWorker", {bypass: true});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});");
|
||||||
|
|
||||||
if (this.params.behaviorOpts && !page.__bx_inited) {
|
if (this.params.behaviorOpts && !page.__bx_inited) {
|
||||||
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata));
|
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata));
|
||||||
await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
|
await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
|
||||||
|
@ -581,6 +583,8 @@ class Crawler {
|
||||||
|
|
||||||
const seed = this.params.scopedSeeds[seedId];
|
const seed = this.params.scopedSeeds[seedId];
|
||||||
|
|
||||||
|
await this.checkCF(page);
|
||||||
|
|
||||||
// skip extraction if at max depth
|
// skip extraction if at max depth
|
||||||
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
|
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
|
||||||
return;
|
return;
|
||||||
|
@ -649,6 +653,17 @@ class Crawler {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async checkCF(page) {
|
||||||
|
try {
|
||||||
|
while (await page.$("div.cf-browser-verification.cf-im-under-attack")) {
|
||||||
|
this.statusLog("Cloudflare Check Detected, waiting for reload...");
|
||||||
|
await this.sleep(5500);
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.warn(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async queueUrl(seedId, url, depth, extraHops = 0) {
|
async queueUrl(seedId, url, depth, extraHops = 0) {
|
||||||
if (this.limitHit) {
|
if (this.limitHit) {
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -126,7 +126,7 @@ async function main() {
|
||||||
await page.setCacheEnabled(false);
|
await page.setCacheEnabled(false);
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
await page.evaluateOnNewDocument('Object.defineProperty(navigator, "webdriver", {value: false});');
|
await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});");
|
||||||
// for testing, inject browsertrix-behaviors
|
// for testing, inject browsertrix-behaviors
|
||||||
await page.evaluateOnNewDocument(behaviors + ";\nself.__bx_behaviors.init();");
|
await page.evaluateOnNewDocument(behaviors + ";\nself.__bx_behaviors.init();");
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue