mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Add checkCF(), which detects the Cloudflare DDoS-protection interstitial page and waits, re-checking every 5.5 seconds, until the original page has loaded
This commit is contained in:
parent
0fcc89fdb4
commit
d1e53e6e26
2 changed files with 16 additions and 1 deletions
15
crawler.js
15
crawler.js
|
@ -310,6 +310,8 @@ class Crawler {
|
|||
await page._client.send("Network.setBypassServiceWorker", {bypass: true});
|
||||
}
|
||||
|
||||
await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});");
|
||||
|
||||
if (this.params.behaviorOpts && !page.__bx_inited) {
|
||||
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata));
|
||||
await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
|
||||
|
@ -581,6 +583,8 @@ class Crawler {
|
|||
|
||||
const seed = this.params.scopedSeeds[seedId];
|
||||
|
||||
await this.checkCF(page);
|
||||
|
||||
// skip extraction if at max depth
|
||||
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
|
||||
return;
|
||||
|
@ -649,6 +653,17 @@ class Crawler {
|
|||
}
|
||||
}
|
||||
|
||||
async checkCF(page) {
|
||||
try {
|
||||
while (await page.$("div.cf-browser-verification.cf-im-under-attack")) {
|
||||
this.statusLog("Cloudflare Check Detected, waiting for reload...");
|
||||
await this.sleep(5500);
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn(e);
|
||||
}
|
||||
}
|
||||
|
||||
async queueUrl(seedId, url, depth, extraHops = 0) {
|
||||
if (this.limitHit) {
|
||||
return false;
|
||||
|
|
|
@ -126,7 +126,7 @@ async function main() {
|
|||
await page.setCacheEnabled(false);
|
||||
|
||||
if (params.interactive) {
|
||||
await page.evaluateOnNewDocument('Object.defineProperty(navigator, "webdriver", {value: false});');
|
||||
await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});");
|
||||
// for testing, inject browsertrix-behaviors
|
||||
await page.evaluateOnNewDocument(behaviors + ";\nself.__bx_behaviors.init();");
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue