mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-08 06:09:48 +00:00
netIdle cleanup + better default for pages where networkIdle times out (#916)
- set default network idle wait (netIdleWait) to 2 seconds
- add netIdleMaxRequests as an option, default to 1 (in case of long-running requests)
- further fix for #913
- avoid accidental logging

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
parent
8c8fd6be08
commit
87edef3362
4 changed files with 26 additions and 18 deletions
|
|
@ -221,11 +221,13 @@ Options:
|
||||||
interrupted, don't run post-crawl pr
|
interrupted, don't run post-crawl pr
|
||||||
ocesses on interrupt
|
ocesses on interrupt
|
||||||
[boolean] [default: false]
|
[boolean] [default: false]
|
||||||
--netIdleWait if set, wait for network idle after
|
--netIdleWait number of seconds to wait for networ
|
||||||
page load and after behaviors are do
|
k idle after page load and after beh
|
||||||
ne (in seconds). if -1 (default), de
|
aviors are done (default: 2)
|
||||||
termine based on scope
|
[number] [default: 2]
|
||||||
[number] [default: -1]
|
--netIdleMaxRequests max active requests allowed for netw
|
||||||
|
ork to be considered idle
|
||||||
|
[default: 1]
|
||||||
--lang if set, sets the language used by th
|
--lang if set, sets the language used by th
|
||||||
e browser, should be ISO 639 languag
|
e browser, should be ISO 639 languag
|
||||||
e[-country] code [string]
|
e[-country] code [string]
|
||||||
|
|
@ -318,6 +320,10 @@ Options:
|
||||||
--sshProxyKnownHostsFile path to SSH known hosts file for SOC
|
--sshProxyKnownHostsFile path to SSH known hosts file for SOC
|
||||||
KS5 over SSH proxy connection
|
KS5 over SSH proxy connection
|
||||||
[string]
|
[string]
|
||||||
|
--extraChromeArgs Extra arguments to pass directly to
|
||||||
|
the Chrome instance (space-separated
|
||||||
|
or multiple --extraChromeArgs)
|
||||||
|
[array] [default: []]
|
||||||
--config Path to YAML config file
|
--config Path to YAML config file
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2329,6 +2329,7 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
try {
|
try {
|
||||||
await this.browser.waitForNetworkIdle(page, {
|
await this.browser.waitForNetworkIdle(page, {
|
||||||
timeout: this.params.netIdleWait * 1000,
|
timeout: this.params.netIdleWait * 1000,
|
||||||
|
concurrency: this.params.netIdleMaxRequests,
|
||||||
});
|
});
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.debug("waitForNetworkIdle timed out, ignoring", details);
|
logger.debug("waitForNetworkIdle timed out, ignoring", details);
|
||||||
|
|
|
||||||
|
|
@ -516,9 +516,15 @@ class ArgParser {
|
||||||
|
|
||||||
netIdleWait: {
|
netIdleWait: {
|
||||||
describe:
|
describe:
|
||||||
"if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
|
"number of seconds to wait for network idle after page load and after behaviors are done (default: 2)",
|
||||||
type: "number",
|
type: "number",
|
||||||
default: -1,
|
default: 2,
|
||||||
|
},
|
||||||
|
|
||||||
|
netIdleMaxRequests: {
|
||||||
|
describe:
|
||||||
|
"max active requests allowed for network to be considered idle",
|
||||||
|
default: 1,
|
||||||
},
|
},
|
||||||
|
|
||||||
lang: {
|
lang: {
|
||||||
|
|
@ -837,15 +843,6 @@ class ArgParser {
|
||||||
|
|
||||||
argv.selectLinks = selectLinks;
|
argv.selectLinks = selectLinks;
|
||||||
|
|
||||||
if (argv.netIdleWait === -1) {
|
|
||||||
if (argv.scopeType === "page" || argv.scopeType === "page-spa") {
|
|
||||||
argv.netIdleWait = 15;
|
|
||||||
} else {
|
|
||||||
argv.netIdleWait = 2;
|
|
||||||
}
|
|
||||||
//logger.debug(`Set netIdleWait to ${argv.netIdleWait} seconds`);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (isQA && !argv.qaSource) {
|
if (isQA && !argv.qaSource) {
|
||||||
logger.fatal("--qaSource required for QA mode");
|
logger.fatal("--qaSource required for QA mode");
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -23,8 +23,11 @@ import puppeteer, {
|
||||||
LaunchOptions,
|
LaunchOptions,
|
||||||
Viewport,
|
Viewport,
|
||||||
CookieData,
|
CookieData,
|
||||||
|
WaitForNetworkIdleOptions,
|
||||||
|
CDPSession,
|
||||||
|
Target,
|
||||||
|
Browser as PptrBrowser,
|
||||||
} from "puppeteer-core";
|
} from "puppeteer-core";
|
||||||
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
|
|
||||||
import { Recorder } from "./recorder.js";
|
import { Recorder } from "./recorder.js";
|
||||||
import { timedRun } from "./timing.js";
|
import { timedRun } from "./timing.js";
|
||||||
import assert from "node:assert";
|
import assert from "node:assert";
|
||||||
|
|
@ -244,6 +247,7 @@ export class Browser {
|
||||||
try {
|
try {
|
||||||
child_process.execSync("rm ./Singleton*", {
|
child_process.execSync("rm ./Singleton*", {
|
||||||
cwd: this.profileDir,
|
cwd: this.profileDir,
|
||||||
|
stdio: "ignore",
|
||||||
});
|
});
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
// ignore
|
// ignore
|
||||||
|
|
@ -700,7 +704,7 @@ export class Browser {
|
||||||
page.on("request", callback);
|
page.on("request", callback);
|
||||||
}
|
}
|
||||||
|
|
||||||
async waitForNetworkIdle(page: Page, params: { timeout?: number }) {
|
async waitForNetworkIdle(page: Page, params: WaitForNetworkIdleOptions) {
|
||||||
return await page.waitForNetworkIdle(params);
|
return await page.waitForNetworkIdle(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue