netIdle cleanup + better default for pages where networkIdle times out (#916)

- set default netIdleWait to 2 (seconds)
- add netIdleMaxRequests as an option, default to 1 (in case of
long-running requests)
- further fix for #913 
- avoid accidental logging from the `rm ./Singleton*` cleanup command (set stdio to "ignore")

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
Ilya Kreymer 2025-11-18 16:34:02 -08:00 committed by GitHub
parent 8c8fd6be08
commit 87edef3362
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 26 additions and 18 deletions

View file

@ -221,11 +221,13 @@ Options:
interrupted, don't run post-crawl pr
ocesses on interrupt
[boolean] [default: false]
--netIdleWait if set, wait for network idle after
page load and after behaviors are do
ne (in seconds). if -1 (default), de
termine based on scope
[number] [default: -1]
--netIdleWait number of seconds to wait for networ
k idle after page load and after beh
aviors are done (default: 2)
[number] [default: 2]
--netIdleMaxRequests max active requests allowed for netw
ork to be considered idle
[default: 1]
--lang if set, sets the language used by th
e browser, should be ISO 639 languag
e[-country] code [string]
@ -318,6 +320,10 @@ Options:
--sshProxyKnownHostsFile path to SSH known hosts file for SOC
KS5 over SSH proxy connection
[string]
--extraChromeArgs Extra arguments to pass directly to
the Chrome instance (space-separated
or multiple --extraChromeArgs)
[array] [default: []]
--config Path to YAML config file
```

View file

@ -2329,6 +2329,7 @@ self.__bx_behaviors.selectMainBehavior();
try {
await this.browser.waitForNetworkIdle(page, {
timeout: this.params.netIdleWait * 1000,
concurrency: this.params.netIdleMaxRequests,
});
} catch (e) {
logger.debug("waitForNetworkIdle timed out, ignoring", details);

View file

@ -516,9 +516,15 @@ class ArgParser {
netIdleWait: {
describe:
"if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
"number of seconds to wait for network idle after page load and after behaviors are done (default: 2)",
type: "number",
default: -1,
default: 2,
},
netIdleMaxRequests: {
describe:
"max active requests allowed for network to be considered idle",
default: 1,
},
lang: {
@ -837,15 +843,6 @@ class ArgParser {
argv.selectLinks = selectLinks;
if (argv.netIdleWait === -1) {
if (argv.scopeType === "page" || argv.scopeType === "page-spa") {
argv.netIdleWait = 15;
} else {
argv.netIdleWait = 2;
}
//logger.debug(`Set netIdleWait to ${argv.netIdleWait} seconds`);
}
if (isQA && !argv.qaSource) {
logger.fatal("--qaSource required for QA mode");
}

View file

@ -23,8 +23,11 @@ import puppeteer, {
LaunchOptions,
Viewport,
CookieData,
WaitForNetworkIdleOptions,
CDPSession,
Target,
Browser as PptrBrowser,
} from "puppeteer-core";
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
import { Recorder } from "./recorder.js";
import { timedRun } from "./timing.js";
import assert from "node:assert";
@ -244,6 +247,7 @@ export class Browser {
try {
child_process.execSync("rm ./Singleton*", {
cwd: this.profileDir,
stdio: "ignore",
});
} catch (e) {
// ignore
@ -700,7 +704,7 @@ export class Browser {
page.on("request", callback);
}
async waitForNetworkIdle(page: Page, params: { timeout?: number }) {
async waitForNetworkIdle(page: Page, params: WaitForNetworkIdleOptions) {
return await page.waitForNetworkIdle(params);
}