Default Wait-Time Improvements (#162)

- netIdleWait better defaults: if not set, set to 15 seconds for page/page-spa scope, otherwise to 2 seconds
- default behaviors: include autoscroll in default behavior as well
- restart: if crawl already done, don't attempt to crawl further. if 'waitOnDone' set, wait for signal before exiting.
- bump to puppeteer-core 17.1.2
- bump to 0.7.0-beta.4
This commit is contained in:
Ilya Kreymer 2022-09-08 23:39:26 -07:00 committed by GitHub
parent 5c931275ed
commit 314ee3f730
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 38 additions and 16 deletions

View file

@ -492,6 +492,20 @@ class Crawler {
initState = await this.crawlState.getStatus();
}
// if already done, don't crawl anymore
if (initState === "done") {
this.done = true;
if (this.params.waitOnDone) {
this.statusLog("Already done, waiting for signal to exit...");
// wait forever until signal
await new Promise(() => {});
}
return;
}
if (this.params.generateWACZ) {
this.storage = initStorage();
}

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.7.0-beta.3",
"version": "0.7.0-beta.4",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
@ -17,7 +17,7 @@
"minio": "7.0.26",
"node-fetch": "^2.6.1",
"puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
"puppeteer-core": "^16.1.1",
"puppeteer-core": "^17.1.2",
"request": "^2.88.2",
"sitemapper": "^3.1.2",
"uuid": "8.3.2",

View file

@ -201,7 +201,7 @@ class ArgParser {
"behaviors": {
describe: "Which background behaviors to enable on each page",
default: "autoplay,autofetch,siteSpecific",
default: "autoplay,autofetch,autoscroll,siteSpecific",
type: "string",
},
@ -289,9 +289,9 @@ class ArgParser {
},
"netIdleWait": {
describe: "if set, wait for network idle after page load and after behaviors are done (in seconds)",
describe: "if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
type: "number",
default: 10
default: -1
}
};
}
@ -416,6 +416,15 @@ class ArgParser {
}
}
if (argv.netIdleWait === -1) {
if (argv.scopeType === "page" || argv.scopeType === "page-spa") {
argv.netIdleWait = 15;
} else {
argv.netIdleWait = 2;
}
console.log(`Set netIdleWait to ${argv.netIdleWait} seconds`);
}
if (argv.include) {
if (argv.scopeType && argv.scopeType !== "custom") {
console.warn("You've specified a --scopeType and a --scopeIncludeRx / --include regex. The custom scope regex will take precedence, overriding the scopeType");

View file

@ -1661,10 +1661,10 @@ detect-newline@^3.0.0:
resolved "https://registry.yarnpkg.com/detect-newline/-/detect-newline-3.1.0.tgz#576f5dfc63ae1a192ff192d8ad3af6308991b651"
integrity sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA==
devtools-protocol@0.0.1019158:
version "0.0.1019158"
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1019158.tgz#4b08d06108a784a2134313149626ba55f030a86f"
integrity sha512-wvq+KscQ7/6spEV7czhnZc9RM/woz1AY+/Vpd8/h2HFMwJSdTliu7f/yr1A6vDdJfKICZsShqsYpEQbdhg8AFQ==
devtools-protocol@0.0.1036444:
version "0.0.1036444"
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1036444.tgz#a570d3cdde61527c82f9b03919847b8ac7b1c2b9"
integrity sha512-0y4f/T8H9lsESV9kKP1HDUXgHxCdniFeJh6Erq+FbdOEvp/Ydp9t8kcAAM5gOd17pMrTDlFWntoHtzzeTUWKNw==
diff-sequences@^26.6.2:
version "26.6.2"
@ -4151,7 +4151,7 @@ pirates@^4.0.1:
dependencies:
node-modules-regexp "^1.0.0"
pkg-dir@4.2.0, pkg-dir@^4.2.0:
pkg-dir@^4.2.0:
version "4.2.0"
resolved "https://registry.yarnpkg.com/pkg-dir/-/pkg-dir-4.2.0.tgz#f099133df7ede422e81d1d8448270eeb3e4261f3"
integrity sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ==
@ -4246,17 +4246,16 @@ punycode@^2.1.0, punycode@^2.1.1:
dependencies:
debug "^4.1.1"
puppeteer-core@^16.1.1:
version "16.1.1"
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-16.1.1.tgz#2c26c560934a1c524a767c9ec0818520b7adb22a"
integrity sha512-ls+A6t+cbeNtsNIEyWkGoVJRHseEvBhS3NlI2DBFaJNBUG6kUfmAVyColu1ubgy4VuWLKpGUcwrPTVIvNd1Dew==
puppeteer-core@^17.1.2:
version "17.1.2"
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-17.1.2.tgz#fdf109fa2d805fdb007b5abfc83728c545ac9ac0"
integrity sha512-mUndfkp581aFC9Tj0NQzoQ4kBEiYszvubkAovAfA72cO2VgiAnk4RTeQhgPekdL50+7bvU1JZp+10y2xOBOy0g==
dependencies:
cross-fetch "3.1.5"
debug "4.3.4"
devtools-protocol "0.0.1019158"
devtools-protocol "0.0.1036444"
extract-zip "2.0.1"
https-proxy-agent "5.0.1"
pkg-dir "4.2.0"
progress "2.0.3"
proxy-from-env "1.1.0"
rimraf "3.0.2"