mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Default Wait-Time Improvements (#162)
- netIdleWait: better defaults. If not set, default to 15 seconds for the page/page-spa scopes and to 2 seconds otherwise.
- default behaviors: include autoscroll in the default behaviors as well.
- restart: if the crawl is already done, don't attempt to crawl further. If 'waitOnDone' is set, wait for a signal before exiting.
- bump puppeteer-core to 17.1.2
- bump version to 0.7.0-beta.4
This commit is contained in:
parent
5c931275ed
commit
314ee3f730
4 changed files with 38 additions and 16 deletions
14
crawler.js
14
crawler.js
|
@ -492,6 +492,20 @@ class Crawler {
|
|||
initState = await this.crawlState.getStatus();
|
||||
}
|
||||
|
||||
// if already done, don't crawl anymore
|
||||
if (initState === "done") {
|
||||
this.done = true;
|
||||
|
||||
if (this.params.waitOnDone) {
|
||||
this.statusLog("Already done, waiting for signal to exit...");
|
||||
|
||||
// wait forever until signal
|
||||
await new Promise(() => {});
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.params.generateWACZ) {
|
||||
this.storage = initStorage();
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "browsertrix-crawler",
|
||||
"version": "0.7.0-beta.3",
|
||||
"version": "0.7.0-beta.4",
|
||||
"main": "browsertrix-crawler",
|
||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||
|
@ -17,7 +17,7 @@
|
|||
"minio": "7.0.26",
|
||||
"node-fetch": "^2.6.1",
|
||||
"puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
|
||||
"puppeteer-core": "^16.1.1",
|
||||
"puppeteer-core": "^17.1.2",
|
||||
"request": "^2.88.2",
|
||||
"sitemapper": "^3.1.2",
|
||||
"uuid": "8.3.2",
|
||||
|
|
|
@ -201,7 +201,7 @@ class ArgParser {
|
|||
|
||||
"behaviors": {
|
||||
describe: "Which background behaviors to enable on each page",
|
||||
default: "autoplay,autofetch,siteSpecific",
|
||||
default: "autoplay,autofetch,autoscroll,siteSpecific",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
|
@ -289,9 +289,9 @@ class ArgParser {
|
|||
},
|
||||
|
||||
"netIdleWait": {
|
||||
describe: "if set, wait for network idle after page load and after behaviors are done (in seconds)",
|
||||
describe: "if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
|
||||
type: "number",
|
||||
default: 10
|
||||
default: -1
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@ -416,6 +416,15 @@ class ArgParser {
|
|||
}
|
||||
}
|
||||
|
||||
if (argv.netIdleWait === -1) {
|
||||
if (argv.scopeType === "page" || argv.scopeType === "page-spa") {
|
||||
argv.netIdleWait = 15;
|
||||
} else {
|
||||
argv.netIdleWait = 2;
|
||||
}
|
||||
console.log(`Set netIdleWait to ${argv.netIdleWait} seconds`);
|
||||
}
|
||||
|
||||
if (argv.include) {
|
||||
if (argv.scopeType && argv.scopeType !== "custom") {
|
||||
console.warn("You've specified a --scopeType and a --scopeIncludeRx / --include regex. The custom scope regex will take precedence, overriding the scopeType");
|
||||
|
|
21
yarn.lock
21
yarn.lock
|
@ -1661,10 +1661,10 @@ detect-newline@^3.0.0:
|
|||
resolved "https://registry.yarnpkg.com/detect-newline/-/detect-newline-3.1.0.tgz#576f5dfc63ae1a192ff192d8ad3af6308991b651"
|
||||
integrity sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA==
|
||||
|
||||
devtools-protocol@0.0.1019158:
|
||||
version "0.0.1019158"
|
||||
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1019158.tgz#4b08d06108a784a2134313149626ba55f030a86f"
|
||||
integrity sha512-wvq+KscQ7/6spEV7czhnZc9RM/woz1AY+/Vpd8/h2HFMwJSdTliu7f/yr1A6vDdJfKICZsShqsYpEQbdhg8AFQ==
|
||||
devtools-protocol@0.0.1036444:
|
||||
version "0.0.1036444"
|
||||
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1036444.tgz#a570d3cdde61527c82f9b03919847b8ac7b1c2b9"
|
||||
integrity sha512-0y4f/T8H9lsESV9kKP1HDUXgHxCdniFeJh6Erq+FbdOEvp/Ydp9t8kcAAM5gOd17pMrTDlFWntoHtzzeTUWKNw==
|
||||
|
||||
diff-sequences@^26.6.2:
|
||||
version "26.6.2"
|
||||
|
@ -4151,7 +4151,7 @@ pirates@^4.0.1:
|
|||
dependencies:
|
||||
node-modules-regexp "^1.0.0"
|
||||
|
||||
pkg-dir@4.2.0, pkg-dir@^4.2.0:
|
||||
pkg-dir@^4.2.0:
|
||||
version "4.2.0"
|
||||
resolved "https://registry.yarnpkg.com/pkg-dir/-/pkg-dir-4.2.0.tgz#f099133df7ede422e81d1d8448270eeb3e4261f3"
|
||||
integrity sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ==
|
||||
|
@ -4246,17 +4246,16 @@ punycode@^2.1.0, punycode@^2.1.1:
|
|||
dependencies:
|
||||
debug "^4.1.1"
|
||||
|
||||
puppeteer-core@^16.1.1:
|
||||
version "16.1.1"
|
||||
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-16.1.1.tgz#2c26c560934a1c524a767c9ec0818520b7adb22a"
|
||||
integrity sha512-ls+A6t+cbeNtsNIEyWkGoVJRHseEvBhS3NlI2DBFaJNBUG6kUfmAVyColu1ubgy4VuWLKpGUcwrPTVIvNd1Dew==
|
||||
puppeteer-core@^17.1.2:
|
||||
version "17.1.2"
|
||||
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-17.1.2.tgz#fdf109fa2d805fdb007b5abfc83728c545ac9ac0"
|
||||
integrity sha512-mUndfkp581aFC9Tj0NQzoQ4kBEiYszvubkAovAfA72cO2VgiAnk4RTeQhgPekdL50+7bvU1JZp+10y2xOBOy0g==
|
||||
dependencies:
|
||||
cross-fetch "3.1.5"
|
||||
debug "4.3.4"
|
||||
devtools-protocol "0.0.1019158"
|
||||
devtools-protocol "0.0.1036444"
|
||||
extract-zip "2.0.1"
|
||||
https-proxy-agent "5.0.1"
|
||||
pkg-dir "4.2.0"
|
||||
progress "2.0.3"
|
||||
proxy-from-env "1.1.0"
|
||||
rimraf "3.0.2"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue