mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Default Wait-Time Improvements (#162)
- netIdleWait better defaults: if not set, default to 15 seconds for page/page-spa scope, otherwise to 2 seconds
- default behaviors: include autoscroll in the default behaviors as well
- restart: if the crawl is already done, don't attempt to crawl further; if 'waitOnDone' is set, wait for a signal before exiting
- bump puppeteer-core to 17.1.2
- bump version to 0.7.0-beta.4
This commit is contained in:
parent
5c931275ed
commit
314ee3f730
4 changed files with 38 additions and 16 deletions
14
crawler.js
14
crawler.js
|
@ -492,6 +492,20 @@ class Crawler {
|
||||||
initState = await this.crawlState.getStatus();
|
initState = await this.crawlState.getStatus();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// if already done, don't crawl anymore
|
||||||
|
if (initState === "done") {
|
||||||
|
this.done = true;
|
||||||
|
|
||||||
|
if (this.params.waitOnDone) {
|
||||||
|
this.statusLog("Already done, waiting for signal to exit...");
|
||||||
|
|
||||||
|
// wait forever until signal
|
||||||
|
await new Promise(() => {});
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (this.params.generateWACZ) {
|
if (this.params.generateWACZ) {
|
||||||
this.storage = initStorage();
|
this.storage = initStorage();
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "browsertrix-crawler",
|
"name": "browsertrix-crawler",
|
||||||
"version": "0.7.0-beta.3",
|
"version": "0.7.0-beta.4",
|
||||||
"main": "browsertrix-crawler",
|
"main": "browsertrix-crawler",
|
||||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||||
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
|
||||||
|
@ -17,7 +17,7 @@
|
||||||
"minio": "7.0.26",
|
"minio": "7.0.26",
|
||||||
"node-fetch": "^2.6.1",
|
"node-fetch": "^2.6.1",
|
||||||
"puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
|
"puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
|
||||||
"puppeteer-core": "^16.1.1",
|
"puppeteer-core": "^17.1.2",
|
||||||
"request": "^2.88.2",
|
"request": "^2.88.2",
|
||||||
"sitemapper": "^3.1.2",
|
"sitemapper": "^3.1.2",
|
||||||
"uuid": "8.3.2",
|
"uuid": "8.3.2",
|
||||||
|
|
|
@ -201,7 +201,7 @@ class ArgParser {
|
||||||
|
|
||||||
"behaviors": {
|
"behaviors": {
|
||||||
describe: "Which background behaviors to enable on each page",
|
describe: "Which background behaviors to enable on each page",
|
||||||
default: "autoplay,autofetch,siteSpecific",
|
default: "autoplay,autofetch,autoscroll,siteSpecific",
|
||||||
type: "string",
|
type: "string",
|
||||||
},
|
},
|
||||||
|
|
||||||
|
@ -289,9 +289,9 @@ class ArgParser {
|
||||||
},
|
},
|
||||||
|
|
||||||
"netIdleWait": {
|
"netIdleWait": {
|
||||||
describe: "if set, wait for network idle after page load and after behaviors are done (in seconds)",
|
describe: "if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
|
||||||
type: "number",
|
type: "number",
|
||||||
default: 10
|
default: -1
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -416,6 +416,15 @@ class ArgParser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (argv.netIdleWait === -1) {
|
||||||
|
if (argv.scopeType === "page" || argv.scopeType === "page-spa") {
|
||||||
|
argv.netIdleWait = 15;
|
||||||
|
} else {
|
||||||
|
argv.netIdleWait = 2;
|
||||||
|
}
|
||||||
|
console.log(`Set netIdleWait to ${argv.netIdleWait} seconds`);
|
||||||
|
}
|
||||||
|
|
||||||
if (argv.include) {
|
if (argv.include) {
|
||||||
if (argv.scopeType && argv.scopeType !== "custom") {
|
if (argv.scopeType && argv.scopeType !== "custom") {
|
||||||
console.warn("You've specified a --scopeType and a --scopeIncludeRx / --include regex. The custom scope regex will take precedence, overriding the scopeType");
|
console.warn("You've specified a --scopeType and a --scopeIncludeRx / --include regex. The custom scope regex will take precedence, overriding the scopeType");
|
||||||
|
|
21
yarn.lock
21
yarn.lock
|
@ -1661,10 +1661,10 @@ detect-newline@^3.0.0:
|
||||||
resolved "https://registry.yarnpkg.com/detect-newline/-/detect-newline-3.1.0.tgz#576f5dfc63ae1a192ff192d8ad3af6308991b651"
|
resolved "https://registry.yarnpkg.com/detect-newline/-/detect-newline-3.1.0.tgz#576f5dfc63ae1a192ff192d8ad3af6308991b651"
|
||||||
integrity sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA==
|
integrity sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA==
|
||||||
|
|
||||||
devtools-protocol@0.0.1019158:
|
devtools-protocol@0.0.1036444:
|
||||||
version "0.0.1019158"
|
version "0.0.1036444"
|
||||||
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1019158.tgz#4b08d06108a784a2134313149626ba55f030a86f"
|
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1036444.tgz#a570d3cdde61527c82f9b03919847b8ac7b1c2b9"
|
||||||
integrity sha512-wvq+KscQ7/6spEV7czhnZc9RM/woz1AY+/Vpd8/h2HFMwJSdTliu7f/yr1A6vDdJfKICZsShqsYpEQbdhg8AFQ==
|
integrity sha512-0y4f/T8H9lsESV9kKP1HDUXgHxCdniFeJh6Erq+FbdOEvp/Ydp9t8kcAAM5gOd17pMrTDlFWntoHtzzeTUWKNw==
|
||||||
|
|
||||||
diff-sequences@^26.6.2:
|
diff-sequences@^26.6.2:
|
||||||
version "26.6.2"
|
version "26.6.2"
|
||||||
|
@ -4151,7 +4151,7 @@ pirates@^4.0.1:
|
||||||
dependencies:
|
dependencies:
|
||||||
node-modules-regexp "^1.0.0"
|
node-modules-regexp "^1.0.0"
|
||||||
|
|
||||||
pkg-dir@4.2.0, pkg-dir@^4.2.0:
|
pkg-dir@^4.2.0:
|
||||||
version "4.2.0"
|
version "4.2.0"
|
||||||
resolved "https://registry.yarnpkg.com/pkg-dir/-/pkg-dir-4.2.0.tgz#f099133df7ede422e81d1d8448270eeb3e4261f3"
|
resolved "https://registry.yarnpkg.com/pkg-dir/-/pkg-dir-4.2.0.tgz#f099133df7ede422e81d1d8448270eeb3e4261f3"
|
||||||
integrity sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ==
|
integrity sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ==
|
||||||
|
@ -4246,17 +4246,16 @@ punycode@^2.1.0, punycode@^2.1.1:
|
||||||
dependencies:
|
dependencies:
|
||||||
debug "^4.1.1"
|
debug "^4.1.1"
|
||||||
|
|
||||||
puppeteer-core@^16.1.1:
|
puppeteer-core@^17.1.2:
|
||||||
version "16.1.1"
|
version "17.1.2"
|
||||||
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-16.1.1.tgz#2c26c560934a1c524a767c9ec0818520b7adb22a"
|
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-17.1.2.tgz#fdf109fa2d805fdb007b5abfc83728c545ac9ac0"
|
||||||
integrity sha512-ls+A6t+cbeNtsNIEyWkGoVJRHseEvBhS3NlI2DBFaJNBUG6kUfmAVyColu1ubgy4VuWLKpGUcwrPTVIvNd1Dew==
|
integrity sha512-mUndfkp581aFC9Tj0NQzoQ4kBEiYszvubkAovAfA72cO2VgiAnk4RTeQhgPekdL50+7bvU1JZp+10y2xOBOy0g==
|
||||||
dependencies:
|
dependencies:
|
||||||
cross-fetch "3.1.5"
|
cross-fetch "3.1.5"
|
||||||
debug "4.3.4"
|
debug "4.3.4"
|
||||||
devtools-protocol "0.0.1019158"
|
devtools-protocol "0.0.1036444"
|
||||||
extract-zip "2.0.1"
|
extract-zip "2.0.1"
|
||||||
https-proxy-agent "5.0.1"
|
https-proxy-agent "5.0.1"
|
||||||
pkg-dir "4.2.0"
|
|
||||||
progress "2.0.3"
|
progress "2.0.3"
|
||||||
proxy-from-env "1.1.0"
|
proxy-from-env "1.1.0"
|
||||||
rimraf "3.0.2"
|
rimraf "3.0.2"
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue