mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Wait Default + Logging Improvements (#153)
Improved logging of pywb + redis:
- if 'logging' includes 'pywb', log pywb and redis output to pywb.log and redis.log
- otherwise, just ignore the output (don't print to stdout, as that's too confusing)
- print a message if wb-manager fails, likely due to an existing collection

waitUntil: default to just 'load' to avoid a potential infinite loop; a separate --netIdle option can configure the idle wait

dependency: update to latest puppeteer-core (16.1.0)
This commit is contained in:
parent
a527cc9b36
commit
c5d208024a
4 changed files with 52 additions and 37 deletions
37
crawler.js
37
crawler.js
|
@ -78,7 +78,7 @@ class Crawler {
|
||||||
this.debugLog("Seeds", this.params.scopedSeeds);
|
this.debugLog("Seeds", this.params.scopedSeeds);
|
||||||
|
|
||||||
this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
|
this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
|
||||||
this.capturePrefix = this.captureBasePrefix + "/id_/";
|
this.capturePrefix = process.env.NO_PROXY ? "" : this.captureBasePrefix + "/id_/";
|
||||||
|
|
||||||
this.gotoOpts = {
|
this.gotoOpts = {
|
||||||
waitUntil: this.params.waitUntil,
|
waitUntil: this.params.waitUntil,
|
||||||
|
@ -201,13 +201,31 @@ class Crawler {
|
||||||
return new ScreenCaster(transport, this.params.workers);
|
return new ScreenCaster(transport, this.params.workers);
|
||||||
}
|
}
|
||||||
|
|
||||||
bootstrap() {
|
async bootstrap() {
|
||||||
let opts = {};
|
const logs = path.join(this.collDir, "logs");
|
||||||
if (this.params.logging.includes("pywb")) {
|
|
||||||
opts = {stdio: "inherit", cwd: this.params.cwd};
|
const initRes = child_process.spawnSync("wb-manager", ["init", this.params.collection], {cwd: this.params.cwd});
|
||||||
|
|
||||||
|
if (initRes.status) {
|
||||||
|
console.log("wb-manager init failed, collection likely already exists");
|
||||||
}
|
}
|
||||||
else{
|
|
||||||
|
await fsp.mkdir(logs, {recursive: true});
|
||||||
|
|
||||||
|
let opts = {};
|
||||||
|
let redisStdio;
|
||||||
|
|
||||||
|
if (this.params.logging.includes("pywb")) {
|
||||||
|
const pywbStderr = fs.openSync(path.join(logs, "pywb.log"), "a");
|
||||||
|
const stdio = [process.stdin, pywbStderr, pywbStderr];
|
||||||
|
|
||||||
|
const redisStderr = fs.openSync(path.join(logs, "redis.log"), "a");
|
||||||
|
redisStdio = [process.stdin, redisStderr, redisStderr];
|
||||||
|
|
||||||
|
opts = {stdio, cwd: this.params.cwd};
|
||||||
|
} else {
|
||||||
opts = {stdio: "ignore", cwd: this.params.cwd};
|
opts = {stdio: "ignore", cwd: this.params.cwd};
|
||||||
|
redisStdio = "ignore";
|
||||||
}
|
}
|
||||||
|
|
||||||
this.browserExe = getBrowserExe();
|
this.browserExe = getBrowserExe();
|
||||||
|
@ -218,7 +236,7 @@ class Crawler {
|
||||||
|
|
||||||
const subprocesses = [];
|
const subprocesses = [];
|
||||||
|
|
||||||
subprocesses.push(child_process.spawn("redis-server", {...opts, cwd: "/tmp/"}));
|
subprocesses.push(child_process.spawn("redis-server", {cwd: "/tmp/", stdio: redisStdio}));
|
||||||
|
|
||||||
if (this.params.overwrite) {
|
if (this.params.overwrite) {
|
||||||
console.log(`Clearing ${this.collDir} before starting`);
|
console.log(`Clearing ${this.collDir} before starting`);
|
||||||
|
@ -229,8 +247,6 @@ class Crawler {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
child_process.spawnSync("wb-manager", ["init", this.params.collection], opts);
|
|
||||||
|
|
||||||
opts.env = {...process.env, COLL: this.params.collection, ROLLOVER_SIZE: this.params.rolloverSize};
|
opts.env = {...process.env, COLL: this.params.collection, ROLLOVER_SIZE: this.params.rolloverSize};
|
||||||
|
|
||||||
subprocesses.push(child_process.spawn("uwsgi", [path.join(__dirname, "uwsgi.ini")], opts));
|
subprocesses.push(child_process.spawn("uwsgi", [path.join(__dirname, "uwsgi.ini")], opts));
|
||||||
|
@ -272,9 +288,8 @@ class Crawler {
|
||||||
}
|
}
|
||||||
|
|
||||||
async run() {
|
async run() {
|
||||||
await fsp.mkdir(this.params.cwd, {recursive: true});
|
await this.bootstrap();
|
||||||
|
|
||||||
this.bootstrap();
|
|
||||||
let status;
|
let status;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -17,7 +17,7 @@
|
||||||
"minio": "7.0.26",
|
"minio": "7.0.26",
|
||||||
"node-fetch": "^2.6.1",
|
"node-fetch": "^2.6.1",
|
||||||
"puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
|
"puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
|
||||||
"puppeteer-core": "^13.3.2",
|
"puppeteer-core": "16.1.0",
|
||||||
"request": "^2.88.2",
|
"request": "^2.88.2",
|
||||||
"sitemapper": "^3.1.2",
|
"sitemapper": "^3.1.2",
|
||||||
"uuid": "8.3.2",
|
"uuid": "8.3.2",
|
||||||
|
|
|
@ -53,7 +53,7 @@ class ArgParser {
|
||||||
|
|
||||||
"waitUntil": {
|
"waitUntil": {
|
||||||
describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separate by ','",
|
describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separate by ','",
|
||||||
default: "load,networkidle2",
|
default: "load",
|
||||||
},
|
},
|
||||||
|
|
||||||
"depth": {
|
"depth": {
|
||||||
|
|
48
yarn.lock
48
yarn.lock
|
@ -1558,10 +1558,10 @@ debug@4, debug@^4.0.1, debug@^4.1.0, debug@^4.1.1, debug@^4.3.1:
|
||||||
dependencies:
|
dependencies:
|
||||||
ms "2.1.2"
|
ms "2.1.2"
|
||||||
|
|
||||||
debug@4.3.3:
|
debug@4.3.4:
|
||||||
version "4.3.3"
|
version "4.3.4"
|
||||||
resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.3.tgz#04266e0b70a98d4462e6e288e38259213332b664"
|
resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.4.tgz#1319f6579357f2338d3337d2cdd4914bb5dcc865"
|
||||||
integrity sha512-/zxw5+vh1Tfv+4Qn7a5nsbcJKPaSvCDhojn6FEl9vupwK2VCSDtEiEtqr8DFtzYFOdz63LBkxec7DYuc2jon6Q==
|
integrity sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==
|
||||||
dependencies:
|
dependencies:
|
||||||
ms "2.1.2"
|
ms "2.1.2"
|
||||||
|
|
||||||
|
@ -1661,10 +1661,10 @@ detect-newline@^3.0.0:
|
||||||
resolved "https://registry.yarnpkg.com/detect-newline/-/detect-newline-3.1.0.tgz#576f5dfc63ae1a192ff192d8ad3af6308991b651"
|
resolved "https://registry.yarnpkg.com/detect-newline/-/detect-newline-3.1.0.tgz#576f5dfc63ae1a192ff192d8ad3af6308991b651"
|
||||||
integrity sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA==
|
integrity sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA==
|
||||||
|
|
||||||
devtools-protocol@0.0.960912:
|
devtools-protocol@0.0.1019158:
|
||||||
version "0.0.960912"
|
version "0.0.1019158"
|
||||||
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.960912.tgz#411c1fa355eddb72f06c4a8743f2808766db6245"
|
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1019158.tgz#4b08d06108a784a2134313149626ba55f030a86f"
|
||||||
integrity sha512-I3hWmV9rWHbdnUdmMKHF2NuYutIM2kXz2mdXW8ha7TbRlGTVs+PF+PsB5QWvpCek4Fy9B+msiispCfwlhG5Sqg==
|
integrity sha512-wvq+KscQ7/6spEV7czhnZc9RM/woz1AY+/Vpd8/h2HFMwJSdTliu7f/yr1A6vDdJfKICZsShqsYpEQbdhg8AFQ==
|
||||||
|
|
||||||
diff-sequences@^26.6.2:
|
diff-sequences@^26.6.2:
|
||||||
version "26.6.2"
|
version "26.6.2"
|
||||||
|
@ -2531,10 +2531,10 @@ http2-wrapper@^1.0.0-beta.5.2:
|
||||||
quick-lru "^5.1.1"
|
quick-lru "^5.1.1"
|
||||||
resolve-alpn "^1.0.0"
|
resolve-alpn "^1.0.0"
|
||||||
|
|
||||||
https-proxy-agent@5.0.0:
|
https-proxy-agent@5.0.1:
|
||||||
version "5.0.0"
|
version "5.0.1"
|
||||||
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-5.0.0.tgz#e2a90542abb68a762e0a0850f6c9edadfd8506b2"
|
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz#c59ef224a04fe8b754f3db0063a25ea30d0005d6"
|
||||||
integrity sha512-EkYm5BcKUGiduxzSt3Eppko+PiNWNEpa4ySk9vTC6wDsQJW9rHSa+UhGNJoRYp7bz6Ht1eaRIa6QaJqO5rCFbA==
|
integrity sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==
|
||||||
dependencies:
|
dependencies:
|
||||||
agent-base "6"
|
agent-base "6"
|
||||||
debug "4"
|
debug "4"
|
||||||
|
@ -4246,23 +4246,23 @@ punycode@^2.1.0, punycode@^2.1.1:
|
||||||
dependencies:
|
dependencies:
|
||||||
debug "^4.1.1"
|
debug "^4.1.1"
|
||||||
|
|
||||||
puppeteer-core@^13.3.2:
|
puppeteer-core@^16.1.0:
|
||||||
version "13.3.2"
|
version "16.1.0"
|
||||||
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-13.3.2.tgz#03b47c776fea881df69e7a55559848434f8110f3"
|
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-16.1.0.tgz#0485312363e6e1d65889d4b31de677bd36f872e4"
|
||||||
integrity sha512-9T8deXmLWf55/RvDpl32vP68stTufqvtj6fc9hH09ZwCLh5IwnN9Z0MWHfDMTLiW6MUpW2Flx5CQWt1SCUT47g==
|
integrity sha512-Eu9FCqdWU2PU/RY53sa+JTsbFiQg5fJyaHX5DP0WZ4+lVLVdMfR9dwPimRkSl9NEcArm7lZMpiDlVCYelE90ZA==
|
||||||
dependencies:
|
dependencies:
|
||||||
cross-fetch "3.1.5"
|
cross-fetch "3.1.5"
|
||||||
debug "4.3.3"
|
debug "4.3.4"
|
||||||
devtools-protocol "0.0.960912"
|
devtools-protocol "0.0.1019158"
|
||||||
extract-zip "2.0.1"
|
extract-zip "2.0.1"
|
||||||
https-proxy-agent "5.0.0"
|
https-proxy-agent "5.0.1"
|
||||||
pkg-dir "4.2.0"
|
pkg-dir "4.2.0"
|
||||||
progress "2.0.3"
|
progress "2.0.3"
|
||||||
proxy-from-env "1.1.0"
|
proxy-from-env "1.1.0"
|
||||||
rimraf "3.0.2"
|
rimraf "3.0.2"
|
||||||
tar-fs "2.1.1"
|
tar-fs "2.1.1"
|
||||||
unbzip2-stream "1.4.3"
|
unbzip2-stream "1.4.3"
|
||||||
ws "8.5.0"
|
ws "8.8.1"
|
||||||
|
|
||||||
pvtsutils@^1.1.2, pvtsutils@^1.1.6:
|
pvtsutils@^1.1.2, pvtsutils@^1.1.6:
|
||||||
version "1.1.6"
|
version "1.1.6"
|
||||||
|
@ -5453,10 +5453,10 @@ write-file-atomic@^3.0.0:
|
||||||
signal-exit "^3.0.2"
|
signal-exit "^3.0.2"
|
||||||
typedarray-to-buffer "^3.1.5"
|
typedarray-to-buffer "^3.1.5"
|
||||||
|
|
||||||
ws@8.5.0:
|
ws@8.8.1:
|
||||||
version "8.5.0"
|
version "8.8.1"
|
||||||
resolved "https://registry.yarnpkg.com/ws/-/ws-8.5.0.tgz#bfb4be96600757fe5382de12c670dab984a1ed4f"
|
resolved "https://registry.yarnpkg.com/ws/-/ws-8.8.1.tgz#5dbad0feb7ade8ecc99b830c1d77c913d4955ff0"
|
||||||
integrity sha512-BWX0SWVgLPzYwF8lTzEy1egjhS4S4OEAHfsO8o65WOVsrnSRGaSiUaa9e0ggGlkMTtBlmOpEXiie9RUcBO86qg==
|
integrity sha512-bGy2JzvzkPowEJV++hF07hAD6niYSr0JzBNo/J29WsB57A2r7Wlc1UFcTR9IzrPvuNVO4B8LGqF8qcpsVOhJCA==
|
||||||
|
|
||||||
ws@^7.4.4:
|
ws@^7.4.4:
|
||||||
version "7.4.5"
|
version "7.4.5"
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue