From c5d208024a2223fd9b4afa7114a6c3dbfd6fab4c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 11 Aug 2022 18:44:39 -0700 Subject: [PATCH] Wait Default + Logging Improvements (#153) improved logging of pywb + redis: - if 'logging' includes 'pywb', log pywb and redis output, to pywb.log and redis.log - otherwise, just ignore (don't print to stdout as that's too confusing) - print if wb-manager fails, likely due to existing collection waitUntil: default to just 'load' to avoid potential infinite loop, separate --netIdle can configure idle wait dependency: update to latest puppeteer-core (16.1.0) --- crawler.js | 37 +++++++++++++++++++++++++----------- package.json | 2 +- util/argParser.js | 2 +- yarn.lock | 48 +++++++++++++++++++++++------------------------ 4 files changed, 52 insertions(+), 37 deletions(-) diff --git a/crawler.js b/crawler.js index b883c809..c4bc6f2e 100644 --- a/crawler.js +++ b/crawler.js @@ -78,7 +78,7 @@ class Crawler { this.debugLog("Seeds", this.params.scopedSeeds); this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`; - this.capturePrefix = this.captureBasePrefix + "/id_/"; + this.capturePrefix = process.env.NO_PROXY ? "" : this.captureBasePrefix + "/id_/"; this.gotoOpts = { waitUntil: this.params.waitUntil, @@ -201,13 +201,31 @@ class Crawler { return new ScreenCaster(transport, this.params.workers); } - bootstrap() { - let opts = {}; - if (this.params.logging.includes("pywb")) { - opts = {stdio: "inherit", cwd: this.params.cwd}; + async bootstrap() { + const logs = path.join(this.collDir, "logs"); + + const initRes = child_process.spawnSync("wb-manager", ["init", this.params.collection], {cwd: this.params.cwd}); + + if (initRes.status) { + console.log("wb-manager init failed, collection likely already exists"); } - else{ + + await fsp.mkdir(logs, {recursive: true}); + + let opts = {}; + let redisStdio; + + if (this.params.logging.includes("pywb")) { + const pywbStderr = fs.openSync(path.join(logs, "pywb.log"), "a"); + const stdio = [process.stdin, pywbStderr, pywbStderr]; + + const redisStderr = fs.openSync(path.join(logs, "redis.log"), "a"); + redisStdio = [process.stdin, redisStderr, redisStderr]; + + opts = {stdio, cwd: this.params.cwd}; + } else { opts = {stdio: "ignore", cwd: this.params.cwd}; + redisStdio = "ignore"; } this.browserExe = getBrowserExe(); @@ -218,7 +236,7 @@ class Crawler { const subprocesses = []; - subprocesses.push(child_process.spawn("redis-server", {...opts, cwd: "/tmp/"})); + subprocesses.push(child_process.spawn("redis-server", {cwd: "/tmp/", stdio: redisStdio})); if (this.params.overwrite) { console.log(`Clearing ${this.collDir} before starting`); @@ -229,8 +247,6 @@ class Crawler { } } - child_process.spawnSync("wb-manager", ["init", this.params.collection], opts); - opts.env = {...process.env, COLL: this.params.collection, ROLLOVER_SIZE: this.params.rolloverSize}; subprocesses.push(child_process.spawn("uwsgi", [path.join(__dirname, "uwsgi.ini")], opts)); @@ -272,9 +288,8 @@ class Crawler { } async run() { - await fsp.mkdir(this.params.cwd, {recursive: true}); + await this.bootstrap(); - this.bootstrap(); let status; try { diff --git a/package.json b/package.json index 0b7a6c4d..37eafd58 100644 --- a/package.json +++ b/package.json @@ -17,7 +17,7 @@ "minio": "7.0.26", "node-fetch": "^2.6.1", "puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue", - "puppeteer-core": "^13.3.2", + "puppeteer-core": "16.1.0", "request": "^2.88.2", "sitemapper": "^3.1.2", "uuid": "8.3.2", diff --git a/util/argParser.js b/util/argParser.js index f06663e6..5af8fa6c 100644 --- a/util/argParser.js +++ b/util/argParser.js @@ -53,7 +53,7 @@ class ArgParser { "waitUntil": { describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separate by ','", - default: "load,networkidle2", + default: "load", }, "depth": { diff --git a/yarn.lock b/yarn.lock index 51f53a68..25bde17c 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1558,10 +1558,10 @@ debug@4, debug@^4.0.1, debug@^4.1.0, debug@^4.1.1, debug@^4.3.1: dependencies: ms "2.1.2" -debug@4.3.3: - version "4.3.3" - resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.3.tgz#04266e0b70a98d4462e6e288e38259213332b664" - integrity sha512-/zxw5+vh1Tfv+4Qn7a5nsbcJKPaSvCDhojn6FEl9vupwK2VCSDtEiEtqr8DFtzYFOdz63LBkxec7DYuc2jon6Q== +debug@4.3.4: + version "4.3.4" + resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.4.tgz#1319f6579357f2338d3337d2cdd4914bb5dcc865" + integrity sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ== dependencies: ms "2.1.2" @@ -1661,10 +1661,10 @@ detect-newline@^3.0.0: resolved "https://registry.yarnpkg.com/detect-newline/-/detect-newline-3.1.0.tgz#576f5dfc63ae1a192ff192d8ad3af6308991b651" integrity sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA== -devtools-protocol@0.0.960912: - version "0.0.960912" - resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.960912.tgz#411c1fa355eddb72f06c4a8743f2808766db6245" - integrity sha512-I3hWmV9rWHbdnUdmMKHF2NuYutIM2kXz2mdXW8ha7TbRlGTVs+PF+PsB5QWvpCek4Fy9B+msiispCfwlhG5Sqg== +devtools-protocol@0.0.1019158: + version "0.0.1019158" + resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1019158.tgz#4b08d06108a784a2134313149626ba55f030a86f" + integrity sha512-wvq+KscQ7/6spEV7czhnZc9RM/woz1AY+/Vpd8/h2HFMwJSdTliu7f/yr1A6vDdJfKICZsShqsYpEQbdhg8AFQ== diff-sequences@^26.6.2: version "26.6.2" @@ -2531,10 +2531,10 @@ http2-wrapper@^1.0.0-beta.5.2: quick-lru "^5.1.1" resolve-alpn "^1.0.0" -https-proxy-agent@5.0.0: - version "5.0.0" - resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-5.0.0.tgz#e2a90542abb68a762e0a0850f6c9edadfd8506b2" - integrity sha512-EkYm5BcKUGiduxzSt3Eppko+PiNWNEpa4ySk9vTC6wDsQJW9rHSa+UhGNJoRYp7bz6Ht1eaRIa6QaJqO5rCFbA== +https-proxy-agent@5.0.1: + version "5.0.1" + resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz#c59ef224a04fe8b754f3db0063a25ea30d0005d6" + integrity sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA== dependencies: agent-base "6" debug "4" @@ -4246,23 +4246,23 @@ punycode@^2.1.0, punycode@^2.1.1: dependencies: debug "^4.1.1" -puppeteer-core@^13.3.2: - version "13.3.2" - resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-13.3.2.tgz#03b47c776fea881df69e7a55559848434f8110f3" - integrity sha512-9T8deXmLWf55/RvDpl32vP68stTufqvtj6fc9hH09ZwCLh5IwnN9Z0MWHfDMTLiW6MUpW2Flx5CQWt1SCUT47g== +puppeteer-core@^16.1.0: + version "16.1.0" + resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-16.1.0.tgz#0485312363e6e1d65889d4b31de677bd36f872e4" + integrity sha512-Eu9FCqdWU2PU/RY53sa+JTsbFiQg5fJyaHX5DP0WZ4+lVLVdMfR9dwPimRkSl9NEcArm7lZMpiDlVCYelE90ZA== dependencies: cross-fetch "3.1.5" - debug "4.3.3" - devtools-protocol "0.0.960912" + debug "4.3.4" + devtools-protocol "0.0.1019158" extract-zip "2.0.1" - https-proxy-agent "5.0.0" + https-proxy-agent "5.0.1" pkg-dir "4.2.0" progress "2.0.3" proxy-from-env "1.1.0" rimraf "3.0.2" tar-fs "2.1.1" unbzip2-stream "1.4.3" - ws "8.5.0" + ws "8.8.1" pvtsutils@^1.1.2, pvtsutils@^1.1.6: version "1.1.6" @@ -5453,10 +5453,10 @@ write-file-atomic@^3.0.0: signal-exit "^3.0.2" typedarray-to-buffer "^3.1.5" -ws@8.5.0: - version "8.5.0" - resolved "https://registry.yarnpkg.com/ws/-/ws-8.5.0.tgz#bfb4be96600757fe5382de12c670dab984a1ed4f" - integrity sha512-BWX0SWVgLPzYwF8lTzEy1egjhS4S4OEAHfsO8o65WOVsrnSRGaSiUaa9e0ggGlkMTtBlmOpEXiie9RUcBO86qg== +ws@8.8.1: + version "8.8.1" + resolved "https://registry.yarnpkg.com/ws/-/ws-8.8.1.tgz#5dbad0feb7ade8ecc99b830c1d77c913d4955ff0" + integrity sha512-bGy2JzvzkPowEJV++hF07hAD6niYSr0JzBNo/J29WsB57A2r7Wlc1UFcTR9IzrPvuNVO4B8LGqF8qcpsVOhJCA== ws@^7.4.4: version "7.4.5"