Wait Default + Logging Improvements (#153)

Improved logging of pywb + redis:
- if 'logging' includes 'pywb', log pywb and redis output to pywb.log and redis.log
- otherwise, ignore their output (don't print to stdout, as that's too confusing)
- print a message if wb-manager init fails, likely due to an existing collection

waitUntil: default to just 'load' to avoid a potential infinite loop; a separate --netIdle option can configure the idle wait
dependency: update to latest puppeteer-core (16.1.0)
This commit is contained in:
Ilya Kreymer 2022-08-11 18:44:39 -07:00 committed by GitHub
parent a527cc9b36
commit c5d208024a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 52 additions and 37 deletions

View file

@ -78,7 +78,7 @@ class Crawler {
this.debugLog("Seeds", this.params.scopedSeeds); this.debugLog("Seeds", this.params.scopedSeeds);
this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`; this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
this.capturePrefix = this.captureBasePrefix + "/id_/"; this.capturePrefix = process.env.NO_PROXY ? "" : this.captureBasePrefix + "/id_/";
this.gotoOpts = { this.gotoOpts = {
waitUntil: this.params.waitUntil, waitUntil: this.params.waitUntil,
@ -201,13 +201,31 @@ class Crawler {
return new ScreenCaster(transport, this.params.workers); return new ScreenCaster(transport, this.params.workers);
} }
bootstrap() { async bootstrap() {
let opts = {}; const logs = path.join(this.collDir, "logs");
if (this.params.logging.includes("pywb")) {
opts = {stdio: "inherit", cwd: this.params.cwd}; const initRes = child_process.spawnSync("wb-manager", ["init", this.params.collection], {cwd: this.params.cwd});
if (initRes.status) {
console.log("wb-manager init failed, collection likely already exists");
} }
else{
await fsp.mkdir(logs, {recursive: true});
let opts = {};
let redisStdio;
if (this.params.logging.includes("pywb")) {
const pywbStderr = fs.openSync(path.join(logs, "pywb.log"), "a");
const stdio = [process.stdin, pywbStderr, pywbStderr];
const redisStderr = fs.openSync(path.join(logs, "redis.log"), "a");
redisStdio = [process.stdin, redisStderr, redisStderr];
opts = {stdio, cwd: this.params.cwd};
} else {
opts = {stdio: "ignore", cwd: this.params.cwd}; opts = {stdio: "ignore", cwd: this.params.cwd};
redisStdio = "ignore";
} }
this.browserExe = getBrowserExe(); this.browserExe = getBrowserExe();
@ -218,7 +236,7 @@ class Crawler {
const subprocesses = []; const subprocesses = [];
subprocesses.push(child_process.spawn("redis-server", {...opts, cwd: "/tmp/"})); subprocesses.push(child_process.spawn("redis-server", {cwd: "/tmp/", stdio: redisStdio}));
if (this.params.overwrite) { if (this.params.overwrite) {
console.log(`Clearing ${this.collDir} before starting`); console.log(`Clearing ${this.collDir} before starting`);
@ -229,8 +247,6 @@ class Crawler {
} }
} }
child_process.spawnSync("wb-manager", ["init", this.params.collection], opts);
opts.env = {...process.env, COLL: this.params.collection, ROLLOVER_SIZE: this.params.rolloverSize}; opts.env = {...process.env, COLL: this.params.collection, ROLLOVER_SIZE: this.params.rolloverSize};
subprocesses.push(child_process.spawn("uwsgi", [path.join(__dirname, "uwsgi.ini")], opts)); subprocesses.push(child_process.spawn("uwsgi", [path.join(__dirname, "uwsgi.ini")], opts));
@ -272,9 +288,8 @@ class Crawler {
} }
async run() { async run() {
await fsp.mkdir(this.params.cwd, {recursive: true}); await this.bootstrap();
this.bootstrap();
let status; let status;
try { try {

View file

@ -17,7 +17,7 @@
"minio": "7.0.26", "minio": "7.0.26",
"node-fetch": "^2.6.1", "node-fetch": "^2.6.1",
"puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue", "puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
"puppeteer-core": "^13.3.2", "puppeteer-core": "16.1.0",
"request": "^2.88.2", "request": "^2.88.2",
"sitemapper": "^3.1.2", "sitemapper": "^3.1.2",
"uuid": "8.3.2", "uuid": "8.3.2",

View file

@ -53,7 +53,7 @@ class ArgParser {
"waitUntil": { "waitUntil": {
describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separate by ','", describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separate by ','",
default: "load,networkidle2", default: "load",
}, },
"depth": { "depth": {

View file

@ -1558,10 +1558,10 @@ debug@4, debug@^4.0.1, debug@^4.1.0, debug@^4.1.1, debug@^4.3.1:
dependencies: dependencies:
ms "2.1.2" ms "2.1.2"
debug@4.3.3: debug@4.3.4:
version "4.3.3" version "4.3.4"
resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.3.tgz#04266e0b70a98d4462e6e288e38259213332b664" resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.4.tgz#1319f6579357f2338d3337d2cdd4914bb5dcc865"
integrity sha512-/zxw5+vh1Tfv+4Qn7a5nsbcJKPaSvCDhojn6FEl9vupwK2VCSDtEiEtqr8DFtzYFOdz63LBkxec7DYuc2jon6Q== integrity sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==
dependencies: dependencies:
ms "2.1.2" ms "2.1.2"
@ -1661,10 +1661,10 @@ detect-newline@^3.0.0:
resolved "https://registry.yarnpkg.com/detect-newline/-/detect-newline-3.1.0.tgz#576f5dfc63ae1a192ff192d8ad3af6308991b651" resolved "https://registry.yarnpkg.com/detect-newline/-/detect-newline-3.1.0.tgz#576f5dfc63ae1a192ff192d8ad3af6308991b651"
integrity sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA== integrity sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA==
devtools-protocol@0.0.960912: devtools-protocol@0.0.1019158:
version "0.0.960912" version "0.0.1019158"
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.960912.tgz#411c1fa355eddb72f06c4a8743f2808766db6245" resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1019158.tgz#4b08d06108a784a2134313149626ba55f030a86f"
integrity sha512-I3hWmV9rWHbdnUdmMKHF2NuYutIM2kXz2mdXW8ha7TbRlGTVs+PF+PsB5QWvpCek4Fy9B+msiispCfwlhG5Sqg== integrity sha512-wvq+KscQ7/6spEV7czhnZc9RM/woz1AY+/Vpd8/h2HFMwJSdTliu7f/yr1A6vDdJfKICZsShqsYpEQbdhg8AFQ==
diff-sequences@^26.6.2: diff-sequences@^26.6.2:
version "26.6.2" version "26.6.2"
@ -2531,10 +2531,10 @@ http2-wrapper@^1.0.0-beta.5.2:
quick-lru "^5.1.1" quick-lru "^5.1.1"
resolve-alpn "^1.0.0" resolve-alpn "^1.0.0"
https-proxy-agent@5.0.0: https-proxy-agent@5.0.1:
version "5.0.0" version "5.0.1"
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-5.0.0.tgz#e2a90542abb68a762e0a0850f6c9edadfd8506b2" resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz#c59ef224a04fe8b754f3db0063a25ea30d0005d6"
integrity sha512-EkYm5BcKUGiduxzSt3Eppko+PiNWNEpa4ySk9vTC6wDsQJW9rHSa+UhGNJoRYp7bz6Ht1eaRIa6QaJqO5rCFbA== integrity sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==
dependencies: dependencies:
agent-base "6" agent-base "6"
debug "4" debug "4"
@ -4246,23 +4246,23 @@ punycode@^2.1.0, punycode@^2.1.1:
dependencies: dependencies:
debug "^4.1.1" debug "^4.1.1"
puppeteer-core@^13.3.2: puppeteer-core@^16.1.0:
version "13.3.2" version "16.1.0"
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-13.3.2.tgz#03b47c776fea881df69e7a55559848434f8110f3" resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-16.1.0.tgz#0485312363e6e1d65889d4b31de677bd36f872e4"
integrity sha512-9T8deXmLWf55/RvDpl32vP68stTufqvtj6fc9hH09ZwCLh5IwnN9Z0MWHfDMTLiW6MUpW2Flx5CQWt1SCUT47g== integrity sha512-Eu9FCqdWU2PU/RY53sa+JTsbFiQg5fJyaHX5DP0WZ4+lVLVdMfR9dwPimRkSl9NEcArm7lZMpiDlVCYelE90ZA==
dependencies: dependencies:
cross-fetch "3.1.5" cross-fetch "3.1.5"
debug "4.3.3" debug "4.3.4"
devtools-protocol "0.0.960912" devtools-protocol "0.0.1019158"
extract-zip "2.0.1" extract-zip "2.0.1"
https-proxy-agent "5.0.0" https-proxy-agent "5.0.1"
pkg-dir "4.2.0" pkg-dir "4.2.0"
progress "2.0.3" progress "2.0.3"
proxy-from-env "1.1.0" proxy-from-env "1.1.0"
rimraf "3.0.2" rimraf "3.0.2"
tar-fs "2.1.1" tar-fs "2.1.1"
unbzip2-stream "1.4.3" unbzip2-stream "1.4.3"
ws "8.5.0" ws "8.8.1"
pvtsutils@^1.1.2, pvtsutils@^1.1.6: pvtsutils@^1.1.2, pvtsutils@^1.1.6:
version "1.1.6" version "1.1.6"
@ -5453,10 +5453,10 @@ write-file-atomic@^3.0.0:
signal-exit "^3.0.2" signal-exit "^3.0.2"
typedarray-to-buffer "^3.1.5" typedarray-to-buffer "^3.1.5"
ws@8.5.0: ws@8.8.1:
version "8.5.0" version "8.8.1"
resolved "https://registry.yarnpkg.com/ws/-/ws-8.5.0.tgz#bfb4be96600757fe5382de12c670dab984a1ed4f" resolved "https://registry.yarnpkg.com/ws/-/ws-8.8.1.tgz#5dbad0feb7ade8ecc99b830c1d77c913d4955ff0"
integrity sha512-BWX0SWVgLPzYwF8lTzEy1egjhS4S4OEAHfsO8o65WOVsrnSRGaSiUaa9e0ggGlkMTtBlmOpEXiie9RUcBO86qg== integrity sha512-bGy2JzvzkPowEJV++hF07hAD6niYSr0JzBNo/J29WsB57A2r7Wlc1UFcTR9IzrPvuNVO4B8LGqF8qcpsVOhJCA==
ws@^7.4.4: ws@^7.4.4:
version "7.4.5" version "7.4.5"