mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Wait Default + Logging Improvements (#153)
Improved logging of pywb + redis:
- if 'logging' includes 'pywb', log pywb and redis output to pywb.log and redis.log
- otherwise, ignore their output (don't print it to stdout, as that's too confusing)
- print a message if wb-manager fails, likely due to an existing collection

waitUntil: default to just 'load' to avoid a potential infinite loop; a separate --netIdle option can configure the idle wait.

dependency: update to the latest puppeteer-core (16.1.0)
This commit is contained in:
parent
a527cc9b36
commit
c5d208024a
4 changed files with 52 additions and 37 deletions
37
crawler.js
37
crawler.js
|
@ -78,7 +78,7 @@ class Crawler {
|
|||
this.debugLog("Seeds", this.params.scopedSeeds);
|
||||
|
||||
this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
|
||||
this.capturePrefix = this.captureBasePrefix + "/id_/";
|
||||
this.capturePrefix = process.env.NO_PROXY ? "" : this.captureBasePrefix + "/id_/";
|
||||
|
||||
this.gotoOpts = {
|
||||
waitUntil: this.params.waitUntil,
|
||||
|
@ -201,13 +201,31 @@ class Crawler {
|
|||
return new ScreenCaster(transport, this.params.workers);
|
||||
}
|
||||
|
||||
bootstrap() {
|
||||
let opts = {};
|
||||
if (this.params.logging.includes("pywb")) {
|
||||
opts = {stdio: "inherit", cwd: this.params.cwd};
|
||||
async bootstrap() {
|
||||
const logs = path.join(this.collDir, "logs");
|
||||
|
||||
const initRes = child_process.spawnSync("wb-manager", ["init", this.params.collection], {cwd: this.params.cwd});
|
||||
|
||||
if (initRes.status) {
|
||||
console.log("wb-manager init failed, collection likely already exists");
|
||||
}
|
||||
else{
|
||||
|
||||
await fsp.mkdir(logs, {recursive: true});
|
||||
|
||||
let opts = {};
|
||||
let redisStdio;
|
||||
|
||||
if (this.params.logging.includes("pywb")) {
|
||||
const pywbStderr = fs.openSync(path.join(logs, "pywb.log"), "a");
|
||||
const stdio = [process.stdin, pywbStderr, pywbStderr];
|
||||
|
||||
const redisStderr = fs.openSync(path.join(logs, "redis.log"), "a");
|
||||
redisStdio = [process.stdin, redisStderr, redisStderr];
|
||||
|
||||
opts = {stdio, cwd: this.params.cwd};
|
||||
} else {
|
||||
opts = {stdio: "ignore", cwd: this.params.cwd};
|
||||
redisStdio = "ignore";
|
||||
}
|
||||
|
||||
this.browserExe = getBrowserExe();
|
||||
|
@ -218,7 +236,7 @@ class Crawler {
|
|||
|
||||
const subprocesses = [];
|
||||
|
||||
subprocesses.push(child_process.spawn("redis-server", {...opts, cwd: "/tmp/"}));
|
||||
subprocesses.push(child_process.spawn("redis-server", {cwd: "/tmp/", stdio: redisStdio}));
|
||||
|
||||
if (this.params.overwrite) {
|
||||
console.log(`Clearing ${this.collDir} before starting`);
|
||||
|
@ -229,8 +247,6 @@ class Crawler {
|
|||
}
|
||||
}
|
||||
|
||||
child_process.spawnSync("wb-manager", ["init", this.params.collection], opts);
|
||||
|
||||
opts.env = {...process.env, COLL: this.params.collection, ROLLOVER_SIZE: this.params.rolloverSize};
|
||||
|
||||
subprocesses.push(child_process.spawn("uwsgi", [path.join(__dirname, "uwsgi.ini")], opts));
|
||||
|
@ -272,9 +288,8 @@ class Crawler {
|
|||
}
|
||||
|
||||
async run() {
|
||||
await fsp.mkdir(this.params.cwd, {recursive: true});
|
||||
await this.bootstrap();
|
||||
|
||||
this.bootstrap();
|
||||
let status;
|
||||
|
||||
try {
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
"minio": "7.0.26",
|
||||
"node-fetch": "^2.6.1",
|
||||
"puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
|
||||
"puppeteer-core": "^13.3.2",
|
||||
"puppeteer-core": "16.1.0",
|
||||
"request": "^2.88.2",
|
||||
"sitemapper": "^3.1.2",
|
||||
"uuid": "8.3.2",
|
||||
|
|
|
@ -53,7 +53,7 @@ class ArgParser {
|
|||
|
||||
"waitUntil": {
|
||||
describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separate by ','",
|
||||
default: "load,networkidle2",
|
||||
default: "load",
|
||||
},
|
||||
|
||||
"depth": {
|
||||
|
|
48
yarn.lock
48
yarn.lock
|
@ -1558,10 +1558,10 @@ debug@4, debug@^4.0.1, debug@^4.1.0, debug@^4.1.1, debug@^4.3.1:
|
|||
dependencies:
|
||||
ms "2.1.2"
|
||||
|
||||
debug@4.3.3:
|
||||
version "4.3.3"
|
||||
resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.3.tgz#04266e0b70a98d4462e6e288e38259213332b664"
|
||||
integrity sha512-/zxw5+vh1Tfv+4Qn7a5nsbcJKPaSvCDhojn6FEl9vupwK2VCSDtEiEtqr8DFtzYFOdz63LBkxec7DYuc2jon6Q==
|
||||
debug@4.3.4:
|
||||
version "4.3.4"
|
||||
resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.4.tgz#1319f6579357f2338d3337d2cdd4914bb5dcc865"
|
||||
integrity sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==
|
||||
dependencies:
|
||||
ms "2.1.2"
|
||||
|
||||
|
@ -1661,10 +1661,10 @@ detect-newline@^3.0.0:
|
|||
resolved "https://registry.yarnpkg.com/detect-newline/-/detect-newline-3.1.0.tgz#576f5dfc63ae1a192ff192d8ad3af6308991b651"
|
||||
integrity sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA==
|
||||
|
||||
devtools-protocol@0.0.960912:
|
||||
version "0.0.960912"
|
||||
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.960912.tgz#411c1fa355eddb72f06c4a8743f2808766db6245"
|
||||
integrity sha512-I3hWmV9rWHbdnUdmMKHF2NuYutIM2kXz2mdXW8ha7TbRlGTVs+PF+PsB5QWvpCek4Fy9B+msiispCfwlhG5Sqg==
|
||||
devtools-protocol@0.0.1019158:
|
||||
version "0.0.1019158"
|
||||
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1019158.tgz#4b08d06108a784a2134313149626ba55f030a86f"
|
||||
integrity sha512-wvq+KscQ7/6spEV7czhnZc9RM/woz1AY+/Vpd8/h2HFMwJSdTliu7f/yr1A6vDdJfKICZsShqsYpEQbdhg8AFQ==
|
||||
|
||||
diff-sequences@^26.6.2:
|
||||
version "26.6.2"
|
||||
|
@ -2531,10 +2531,10 @@ http2-wrapper@^1.0.0-beta.5.2:
|
|||
quick-lru "^5.1.1"
|
||||
resolve-alpn "^1.0.0"
|
||||
|
||||
https-proxy-agent@5.0.0:
|
||||
version "5.0.0"
|
||||
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-5.0.0.tgz#e2a90542abb68a762e0a0850f6c9edadfd8506b2"
|
||||
integrity sha512-EkYm5BcKUGiduxzSt3Eppko+PiNWNEpa4ySk9vTC6wDsQJW9rHSa+UhGNJoRYp7bz6Ht1eaRIa6QaJqO5rCFbA==
|
||||
https-proxy-agent@5.0.1:
|
||||
version "5.0.1"
|
||||
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz#c59ef224a04fe8b754f3db0063a25ea30d0005d6"
|
||||
integrity sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==
|
||||
dependencies:
|
||||
agent-base "6"
|
||||
debug "4"
|
||||
|
@ -4246,23 +4246,23 @@ punycode@^2.1.0, punycode@^2.1.1:
|
|||
dependencies:
|
||||
debug "^4.1.1"
|
||||
|
||||
puppeteer-core@^13.3.2:
|
||||
version "13.3.2"
|
||||
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-13.3.2.tgz#03b47c776fea881df69e7a55559848434f8110f3"
|
||||
integrity sha512-9T8deXmLWf55/RvDpl32vP68stTufqvtj6fc9hH09ZwCLh5IwnN9Z0MWHfDMTLiW6MUpW2Flx5CQWt1SCUT47g==
|
||||
puppeteer-core@^16.1.0:
|
||||
version "16.1.0"
|
||||
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-16.1.0.tgz#0485312363e6e1d65889d4b31de677bd36f872e4"
|
||||
integrity sha512-Eu9FCqdWU2PU/RY53sa+JTsbFiQg5fJyaHX5DP0WZ4+lVLVdMfR9dwPimRkSl9NEcArm7lZMpiDlVCYelE90ZA==
|
||||
dependencies:
|
||||
cross-fetch "3.1.5"
|
||||
debug "4.3.3"
|
||||
devtools-protocol "0.0.960912"
|
||||
debug "4.3.4"
|
||||
devtools-protocol "0.0.1019158"
|
||||
extract-zip "2.0.1"
|
||||
https-proxy-agent "5.0.0"
|
||||
https-proxy-agent "5.0.1"
|
||||
pkg-dir "4.2.0"
|
||||
progress "2.0.3"
|
||||
proxy-from-env "1.1.0"
|
||||
rimraf "3.0.2"
|
||||
tar-fs "2.1.1"
|
||||
unbzip2-stream "1.4.3"
|
||||
ws "8.5.0"
|
||||
ws "8.8.1"
|
||||
|
||||
pvtsutils@^1.1.2, pvtsutils@^1.1.6:
|
||||
version "1.1.6"
|
||||
|
@ -5453,10 +5453,10 @@ write-file-atomic@^3.0.0:
|
|||
signal-exit "^3.0.2"
|
||||
typedarray-to-buffer "^3.1.5"
|
||||
|
||||
ws@8.5.0:
|
||||
version "8.5.0"
|
||||
resolved "https://registry.yarnpkg.com/ws/-/ws-8.5.0.tgz#bfb4be96600757fe5382de12c670dab984a1ed4f"
|
||||
integrity sha512-BWX0SWVgLPzYwF8lTzEy1egjhS4S4OEAHfsO8o65WOVsrnSRGaSiUaa9e0ggGlkMTtBlmOpEXiie9RUcBO86qg==
|
||||
ws@8.8.1:
|
||||
version "8.8.1"
|
||||
resolved "https://registry.yarnpkg.com/ws/-/ws-8.8.1.tgz#5dbad0feb7ade8ecc99b830c1d77c913d4955ff0"
|
||||
integrity sha512-bGy2JzvzkPowEJV++hF07hAD6niYSr0JzBNo/J29WsB57A2r7Wlc1UFcTR9IzrPvuNVO4B8LGqF8qcpsVOhJCA==
|
||||
|
||||
ws@^7.4.4:
|
||||
version "7.4.5"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue