Logger cleanup (#254)

* logging: convert logger to a singleton to simplify use

* add logger to create-login-profile.js
This commit is contained in:
Ilya Kreymer 2023-03-17 14:24:44 -07:00 committed by GitHub
parent 82808d8133
commit 07e503a8e6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 149 additions and 171 deletions

View file

@ -19,7 +19,7 @@ import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screenca
import { Screenshots } from "./util/screenshots.js"; import { Screenshots } from "./util/screenshots.js";
import { parseArgs } from "./util/argParser.js"; import { parseArgs } from "./util/argParser.js";
import { initRedis } from "./util/redis.js"; import { initRedis } from "./util/redis.js";
import { Logger, errJSON, setExternalLogStream, setDebugLogging } from "./util/logger.js"; import { logger, errJSON } from "./util/logger.js";
import { runWorkers } from "./util/worker.js"; import { runWorkers } from "./util/worker.js";
import { sleep, timedRun } from "./util/timing.js"; import { sleep, timedRun } from "./util/timing.js";
@ -58,10 +58,9 @@ export class Crawler {
this.logFilename = path.join(this.logDir, `crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`); this.logFilename = path.join(this.logDir, `crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`);
const debugLogging = this.params.logging.includes("debug"); const debugLogging = this.params.logging.includes("debug");
setDebugLogging(debugLogging); logger.setDebugLogging(debugLogging);
this.logger = new Logger(); logger.debug("Writing log to: " + this.logFilename, {}, "init");
this.logger.debug("Writing log to: " + this.logFilename, {}, "init");
this.headers = {}; this.headers = {};
this.crawlState = null; this.crawlState = null;
@ -136,7 +135,7 @@ export class Crawler {
const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0"; const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0";
if (!redisUrl.startsWith("redis://")) { if (!redisUrl.startsWith("redis://")) {
this.logger.fatal("stateStoreUrl must start with redis:// -- Only redis-based store currently supported"); logger.fatal("stateStoreUrl must start with redis:// -- Only redis-based store currently supported");
} }
let redis; let redis;
@ -146,18 +145,18 @@ export class Crawler {
redis = await initRedis(redisUrl); redis = await initRedis(redisUrl);
break; break;
} catch (e) { } catch (e) {
//this.logger.fatal("Unable to connect to state store Redis: " + redisUrl); //logger.fatal("Unable to connect to state store Redis: " + redisUrl);
this.logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state"); logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state");
await sleep(3); await sleep(3);
} }
} }
this.logger.debug(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state"); logger.debug(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state");
this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.behaviorTimeout + this.params.timeout, os.hostname()); this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.behaviorTimeout + this.params.timeout, os.hostname());
if (this.params.saveState === "always" && this.params.saveStateInterval) { if (this.params.saveState === "always" && this.params.saveStateInterval) {
this.logger.debug(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`, {}, "state"); logger.debug(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`, {}, "state");
} }
return this.crawlState; return this.crawlState;
@ -168,10 +167,10 @@ export class Crawler {
if (this.params.screencastPort) { if (this.params.screencastPort) {
transport = new WSTransport(this.params.screencastPort); transport = new WSTransport(this.params.screencastPort);
this.logger.debug(`Screencast server started on: ${this.params.screencastPort}`, {}, "screencast"); logger.debug(`Screencast server started on: ${this.params.screencastPort}`, {}, "screencast");
} else if (this.params.redisStoreUrl && this.params.screencastRedis) { } else if (this.params.redisStoreUrl && this.params.screencastRedis) {
transport = new RedisPubSubTransport(this.params.redisStoreUrl, this.crawlId); transport = new RedisPubSubTransport(this.params.redisStoreUrl, this.crawlId);
this.logger.debug("Screencast enabled via redis pubsub", {}, "screencast"); logger.debug("Screencast enabled via redis pubsub", {}, "screencast");
} }
if (!transport) { if (!transport) {
@ -185,28 +184,28 @@ export class Crawler {
const initRes = child_process.spawnSync("wb-manager", ["init", this.params.collection], {cwd: this.params.cwd}); const initRes = child_process.spawnSync("wb-manager", ["init", this.params.collection], {cwd: this.params.cwd});
if (initRes.status) { if (initRes.status) {
this.logger.info("wb-manager init failed, collection likely already exists"); logger.info("wb-manager init failed, collection likely already exists");
} }
fs.mkdirSync(this.logDir, {recursive: true}); fs.mkdirSync(this.logDir, {recursive: true});
this.logFH = fs.createWriteStream(this.logFilename); this.logFH = fs.createWriteStream(this.logFilename);
setExternalLogStream(this.logFH); logger.setExternalLogStream(this.logFH);
this.infoString = await this.getInfoString(); this.infoString = await this.getInfoString();
this.logger.info(this.infoString); logger.info(this.infoString);
this.logger.info("Seeds", this.params.scopedSeeds); logger.info("Seeds", this.params.scopedSeeds);
if (this.params.profile) { if (this.params.profile) {
this.logger.info("With Browser Profile", {url: this.params.profile}); logger.info("With Browser Profile", {url: this.params.profile});
} }
if (this.params.overwrite) { if (this.params.overwrite) {
this.logger.debug(`Clearing ${this.collDir} before starting`); logger.debug(`Clearing ${this.collDir} before starting`);
try { try {
fs.rmSync(this.collDir, { recursive: true, force: true }); fs.rmSync(this.collDir, { recursive: true, force: true });
} catch(e) { } catch(e) {
this.logger.error(`Unable to clear ${this.collDir}`, e); logger.error(`Unable to clear ${this.collDir}`, e);
} }
} }
@ -277,7 +276,7 @@ export class Crawler {
await this.crawl(); await this.crawl();
status = (!this.interrupted ? "done" : "interrupted"); status = (!this.interrupted ? "done" : "interrupted");
} catch(e) { } catch(e) {
this.logger.error("Crawl failed", e); logger.error("Crawl failed", e);
exitCode = 9; exitCode = 9;
status = "failing"; status = "failing";
if (await this.crawlState.incFailCount()) { if (await this.crawlState.incFailCount()) {
@ -285,7 +284,7 @@ export class Crawler {
} }
} finally { } finally {
this.logger.info(`Crawl status: ${status}`); logger.info(`Crawl status: ${status}`);
if (this.crawlState) { if (this.crawlState) {
await this.crawlState.setStatus(status); await this.crawlState.setStatus(status);
@ -314,14 +313,14 @@ export class Crawler {
case "info": case "info":
behaviorLine = JSON.stringify(data); behaviorLine = JSON.stringify(data);
if (behaviorLine != this._behaviorLastLine) { if (behaviorLine != this._behaviorLastLine) {
this.logger.info(message, details, "behaviorScript"); logger.info(message, details, "behaviorScript");
this._behaviorLastLine = behaviorLine; this._behaviorLastLine = behaviorLine;
} }
break; break;
case "debug": case "debug":
default: default:
this.logger.debug(message, details, "behaviorScript"); logger.debug(message, details, "behaviorScript");
} }
} }
@ -337,17 +336,17 @@ export class Crawler {
if (this.params.logging.includes("jserrors")) { if (this.params.logging.includes("jserrors")) {
page.on("console", (msg) => { page.on("console", (msg) => {
if (msg.type() === "error") { if (msg.type() === "error") {
this.logger.warn(msg.text(), {"location": msg.location(), "page": page.url(), workerid}, "jsError"); logger.warn(msg.text(), {"location": msg.location(), "page": page.url(), workerid}, "jsError");
} }
}); });
page.on("pageerror", (e) => { page.on("pageerror", (e) => {
this.logger.warn("Page Error", {...errJSON(e), "page": page.url(), workerid}, "jsError"); logger.warn("Page Error", {...errJSON(e), "page": page.url(), workerid}, "jsError");
}); });
} }
if (this.screencaster) { if (this.screencaster) {
this.logger.debug("Start Screencast", {workerid}, "screencast"); logger.debug("Start Screencast", {workerid}, "screencast");
await this.screencaster.screencastPage(page, cdp, workerid); await this.screencaster.screencastPage(page, cdp, workerid);
} }
@ -359,7 +358,7 @@ export class Crawler {
async teardownPage({workerid}) { async teardownPage({workerid}) {
if (this.screencaster) { if (this.screencaster) {
this.logger.debug("End Screencast", {workerid}, "screencast"); logger.debug("End Screencast", {workerid}, "screencast");
await this.screencaster.stopById(workerid); await this.screencaster.stopById(workerid);
} }
} }
@ -375,7 +374,7 @@ export class Crawler {
data.workerid = workerid; data.workerid = workerid;
if (!this.isInScope(data, logDetails)) { if (!this.isInScope(data, logDetails)) {
this.logger.info("Page no longer in scope", data); logger.info("Page no longer in scope", data);
return true; return true;
} }
@ -386,7 +385,7 @@ export class Crawler {
if (this.params.screenshot) { if (this.params.screenshot) {
if (!data.isHTMLPage) { if (!data.isHTMLPage) {
this.logger.debug("Skipping screenshots for non-HTML page", logDetails); logger.debug("Skipping screenshots for non-HTML page", logDetails);
} }
const archiveDir = path.join(this.collDir, "archive"); const archiveDir = path.join(this.collDir, "archive");
const screenshots = new Screenshots({page, url, directory: archiveDir}); const screenshots = new Screenshots({page, url, directory: archiveDir});
@ -411,7 +410,7 @@ export class Crawler {
if (this.params.behaviorOpts) { if (this.params.behaviorOpts) {
if (!data.isHTMLPage) { if (!data.isHTMLPage) {
this.logger.debug("Skipping behaviors for non-HTML page", logDetails, "behavior"); logger.debug("Skipping behaviors for non-HTML page", logDetails, "behavior");
} else { } else {
const behaviorTimeout = this.params.behaviorTimeout / 1000; const behaviorTimeout = this.params.behaviorTimeout / 1000;
@ -424,12 +423,12 @@ export class Crawler {
); );
if (res && res.length) { if (res && res.length) {
this.logger.info("Behaviors finished", {finished: res.length, ...logDetails}, "behavior"); logger.info("Behaviors finished", {finished: res.length, ...logDetails}, "behavior");
} }
} }
} }
this.logger.info("Page finished", logDetails, "pageStatus"); logger.info("Page finished", logDetails, "pageStatus");
await this.crawlState.markFinished(url); await this.crawlState.markFinished(url);
@ -450,14 +449,14 @@ export class Crawler {
const context = page.context(); const context = page.context();
this.logger.info("Running behaviors", {frames: frames.length, frameUrls: frames.map(frame => frame.url()), ...logDetails}, "behavior"); logger.info("Running behaviors", {frames: frames.length, frameUrls: frames.map(frame => frame.url()), ...logDetails}, "behavior");
return await Promise.allSettled( return await Promise.allSettled(
frames.map(frame => this.browser.evaluateWithCLI(context, frame, "self.__bx_behaviors.run();", logDetails, "behavior")) frames.map(frame => this.browser.evaluateWithCLI(context, frame, "self.__bx_behaviors.run();", logDetails, "behavior"))
); );
} catch (e) { } catch (e) {
this.logger.warn("Behavior run failed", {...errJSON(e), ...logDetails}, "behavior"); logger.warn("Behavior run failed", {...errJSON(e), ...logDetails}, "behavior");
return null; return null;
} }
} }
@ -478,7 +477,7 @@ export class Crawler {
} }
if (!res) { if (!res) {
this.logger.debug("Skipping processing frame", {frameUrl, ...logDetails}, "behavior"); logger.debug("Skipping processing frame", {frameUrl, ...logDetails}, "behavior");
} }
return res; return res;
@ -516,7 +515,7 @@ export class Crawler {
const size = await getDirSize(dir); const size = await getDirSize(dir);
if (size >= this.params.sizeLimit) { if (size >= this.params.sizeLimit) {
this.logger.info(`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`); logger.info(`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`);
interrupt = true; interrupt = true;
this.clearOnExit = true; this.clearOnExit = true;
} }
@ -525,7 +524,7 @@ export class Crawler {
if (this.params.timeLimit) { if (this.params.timeLimit) {
const elapsed = (Date.now() - this.startTime) / 1000; const elapsed = (Date.now() - this.startTime) / 1000;
if (elapsed > this.params.timeLimit) { if (elapsed > this.params.timeLimit) {
this.logger.info(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`); logger.info(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
interrupt = true; interrupt = true;
} }
} }
@ -537,7 +536,7 @@ export class Crawler {
gracefulFinish() { gracefulFinish() {
this.interrupted = true; this.interrupted = true;
this.logger.info("Crawler interrupted, gracefully finishing current pages"); logger.info("Crawler interrupted, gracefully finishing current pages");
if (!this.params.waitOnDone) { if (!this.params.waitOnDone) {
this.finalExit = true; this.finalExit = true;
} }
@ -547,9 +546,9 @@ export class Crawler {
if (!markDone) { if (!markDone) {
this.params.waitOnDone = false; this.params.waitOnDone = false;
this.clearOnExit = true; this.clearOnExit = true;
this.logger.info("SIGNAL: Preparing for exit of this crawler instance only"); logger.info("SIGNAL: Preparing for exit of this crawler instance only");
} else { } else {
this.logger.info("SIGNAL: Preparing for final exit of all crawlers"); logger.info("SIGNAL: Preparing for final exit of all crawlers");
this.finalExit = true; this.finalExit = true;
} }
} }
@ -570,7 +569,7 @@ export class Crawler {
const driverUrl = new URL(this.params.driver, import.meta.url); const driverUrl = new URL(this.params.driver, import.meta.url);
this.driver = (await import(driverUrl)).default; this.driver = (await import(driverUrl)).default;
} catch(e) { } catch(e) {
this.logger.warn(`Error importing driver ${this.params.driver}`, e); logger.warn(`Error importing driver ${this.params.driver}`, e);
return; return;
} }
@ -579,7 +578,7 @@ export class Crawler {
let initState = await this.crawlState.getStatus(); let initState = await this.crawlState.getStatus();
while (initState === "debug") { while (initState === "debug") {
this.logger.info("Paused for debugging, will continue after manual resume"); logger.info("Paused for debugging, will continue after manual resume");
await sleep(60); await sleep(60);
@ -591,7 +590,7 @@ export class Crawler {
this.done = true; this.done = true;
if (this.params.waitOnDone) { if (this.params.waitOnDone) {
this.logger.info("Already done, waiting for signal to exit..."); logger.info("Already done, waiting for signal to exit...");
// wait forever until signal // wait forever until signal
await new Promise(() => {}); await new Promise(() => {});
@ -617,10 +616,10 @@ export class Crawler {
await this.initPages(); await this.initPages();
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, this.logger); this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, logger);
if (this.params.blockRules && this.params.blockRules.length) { if (this.params.blockRules && this.params.blockRules.length) {
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, this.logger); this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, logger);
} }
this.screencaster = this.initScreenCaster(); this.screencaster = this.initScreenCaster();
@ -676,7 +675,7 @@ export class Crawler {
} }
if (this.params.generateCDX) { if (this.params.generateCDX) {
this.logger.info("Generating CDX"); logger.info("Generating CDX");
await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd})); await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
} }
@ -686,18 +685,18 @@ export class Crawler {
await this.generateWACZ(); await this.generateWACZ();
if (this.clearOnExit) { if (this.clearOnExit) {
this.logger.info(`Clearing ${this.collDir} before exit`); logger.info(`Clearing ${this.collDir} before exit`);
try { try {
fs.rmSync(this.collDir, { recursive: true, force: true }); fs.rmSync(this.collDir, { recursive: true, force: true });
} catch(e) { } catch(e) {
this.logger.warn(`Unable to clear ${this.collDir} before exit`, e); logger.warn(`Unable to clear ${this.collDir} before exit`, e);
} }
} }
} }
if (this.params.waitOnDone && (!this.interrupted || this.finalExit)) { if (this.params.waitOnDone && (!this.interrupted || this.finalExit)) {
this.done = true; this.done = true;
this.logger.info("All done, waiting for signal..."); logger.info("All done, waiting for signal...");
await this.crawlState.setStatus("done"); await this.crawlState.setStatus("done");
// wait forever until signal // wait forever until signal
@ -707,7 +706,7 @@ export class Crawler {
async closeLog() { async closeLog() {
// close file-based log // close file-based log
setExternalLogStream(null); logger.setExternalLogStream(null);
try { try {
await new Promise(resolve => this.logFH.close(() => resolve())); await new Promise(resolve => this.logFH.close(() => resolve()));
} catch (e) { } catch (e) {
@ -716,7 +715,7 @@ export class Crawler {
} }
async generateWACZ() { async generateWACZ() {
this.logger.info("Generating WACZ"); logger.info("Generating WACZ");
const archiveDir = path.join(this.collDir, "archive"); const archiveDir = path.join(this.collDir, "archive");
@ -726,13 +725,13 @@ export class Crawler {
// is finished (>0 pages and all pages written) // is finished (>0 pages and all pages written)
const isFinished = await this.crawlState.isFinished(); const isFinished = await this.crawlState.isFinished();
this.logger.info(`Num WARC Files: ${warcFileList.length}`); logger.info(`Num WARC Files: ${warcFileList.length}`);
if (!warcFileList.length) { if (!warcFileList.length) {
// if finished, just return // if finished, just return
if (isFinished) { if (isFinished) {
return; return;
} }
this.logger.fatal("No WARC Files, assuming crawl failed"); logger.fatal("No WARC Files, assuming crawl failed");
} }
// Build the argument list to pass to the wacz create command // Build the argument list to pass to the wacz create command
@ -764,11 +763,11 @@ export class Crawler {
const waczResult = await this.awaitProcess(child_process.spawn("wacz" , createArgs)); const waczResult = await this.awaitProcess(child_process.spawn("wacz" , createArgs));
if (waczResult !== 0) { if (waczResult !== 0) {
this.logger.error("Error creating WACZ", {"status code": waczResult}); logger.error("Error creating WACZ", {"status code": waczResult});
this.logger.fatal("Unable to write WACZ successfully"); logger.fatal("Unable to write WACZ successfully");
} }
this.logger.debug(`WACZ successfully generated and saved to: ${waczPath}`); logger.debug(`WACZ successfully generated and saved to: ${waczPath}`);
// Verify WACZ // Verify WACZ
/* /*
@ -780,7 +779,7 @@ export class Crawler {
if (waczVerifyResult !== 0) { if (waczVerifyResult !== 0) {
console.log("validate", waczVerifyResult); console.log("validate", waczVerifyResult);
this.logger.fatal("Unable to verify WACZ created successfully"); logger.fatal("Unable to verify WACZ created successfully");
} }
*/ */
if (this.storage) { if (this.storage) {
@ -793,11 +792,11 @@ export class Crawler {
awaitProcess(proc) { awaitProcess(proc) {
proc.stdout.on("data", (data) => { proc.stdout.on("data", (data) => {
this.logger.debug(data.toString()); logger.debug(data.toString());
}); });
proc.stderr.on("data", (data) => { proc.stderr.on("data", (data) => {
this.logger.error(data.toString()); logger.error(data.toString());
}); });
return new Promise((resolve) => { return new Promise((resolve) => {
proc.on("close", (code) => resolve(code)); proc.on("close", (code) => resolve(code));
@ -822,13 +821,13 @@ export class Crawler {
"pendingPages": pendingList.map(x => JSON.stringify(x)) "pendingPages": pendingList.map(x => JSON.stringify(x))
}; };
this.logger.info("Crawl statistics", stats, "crawlStatus"); logger.info("Crawl statistics", stats, "crawlStatus");
if (toFile && this.params.statsFilename) { if (toFile && this.params.statsFilename) {
try { try {
await fsp.writeFile(this.params.statsFilename, JSON.stringify(stats, null, 2)); await fsp.writeFile(this.params.statsFilename, JSON.stringify(stats, null, 2));
} catch (err) { } catch (err) {
this.logger.warn("Stats output failed", err); logger.warn("Stats output failed", err);
} }
} }
} }
@ -854,7 +853,7 @@ export class Crawler {
logDetails logDetails
); );
if (captureResult) { if (captureResult) {
this.logger.info("Direct fetch successful", {url, ...logDetails}, "fetch"); logger.info("Direct fetch successful", {url, ...logDetails}, "fetch");
return; return;
} }
} catch (e) { } catch (e) {
@ -880,7 +879,7 @@ export class Crawler {
const gotoOpts = isHTMLPage ? this.gotoOpts : {waitUntil: "domcontentloaded"}; const gotoOpts = isHTMLPage ? this.gotoOpts : {waitUntil: "domcontentloaded"};
this.logger.info("Awaiting page load", logDetails); logger.info("Awaiting page load", logDetails);
try { try {
const resp = await page.goto(url, gotoOpts); const resp = await page.goto(url, gotoOpts);
@ -896,7 +895,7 @@ export class Crawler {
let msg = e.message || ""; let msg = e.message || "";
if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) { if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
const mainMessage = e.name === "TimeoutError" ? "Page Load Timeout" : "Page Load Error"; const mainMessage = e.name === "TimeoutError" ? "Page Load Timeout" : "Page Load Error";
this.logger.error(mainMessage, {msg, ...logDetails}); logger.error(mainMessage, {msg, ...logDetails});
//if (this.healthChecker) { //if (this.healthChecker) {
// this.healthChecker.incError(); // this.healthChecker.incError();
//} //}
@ -912,7 +911,7 @@ export class Crawler {
} }
if (!isHTMLPage) { if (!isHTMLPage) {
this.logger.debug("Skipping link extraction for non-HTML page", logDetails); logger.debug("Skipping link extraction for non-HTML page", logDetails);
return; return;
} }
@ -927,7 +926,7 @@ export class Crawler {
return; return;
} }
this.logger.debug("Extracting links"); logger.debug("Extracting links");
for (const opts of selectorOptsList) { for (const opts of selectorOptsList) {
const links = await this.extractLinks(page, data.filteredFrames, opts, logDetails); const links = await this.extractLinks(page, data.filteredFrames, opts, logDetails);
@ -953,7 +952,7 @@ export class Crawler {
try { try {
await page.waitForLoadState("networkidle", {timeout: this.params.netIdleWait * 1000}); await page.waitForLoadState("networkidle", {timeout: this.params.netIdleWait * 1000});
} catch (e) { } catch (e) {
this.logger.debug("waitForNetworkIdle timed out, ignoring", details); logger.debug("waitForNetworkIdle timed out, ignoring", details);
// ignore, continue // ignore, continue
} }
} }
@ -989,7 +988,7 @@ export class Crawler {
let i = 0; let i = 0;
for (const linkResult of linkResults) { for (const linkResult of linkResults) {
if (!linkResult) { if (!linkResult) {
this.logger.warn("Link Extraction timed out in frame", {frameUrl: frames[i].url, ...logDetails}); logger.warn("Link Extraction timed out in frame", {frameUrl: frames[i].url, ...logDetails});
continue; continue;
} }
if (!linkResult.value) continue; if (!linkResult.value) continue;
@ -1001,7 +1000,7 @@ export class Crawler {
} }
} catch (e) { } catch (e) {
this.logger.warn("Link Extraction failed", e); logger.warn("Link Extraction failed", e);
} }
return results; return results;
} }
@ -1027,27 +1026,27 @@ export class Crawler {
} }
} }
} catch (e) { } catch (e) {
this.logger.error("Queuing Error", e); logger.error("Queuing Error", e);
} }
} }
async checkCF(page, logDetails) { async checkCF(page, logDetails) {
try { try {
this.logger.debug("Check CF Blocking", logDetails); logger.debug("Check CF Blocking", logDetails);
const cloudflare = page.locator("div.cf-browser-verification.cf-im-under-attack"); const cloudflare = page.locator("div.cf-browser-verification.cf-im-under-attack");
while (await cloudflare.waitFor({timeout: PAGE_OP_TIMEOUT_SECS})) { while (await cloudflare.waitFor({timeout: PAGE_OP_TIMEOUT_SECS})) {
this.logger.debug("Cloudflare Check Detected, waiting for reload...", logDetails); logger.debug("Cloudflare Check Detected, waiting for reload...", logDetails);
await sleep(5.5); await sleep(5.5);
} }
} catch (e) { } catch (e) {
//this.logger.warn("Check CF failed, ignoring"); //logger.warn("Check CF failed, ignoring");
} }
} }
async queueUrl(seedId, url, depth, extraHops = 0) { async queueUrl(seedId, url, depth, extraHops = 0) {
this.logger.debug(`Queuing url ${url}`); logger.debug(`Queuing url ${url}`);
if (this.limitHit) { if (this.limitHit) {
return false; return false;
} }
@ -1082,17 +1081,17 @@ export class Crawler {
const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}; const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
if (this.params.text) { if (this.params.text) {
header["hasText"] = true; header["hasText"] = true;
this.logger.debug("Text Extraction: Enabled"); logger.debug("Text Extraction: Enabled");
} else { } else {
header["hasText"] = false; header["hasText"] = false;
this.logger.debug("Text Extraction: Disabled"); logger.debug("Text Extraction: Disabled");
} }
const header_formatted = JSON.stringify(header).concat("\n"); const header_formatted = JSON.stringify(header).concat("\n");
await this.pagesFH.writeFile(header_formatted); await this.pagesFH.writeFile(header_formatted);
} }
} catch(err) { } catch(err) {
this.logger.error("pages/pages.jsonl creation failed", err); logger.error("pages/pages.jsonl creation failed", err);
} }
} }
@ -1112,7 +1111,7 @@ export class Crawler {
try { try {
await this.pagesFH.writeFile(processedRow); await this.pagesFH.writeFile(processedRow);
} catch (err) { } catch (err) {
this.logger.warn("pages/pages.jsonl append failed", err); logger.warn("pages/pages.jsonl append failed", err);
} }
} }
@ -1128,7 +1127,7 @@ export class Crawler {
agent: this.resolveAgent agent: this.resolveAgent
}); });
if (resp.status !== 200) { if (resp.status !== 200) {
this.logger.debug(`Skipping HEAD check ${url}, invalid status ${resp.status}`); logger.debug(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
return true; return true;
} }
@ -1164,7 +1163,7 @@ export class Crawler {
} }
async awaitPendingClear() { async awaitPendingClear() {
this.logger.info("Waiting to ensure pending data is written to WARCs..."); logger.info("Waiting to ensure pending data is written to WARCs...");
const redis = await initRedis("redis://localhost/0"); const redis = await initRedis("redis://localhost/0");
@ -1174,7 +1173,7 @@ export class Crawler {
if (count <= 0) { if (count <= 0) {
break; break;
} }
this.logger.debug("Waiting for pending requests to finish", {numRequests: count}); logger.debug("Waiting for pending requests to finish", {numRequests: count});
} catch (e) { } catch (e) {
break; break;
} }
@ -1194,17 +1193,17 @@ export class Crawler {
const { sites } = await sitemapper.fetch(); const { sites } = await sitemapper.fetch();
await this.queueInScopeUrls(seedId, sites, 0); await this.queueInScopeUrls(seedId, sites, 0);
} catch(e) { } catch(e) {
this.logger.warn("Error fetching sites from sitemap", e); logger.warn("Error fetching sites from sitemap", e);
} }
} }
async combineWARC() { async combineWARC() {
this.logger.info("Generating Combined WARCs"); logger.info("Generating Combined WARCs");
// Get the list of created Warcs // Get the list of created Warcs
const warcLists = await fsp.readdir(path.join(this.collDir, "archive")); const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));
this.logger.debug(`Combining ${warcLists.length} WARCs...`); logger.debug(`Combining ${warcLists.length} WARCs...`);
const fileSizeObjects = []; // Used to sort the created warc by fileSize const fileSizeObjects = []; // Used to sort the created warc by fileSize
@ -1271,7 +1270,7 @@ export class Crawler {
fh.write(warcBuffer); fh.write(warcBuffer);
} }
this.logger.debug(`Appending WARC ${fileSizeObjects[j].fileName}`); logger.debug(`Appending WARC ${fileSizeObjects[j].fileName}`);
const reader = fs.createReadStream(fileSizeObjects[j].fileName); const reader = fs.createReadStream(fileSizeObjects[j].fileName);
@ -1288,7 +1287,7 @@ export class Crawler {
await fh.end(); await fh.end();
} }
this.logger.debug(`Combined WARCs saved as: ${generatedCombinedWarcs}`); logger.debug(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
} }
async serializeConfig(done = false) { async serializeConfig(done = false) {
@ -1338,10 +1337,10 @@ export class Crawler {
} }
const res = yaml.dump(this.origConfig, {lineWidth: -1}); const res = yaml.dump(this.origConfig, {lineWidth: -1});
try { try {
this.logger.info(`Saving crawl state to: ${filename}`); logger.info(`Saving crawl state to: ${filename}`);
await fsp.writeFile(filename, res); await fsp.writeFile(filename, res);
} catch (e) { } catch (e) {
this.logger.error(`Failed to write save state file: ${filename}`, e); logger.error(`Failed to write save state file: ${filename}`, e);
return; return;
} }
@ -1349,11 +1348,11 @@ export class Crawler {
if (this.saveStateFiles.length > this.params.saveStateHistory) { if (this.saveStateFiles.length > this.params.saveStateHistory) {
const oldFilename = this.saveStateFiles.shift(); const oldFilename = this.saveStateFiles.shift();
this.logger.info(`Removing old save-state: ${oldFilename}`); logger.info(`Removing old save-state: ${oldFilename}`);
try { try {
await fsp.unlink(oldFilename); await fsp.unlink(oldFilename);
} catch (e) { } catch (e) {
this.logger.error(`Failed to delete old save state file: ${oldFilename}`); logger.error(`Failed to delete old save state file: ${oldFilename}`);
} }
} }

View file

@ -9,6 +9,8 @@ import child_process from "child_process";
import yargs from "yargs"; import yargs from "yargs";
import { logger } from "./util/logger.js";
import { sleep } from "./util/timing.js"; import { sleep } from "./util/timing.js";
import { Browser } from "./util/browser.js"; import { Browser } from "./util/browser.js";
import { initStorage } from "./util/storage.js"; import { initStorage } from "./util/storage.js";
@ -106,8 +108,11 @@ async function main() {
.option(cliOpts()) .option(cliOpts())
.argv; .argv;
logger.setDebugLogging(true);
if (!params.headless) { if (!params.headless) {
console.log("Launching XVFB"); logger.debug("Launching XVFB");
child_process.spawn("Xvfb", [ child_process.spawn("Xvfb", [
process.env.DISPLAY, process.env.DISPLAY,
"-listen", "-listen",
@ -144,7 +149,7 @@ async function main() {
if (params.proxy) { if (params.proxy) {
child_process.spawn("wayback", ["--live", "--proxy", "live"], {stdio: "inherit", cwd: "/tmp"}); child_process.spawn("wayback", ["--live", "--proxy", "live"], {stdio: "inherit", cwd: "/tmp"});
console.log("Running with pywb proxy"); logger.debug("Running with pywb proxy");
await sleep(3000); await sleep(3000);
@ -171,7 +176,7 @@ async function main() {
}); });
if (params.interactive) { if (params.interactive) {
console.log("Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode"); logger.warn("Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode");
} }
if (params.user || params.password) { if (params.user || params.password) {
@ -199,7 +204,7 @@ async function main() {
await page.addInitScript(behaviors + ";\nself.__bx_behaviors.init();"); await page.addInitScript(behaviors + ";\nself.__bx_behaviors.init();");
} }
console.log(`Loading page: ${params.url}`); logger.info(`Loading page: ${params.url}`);
await page.goto(params.url, {waitUntil}); await page.goto(params.url, {waitUntil});
@ -216,7 +221,7 @@ async function main() {
async function automatedProfile(params, browser, page, cdp, waitUntil) { async function automatedProfile(params, browser, page, cdp, waitUntil) {
let u, p; let u, p;
console.log("Looking for username and password entry fields on page..."); logger.debug("Looking for username and password entry fields on page...");
try { try {
u = await page.waitForSelector("//input[contains(@name, 'user') or contains(@name, 'email')]"); u = await page.waitForSelector("//input[contains(@name, 'user') or contains(@name, 'email')]");
@ -226,7 +231,7 @@ async function automatedProfile(params, browser, page, cdp, waitUntil) {
if (params.debugScreenshot) { if (params.debugScreenshot) {
await page.screenshot({path: params.debugScreenshot}); await page.screenshot({path: params.debugScreenshot});
} }
console.log("Login form could not be found"); logger.debug("Login form could not be found");
await page.close(); await page.close();
process.exit(1); process.exit(1);
return; return;
@ -255,7 +260,7 @@ async function createProfile(params, browser, page, cdp, targetFilename = "") {
await browser.close(); await browser.close();
console.log("creating profile"); logger.info("Creating profile");
const profileFilename = params.filename || "/crawls/profiles/profile.tar.gz"; const profileFilename = params.filename || "/crawls/profiles/profile.tar.gz";
@ -270,11 +275,11 @@ async function createProfile(params, browser, page, cdp, targetFilename = "") {
const storage = initStorage(); const storage = initStorage();
if (storage) { if (storage) {
console.log("Uploading to remote storage..."); logger.info("Uploading to remote storage...");
resource = await storage.uploadFile(profileFilename, targetFilename); resource = await storage.uploadFile(profileFilename, targetFilename);
} }
console.log("done"); logger.info("Profile creation done");
return resource; return resource;
} }
@ -311,7 +316,7 @@ function promptInput(msg, hidden = false) {
class InteractiveBrowser { class InteractiveBrowser {
constructor(params, browser, page, cdp, targetId) { constructor(params, browser, page, cdp, targetId) {
console.log("Creating Profile Interactively..."); logger.info("Creating Profile Interactively...");
child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]); child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);
this.params = params; this.params = params;
@ -342,7 +347,7 @@ class InteractiveBrowser {
if (this.shutdownWait) { if (this.shutdownWait) {
this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait); this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
console.log(`Shutting down in ${this.shutdownWait}ms if no ping received`); logger.debug(`Shutting down in ${this.shutdownWait}ms if no ping received`);
} else { } else {
this.shutdownTimer = 0; this.shutdownTimer = 0;
} }
@ -350,12 +355,12 @@ class InteractiveBrowser {
const httpServer = http.createServer((req, res) => this.handleRequest(req, res)); const httpServer = http.createServer((req, res) => this.handleRequest(req, res));
const port = 9223; const port = 9223;
httpServer.listen(port); httpServer.listen(port);
console.log(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`); logger.info(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`);
if (!params.headless) { if (!params.headless) {
console.log("Screencasting with VNC on port 6080"); logger.info("Screencasting with VNC on port 6080");
} else { } else {
console.log("Screencasting with CDP on port 9222"); logger.info("Screencasting with CDP on port 9222");
} }
} }
@ -365,7 +370,7 @@ class InteractiveBrowser {
} }
async saveAllCookies() { async saveAllCookies() {
console.log("Saving all cookies"); logger.info("Saving all cookies");
for (const origin of this.originSet.values()) { for (const origin of this.originSet.values()) {
await this.saveCookiesFor(origin + "/"); await this.saveCookiesFor(origin + "/");
@ -392,13 +397,13 @@ class InteractiveBrowser {
} }
await this.browser.context.addCookies(cookies); await this.browser.context.addCookies(cookies);
} catch (e) { } catch (e) {
console.log("Save Cookie Error: " + e); logger.error("Save Cookie Error: ", e);
} }
} }
addOrigin() { addOrigin() {
const url = this.page.url(); const url = this.page.url();
console.log("Adding origin for", url); logger.debug("Adding origin", {url});
if (url.startsWith("http:") || url.startsWith("https:")) { if (url.startsWith("http:") || url.startsWith("https:")) {
this.originSet.add(new URL(url).origin); this.originSet.add(new URL(url).origin);
} }
@ -431,7 +436,7 @@ class InteractiveBrowser {
if (this.shutdownWait) { if (this.shutdownWait) {
clearInterval(this.shutdownTimer); clearInterval(this.shutdownTimer);
this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait); this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
console.log(`Ping received, delaying shutdown for ${this.shutdownWait}ms`); logger.debug(`Ping received, delaying shutdown for ${this.shutdownWait}ms`);
} }
origins = Array.from(this.originSet.values()); origins = Array.from(this.originSet.values());
@ -468,7 +473,7 @@ class InteractiveBrowser {
} catch (e) { } catch (e) {
res.writeHead(400, {"Content-Type": "application/json"}); res.writeHead(400, {"Content-Type": "application/json"});
res.end(JSON.stringify({"error": e.toString()})); res.end(JSON.stringify({"error": e.toString()}));
console.log(e); logger.warn("HTTP Error", e);
} }
return; return;
@ -491,7 +496,7 @@ class InteractiveBrowser {
} catch (e) { } catch (e) {
res.writeHead(500, {"Content-Type": "application/json"}); res.writeHead(500, {"Content-Type": "application/json"});
res.end(JSON.stringify({"error": e.toString()})); res.end(JSON.stringify({"error": e.toString()}));
console.log(e); logger.warn("HTTP Error", e);
} }
setTimeout(() => process.exit(0), 200); setTimeout(() => process.exit(0), 200);
@ -512,7 +517,7 @@ class InteractiveBrowser {
} catch (e) { } catch (e) {
res.writeHead(500, {"Content-Type": "text/html"}); res.writeHead(500, {"Content-Type": "text/html"});
res.end("<html><body>Profile creation failed! See the browsertrix-crawler console for more info"); res.end("<html><body>Profile creation failed! See the browsertrix-crawler console for more info");
console.log(e); logger.warn("HTTP Error", e);
} }
setTimeout(() => process.exit(0), 200); setTimeout(() => process.exit(0), 200);

View file

@ -1,9 +1,7 @@
#!/usr/bin/env -S node --experimental-global-webcrypto #!/usr/bin/env -S node --experimental-global-webcrypto
import { logger } from "./util/logger.js";
import { Crawler } from "./crawler.js"; import { Crawler } from "./crawler.js";
import { Logger } from "./util/logger.js";
const logger = new Logger();
var crawler = null; var crawler = null;

View file

@ -11,9 +11,7 @@ import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } from "./constants.js";
import { ScopedSeed } from "./seeds.js"; import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js"; import { interpolateFilename } from "./storage.js";
import { screenshotTypes } from "./screenshots.js"; import { screenshotTypes } from "./screenshots.js";
import { Logger } from "./logger.js"; import { logger } from "./logger.js";
const logger = new Logger();
// ============================================================================ // ============================================================================

View file

@ -1,8 +1,6 @@
import fs from "fs"; import fs from "fs";
import { Logger, errJSON } from "./logger.js"; import { logger, errJSON } from "./logger.js";
const logger = new Logger();
const RULE_TYPES = ["block", "allowOnly"]; const RULE_TYPES = ["block", "allowOnly"];

View file

@ -6,13 +6,11 @@ import { Readable } from "node:stream";
import os from "os"; import os from "os";
import path from "path"; import path from "path";
import { Logger } from "./logger.js"; import { logger } from "./logger.js";
import { initStorage } from "./storage.js"; import { initStorage } from "./storage.js";
import { chromium } from "playwright-core"; import { chromium } from "playwright-core";
const logger = new Logger();
const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-")); const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));

View file

@ -2,6 +2,7 @@
export const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"]; export const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
export const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle"]; export const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle"];
export const BEHAVIOR_LOG_FUNC = "__bx_log"; export const BEHAVIOR_LOG_FUNC = "__bx_log";
export const MAX_DEPTH = 1000000;
export const DEFAULT_SELECTORS = [{ export const DEFAULT_SELECTORS = [{
selector: "a[href]", selector: "a[href]",

View file

@ -1,8 +1,6 @@
import http from "http"; import http from "http";
import url from "url"; import url from "url";
import { Logger } from "./logger.js"; import { logger } from "./logger.js";
const logger = new Logger();
// =========================================================================== // ===========================================================================

View file

@ -1,15 +1,3 @@
// ===========================================================================
let logStream = null;
let debugLogging = false;
export function setExternalLogStream(logFH) {
logStream = logFH;
}
export function setDebugLogging(debugLog) {
debugLogging = debugLog;
}
// =========================================================================== // ===========================================================================
// to fix serialization of regexes for logging purposes // to fix serialization of regexes for logging purposes
RegExp.prototype.toJSON = RegExp.prototype.toString; RegExp.prototype.toJSON = RegExp.prototype.toString;
@ -22,8 +10,21 @@ export function errJSON(e) {
// =========================================================================== // ===========================================================================
export class Logger class Logger
{ {
constructor() {
this.logStream = null;
this.debugLogging = null;
}
setExternalLogStream(logFH) {
this.logStream = logFH;
}
setDebugLogging(debugLog) {
this.debugLogging = debugLog;
}
logAsJSON(message, data, context, logLevel="info") { logAsJSON(message, data, context, logLevel="info") {
if (data instanceof Error) { if (data instanceof Error) {
data = errJSON(data); data = errJSON(data);
@ -39,8 +40,8 @@ export class Logger
}; };
const string = JSON.stringify(dataToLog); const string = JSON.stringify(dataToLog);
console.log(string); console.log(string);
if (logStream) { if (this.logStream) {
logStream.write(string + "\n"); this.logStream.write(string + "\n");
} }
} }
@ -57,7 +58,7 @@ export class Logger
} }
debug(message, data={}, context="general") { debug(message, data={}, context="general") {
if (debugLogging) { if (this.debugLogging) {
this.logAsJSON(message, data, context, "debug"); this.logAsJSON(message, data, context, "debug");
} }
} }
@ -67,3 +68,5 @@ export class Logger
process.exit(exitCode); process.exit(exitCode);
} }
} }
export const logger = new Logger();

View file

@ -1,7 +1,5 @@
import Redis from "ioredis"; import Redis from "ioredis";
import { Logger } from "./logger.js"; import { logger } from "./logger.js";
const logger = new Logger();
const error = console.error; const error = console.error;

View file

@ -2,15 +2,12 @@ import ws from "ws";
import http from "http"; import http from "http";
import url from "url"; import url from "url";
import fs from "fs"; import fs from "fs";
//import { v4 as uuidv4 } from "uuid";
import { initRedis } from "./redis.js"; import { initRedis } from "./redis.js";
import { Logger } from "./logger.js"; import { logger } from "./logger.js";
const indexHTML = fs.readFileSync(new URL("../html/screencast.html", import.meta.url), {encoding: "utf8"}); const indexHTML = fs.readFileSync(new URL("../html/screencast.html", import.meta.url), {encoding: "utf8"});
const logger = new Logger();
// =========================================================================== // ===========================================================================
class WSTransport class WSTransport

View file

@ -2,9 +2,7 @@ import fs from "fs";
import path from "path"; import path from "path";
import * as warcio from "warcio"; import * as warcio from "warcio";
import { Logger } from "./logger.js"; import { logger } from "./logger.js";
const logger = new Logger();
// ============================================================================ // ============================================================================

View file

@ -1,8 +1,5 @@
import { Logger } from "./logger.js"; import { logger } from "./logger.js";
import { MAX_DEPTH } from "./constants.js";
const logger = new Logger();
export const MAX_DEPTH = 1000000;
export class ScopedSeed export class ScopedSeed

View file

@ -1,8 +1,7 @@
import { Logger } from "./logger.js"; import { logger } from "./logger.js";
import { MAX_DEPTH } from "./seeds.js"; import { MAX_DEPTH } from "./constants.js";
const logger = new Logger();
// ============================================================================ // ============================================================================
export class RedisCrawlState export class RedisCrawlState

View file

@ -7,12 +7,10 @@ import { createHash } from "crypto";
import Minio from "minio"; import Minio from "minio";
import { initRedis } from "./redis.js"; import { initRedis } from "./redis.js";
import { Logger } from "./logger.js"; import { logger } from "./logger.js";
import getFolderSize from "get-folder-size"; import getFolderSize from "get-folder-size";
const logger = new Logger();
// =========================================================================== // ===========================================================================
export class S3StorageSync export class S3StorageSync

View file

@ -1,6 +1,4 @@
import { Logger } from "./logger.js"; import { logger } from "./logger.js";
const logger = new Logger();
export function sleep(seconds) { export function sleep(seconds) {
return new Promise(resolve => setTimeout(resolve, seconds * 1000)); return new Promise(resolve => setTimeout(resolve, seconds * 1000));

View file

@ -1,15 +1,10 @@
//import PQueue from "p-queue"; import { logger, errJSON } from "./logger.js";
import { Logger, errJSON } from "./logger.js";
import { sleep, timedRun } from "./timing.js"; import { sleep, timedRun } from "./timing.js";
const logger = new Logger();
const MAX_REUSE = 5; const MAX_REUSE = 5;
const NEW_WINDOW_TIMEOUT = 10; const NEW_WINDOW_TIMEOUT = 10;
// =========================================================================== // ===========================================================================
export function runWorkers(crawler, numWorkers, timeout) { export function runWorkers(crawler, numWorkers, timeout) {
logger.info(`Creating ${numWorkers} workers`, {}, "worker"); logger.info(`Creating ${numWorkers} workers`, {}, "worker");