Implement improved JSONL logging

- Add Logger class with methods for info, error, warn, debug, fatal
- Add context, timestamp, and details fields to log entries
- Log messages as JSON Lines
- Replace puppeteer-cluster stats with custom stats implementation
- Log behaviors by default
- Amend argParser to reflect logging changes
- Capture and log stdout/stderr from awaited child processes
- Modify tests to use webrecorder.net to avoid timeouts
Tessa Walsh 2022-12-15 12:38:41 -05:00
parent 2b03e23174
commit 0192d05f4c
16 changed files with 276 additions and 194 deletions
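Each entry the new Logger writes is a single JSON object per line (JSON Lines), carrying the logLevel, timestamp, context, message, and details fields added by this commit. For illustration, one such entry as produced by the logAsJSON method in util/logger.js below (timestamp value invented):

{"logLevel":"info","timestamp":"2022-12-15T17:38:41.000Z","context":"state","message":"Storing state in memory","details":{}}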

View file

@ -23,6 +23,7 @@ import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screenca
import { Screenshots } from "./util/screenshots.js";
import { parseArgs } from "./util/argParser.js";
import { initRedis } from "./util/redis.js";
import { Logger } from "./util/logger.js";
import { getBrowserExe, loadProfile, chromeArgs, getDefaultUA, evaluateWithCLI } from "./util/browser.js";
@ -46,6 +47,13 @@ const behaviors = fs.readFileSync(new URL("./node_modules/browsertrix-behaviors/
// ============================================================================
export class Crawler {
constructor() {
const res = parseArgs();
this.params = res.parsed;
this.origConfig = res.origConfig;
const debugLogging = this.params.logging.includes("debug");
this.logger = new Logger(debugLogging);
this.headers = {};
this.crawlState = null;
@ -63,23 +71,17 @@ export class Crawler {
this.userAgent = "";
const res = parseArgs();
this.params = res.parsed;
this.origConfig = res.origConfig;
this.saveStateFiles = [];
this.lastSaveTime = 0;
this.saveStateInterval = this.params.saveStateInterval * 1000;
this.debugLogging = this.params.logging.includes("debug");
if (this.params.profile) {
this.statusLog("With Browser Profile: " + this.params.profile);
this.logger.info(`With Browser Profile: ${this.params.profile}`);
}
this.emulateDevice = this.params.emulateDevice;
this.debugLog("Seeds", this.params.scopedSeeds);
this.logger.info("Seeds", this.params.scopedSeeds);
this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
this.capturePrefix = process.env.NO_PROXY ? "" : this.captureBasePrefix + "/id_/";
@ -110,18 +112,6 @@ export class Crawler {
this.done = false;
this._behaviorLastLine = null;
this.logConsole = false;
}
statusLog(...args) {
console.log(...args);
}
debugLog(...args) {
if (this.debugLogging) {
console.log(...args);
}
}
configureUA() {
@ -158,7 +148,7 @@ export class Crawler {
if (redisUrl) {
if (!redisUrl.startsWith("redis://")) {
throw new Error("stateStoreUrl must start with redis:// -- Only redis-based store currently supported");
this.logger.fatal("stateStoreUrl must start with redis:// -- Only redis-based store currently supported");
}
let redis;
@ -168,24 +158,24 @@ export class Crawler {
redis = await initRedis(redisUrl);
break;
} catch (e) {
//throw new Error("Unable to connect to state store Redis: " + redisUrl);
console.warn(`Waiting for redis at ${redisUrl}`);
//this.logger.fatal("Unable to connect to state store Redis: " + redisUrl);
this.logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state");
await this.sleep(3);
}
}
this.statusLog(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`);
this.logger.info(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state");
this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.timeout * 2, os.hostname());
} else {
this.statusLog("Storing state in memory");
this.logger.info("Storing state in memory", {}, "state");
this.crawlState = new MemoryCrawlState();
}
if (this.params.saveState === "always" && this.params.saveStateInterval) {
this.statusLog(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`);
this.logger.info(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`, {}, "state");
}
return this.crawlState;
@ -196,10 +186,10 @@ export class Crawler {
if (this.params.screencastPort) {
transport = new WSTransport(this.params.screencastPort);
this.debugLog(`Screencast server started on: ${this.params.screencastPort}`);
this.logger.debug(`Screencast server started on: ${this.params.screencastPort}`, {}, "screencast");
} else if (this.params.redisStoreUrl && this.params.screencastRedis) {
transport = new RedisPubSubTransport(this.params.redisStoreUrl, this.crawlId);
this.debugLog("Screencast enabled via redis pubsub");
this.logger.debug("Screencast enabled via redis pubsub", {}, "screencast");
}
if (!transport) {
@ -215,7 +205,7 @@ export class Crawler {
const initRes = child_process.spawnSync("wb-manager", ["init", this.params.collection], {cwd: this.params.cwd});
if (initRes.status) {
console.log("wb-manager init failed, collection likely already exists");
this.logger.info("wb-manager init failed, collection likely already exists");
}
await fsp.mkdir(logs, {recursive: true});
@ -236,8 +226,6 @@ export class Crawler {
redisStdio = "ignore";
}
this.logConsole = this.params.logging.includes("jserrors");
this.browserExe = getBrowserExe();
this.configureUA();
@ -249,11 +237,11 @@ export class Crawler {
subprocesses.push(child_process.spawn("redis-server", {cwd: "/tmp/", stdio: redisStdio}));
if (this.params.overwrite) {
console.log(`Clearing ${this.collDir} before starting`);
this.logger.info(`Clearing ${this.collDir} before starting`);
try {
fs.rmSync(this.collDir, { recursive: true, force: true });
} catch(e) {
console.warn(e);
this.logger.error(`Unable to clear ${this.collDir}`, e);
}
}
@ -318,8 +306,7 @@ export class Crawler {
await this.crawl();
status = (!this.interrupted ? "done" : "interrupted");
} catch(e) {
console.error("Crawl failed");
console.error(e);
this.logger.error("Crawl failed", e);
exitCode = 9;
status = "failing";
if (await this.crawlState.incFailCount()) {
@ -327,7 +314,7 @@ export class Crawler {
}
} finally {
console.log(status);
this.logger.info(`Crawl status: ${status}`);
if (this.crawlState) {
await this.crawlState.setStatus(status);
@ -344,16 +331,14 @@ export class Crawler {
case "info":
behaviorLine = JSON.stringify(data);
if (behaviorLine != this._behaviorLastLine) {
console.log(behaviorLine);
this.logger.info("Behavior line", data, "behavior");
this._behaviorLastLine = behaviorLine;
}
break;
case "debug":
default:
if (this.params.behaviorsLogDebug) {
console.log("behavior debug: " + JSON.stringify(data));
}
this.logger.debug("Behavior debug", data, "behavior");
}
}
@ -367,7 +352,7 @@ export class Crawler {
const {page, data} = opts;
if (!this.isInScope(data)) {
console.log(`No longer in scope: ${JSON.stringify(data)}`);
this.logger.info("Page no longer in scope", data);
return;
}
@ -425,7 +410,7 @@ export class Crawler {
if (this.params.behaviorOpts) {
if (!page.isHTMLPage) {
console.log("Skipping behaviors for non-HTML page");
this.logger.info("Skipping behaviors for non-HTML page", data, "behavior");
} else {
await Promise.allSettled(page.frames().map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();")));
@ -434,6 +419,8 @@ export class Crawler {
}
}
this.logger.info("Page graph data for successfully crawled page", data, "pageGraph");
await this.writeStats();
await this.checkLimits();
@ -441,7 +428,7 @@ export class Crawler {
await this.serializeConfig();
} catch (e) {
console.warn(e);
this.logger.error(`Error crawling page ${data.url}`, e.message);
await this.markPageFailed(page);
}
}
@ -470,14 +457,14 @@ export class Crawler {
switch (pathname) {
case "/healthz":
if (this.errorCount < threshold) {
console.log(`health check ok, num errors ${this.errorCount} < ${threshold}`);
this.logger.debug(`health check ok, num errors ${this.errorCount} < ${threshold}`);
res.writeHead(200);
res.end();
}
return;
}
console.log(`health check failed: ${this.errorCount} >= ${threshold}`);
this.logger.error(`health check failed: ${this.errorCount} >= ${threshold}`);
res.writeHead(503);
res.end();
}
@ -491,7 +478,7 @@ export class Crawler {
const size = await getDirSize(dir);
if (size >= this.params.sizeLimit) {
console.log(`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`);
this.logger.info(`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`);
interrupt = true;
this.clearOnExit = true;
}
@ -500,7 +487,7 @@ export class Crawler {
if (this.params.timeLimit) {
const elapsed = (Date.now() - this.startTime) / 1000;
if (elapsed > this.params.timeLimit) {
console.log(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
this.logger.info(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
interrupt = true;
}
}
@ -522,9 +509,9 @@ export class Crawler {
if (!markDone) {
this.params.waitOnDone = false;
this.clearOnExit = true;
console.log("SIGNAL: Preparing for exit of this crawler instance only");
this.logger.info("SIGNAL: Preparing for exit of this crawler instance only");
} else {
console.log("SIGNAL: Preparing for final exit of all crawlers");
this.logger.info("SIGNAL: Preparing for final exit of all crawlers");
this.finalExit = true;
}
}
@ -539,7 +526,7 @@ export class Crawler {
if (this.params.healthCheckPort) {
this.healthServer = http.createServer((...args) => this.healthCheck(...args));
this.statusLog(`Healthcheck server started on ${this.params.healthCheckPort}`);
this.logger.info(`Healthcheck server started on ${this.params.healthCheckPort}`);
this.healthServer.listen(this.params.healthCheckPort);
}
@ -547,7 +534,7 @@ export class Crawler {
const driverUrl = new URL(this.params.driver, import.meta.url);
this.driver = (await import(driverUrl)).default;
} catch(e) {
console.warn(e);
this.logger.warn(`Error importing driver ${this.params.driver}`, e);
return;
}
@ -556,7 +543,7 @@ export class Crawler {
let initState = await this.crawlState.getStatus();
while (initState === "debug") {
console.log("Paused for debugging, will continue after manual resume");
this.logger.info("Paused for debugging, will continue after manual resume");
await this.sleep(60);
@ -568,7 +555,7 @@ export class Crawler {
this.done = true;
if (this.params.waitOnDone) {
this.statusLog("Already done, waiting for signal to exit...");
this.logger.info("Already done, waiting for signal to exit...");
// wait forever until signal
await new Promise(() => {});
@ -589,7 +576,7 @@ export class Crawler {
timeout: this.params.timeout * 2,
puppeteerOptions: this.puppeteerArgs,
puppeteer,
monitor: this.params.logging.includes("stats")
monitor: false
});
@ -606,11 +593,11 @@ export class Crawler {
await this.initPages();
if (this.params.blockAds) {
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, (text) => this.debugLog(text));
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, this.logger);
}
if (this.params.blockRules && this.params.blockRules.length) {
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text));
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, this.logger);
}
this.screencaster = this.initScreenCaster();
@ -637,8 +624,6 @@ export class Crawler {
await this.serializeConfig(true);
this.writeStats();
if (this.pagesFH) {
await this.pagesFH.sync();
await this.pagesFH.close();
@ -652,27 +637,28 @@ export class Crawler {
}
if (this.params.generateCDX) {
this.statusLog("Generating CDX");
await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd}));
this.logger.info("Generating CDX");
await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
}
await this.writeStats(true);
if (this.params.generateWACZ && (!this.interrupted || this.finalExit || this.clearOnExit)) {
await this.generateWACZ();
if (this.clearOnExit) {
console.log(`Clearing ${this.collDir} before exit`);
this.logger.info(`Clearing ${this.collDir} before exit`);
try {
fs.rmSync(this.collDir, { recursive: true, force: true });
} catch(e) {
console.warn(e);
this.logger.warn(`Unable to clear ${this.collDir} before exit`, e);
}
}
}
if (this.params.waitOnDone && (!this.interrupted || this.finalExit)) {
this.done = true;
this.statusLog("All done, waiting for signal...");
this.logger.info("All done, waiting for signal...");
await this.crawlState.setStatus("done");
// wait forever until signal
@ -681,7 +667,7 @@ export class Crawler {
}
async generateWACZ() {
this.statusLog("Generating WACZ");
this.logger.info("Generating WACZ");
const archiveDir = path.join(this.collDir, "archive");
@ -691,13 +677,13 @@ export class Crawler {
// is finished (>0 pages and all pages written)
const isFinished = await this.crawlState.isFinished();
console.log(`Num WARC Files: ${warcFileList.length}`);
this.logger.info(`Num WARC Files: ${warcFileList.length}`);
if (!warcFileList.length) {
// if finished, just return
if (isFinished) {
return;
}
throw new Error("No WARC Files, assuming crawl failed");
this.logger.fatal("No WARC Files, assuming crawl failed");
}
// Build the argument list to pass to the wacz create command
@ -720,14 +706,14 @@ export class Crawler {
warcFileList.forEach((val, index) => createArgs.push(path.join(archiveDir, val))); // eslint-disable-line no-unused-vars
// create WACZ
const waczResult = await this.awaitProcess(child_process.spawn("wacz" , createArgs, {stdio: "inherit"}));
const waczResult = await this.awaitProcess(child_process.spawn("wacz" , createArgs));
if (waczResult !== 0) {
console.log("create result", waczResult);
throw new Error("Unable to write WACZ successfully");
this.logger.error("Error creating WACZ", {"status code": waczResult});
this.logger.fatal("Unable to write WACZ successfully");
}
this.debugLog(`WACZ successfully generated and saved to: ${waczPath}`);
this.logger.debug(`WACZ successfully generated and saved to: ${waczPath}`);
// Verify WACZ
/*
@ -735,11 +721,11 @@ export class Crawler {
validateArgs.push("-f");
validateArgs.push(waczPath);
const waczVerifyResult = await this.awaitProcess(child_process.spawn("wacz", validateArgs, {stdio: "inherit"}));
const waczVerifyResult = await this.awaitProcess(child_process.spawn("wacz", validateArgs));
if (waczVerifyResult !== 0) {
console.log("validate", waczVerifyResult);
throw new Error("Unable to verify WACZ created successfully");
this.logger.fatal("Unable to verify WACZ created successfully");
}
*/
if (this.storage) {
@ -751,23 +737,43 @@ export class Crawler {
}
awaitProcess(proc) {
proc.stdout.on("data", (data) => {
this.logger.info(data.toString());
});
proc.stderr.on("data", (data) => {
this.logger.error(data.toString());
});
return new Promise((resolve) => {
proc.on("close", (code) => resolve(code));
});
}
async writeStats() {
if (this.params.statsFilename) {
const total = this.cluster.allTargetCount;
const workersRunning = this.cluster.workersBusy.length;
const numCrawled = total - (await this.crawlState.size()) - workersRunning;
const limit = {max: this.params.limit || 0, hit: this.limitHit};
const stats = {numCrawled, workersRunning, total, limit};
async writeStats(toFile=false) {
if (!this.params.logging.includes("stats")) {
return;
}
const realSize = await this.crawlState.realSize();
const pending = await this.crawlState.numRealPending();
const done = await this.crawlState.numDone();
const total = realSize + pending + done;
const limit = {max: this.params.limit || 0, hit: this.limitHit};
const stats = {
"crawled": done,
"total": total,
"pending": pending,
"limit": limit,
"pendingPages": Array.from(this.crawlState.pending.values()).map(x => JSON.stringify(x))
};
this.logger.info("Crawl statistics", stats, "crawlState");
if (toFile && this.params.statsFilename) {
try {
await fsp.writeFile(this.params.statsFilename, JSON.stringify(stats, null, 2));
} catch (err) {
console.warn("Stats output failed", err);
this.logger.warn("Stats output failed", err);
}
}
}
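With stats in the logging options (the default), writeStats now reports progress through the Logger instead of the removed puppeteer-cluster monitor; a mid-crawl entry would carry a details payload shaped like this (counts illustrative):

{"logLevel":"info","timestamp":"...","context":"crawlState","message":"Crawl statistics","details":{"crawled":3,"total":7,"pending":2,"limit":{"max":7,"hit":false},"pendingPages":[]}}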
@ -807,26 +813,24 @@ export class Crawler {
// more serious page error, mark page session as invalid
page.on("error", () => this.markPageFailed(page));
if (this.logConsole) {
page.on("console", (msg) => {
if (msg.type() === "error") {
console.log(msg.text(), msg.location());
if (this.params.logging.includes("jserrors") && (msg.type() === "error")) {
this.logger.warn(msg.text(), {"location": msg.location()}, "jsError");
}
});
}
const gotoOpts = isHTMLPage ? this.gotoOpts : "domcontentloaded";
try {
await page.goto(url, gotoOpts);
if (this.errorCount > 0) {
this.statusLog(`Page loaded, resetting error count ${this.errorCount} to 0`);
this.logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`);
this.errorCount = 0;
}
} catch (e) {
let msg = e.message || "";
if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
this.statusLog(`Load Error: ${url}: ${msg}`);
this.logger.error(`Load Error: ${url}: ${msg}`);
this.errorCount++;
}
}
@ -875,7 +879,7 @@ export class Crawler {
try {
await page.waitForNetworkIdle({timeout: this.params.netIdleWait * 1000});
} catch (e) {
console.log("note: waitForNetworkIdle timed out, ignoring");
this.logger.info("note: waitForNetworkIdle timed out, ignoring");
// ignore, continue
}
}
@ -906,7 +910,7 @@ export class Crawler {
}
} catch (e) {
console.warn("Link Extraction failed", e);
this.logger.warn("Link Extraction failed", e);
}
return results;
}
@ -932,19 +936,18 @@ export class Crawler {
}
}
} catch (e) {
console.error("Queuing Error: ", e);
this.logger.error("Queuing Error", e);
}
}
async checkCF(page) {
try {
while (await page.$("div.cf-browser-verification.cf-im-under-attack")) {
this.statusLog("Cloudflare Check Detected, waiting for reload...");
this.logger.info("Cloudflare Check Detected, waiting for reload...");
await this.sleep(5.5);
}
} catch (e) {
//console.warn("Check CF failed, ignoring");
//console.warn(e);
this.logger.warn("Check CF failed, ignoring", e.message);
}
}
@ -987,17 +990,17 @@ export class Crawler {
const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
if (this.params.text) {
header["hasText"] = true;
this.statusLog("Text Extraction: Enabled");
this.logger.info("Text Extraction: Enabled");
} else {
header["hasText"] = false;
this.statusLog("Text Extraction: Disabled");
this.logger.info("Text Extraction: Disabled");
}
const header_formatted = JSON.stringify(header).concat("\n");
await this.pagesFH.writeFile(header_formatted);
}
} catch(err) {
console.error("pages/pages.jsonl creation failed", err);
this.logger.error("pages/pages.jsonl creation failed", err);
}
}
@ -1017,7 +1020,7 @@ export class Crawler {
try {
await this.pagesFH.writeFile(processedRow);
} catch (err) {
console.warn("pages/pages.jsonl append failed", err);
this.logger.warn("pages/pages.jsonl append failed", err);
}
}
@ -1033,7 +1036,7 @@ export class Crawler {
agent: this.resolveAgent
});
if (resp.status !== 200) {
this.debugLog(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
this.logger.debug(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
return true;
}
@ -1067,7 +1070,7 @@ export class Crawler {
}
async awaitPendingClear() {
this.statusLog("Waiting to ensure pending data is written to WARCs...");
this.logger.info("Waiting to ensure pending data is written to WARCs...");
const redis = await initRedis("redis://localhost/0");
@ -1080,7 +1083,7 @@ export class Crawler {
if (count <= 0) {
break;
}
this.debugLog(`Still waiting for ${count} pending requests to finish...`);
this.logger.debug(`Still waiting for ${count} pending requests to finish...`);
} catch (e) {
break;
}
@ -1104,17 +1107,17 @@ export class Crawler {
const { sites } = await sitemapper.fetch();
await this.queueInScopeUrls(seedId, sites, 0);
} catch(e) {
console.warn(e);
this.logger.warn("Error fetching sites from sitemap", e);
}
}
async combineWARC() {
this.statusLog("Generating Combined WARCs");
this.logger.info("Generating Combined WARCs");
// Get the list of created Warcs
const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));
this.debugLog(`Combining ${warcLists.length} WARCs...`);
this.logger.debug(`Combining ${warcLists.length} WARCs...`);
const fileSizeObjects = []; // Used to sort the created warc by fileSize
@ -1181,7 +1184,7 @@ export class Crawler {
fh.write(warcBuffer);
}
this.debugLog(`Appending WARC ${fileSizeObjects[j].fileName}`);
this.logger.debug(`Appending WARC ${fileSizeObjects[j].fileName}`);
const reader = fs.createReadStream(fileSizeObjects[j].fileName);
@ -1198,7 +1201,7 @@ export class Crawler {
await fh.end();
}
this.debugLog(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
this.logger.debug(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
}
async serializeConfig(done = false) {
@ -1248,10 +1251,10 @@ export class Crawler {
}
const res = yaml.dump(this.origConfig, {lineWidth: -1});
try {
this.statusLog("Saving crawl state to: " + filename);
this.logger.info(`Saving crawl state to: ${filename}`);
await fsp.writeFile(filename, res);
} catch (e) {
console.error(`Failed to write save state file: ${filename}`, e);
this.logger.error(`Failed to write save state file: ${filename}`, e);
return;
}
@ -1259,11 +1262,11 @@ export class Crawler {
if (this.saveStateFiles.length > this.params.saveStateHistory) {
const oldFilename = this.saveStateFiles.shift();
this.statusLog(`Removing old save-state: ${oldFilename}`);
this.logger.info(`Removing old save-state: ${oldFilename}`);
try {
await fsp.unlink(oldFilename);
} catch (e) {
console.error(`Failed to delete old save state file: ${oldFilename}`);
this.logger.error(`Failed to delete old save state file: ${oldFilename}`);
}
}

main.js
View file

@ -1,6 +1,9 @@
#!/usr/bin/env -S node --experimental-global-webcrypto
import { Crawler } from "./crawler.js";
import { Logger } from "./util/logger.js";
const logger = new Logger();
var crawler = null;
@ -9,29 +12,29 @@ let forceTerm = false;
async function handleTerminate(signame) {
console.log(`${signame} received...`);
logger.info(`${signame} received...`);
if (!crawler || !crawler.crawlState) {
console.log("error: no crawler running, exiting");
logger.error("error: no crawler running, exiting");
process.exit(1);
}
if (crawler.done) {
console.log("success: crawler done, exiting");
logger.info("success: crawler done, exiting");
process.exit(0);
}
try {
if (!crawler.crawlState.drainMax) {
console.log("SIGNAL: gracefully finishing current pages...");
logger.info("SIGNAL: gracefully finishing current pages...");
crawler.gracefulFinish();
} else if (forceTerm || (Date.now() - lastSigInt) > 200) {
console.log("SIGNAL: stopping crawl now...");
logger.info("SIGNAL: stopping crawl now...");
await crawler.serializeAndExit();
}
lastSigInt = Date.now();
} catch (e) {
console.log(e);
logger.error("Error stopping crawl after receiving termination signal", e);
}
}
@ -40,7 +43,7 @@ process.on("SIGINT", () => handleTerminate("SIGINT"));
process.on("SIGTERM", () => handleTerminate("SIGTERM"));
process.on("SIGABRT", async () => {
console.log("SIGABRT received, will force immediate exit on SIGTERM/SIGINT");
logger.info("SIGABRT received, will force immediate exit on SIGTERM/SIGINT");
forceTerm = true;
});

View file

@ -45,7 +45,7 @@ test("check yaml config file with seed list is used", async () => {
test("check yaml config file will be overwritten by command line", async () => {
try{
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://www.example.com --timeout 20000");
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://specs.webrecorder.net/ --scopeType page --timeout 20000");
}
catch (error) {
console.log(error);
@ -61,7 +61,7 @@ test("check yaml config file will be overwritten by command line", async () => {
}
}
expect(pages.has("https://www.example.com/")).toBe(true);
expect(pages.has("https://specs.webrecorder.net/")).toBe(true);
expect(pages.size).toBe(1);
});

View file

@ -8,7 +8,7 @@ const exec = util.promisify(execCallback);
test("check that URLs are crawled 2 extra hops beyond depth", async () => {
try {
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7");
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://webrecorder.net/ --limit 7");
}
catch (error) {
console.log(error);
@ -17,13 +17,13 @@ test("check that URLs are crawled 2 extra hops beyond depth", async () => {
const crawled_pages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8");
const expectedPages = [
"https://example.com/",
"https://www.iana.org/domains/example",
"http://www.iana.org/",
"http://www.iana.org/domains",
"http://www.iana.org/protocols",
"http://www.iana.org/numbers",
"http://www.iana.org/about",
"https://webrecorder.net/",
"https://webrecorder.net/blog",
"https://webrecorder.net/tools",
"https://webrecorder.net/community",
"https://webrecorder.net/about",
"https://webrecorder.net/contact",
"https://webrecorder.net/faq",
];
for (const page of crawled_pages.trim().split("\n")) {

View file

@ -1,8 +1,8 @@
name: crawl-test-1
collection: configtest
seeds:
- https://www.example.org
- https://www.iana.org/
- https://webrecorder.net/
- https://specs.webrecorder.net/
generateWACZ: true

View file

@ -1,2 +1,2 @@
https://www.example.org
https://www.example.com
https://webrecorder.net/about/
https://specs.webrecorder.net/wacz/1.1.1/

View file

@ -4,10 +4,10 @@ import fs from "fs";
const exec = util.promisify(execCallback);
test("check that URLs one-depth out from the seed-list are crawled", async () => {
test("check that URLs in seed-list are crawled", async () => {
try {
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000");
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000");
}
catch (error) {
console.log(error);

View file

@ -13,6 +13,9 @@ import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
import { screenshotTypes } from "./screenshots.js";
import { Logger } from "./logger.js";
const logger = new Logger();
// ============================================================================
@ -171,7 +174,7 @@ class ArgParser {
},
"logging": {
describe: "Logging options for crawler, can include: stats, pywb, behaviors, behaviors-debug, jserrors",
describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, pywb, debug",
type: "string",
default: "stats",
},
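Because the crawler tests this option with String.prototype.includes, several logging options can be combined in one comma-separated value. A hypothetical invocation (image and URL as used in the tests in this commit):

docker run webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --logging stats,jserrors,debug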
@ -351,7 +354,7 @@ class ArgParser {
// Check that the collection name is valid.
if (argv.collection.search(/^[\w][\w-]*$/) === -1){
throw new Error(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
}
argv.timeout *= 1000;
@ -365,7 +368,7 @@ class ArgParser {
for (const opt of argv.waitUntil) {
if (!WAIT_UNTIL_OPTS.includes(opt)) {
throw new Error("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));
logger.fatal("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));
}
}
@ -394,30 +397,25 @@ class ArgParser {
if (argv.behaviorTimeout) {
behaviorOpts.timeout = argv.behaviorTimeout *= 1000;
}
if (argv.logging.includes("behaviors")) {
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
} else if (argv.logging.includes("behaviors-debug")) {
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
argv.behaviorsLogDebug = true;
}
argv.behaviorOpts = JSON.stringify(behaviorOpts);
if (argv.newContext) {
console.log("Note: The newContext argument is deprecated in 0.8.0. Values passed to this option will be ignored");
logger.info("Note: The newContext argument is deprecated in 0.8.0. Values passed to this option will be ignored");
}
if (argv.workers > 1) {
console.log("Window context being used to support >1 workers");
logger.info("Window context being used to support >1 workers");
argv.newContext = ReuseWindowConcurrency;
} else {
console.log("Page context being used with 1 worker");
logger.info("Page context being used with 1 worker");
argv.newContext = Cluster.CONCURRENCY_PAGE;
}
if (argv.mobileDevice) {
argv.emulateDevice = puppeteer.devices[argv.mobileDevice];
if (!argv.emulateDevice) {
throw new Error("Unknown device: " + argv.mobileDevice);
logger.fatal("Unknown device: " + argv.mobileDevice);
}
}
@ -442,13 +440,13 @@ class ArgParser {
} else {
argv.netIdleWait = 2;
}
console.log(`Set netIdleWait to ${argv.netIdleWait} seconds`);
logger.info(`Set netIdleWait to ${argv.netIdleWait} seconds`);
}
// prefer argv.include only if string or a non-empty array
if (argv.include && (typeof(argv.include) === "string" || argv.include.length)) {
if (argv.scopeType && argv.scopeType !== "custom") {
console.warn("You've specified a --scopeType and a --scopeIncludeRx / --include regex. The custom scope regex will take precedence, overriding the scopeType");
logger.info("You've specified a --scopeType and a --scopeIncludeRx / --include regex. The custom scope regex will take precedence, overriding the scopeType");
argv.scopeType = "custom";
}
}

View file

@ -1,5 +1,9 @@
import fs from "fs";
import { Logger } from "./logger.js";
const logger = new Logger();
const RULE_TYPES = ["block", "allowOnly"];
const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];
@ -28,7 +32,7 @@ class BlockRule
}
if (!RULE_TYPES.includes(this.type)) {
throw new Error("Rule \"type\" must be: " + RULE_TYPES.join(", "));
logger.fatal("Rule \"type\" must be: " + RULE_TYPES.join(", "));
}
}
@ -47,11 +51,11 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
// ===========================================================================
export class BlockRules
{
constructor(blockRules, blockPutUrl, blockErrMsg, debugLog) {
constructor(blockRules, blockPutUrl, blockErrMsg, logger) {
this.rules = [];
this.blockPutUrl = blockPutUrl;
this.blockErrMsg = blockErrMsg;
this.debugLog = debugLog;
this.logger = logger;
this.blockedUrlSet = new Set();
@ -60,9 +64,9 @@ export class BlockRules
}
if (this.rules.length) {
this.debugLog("URL Block Rules:\n");
this.logger.debug("URL Block Rules:\n");
for (const rule of this.rules) {
this.debugLog(rule.toString());
this.logger.debug(rule.toString());
}
}
}
@ -84,7 +88,7 @@ export class BlockRules
try {
await this.handleRequest(request);
} catch (e) {
console.warn(e);
this.logger.warn("Error handling request", e);
}
});
}
@ -104,7 +108,7 @@ export class BlockRules
}
} catch (e) {
this.debugLog(`Block: (${blockState}) Failed On: ${url} Reason: ${e}`);
this.logger.debug(`Block: (${blockState}) Failed On: ${url}`, e);
}
}
@ -151,10 +155,10 @@ export class BlockRules
if (block) {
if (blockState === BlockState.BLOCK_PAGE_NAV) {
console.warn(`Warning: Block rule match for page request "${url}" ignored, set --exclude to block full pages`);
this.logger.warn(`Block rule match for page request "${url}" ignored, set --exclude to block full pages`);
return BlockState.ALLOW;
}
this.debugLog(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`);
this.logger.debug(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`);
await this.recordBlockMsg(url);
return blockState;
}
@ -187,7 +191,7 @@ export class BlockRules
}
const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly;
this.debugLog(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`);
this.logger.debug(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`);
return {block, done: true};
}
@ -204,7 +208,7 @@ export class BlockRules
return !!text.match(frameTextMatch);
} catch (e) {
this.debugLog(e);
this.logger.debug("Error determining text match", e);
}
}
@ -230,8 +234,8 @@ export class BlockRules
// ===========================================================================
export class AdBlockRules extends BlockRules
{
constructor(blockPutUrl, blockErrMsg, debugLog, adhostsFilePath = "../ad-hosts.json") {
super([], blockPutUrl, blockErrMsg, debugLog);
constructor(blockPutUrl, blockErrMsg, logger, adhostsFilePath = "../ad-hosts.json") {
super([], blockPutUrl, blockErrMsg, logger);
this.adhosts = JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url)));
}
@ -248,7 +252,7 @@ export class AdBlockRules extends BlockRules
try {
await this.handleRequest(request);
} catch (e) {
console.warn(e);
this.logger.warn("Error handling request", e);
}
});
}
@ -257,7 +261,7 @@ export class AdBlockRules extends BlockRules
const fragments = url.split("/");
const domain = fragments.length > 2 ? fragments[2] : null;
if (this.adhosts.includes(domain)) {
this.debugLog(`URL blocked for being an ad: ${url}`);
this.logger.debug(`URL blocked for being an ad: ${url}`);
await this.recordBlockMsg(url);
return BlockState.BLOCK_AD;
}

View file

@ -4,6 +4,9 @@ import path from "path";
import os from "os";
import request from "request";
import { initStorage } from "./storage.js";
import { Logger } from "./logger.js";
const logger = new Logger();
const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
@ -13,7 +16,7 @@ export async function loadProfile(profileFilename) {
if (profileFilename &&
(profileFilename.startsWith("http:") || profileFilename.startsWith("https:"))) {
console.log(`Downloading ${profileFilename} to ${targetFilename}`);
logger.info(`Downloading ${profileFilename} to ${targetFilename}`, {}, "browserProfile");
const p = new Promise((resolve, reject) => {
request.get(profileFilename).
@ -29,7 +32,7 @@ export async function loadProfile(profileFilename) {
const storage = initStorage("");
if (!storage) {
throw new Error("Profile specified relative to s3 storage, but no S3 storage defined");
logger.fatal("Profile specified relative to s3 storage, but no S3 storage defined");
}
await storage.downloadFile(profileFilename.slice(1), targetFilename);
@ -41,7 +44,7 @@ export async function loadProfile(profileFilename) {
try {
child_process.execSync("tar xvfz " + profileFilename, {cwd: profileDir});
} catch (e) {
console.error(`Profile filename ${profileFilename} not a valid tar.gz`);
logger.error(`Profile filename ${profileFilename} not a valid tar.gz`);
}
}
@ -72,7 +75,7 @@ export function getDefaultUA() {
version = child_process.execFileSync(getBrowserExe(), ["--version"], {encoding: "utf8"});
version = version.match(/[\d.]+/)[0];
} catch(e) {
console.error(e);
logger.error("Error getting default UserAgent", e);
}
return `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
@ -158,7 +161,7 @@ export async function evaluateWithCLI(frame, funcString) {
});
if (exceptionDetails) {
throw new Error(
logger.fatal(
"Behavior Evaluation Failed" + exceptionDetails.text
);
}

util/logger.js (new file)
View file

@ -0,0 +1,45 @@
// ===========================================================================
export class Logger
{
constructor(debugLogging=false) {
this.debugLogging = debugLogging;
}
logAsJSON(message, data, context, logLevel="info") {
if (data && typeof data !== "object") {
data = {"message": data.toString()};
}
const dataToLog = {
"logLevel": logLevel,
"timestamp": new Date().toISOString(),
"context": context,
"message": message,
"details": data ? data : {}
};
console.log(JSON.stringify(dataToLog));
}
info(message, data={}, context="general") {
this.logAsJSON(message, data, context);
}
error(message, data={}, context="general") {
this.logAsJSON(message, data, context, "error");
}
warn(message, data={}, context="general") {
this.logAsJSON(message, data, context, "warn");
}
debug(message, data={}, context="general") {
if (this.debugLogging) {
this.logAsJSON(message, data, context, "debug");
}
}
fatal(message, data={}, context="general", exitCode=1) {
this.logAsJSON(`${message}. Quitting`, data, context, "fatal");
process.exit(exitCode);
}
}
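A minimal usage sketch of the class (messages and contexts taken from call sites elsewhere in this commit; printed line abridged):

import { Logger } from "./util/logger.js";

const logger = new Logger(true); // debugLogging enabled, so debug() emits too
logger.info("Text Extraction: Enabled"); // data defaults to {}, context to "general"
logger.debug("Screencast enabled via redis pubsub", {}, "screencast");
// each call prints one JSON line, e.g.:
// {"logLevel":"info","timestamp":"...","context":"general","message":"Text Extraction: Enabled","details":{}}
// fatal() logs at level "fatal" with ". Quitting" appended, then calls process.exit(exitCode)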

View file

@ -2,6 +2,9 @@ import fs from "fs";
import path from "path";
import * as warcio from "warcio";
import { Logger } from "./logger.js";
const logger = new Logger();
// ============================================================================
@ -43,9 +46,9 @@ export class Screenshots {
const warcRecord = await this.wrap(screenshotBuffer, screenshotType, options.type);
const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});
fs.appendFileSync(this.warcName, warcRecordBuffer);
console.log(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
} catch (e) {
console.log(`Taking screenshot (type: ${screenshotType}) failed for ${this.url}`, e);
logger.error(`Taking screenshot (type: ${screenshotType}) failed for ${this.url}`, e.message);
}
}

View file

@ -1,7 +1,15 @@
import { Logger } from "./logger.js";
const logger = new Logger();
export class ScopedSeed
{
constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} = {}) {
const parsedUrl = this.parseUrl(url);
if (!parsedUrl) {
logger.fatal(`Invalid Seed "${url}" - not a valid URL`);
}
this.url = parsedUrl.href;
this.include = this.parseRx(include);
this.exclude = this.parseRx(exclude);
@ -36,11 +44,12 @@ export class ScopedSeed
try {
parsedUrl = new URL(url.trim());
} catch (e) {
throw new Error(`Invalid Seed "${url}" - not a valid URL`);
logger.error(`Invalid Seed "${url}" - not a valid URL`);
return null;
}
if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
throw new Error(`Invalid Seed "${url}" - URL must start with http:// or https://`);
logger.error(`Invalid Seed "${url}" - URL must start with http:// or https://`);
parsedUrl = null;
}
return parsedUrl;
@ -91,7 +100,7 @@ export class ScopedSeed
break;
default:
throw new Error(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`);
logger.fatal(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`);
}
return [include, allowHash];
@ -106,9 +115,8 @@ export class ScopedSeed
return false;
}
try {
url = this.parseUrl(url);
} catch(e) {
if (!url) {
return false;
}

View file

@ -1,6 +1,11 @@
import mod from "puppeteer-cluster/dist/Job.js";
import { Logger } from "./logger.js";
const Job = mod.default;
const logger = new Logger();
// ============================================================================
export class BaseState
@ -102,7 +107,7 @@ export class MemoryCrawlState extends BaseState
},
reject(e) {
console.warn(`Page Load Failed: ${url}, Reason: ${e}`);
logger.warn(`Page Load Failed: ${url}`, e.message);
state.pending.delete(url);
@ -316,7 +321,7 @@ return 0;
try {
data = JSON.parse(json);
} catch(e) {
console.error("Invalid queued json: ", json);
logger.error("Invalid queued json", json);
return null;
}
@ -338,7 +343,7 @@ return 0;
},
async reject(e) {
console.warn(`Page Load Failed: ${url}, Reason: ${e}`);
logger.warn(`Page Load Failed: ${url}`, e.message);
await state._fail(url);
}
};
@ -445,7 +450,7 @@ return 0;
for (const url of pendingUrls) {
if (await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, url)) {
console.log("Requeued: " + url);
logger.info(`Requeued: ${url}`);
}
}
}

View file

@ -7,9 +7,12 @@ import { createHash } from "crypto";
import Minio from "minio";
import { initRedis } from "./redis.js";
import { Logger } from "./logger.js";
import getFolderSize from "get-folder-size";
const logger = new Logger();
// ===========================================================================
export class S3StorageSync
@ -57,10 +60,13 @@ export class S3StorageSync
}
async uploadFile(srcFilename, targetFilename) {
console.log(`Bucket: ${this.bucketName}`);
console.log(`Crawl Id: ${this.crawlId}`);
console.log(`Prefix: ${this.objectPrefix}`);
console.log(`Target Filename: ${targetFilename}`);
const fileUploadInfo = {
"bucket": this.bucketName,
"crawlId": this.crawlId,
"prefix": this.objectPrefix,
"targetFilename": this.targetFilename
};
logger.info("S3 file upload information", fileUploadInfo, "s3Upload");
await this.client.fPutObject(this.bucketName, this.objectPrefix + targetFilename, srcFilename);
@ -76,7 +82,7 @@ export class S3StorageSync
async uploadCollWACZ(srcFilename, targetFilename, completed = true) {
const resource = await this.uploadFile(srcFilename, targetFilename);
console.log(resource);
logger.info("WACZ S3 file upload resource", resource, "s3Upload");
if (this.webhookUrl) {
const body = {
@ -92,14 +98,14 @@ export class S3StorageSync
completed
};
console.log("Pinging Webhook: " + this.webhookUrl);
logger.info(`Pinging Webhook: ${this.webhookUrl}`);
if (this.webhookUrl.startsWith("http://") || this.webhookUrl.startsWith("https://")) {
await fetch(this.webhookUrl, {method: "POST", body: JSON.stringify(body)});
} else if (this.webhookUrl.startsWith("redis://")) {
const parts = this.webhookUrl.split("/");
if (parts.length !== 5) {
throw new Error("redis webhook url must be in format: redis://<host>:<port>/<db>/<key>");
logger.fatal("redis webhook url must be in format: redis://<host>:<port>/<db>/<key>");
}
const redis = await initRedis(parts.slice(0, 4).join("/"));
await redis.rpush(parts[4], JSON.stringify(body));

View file

@ -1,5 +1,9 @@
import sbi from "puppeteer-cluster/dist/concurrency/SingleBrowserImplementation.js";
import { Logger } from "./logger.js";
const logger = new Logger();
const SingleBrowserImplementation = sbi.default;
@ -40,7 +44,7 @@ export class ReuseWindowConcurrency extends SingleBrowserImplementation {
}
this.repairing = true;
console.debug("Starting repair");
logger.warn("Starting browser repair");
if (this.screencaster) {
this.screencaster.endAllTargets();
@ -50,13 +54,13 @@ export class ReuseWindowConcurrency extends SingleBrowserImplementation {
// will probably fail, but just in case the repair was not necessary
await this.browser.close();
} catch (e) {
console.debug("Unable to close browser.");
logger.warn("Unable to close browser");
}
try {
await this.init();
} catch (err) {
console.debug("Unable to restart chrome.");
logger.warn("Unable to restart chrome");
}
this.repairRequested = false;
this.repairing = false;
@ -71,7 +75,7 @@ export class ReuseWindowConcurrency extends SingleBrowserImplementation {
const res = await this.cdp.send("Target.createTarget", {url: this.startPage, newWindow: true});
targetId = res.targetId;
} catch (e) {
console.warn(e);
logger.warn("Error getting new page in window context", e);
await this.repair();
}