Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00
Logger cleanup (#254)
* logging: convert logger to a singleton to simplify use
* add logger to create-login-profile.js
parent 82808d8133
commit 07e503a8e6
17 changed files with 149 additions and 171 deletions
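The substantive change lives in util/logger.js: the module-level logStream and debugLogging state (and the exported setExternalLogStream/setDebugLogging functions) move into the Logger class, the class is no longer exported, and the module instead exports a single shared instance that every file imports. A condensed sketch of the resulting pattern, assembled from the hunks below; the info/warn/error/fatal wrappers are omitted and the exact JSON payload shape is approximated, since those bodies fall outside the diff context:

// util/logger.js after this commit (condensed sketch, not the verbatim file)
class Logger {
  constructor() {
    this.logStream = null;    // external log file stream, set by the crawler at startup
    this.debugLogging = null; // gates debug() output
  }

  setExternalLogStream(logFH) {
    this.logStream = logFH;
  }

  setDebugLogging(debugLog) {
    this.debugLogging = debugLog;
  }

  logAsJSON(message, data, context, logLevel="info") {
    // payload shape approximated; the real dataToLog construction is outside the hunks
    const string = JSON.stringify({logLevel, context, message, details: data});
    console.log(string);
    if (this.logStream) {
      this.logStream.write(string + "\n");
    }
  }

  debug(message, data={}, context="general") {
    if (this.debugLogging) {
      this.logAsJSON(message, data, context, "debug");
    }
  }
}

// the one instance every module now shares
export const logger = new Logger();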
crawler.js (177 changes)

@@ -19,7 +19,7 @@ import { ScreenCaster, WSTransport, RedisPubSubTransport } from "./util/screenca
 import { Screenshots } from "./util/screenshots.js";
 import { parseArgs } from "./util/argParser.js";
 import { initRedis } from "./util/redis.js";
-import { Logger, errJSON, setExternalLogStream, setDebugLogging } from "./util/logger.js";
+import { logger, errJSON } from "./util/logger.js";
 import { runWorkers } from "./util/worker.js";
 import { sleep, timedRun } from "./util/timing.js";
 
@@ -58,10 +58,9 @@ export class Crawler {
     this.logFilename = path.join(this.logDir, `crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`);
 
     const debugLogging = this.params.logging.includes("debug");
-    setDebugLogging(debugLogging);
+    logger.setDebugLogging(debugLogging);
 
-    this.logger = new Logger();
-    this.logger.debug("Writing log to: " + this.logFilename, {}, "init");
+    logger.debug("Writing log to: " + this.logFilename, {}, "init");
 
     this.headers = {};
     this.crawlState = null;
@@ -136,7 +135,7 @@ export class Crawler {
     const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0";
 
     if (!redisUrl.startsWith("redis://")) {
-      this.logger.fatal("stateStoreUrl must start with redis:// -- Only redis-based store currently supported");
+      logger.fatal("stateStoreUrl must start with redis:// -- Only redis-based store currently supported");
     }
 
     let redis;
@@ -146,18 +145,18 @@ export class Crawler {
         redis = await initRedis(redisUrl);
         break;
       } catch (e) {
-        //this.logger.fatal("Unable to connect to state store Redis: " + redisUrl);
-        this.logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state");
+        //logger.fatal("Unable to connect to state store Redis: " + redisUrl);
+        logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state");
         await sleep(3);
       }
     }
 
-    this.logger.debug(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state");
+    logger.debug(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state");
 
     this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.behaviorTimeout + this.params.timeout, os.hostname());
 
     if (this.params.saveState === "always" && this.params.saveStateInterval) {
-      this.logger.debug(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`, {}, "state");
+      logger.debug(`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`, {}, "state");
     }
 
     return this.crawlState;
@@ -168,10 +167,10 @@ export class Crawler {
 
     if (this.params.screencastPort) {
       transport = new WSTransport(this.params.screencastPort);
-      this.logger.debug(`Screencast server started on: ${this.params.screencastPort}`, {}, "screencast");
+      logger.debug(`Screencast server started on: ${this.params.screencastPort}`, {}, "screencast");
     } else if (this.params.redisStoreUrl && this.params.screencastRedis) {
       transport = new RedisPubSubTransport(this.params.redisStoreUrl, this.crawlId);
-      this.logger.debug("Screencast enabled via redis pubsub", {}, "screencast");
+      logger.debug("Screencast enabled via redis pubsub", {}, "screencast");
     }
 
     if (!transport) {
@@ -185,28 +184,28 @@ export class Crawler {
     const initRes = child_process.spawnSync("wb-manager", ["init", this.params.collection], {cwd: this.params.cwd});
 
     if (initRes.status) {
-      this.logger.info("wb-manager init failed, collection likely already exists");
+      logger.info("wb-manager init failed, collection likely already exists");
     }
 
     fs.mkdirSync(this.logDir, {recursive: true});
     this.logFH = fs.createWriteStream(this.logFilename);
-    setExternalLogStream(this.logFH);
+    logger.setExternalLogStream(this.logFH);
 
     this.infoString = await this.getInfoString();
-    this.logger.info(this.infoString);
+    logger.info(this.infoString);
 
-    this.logger.info("Seeds", this.params.scopedSeeds);
+    logger.info("Seeds", this.params.scopedSeeds);
 
     if (this.params.profile) {
-      this.logger.info("With Browser Profile", {url: this.params.profile});
+      logger.info("With Browser Profile", {url: this.params.profile});
     }
 
     if (this.params.overwrite) {
-      this.logger.debug(`Clearing ${this.collDir} before starting`);
+      logger.debug(`Clearing ${this.collDir} before starting`);
       try {
         fs.rmSync(this.collDir, { recursive: true, force: true });
       } catch(e) {
-        this.logger.error(`Unable to clear ${this.collDir}`, e);
+        logger.error(`Unable to clear ${this.collDir}`, e);
       }
     }
 
@@ -277,7 +276,7 @@ export class Crawler {
       await this.crawl();
       status = (!this.interrupted ? "done" : "interrupted");
     } catch(e) {
-      this.logger.error("Crawl failed", e);
+      logger.error("Crawl failed", e);
       exitCode = 9;
       status = "failing";
       if (await this.crawlState.incFailCount()) {
@@ -285,7 +284,7 @@ export class Crawler {
      }
 
    } finally {
-      this.logger.info(`Crawl status: ${status}`);
+      logger.info(`Crawl status: ${status}`);
 
      if (this.crawlState) {
        await this.crawlState.setStatus(status);
@@ -314,14 +313,14 @@ export class Crawler {
     case "info":
       behaviorLine = JSON.stringify(data);
       if (behaviorLine != this._behaviorLastLine) {
-        this.logger.info(message, details, "behaviorScript");
+        logger.info(message, details, "behaviorScript");
         this._behaviorLastLine = behaviorLine;
       }
       break;
 
     case "debug":
     default:
-      this.logger.debug(message, details, "behaviorScript");
+      logger.debug(message, details, "behaviorScript");
     }
   }
 
@@ -337,17 +336,17 @@ export class Crawler {
     if (this.params.logging.includes("jserrors")) {
       page.on("console", (msg) => {
         if (msg.type() === "error") {
-          this.logger.warn(msg.text(), {"location": msg.location(), "page": page.url(), workerid}, "jsError");
+          logger.warn(msg.text(), {"location": msg.location(), "page": page.url(), workerid}, "jsError");
         }
       });
 
       page.on("pageerror", (e) => {
-        this.logger.warn("Page Error", {...errJSON(e), "page": page.url(), workerid}, "jsError");
+        logger.warn("Page Error", {...errJSON(e), "page": page.url(), workerid}, "jsError");
       });
     }
 
     if (this.screencaster) {
-      this.logger.debug("Start Screencast", {workerid}, "screencast");
+      logger.debug("Start Screencast", {workerid}, "screencast");
       await this.screencaster.screencastPage(page, cdp, workerid);
     }
 
@@ -359,7 +358,7 @@ export class Crawler {
 
   async teardownPage({workerid}) {
     if (this.screencaster) {
-      this.logger.debug("End Screencast", {workerid}, "screencast");
+      logger.debug("End Screencast", {workerid}, "screencast");
       await this.screencaster.stopById(workerid);
     }
   }
@@ -375,7 +374,7 @@ export class Crawler {
     data.workerid = workerid;
 
     if (!this.isInScope(data, logDetails)) {
-      this.logger.info("Page no longer in scope", data);
+      logger.info("Page no longer in scope", data);
       return true;
     }
 
@@ -386,7 +385,7 @@ export class Crawler {
 
     if (this.params.screenshot) {
       if (!data.isHTMLPage) {
-        this.logger.debug("Skipping screenshots for non-HTML page", logDetails);
+        logger.debug("Skipping screenshots for non-HTML page", logDetails);
       }
       const archiveDir = path.join(this.collDir, "archive");
       const screenshots = new Screenshots({page, url, directory: archiveDir});
@@ -411,7 +410,7 @@ export class Crawler {
 
     if (this.params.behaviorOpts) {
       if (!data.isHTMLPage) {
-        this.logger.debug("Skipping behaviors for non-HTML page", logDetails, "behavior");
+        logger.debug("Skipping behaviors for non-HTML page", logDetails, "behavior");
       } else {
         const behaviorTimeout = this.params.behaviorTimeout / 1000;
 
@@ -424,12 +423,12 @@ export class Crawler {
         );
 
         if (res && res.length) {
-          this.logger.info("Behaviors finished", {finished: res.length, ...logDetails}, "behavior");
+          logger.info("Behaviors finished", {finished: res.length, ...logDetails}, "behavior");
         }
       }
     }
 
-    this.logger.info("Page finished", logDetails, "pageStatus");
+    logger.info("Page finished", logDetails, "pageStatus");
 
     await this.crawlState.markFinished(url);
 
@@ -450,14 +449,14 @@ export class Crawler {
 
       const context = page.context();
 
-      this.logger.info("Running behaviors", {frames: frames.length, frameUrls: frames.map(frame => frame.url()), ...logDetails}, "behavior");
+      logger.info("Running behaviors", {frames: frames.length, frameUrls: frames.map(frame => frame.url()), ...logDetails}, "behavior");
 
       return await Promise.allSettled(
         frames.map(frame => this.browser.evaluateWithCLI(context, frame, "self.__bx_behaviors.run();", logDetails, "behavior"))
       );
 
     } catch (e) {
-      this.logger.warn("Behavior run failed", {...errJSON(e), ...logDetails}, "behavior");
+      logger.warn("Behavior run failed", {...errJSON(e), ...logDetails}, "behavior");
       return null;
     }
   }
@@ -478,7 +477,7 @@ export class Crawler {
     }
 
     if (!res) {
-      this.logger.debug("Skipping processing frame", {frameUrl, ...logDetails}, "behavior");
+      logger.debug("Skipping processing frame", {frameUrl, ...logDetails}, "behavior");
     }
 
     return res;
@@ -516,7 +515,7 @@ export class Crawler {
       const size = await getDirSize(dir);
 
       if (size >= this.params.sizeLimit) {
-        this.logger.info(`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`);
+        logger.info(`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`);
         interrupt = true;
         this.clearOnExit = true;
       }
@@ -525,7 +524,7 @@ export class Crawler {
     if (this.params.timeLimit) {
       const elapsed = (Date.now() - this.startTime) / 1000;
       if (elapsed > this.params.timeLimit) {
-        this.logger.info(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
+        logger.info(`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`);
         interrupt = true;
       }
     }
@@ -537,7 +536,7 @@ export class Crawler {
 
   gracefulFinish() {
     this.interrupted = true;
-    this.logger.info("Crawler interrupted, gracefully finishing current pages");
+    logger.info("Crawler interrupted, gracefully finishing current pages");
     if (!this.params.waitOnDone) {
       this.finalExit = true;
     }
@@ -547,9 +546,9 @@ export class Crawler {
     if (!markDone) {
       this.params.waitOnDone = false;
       this.clearOnExit = true;
-      this.logger.info("SIGNAL: Preparing for exit of this crawler instance only");
+      logger.info("SIGNAL: Preparing for exit of this crawler instance only");
     } else {
-      this.logger.info("SIGNAL: Preparing for final exit of all crawlers");
+      logger.info("SIGNAL: Preparing for final exit of all crawlers");
       this.finalExit = true;
     }
   }
@@ -570,7 +569,7 @@ export class Crawler {
       const driverUrl = new URL(this.params.driver, import.meta.url);
       this.driver = (await import(driverUrl)).default;
     } catch(e) {
-      this.logger.warn(`Error importing driver ${this.params.driver}`, e);
+      logger.warn(`Error importing driver ${this.params.driver}`, e);
       return;
     }
 
@@ -579,7 +578,7 @@ export class Crawler {
     let initState = await this.crawlState.getStatus();
 
     while (initState === "debug") {
-      this.logger.info("Paused for debugging, will continue after manual resume");
+      logger.info("Paused for debugging, will continue after manual resume");
 
       await sleep(60);
 
@@ -591,7 +590,7 @@ export class Crawler {
       this.done = true;
 
       if (this.params.waitOnDone) {
-        this.logger.info("Already done, waiting for signal to exit...");
+        logger.info("Already done, waiting for signal to exit...");
 
         // wait forever until signal
        await new Promise(() => {});
@@ -617,10 +616,10 @@ export class Crawler {
 
     await this.initPages();
 
-    this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, this.logger);
+    this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, logger);
 
     if (this.params.blockRules && this.params.blockRules.length) {
-      this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, this.logger);
+      this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, logger);
     }
 
     this.screencaster = this.initScreenCaster();
@@ -676,7 +675,7 @@ export class Crawler {
     }
 
     if (this.params.generateCDX) {
-      this.logger.info("Generating CDX");
+      logger.info("Generating CDX");
       await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
     }
 
@@ -686,18 +685,18 @@ export class Crawler {
       await this.generateWACZ();
 
       if (this.clearOnExit) {
-        this.logger.info(`Clearing ${this.collDir} before exit`);
+        logger.info(`Clearing ${this.collDir} before exit`);
         try {
           fs.rmSync(this.collDir, { recursive: true, force: true });
         } catch(e) {
-          this.logger.warn(`Unable to clear ${this.collDir} before exit`, e);
+          logger.warn(`Unable to clear ${this.collDir} before exit`, e);
         }
       }
     }
 
     if (this.params.waitOnDone && (!this.interrupted || this.finalExit)) {
       this.done = true;
-      this.logger.info("All done, waiting for signal...");
+      logger.info("All done, waiting for signal...");
       await this.crawlState.setStatus("done");
 
       // wait forever until signal
@@ -707,7 +706,7 @@ export class Crawler {
 
   async closeLog() {
     // close file-based log
-    setExternalLogStream(null);
+    logger.setExternalLogStream(null);
     try {
       await new Promise(resolve => this.logFH.close(() => resolve()));
     } catch (e) {
@@ -716,7 +715,7 @@ export class Crawler {
   }
 
   async generateWACZ() {
-    this.logger.info("Generating WACZ");
+    logger.info("Generating WACZ");
 
     const archiveDir = path.join(this.collDir, "archive");
 
@@ -726,13 +725,13 @@ export class Crawler {
     // is finished (>0 pages and all pages written)
     const isFinished = await this.crawlState.isFinished();
 
-    this.logger.info(`Num WARC Files: ${warcFileList.length}`);
+    logger.info(`Num WARC Files: ${warcFileList.length}`);
     if (!warcFileList.length) {
       // if finished, just return
       if (isFinished) {
         return;
       }
-      this.logger.fatal("No WARC Files, assuming crawl failed");
+      logger.fatal("No WARC Files, assuming crawl failed");
     }
 
     // Build the argument list to pass to the wacz create command
@@ -764,11 +763,11 @@ export class Crawler {
     const waczResult = await this.awaitProcess(child_process.spawn("wacz" , createArgs));
 
     if (waczResult !== 0) {
-      this.logger.error("Error creating WACZ", {"status code": waczResult});
-      this.logger.fatal("Unable to write WACZ successfully");
+      logger.error("Error creating WACZ", {"status code": waczResult});
+      logger.fatal("Unable to write WACZ successfully");
     }
 
-    this.logger.debug(`WACZ successfully generated and saved to: ${waczPath}`);
+    logger.debug(`WACZ successfully generated and saved to: ${waczPath}`);
 
     // Verify WACZ
     /*
@@ -780,7 +779,7 @@ export class Crawler {
 
     if (waczVerifyResult !== 0) {
       console.log("validate", waczVerifyResult);
-      this.logger.fatal("Unable to verify WACZ created successfully");
+      logger.fatal("Unable to verify WACZ created successfully");
     }
     */
     if (this.storage) {
@@ -793,11 +792,11 @@ export class Crawler {
 
   awaitProcess(proc) {
     proc.stdout.on("data", (data) => {
-      this.logger.debug(data.toString());
+      logger.debug(data.toString());
     });
 
     proc.stderr.on("data", (data) => {
-      this.logger.error(data.toString());
+      logger.error(data.toString());
     });
     return new Promise((resolve) => {
       proc.on("close", (code) => resolve(code));
@@ -822,13 +821,13 @@ export class Crawler {
       "pendingPages": pendingList.map(x => JSON.stringify(x))
     };
 
-    this.logger.info("Crawl statistics", stats, "crawlStatus");
+    logger.info("Crawl statistics", stats, "crawlStatus");
 
     if (toFile && this.params.statsFilename) {
       try {
         await fsp.writeFile(this.params.statsFilename, JSON.stringify(stats, null, 2));
       } catch (err) {
-        this.logger.warn("Stats output failed", err);
+        logger.warn("Stats output failed", err);
       }
     }
   }
@@ -854,7 +853,7 @@ export class Crawler {
         logDetails
       );
       if (captureResult) {
-        this.logger.info("Direct fetch successful", {url, ...logDetails}, "fetch");
+        logger.info("Direct fetch successful", {url, ...logDetails}, "fetch");
         return;
       }
     } catch (e) {
@@ -880,7 +879,7 @@ export class Crawler {
 
     const gotoOpts = isHTMLPage ? this.gotoOpts : {waitUntil: "domcontentloaded"};
 
-    this.logger.info("Awaiting page load", logDetails);
+    logger.info("Awaiting page load", logDetails);
 
     try {
       const resp = await page.goto(url, gotoOpts);
@@ -896,7 +895,7 @@ export class Crawler {
       let msg = e.message || "";
       if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
         const mainMessage = e.name === "TimeoutError" ? "Page Load Timeout" : "Page Load Error";
-        this.logger.error(mainMessage, {msg, ...logDetails});
+        logger.error(mainMessage, {msg, ...logDetails});
         //if (this.healthChecker) {
         //  this.healthChecker.incError();
         //}
@@ -912,7 +911,7 @@ export class Crawler {
     }
 
     if (!isHTMLPage) {
-      this.logger.debug("Skipping link extraction for non-HTML page", logDetails);
+      logger.debug("Skipping link extraction for non-HTML page", logDetails);
       return;
     }
 
@@ -927,7 +926,7 @@ export class Crawler {
       return;
     }
 
-    this.logger.debug("Extracting links");
+    logger.debug("Extracting links");
 
     for (const opts of selectorOptsList) {
       const links = await this.extractLinks(page, data.filteredFrames, opts, logDetails);
@@ -953,7 +952,7 @@ export class Crawler {
     try {
       await page.waitForLoadState("networkidle", {timeout: this.params.netIdleWait * 1000});
     } catch (e) {
-      this.logger.debug("waitForNetworkIdle timed out, ignoring", details);
+      logger.debug("waitForNetworkIdle timed out, ignoring", details);
       // ignore, continue
     }
   }
@@ -989,7 +988,7 @@ export class Crawler {
       let i = 0;
       for (const linkResult of linkResults) {
         if (!linkResult) {
-          this.logger.warn("Link Extraction timed out in frame", {frameUrl: frames[i].url, ...logDetails});
+          logger.warn("Link Extraction timed out in frame", {frameUrl: frames[i].url, ...logDetails});
           continue;
         }
         if (!linkResult.value) continue;
@@ -1001,7 +1000,7 @@ export class Crawler {
       }
 
     } catch (e) {
-      this.logger.warn("Link Extraction failed", e);
+      logger.warn("Link Extraction failed", e);
     }
     return results;
   }
@@ -1027,27 +1026,27 @@ export class Crawler {
         }
       }
     } catch (e) {
-      this.logger.error("Queuing Error", e);
+      logger.error("Queuing Error", e);
     }
   }
 
   async checkCF(page, logDetails) {
     try {
-      this.logger.debug("Check CF Blocking", logDetails);
+      logger.debug("Check CF Blocking", logDetails);
 
       const cloudflare = page.locator("div.cf-browser-verification.cf-im-under-attack");
 
       while (await cloudflare.waitFor({timeout: PAGE_OP_TIMEOUT_SECS})) {
-        this.logger.debug("Cloudflare Check Detected, waiting for reload...", logDetails);
+        logger.debug("Cloudflare Check Detected, waiting for reload...", logDetails);
         await sleep(5.5);
       }
     } catch (e) {
-      //this.logger.warn("Check CF failed, ignoring");
+      //logger.warn("Check CF failed, ignoring");
     }
   }
 
   async queueUrl(seedId, url, depth, extraHops = 0) {
-    this.logger.debug(`Queuing url ${url}`);
+    logger.debug(`Queuing url ${url}`);
     if (this.limitHit) {
       return false;
     }
@@ -1082,17 +1081,17 @@ export class Crawler {
         const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
         if (this.params.text) {
           header["hasText"] = true;
-          this.logger.debug("Text Extraction: Enabled");
+          logger.debug("Text Extraction: Enabled");
         } else {
           header["hasText"] = false;
-          this.logger.debug("Text Extraction: Disabled");
+          logger.debug("Text Extraction: Disabled");
         }
         const header_formatted = JSON.stringify(header).concat("\n");
         await this.pagesFH.writeFile(header_formatted);
       }
 
     } catch(err) {
-      this.logger.error("pages/pages.jsonl creation failed", err);
+      logger.error("pages/pages.jsonl creation failed", err);
     }
   }
 
@@ -1112,7 +1111,7 @@ export class Crawler {
     try {
       await this.pagesFH.writeFile(processedRow);
     } catch (err) {
-      this.logger.warn("pages/pages.jsonl append failed", err);
+      logger.warn("pages/pages.jsonl append failed", err);
     }
   }
 
@@ -1128,7 +1127,7 @@ export class Crawler {
         agent: this.resolveAgent
       });
       if (resp.status !== 200) {
-        this.logger.debug(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
+        logger.debug(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
         return true;
       }
 
@@ -1164,7 +1163,7 @@ export class Crawler {
   }
 
   async awaitPendingClear() {
-    this.logger.info("Waiting to ensure pending data is written to WARCs...");
+    logger.info("Waiting to ensure pending data is written to WARCs...");
 
     const redis = await initRedis("redis://localhost/0");
 
@@ -1174,7 +1173,7 @@ export class Crawler {
         if (count <= 0) {
           break;
         }
-        this.logger.debug("Waiting for pending requests to finish", {numRequests: count});
+        logger.debug("Waiting for pending requests to finish", {numRequests: count});
       } catch (e) {
         break;
       }
@@ -1194,17 +1193,17 @@ export class Crawler {
      const { sites } = await sitemapper.fetch();
      await this.queueInScopeUrls(seedId, sites, 0);
    } catch(e) {
-      this.logger.warn("Error fetching sites from sitemap", e);
+      logger.warn("Error fetching sites from sitemap", e);
    }
  }
 
  async combineWARC() {
-    this.logger.info("Generating Combined WARCs");
+    logger.info("Generating Combined WARCs");
 
    // Get the list of created Warcs
    const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));
 
-    this.logger.debug(`Combining ${warcLists.length} WARCs...`);
+    logger.debug(`Combining ${warcLists.length} WARCs...`);
 
    const fileSizeObjects = []; // Used to sort the created warc by fileSize
 
@@ -1271,7 +1270,7 @@ export class Crawler {
           fh.write(warcBuffer);
         }
 
-        this.logger.debug(`Appending WARC ${fileSizeObjects[j].fileName}`);
+        logger.debug(`Appending WARC ${fileSizeObjects[j].fileName}`);
 
         const reader = fs.createReadStream(fileSizeObjects[j].fileName);
 
@@ -1288,7 +1287,7 @@ export class Crawler {
       await fh.end();
     }
 
-    this.logger.debug(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
+    logger.debug(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
   }
 
   async serializeConfig(done = false) {
@@ -1338,10 +1337,10 @@ export class Crawler {
     }
     const res = yaml.dump(this.origConfig, {lineWidth: -1});
     try {
-      this.logger.info(`Saving crawl state to: ${filename}`);
+      logger.info(`Saving crawl state to: ${filename}`);
       await fsp.writeFile(filename, res);
     } catch (e) {
-      this.logger.error(`Failed to write save state file: ${filename}`, e);
+      logger.error(`Failed to write save state file: ${filename}`, e);
       return;
     }
 
@@ -1349,11 +1348,11 @@ export class Crawler {
 
     if (this.saveStateFiles.length > this.params.saveStateHistory) {
       const oldFilename = this.saveStateFiles.shift();
-      this.logger.info(`Removing old save-state: ${oldFilename}`);
+      logger.info(`Removing old save-state: ${oldFilename}`);
       try {
         await fsp.unlink(oldFilename);
       } catch (e) {
-        this.logger.error(`Failed to delete old save state file: ${oldFilename}`);
+        logger.error(`Failed to delete old save state file: ${oldFilename}`);
       }
     }
 
create-login-profile.js

@@ -9,6 +9,8 @@ import child_process from "child_process";
 
 import yargs from "yargs";
 
+import { logger } from "./util/logger.js";
+
 import { sleep } from "./util/timing.js";
 import { Browser } from "./util/browser.js";
 import { initStorage } from "./util/storage.js";
@@ -106,8 +108,11 @@ async function main() {
     .option(cliOpts())
     .argv;
 
+  logger.setDebugLogging(true);
+
+
   if (!params.headless) {
-    console.log("Launching XVFB");
+    logger.debug("Launching XVFB");
     child_process.spawn("Xvfb", [
       process.env.DISPLAY,
       "-listen",
@@ -144,7 +149,7 @@ async function main() {
   if (params.proxy) {
     child_process.spawn("wayback", ["--live", "--proxy", "live"], {stdio: "inherit", cwd: "/tmp"});
 
-    console.log("Running with pywb proxy");
+    logger.debug("Running with pywb proxy");
 
     await sleep(3000);
 
@@ -171,7 +176,7 @@ async function main() {
   });
 
   if (params.interactive) {
-    console.log("Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode");
+    logger.warn("Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode");
   }
 
   if (params.user || params.password) {
@@ -199,7 +204,7 @@ async function main() {
     await page.addInitScript(behaviors + ";\nself.__bx_behaviors.init();");
   }
 
-  console.log(`Loading page: ${params.url}`);
+  logger.info(`Loading page: ${params.url}`);
 
   await page.goto(params.url, {waitUntil});
 
@@ -216,7 +221,7 @@ async function main() {
 async function automatedProfile(params, browser, page, cdp, waitUntil) {
   let u, p;
 
-  console.log("Looking for username and password entry fields on page...");
+  logger.debug("Looking for username and password entry fields on page...");
 
   try {
     u = await page.waitForSelector("//input[contains(@name, 'user') or contains(@name, 'email')]");
@@ -226,7 +231,7 @@ async function automatedProfile(params, browser, page, cdp, waitUntil) {
     if (params.debugScreenshot) {
       await page.screenshot({path: params.debugScreenshot});
     }
-    console.log("Login form could not be found");
+    logger.debug("Login form could not be found");
     await page.close();
     process.exit(1);
     return;
@@ -255,7 +260,7 @@ async function createProfile(params, browser, page, cdp, targetFilename = "") {
 
   await browser.close();
 
-  console.log("creating profile");
+  logger.info("Creating profile");
 
   const profileFilename = params.filename || "/crawls/profiles/profile.tar.gz";
 
@@ -270,11 +275,11 @@ async function createProfile(params, browser, page, cdp, targetFilename = "") {
 
   const storage = initStorage();
   if (storage) {
-    console.log("Uploading to remote storage...");
+    logger.info("Uploading to remote storage...");
     resource = await storage.uploadFile(profileFilename, targetFilename);
   }
 
-  console.log("done");
+  logger.info("Profile creation done");
   return resource;
 }
 
@@ -311,7 +316,7 @@ function promptInput(msg, hidden = false) {
 
 class InteractiveBrowser {
   constructor(params, browser, page, cdp, targetId) {
-    console.log("Creating Profile Interactively...");
+    logger.info("Creating Profile Interactively...");
     child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);
 
     this.params = params;
@@ -342,7 +347,7 @@ class InteractiveBrowser {
 
     if (this.shutdownWait) {
       this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
-      console.log(`Shutting down in ${this.shutdownWait}ms if no ping received`);
+      logger.debug(`Shutting down in ${this.shutdownWait}ms if no ping received`);
     } else {
       this.shutdownTimer = 0;
     }
@@ -350,12 +355,12 @@ class InteractiveBrowser {
     const httpServer = http.createServer((req, res) => this.handleRequest(req, res));
     const port = 9223;
     httpServer.listen(port);
-    console.log(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`);
+    logger.info(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`);
 
     if (!params.headless) {
-      console.log("Screencasting with VNC on port 6080");
+      logger.info("Screencasting with VNC on port 6080");
     } else {
-      console.log("Screencasting with CDP on port 9222");
+      logger.info("Screencasting with CDP on port 9222");
     }
   }
 
@@ -365,7 +370,7 @@ class InteractiveBrowser {
   }
 
   async saveAllCookies() {
-    console.log("Saving all cookies");
+    logger.info("Saving all cookies");
 
     for (const origin of this.originSet.values()) {
       await this.saveCookiesFor(origin + "/");
@@ -392,13 +397,13 @@ class InteractiveBrowser {
       }
       await this.browser.context.addCookies(cookies);
     } catch (e) {
-      console.log("Save Cookie Error: " + e);
+      logger.error("Save Cookie Error: ", e);
     }
   }
 
   addOrigin() {
     const url = this.page.url();
-    console.log("Adding origin for", url);
+    logger.debug("Adding origin", {url});
     if (url.startsWith("http:") || url.startsWith("https:")) {
       this.originSet.add(new URL(url).origin);
     }
@@ -431,7 +436,7 @@ class InteractiveBrowser {
     if (this.shutdownWait) {
       clearInterval(this.shutdownTimer);
       this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
-      console.log(`Ping received, delaying shutdown for ${this.shutdownWait}ms`);
+      logger.debug(`Ping received, delaying shutdown for ${this.shutdownWait}ms`);
     }
 
     origins = Array.from(this.originSet.values());
@@ -468,7 +473,7 @@ class InteractiveBrowser {
     } catch (e) {
       res.writeHead(400, {"Content-Type": "application/json"});
       res.end(JSON.stringify({"error": e.toString()}));
-      console.log(e);
+      logger.warn("HTTP Error", e);
     }
     return;
 
@@ -491,7 +496,7 @@ class InteractiveBrowser {
     } catch (e) {
      res.writeHead(500, {"Content-Type": "application/json"});
      res.end(JSON.stringify({"error": e.toString()}));
-      console.log(e);
+      logger.warn("HTTP Error", e);
    }
 
    setTimeout(() => process.exit(0), 200);
@@ -512,7 +517,7 @@ class InteractiveBrowser {
    } catch (e) {
      res.writeHead(500, {"Content-Type": "text/html"});
      res.end("<html><body>Profile creation failed! See the browsertrix-crawler console for more info");
-      console.log(e);
+      logger.warn("HTTP Error", e);
    }
 
    setTimeout(() => process.exit(0), 200);
main.js (4 changes)

@@ -1,9 +1,7 @@
 #!/usr/bin/env -S node --experimental-global-webcrypto
 
+import { logger } from "./util/logger.js";
 import { Crawler } from "./crawler.js";
-import { Logger } from "./util/logger.js";
-
-const logger = new Logger();
 
 var crawler = null;
 
util/argParser.js

@@ -11,9 +11,7 @@ import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { interpolateFilename } from "./storage.js";
 import { screenshotTypes } from "./screenshots.js";
-import { Logger } from "./logger.js";
+import { logger } from "./logger.js";
 
-const logger = new Logger();
-
 
 // ============================================================================
util/blockrules.js

@@ -1,8 +1,6 @@
 import fs from "fs";
 
-import { Logger, errJSON } from "./logger.js";
+import { logger, errJSON } from "./logger.js";
 
-const logger = new Logger();
-
 const RULE_TYPES = ["block", "allowOnly"];
 
util/browser.js

@@ -6,13 +6,11 @@ import { Readable } from "node:stream";
 import os from "os";
 import path from "path";
 
-import { Logger } from "./logger.js";
+import { logger } from "./logger.js";
 import { initStorage } from "./storage.js";
 
 import { chromium } from "playwright-core";
 
-const logger = new Logger();
-
 const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
 
 
util/constants.js

@@ -2,6 +2,7 @@
 export const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
 export const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle"];
 export const BEHAVIOR_LOG_FUNC = "__bx_log";
+export const MAX_DEPTH = 1000000;
 
 export const DEFAULT_SELECTORS = [{
   selector: "a[href]",
util/healthcheck.js

@@ -1,8 +1,6 @@
 import http from "http";
 import url from "url";
-import { Logger } from "./logger.js";
+import { logger } from "./logger.js";
 
-const logger = new Logger();
-
 
 // ===========================================================================
util/logger.js

@@ -1,15 +1,3 @@
-// ===========================================================================
-let logStream = null;
-let debugLogging = false;
-
-export function setExternalLogStream(logFH) {
-  logStream = logFH;
-}
-
-export function setDebugLogging(debugLog) {
-  debugLogging = debugLog;
-}
-
 // ===========================================================================
 // to fix serialization of regexes for logging purposes
 RegExp.prototype.toJSON = RegExp.prototype.toString;
@@ -22,8 +10,21 @@ export function errJSON(e) {
 
 
 // ===========================================================================
-export class Logger
+class Logger
 {
+  constructor() {
+    this.logStream = null;
+    this.debugLogging = null;
+  }
+
+  setExternalLogStream(logFH) {
+    this.logStream = logFH;
+  }
+
+  setDebugLogging(debugLog) {
+    this.debugLogging = debugLog;
+  }
+
   logAsJSON(message, data, context, logLevel="info") {
     if (data instanceof Error) {
       data = errJSON(data);
@@ -39,8 +40,8 @@ export class Logger
     };
     const string = JSON.stringify(dataToLog);
     console.log(string);
-    if (logStream) {
-      logStream.write(string + "\n");
+    if (this.logStream) {
+      this.logStream.write(string + "\n");
     }
   }
 
@@ -57,7 +58,7 @@ export class Logger
   }
 
   debug(message, data={}, context="general") {
-    if (debugLogging) {
+    if (this.debugLogging) {
       this.logAsJSON(message, data, context, "debug");
     }
   }
@@ -67,3 +68,5 @@ export class Logger
     process.exit(exitCode);
   }
 }
+
+export const logger = new Logger();
util/redis.js

@@ -1,7 +1,5 @@
 import Redis from "ioredis";
-import { Logger } from "./logger.js";
+import { logger } from "./logger.js";
 
-const logger = new Logger();
-
 const error = console.error;
 
util/screencaster.js

@@ -2,15 +2,12 @@ import ws from "ws";
 import http from "http";
 import url from "url";
 import fs from "fs";
-//import { v4 as uuidv4 } from "uuid";
 
 import { initRedis } from "./redis.js";
-import { Logger } from "./logger.js";
+import { logger } from "./logger.js";
 
 const indexHTML = fs.readFileSync(new URL("../html/screencast.html", import.meta.url), {encoding: "utf8"});
 
-const logger = new Logger();
-
 
 // ===========================================================================
 class WSTransport
util/screenshots.js

@@ -2,9 +2,7 @@ import fs from "fs";
 import path from "path";
 import * as warcio from "warcio";
 
-import { Logger } from "./logger.js";
+import { logger } from "./logger.js";
 
-const logger = new Logger();
-
 // ============================================================================
 
util/seeds.js

@@ -1,8 +1,5 @@
-import { Logger } from "./logger.js";
+import { logger } from "./logger.js";
+import { MAX_DEPTH } from "./constants.js";
 
-const logger = new Logger();
-
-export const MAX_DEPTH = 1000000;
-
 
 export class ScopedSeed
util/state.js

@@ -1,8 +1,7 @@
-import { Logger } from "./logger.js";
+import { logger } from "./logger.js";
 
-import { MAX_DEPTH } from "./seeds.js";
+import { MAX_DEPTH } from "./constants.js";
 
-const logger = new Logger();
 
 // ============================================================================
 export class RedisCrawlState
util/storage.js

@@ -7,12 +7,10 @@ import { createHash } from "crypto";
 import Minio from "minio";
 
 import { initRedis } from "./redis.js";
-import { Logger } from "./logger.js";
+import { logger } from "./logger.js";
 
 import getFolderSize from "get-folder-size";
 
-const logger = new Logger();
-
 
 // ===========================================================================
 export class S3StorageSync
util/timing.js

@@ -1,6 +1,4 @@
-import { Logger } from "./logger.js";
+import { logger } from "./logger.js";
 
-const logger = new Logger();
-
 export function sleep(seconds) {
   return new Promise(resolve => setTimeout(resolve, seconds * 1000));
util/worker.js

@@ -1,15 +1,10 @@
-//import PQueue from "p-queue";
+import { logger, errJSON } from "./logger.js";
 
-import { Logger, errJSON } from "./logger.js";
 import { sleep, timedRun } from "./timing.js";
 
-const logger = new Logger();
-
 const MAX_REUSE = 5;
 
 const NEW_WINDOW_TIMEOUT = 10;
 
 
 // ===========================================================================
 export function runWorkers(crawler, numWorkers, timeout) {
   logger.info(`Creating ${numWorkers} workers`, {}, "worker");
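At the call sites the rewrite is mechanical; a before/after sketch condensed from the crawler.js hunks above:

// before: each module constructed its own Logger
import { Logger, errJSON } from "./util/logger.js";
const logger = new Logger();  // or this.logger = new Logger() inside the Crawler class

// after: all modules import the shared singleton directly
import { logger, errJSON } from "./util/logger.js";
logger.info("Crawl statistics", stats, "crawlStatus");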