Logging and Behavior Tweaks (#229)

- Ensure page is included in all logging details
- Update logging messages to be a single string, with variables added in the details
- Always wait for all pending wait requests to finish (unless counter < 0)
- Don't set puppeteer-cluster timeout (prep for removing puppeteer-cluster)
- Add behaviorTimeout to running behaviors in crawler, in addition to in behaviors themselves.
- Add logging for behavior start, finish and timeout
- Move writeStats() logging to beginning of each page as well as at the end, to avoid confusion about pending pages.
- For events from frames, use frameUrl along with current page
- deps: bump browsertrix-behaviors to 0.4.2
- version: bump to 0.8.1
This commit is contained in:
Ilya Kreymer 2023-02-23 18:50:22 -08:00 committed by GitHub
parent a4358f4622
commit 5da379cb5f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 116 additions and 82 deletions

View file

@ -166,7 +166,7 @@ export class Crawler {
this.logger.info(`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`, {}, "state");
this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.timeout * 2, os.hostname());
this.crawlState = new RedisCrawlState(redis, this.params.crawlId, this.params.behaviorTimeout + this.params.timeout, os.hostname());
} else {
this.logger.info("Storing state in memory", {}, "state");
@ -324,21 +324,35 @@ export class Crawler {
}
}
_behaviorLog({data, type}) {
_behaviorLog({data, type}, pageUrl) {
let behaviorLine;
let message;
let details;
if (typeof(data) === "string") {
message = data;
details = {};
} else {
message = type === "info" ? "Behavior log" : "Behavior debug";
details = typeof(data) === "object" ? data : {};
}
if (pageUrl) {
details.page = pageUrl;
}
switch (type) {
case "info":
behaviorLine = JSON.stringify(data);
if (behaviorLine != this._behaviorLastLine) {
this.logger.info("Behavior log", data, "behavior");
this.logger.info(message, details, "behaviorScript");
this._behaviorLastLine = behaviorLine;
}
break;
case "debug":
default:
this.logger.debug("Behavior debug", data, "behavior");
this.logger.debug(message, details, "behaviorScript");
}
}
@ -349,7 +363,12 @@ export class Crawler {
}
async crawlPage(opts) {
await this.writeStats();
const {page, data} = opts;
const {url} = data;
const logDetails = {page: url};
if (!this.isInScope(data)) {
this.logger.info("Page no longer in scope", data);
@ -358,7 +377,7 @@ export class Crawler {
try {
if (this.screencaster) {
await this.screencaster.screencastTarget(page.target(), data.url);
await this.screencaster.screencastTarget(page.target(), url);
}
if (this.emulateDevice) {
@ -372,7 +391,7 @@ export class Crawler {
await page.evaluateOnNewDocument("Object.defineProperty(navigator, \"webdriver\", {value: false});");
if (this.params.behaviorOpts && !page.__bx_inited) {
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata));
await page.exposeFunction(BEHAVIOR_LOG_FUNC, (logdata) => this._behaviorLog(logdata, url));
await page.evaluateOnNewDocument(behaviors + `;\nself.__bx_behaviors.init(${this.params.behaviorOpts});`);
page.__bx_inited = true;
}
@ -384,10 +403,10 @@ export class Crawler {
if (this.params.screenshot) {
if (!page.isHTMLPage) {
this.logger.info("Skipping screenshots for non-HTML page");
this.logger.info("Skipping screenshots for non-HTML page", logDetails);
}
const archiveDir = path.join(this.collDir, "archive");
const screenshots = new Screenshots({page, url: data.url, directory: archiveDir});
const screenshots = new Screenshots({page, url, directory: archiveDir});
if (this.params.screenshot.includes("view")) {
await screenshots.take();
}
@ -410,45 +429,58 @@ export class Crawler {
if (this.params.behaviorOpts) {
if (!page.isHTMLPage) {
this.logger.info("Skipping behaviors for non-HTML page", data, "behavior");
this.logger.info("Skipping behaviors for non-HTML page", logDetails, "behavior");
} else {
await Promise.allSettled(
page.frames().
filter(frame => this.shouldRunBehavior(frame)).
map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();"))
);
// also wait for general net idle
await this.netIdle(page);
const behaviorTimeout = this.params.behaviorTimeout / 1000;
this.logger.info("Behaviors started", {behaviorTimeout, ...logDetails}, "behavior");
const res = await Promise.race([
this.sleep(behaviorTimeout),
Promise.allSettled(
page.frames().
filter(frame => this.shouldRunBehavior(frame, logDetails)).
map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();", logDetails, "behavior"))
)
]);
if (res && res.length) {
this.logger.info("Behaviors finished", {finished: res.length, ...logDetails}, "behavior");
} else {
this.logger.error("Behaviors timed out", logDetails, "behavior");
}
}
}
this.logger.info("Page graph data for successfully crawled page", data, "pageGraph");
await this.writeStats();
this.logger.info("Page finished", logDetails, "pageStatus");
await this.checkLimits();
await this.serializeConfig();
} catch (e) {
this.logger.error(`Error crawling page ${data.url}`, e);
this.logger.error("Page Errored", {...e, ...logDetails}, "pageStatus");
await this.markPageFailed(page);
}
}
async shouldRunBehavior(frame) {
async shouldRunBehavior(frame, logDetails) {
if (!frame.parentFrame()) {
return true;
}
const url = frame.url();
const frameUrl = frame.url();
if (url === "about:blank") {
return false;
let res;
if (frameUrl === "about:blank") {
res = false;
} else {
res = !(await this.adBlockRules.shouldBlock(null, frameUrl, logDetails));
}
return !!(await this.adBlockRules.shouldBlock(null, url));
if (!res) {
this.logger.info("Skipping behavior for frame", {frameUrl, ...logDetails}, "behavior");
}
return res;
}
async createWARCInfo(filename) {
@ -591,7 +623,8 @@ export class Crawler {
concurrency: this.params.newContext,
maxConcurrency: this.params.workers,
skipDuplicateUrls: false,
timeout: this.params.timeout * 2,
// effectively disable
timeout: 1e8,
puppeteerOptions: this.puppeteerArgs,
puppeteer,
monitor: false
@ -645,6 +678,8 @@ export class Crawler {
await this.pagesFH.close();
}
await this.writeStats(true);
// extra wait for all resources to land into WARCs
await this.awaitPendingClear();
@ -657,8 +692,6 @@ export class Crawler {
await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
}
await this.writeStats(true);
if (this.params.generateWACZ && (!this.interrupted || this.finalExit || this.clearOnExit)) {
await this.generateWACZ();
@ -783,7 +816,7 @@ export class Crawler {
"pendingPages": pendingList.map(x => JSON.stringify(x))
};
this.logger.info("Crawl statistics", stats, "crawlState");
this.logger.info("Crawl statistics", stats, "crawlStatus");
if (toFile && this.params.statsFilename) {
try {
@ -797,6 +830,8 @@ export class Crawler {
async loadPage(page, urlData, selectorOptsList = DEFAULT_SELECTORS) {
const {url, seedId, depth, extraHops = 0} = urlData;
const logDetails = {page: url};
let isHTMLPage = true;
if (!await this.isHTML(url)) {
@ -840,13 +875,14 @@ export class Crawler {
try {
await page.goto(url, gotoOpts);
if (this.errorCount > 0) {
this.logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`);
this.logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`, logDetails);
this.errorCount = 0;
}
} catch (e) {
let msg = e.message || "";
if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
this.logger.error(`Load Error: ${url}: ${msg}`);
const mainMessage = e.name === "TimeoutError" ? "Page Load Timeout" : "Page Load Error";
this.logger.error(mainMessage, {msg, ...logDetails});
this.errorCount++;
}
}
@ -859,11 +895,9 @@ export class Crawler {
const seed = this.params.scopedSeeds[seedId];
await this.checkCF(page);
await this.checkCF(page, logDetails);
await this.netIdle(page);
await this.sleep(5);
await this.netIdle(page, logDetails);
// skip extraction if at max depth
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
@ -884,7 +918,7 @@ export class Crawler {
}
}
async netIdle(page) {
async netIdle(page, details) {
if (!this.params.netIdleWait) {
return;
}
@ -895,7 +929,7 @@ export class Crawler {
try {
await page.waitForNetworkIdle({timeout: this.params.netIdleWait * 1000});
} catch (e) {
this.logger.info("note: waitForNetworkIdle timed out, ignoring");
this.logger.info("waitForNetworkIdle timed out, ignoring", details);
// ignore, continue
}
}
@ -956,14 +990,14 @@ export class Crawler {
}
}
async checkCF(page) {
async checkCF(page, logDetails) {
try {
while (await page.$("div.cf-browser-verification.cf-im-under-attack")) {
this.logger.info("Cloudflare Check Detected, waiting for reload...");
this.logger.info("Cloudflare Check Detected, waiting for reload...", logDetails);
await this.sleep(5.5);
}
} catch (e) {
this.logger.warn("Check CF failed, ignoring", e);
//this.logger.warn("Check CF failed, ignoring");
}
}
@ -1090,10 +1124,7 @@ export class Crawler {
const redis = await initRedis("redis://localhost/0");
// wait for pending requests up to 120 secs, unless canceling
const MAX_WAIT = 120;
for (let i = 0; i < MAX_WAIT && !this.interrupted; i++) {
while (!this.interrupted) {
try {
const count = Number(await redis.get(`pywb:${this.params.collection}:pending`) || 0);
if (count <= 0) {

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.8.0-beta.2",
"version": "0.8.1",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
@ -13,7 +13,7 @@
"dependencies": {
"@novnc/novnc": "1.4.0-beta",
"abort-controller": "^3.0.0",
"browsertrix-behaviors": "^0.4.1",
"browsertrix-behaviors": "^0.4.2",
"get-folder-size": "^4.0.0",
"ioredis": "^4.27.1",
"js-yaml": "^4.1.0",

View file

@ -64,9 +64,9 @@ export class BlockRules
}
if (this.rules.length) {
this.logger.debug("URL Block Rules:\n");
this.logger.debug("URL Block Rules:\n", {}, "blocking");
for (const rule of this.rules) {
this.logger.debug(rule.toString());
this.logger.debug(rule.toString(), {}, "blocking");
}
}
}
@ -85,21 +85,22 @@ export class BlockRules
await page.setRequestInterception(true);
page.on("request", async (request) => {
const logDetails = {page: page.url()};
try {
await this.handleRequest(request);
await this.handleRequest(request, logDetails);
} catch (e) {
this.logger.warn("Error handling request", e);
this.logger.warn("Error handling request", {...e, ...logDetails}, "blocking");
}
});
}
async handleRequest(request) {
async handleRequest(request, logDetails) {
const url = request.url();
let blockState;
try {
blockState = await this.shouldBlock(request, url);
blockState = await this.shouldBlock(request, url, logDetails);
if (blockState === BlockState.ALLOW) {
await request.continue();
@ -108,11 +109,11 @@ export class BlockRules
}
} catch (e) {
this.logger.debug(`Block: (${blockState}) Failed On: ${url}`, e);
this.logger.debug(`Block: (${blockState}) Failed On: ${url}`, {...e, ...logDetails}, "blocking");
}
}
async shouldBlock(request, url) {
async shouldBlock(request, url, logDetails) {
if (!url.startsWith("http:") && !url.startsWith("https:")) {
return BlockState.ALLOW;
}
@ -151,14 +152,14 @@ export class BlockRules
}
for (const rule of this.rules) {
const {done, block} = await this.ruleCheck(rule, request, url, frameUrl, isNavReq);
const {done, block} = await this.ruleCheck(rule, request, url, frameUrl, isNavReq, logDetails);
if (block) {
if (blockState === BlockState.BLOCK_PAGE_NAV) {
this.logger.warn(`Warning: Block rule match for page request "${url}" ignored, set --exclude to block full pages`);
this.logger.warn("Block rule match for page request ignored, set --exclude to block full pages", {url, ...logDetails}, "blocking");
return BlockState.ALLOW;
}
this.logger.debug(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`);
this.logger.debug("URL Blocked in iframe", {url, frameUrl, ...logDetails}, "blocking");
await this.recordBlockMsg(url);
return blockState;
}
@ -170,7 +171,7 @@ export class BlockRules
return BlockState.ALLOW;
}
async ruleCheck(rule, request, reqUrl, frameUrl, isNavReq) {
async ruleCheck(rule, request, reqUrl, frameUrl, isNavReq, logDetails) {
const {url, inFrameUrl, frameTextMatch} = rule;
const type = rule.type || "block";
@ -190,8 +191,8 @@ export class BlockRules
return {block: false, done: false};
}
const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly;
this.logger.debug(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`);
const block = await this.isTextMatch(request, reqUrl, frameTextMatch, logDetails) ? !allowOnly : allowOnly;
this.logger.debug("URL Conditional rule in iframe", {...logDetails, url, rule: block ? "BLOCKED" : "ALLOWED", frameUrl}, "blocking");
return {block, done: true};
}
@ -200,7 +201,7 @@ export class BlockRules
return {block, done: false};
}
async isTextMatch(request, reqUrl, frameTextMatch) {
async isTextMatch(request, reqUrl, frameTextMatch, logDetails) {
try {
const res = await fetch(reqUrl);
const text = await res.text();
@ -208,7 +209,7 @@ export class BlockRules
return !!text.match(frameTextMatch);
} catch (e) {
this.logger.debug("Error determining text match", e);
this.logger.debug("Error determining text match", {...e, ...logDetails}, "blocking");
}
}
@ -249,19 +250,21 @@ export class AdBlockRules extends BlockRules
await page.setRequestInterception(true);
page.on("request", async (request) => {
const logDetails = {page: page.url()};
try {
await this.handleRequest(request);
await this.handleRequest(request, logDetails);
} catch (e) {
this.logger.warn("Error handling request", e);
this.logger.warn("Error handling request", {...e, ...logDetails}, "blocking");
}
});
}
async shouldBlock(request, url) {
async shouldBlock(request, url, logDetails) {
const fragments = url.split("/");
const domain = fragments.length > 2 ? fragments[2] : null;
if (this.adhosts.includes(domain)) {
this.logger.debug(`URL blocked for being an ad: ${url}`);
this.logger.debug("URL blocked for being an ad", {url, ...logDetails}, "blocking");
await this.recordBlockMsg(url);
return BlockState.BLOCK_AD;
}

View file

@ -143,11 +143,12 @@ export function chromeArgs (proxy, userAgent=null, extraArgs=[]) {
}
export async function evaluateWithCLI(frame, funcString, name = "behaviors") {
export async function evaluateWithCLI(frame, funcString, logData, contextName) {
const context = await frame.executionContext();
const url = frame.url();
logger.info(`Running ${name}...`, {url});
let details = {frameUrl: frame.url(), ...logData};
logger.info("Run Script Started", details, contextName);
// from puppeteer _evaluateInternal() but with includeCommandLineAPI: true
const contextId = context._contextId;
@ -164,13 +165,12 @@ export async function evaluateWithCLI(frame, funcString, name = "behaviors") {
});
if (exceptionDetails) {
const details = exceptionDetails.stackTrace || {};
details.url = url;
logger.error(
`Run ${name} failed: ${exceptionDetails.text}`, details
);
if (exceptionDetails.stackTrace) {
details = {...exceptionDetails.stackTrace, text: exceptionDetails.text, ...details};
}
logger.error("Run Script Failed", details, contextName);
} else {
logger.info(`Run ${name} finished`, {url});
logger.info("Run Script Finished", details, contextName);
}
return result.value;

View file

@ -326,11 +326,11 @@ return 0;
data = JSON.parse(json);
} catch(e) {
logger.error("Invalid queued json", json);
return null;
return undefined;
}
if (!data) {
return null;
return undefined;
}
const url = data.url;

View file

@ -1148,10 +1148,10 @@ browserslist@^4.21.3:
node-releases "^2.0.6"
update-browserslist-db "^1.0.9"
browsertrix-behaviors@^0.4.1:
version "0.4.1"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.4.1.tgz#dfa5beb66348df28ccbaad4ba66fe49ca0dde947"
integrity sha512-zG6eNWqT9z+WvdUSSWg6de8NtYnGs7MHpzezKUgMJg+ze1+xtBgc6jkLT1d64eXC0cZQgwW+WV/e0yLAi1PyPA==
browsertrix-behaviors@^0.4.2:
version "0.4.2"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.4.2.tgz#830f4e37ddebf10dd923237dfd75c734d5d211e9"
integrity sha512-5w6kPL3NB/BkmEGxWt3NT3iddAaSzMR1TtDPS7b66fM9kkhpCjCv/R/zR951jWDIeV3flJFBOy09uI5o8asPqg==
bser@2.1.1:
version "2.1.1"