improve logging / healthcheck responsiveness:

- avoid double logging of exceptions
- ensure page is included in all errors
- treat some timeouts as warnings
- remove unused responseHeader call in browser
- fix screenshot error logging

healthcheck:
- interrupt crawl directly after healthcheck fails
Ilya Kreymer 2023-08-31 11:47:49 -07:00
parent 1c486ea1f3
commit 416a62fb7a
7 changed files with 140 additions and 97 deletions
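
Two conventions recur in the diffs below. Timeouts that are expected in normal operation are downgraded to warnings via a new isWarn flag on timedRun, and errors that have already been logged with page context are rethrown with the sentinel message "logged" so the page worker does not log them a second time. A minimal sketch of the sentinel pattern, with doSomething() and url as stand-ins and the logger import path assumed:

// illustrative sketch; doSomething() and url are stand-ins
import { logger } from "./util/logger.js";

try {
  await doSomething();
} catch (e) {
  // log once, with the page URL attached...
  logger.error("Page Load Error, skipping page", {msg: e.message, page: url});
  // ...then mark the error as already reported before rethrowing
  e.message = "logged";
  throw e;
}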

View file

@@ -546,6 +546,10 @@ self.__bx_behaviors.selectMainBehavior();
if (this.healthChecker) {
this.healthChecker.incError();
}
+ this.interrupted = this.healthChecker.isFailing();
+ if (this.interrupted) {
+ logger.warn("Interrupting due to too many failures, will restart");
+ }
}
await this.serializeConfig();
@@ -992,7 +996,7 @@ self.__bx_behaviors.selectMainBehavior();
logger.debug(stdout.join("\n"));
}
if (stderr.length && this.params.logging.includes("debug")) {
- logger.error(stderr.join("\n"));
+ logger.debug(stderr.join("\n"));
}
resolve(code);
});
@@ -1046,20 +1050,24 @@ self.__bx_behaviors.selectMainBehavior();
const failCrawlOnError = ((depth === 0) && this.params.failOnFailedSeed);
- let isHTMLPage = await timedRun(
+ data.isHTMLPage = await timedRun(
this.isHTML(url),
FETCH_TIMEOUT_SECS,
"HEAD request to determine if URL is HTML page timed out",
- logDetails
+ logDetails,
+ "fetch",
+ true
);
- if (!isHTMLPage) {
+ if (!data.isHTMLPage) {
try {
const captureResult = await timedRun(
this.directFetchCapture(url),
FETCH_TIMEOUT_SECS,
"Direct fetch capture attempt timed out",
- logDetails
+ logDetails,
+ "fetch",
+ true
);
if (captureResult) {
logger.info("Direct fetch successful", {url, ...logDetails}, "fetch");
@@ -1070,73 +1078,11 @@ self.__bx_behaviors.selectMainBehavior();
}
}
- let ignoreAbort = false;
- // Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
- // if so, don't report as an error
- page.once("requestfailed", (req) => {
- ignoreAbort = shouldIgnoreAbort(req);
- });
- if (isHTMLPage) {
- page.once("domcontentloaded", () => {
- data.loadState = LoadState.CONTENT_LOADED;
- });
- }
- const gotoOpts = isHTMLPage ? this.gotoOpts : {waitUntil: "domcontentloaded"};
- logger.info("Awaiting page load", logDetails);
- try {
- const resp = await page.goto(url, gotoOpts);
- // Handle 4xx or 5xx response as a page load error
- const statusCode = resp.status();
- if (statusCode.toString().startsWith("4") || statusCode.toString().startsWith("5")) {
- if (failCrawlOnError) {
- logger.fatal("Seed Page Load Error, failing crawl", {statusCode, ...logDetails});
- } else {
- logger.error("Page Load Error, skipping page", {statusCode, ...logDetails});
- throw new Error(`Page ${url} returned status code ${statusCode}`);
- }
- }
- const contentType = await this.browser.responseHeader(resp, "content-type");
- isHTMLPage = this.isHTMLContentType(contentType);
- } catch (e) {
- let msg = e.message || "";
- if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
- if (e.name === "TimeoutError") {
- if (data.loadState !== LoadState.CONTENT_LOADED) {
- if (failCrawlOnError) {
- logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
- } else {
- logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
- throw e;
- }
- } else {
- logger.warn("Page Loading Slowly, skipping behaviors", {msg, ...logDetails});
- data.skipBehaviors = true;
- }
- } else {
- if (failCrawlOnError) {
- logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
- } else {
- logger.error("Page Load Error, skipping page", {msg, ...logDetails});
- throw e;
- }
- }
- }
- }
+ await this.doGotoPage(page, url, data, logDetails, failCrawlOnError);
data.loadState = LoadState.FULL_PAGE_LOADED;
- data.isHTMLPage = isHTMLPage;
- if (isHTMLPage) {
+ if (data.isHTMLPage) {
let frames = await page.frames();
frames = await Promise.allSettled(frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)));
@@ -1147,7 +1093,7 @@ self.__bx_behaviors.selectMainBehavior();
data.filteredFrames = [];
}
- if (!isHTMLPage) {
+ if (!data.isHTMLPage) {
logger.debug("Skipping link extraction for non-HTML page", logDetails);
return;
}
@@ -1172,6 +1118,82 @@ self.__bx_behaviors.selectMainBehavior();
}
}
+ async doGotoPage(page, url, data, logDetails, failCrawlOnError) {
+ let ignoreAbort = false;
+ // Detect if ERR_ABORTED is actually caused by trying to load a non-page (eg. downloadable PDF),
+ // if so, don't report as an error
+ page.once("requestfailed", (req) => {
+ ignoreAbort = shouldIgnoreAbort(req);
+ });
+ if (data.isHTMLPage) {
+ page.once("domcontentloaded", () => {
+ data.loadState = LoadState.CONTENT_LOADED;
+ });
+ }
+ const gotoOpts = data.isHTMLPage ? this.gotoOpts : {waitUntil: "domcontentloaded"};
+ logger.info("Awaiting page load", logDetails);
+ let mainPageResp = null;
+ try {
+ mainPageResp = await page.goto(url, gotoOpts);
+ } catch (e) {
+ let msg = e.message || "";
+ if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
+ if (e.name === "TimeoutError") {
+ if (data.loadState !== LoadState.CONTENT_LOADED) {
+ if (failCrawlOnError) {
+ logger.fatal("Seed Page Load Timeout, failing crawl", {msg, ...logDetails});
+ } else {
+ logger.error("Page Load Timeout, skipping page", {msg, ...logDetails});
+ e.message = "logged";
+ throw e;
+ }
+ } else {
+ logger.warn("Page Loading Slowly, skipping behaviors", {msg, ...logDetails});
+ data.skipBehaviors = true;
+ }
+ } else {
+ if (failCrawlOnError) {
+ logger.fatal("Seed Page Load Error, failing crawl", {msg, ...logDetails});
+ } else {
+ logger.error("Page Load Error, skipping page", {msg, ...logDetails});
+ e.message = "logged";
+ throw e;
+ }
+ }
+ }
+ }
+ if (!mainPageResp) {
+ return;
+ }
+ // Handle 4xx or 5xx response as a page load error
+ const statusCode = mainPageResp.status();
+ if (statusCode.toString().startsWith("4") || statusCode.toString().startsWith("5")) {
+ if (failCrawlOnError) {
+ logger.fatal("Seed Page Load Error, failing crawl", {statusCode, ...logDetails});
+ } else {
+ logger.error("Invalid status Code, skipping page", {statusCode, ...logDetails});
+ throw new Error("logged");
+ }
+ }
+ // attempt to determine content-type more accurately, ignore if failed
+ try {
+ data.isHTMLPage = this.isHTMLContentType(mainPageResp.headers()["content-type"]);
+ } catch (e) {
+ // ignore
+ }
+ }
async netIdle(page, details) {
if (!this.params.netIdleWait) {
return;
@@ -1267,7 +1289,11 @@ self.__bx_behaviors.selectMainBehavior();
while (await timedRun(
page.$("div.cf-browser-verification.cf-im-under-attack"),
- PAGE_OP_TIMEOUT_SECS
+ PAGE_OP_TIMEOUT_SECS,
+ "Cloudflare check timed out",
+ logDetails,
+ "general",
+ true
)) {
logger.debug("Cloudflare Check Detected, waiting for reload...", logDetails);
await sleep(5.5);
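
Both fetch-probe call sites above now pass two extra arguments to timedRun: a log context and a trailing true for the new isWarn flag (signature in the timing diff further down), so a slow HEAD request or direct fetch is logged as a warning rather than an error. Annotated, the first call reads:

data.isHTMLPage = await timedRun(
  this.isHTML(url),       // HEAD request to probe the content type
  FETCH_TIMEOUT_SECS,
  "HEAD request to determine if URL is HTML page timed out",
  logDetails,             // includes the page URL
  "fetch",                // log context
  true                    // isWarn: log a timeout as warning, not error
);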

View file

@@ -280,10 +280,6 @@ export class Browser extends BaseBrowser
return {page, cdp};
}
- async responseHeader(resp, header) {
- return await resp.headers()[header];
- }
async evaluateWithCLI(_, frame, cdp, funcString, logData, contextName) {
const context = await frame.executionContext();
cdp = context._client;
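
With the one-line responseHeader wrapper removed (it needlessly awaited a plain property lookup), callers read headers directly off the Puppeteer response object, as doGotoPage now does:

// resp is a Puppeteer HTTPResponse; headers() returns a plain object
const contentType = resp.headers()["content-type"];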

View file

@@ -10,17 +10,22 @@ export class HealthChecker
this.port = port;
this.errorCount = 0;
this.errorThreshold = errorThreshold;
+ this.resetCount = 0;
this.healthServer = http.createServer((...args) => this.healthCheck(...args));
logger.info(`Healthcheck server started on ${port}`, {}, "healthcheck");
this.healthServer.listen(port);
}
+ isFailing() {
+ return this.errorCount >= this.errorThreshold;
+ }
async healthCheck(req, res) {
const pathname = url.parse(req.url).pathname;
switch (pathname) {
case "/healthz":
- if (this.errorCount < this.errorThreshold) {
+ if (!this.isFailing()) {
logger.debug(`health check ok, num errors ${this.errorCount} < ${this.errorThreshold}`, {}, "healthcheck");
res.writeHead(200);
res.end();
@@ -34,9 +39,13 @@ export class HealthChecker
}
resetErrors() {
- if (this.errorCount > 0) {
+ this.resetCount++;
+ if (this.errorCount > 0 && this.resetCount >= 2) {
logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`, {}, "healthcheck");
this.errorCount = 0;
+ this.resetCount = 0;
+ } else if (!this.errorCount) {
+ this.resetCount = 0;
}
}
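
The new resetCount means one successfully loaded page no longer wipes out accumulated errors: resetErrors() must run twice in a row while errors are outstanding before errorCount returns to zero. A standalone sketch of that state machine, using a plain object in place of the class:

const hc = { errorCount: 3, resetCount: 0 };

function resetErrors(hc) {
  hc.resetCount++;
  if (hc.errorCount > 0 && hc.resetCount >= 2) {
    hc.errorCount = 0;   // two consecutive successes: clear errors
    hc.resetCount = 0;
  } else if (!hc.errorCount) {
    hc.resetCount = 0;   // nothing outstanding, nothing to count
  }
}

resetErrors(hc);  // errorCount still 3 (resetCount == 1)
resetErrors(hc);  // errorCount reset to 0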

View file

@@ -3,7 +3,7 @@ import path from "path";
import * as warcio from "warcio";
import sharp from "sharp";
- import { logger } from "./logger.js";
+ import { logger, errJSON } from "./logger.js";
// ============================================================================
@@ -47,7 +47,7 @@ export class Screenshots {
await this.writeBufferToWARC(screenshotBuffer, screenshotType, options.type);
logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
} catch (e) {
- logger.error(`Taking screenshot (type: ${screenshotType}) failed for ${this.url}`, e.message);
+ logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
}
}
@@ -56,8 +56,8 @@ export class Screenshots {
}
async takeThumbnail() {
- try {
const screenshotType = "thumbnail";
+ try {
await this.browser.setViewport(this.page, {width: 1920, height: 1080});
const options = screenshotTypes[screenshotType];
const screenshotBuffer = await this.page.screenshot(options);
@@ -68,7 +68,7 @@ export class Screenshots {
await this.writeBufferToWARC(thumbnailBuffer, screenshotType, options.type);
logger.info(`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`);
} catch (e) {
- logger.error(`Taking screenshot (type: thumbnail) failed for ${this.url}`, e.message);
+ logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
}
}
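
Both catch blocks now emit one structured record instead of a prose message plus a bare e.message, so the failing page URL, the screenshot type, and the serialized error travel together in the details object. errJSON comes from logger.js; the sketch assumes it flattens an Error into plain JSON-safe fields:

import { logger, errJSON } from "./logger.js";

const url = "https://example.com/";  // stand-in for this.url
try {
  throw new Error("screenshot failed");
} catch (e) {
  logger.error("Taking screenshot failed", {"page": url, type: "view", ...errJSON(e)}, "screenshots");
}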

View file

@@ -134,7 +134,7 @@ end
});
redis.defineCommand("requeue", {
- numberOfKeys: 3,
+ numberOfKeys: 4,
lua: `
local res = redis.call('get', KEYS[3]);
if not res then
@@ -148,6 +148,9 @@ if not res then
redis.call('zadd', KEYS[2], 0, json);
return 1;
else
+ data['failed'] = '1';
+ json = cjson.encode(data);
+ redis.call('lpush', KEYS[4], json);
return 2;
end
end
@@ -179,7 +182,8 @@ return 0;
}
async markFailed(url) {
- await this.redis.movefailed(this.pkey, this.fkey, url, "1", "failed");
+ //await this.redis.movefailed(this.pkey, this.fkey, url, "1", "failed");
+ await this.requeueOrFail(url);
return await this.redis.incr(this.dkey);
}
@@ -416,7 +420,12 @@ return 0;
const pendingUrls = await this.redis.hkeys(this.pkey);
for (const url of pendingUrls) {
- const res = await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, url, this.maxRetryPending);
+ await this.requeueOrFail(url);
+ }
+ }
+ async requeueOrFail(url) {
+ const res = await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, this.fkey, url, this.maxRetryPending);
switch (res) {
case 1:
logger.info(`Requeued: ${url}`);
@@ -426,7 +435,7 @@ return 0;
logger.info(`Not requeuing anymore: ${url}`);
break;
}
- }
+ return res === 1;
}
async queueSize() {
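
markFailed and the pending-URL sweep now share one helper, requeueOrFail, and the requeue script takes this.fkey as its fourth key so that exhausting maxRetryPending marks the URL failed and pushes it onto the failed list inside the same atomic Lua script. The return-code contract, as a sketch (state stands in for the crawl-state instance):

// requeue script results: 1 = requeued, 2 = retries exhausted and moved to
// the failed list, 0 = no pending entry found
const requeued = await state.requeueOrFail(url);  // true only when res === 1
if (!requeued) {
  // URL is either on the failed list now or was never pending
}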

View file

@@ -4,7 +4,7 @@ export function sleep(seconds) {
return new Promise(resolve => setTimeout(resolve, seconds * 1000));
}
- export function timedRun(promise, seconds, message="Promise timed out", logDetails={}, context="general") {
+ export function timedRun(promise, seconds, message="Promise timed out", logDetails={}, context="general", isWarn=false) {
// return Promise return value or log error if timeout is reached first
const timeout = seconds * 1000;
@@ -17,7 +17,8 @@ export function timedRun(promise, seconds, message="Promise timed out", logDetai
return Promise.race([promise, rejectPromiseOnTimeout(timeout)])
.catch((err) => {
if (err == "timeout reached") {
- logger.error(message, {"seconds": seconds, ...logDetails}, context);
+ const logFunc = isWarn ? logger.warn : logger.error;
+ logFunc.call(logger, message, {"seconds": seconds, ...logDetails}, context);
} else {
//logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
throw err;
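
The trailing isWarn parameter is the hook the rest of the commit uses to downgrade expected timeouts. Note the logFunc.call(logger, ...) form: invoking a detached method reference would lose this, so the call is explicitly rebound to the logger. A usage sketch, with the import path assumed and page, url, and fetchHead as stand-ins:

import { timedRun } from "./timing.js";  // path assumed

// default: a timeout is logged as an error
await timedRun(page.goto(url), 90, "Page load timed out", {page: url});

// isWarn = true: a timeout is logged as a warning
await timedRun(fetchHead(url), 30, "HEAD request timed out", {page: url}, "fetch", true);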

View file

@@ -167,14 +167,16 @@ export class PageWorker
this.crawler.crawlPage(opts),
this.maxPageTime,
"Page Worker Timeout",
- {workerid},
+ this.logDetails,
"worker"
),
this.crashBreak
]);
} catch (e) {
+ if (e.message !== "logged") {
logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
+ }
} finally {
await this.crawler.pageFinished(data);
}
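
This is the consumer side of the "logged" sentinel: crawlPage logs page failures itself, with page context, and rethrows with e.message = "logged", so the worker's catch only reports exceptions nothing else has surfaced. An end-to-end sketch with stand-in names (load, url) and an assumed logger path:

import { logger, errJSON } from "./util/logger.js";  // path assumed

async function crawlPage() {
  try {
    await load(url);                 // stand-in for the real page load
  } catch (e) {
    logger.error("Page Load Error, skipping page", {msg: e.message, page: url});
    e.message = "logged";            // already reported, with page context
    throw e;                         // still abort this page
  }
}

// worker side: only surface errors that nothing else has logged
try {
  await crawlPage();
} catch (e) {
  if (e.message !== "logged") {
    logger.error("Worker Exception", {...errJSON(e)}, "worker");
  }
}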