Better tracking of failed requests + logging context exclude (#485)

- add --logExcludeContext for specifying log contexts that should be excluded
(while --logContext specifies which contexts are included)
- enable 'recorderNetwork' logging for debugging CDP network traffic
- create a default log context exclude list (containing: screencast,
recorderNetwork, jsError), customizable via --logExcludeContext (see the
sketch following this list)
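
The two flags compose as an allow-list followed by a deny-list: a message is
emitted only if its context is in --logContext (when that list is non-empty)
and is not in --logExcludeContext. A minimal sketch of that filter as a
standalone function, not the crawler's actual Logger class:

type LogContext = string;

function shouldLog(
  context: LogContext,
  contexts: LogContext[], // from --logContext (empty = include all)
  excludeContexts: LogContext[], // from --logExcludeContext
): boolean {
  // include filter: when set, only listed contexts pass
  if (contexts.length && !contexts.includes(context)) {
    return false;
  }
  // exclude filter: listed contexts are always dropped
  if (excludeContexts.includes(context)) {
    return false;
  }
  return true;
}

// with the new defaults, recorderNetwork messages are dropped unless overridden
shouldLog("recorderNetwork", [], ["recorderNetwork", "jsError", "screencast"]); // false
shouldLog("recorder", [], ["recorderNetwork", "jsError", "screencast"]); // true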

recorder: track failed requests and include them in pageinfo records with
status code 0
- clean up CDP handler methods
- intercept requestWillBeSent to track requests that started (but may
not complete)
- fix shouldSkip() to still work if no URL is provided (e.g. when checking
only headers)
- set status to 0 for async fetch failures
- remove response interception on requestServedFromCache, as response data is
generally not available then, and responseReceived is still called
- pageinfo: include page requests that failed with status code 0, and also
include the 'error' text if available (illustrated in the sketch after this
list)
- ensure the page is closed on failure
- ensure the pageinfo record is still written even if nothing else is crawled
for a page
- track cached responses and add them to debug logging (they can also be added
to pageinfo later if needed)
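
For reference, a sketch of the resulting pageinfo "urls" entries, using the
PageInfoValue shape added in this change (example values taken from the
updated test expectations below):

type PageInfoValue = {
  status: number;
  mime?: string;
  type?: string;
  error?: string;
  fromBrowserCache?: boolean;
};

const urls: Record<string, PageInfoValue> = {
  // a captured resource (from the webrecorder.net/about expectations)
  "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
    { status: 200, mime: "font/woff2", type: "font" },
  // a request that never completed: status 0 plus the loadingFailed errorText
  "https://invalid.invalid/": {
    status: 0,
    type: "document",
    error: "net::ERR_NAME_NOT_RESOLVED",
  },
};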

tests: add a pageinfo test for crawling an invalid URL, which should still
result in a pageinfo record with status code 0

bump to 1.0.0-beta.7
Ilya Kreymer 2024-03-07 08:35:53 -08:00 committed by GitHub
parent 65133c9d9d
commit 9f18a49c0a
8 changed files with 299 additions and 131 deletions

View file

@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.0.0-beta.6",
"version": "1.0.0-beta.7",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",

View file

@@ -189,7 +189,8 @@ export class Crawler {
const debugLogging = this.params.logging.includes("debug");
logger.setDebugLogging(debugLogging);
logger.setLogLevel(this.params.logLevel);
logger.setContext(this.params.context);
logger.setContext(this.params.logContext);
logger.setExcludeContext(this.params.logExcludeContext);
// if automatically restarts on error exit code,
// exit with 0 from fatal by default, to avoid unnecessary restart

View file

@@ -15,7 +15,11 @@ import {
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
import { screenshotTypes } from "./screenshots.js";
import { LOG_CONTEXT_TYPES, logger } from "./logger.js";
import {
DEFAULT_EXCLUDE_LOG_CONTEXTS,
LOG_CONTEXT_TYPES,
logger,
} from "./logger.js";
// ============================================================================
class ArgParser {
@@ -225,6 +229,14 @@ class ArgParser {
coerce,
},
logExcludeContext: {
describe: "Comma-separated list of contexts to NOT include in logs",
type: "array",
default: DEFAULT_EXCLUDE_LOG_CONTEXTS,
choices: LOG_CONTEXT_TYPES,
coerce,
},
text: {
describe:
"Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",

View file

@@ -26,6 +26,7 @@ export const LOG_CONTEXT_TYPES = [
"general",
"worker",
"recorder",
"recorderNetwork",
"writer",
"state",
"redis",
@@ -51,13 +52,20 @@ export const LOG_CONTEXT_TYPES = [
export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
export const DEFAULT_EXCLUDE_LOG_CONTEXTS: LogContext[] = [
"recorderNetwork",
"jsError",
"screencast",
];
// ===========================================================================
class Logger {
logStream: Writable | null = null;
debugLogging = false;
logErrorsToRedis = false;
logLevels: string[] = [];
contexts: string[] = [];
contexts: LogContext[] = [];
excludeContexts: LogContext[] = [];
crawlState?: RedisCrawlState | null = null;
fatalExitCode = 17;
@@ -81,10 +89,14 @@ class Logger {
this.logLevels = logLevels;
}
setContext(contexts: string[]) {
setContext(contexts: LogContext[]) {
this.contexts = contexts;
}
setExcludeContext(contexts: LogContext[]) {
this.excludeContexts = contexts;
}
setCrawlState(crawlState: RedisCrawlState) {
this.crawlState = crawlState;
}
@@ -92,7 +104,7 @@
logAsJSON(
message: string,
dataUnknown: unknown,
context: string,
context: LogContext,
logLevel = "info",
) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -110,6 +122,12 @@
}
}
if (this.excludeContexts.length) {
if (this.excludeContexts.indexOf(context) >= 0) {
return;
}
}
const dataToLog = {
timestamp: new Date().toISOString(),
logLevel: logLevel,

View file

@@ -42,10 +42,9 @@ const MIME_EVENT_STREAM = "text/event-stream";
const encoder = new TextEncoder();
// =================================================================
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unused-vars
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function logNetwork(msg: string, data: any) {
// logger.debug(msg, data, "recorderNetwork");
logger.debug(msg, data, "recorderNetwork");
}
// =================================================================
@@ -53,6 +52,8 @@ export type PageInfoValue = {
status: number;
mime?: string;
type?: string;
error?: string;
fromBrowserCache?: boolean;
};
// =================================================================
@@ -150,7 +151,6 @@ export class Recorder {
async onCreatePage({ cdp }: { cdp: CDPSession }) {
// Fetch
cdp.on("Fetch.requestPaused", async (params) => {
this.handleRequestPaused(params, cdp);
});
@@ -159,85 +159,41 @@
patterns: [{ urlPattern: "*", requestStage: "Response" }],
});
await cdp.send("Console.enable");
// Response
cdp.on("Network.responseReceived", (params) => {
// handling to fill in security details
logNetwork("Network.responseReceived", {
requestId: params.requestId,
...this.logDetails,
});
this.handleResponseReceived(params);
});
cdp.on("Network.responseReceived", (params) =>
this.handleResponseReceived(params),
);
cdp.on("Network.responseReceivedExtraInfo", (params) => {
logNetwork("Network.responseReceivedExtraInfo", {
requestId: params.requestId,
...this.logDetails,
});
const reqresp = this.pendingReqResp(params.requestId, true);
if (reqresp) {
reqresp.fillResponseReceivedExtraInfo(params);
}
});
cdp.on("Network.responseReceivedExtraInfo", (params) =>
this.handleResponseReceivedExtraInfo(params),
);
// Cache
cdp.on("Network.requestServedFromCache", (params) =>
this.handleRequestServedFromCache(params),
);
// Request
cdp.on("Network.requestWillBeSent", (params) =>
this.handleRequestWillBeSent(params),
);
cdp.on("Network.requestWillBeSent", (params) => {
// only handling redirect here, committing last response in redirect chain
// request data stored from requestPaused
if (params.redirectResponse) {
logNetwork("Network.requestWillBeSent after redirect", {
requestId: params.requestId,
...this.logDetails,
});
this.handleRedirectResponse(params);
}
});
cdp.on("Network.requestServedFromCache", (params) => {
logNetwork("Network.requestServedFromCache", {
requestId: params.requestId,
...this.logDetails,
});
const reqresp = this.pendingReqResp(params.requestId, true);
if (reqresp) {
this.addPageRecord(reqresp);
this.removeReqResp(params.requestId);
}
});
cdp.on("Network.requestWillBeSentExtraInfo", (params) => {
logNetwork("Network.requestWillBeSentExtraInfo", {
requestId: params.requestId,
...this.logDetails,
});
this.handleRequestExtraInfo(params);
});
cdp.on("Network.requestWillBeSentExtraInfo", (params) =>
this.handleRequestExtraInfo(params),
);
// Loading
cdp.on("Network.loadingFinished", (params) => {
logNetwork("Network.loadingFinished", {
requestId: params.requestId,
...this.logDetails,
});
this.handleLoadingFinished(params);
});
cdp.on("Network.loadingFinished", (params) =>
this.handleLoadingFinished(params),
);
cdp.on("Network.loadingFailed", (params) => {
logNetwork("Network.loadingFailed", {
requestId: params.requestId,
...this.logDetails,
});
this.handleLoadingFailed(params);
});
cdp.on("Network.loadingFailed", (params) =>
this.handleLoadingFailed(params),
);
await cdp.send("Network.enable");
// Target
cdp.on("Target.attachedToTarget", async (params) => {
const { url, type, sessionId } = params.targetInfo;
if (type === "service_worker") {
@@ -255,6 +211,13 @@ export class Recorder {
}
});
await cdp.send("Target.setAutoAttach", {
autoAttach: true,
waitForDebuggerOnStart: false,
flatten: true,
});
// Console
cdp.on("Console.messageAdded", (params) => {
const { message } = params;
const { source, level } = message;
@@ -263,17 +226,19 @@
}
});
await cdp.send("Target.setAutoAttach", {
autoAttach: true,
waitForDebuggerOnStart: false,
flatten: true,
});
await cdp.send("Console.enable");
}
handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) {
const { requestId, response, type } = params;
const { mimeType } = response;
const { mimeType, url } = response;
logNetwork("Network.responseReceived", {
requestId,
url,
...this.logDetails,
});
if (mimeType === MIME_EVENT_STREAM) {
return;
@@ -285,15 +250,81 @@
}
reqresp.fillResponse(response, type);
}
this.addPageRecord(reqresp);
handleResponseReceivedExtraInfo(
params: Protocol.Network.ResponseReceivedExtraInfoEvent,
) {
const { requestId } = params;
logNetwork("Network.responseReceivedExtraInfo", {
requestId,
...this.logDetails,
});
const reqresp = this.pendingReqResp(requestId, true);
if (reqresp) {
reqresp.fillResponseReceivedExtraInfo(params);
}
}
handleRequestServedFromCache(
params: Protocol.Network.RequestServedFromCacheEvent,
) {
const { requestId } = params;
const reqresp = this.pendingReqResp(requestId, true);
const url = reqresp?.url;
logNetwork("Network.requestServedFromCache", {
requestId,
url,
...this.logDetails,
});
if (reqresp) {
reqresp.fromCache = true;
}
}
handleRequestWillBeSent(params: Protocol.Network.RequestWillBeSentEvent) {
// only handling redirect here, committing last response in redirect chain
// request data stored from requestPaused
const { redirectResponse, requestId, request, type } = params;
const { headers, method, url } = request;
logNetwork("Network.requestWillBeSent", {
requestId,
redirectResponse,
...this.logDetails,
});
if (redirectResponse) {
this.handleRedirectResponse(params);
} else {
if (!this.shouldSkip(headers, url, method, type)) {
const reqresp = this.pendingReqResp(requestId);
if (reqresp) {
reqresp.fillRequest(request);
}
}
}
}
handleRequestExtraInfo(
params: Protocol.Network.RequestWillBeSentExtraInfoEvent,
) {
if (!this.shouldSkip(params.headers)) {
const reqresp = this.pendingReqResp(params.requestId, true);
const { requestId, headers } = params;
logNetwork("Network.requestWillBeSentExtraInfo", {
requestId,
...this.logDetails,
});
if (!this.shouldSkip(headers)) {
const reqresp = this.pendingReqResp(requestId, true);
if (reqresp) {
reqresp.fillRequestExtraInfo(params);
}
@@ -328,11 +359,19 @@
const { errorText, type, requestId } = params;
const reqresp = this.pendingReqResp(requestId, true);
const url = reqresp?.url;
logNetwork("Network.loadingFailed", {
requestId,
url,
...this.logDetails,
});
if (!reqresp) {
return;
}
const { url } = reqresp;
if (type) {
reqresp.resourceType = type.toLowerCase();
}
@@ -377,19 +416,33 @@
"recorder",
);
}
reqresp.status = 0;
reqresp.errorText = errorText;
this.addPageRecord(reqresp);
this.removeReqResp(requestId);
}
handleLoadingFinished(params: Protocol.Network.LoadingFinishedEvent) {
const reqresp = this.pendingReqResp(params.requestId, true);
const { requestId } = params;
const reqresp = this.pendingReqResp(requestId, true);
const url = reqresp?.url;
logNetwork("Network.loadingFinished", {
requestId,
url,
...this.logDetails,
});
if (!reqresp || reqresp.asyncLoading) {
return;
}
this.removeReqResp(params.requestId);
this.removeReqResp(requestId);
if (!this.isValidUrl(reqresp.url)) {
if (!this.isValidUrl(url)) {
return;
}
@@ -668,7 +721,15 @@ export class Recorder {
if (this.isValidUrl(reqresp.url)) {
const { status, resourceType: type } = reqresp;
const mime = reqresp.getMimeType();
this.pageInfo.urls[reqresp.getCanonURL()] = { status, mime, type };
const info: PageInfoValue = { status, mime, type };
if (reqresp.errorText) {
info.error = reqresp.errorText;
}
//TODO: revisit if we want to record this later
// if (reqresp.fromCache) {
// info.fromBrowserCache = true;
// }
this.pageInfo.urls[reqresp.getCanonURL()] = info;
}
}
@@ -684,9 +745,11 @@
);
this.warcQ.add(() => this.writer.writeSingleRecord(resourceRecord));
return this.pageInfo.ts;
}
async finishPage() {
async awaitPageResources() {
for (const [requestId, reqresp] of this.pendingRequests.entries()) {
if (reqresp.payload) {
this.removeReqResp(requestId);
@@ -726,10 +789,6 @@
await sleep(5.0);
numPending = this.pendingRequests.size;
}
await this.writePageInfoRecord();
return this.pageInfo.ts;
}
async onClosePage() {
@@ -768,7 +827,8 @@
method = headers[":method"];
}
if (!this.isValidUrl(url)) {
// only check if url is provided, since it is optional
if (url && !this.isValidUrl(url)) {
return true;
}
@@ -949,6 +1009,10 @@ export class Recorder {
removeReqResp(requestId: string, allowReuse = false) {
const reqresp = this.pendingRequests.get(requestId);
if (reqresp) {
const { url, requestId } = reqresp;
logNetwork("Removing reqresp", { requestId, url });
}
this.pendingRequests.delete(requestId);
if (!allowReuse) {
this.skipIds.add(requestId);
@@ -957,28 +1021,40 @@
}
async serializeToWARC(reqresp: RequestResponseInfo) {
if (reqresp.shouldSkipSave()) {
const { url, method, status, payload } = reqresp;
logNetwork("Skipping request/response", {
// always include in pageinfo record if going to serialize to WARC
// even if serialization does not happen
this.addPageRecord(reqresp);
const { url, method, status, payload, requestId } = reqresp;
// Specifically log skipping cached resources
if (reqresp.isCached()) {
logger.debug(
"Skipping cached resource, should be already recorded",
{ url, status },
"recorder",
);
return;
} else if (reqresp.shouldSkipSave()) {
logNetwork("Skipping writing request/response", {
requestId,
url,
method,
status,
payloadLength: payload && payload.length,
payloadLength: (payload && payload.length) || 0,
});
return;
}
if (
reqresp.url &&
reqresp.method === "GET" &&
!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, reqresp.url))
url &&
method === "GET" &&
!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url))
) {
logNetwork("Skipping dupe", { url: reqresp.url });
logNetwork("Skipping dupe", { url });
return;
}
this.addPageRecord(reqresp);
const responseRecord = createResponse(reqresp, this.pageid);
const requestRecord = createRequest(reqresp, responseRecord, this.pageid);
@@ -1016,8 +1092,7 @@ export class Recorder {
const res = await fetcher.load();
const mime =
(reqresp &&
reqresp.responseHeaders &&
(reqresp.responseHeaders &&
reqresp.responseHeaders["content-type"] &&
reqresp.responseHeaders["content-type"].split(";")[0]) ||
"";
@@ -1184,8 +1259,6 @@ class AsyncFetcher {
);
}
recorder.addPageRecord(reqresp);
recorder.warcQ.add(() =>
recorder.writer.writeRecordPair(
responseRecord,
@@ -1204,9 +1277,16 @@ class AsyncFetcher {
{ url, networkId, filename, ...formatErr(e), ...logDetails },
"recorder",
);
// indicate response is ultimately not valid
reqresp.status = 0;
reqresp.errorText = e.message;
} finally {
// exclude direct fetch request with fake id
if (networkId !== "0") {
recorder.addPageRecord(reqresp);
recorder.removeReqResp(networkId);
}
}
return fetched;
}

View file

@@ -37,6 +37,8 @@ export class RequestResponseInfo {
status: number = 0;
statusText?: string;
errorText?: string;
responseHeaders?: Record<string, string>;
responseHeadersList?: { name: string; value: string }[];
responseHeadersText?: string;
@@ -44,11 +46,12 @@
payload?: Uint8Array;
// misc
fromServiceWorker: boolean = false;
fromServiceWorker = false;
fromCache = false;
frameId?: string;
fetch: boolean = false;
fetch = false;
resourceType?: string;
@@ -71,13 +74,7 @@
}
fillFetchRequestPaused(params: Protocol.Fetch.RequestPausedEvent) {
this.url = params.request.url;
this.method = params.request.method;
if (!this.requestHeaders) {
this.requestHeaders = params.request.headers;
}
this.postData = params.request.postData;
this.hasPostData = params.request.hasPostData || false;
this.fillRequest(params.request, params.resourceType);
this.status = params.responseStatusCode || 0;
this.statusText = params.responseStatusText || getStatusText(this.status);
@@ -86,14 +83,24 @@
this.fetch = true;
if (params.resourceType) {
this.resourceType = params.resourceType.toLowerCase();
}
this.frameId = params.frameId;
}
fillResponse(response: Protocol.Network.Response, type?: string) {
fillRequest(request: Protocol.Network.Request, resourceType?: string) {
this.url = request.url;
this.method = request.method;
if (!this.requestHeaders) {
this.requestHeaders = request.headers;
}
this.postData = request.postData;
this.hasPostData = request.hasPostData || false;
if (resourceType) {
this.resourceType = resourceType.toLowerCase();
}
}
fillResponse(response: Protocol.Network.Response, resourceType?: string) {
// if initial fetch was a 200, but now replacing with 304, don't!
if (
response.status == 304 &&
@@ -111,8 +118,8 @@
this.protocol = response.protocol;
if (type) {
this.resourceType = type.toLowerCase();
if (resourceType) {
this.resourceType = resourceType.toLowerCase();
}
if (response.requestHeaders) {
@@ -292,9 +299,14 @@ export class RequestResponseInfo {
return true;
}
isCached() {
return this.fromCache && !this.payload;
}
shouldSkipSave() {
// skip OPTIONS/HEAD responses, and 304 or 206 responses
// skip cached, OPTIONS/HEAD responses, and 304 or 206 responses
if (
this.fromCache ||
!this.payload ||
(this.method && ["OPTIONS", "HEAD"].includes(this.method)) ||
[206, 304].includes(this.status)

View file

@@ -298,7 +298,7 @@ export class PageWorker {
async crawlPage(opts: WorkerState) {
const res = await this.crawler.crawlPage(opts);
if (this.recorder) {
opts.data.ts = await this.recorder.finishPage();
await this.recorder.awaitPageResources();
}
return res;
}
@@ -339,7 +339,21 @@ export class PageWorker {
"worker",
);
}
await this.closePage();
} finally {
try {
if (this.recorder) {
opts.data.ts = await this.recorder.writePageInfoRecord();
}
} catch (e) {
logger.error(
"Error writing pageinfo recorder",
{ ...formatErr(e), ...this.logDetails },
"recorder",
);
}
await timedRun(
this.crawler.pageFinished(data),
FINISHED_TIMEOUT,

View file

@@ -5,7 +5,7 @@ import { WARCParser } from "warcio";
test("run warc and ensure pageinfo records contain the correct resources", async () => {
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --url https://webrecorder.net/about --scopeType page --collection page-info-test --combineWARC",
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --url https://webrecorder.net/about --url https://invalid.invalid/ --scopeType page --collection page-info-test --combineWARC",
);
const filename = path.join(
@@ -21,6 +21,7 @@ test("run warc and ensure pageinfo records contain the correct resources", async
let foundIndex = false;
let foundAbout = false;
let foundInvalid = false;
for await (const record of parser) {
if (
@@ -40,10 +41,20 @@ test("run warc and ensure pageinfo records contain the correct resources", async
const text = await record.contentText();
validateResourcesAbout(JSON.parse(text));
}
if (
!foundInvalid &&
record.warcTargetURI === "urn:pageinfo:https://invalid.invalid/"
) {
foundInvalid = true;
const text = await record.contentText();
validateResourcesInvalid(JSON.parse(text));
}
}
expect(foundIndex).toBe(true);
expect(foundAbout).toBe(true);
expect(foundInvalid).toBe(true);
});
function validateResourcesIndex(json) {
@@ -161,5 +172,25 @@ function validateResourcesAbout(json) {
{ status: 200, mime: "font/woff2", type: "font" },
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
{ status: 200, mime: "font/woff2", type: "font" },
"https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2Fabout&d=webrecorder.net":
{
status: 0,
type: "xhr",
error: "net::ERR_BLOCKED_BY_CLIENT",
},
});
}
function validateResourcesInvalid(json) {
expect(json).toHaveProperty("pageid");
expect(json).toHaveProperty("url");
expect(json).toHaveProperty("urls");
expect(json.counts).toEqual({ jsErrors: 0 });
expect(json.urls).toEqual({
"https://invalid.invalid/": {
status: 0,
type: "document",
error: "net::ERR_NAME_NOT_RESOLVED",
},
});
}