mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Better tracking of failed requests + logging context exclude (#485)
- add --logExcludeContext for log contexts that should be excluded (while --logContext specifies which are to be included)
- enable 'recorderNetwork' logging for debugging CDP network
- create default log context exclude list (containing: screencast, recorderNetwork, jsError), customizable via --logExcludeContext

recorder: Track failed requests and include in pageinfo records with status code 0
- clean up CDP handler methods
- intercept requestWillBeSent to track requests that started (but may not complete)
- fix shouldSkip() so that it still works if no url is provided (e.g. to check only headers)
- set status to 0 for async fetch failures
- remove requestServedFromCache interception, as response data is generally not available then, and responseReceived is still called
- pageinfo: include page requests that failed with status code 0, and also include the 'error' text if available
- ensure page is closed on failure
- ensure pageinfo is still written even if nothing else is crawled for a page
- track cached responses, add to debug logging (can also add to pageinfo later if needed)

tests: add pageinfo test for crawling an invalid URL, which should still result in a pageinfo record with status code 0

bump to 1.0.0-beta.7
This commit is contained in:
parent
65133c9d9d
commit
9f18a49c0a
8 changed files with 299 additions and 131 deletions
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "browsertrix-crawler",
|
"name": "browsertrix-crawler",
|
||||||
"version": "1.0.0-beta.6",
|
"version": "1.0.0-beta.7",
|
||||||
"main": "browsertrix-crawler",
|
"main": "browsertrix-crawler",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||||
|
|
|
@ -189,7 +189,8 @@ export class Crawler {
|
||||||
const debugLogging = this.params.logging.includes("debug");
|
const debugLogging = this.params.logging.includes("debug");
|
||||||
logger.setDebugLogging(debugLogging);
|
logger.setDebugLogging(debugLogging);
|
||||||
logger.setLogLevel(this.params.logLevel);
|
logger.setLogLevel(this.params.logLevel);
|
||||||
logger.setContext(this.params.context);
|
logger.setContext(this.params.logContext);
|
||||||
|
logger.setExcludeContext(this.params.logExcludeContext);
|
||||||
|
|
||||||
// if automatically restarts on error exit code,
|
// if automatically restarts on error exit code,
|
||||||
// exit with 0 from fatal by default, to avoid unnecessary restart
|
// exit with 0 from fatal by default, to avoid unnecessary restart
|
||||||
|
|
|
@ -15,7 +15,11 @@ import {
|
||||||
import { ScopedSeed } from "./seeds.js";
|
import { ScopedSeed } from "./seeds.js";
|
||||||
import { interpolateFilename } from "./storage.js";
|
import { interpolateFilename } from "./storage.js";
|
||||||
import { screenshotTypes } from "./screenshots.js";
|
import { screenshotTypes } from "./screenshots.js";
|
||||||
import { LOG_CONTEXT_TYPES, logger } from "./logger.js";
|
import {
|
||||||
|
DEFAULT_EXCLUDE_LOG_CONTEXTS,
|
||||||
|
LOG_CONTEXT_TYPES,
|
||||||
|
logger,
|
||||||
|
} from "./logger.js";
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
class ArgParser {
|
class ArgParser {
|
||||||
|
@ -225,6 +229,14 @@ class ArgParser {
|
||||||
coerce,
|
coerce,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
logExcludeContext: {
|
||||||
|
describe: "Comma-separated list of contexts to NOT include in logs",
|
||||||
|
type: "array",
|
||||||
|
default: DEFAULT_EXCLUDE_LOG_CONTEXTS,
|
||||||
|
choices: LOG_CONTEXT_TYPES,
|
||||||
|
coerce,
|
||||||
|
},
|
||||||
|
|
||||||
text: {
|
text: {
|
||||||
describe:
|
describe:
|
||||||
"Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
|
"Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
|
||||||
|
|
|
@ -26,6 +26,7 @@ export const LOG_CONTEXT_TYPES = [
|
||||||
"general",
|
"general",
|
||||||
"worker",
|
"worker",
|
||||||
"recorder",
|
"recorder",
|
||||||
|
"recorderNetwork",
|
||||||
"writer",
|
"writer",
|
||||||
"state",
|
"state",
|
||||||
"redis",
|
"redis",
|
||||||
|
@ -51,13 +52,20 @@ export const LOG_CONTEXT_TYPES = [
|
||||||
|
|
||||||
export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
|
export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
|
||||||
|
|
||||||
|
export const DEFAULT_EXCLUDE_LOG_CONTEXTS: LogContext[] = [
|
||||||
|
"recorderNetwork",
|
||||||
|
"jsError",
|
||||||
|
"screencast",
|
||||||
|
];
|
||||||
|
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
class Logger {
|
class Logger {
|
||||||
logStream: Writable | null = null;
|
logStream: Writable | null = null;
|
||||||
debugLogging = false;
|
debugLogging = false;
|
||||||
logErrorsToRedis = false;
|
logErrorsToRedis = false;
|
||||||
logLevels: string[] = [];
|
logLevels: string[] = [];
|
||||||
contexts: string[] = [];
|
contexts: LogContext[] = [];
|
||||||
|
excludeContexts: LogContext[] = [];
|
||||||
crawlState?: RedisCrawlState | null = null;
|
crawlState?: RedisCrawlState | null = null;
|
||||||
fatalExitCode = 17;
|
fatalExitCode = 17;
|
||||||
|
|
||||||
|
@ -81,10 +89,14 @@ class Logger {
|
||||||
this.logLevels = logLevels;
|
this.logLevels = logLevels;
|
||||||
}
|
}
|
||||||
|
|
||||||
setContext(contexts: string[]) {
|
setContext(contexts: LogContext[]) {
|
||||||
this.contexts = contexts;
|
this.contexts = contexts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
setExcludeContext(contexts: LogContext[]) {
|
||||||
|
this.excludeContexts = contexts;
|
||||||
|
}
|
||||||
|
|
||||||
setCrawlState(crawlState: RedisCrawlState) {
|
setCrawlState(crawlState: RedisCrawlState) {
|
||||||
this.crawlState = crawlState;
|
this.crawlState = crawlState;
|
||||||
}
|
}
|
||||||
|
@ -92,7 +104,7 @@ class Logger {
|
||||||
logAsJSON(
|
logAsJSON(
|
||||||
message: string,
|
message: string,
|
||||||
dataUnknown: unknown,
|
dataUnknown: unknown,
|
||||||
context: string,
|
context: LogContext,
|
||||||
logLevel = "info",
|
logLevel = "info",
|
||||||
) {
|
) {
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
@ -110,6 +122,12 @@ class Logger {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (this.excludeContexts.length) {
|
||||||
|
if (this.excludeContexts.indexOf(context) >= 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const dataToLog = {
|
const dataToLog = {
|
||||||
timestamp: new Date().toISOString(),
|
timestamp: new Date().toISOString(),
|
||||||
logLevel: logLevel,
|
logLevel: logLevel,
|
||||||
|
|
|
@ -42,10 +42,9 @@ const MIME_EVENT_STREAM = "text/event-stream";
|
||||||
const encoder = new TextEncoder();
|
const encoder = new TextEncoder();
|
||||||
|
|
||||||
// =================================================================
|
// =================================================================
|
||||||
// TODO: Fix this the next time the file is edited.
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unused-vars
|
|
||||||
function logNetwork(msg: string, data: any) {
|
function logNetwork(msg: string, data: any) {
|
||||||
// logger.debug(msg, data, "recorderNetwork");
|
logger.debug(msg, data, "recorderNetwork");
|
||||||
}
|
}
|
||||||
|
|
||||||
// =================================================================
|
// =================================================================
|
||||||
|
@ -53,6 +52,8 @@ export type PageInfoValue = {
|
||||||
status: number;
|
status: number;
|
||||||
mime?: string;
|
mime?: string;
|
||||||
type?: string;
|
type?: string;
|
||||||
|
error?: string;
|
||||||
|
fromBrowserCache?: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
// =================================================================
|
// =================================================================
|
||||||
|
@ -150,7 +151,6 @@ export class Recorder {
|
||||||
|
|
||||||
async onCreatePage({ cdp }: { cdp: CDPSession }) {
|
async onCreatePage({ cdp }: { cdp: CDPSession }) {
|
||||||
// Fetch
|
// Fetch
|
||||||
|
|
||||||
cdp.on("Fetch.requestPaused", async (params) => {
|
cdp.on("Fetch.requestPaused", async (params) => {
|
||||||
this.handleRequestPaused(params, cdp);
|
this.handleRequestPaused(params, cdp);
|
||||||
});
|
});
|
||||||
|
@ -159,85 +159,41 @@ export class Recorder {
|
||||||
patterns: [{ urlPattern: "*", requestStage: "Response" }],
|
patterns: [{ urlPattern: "*", requestStage: "Response" }],
|
||||||
});
|
});
|
||||||
|
|
||||||
await cdp.send("Console.enable");
|
|
||||||
|
|
||||||
// Response
|
// Response
|
||||||
cdp.on("Network.responseReceived", (params) => {
|
cdp.on("Network.responseReceived", (params) =>
|
||||||
// handling to fill in security details
|
this.handleResponseReceived(params),
|
||||||
logNetwork("Network.responseReceived", {
|
);
|
||||||
requestId: params.requestId,
|
|
||||||
...this.logDetails,
|
|
||||||
});
|
|
||||||
this.handleResponseReceived(params);
|
|
||||||
});
|
|
||||||
|
|
||||||
cdp.on("Network.responseReceivedExtraInfo", (params) => {
|
cdp.on("Network.responseReceivedExtraInfo", (params) =>
|
||||||
logNetwork("Network.responseReceivedExtraInfo", {
|
this.handleResponseReceivedExtraInfo(params),
|
||||||
requestId: params.requestId,
|
);
|
||||||
...this.logDetails,
|
|
||||||
});
|
// Cache
|
||||||
const reqresp = this.pendingReqResp(params.requestId, true);
|
cdp.on("Network.requestServedFromCache", (params) =>
|
||||||
if (reqresp) {
|
this.handleRequestServedFromCache(params),
|
||||||
reqresp.fillResponseReceivedExtraInfo(params);
|
);
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// Request
|
// Request
|
||||||
|
cdp.on("Network.requestWillBeSent", (params) =>
|
||||||
|
this.handleRequestWillBeSent(params),
|
||||||
|
);
|
||||||
|
|
||||||
cdp.on("Network.requestWillBeSent", (params) => {
|
cdp.on("Network.requestWillBeSentExtraInfo", (params) =>
|
||||||
// only handling redirect here, committing last response in redirect chain
|
this.handleRequestExtraInfo(params),
|
||||||
// request data stored from requestPaused
|
);
|
||||||
if (params.redirectResponse) {
|
|
||||||
logNetwork("Network.requestWillBeSent after redirect", {
|
|
||||||
requestId: params.requestId,
|
|
||||||
...this.logDetails,
|
|
||||||
});
|
|
||||||
this.handleRedirectResponse(params);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
cdp.on("Network.requestServedFromCache", (params) => {
|
|
||||||
logNetwork("Network.requestServedFromCache", {
|
|
||||||
requestId: params.requestId,
|
|
||||||
...this.logDetails,
|
|
||||||
});
|
|
||||||
const reqresp = this.pendingReqResp(params.requestId, true);
|
|
||||||
if (reqresp) {
|
|
||||||
this.addPageRecord(reqresp);
|
|
||||||
|
|
||||||
this.removeReqResp(params.requestId);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
cdp.on("Network.requestWillBeSentExtraInfo", (params) => {
|
|
||||||
logNetwork("Network.requestWillBeSentExtraInfo", {
|
|
||||||
requestId: params.requestId,
|
|
||||||
...this.logDetails,
|
|
||||||
});
|
|
||||||
this.handleRequestExtraInfo(params);
|
|
||||||
});
|
|
||||||
|
|
||||||
// Loading
|
// Loading
|
||||||
cdp.on("Network.loadingFinished", (params) => {
|
cdp.on("Network.loadingFinished", (params) =>
|
||||||
logNetwork("Network.loadingFinished", {
|
this.handleLoadingFinished(params),
|
||||||
requestId: params.requestId,
|
);
|
||||||
...this.logDetails,
|
|
||||||
});
|
|
||||||
this.handleLoadingFinished(params);
|
|
||||||
});
|
|
||||||
|
|
||||||
cdp.on("Network.loadingFailed", (params) => {
|
cdp.on("Network.loadingFailed", (params) =>
|
||||||
logNetwork("Network.loadingFailed", {
|
this.handleLoadingFailed(params),
|
||||||
requestId: params.requestId,
|
);
|
||||||
...this.logDetails,
|
|
||||||
});
|
|
||||||
this.handleLoadingFailed(params);
|
|
||||||
});
|
|
||||||
|
|
||||||
await cdp.send("Network.enable");
|
await cdp.send("Network.enable");
|
||||||
|
|
||||||
// Target
|
// Target
|
||||||
|
|
||||||
cdp.on("Target.attachedToTarget", async (params) => {
|
cdp.on("Target.attachedToTarget", async (params) => {
|
||||||
const { url, type, sessionId } = params.targetInfo;
|
const { url, type, sessionId } = params.targetInfo;
|
||||||
if (type === "service_worker") {
|
if (type === "service_worker") {
|
||||||
|
@ -255,6 +211,13 @@ export class Recorder {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
await cdp.send("Target.setAutoAttach", {
|
||||||
|
autoAttach: true,
|
||||||
|
waitForDebuggerOnStart: false,
|
||||||
|
flatten: true,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Console
|
||||||
cdp.on("Console.messageAdded", (params) => {
|
cdp.on("Console.messageAdded", (params) => {
|
||||||
const { message } = params;
|
const { message } = params;
|
||||||
const { source, level } = message;
|
const { source, level } = message;
|
||||||
|
@ -263,17 +226,19 @@ export class Recorder {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
await cdp.send("Target.setAutoAttach", {
|
await cdp.send("Console.enable");
|
||||||
autoAttach: true,
|
|
||||||
waitForDebuggerOnStart: false,
|
|
||||||
flatten: true,
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) {
|
handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) {
|
||||||
const { requestId, response, type } = params;
|
const { requestId, response, type } = params;
|
||||||
|
|
||||||
const { mimeType } = response;
|
const { mimeType, url } = response;
|
||||||
|
|
||||||
|
logNetwork("Network.responseReceived", {
|
||||||
|
requestId,
|
||||||
|
url,
|
||||||
|
...this.logDetails,
|
||||||
|
});
|
||||||
|
|
||||||
if (mimeType === MIME_EVENT_STREAM) {
|
if (mimeType === MIME_EVENT_STREAM) {
|
||||||
return;
|
return;
|
||||||
|
@ -285,15 +250,81 @@ export class Recorder {
|
||||||
}
|
}
|
||||||
|
|
||||||
reqresp.fillResponse(response, type);
|
reqresp.fillResponse(response, type);
|
||||||
|
}
|
||||||
|
|
||||||
this.addPageRecord(reqresp);
|
handleResponseReceivedExtraInfo(
|
||||||
|
params: Protocol.Network.ResponseReceivedExtraInfoEvent,
|
||||||
|
) {
|
||||||
|
const { requestId } = params;
|
||||||
|
|
||||||
|
logNetwork("Network.responseReceivedExtraInfo", {
|
||||||
|
requestId,
|
||||||
|
...this.logDetails,
|
||||||
|
});
|
||||||
|
|
||||||
|
const reqresp = this.pendingReqResp(requestId, true);
|
||||||
|
if (reqresp) {
|
||||||
|
reqresp.fillResponseReceivedExtraInfo(params);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
handleRequestServedFromCache(
|
||||||
|
params: Protocol.Network.RequestServedFromCacheEvent,
|
||||||
|
) {
|
||||||
|
const { requestId } = params;
|
||||||
|
|
||||||
|
const reqresp = this.pendingReqResp(requestId, true);
|
||||||
|
|
||||||
|
const url = reqresp?.url;
|
||||||
|
|
||||||
|
logNetwork("Network.requestServedFromCache", {
|
||||||
|
requestId,
|
||||||
|
url,
|
||||||
|
...this.logDetails,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (reqresp) {
|
||||||
|
reqresp.fromCache = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
handleRequestWillBeSent(params: Protocol.Network.RequestWillBeSentEvent) {
|
||||||
|
// only handling redirect here, committing last response in redirect chain
|
||||||
|
// request data stored from requestPaused
|
||||||
|
const { redirectResponse, requestId, request, type } = params;
|
||||||
|
|
||||||
|
const { headers, method, url } = request;
|
||||||
|
|
||||||
|
logNetwork("Network.requestWillBeSent", {
|
||||||
|
requestId,
|
||||||
|
redirectResponse,
|
||||||
|
...this.logDetails,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (redirectResponse) {
|
||||||
|
this.handleRedirectResponse(params);
|
||||||
|
} else {
|
||||||
|
if (!this.shouldSkip(headers, url, method, type)) {
|
||||||
|
const reqresp = this.pendingReqResp(requestId);
|
||||||
|
if (reqresp) {
|
||||||
|
reqresp.fillRequest(request);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
handleRequestExtraInfo(
|
handleRequestExtraInfo(
|
||||||
params: Protocol.Network.RequestWillBeSentExtraInfoEvent,
|
params: Protocol.Network.RequestWillBeSentExtraInfoEvent,
|
||||||
) {
|
) {
|
||||||
if (!this.shouldSkip(params.headers)) {
|
const { requestId, headers } = params;
|
||||||
const reqresp = this.pendingReqResp(params.requestId, true);
|
|
||||||
|
logNetwork("Network.requestWillBeSentExtraInfo", {
|
||||||
|
requestId,
|
||||||
|
...this.logDetails,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!this.shouldSkip(headers)) {
|
||||||
|
const reqresp = this.pendingReqResp(requestId, true);
|
||||||
if (reqresp) {
|
if (reqresp) {
|
||||||
reqresp.fillRequestExtraInfo(params);
|
reqresp.fillRequestExtraInfo(params);
|
||||||
}
|
}
|
||||||
|
@ -328,11 +359,19 @@ export class Recorder {
|
||||||
const { errorText, type, requestId } = params;
|
const { errorText, type, requestId } = params;
|
||||||
|
|
||||||
const reqresp = this.pendingReqResp(requestId, true);
|
const reqresp = this.pendingReqResp(requestId, true);
|
||||||
|
|
||||||
|
const url = reqresp?.url;
|
||||||
|
|
||||||
|
logNetwork("Network.loadingFailed", {
|
||||||
|
requestId,
|
||||||
|
url,
|
||||||
|
...this.logDetails,
|
||||||
|
});
|
||||||
|
|
||||||
if (!reqresp) {
|
if (!reqresp) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const { url } = reqresp;
|
|
||||||
if (type) {
|
if (type) {
|
||||||
reqresp.resourceType = type.toLowerCase();
|
reqresp.resourceType = type.toLowerCase();
|
||||||
}
|
}
|
||||||
|
@ -377,19 +416,33 @@ export class Recorder {
|
||||||
"recorder",
|
"recorder",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
reqresp.status = 0;
|
||||||
|
reqresp.errorText = errorText;
|
||||||
|
|
||||||
|
this.addPageRecord(reqresp);
|
||||||
this.removeReqResp(requestId);
|
this.removeReqResp(requestId);
|
||||||
}
|
}
|
||||||
|
|
||||||
handleLoadingFinished(params: Protocol.Network.LoadingFinishedEvent) {
|
handleLoadingFinished(params: Protocol.Network.LoadingFinishedEvent) {
|
||||||
const reqresp = this.pendingReqResp(params.requestId, true);
|
const { requestId } = params;
|
||||||
|
|
||||||
|
const reqresp = this.pendingReqResp(requestId, true);
|
||||||
|
|
||||||
|
const url = reqresp?.url;
|
||||||
|
|
||||||
|
logNetwork("Network.loadingFinished", {
|
||||||
|
requestId,
|
||||||
|
url,
|
||||||
|
...this.logDetails,
|
||||||
|
});
|
||||||
|
|
||||||
if (!reqresp || reqresp.asyncLoading) {
|
if (!reqresp || reqresp.asyncLoading) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
this.removeReqResp(params.requestId);
|
this.removeReqResp(requestId);
|
||||||
|
|
||||||
if (!this.isValidUrl(reqresp.url)) {
|
if (!this.isValidUrl(url)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -668,7 +721,15 @@ export class Recorder {
|
||||||
if (this.isValidUrl(reqresp.url)) {
|
if (this.isValidUrl(reqresp.url)) {
|
||||||
const { status, resourceType: type } = reqresp;
|
const { status, resourceType: type } = reqresp;
|
||||||
const mime = reqresp.getMimeType();
|
const mime = reqresp.getMimeType();
|
||||||
this.pageInfo.urls[reqresp.getCanonURL()] = { status, mime, type };
|
const info: PageInfoValue = { status, mime, type };
|
||||||
|
if (reqresp.errorText) {
|
||||||
|
info.error = reqresp.errorText;
|
||||||
|
}
|
||||||
|
//TODO: revisit if we want to record this later
|
||||||
|
// if (reqresp.fromCache) {
|
||||||
|
// info.fromBrowserCache = true;
|
||||||
|
// }
|
||||||
|
this.pageInfo.urls[reqresp.getCanonURL()] = info;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -684,9 +745,11 @@ export class Recorder {
|
||||||
);
|
);
|
||||||
|
|
||||||
this.warcQ.add(() => this.writer.writeSingleRecord(resourceRecord));
|
this.warcQ.add(() => this.writer.writeSingleRecord(resourceRecord));
|
||||||
|
|
||||||
|
return this.pageInfo.ts;
|
||||||
}
|
}
|
||||||
|
|
||||||
async finishPage() {
|
async awaitPageResources() {
|
||||||
for (const [requestId, reqresp] of this.pendingRequests.entries()) {
|
for (const [requestId, reqresp] of this.pendingRequests.entries()) {
|
||||||
if (reqresp.payload) {
|
if (reqresp.payload) {
|
||||||
this.removeReqResp(requestId);
|
this.removeReqResp(requestId);
|
||||||
|
@ -726,10 +789,6 @@ export class Recorder {
|
||||||
await sleep(5.0);
|
await sleep(5.0);
|
||||||
numPending = this.pendingRequests.size;
|
numPending = this.pendingRequests.size;
|
||||||
}
|
}
|
||||||
|
|
||||||
await this.writePageInfoRecord();
|
|
||||||
|
|
||||||
return this.pageInfo.ts;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async onClosePage() {
|
async onClosePage() {
|
||||||
|
@ -768,7 +827,8 @@ export class Recorder {
|
||||||
method = headers[":method"];
|
method = headers[":method"];
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!this.isValidUrl(url)) {
|
// only check if url is provided, since it is optional
|
||||||
|
if (url && !this.isValidUrl(url)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -949,6 +1009,10 @@ export class Recorder {
|
||||||
|
|
||||||
removeReqResp(requestId: string, allowReuse = false) {
|
removeReqResp(requestId: string, allowReuse = false) {
|
||||||
const reqresp = this.pendingRequests.get(requestId);
|
const reqresp = this.pendingRequests.get(requestId);
|
||||||
|
if (reqresp) {
|
||||||
|
const { url, requestId } = reqresp;
|
||||||
|
logNetwork("Removing reqresp", { requestId, url });
|
||||||
|
}
|
||||||
this.pendingRequests.delete(requestId);
|
this.pendingRequests.delete(requestId);
|
||||||
if (!allowReuse) {
|
if (!allowReuse) {
|
||||||
this.skipIds.add(requestId);
|
this.skipIds.add(requestId);
|
||||||
|
@ -957,28 +1021,40 @@ export class Recorder {
|
||||||
}
|
}
|
||||||
|
|
||||||
async serializeToWARC(reqresp: RequestResponseInfo) {
|
async serializeToWARC(reqresp: RequestResponseInfo) {
|
||||||
if (reqresp.shouldSkipSave()) {
|
// always include in pageinfo record if going to serialize to WARC
|
||||||
const { url, method, status, payload } = reqresp;
|
// even if serialization does not happen
|
||||||
logNetwork("Skipping request/response", {
|
this.addPageRecord(reqresp);
|
||||||
|
|
||||||
|
const { url, method, status, payload, requestId } = reqresp;
|
||||||
|
|
||||||
|
// Specifically log skipping cached resources
|
||||||
|
if (reqresp.isCached()) {
|
||||||
|
logger.debug(
|
||||||
|
"Skipping cached resource, should be already recorded",
|
||||||
|
{ url, status },
|
||||||
|
"recorder",
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
} else if (reqresp.shouldSkipSave()) {
|
||||||
|
logNetwork("Skipping writing request/response", {
|
||||||
|
requestId,
|
||||||
url,
|
url,
|
||||||
method,
|
method,
|
||||||
status,
|
status,
|
||||||
payloadLength: payload && payload.length,
|
payloadLength: (payload && payload.length) || 0,
|
||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (
|
if (
|
||||||
reqresp.url &&
|
url &&
|
||||||
reqresp.method === "GET" &&
|
method === "GET" &&
|
||||||
!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, reqresp.url))
|
!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url))
|
||||||
) {
|
) {
|
||||||
logNetwork("Skipping dupe", { url: reqresp.url });
|
logNetwork("Skipping dupe", { url });
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
this.addPageRecord(reqresp);
|
|
||||||
|
|
||||||
const responseRecord = createResponse(reqresp, this.pageid);
|
const responseRecord = createResponse(reqresp, this.pageid);
|
||||||
const requestRecord = createRequest(reqresp, responseRecord, this.pageid);
|
const requestRecord = createRequest(reqresp, responseRecord, this.pageid);
|
||||||
|
|
||||||
|
@ -1016,8 +1092,7 @@ export class Recorder {
|
||||||
const res = await fetcher.load();
|
const res = await fetcher.load();
|
||||||
|
|
||||||
const mime =
|
const mime =
|
||||||
(reqresp &&
|
(reqresp.responseHeaders &&
|
||||||
reqresp.responseHeaders &&
|
|
||||||
reqresp.responseHeaders["content-type"] &&
|
reqresp.responseHeaders["content-type"] &&
|
||||||
reqresp.responseHeaders["content-type"].split(";")[0]) ||
|
reqresp.responseHeaders["content-type"].split(";")[0]) ||
|
||||||
"";
|
"";
|
||||||
|
@ -1184,8 +1259,6 @@ class AsyncFetcher {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
recorder.addPageRecord(reqresp);
|
|
||||||
|
|
||||||
recorder.warcQ.add(() =>
|
recorder.warcQ.add(() =>
|
||||||
recorder.writer.writeRecordPair(
|
recorder.writer.writeRecordPair(
|
||||||
responseRecord,
|
responseRecord,
|
||||||
|
@ -1204,9 +1277,16 @@ class AsyncFetcher {
|
||||||
{ url, networkId, filename, ...formatErr(e), ...logDetails },
|
{ url, networkId, filename, ...formatErr(e), ...logDetails },
|
||||||
"recorder",
|
"recorder",
|
||||||
);
|
);
|
||||||
|
// indicate response is ultimately not valid
|
||||||
|
reqresp.status = 0;
|
||||||
|
reqresp.errorText = e.message;
|
||||||
} finally {
|
} finally {
|
||||||
|
// exclude direct fetch request with fake id
|
||||||
|
if (networkId !== "0") {
|
||||||
|
recorder.addPageRecord(reqresp);
|
||||||
recorder.removeReqResp(networkId);
|
recorder.removeReqResp(networkId);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return fetched;
|
return fetched;
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,6 +37,8 @@ export class RequestResponseInfo {
|
||||||
status: number = 0;
|
status: number = 0;
|
||||||
statusText?: string;
|
statusText?: string;
|
||||||
|
|
||||||
|
errorText?: string;
|
||||||
|
|
||||||
responseHeaders?: Record<string, string>;
|
responseHeaders?: Record<string, string>;
|
||||||
responseHeadersList?: { name: string; value: string }[];
|
responseHeadersList?: { name: string; value: string }[];
|
||||||
responseHeadersText?: string;
|
responseHeadersText?: string;
|
||||||
|
@ -44,11 +46,12 @@ export class RequestResponseInfo {
|
||||||
payload?: Uint8Array;
|
payload?: Uint8Array;
|
||||||
|
|
||||||
// misc
|
// misc
|
||||||
fromServiceWorker: boolean = false;
|
fromServiceWorker = false;
|
||||||
|
fromCache = false;
|
||||||
|
|
||||||
frameId?: string;
|
frameId?: string;
|
||||||
|
|
||||||
fetch: boolean = false;
|
fetch = false;
|
||||||
|
|
||||||
resourceType?: string;
|
resourceType?: string;
|
||||||
|
|
||||||
|
@ -71,13 +74,7 @@ export class RequestResponseInfo {
|
||||||
}
|
}
|
||||||
|
|
||||||
fillFetchRequestPaused(params: Protocol.Fetch.RequestPausedEvent) {
|
fillFetchRequestPaused(params: Protocol.Fetch.RequestPausedEvent) {
|
||||||
this.url = params.request.url;
|
this.fillRequest(params.request, params.resourceType);
|
||||||
this.method = params.request.method;
|
|
||||||
if (!this.requestHeaders) {
|
|
||||||
this.requestHeaders = params.request.headers;
|
|
||||||
}
|
|
||||||
this.postData = params.request.postData;
|
|
||||||
this.hasPostData = params.request.hasPostData || false;
|
|
||||||
|
|
||||||
this.status = params.responseStatusCode || 0;
|
this.status = params.responseStatusCode || 0;
|
||||||
this.statusText = params.responseStatusText || getStatusText(this.status);
|
this.statusText = params.responseStatusText || getStatusText(this.status);
|
||||||
|
@ -86,14 +83,24 @@ export class RequestResponseInfo {
|
||||||
|
|
||||||
this.fetch = true;
|
this.fetch = true;
|
||||||
|
|
||||||
if (params.resourceType) {
|
|
||||||
this.resourceType = params.resourceType.toLowerCase();
|
|
||||||
}
|
|
||||||
|
|
||||||
this.frameId = params.frameId;
|
this.frameId = params.frameId;
|
||||||
}
|
}
|
||||||
|
|
||||||
fillResponse(response: Protocol.Network.Response, type?: string) {
|
fillRequest(request: Protocol.Network.Request, resourceType?: string) {
|
||||||
|
this.url = request.url;
|
||||||
|
this.method = request.method;
|
||||||
|
if (!this.requestHeaders) {
|
||||||
|
this.requestHeaders = request.headers;
|
||||||
|
}
|
||||||
|
this.postData = request.postData;
|
||||||
|
this.hasPostData = request.hasPostData || false;
|
||||||
|
|
||||||
|
if (resourceType) {
|
||||||
|
this.resourceType = resourceType.toLowerCase();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fillResponse(response: Protocol.Network.Response, resourceType?: string) {
|
||||||
// if initial fetch was a 200, but now replacing with 304, don't!
|
// if initial fetch was a 200, but now replacing with 304, don't!
|
||||||
if (
|
if (
|
||||||
response.status == 304 &&
|
response.status == 304 &&
|
||||||
|
@ -111,8 +118,8 @@ export class RequestResponseInfo {
|
||||||
|
|
||||||
this.protocol = response.protocol;
|
this.protocol = response.protocol;
|
||||||
|
|
||||||
if (type) {
|
if (resourceType) {
|
||||||
this.resourceType = type.toLowerCase();
|
this.resourceType = resourceType.toLowerCase();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (response.requestHeaders) {
|
if (response.requestHeaders) {
|
||||||
|
@ -292,9 +299,14 @@ export class RequestResponseInfo {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
isCached() {
|
||||||
|
return this.fromCache && !this.payload;
|
||||||
|
}
|
||||||
|
|
||||||
shouldSkipSave() {
|
shouldSkipSave() {
|
||||||
// skip OPTIONS/HEAD responses, and 304 or 206 responses
|
// skip cached, OPTIONS/HEAD responses, and 304 or 206 responses
|
||||||
if (
|
if (
|
||||||
|
this.fromCache ||
|
||||||
!this.payload ||
|
!this.payload ||
|
||||||
(this.method && ["OPTIONS", "HEAD"].includes(this.method)) ||
|
(this.method && ["OPTIONS", "HEAD"].includes(this.method)) ||
|
||||||
[206, 304].includes(this.status)
|
[206, 304].includes(this.status)
|
||||||
|
|
|
@ -298,7 +298,7 @@ export class PageWorker {
|
||||||
async crawlPage(opts: WorkerState) {
|
async crawlPage(opts: WorkerState) {
|
||||||
const res = await this.crawler.crawlPage(opts);
|
const res = await this.crawler.crawlPage(opts);
|
||||||
if (this.recorder) {
|
if (this.recorder) {
|
||||||
opts.data.ts = await this.recorder.finishPage();
|
await this.recorder.awaitPageResources();
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
@ -339,7 +339,21 @@ export class PageWorker {
|
||||||
"worker",
|
"worker",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
await this.closePage();
|
||||||
} finally {
|
} finally {
|
||||||
|
try {
|
||||||
|
if (this.recorder) {
|
||||||
|
opts.data.ts = await this.recorder.writePageInfoRecord();
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
logger.error(
|
||||||
|
"Error writing pageinfo recorder",
|
||||||
|
{ ...formatErr(e), ...this.logDetails },
|
||||||
|
"recorder",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
await timedRun(
|
await timedRun(
|
||||||
this.crawler.pageFinished(data),
|
this.crawler.pageFinished(data),
|
||||||
FINISHED_TIMEOUT,
|
FINISHED_TIMEOUT,
|
||||||
|
|
|
@ -5,7 +5,7 @@ import { WARCParser } from "warcio";
|
||||||
|
|
||||||
test("run warc and ensure pageinfo records contain the correct resources", async () => {
|
test("run warc and ensure pageinfo records contain the correct resources", async () => {
|
||||||
child_process.execSync(
|
child_process.execSync(
|
||||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --url https://webrecorder.net/about --scopeType page --collection page-info-test --combineWARC",
|
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --url https://webrecorder.net/about --url https://invalid.invalid/ --scopeType page --collection page-info-test --combineWARC",
|
||||||
);
|
);
|
||||||
|
|
||||||
const filename = path.join(
|
const filename = path.join(
|
||||||
|
@ -21,6 +21,7 @@ test("run warc and ensure pageinfo records contain the correct resources", async
|
||||||
|
|
||||||
let foundIndex = false;
|
let foundIndex = false;
|
||||||
let foundAbout = false;
|
let foundAbout = false;
|
||||||
|
let foundInvalid = false;
|
||||||
|
|
||||||
for await (const record of parser) {
|
for await (const record of parser) {
|
||||||
if (
|
if (
|
||||||
|
@ -40,10 +41,20 @@ test("run warc and ensure pageinfo records contain the correct resources", async
|
||||||
const text = await record.contentText();
|
const text = await record.contentText();
|
||||||
validateResourcesAbout(JSON.parse(text));
|
validateResourcesAbout(JSON.parse(text));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
!foundInvalid &&
|
||||||
|
record.warcTargetURI === "urn:pageinfo:https://invalid.invalid/"
|
||||||
|
) {
|
||||||
|
foundInvalid = true;
|
||||||
|
const text = await record.contentText();
|
||||||
|
validateResourcesInvalid(JSON.parse(text));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
expect(foundIndex).toBe(true);
|
expect(foundIndex).toBe(true);
|
||||||
expect(foundAbout).toBe(true);
|
expect(foundAbout).toBe(true);
|
||||||
|
expect(foundInvalid).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
function validateResourcesIndex(json) {
|
function validateResourcesIndex(json) {
|
||||||
|
@ -161,5 +172,25 @@ function validateResourcesAbout(json) {
|
||||||
{ status: 200, mime: "font/woff2", type: "font" },
|
{ status: 200, mime: "font/woff2", type: "font" },
|
||||||
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
|
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
|
||||||
{ status: 200, mime: "font/woff2", type: "font" },
|
{ status: 200, mime: "font/woff2", type: "font" },
|
||||||
|
"https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2Fabout&d=webrecorder.net":
|
||||||
|
{
|
||||||
|
status: 0,
|
||||||
|
type: "xhr",
|
||||||
|
error: "net::ERR_BLOCKED_BY_CLIENT",
|
||||||
|
},
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function validateResourcesInvalid(json) {
|
||||||
|
expect(json).toHaveProperty("pageid");
|
||||||
|
expect(json).toHaveProperty("url");
|
||||||
|
expect(json).toHaveProperty("urls");
|
||||||
|
expect(json.counts).toEqual({ jsErrors: 0 });
|
||||||
|
expect(json.urls).toEqual({
|
||||||
|
"https://invalid.invalid/": {
|
||||||
|
status: 0,
|
||||||
|
type: "document",
|
||||||
|
error: "net::ERR_NAME_NOT_RESOLVED",
|
||||||
|
},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue