mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Better tracking of failed requests + logging context exclude (#485)
- add --logExcludeContext for log contexts that should be excluded (while --logContext specifies which are to be included) - enable 'recorderNetwork' logging for debugging CDP network - create default log context exclude list (containing: screencast, recorderNetwork, jsErrors), customizable via --logExcludeContext recorder: Track failed requests and include in pageinfo records with status code 0 - cleanup cdp handler methods - intercept requestWillBeSent to track requests that started (but may not complete) - fix shouldSkip() still working if no url is provided (eg. check only headers) - set status to 0 for async fetch failures - remove responseServedFromCache interception, as response data generally not available then, and responseReceived is still called - pageinfo: include page requests that failed with status code 0, also include 'error' status if available. - ensure page is closed on failure - ensure pageinfo still written even if nothing else is crawled for a page - track cached responses, add to debug logging (can also add to pageinfo later if needed) tests: add pageinfo test for crawling invalid URL, which should still result in pageinfo record with status code 0 bump to 1.0.0-beta.7
This commit is contained in:
parent
65133c9d9d
commit
9f18a49c0a
8 changed files with 299 additions and 131 deletions
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "browsertrix-crawler",
|
||||
"version": "1.0.0-beta.6",
|
||||
"version": "1.0.0-beta.7",
|
||||
"main": "browsertrix-crawler",
|
||||
"type": "module",
|
||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
|
|
|
@ -189,7 +189,8 @@ export class Crawler {
|
|||
const debugLogging = this.params.logging.includes("debug");
|
||||
logger.setDebugLogging(debugLogging);
|
||||
logger.setLogLevel(this.params.logLevel);
|
||||
logger.setContext(this.params.context);
|
||||
logger.setContext(this.params.logContext);
|
||||
logger.setExcludeContext(this.params.logExcludeContext);
|
||||
|
||||
// if automatically restarts on error exit code,
|
||||
// exit with 0 from fatal by default, to avoid unnecessary restart
|
||||
|
|
|
@ -15,7 +15,11 @@ import {
|
|||
import { ScopedSeed } from "./seeds.js";
|
||||
import { interpolateFilename } from "./storage.js";
|
||||
import { screenshotTypes } from "./screenshots.js";
|
||||
import { LOG_CONTEXT_TYPES, logger } from "./logger.js";
|
||||
import {
|
||||
DEFAULT_EXCLUDE_LOG_CONTEXTS,
|
||||
LOG_CONTEXT_TYPES,
|
||||
logger,
|
||||
} from "./logger.js";
|
||||
|
||||
// ============================================================================
|
||||
class ArgParser {
|
||||
|
@ -225,6 +229,14 @@ class ArgParser {
|
|||
coerce,
|
||||
},
|
||||
|
||||
logExcludeContext: {
|
||||
describe: "Comma-separated list of contexts to NOT include in logs",
|
||||
type: "array",
|
||||
default: DEFAULT_EXCLUDE_LOG_CONTEXTS,
|
||||
choices: LOG_CONTEXT_TYPES,
|
||||
coerce,
|
||||
},
|
||||
|
||||
text: {
|
||||
describe:
|
||||
"Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
|
||||
|
|
|
@ -26,6 +26,7 @@ export const LOG_CONTEXT_TYPES = [
|
|||
"general",
|
||||
"worker",
|
||||
"recorder",
|
||||
"recorderNetwork",
|
||||
"writer",
|
||||
"state",
|
||||
"redis",
|
||||
|
@ -51,13 +52,20 @@ export const LOG_CONTEXT_TYPES = [
|
|||
|
||||
export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
|
||||
|
||||
export const DEFAULT_EXCLUDE_LOG_CONTEXTS: LogContext[] = [
|
||||
"recorderNetwork",
|
||||
"jsError",
|
||||
"screencast",
|
||||
];
|
||||
|
||||
// ===========================================================================
|
||||
class Logger {
|
||||
logStream: Writable | null = null;
|
||||
debugLogging = false;
|
||||
logErrorsToRedis = false;
|
||||
logLevels: string[] = [];
|
||||
contexts: string[] = [];
|
||||
contexts: LogContext[] = [];
|
||||
excludeContexts: LogContext[] = [];
|
||||
crawlState?: RedisCrawlState | null = null;
|
||||
fatalExitCode = 17;
|
||||
|
||||
|
@ -81,10 +89,14 @@ class Logger {
|
|||
this.logLevels = logLevels;
|
||||
}
|
||||
|
||||
setContext(contexts: string[]) {
|
||||
setContext(contexts: LogContext[]) {
|
||||
this.contexts = contexts;
|
||||
}
|
||||
|
||||
setExcludeContext(contexts: LogContext[]) {
|
||||
this.excludeContexts = contexts;
|
||||
}
|
||||
|
||||
setCrawlState(crawlState: RedisCrawlState) {
|
||||
this.crawlState = crawlState;
|
||||
}
|
||||
|
@ -92,7 +104,7 @@ class Logger {
|
|||
logAsJSON(
|
||||
message: string,
|
||||
dataUnknown: unknown,
|
||||
context: string,
|
||||
context: LogContext,
|
||||
logLevel = "info",
|
||||
) {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
|
@ -110,6 +122,12 @@ class Logger {
|
|||
}
|
||||
}
|
||||
|
||||
if (this.excludeContexts.length) {
|
||||
if (this.excludeContexts.indexOf(context) >= 0) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
const dataToLog = {
|
||||
timestamp: new Date().toISOString(),
|
||||
logLevel: logLevel,
|
||||
|
|
|
@ -42,10 +42,9 @@ const MIME_EVENT_STREAM = "text/event-stream";
|
|||
const encoder = new TextEncoder();
|
||||
|
||||
// =================================================================
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unused-vars
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
function logNetwork(msg: string, data: any) {
|
||||
// logger.debug(msg, data, "recorderNetwork");
|
||||
logger.debug(msg, data, "recorderNetwork");
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
|
@ -53,6 +52,8 @@ export type PageInfoValue = {
|
|||
status: number;
|
||||
mime?: string;
|
||||
type?: string;
|
||||
error?: string;
|
||||
fromBrowserCache?: boolean;
|
||||
};
|
||||
|
||||
// =================================================================
|
||||
|
@ -150,7 +151,6 @@ export class Recorder {
|
|||
|
||||
async onCreatePage({ cdp }: { cdp: CDPSession }) {
|
||||
// Fetch
|
||||
|
||||
cdp.on("Fetch.requestPaused", async (params) => {
|
||||
this.handleRequestPaused(params, cdp);
|
||||
});
|
||||
|
@ -159,85 +159,41 @@ export class Recorder {
|
|||
patterns: [{ urlPattern: "*", requestStage: "Response" }],
|
||||
});
|
||||
|
||||
await cdp.send("Console.enable");
|
||||
|
||||
// Response
|
||||
cdp.on("Network.responseReceived", (params) => {
|
||||
// handling to fill in security details
|
||||
logNetwork("Network.responseReceived", {
|
||||
requestId: params.requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
this.handleResponseReceived(params);
|
||||
});
|
||||
cdp.on("Network.responseReceived", (params) =>
|
||||
this.handleResponseReceived(params),
|
||||
);
|
||||
|
||||
cdp.on("Network.responseReceivedExtraInfo", (params) => {
|
||||
logNetwork("Network.responseReceivedExtraInfo", {
|
||||
requestId: params.requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
const reqresp = this.pendingReqResp(params.requestId, true);
|
||||
if (reqresp) {
|
||||
reqresp.fillResponseReceivedExtraInfo(params);
|
||||
}
|
||||
});
|
||||
cdp.on("Network.responseReceivedExtraInfo", (params) =>
|
||||
this.handleResponseReceivedExtraInfo(params),
|
||||
);
|
||||
|
||||
// Cache
|
||||
cdp.on("Network.requestServedFromCache", (params) =>
|
||||
this.handleRequestServedFromCache(params),
|
||||
);
|
||||
|
||||
// Request
|
||||
cdp.on("Network.requestWillBeSent", (params) =>
|
||||
this.handleRequestWillBeSent(params),
|
||||
);
|
||||
|
||||
cdp.on("Network.requestWillBeSent", (params) => {
|
||||
// only handling redirect here, committing last response in redirect chain
|
||||
// request data stored from requestPaused
|
||||
if (params.redirectResponse) {
|
||||
logNetwork("Network.requestWillBeSent after redirect", {
|
||||
requestId: params.requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
this.handleRedirectResponse(params);
|
||||
}
|
||||
});
|
||||
|
||||
cdp.on("Network.requestServedFromCache", (params) => {
|
||||
logNetwork("Network.requestServedFromCache", {
|
||||
requestId: params.requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
const reqresp = this.pendingReqResp(params.requestId, true);
|
||||
if (reqresp) {
|
||||
this.addPageRecord(reqresp);
|
||||
|
||||
this.removeReqResp(params.requestId);
|
||||
}
|
||||
});
|
||||
|
||||
cdp.on("Network.requestWillBeSentExtraInfo", (params) => {
|
||||
logNetwork("Network.requestWillBeSentExtraInfo", {
|
||||
requestId: params.requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
this.handleRequestExtraInfo(params);
|
||||
});
|
||||
cdp.on("Network.requestWillBeSentExtraInfo", (params) =>
|
||||
this.handleRequestExtraInfo(params),
|
||||
);
|
||||
|
||||
// Loading
|
||||
cdp.on("Network.loadingFinished", (params) => {
|
||||
logNetwork("Network.loadingFinished", {
|
||||
requestId: params.requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
this.handleLoadingFinished(params);
|
||||
});
|
||||
cdp.on("Network.loadingFinished", (params) =>
|
||||
this.handleLoadingFinished(params),
|
||||
);
|
||||
|
||||
cdp.on("Network.loadingFailed", (params) => {
|
||||
logNetwork("Network.loadingFailed", {
|
||||
requestId: params.requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
this.handleLoadingFailed(params);
|
||||
});
|
||||
cdp.on("Network.loadingFailed", (params) =>
|
||||
this.handleLoadingFailed(params),
|
||||
);
|
||||
|
||||
await cdp.send("Network.enable");
|
||||
|
||||
// Target
|
||||
|
||||
cdp.on("Target.attachedToTarget", async (params) => {
|
||||
const { url, type, sessionId } = params.targetInfo;
|
||||
if (type === "service_worker") {
|
||||
|
@ -255,6 +211,13 @@ export class Recorder {
|
|||
}
|
||||
});
|
||||
|
||||
await cdp.send("Target.setAutoAttach", {
|
||||
autoAttach: true,
|
||||
waitForDebuggerOnStart: false,
|
||||
flatten: true,
|
||||
});
|
||||
|
||||
// Console
|
||||
cdp.on("Console.messageAdded", (params) => {
|
||||
const { message } = params;
|
||||
const { source, level } = message;
|
||||
|
@ -263,17 +226,19 @@ export class Recorder {
|
|||
}
|
||||
});
|
||||
|
||||
await cdp.send("Target.setAutoAttach", {
|
||||
autoAttach: true,
|
||||
waitForDebuggerOnStart: false,
|
||||
flatten: true,
|
||||
});
|
||||
await cdp.send("Console.enable");
|
||||
}
|
||||
|
||||
handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) {
|
||||
const { requestId, response, type } = params;
|
||||
|
||||
const { mimeType } = response;
|
||||
const { mimeType, url } = response;
|
||||
|
||||
logNetwork("Network.responseReceived", {
|
||||
requestId,
|
||||
url,
|
||||
...this.logDetails,
|
||||
});
|
||||
|
||||
if (mimeType === MIME_EVENT_STREAM) {
|
||||
return;
|
||||
|
@ -285,15 +250,81 @@ export class Recorder {
|
|||
}
|
||||
|
||||
reqresp.fillResponse(response, type);
|
||||
}
|
||||
|
||||
this.addPageRecord(reqresp);
|
||||
handleResponseReceivedExtraInfo(
|
||||
params: Protocol.Network.ResponseReceivedExtraInfoEvent,
|
||||
) {
|
||||
const { requestId } = params;
|
||||
|
||||
logNetwork("Network.responseReceivedExtraInfo", {
|
||||
requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
|
||||
const reqresp = this.pendingReqResp(requestId, true);
|
||||
if (reqresp) {
|
||||
reqresp.fillResponseReceivedExtraInfo(params);
|
||||
}
|
||||
}
|
||||
|
||||
handleRequestServedFromCache(
|
||||
params: Protocol.Network.RequestServedFromCacheEvent,
|
||||
) {
|
||||
const { requestId } = params;
|
||||
|
||||
const reqresp = this.pendingReqResp(requestId, true);
|
||||
|
||||
const url = reqresp?.url;
|
||||
|
||||
logNetwork("Network.requestServedFromCache", {
|
||||
requestId,
|
||||
url,
|
||||
...this.logDetails,
|
||||
});
|
||||
|
||||
if (reqresp) {
|
||||
reqresp.fromCache = true;
|
||||
}
|
||||
}
|
||||
|
||||
handleRequestWillBeSent(params: Protocol.Network.RequestWillBeSentEvent) {
|
||||
// only handling redirect here, committing last response in redirect chain
|
||||
// request data stored from requestPaused
|
||||
const { redirectResponse, requestId, request, type } = params;
|
||||
|
||||
const { headers, method, url } = request;
|
||||
|
||||
logNetwork("Network.requestWillBeSent", {
|
||||
requestId,
|
||||
redirectResponse,
|
||||
...this.logDetails,
|
||||
});
|
||||
|
||||
if (redirectResponse) {
|
||||
this.handleRedirectResponse(params);
|
||||
} else {
|
||||
if (!this.shouldSkip(headers, url, method, type)) {
|
||||
const reqresp = this.pendingReqResp(requestId);
|
||||
if (reqresp) {
|
||||
reqresp.fillRequest(request);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
handleRequestExtraInfo(
|
||||
params: Protocol.Network.RequestWillBeSentExtraInfoEvent,
|
||||
) {
|
||||
if (!this.shouldSkip(params.headers)) {
|
||||
const reqresp = this.pendingReqResp(params.requestId, true);
|
||||
const { requestId, headers } = params;
|
||||
|
||||
logNetwork("Network.requestWillBeSentExtraInfo", {
|
||||
requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
|
||||
if (!this.shouldSkip(headers)) {
|
||||
const reqresp = this.pendingReqResp(requestId, true);
|
||||
if (reqresp) {
|
||||
reqresp.fillRequestExtraInfo(params);
|
||||
}
|
||||
|
@ -328,11 +359,19 @@ export class Recorder {
|
|||
const { errorText, type, requestId } = params;
|
||||
|
||||
const reqresp = this.pendingReqResp(requestId, true);
|
||||
|
||||
const url = reqresp?.url;
|
||||
|
||||
logNetwork("Network.loadingFailed", {
|
||||
requestId,
|
||||
url,
|
||||
...this.logDetails,
|
||||
});
|
||||
|
||||
if (!reqresp) {
|
||||
return;
|
||||
}
|
||||
|
||||
const { url } = reqresp;
|
||||
if (type) {
|
||||
reqresp.resourceType = type.toLowerCase();
|
||||
}
|
||||
|
@ -377,19 +416,33 @@ export class Recorder {
|
|||
"recorder",
|
||||
);
|
||||
}
|
||||
reqresp.status = 0;
|
||||
reqresp.errorText = errorText;
|
||||
|
||||
this.addPageRecord(reqresp);
|
||||
this.removeReqResp(requestId);
|
||||
}
|
||||
|
||||
handleLoadingFinished(params: Protocol.Network.LoadingFinishedEvent) {
|
||||
const reqresp = this.pendingReqResp(params.requestId, true);
|
||||
const { requestId } = params;
|
||||
|
||||
const reqresp = this.pendingReqResp(requestId, true);
|
||||
|
||||
const url = reqresp?.url;
|
||||
|
||||
logNetwork("Network.loadingFinished", {
|
||||
requestId,
|
||||
url,
|
||||
...this.logDetails,
|
||||
});
|
||||
|
||||
if (!reqresp || reqresp.asyncLoading) {
|
||||
return;
|
||||
}
|
||||
|
||||
this.removeReqResp(params.requestId);
|
||||
this.removeReqResp(requestId);
|
||||
|
||||
if (!this.isValidUrl(reqresp.url)) {
|
||||
if (!this.isValidUrl(url)) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -668,7 +721,15 @@ export class Recorder {
|
|||
if (this.isValidUrl(reqresp.url)) {
|
||||
const { status, resourceType: type } = reqresp;
|
||||
const mime = reqresp.getMimeType();
|
||||
this.pageInfo.urls[reqresp.getCanonURL()] = { status, mime, type };
|
||||
const info: PageInfoValue = { status, mime, type };
|
||||
if (reqresp.errorText) {
|
||||
info.error = reqresp.errorText;
|
||||
}
|
||||
//TODO: revisit if we want to record this later
|
||||
// if (reqresp.fromCache) {
|
||||
// info.fromBrowserCache = true;
|
||||
// }
|
||||
this.pageInfo.urls[reqresp.getCanonURL()] = info;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -684,9 +745,11 @@ export class Recorder {
|
|||
);
|
||||
|
||||
this.warcQ.add(() => this.writer.writeSingleRecord(resourceRecord));
|
||||
|
||||
return this.pageInfo.ts;
|
||||
}
|
||||
|
||||
async finishPage() {
|
||||
async awaitPageResources() {
|
||||
for (const [requestId, reqresp] of this.pendingRequests.entries()) {
|
||||
if (reqresp.payload) {
|
||||
this.removeReqResp(requestId);
|
||||
|
@ -726,10 +789,6 @@ export class Recorder {
|
|||
await sleep(5.0);
|
||||
numPending = this.pendingRequests.size;
|
||||
}
|
||||
|
||||
await this.writePageInfoRecord();
|
||||
|
||||
return this.pageInfo.ts;
|
||||
}
|
||||
|
||||
async onClosePage() {
|
||||
|
@ -768,7 +827,8 @@ export class Recorder {
|
|||
method = headers[":method"];
|
||||
}
|
||||
|
||||
if (!this.isValidUrl(url)) {
|
||||
// only check if url is provided, since it is optional
|
||||
if (url && !this.isValidUrl(url)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -949,6 +1009,10 @@ export class Recorder {
|
|||
|
||||
removeReqResp(requestId: string, allowReuse = false) {
|
||||
const reqresp = this.pendingRequests.get(requestId);
|
||||
if (reqresp) {
|
||||
const { url, requestId } = reqresp;
|
||||
logNetwork("Removing reqresp", { requestId, url });
|
||||
}
|
||||
this.pendingRequests.delete(requestId);
|
||||
if (!allowReuse) {
|
||||
this.skipIds.add(requestId);
|
||||
|
@ -957,28 +1021,40 @@ export class Recorder {
|
|||
}
|
||||
|
||||
async serializeToWARC(reqresp: RequestResponseInfo) {
|
||||
if (reqresp.shouldSkipSave()) {
|
||||
const { url, method, status, payload } = reqresp;
|
||||
logNetwork("Skipping request/response", {
|
||||
// always include in pageinfo record if going to serialize to WARC
|
||||
// even if serialization does not happen
|
||||
this.addPageRecord(reqresp);
|
||||
|
||||
const { url, method, status, payload, requestId } = reqresp;
|
||||
|
||||
// Specifically log skipping cached resources
|
||||
if (reqresp.isCached()) {
|
||||
logger.debug(
|
||||
"Skipping cached resource, should be already recorded",
|
||||
{ url, status },
|
||||
"recorder",
|
||||
);
|
||||
return;
|
||||
} else if (reqresp.shouldSkipSave()) {
|
||||
logNetwork("Skipping writing request/response", {
|
||||
requestId,
|
||||
url,
|
||||
method,
|
||||
status,
|
||||
payloadLength: payload && payload.length,
|
||||
payloadLength: (payload && payload.length) || 0,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
if (
|
||||
reqresp.url &&
|
||||
reqresp.method === "GET" &&
|
||||
!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, reqresp.url))
|
||||
url &&
|
||||
method === "GET" &&
|
||||
!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url))
|
||||
) {
|
||||
logNetwork("Skipping dupe", { url: reqresp.url });
|
||||
logNetwork("Skipping dupe", { url });
|
||||
return;
|
||||
}
|
||||
|
||||
this.addPageRecord(reqresp);
|
||||
|
||||
const responseRecord = createResponse(reqresp, this.pageid);
|
||||
const requestRecord = createRequest(reqresp, responseRecord, this.pageid);
|
||||
|
||||
|
@ -1016,8 +1092,7 @@ export class Recorder {
|
|||
const res = await fetcher.load();
|
||||
|
||||
const mime =
|
||||
(reqresp &&
|
||||
reqresp.responseHeaders &&
|
||||
(reqresp.responseHeaders &&
|
||||
reqresp.responseHeaders["content-type"] &&
|
||||
reqresp.responseHeaders["content-type"].split(";")[0]) ||
|
||||
"";
|
||||
|
@ -1184,8 +1259,6 @@ class AsyncFetcher {
|
|||
);
|
||||
}
|
||||
|
||||
recorder.addPageRecord(reqresp);
|
||||
|
||||
recorder.warcQ.add(() =>
|
||||
recorder.writer.writeRecordPair(
|
||||
responseRecord,
|
||||
|
@ -1204,9 +1277,16 @@ class AsyncFetcher {
|
|||
{ url, networkId, filename, ...formatErr(e), ...logDetails },
|
||||
"recorder",
|
||||
);
|
||||
// indicate response is ultimately not valid
|
||||
reqresp.status = 0;
|
||||
reqresp.errorText = e.message;
|
||||
} finally {
|
||||
// exclude direct fetch request with fake id
|
||||
if (networkId !== "0") {
|
||||
recorder.addPageRecord(reqresp);
|
||||
recorder.removeReqResp(networkId);
|
||||
}
|
||||
}
|
||||
|
||||
return fetched;
|
||||
}
|
||||
|
|
|
@ -37,6 +37,8 @@ export class RequestResponseInfo {
|
|||
status: number = 0;
|
||||
statusText?: string;
|
||||
|
||||
errorText?: string;
|
||||
|
||||
responseHeaders?: Record<string, string>;
|
||||
responseHeadersList?: { name: string; value: string }[];
|
||||
responseHeadersText?: string;
|
||||
|
@ -44,11 +46,12 @@ export class RequestResponseInfo {
|
|||
payload?: Uint8Array;
|
||||
|
||||
// misc
|
||||
fromServiceWorker: boolean = false;
|
||||
fromServiceWorker = false;
|
||||
fromCache = false;
|
||||
|
||||
frameId?: string;
|
||||
|
||||
fetch: boolean = false;
|
||||
fetch = false;
|
||||
|
||||
resourceType?: string;
|
||||
|
||||
|
@ -71,13 +74,7 @@ export class RequestResponseInfo {
|
|||
}
|
||||
|
||||
fillFetchRequestPaused(params: Protocol.Fetch.RequestPausedEvent) {
|
||||
this.url = params.request.url;
|
||||
this.method = params.request.method;
|
||||
if (!this.requestHeaders) {
|
||||
this.requestHeaders = params.request.headers;
|
||||
}
|
||||
this.postData = params.request.postData;
|
||||
this.hasPostData = params.request.hasPostData || false;
|
||||
this.fillRequest(params.request, params.resourceType);
|
||||
|
||||
this.status = params.responseStatusCode || 0;
|
||||
this.statusText = params.responseStatusText || getStatusText(this.status);
|
||||
|
@ -86,14 +83,24 @@ export class RequestResponseInfo {
|
|||
|
||||
this.fetch = true;
|
||||
|
||||
if (params.resourceType) {
|
||||
this.resourceType = params.resourceType.toLowerCase();
|
||||
}
|
||||
|
||||
this.frameId = params.frameId;
|
||||
}
|
||||
|
||||
fillResponse(response: Protocol.Network.Response, type?: string) {
|
||||
fillRequest(request: Protocol.Network.Request, resourceType?: string) {
|
||||
this.url = request.url;
|
||||
this.method = request.method;
|
||||
if (!this.requestHeaders) {
|
||||
this.requestHeaders = request.headers;
|
||||
}
|
||||
this.postData = request.postData;
|
||||
this.hasPostData = request.hasPostData || false;
|
||||
|
||||
if (resourceType) {
|
||||
this.resourceType = resourceType.toLowerCase();
|
||||
}
|
||||
}
|
||||
|
||||
fillResponse(response: Protocol.Network.Response, resourceType?: string) {
|
||||
// if initial fetch was a 200, but now replacing with 304, don't!
|
||||
if (
|
||||
response.status == 304 &&
|
||||
|
@ -111,8 +118,8 @@ export class RequestResponseInfo {
|
|||
|
||||
this.protocol = response.protocol;
|
||||
|
||||
if (type) {
|
||||
this.resourceType = type.toLowerCase();
|
||||
if (resourceType) {
|
||||
this.resourceType = resourceType.toLowerCase();
|
||||
}
|
||||
|
||||
if (response.requestHeaders) {
|
||||
|
@ -292,9 +299,14 @@ export class RequestResponseInfo {
|
|||
return true;
|
||||
}
|
||||
|
||||
isCached() {
|
||||
return this.fromCache && !this.payload;
|
||||
}
|
||||
|
||||
shouldSkipSave() {
|
||||
// skip OPTIONS/HEAD responses, and 304 or 206 responses
|
||||
// skip cached, OPTIONS/HEAD responses, and 304 or 206 responses
|
||||
if (
|
||||
this.fromCache ||
|
||||
!this.payload ||
|
||||
(this.method && ["OPTIONS", "HEAD"].includes(this.method)) ||
|
||||
[206, 304].includes(this.status)
|
||||
|
|
|
@ -298,7 +298,7 @@ export class PageWorker {
|
|||
async crawlPage(opts: WorkerState) {
|
||||
const res = await this.crawler.crawlPage(opts);
|
||||
if (this.recorder) {
|
||||
opts.data.ts = await this.recorder.finishPage();
|
||||
await this.recorder.awaitPageResources();
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
@ -339,7 +339,21 @@ export class PageWorker {
|
|||
"worker",
|
||||
);
|
||||
}
|
||||
|
||||
await this.closePage();
|
||||
} finally {
|
||||
try {
|
||||
if (this.recorder) {
|
||||
opts.data.ts = await this.recorder.writePageInfoRecord();
|
||||
}
|
||||
} catch (e) {
|
||||
logger.error(
|
||||
"Error writing pageinfo recorder",
|
||||
{ ...formatErr(e), ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
}
|
||||
|
||||
await timedRun(
|
||||
this.crawler.pageFinished(data),
|
||||
FINISHED_TIMEOUT,
|
||||
|
|
|
@ -5,7 +5,7 @@ import { WARCParser } from "warcio";
|
|||
|
||||
test("run warc and ensure pageinfo records contain the correct resources", async () => {
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --url https://webrecorder.net/about --scopeType page --collection page-info-test --combineWARC",
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --url https://webrecorder.net/about --url https://invalid.invalid/ --scopeType page --collection page-info-test --combineWARC",
|
||||
);
|
||||
|
||||
const filename = path.join(
|
||||
|
@ -21,6 +21,7 @@ test("run warc and ensure pageinfo records contain the correct resources", async
|
|||
|
||||
let foundIndex = false;
|
||||
let foundAbout = false;
|
||||
let foundInvalid = false;
|
||||
|
||||
for await (const record of parser) {
|
||||
if (
|
||||
|
@ -40,10 +41,20 @@ test("run warc and ensure pageinfo records contain the correct resources", async
|
|||
const text = await record.contentText();
|
||||
validateResourcesAbout(JSON.parse(text));
|
||||
}
|
||||
|
||||
if (
|
||||
!foundInvalid &&
|
||||
record.warcTargetURI === "urn:pageinfo:https://invalid.invalid/"
|
||||
) {
|
||||
foundInvalid = true;
|
||||
const text = await record.contentText();
|
||||
validateResourcesInvalid(JSON.parse(text));
|
||||
}
|
||||
}
|
||||
|
||||
expect(foundIndex).toBe(true);
|
||||
expect(foundAbout).toBe(true);
|
||||
expect(foundInvalid).toBe(true);
|
||||
});
|
||||
|
||||
function validateResourcesIndex(json) {
|
||||
|
@ -161,5 +172,25 @@ function validateResourcesAbout(json) {
|
|||
{ status: 200, mime: "font/woff2", type: "font" },
|
||||
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
|
||||
{ status: 200, mime: "font/woff2", type: "font" },
|
||||
"https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2Fabout&d=webrecorder.net":
|
||||
{
|
||||
status: 0,
|
||||
type: "xhr",
|
||||
error: "net::ERR_BLOCKED_BY_CLIENT",
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
function validateResourcesInvalid(json) {
|
||||
expect(json).toHaveProperty("pageid");
|
||||
expect(json).toHaveProperty("url");
|
||||
expect(json).toHaveProperty("urls");
|
||||
expect(json.counts).toEqual({ jsErrors: 0 });
|
||||
expect(json.urls).toEqual({
|
||||
"https://invalid.invalid/": {
|
||||
status: 0,
|
||||
type: "document",
|
||||
error: "net::ERR_NAME_NOT_RESOLVED",
|
||||
},
|
||||
});
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue