Better tracking of failed requests + logging context exclude (#485)

- add --logExcludeContext for log contexts that should be excluded
(while --logContext specifies which are to be included)
- enable 'recorderNetwork' logging for debugging CDP network
- create default log context exclude list (containing: screencast,
recorderNetwork, jsErrors), customizable via --logExcludeContext

recorder: Track failed requests and include in pageinfo records with
status code 0
- cleanup cdp handler methods
- intercept requestWillBeSent to track requests that started (but may
not complete)
- fix shouldSkip() still working if no url is provided (eg. check only
headers)
- set status to 0 for async fetch failures
- remove responseServedFromCache interception, as response data
generally not available then, and responseReceived is still called
- pageinfo: include page requests that failed with status code 0, also
include 'error' status if available.
- ensure page is closed on failure
- ensure pageinfo still written even if nothing else is crawled for a
page
- track cached responses, add to debug logging (can also add to pageinfo
later if needed)

tests: add pageinfo test for crawling invalid URL, which should still
result in pageinfo record with status code 0

bump to 1.0.0-beta.7
This commit is contained in:
Ilya Kreymer 2024-03-07 08:35:53 -08:00 committed by GitHub
parent 65133c9d9d
commit 9f18a49c0a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 299 additions and 131 deletions

View file

@ -1,6 +1,6 @@
{ {
"name": "browsertrix-crawler", "name": "browsertrix-crawler",
"version": "1.0.0-beta.6", "version": "1.0.0-beta.7",
"main": "browsertrix-crawler", "main": "browsertrix-crawler",
"type": "module", "type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler", "repository": "https://github.com/webrecorder/browsertrix-crawler",

View file

@ -189,7 +189,8 @@ export class Crawler {
const debugLogging = this.params.logging.includes("debug"); const debugLogging = this.params.logging.includes("debug");
logger.setDebugLogging(debugLogging); logger.setDebugLogging(debugLogging);
logger.setLogLevel(this.params.logLevel); logger.setLogLevel(this.params.logLevel);
logger.setContext(this.params.context); logger.setContext(this.params.logContext);
logger.setExcludeContext(this.params.logExcludeContext);
// if automatically restarts on error exit code, // if automatically restarts on error exit code,
// exit with 0 from fatal by default, to avoid unnecessary restart // exit with 0 from fatal by default, to avoid unnecessary restart

View file

@ -15,7 +15,11 @@ import {
import { ScopedSeed } from "./seeds.js"; import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js"; import { interpolateFilename } from "./storage.js";
import { screenshotTypes } from "./screenshots.js"; import { screenshotTypes } from "./screenshots.js";
import { LOG_CONTEXT_TYPES, logger } from "./logger.js"; import {
DEFAULT_EXCLUDE_LOG_CONTEXTS,
LOG_CONTEXT_TYPES,
logger,
} from "./logger.js";
// ============================================================================ // ============================================================================
class ArgParser { class ArgParser {
@ -225,6 +229,14 @@ class ArgParser {
coerce, coerce,
}, },
logExcludeContext: {
describe: "Comma-separated list of contexts to NOT include in logs",
type: "array",
default: DEFAULT_EXCLUDE_LOG_CONTEXTS,
choices: LOG_CONTEXT_TYPES,
coerce,
},
text: { text: {
describe: describe:
"Extract initial (default) or final text to pages.jsonl or WARC resource record(s)", "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",

View file

@ -26,6 +26,7 @@ export const LOG_CONTEXT_TYPES = [
"general", "general",
"worker", "worker",
"recorder", "recorder",
"recorderNetwork",
"writer", "writer",
"state", "state",
"redis", "redis",
@ -51,13 +52,20 @@ export const LOG_CONTEXT_TYPES = [
export type LogContext = (typeof LOG_CONTEXT_TYPES)[number]; export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
export const DEFAULT_EXCLUDE_LOG_CONTEXTS: LogContext[] = [
"recorderNetwork",
"jsError",
"screencast",
];
// =========================================================================== // ===========================================================================
class Logger { class Logger {
logStream: Writable | null = null; logStream: Writable | null = null;
debugLogging = false; debugLogging = false;
logErrorsToRedis = false; logErrorsToRedis = false;
logLevels: string[] = []; logLevels: string[] = [];
contexts: string[] = []; contexts: LogContext[] = [];
excludeContexts: LogContext[] = [];
crawlState?: RedisCrawlState | null = null; crawlState?: RedisCrawlState | null = null;
fatalExitCode = 17; fatalExitCode = 17;
@ -81,10 +89,14 @@ class Logger {
this.logLevels = logLevels; this.logLevels = logLevels;
} }
setContext(contexts: string[]) { setContext(contexts: LogContext[]) {
this.contexts = contexts; this.contexts = contexts;
} }
setExcludeContext(contexts: LogContext[]) {
this.excludeContexts = contexts;
}
setCrawlState(crawlState: RedisCrawlState) { setCrawlState(crawlState: RedisCrawlState) {
this.crawlState = crawlState; this.crawlState = crawlState;
} }
@ -92,7 +104,7 @@ class Logger {
logAsJSON( logAsJSON(
message: string, message: string,
dataUnknown: unknown, dataUnknown: unknown,
context: string, context: LogContext,
logLevel = "info", logLevel = "info",
) { ) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
@ -110,6 +122,12 @@ class Logger {
} }
} }
if (this.excludeContexts.length) {
if (this.excludeContexts.indexOf(context) >= 0) {
return;
}
}
const dataToLog = { const dataToLog = {
timestamp: new Date().toISOString(), timestamp: new Date().toISOString(),
logLevel: logLevel, logLevel: logLevel,

View file

@ -42,10 +42,9 @@ const MIME_EVENT_STREAM = "text/event-stream";
const encoder = new TextEncoder(); const encoder = new TextEncoder();
// ================================================================= // =================================================================
// TODO: Fix this the next time the file is edited. // eslint-disable-next-line @typescript-eslint/no-explicit-any
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unused-vars
function logNetwork(msg: string, data: any) { function logNetwork(msg: string, data: any) {
// logger.debug(msg, data, "recorderNetwork"); logger.debug(msg, data, "recorderNetwork");
} }
// ================================================================= // =================================================================
@ -53,6 +52,8 @@ export type PageInfoValue = {
status: number; status: number;
mime?: string; mime?: string;
type?: string; type?: string;
error?: string;
fromBrowserCache?: boolean;
}; };
// ================================================================= // =================================================================
@ -150,7 +151,6 @@ export class Recorder {
async onCreatePage({ cdp }: { cdp: CDPSession }) { async onCreatePage({ cdp }: { cdp: CDPSession }) {
// Fetch // Fetch
cdp.on("Fetch.requestPaused", async (params) => { cdp.on("Fetch.requestPaused", async (params) => {
this.handleRequestPaused(params, cdp); this.handleRequestPaused(params, cdp);
}); });
@ -159,85 +159,41 @@ export class Recorder {
patterns: [{ urlPattern: "*", requestStage: "Response" }], patterns: [{ urlPattern: "*", requestStage: "Response" }],
}); });
await cdp.send("Console.enable");
// Response // Response
cdp.on("Network.responseReceived", (params) => { cdp.on("Network.responseReceived", (params) =>
// handling to fill in security details this.handleResponseReceived(params),
logNetwork("Network.responseReceived", { );
requestId: params.requestId,
...this.logDetails,
});
this.handleResponseReceived(params);
});
cdp.on("Network.responseReceivedExtraInfo", (params) => { cdp.on("Network.responseReceivedExtraInfo", (params) =>
logNetwork("Network.responseReceivedExtraInfo", { this.handleResponseReceivedExtraInfo(params),
requestId: params.requestId, );
...this.logDetails,
}); // Cache
const reqresp = this.pendingReqResp(params.requestId, true); cdp.on("Network.requestServedFromCache", (params) =>
if (reqresp) { this.handleRequestServedFromCache(params),
reqresp.fillResponseReceivedExtraInfo(params); );
}
});
// Request // Request
cdp.on("Network.requestWillBeSent", (params) =>
this.handleRequestWillBeSent(params),
);
cdp.on("Network.requestWillBeSent", (params) => { cdp.on("Network.requestWillBeSentExtraInfo", (params) =>
// only handling redirect here, committing last response in redirect chain this.handleRequestExtraInfo(params),
// request data stored from requestPaused );
if (params.redirectResponse) {
logNetwork("Network.requestWillBeSent after redirect", {
requestId: params.requestId,
...this.logDetails,
});
this.handleRedirectResponse(params);
}
});
cdp.on("Network.requestServedFromCache", (params) => {
logNetwork("Network.requestServedFromCache", {
requestId: params.requestId,
...this.logDetails,
});
const reqresp = this.pendingReqResp(params.requestId, true);
if (reqresp) {
this.addPageRecord(reqresp);
this.removeReqResp(params.requestId);
}
});
cdp.on("Network.requestWillBeSentExtraInfo", (params) => {
logNetwork("Network.requestWillBeSentExtraInfo", {
requestId: params.requestId,
...this.logDetails,
});
this.handleRequestExtraInfo(params);
});
// Loading // Loading
cdp.on("Network.loadingFinished", (params) => { cdp.on("Network.loadingFinished", (params) =>
logNetwork("Network.loadingFinished", { this.handleLoadingFinished(params),
requestId: params.requestId, );
...this.logDetails,
});
this.handleLoadingFinished(params);
});
cdp.on("Network.loadingFailed", (params) => { cdp.on("Network.loadingFailed", (params) =>
logNetwork("Network.loadingFailed", { this.handleLoadingFailed(params),
requestId: params.requestId, );
...this.logDetails,
});
this.handleLoadingFailed(params);
});
await cdp.send("Network.enable"); await cdp.send("Network.enable");
// Target // Target
cdp.on("Target.attachedToTarget", async (params) => { cdp.on("Target.attachedToTarget", async (params) => {
const { url, type, sessionId } = params.targetInfo; const { url, type, sessionId } = params.targetInfo;
if (type === "service_worker") { if (type === "service_worker") {
@ -255,6 +211,13 @@ export class Recorder {
} }
}); });
await cdp.send("Target.setAutoAttach", {
autoAttach: true,
waitForDebuggerOnStart: false,
flatten: true,
});
// Console
cdp.on("Console.messageAdded", (params) => { cdp.on("Console.messageAdded", (params) => {
const { message } = params; const { message } = params;
const { source, level } = message; const { source, level } = message;
@ -263,17 +226,19 @@ export class Recorder {
} }
}); });
await cdp.send("Target.setAutoAttach", { await cdp.send("Console.enable");
autoAttach: true,
waitForDebuggerOnStart: false,
flatten: true,
});
} }
handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) { handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) {
const { requestId, response, type } = params; const { requestId, response, type } = params;
const { mimeType } = response; const { mimeType, url } = response;
logNetwork("Network.responseReceived", {
requestId,
url,
...this.logDetails,
});
if (mimeType === MIME_EVENT_STREAM) { if (mimeType === MIME_EVENT_STREAM) {
return; return;
@ -285,15 +250,81 @@ export class Recorder {
} }
reqresp.fillResponse(response, type); reqresp.fillResponse(response, type);
}
this.addPageRecord(reqresp); handleResponseReceivedExtraInfo(
params: Protocol.Network.ResponseReceivedExtraInfoEvent,
) {
const { requestId } = params;
logNetwork("Network.responseReceivedExtraInfo", {
requestId,
...this.logDetails,
});
const reqresp = this.pendingReqResp(requestId, true);
if (reqresp) {
reqresp.fillResponseReceivedExtraInfo(params);
}
}
handleRequestServedFromCache(
params: Protocol.Network.RequestServedFromCacheEvent,
) {
const { requestId } = params;
const reqresp = this.pendingReqResp(requestId, true);
const url = reqresp?.url;
logNetwork("Network.requestServedFromCache", {
requestId,
url,
...this.logDetails,
});
if (reqresp) {
reqresp.fromCache = true;
}
}
handleRequestWillBeSent(params: Protocol.Network.RequestWillBeSentEvent) {
// only handling redirect here, committing last response in redirect chain
// request data stored from requestPaused
const { redirectResponse, requestId, request, type } = params;
const { headers, method, url } = request;
logNetwork("Network.requestWillBeSent", {
requestId,
redirectResponse,
...this.logDetails,
});
if (redirectResponse) {
this.handleRedirectResponse(params);
} else {
if (!this.shouldSkip(headers, url, method, type)) {
const reqresp = this.pendingReqResp(requestId);
if (reqresp) {
reqresp.fillRequest(request);
}
}
}
} }
handleRequestExtraInfo( handleRequestExtraInfo(
params: Protocol.Network.RequestWillBeSentExtraInfoEvent, params: Protocol.Network.RequestWillBeSentExtraInfoEvent,
) { ) {
if (!this.shouldSkip(params.headers)) { const { requestId, headers } = params;
const reqresp = this.pendingReqResp(params.requestId, true);
logNetwork("Network.requestWillBeSentExtraInfo", {
requestId,
...this.logDetails,
});
if (!this.shouldSkip(headers)) {
const reqresp = this.pendingReqResp(requestId, true);
if (reqresp) { if (reqresp) {
reqresp.fillRequestExtraInfo(params); reqresp.fillRequestExtraInfo(params);
} }
@ -328,11 +359,19 @@ export class Recorder {
const { errorText, type, requestId } = params; const { errorText, type, requestId } = params;
const reqresp = this.pendingReqResp(requestId, true); const reqresp = this.pendingReqResp(requestId, true);
const url = reqresp?.url;
logNetwork("Network.loadingFailed", {
requestId,
url,
...this.logDetails,
});
if (!reqresp) { if (!reqresp) {
return; return;
} }
const { url } = reqresp;
if (type) { if (type) {
reqresp.resourceType = type.toLowerCase(); reqresp.resourceType = type.toLowerCase();
} }
@ -377,19 +416,33 @@ export class Recorder {
"recorder", "recorder",
); );
} }
reqresp.status = 0;
reqresp.errorText = errorText;
this.addPageRecord(reqresp);
this.removeReqResp(requestId); this.removeReqResp(requestId);
} }
handleLoadingFinished(params: Protocol.Network.LoadingFinishedEvent) { handleLoadingFinished(params: Protocol.Network.LoadingFinishedEvent) {
const reqresp = this.pendingReqResp(params.requestId, true); const { requestId } = params;
const reqresp = this.pendingReqResp(requestId, true);
const url = reqresp?.url;
logNetwork("Network.loadingFinished", {
requestId,
url,
...this.logDetails,
});
if (!reqresp || reqresp.asyncLoading) { if (!reqresp || reqresp.asyncLoading) {
return; return;
} }
this.removeReqResp(params.requestId); this.removeReqResp(requestId);
if (!this.isValidUrl(reqresp.url)) { if (!this.isValidUrl(url)) {
return; return;
} }
@ -668,7 +721,15 @@ export class Recorder {
if (this.isValidUrl(reqresp.url)) { if (this.isValidUrl(reqresp.url)) {
const { status, resourceType: type } = reqresp; const { status, resourceType: type } = reqresp;
const mime = reqresp.getMimeType(); const mime = reqresp.getMimeType();
this.pageInfo.urls[reqresp.getCanonURL()] = { status, mime, type }; const info: PageInfoValue = { status, mime, type };
if (reqresp.errorText) {
info.error = reqresp.errorText;
}
//TODO: revisit if we want to record this later
// if (reqresp.fromCache) {
// info.fromBrowserCache = true;
// }
this.pageInfo.urls[reqresp.getCanonURL()] = info;
} }
} }
@ -684,9 +745,11 @@ export class Recorder {
); );
this.warcQ.add(() => this.writer.writeSingleRecord(resourceRecord)); this.warcQ.add(() => this.writer.writeSingleRecord(resourceRecord));
return this.pageInfo.ts;
} }
async finishPage() { async awaitPageResources() {
for (const [requestId, reqresp] of this.pendingRequests.entries()) { for (const [requestId, reqresp] of this.pendingRequests.entries()) {
if (reqresp.payload) { if (reqresp.payload) {
this.removeReqResp(requestId); this.removeReqResp(requestId);
@ -726,10 +789,6 @@ export class Recorder {
await sleep(5.0); await sleep(5.0);
numPending = this.pendingRequests.size; numPending = this.pendingRequests.size;
} }
await this.writePageInfoRecord();
return this.pageInfo.ts;
} }
async onClosePage() { async onClosePage() {
@ -768,7 +827,8 @@ export class Recorder {
method = headers[":method"]; method = headers[":method"];
} }
if (!this.isValidUrl(url)) { // only check if url is provided, since it is optional
if (url && !this.isValidUrl(url)) {
return true; return true;
} }
@ -949,6 +1009,10 @@ export class Recorder {
removeReqResp(requestId: string, allowReuse = false) { removeReqResp(requestId: string, allowReuse = false) {
const reqresp = this.pendingRequests.get(requestId); const reqresp = this.pendingRequests.get(requestId);
if (reqresp) {
const { url, requestId } = reqresp;
logNetwork("Removing reqresp", { requestId, url });
}
this.pendingRequests.delete(requestId); this.pendingRequests.delete(requestId);
if (!allowReuse) { if (!allowReuse) {
this.skipIds.add(requestId); this.skipIds.add(requestId);
@ -957,28 +1021,40 @@ export class Recorder {
} }
async serializeToWARC(reqresp: RequestResponseInfo) { async serializeToWARC(reqresp: RequestResponseInfo) {
if (reqresp.shouldSkipSave()) { // always include in pageinfo record if going to serialize to WARC
const { url, method, status, payload } = reqresp; // even if serialization does not happen
logNetwork("Skipping request/response", { this.addPageRecord(reqresp);
const { url, method, status, payload, requestId } = reqresp;
// Specifically log skipping cached resources
if (reqresp.isCached()) {
logger.debug(
"Skipping cached resource, should be already recorded",
{ url, status },
"recorder",
);
return;
} else if (reqresp.shouldSkipSave()) {
logNetwork("Skipping writing request/response", {
requestId,
url, url,
method, method,
status, status,
payloadLength: payload && payload.length, payloadLength: (payload && payload.length) || 0,
}); });
return; return;
} }
if ( if (
reqresp.url && url &&
reqresp.method === "GET" && method === "GET" &&
!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, reqresp.url)) !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url))
) { ) {
logNetwork("Skipping dupe", { url: reqresp.url }); logNetwork("Skipping dupe", { url });
return; return;
} }
this.addPageRecord(reqresp);
const responseRecord = createResponse(reqresp, this.pageid); const responseRecord = createResponse(reqresp, this.pageid);
const requestRecord = createRequest(reqresp, responseRecord, this.pageid); const requestRecord = createRequest(reqresp, responseRecord, this.pageid);
@ -1016,8 +1092,7 @@ export class Recorder {
const res = await fetcher.load(); const res = await fetcher.load();
const mime = const mime =
(reqresp && (reqresp.responseHeaders &&
reqresp.responseHeaders &&
reqresp.responseHeaders["content-type"] && reqresp.responseHeaders["content-type"] &&
reqresp.responseHeaders["content-type"].split(";")[0]) || reqresp.responseHeaders["content-type"].split(";")[0]) ||
""; "";
@ -1184,8 +1259,6 @@ class AsyncFetcher {
); );
} }
recorder.addPageRecord(reqresp);
recorder.warcQ.add(() => recorder.warcQ.add(() =>
recorder.writer.writeRecordPair( recorder.writer.writeRecordPair(
responseRecord, responseRecord,
@ -1204,8 +1277,15 @@ class AsyncFetcher {
{ url, networkId, filename, ...formatErr(e), ...logDetails }, { url, networkId, filename, ...formatErr(e), ...logDetails },
"recorder", "recorder",
); );
// indicate response is ultimately not valid
reqresp.status = 0;
reqresp.errorText = e.message;
} finally { } finally {
recorder.removeReqResp(networkId); // exclude direct fetch request with fake id
if (networkId !== "0") {
recorder.addPageRecord(reqresp);
recorder.removeReqResp(networkId);
}
} }
return fetched; return fetched;

View file

@ -37,6 +37,8 @@ export class RequestResponseInfo {
status: number = 0; status: number = 0;
statusText?: string; statusText?: string;
errorText?: string;
responseHeaders?: Record<string, string>; responseHeaders?: Record<string, string>;
responseHeadersList?: { name: string; value: string }[]; responseHeadersList?: { name: string; value: string }[];
responseHeadersText?: string; responseHeadersText?: string;
@ -44,11 +46,12 @@ export class RequestResponseInfo {
payload?: Uint8Array; payload?: Uint8Array;
// misc // misc
fromServiceWorker: boolean = false; fromServiceWorker = false;
fromCache = false;
frameId?: string; frameId?: string;
fetch: boolean = false; fetch = false;
resourceType?: string; resourceType?: string;
@ -71,13 +74,7 @@ export class RequestResponseInfo {
} }
fillFetchRequestPaused(params: Protocol.Fetch.RequestPausedEvent) { fillFetchRequestPaused(params: Protocol.Fetch.RequestPausedEvent) {
this.url = params.request.url; this.fillRequest(params.request, params.resourceType);
this.method = params.request.method;
if (!this.requestHeaders) {
this.requestHeaders = params.request.headers;
}
this.postData = params.request.postData;
this.hasPostData = params.request.hasPostData || false;
this.status = params.responseStatusCode || 0; this.status = params.responseStatusCode || 0;
this.statusText = params.responseStatusText || getStatusText(this.status); this.statusText = params.responseStatusText || getStatusText(this.status);
@ -86,14 +83,24 @@ export class RequestResponseInfo {
this.fetch = true; this.fetch = true;
if (params.resourceType) {
this.resourceType = params.resourceType.toLowerCase();
}
this.frameId = params.frameId; this.frameId = params.frameId;
} }
fillResponse(response: Protocol.Network.Response, type?: string) { fillRequest(request: Protocol.Network.Request, resourceType?: string) {
this.url = request.url;
this.method = request.method;
if (!this.requestHeaders) {
this.requestHeaders = request.headers;
}
this.postData = request.postData;
this.hasPostData = request.hasPostData || false;
if (resourceType) {
this.resourceType = resourceType.toLowerCase();
}
}
fillResponse(response: Protocol.Network.Response, resourceType?: string) {
// if initial fetch was a 200, but now replacing with 304, don't! // if initial fetch was a 200, but now replacing with 304, don't!
if ( if (
response.status == 304 && response.status == 304 &&
@ -111,8 +118,8 @@ export class RequestResponseInfo {
this.protocol = response.protocol; this.protocol = response.protocol;
if (type) { if (resourceType) {
this.resourceType = type.toLowerCase(); this.resourceType = resourceType.toLowerCase();
} }
if (response.requestHeaders) { if (response.requestHeaders) {
@ -292,9 +299,14 @@ export class RequestResponseInfo {
return true; return true;
} }
isCached() {
return this.fromCache && !this.payload;
}
shouldSkipSave() { shouldSkipSave() {
// skip OPTIONS/HEAD responses, and 304 or 206 responses // skip cached, OPTIONS/HEAD responses, and 304 or 206 responses
if ( if (
this.fromCache ||
!this.payload || !this.payload ||
(this.method && ["OPTIONS", "HEAD"].includes(this.method)) || (this.method && ["OPTIONS", "HEAD"].includes(this.method)) ||
[206, 304].includes(this.status) [206, 304].includes(this.status)

View file

@ -298,7 +298,7 @@ export class PageWorker {
async crawlPage(opts: WorkerState) { async crawlPage(opts: WorkerState) {
const res = await this.crawler.crawlPage(opts); const res = await this.crawler.crawlPage(opts);
if (this.recorder) { if (this.recorder) {
opts.data.ts = await this.recorder.finishPage(); await this.recorder.awaitPageResources();
} }
return res; return res;
} }
@ -339,7 +339,21 @@ export class PageWorker {
"worker", "worker",
); );
} }
await this.closePage();
} finally { } finally {
try {
if (this.recorder) {
opts.data.ts = await this.recorder.writePageInfoRecord();
}
} catch (e) {
logger.error(
"Error writing pageinfo recorder",
{ ...formatErr(e), ...this.logDetails },
"recorder",
);
}
await timedRun( await timedRun(
this.crawler.pageFinished(data), this.crawler.pageFinished(data),
FINISHED_TIMEOUT, FINISHED_TIMEOUT,

View file

@ -5,7 +5,7 @@ import { WARCParser } from "warcio";
test("run warc and ensure pageinfo records contain the correct resources", async () => { test("run warc and ensure pageinfo records contain the correct resources", async () => {
child_process.execSync( child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --url https://webrecorder.net/about --scopeType page --collection page-info-test --combineWARC", "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --url https://webrecorder.net/about --url https://invalid.invalid/ --scopeType page --collection page-info-test --combineWARC",
); );
const filename = path.join( const filename = path.join(
@ -21,6 +21,7 @@ test("run warc and ensure pageinfo records contain the correct resources", async
let foundIndex = false; let foundIndex = false;
let foundAbout = false; let foundAbout = false;
let foundInvalid = false;
for await (const record of parser) { for await (const record of parser) {
if ( if (
@ -40,10 +41,20 @@ test("run warc and ensure pageinfo records contain the correct resources", async
const text = await record.contentText(); const text = await record.contentText();
validateResourcesAbout(JSON.parse(text)); validateResourcesAbout(JSON.parse(text));
} }
if (
!foundInvalid &&
record.warcTargetURI === "urn:pageinfo:https://invalid.invalid/"
) {
foundInvalid = true;
const text = await record.contentText();
validateResourcesInvalid(JSON.parse(text));
}
} }
expect(foundIndex).toBe(true); expect(foundIndex).toBe(true);
expect(foundAbout).toBe(true); expect(foundAbout).toBe(true);
expect(foundInvalid).toBe(true);
}); });
function validateResourcesIndex(json) { function validateResourcesIndex(json) {
@ -161,5 +172,25 @@ function validateResourcesAbout(json) {
{ status: 200, mime: "font/woff2", type: "font" }, { status: 200, mime: "font/woff2", type: "font" },
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
{ status: 200, mime: "font/woff2", type: "font" }, { status: 200, mime: "font/woff2", type: "font" },
"https://stats.browsertrix.com/api/event?__wb_method=POST&n=pageview&u=https%3A%2F%2Fwebrecorder.net%2Fabout&d=webrecorder.net":
{
status: 0,
type: "xhr",
error: "net::ERR_BLOCKED_BY_CLIENT",
},
});
}
function validateResourcesInvalid(json) {
expect(json).toHaveProperty("pageid");
expect(json).toHaveProperty("url");
expect(json).toHaveProperty("urls");
expect(json.counts).toEqual({ jsErrors: 0 });
expect(json.urls).toEqual({
"https://invalid.invalid/": {
status: 0,
type: "document",
error: "net::ERR_NAME_NOT_RESOLVED",
},
}); });
} }