mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00

- extractLinks() now handled via browsertix-behaviors - fixes #770 via browsertrix-behaviors, checks for toJSON overrides - organize exposed functions to enum list
1963 lines
49 KiB
TypeScript
1963 lines
49 KiB
TypeScript
import PQueue from "p-queue";
|
|
|
|
import { logger, formatErr } from "./logger.js";
|
|
import { sleep, timedRun } from "./timing.js";
|
|
import {
|
|
RequestResponseInfo,
|
|
isHTMLMime,
|
|
isRedirectStatus,
|
|
} from "./reqresp.js";
|
|
|
|
import { fetch, Response } from "undici";
|
|
|
|
import {
|
|
getCustomRewriter,
|
|
removeRangeAsQuery,
|
|
rewriteDASH,
|
|
rewriteHLS,
|
|
} from "@webrecorder/wabac";
|
|
|
|
import { WARCRecord } from "warcio";
|
|
import { TempFileBuffer, WARCSerializer } from "warcio/node";
|
|
import { WARCWriter } from "./warcwriter.js";
|
|
import { RedisCrawlState, WorkerId } from "./state.js";
|
|
import { CDPSession, Protocol } from "puppeteer-core";
|
|
import { Crawler } from "../crawler.js";
|
|
import { getProxyDispatcher } from "./proxy.js";
|
|
import { ScopedSeed } from "./seeds.js";
|
|
|
|
const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
|
|
const MAX_TEXT_REWRITE_SIZE = 25_000_000;
|
|
|
|
const MAX_NETWORK_LOAD_SIZE = 200_000_000;
|
|
|
|
const TAKE_STREAM_BUFF_SIZE = 1024 * 64;
|
|
|
|
const ASYNC_FETCH_DUPE_KEY = "s:fetchdupe";
|
|
|
|
const WRITE_DUPE_KEY = "s:writedupe";
|
|
|
|
const MIME_EVENT_STREAM = "text/event-stream";
|
|
|
|
const RW_MIME_TYPES = [
|
|
"application/x-mpegURL",
|
|
"application/vnd.apple.mpegurl",
|
|
"application/dash+xml",
|
|
"text/html",
|
|
"application/json",
|
|
"text/javascript",
|
|
"application/javascript",
|
|
"application/x-javascript",
|
|
];
|
|
|
|
const encoder = new TextEncoder();
|
|
|
|
// =================================================================
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
function logNetwork(msg: string, data: any) {
|
|
logger.debug(msg, data, "recorderNetwork");
|
|
}
|
|
|
|
// =================================================================
|
|
export type PageInfoValue = {
|
|
status: number;
|
|
mime?: string;
|
|
type?: string;
|
|
error?: string;
|
|
fromBrowserCache?: boolean;
|
|
};
|
|
|
|
// =================================================================
|
|
export type PageInfoRecord = {
|
|
pageid: string;
|
|
urls: Record<string, PageInfoValue>;
|
|
url: string;
|
|
ts?: Date;
|
|
tsStatus: number;
|
|
counts: {
|
|
jsErrors: number;
|
|
};
|
|
};
|
|
|
|
// =================================================================
|
|
export type AsyncFetchOptions = {
|
|
reqresp: RequestResponseInfo;
|
|
expectedSize?: number;
|
|
// eslint-disable-next-line no-use-before-define
|
|
recorder: Recorder;
|
|
networkId: string;
|
|
filter?: (resp: Response) => boolean;
|
|
ignoreDupe?: boolean;
|
|
maxFetchSize?: number;
|
|
manualRedirect?: boolean;
|
|
};
|
|
|
|
// =================================================================
|
|
export type DirectFetchRequest = {
|
|
url: string;
|
|
headers: Record<string, string>;
|
|
cdp: CDPSession;
|
|
};
|
|
|
|
// =================================================================
|
|
export type DirectFetchResponse = {
|
|
fetched: boolean;
|
|
mime: string;
|
|
ts: Date;
|
|
};
|
|
|
|
// =================================================================
|
|
export type NetworkLoadAsyncFetchOptions = AsyncFetchOptions & {
|
|
cdp: CDPSession;
|
|
};
|
|
|
|
// =================================================================
|
|
export type ResponseStreamAsyncFetchOptions = NetworkLoadAsyncFetchOptions & {
|
|
requestId: string;
|
|
};
|
|
|
|
// =================================================================
|
|
export class Recorder {
|
|
workerid: WorkerId;
|
|
|
|
crawler: Crawler;
|
|
|
|
crawlState: RedisCrawlState;
|
|
|
|
fetcherQ: PQueue;
|
|
|
|
pendingRequests!: Map<string, RequestResponseInfo>;
|
|
skipIds!: Set<string>;
|
|
pageInfo!: PageInfoRecord;
|
|
mainFrameId: string | null = null;
|
|
skipRangeUrls!: Map<string, number>;
|
|
|
|
swTargetId?: string | null;
|
|
swFrameIds = new Set<string>();
|
|
swUrls = new Set<string>();
|
|
|
|
// TODO: Fix this the next time the file is edited.
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
logDetails: Record<string, any> = {};
|
|
|
|
pageFinished = false;
|
|
|
|
gzip = true;
|
|
|
|
writer: WARCWriter;
|
|
|
|
pageUrl!: string;
|
|
pageid!: string;
|
|
|
|
pageSeed?: ScopedSeed;
|
|
|
|
frameIdToExecId: Map<string, number> | null;
|
|
|
|
constructor({
|
|
workerid,
|
|
writer,
|
|
crawler,
|
|
}: {
|
|
workerid: WorkerId;
|
|
writer: WARCWriter;
|
|
crawler: Crawler;
|
|
}) {
|
|
this.workerid = workerid;
|
|
this.crawler = crawler;
|
|
this.crawlState = crawler.crawlState;
|
|
|
|
this.writer = writer;
|
|
|
|
this.fetcherQ = new PQueue({ concurrency: 1 });
|
|
|
|
this.frameIdToExecId = null;
|
|
}
|
|
|
|
async onCreatePage({
|
|
cdp,
|
|
frameIdToExecId,
|
|
}: {
|
|
cdp: CDPSession;
|
|
frameIdToExecId: Map<string, number>;
|
|
}) {
|
|
this.frameIdToExecId = frameIdToExecId;
|
|
this.pageFinished = false;
|
|
|
|
// Fetch
|
|
cdp.on("Fetch.requestPaused", (params) => {
|
|
void this.handleRequestPaused(params, cdp);
|
|
});
|
|
|
|
await cdp.send("Fetch.enable", {
|
|
patterns: [{ urlPattern: "*", requestStage: "Response" }],
|
|
});
|
|
|
|
// Response
|
|
cdp.on("Network.responseReceived", (params) =>
|
|
this.handleResponseReceived(params),
|
|
);
|
|
|
|
cdp.on("Network.responseReceivedExtraInfo", (params) =>
|
|
this.handleResponseReceivedExtraInfo(params),
|
|
);
|
|
|
|
// Cache
|
|
cdp.on("Network.requestServedFromCache", (params) =>
|
|
this.handleRequestServedFromCache(params),
|
|
);
|
|
|
|
// Request
|
|
cdp.on("Network.requestWillBeSent", (params) =>
|
|
this.handleRequestWillBeSent(params),
|
|
);
|
|
|
|
cdp.on("Network.requestWillBeSentExtraInfo", (params) =>
|
|
this.handleRequestExtraInfo(params),
|
|
);
|
|
|
|
// Loading
|
|
cdp.on("Network.loadingFinished", (params) =>
|
|
this.handleLoadingFinished(params),
|
|
);
|
|
|
|
cdp.on("Network.loadingFailed", (params) =>
|
|
this.handleLoadingFailed(params),
|
|
);
|
|
|
|
await cdp.send("Network.enable");
|
|
|
|
// Target
|
|
cdp.on("Target.attachedToTarget", async (params) => {
|
|
const { url, type, targetId } = params.targetInfo;
|
|
if (type === "service_worker") {
|
|
this.swTargetId = targetId;
|
|
this.swUrls.add(url);
|
|
}
|
|
});
|
|
|
|
cdp.on("Target.detachedFromTarget", async (params) => {
|
|
const { targetId } = params;
|
|
if (this.swTargetId && targetId === this.swTargetId) {
|
|
this.swUrls.clear();
|
|
this.swFrameIds.clear();
|
|
this.swTargetId = null;
|
|
}
|
|
});
|
|
|
|
await cdp.send("Target.setAutoAttach", {
|
|
autoAttach: true,
|
|
waitForDebuggerOnStart: false,
|
|
flatten: true,
|
|
});
|
|
|
|
// Console
|
|
cdp.on("Console.messageAdded", (params) => {
|
|
const { message } = params;
|
|
const { source, level } = message;
|
|
if (source === "console-api" && level === "error") {
|
|
this.pageInfo.counts.jsErrors++;
|
|
}
|
|
});
|
|
|
|
await cdp.send("Console.enable");
|
|
}
|
|
|
|
hasFrame(frameId: string) {
|
|
return this.swFrameIds.has(frameId) || this.frameIdToExecId?.has(frameId);
|
|
}
|
|
|
|
handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) {
|
|
const { requestId, response, type } = params;
|
|
|
|
const { mimeType, url, headers } = response;
|
|
|
|
logNetwork("Network.responseReceived", {
|
|
requestId,
|
|
url,
|
|
...this.logDetails,
|
|
});
|
|
|
|
if (mimeType === MIME_EVENT_STREAM) {
|
|
return;
|
|
}
|
|
|
|
if (this.shouldSkip(headers, url, undefined, type)) {
|
|
return;
|
|
}
|
|
|
|
const reqresp = this.pendingReqResp(requestId);
|
|
if (!reqresp) {
|
|
return;
|
|
}
|
|
|
|
reqresp.fillResponse(response, type);
|
|
}
|
|
|
|
handleResponseReceivedExtraInfo(
|
|
params: Protocol.Network.ResponseReceivedExtraInfoEvent,
|
|
) {
|
|
const { requestId } = params;
|
|
|
|
logNetwork("Network.responseReceivedExtraInfo", {
|
|
requestId,
|
|
...this.logDetails,
|
|
});
|
|
|
|
const reqresp = this.pendingReqResp(requestId, true);
|
|
if (reqresp) {
|
|
reqresp.fillResponseReceivedExtraInfo(params);
|
|
}
|
|
}
|
|
|
|
handleRequestServedFromCache(
|
|
params: Protocol.Network.RequestServedFromCacheEvent,
|
|
) {
|
|
const { requestId } = params;
|
|
|
|
const reqresp = this.pendingReqResp(requestId, true);
|
|
|
|
const url = reqresp?.url;
|
|
|
|
logNetwork("Network.requestServedFromCache", {
|
|
requestId,
|
|
url,
|
|
...this.logDetails,
|
|
});
|
|
|
|
if (reqresp) {
|
|
reqresp.fromCache = true;
|
|
}
|
|
}
|
|
|
|
handleRequestWillBeSent(params: Protocol.Network.RequestWillBeSentEvent) {
|
|
const { redirectResponse, requestId, request, type } = params;
|
|
|
|
const { headers, method, url } = request;
|
|
|
|
logNetwork("Network.requestWillBeSent", {
|
|
requestId,
|
|
url,
|
|
redirectResponse,
|
|
...this.logDetails,
|
|
});
|
|
|
|
// handling redirect here, committing last response in redirect chain
|
|
// request data stored from requestPaused
|
|
if (redirectResponse) {
|
|
this.handleRedirectResponse(params);
|
|
} else {
|
|
if (!this.shouldSkip(headers, url, method, type)) {
|
|
const reqresp = this.pendingReqResp(requestId);
|
|
if (reqresp) {
|
|
reqresp.fillRequest(request, type || "");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
handleRequestExtraInfo(
|
|
params: Protocol.Network.RequestWillBeSentExtraInfoEvent,
|
|
) {
|
|
const { requestId, headers } = params;
|
|
|
|
logNetwork("Network.requestWillBeSentExtraInfo", {
|
|
requestId,
|
|
...this.logDetails,
|
|
});
|
|
|
|
if (!this.shouldSkip(headers)) {
|
|
const reqresp = this.pendingReqResp(requestId, true);
|
|
if (reqresp) {
|
|
reqresp.fillRequestExtraInfo(params);
|
|
}
|
|
}
|
|
}
|
|
|
|
handleRedirectResponse(params: Protocol.Network.RequestWillBeSentEvent) {
|
|
const { requestId, redirectResponse, type } = params;
|
|
|
|
// remove and serialize, but allow reusing requestId
|
|
// as redirect chain may reuse same requestId for subsequent request
|
|
const reqresp = this.removeReqResp(requestId, true);
|
|
if (!reqresp || !redirectResponse) {
|
|
return;
|
|
}
|
|
|
|
reqresp.fillResponse(redirectResponse, type);
|
|
|
|
if (reqresp.isSelfRedirect()) {
|
|
logger.warn(
|
|
"Skipping self redirect",
|
|
{ url: reqresp.url, status: reqresp.status, ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
return;
|
|
}
|
|
|
|
try {
|
|
new URL(reqresp.url);
|
|
} catch (e) {
|
|
logger.warn(
|
|
"Skipping invalid URL from redirect",
|
|
{ url: reqresp.url, status: reqresp.status, ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
return;
|
|
}
|
|
|
|
this.serializeToWARC(reqresp).catch((e) =>
|
|
logger.warn("Error Serializing to WARC", e, "recorder"),
|
|
);
|
|
}
|
|
|
|
handleLoadingFailed(params: Protocol.Network.LoadingFailedEvent) {
|
|
const { errorText, type, requestId } = params;
|
|
|
|
const reqresp = this.pendingReqResp(requestId, true);
|
|
|
|
const url = reqresp?.url;
|
|
|
|
logNetwork("Network.loadingFailed", {
|
|
requestId,
|
|
url,
|
|
errorText,
|
|
type,
|
|
...this.logDetails,
|
|
});
|
|
|
|
if (!reqresp) {
|
|
return;
|
|
}
|
|
|
|
if (type) {
|
|
reqresp.resourceType = type.toLowerCase();
|
|
}
|
|
|
|
switch (errorText) {
|
|
case "net::ERR_BLOCKED_BY_CLIENT":
|
|
logNetwork("Request blocked", { url, errorText, ...this.logDetails });
|
|
break;
|
|
|
|
case "net::ERR_ABORTED":
|
|
// check if this is a false positive -- a valid download that's already been fetched
|
|
// the abort is just for page, but download will succeed
|
|
if (
|
|
(type === "Document" || type === "Media") &&
|
|
reqresp.isValidBinary()
|
|
) {
|
|
this.removeReqResp(requestId);
|
|
return this.serializeToWARC(reqresp);
|
|
} else if (url && reqresp.requestHeaders && type === "Media") {
|
|
this.removeReqResp(requestId);
|
|
logger.warn(
|
|
"Attempt direct fetch of failed request",
|
|
{ url, ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
reqresp.deleteRange();
|
|
reqresp.requestId = "0";
|
|
|
|
const fetcher = new AsyncFetcher({
|
|
reqresp,
|
|
expectedSize: reqresp.expectedSize ? reqresp.expectedSize : -1,
|
|
recorder: this,
|
|
networkId: "0",
|
|
});
|
|
void this.fetcherQ.add(() => fetcher.load());
|
|
return;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
logger.warn(
|
|
"Request failed",
|
|
{ url, errorText, type, status: reqresp.status, ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
}
|
|
reqresp.status = 0;
|
|
reqresp.errorText = errorText;
|
|
this.addPageRecord(reqresp);
|
|
|
|
this.removeReqResp(requestId);
|
|
}
|
|
|
|
handleLoadingFinished(params: Protocol.Network.LoadingFinishedEvent) {
|
|
const { requestId } = params;
|
|
|
|
const reqresp = this.pendingReqResp(requestId, true);
|
|
|
|
const url = reqresp?.url;
|
|
|
|
logNetwork("Network.loadingFinished", {
|
|
requestId,
|
|
url,
|
|
...this.logDetails,
|
|
});
|
|
|
|
if (!reqresp || reqresp.asyncLoading) {
|
|
return;
|
|
}
|
|
|
|
this.removeReqResp(requestId);
|
|
|
|
if (!this.isValidUrl(url)) {
|
|
return;
|
|
}
|
|
|
|
this.serializeToWARC(reqresp).catch((e) =>
|
|
logger.warn("Error Serializing to WARC", e, "recorder"),
|
|
);
|
|
}
|
|
|
|
async handleRequestPaused(
|
|
params: Protocol.Fetch.RequestPausedEvent,
|
|
cdp: CDPSession,
|
|
isBrowserContext = false,
|
|
) {
|
|
const {
|
|
requestId,
|
|
request,
|
|
responseStatusCode,
|
|
responseErrorReason,
|
|
resourceType,
|
|
networkId,
|
|
} = params;
|
|
const { method, headers, url } = request;
|
|
|
|
logNetwork("Fetch.requestPaused", {
|
|
requestId,
|
|
networkId,
|
|
url,
|
|
...this.logDetails,
|
|
});
|
|
|
|
let continued = false;
|
|
|
|
try {
|
|
if (
|
|
responseStatusCode &&
|
|
!responseErrorReason &&
|
|
!this.shouldSkip(headers, url, method, resourceType)
|
|
) {
|
|
continued = await this.handleFetchResponse(
|
|
params,
|
|
cdp,
|
|
isBrowserContext,
|
|
);
|
|
}
|
|
} catch (e) {
|
|
logger.error(
|
|
"Error handling response, probably skipping URL",
|
|
{ url, ...formatErr(e), ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
}
|
|
|
|
if (!continued) {
|
|
try {
|
|
await cdp.send("Fetch.continueResponse", { requestId });
|
|
} catch (e) {
|
|
logger.debug(
|
|
"continueResponse failed",
|
|
{ requestId, networkId, url, ...formatErr(e), ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
async handleFetchResponse(
|
|
params: Protocol.Fetch.RequestPausedEvent,
|
|
cdp: CDPSession,
|
|
isBrowserContext: boolean,
|
|
) {
|
|
const { request } = params;
|
|
const { url } = request;
|
|
const {
|
|
requestId,
|
|
responseErrorReason,
|
|
responseStatusCode,
|
|
responseHeaders,
|
|
} = params;
|
|
|
|
const networkId = params.networkId || requestId;
|
|
|
|
const reqresp = this.pendingReqResp(networkId);
|
|
|
|
if (!reqresp) {
|
|
return false;
|
|
}
|
|
|
|
if (responseErrorReason) {
|
|
logger.warn(
|
|
"Skipping failed response",
|
|
{ url, reason: responseErrorReason, ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
return false;
|
|
}
|
|
|
|
const contentLen = this._getContentLen(responseHeaders);
|
|
|
|
if (responseStatusCode === 206) {
|
|
const range = this._getContentRange(responseHeaders);
|
|
if (range === `bytes 0-${contentLen - 1}/${contentLen}`) {
|
|
logger.debug(
|
|
"Keep 206 Response, Full Range",
|
|
{ range, contentLen, url, networkId, ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
} else if (range?.startsWith("bytes 0-")) {
|
|
logger.debug(
|
|
"Re-request 206 Response without range",
|
|
{ range, contentLen, url, ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
this.removeReqResp(networkId);
|
|
|
|
if (!reqresp.fetchContinued) {
|
|
const reqrespNew = new RequestResponseInfo("0");
|
|
reqrespNew.fillRequest(params.request, params.resourceType);
|
|
reqrespNew.deleteRange();
|
|
reqrespNew.frameId = params.frameId;
|
|
|
|
this.addAsyncFetch(
|
|
{
|
|
reqresp: reqrespNew,
|
|
expectedSize: parseInt(range.split("/")[1]),
|
|
recorder: this,
|
|
networkId: "0",
|
|
cdp,
|
|
},
|
|
contentLen,
|
|
);
|
|
}
|
|
|
|
return false;
|
|
} else {
|
|
// logger.debug(
|
|
// "Skip 206 Response",
|
|
// { range, contentLen, url, ...this.logDetails },
|
|
// "recorder",
|
|
// );
|
|
this.removeReqResp(networkId);
|
|
const count = this.skipRangeUrls.get(url) || 0;
|
|
if (count > 2) {
|
|
// just fail additional range requests to save bandwidth, as these are not being recorded
|
|
await cdp.send("Fetch.failRequest", {
|
|
requestId,
|
|
errorReason: "BlockedByResponse",
|
|
});
|
|
return true;
|
|
}
|
|
this.skipRangeUrls.set(url, count + 1);
|
|
return false;
|
|
}
|
|
} else {
|
|
const filteredUrl = removeRangeAsQuery(url);
|
|
if (filteredUrl) {
|
|
this.removeReqResp(networkId);
|
|
|
|
logger.debug(
|
|
"Removed range in query, async fetching full URL",
|
|
{ url, ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
|
|
if (!reqresp.fetchContinued) {
|
|
const reqrespNew = new RequestResponseInfo("0");
|
|
reqrespNew.fillRequest(params.request, params.resourceType);
|
|
reqrespNew.url = filteredUrl;
|
|
reqrespNew.frameId = params.frameId;
|
|
|
|
this.addAsyncFetch({
|
|
reqresp: reqrespNew,
|
|
recorder: this,
|
|
networkId: "0",
|
|
cdp,
|
|
});
|
|
}
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// indicate that this is intercepted in the page context
|
|
if (!isBrowserContext) {
|
|
reqresp.inPageContext = true;
|
|
}
|
|
|
|
// Already being handled by a different handler
|
|
if (reqresp.fetchContinued) {
|
|
return false;
|
|
}
|
|
|
|
reqresp.fetchContinued = true;
|
|
|
|
reqresp.fillFetchRequestPaused(params);
|
|
|
|
if (
|
|
url === this.pageUrl &&
|
|
(!this.pageInfo.ts ||
|
|
(responseStatusCode && responseStatusCode <= this.pageInfo.tsStatus))
|
|
) {
|
|
const errorReason = await this.blockPageResponse(
|
|
url,
|
|
reqresp,
|
|
responseHeaders,
|
|
);
|
|
|
|
if (errorReason) {
|
|
await cdp.send("Fetch.failRequest", {
|
|
requestId,
|
|
errorReason,
|
|
});
|
|
return true;
|
|
}
|
|
|
|
logger.debug("Setting page timestamp", {
|
|
ts: reqresp.ts,
|
|
url,
|
|
status: responseStatusCode,
|
|
});
|
|
this.pageInfo.ts = reqresp.ts;
|
|
this.pageInfo.tsStatus = responseStatusCode!;
|
|
this.mainFrameId = params.frameId;
|
|
}
|
|
|
|
if (this.noResponseForStatus(responseStatusCode)) {
|
|
reqresp.payload = new Uint8Array();
|
|
return false;
|
|
}
|
|
|
|
const mimeType = this.getMimeType(responseHeaders) || "";
|
|
|
|
let streamingConsume = false;
|
|
|
|
if (
|
|
this.shouldStream(
|
|
contentLen,
|
|
responseStatusCode || 0,
|
|
reqresp.resourceType || "",
|
|
mimeType,
|
|
)
|
|
) {
|
|
const opts: ResponseStreamAsyncFetchOptions = {
|
|
reqresp,
|
|
expectedSize: contentLen,
|
|
recorder: this,
|
|
networkId,
|
|
cdp,
|
|
requestId,
|
|
};
|
|
|
|
// fetching using response stream as first attempt,
|
|
// await here and then either call fulFill, or if dupe, return false
|
|
const fetcher = new ResponseStreamAsyncFetcher(opts);
|
|
const res = await fetcher.load();
|
|
switch (res) {
|
|
case "dupe":
|
|
this.removeReqResp(networkId);
|
|
return false;
|
|
|
|
case "fetched":
|
|
streamingConsume = true;
|
|
break;
|
|
}
|
|
|
|
// if not consumed via takeStream, attempt async loading
|
|
if (!streamingConsume) {
|
|
this.addAsyncFetch(opts, contentLen);
|
|
return false;
|
|
}
|
|
} else {
|
|
try {
|
|
logNetwork("Fetching response", {
|
|
sizeExpected: this._getContentLen(responseHeaders),
|
|
url,
|
|
networkId,
|
|
...this.logDetails,
|
|
});
|
|
const { body, base64Encoded } = await cdp.send(
|
|
"Fetch.getResponseBody",
|
|
{ requestId },
|
|
);
|
|
reqresp.payload = Buffer.from(body, base64Encoded ? "base64" : "utf-8");
|
|
logNetwork("Fetch done", {
|
|
size: reqresp.payload.length,
|
|
url,
|
|
networkId,
|
|
...this.logDetails,
|
|
});
|
|
} catch (e) {
|
|
logger.warn(
|
|
"Failed to load response body",
|
|
{ url, networkId, ...formatErr(e), ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
const rewritten = await this.rewriteResponse(reqresp, mimeType);
|
|
|
|
// if in browser context, and not also intercepted in page context
|
|
// serialize here, as won't be getting a loadingFinished message for it
|
|
if (
|
|
isBrowserContext &&
|
|
!reqresp.inPageContext &&
|
|
!reqresp.asyncLoading &&
|
|
reqresp.payload
|
|
) {
|
|
this.removeReqResp(networkId);
|
|
await this.serializeToWARC(reqresp);
|
|
}
|
|
|
|
// not rewritten, and not streaming, return false to continue
|
|
if (!rewritten && !streamingConsume) {
|
|
if (!reqresp.payload) {
|
|
logger.error(
|
|
"Unable to get payload skipping recording",
|
|
{ url, ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
this.removeReqResp(networkId);
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// if has payload, encode it, otherwise return empty string
|
|
const body =
|
|
reqresp.payload && reqresp.payload.length
|
|
? Buffer.from(reqresp.payload).toString("base64")
|
|
: "";
|
|
|
|
try {
|
|
await cdp.send("Fetch.fulfillRequest", {
|
|
requestId,
|
|
responseCode: responseStatusCode || 0,
|
|
responseHeaders,
|
|
body,
|
|
});
|
|
} catch (e) {
|
|
const { resourceType } = reqresp;
|
|
const msg =
|
|
resourceType === "document"
|
|
? "document not loaded in browser, possibly other URLs missing"
|
|
: "URL not loaded in browser";
|
|
|
|
logger.debug(msg, { url, resourceType, e }, "recorder");
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
addAsyncFetch(opts: NetworkLoadAsyncFetchOptions, contentLen: number = -1) {
|
|
let fetcher: AsyncFetcher;
|
|
|
|
if (
|
|
opts.reqresp.method !== "GET" ||
|
|
contentLen > MAX_NETWORK_LOAD_SIZE ||
|
|
!opts.reqresp.inPageContext
|
|
) {
|
|
fetcher = new AsyncFetcher(opts);
|
|
} else {
|
|
fetcher = new NetworkLoadStreamAsyncFetcher(opts);
|
|
}
|
|
void this.fetcherQ.add(() => fetcher.load());
|
|
}
|
|
|
|
addExternalFetch(url: string, cdp: CDPSession) {
|
|
logger.debug(
|
|
"Handling fetch from behavior",
|
|
{ url, ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
const reqresp = new RequestResponseInfo("0");
|
|
reqresp.url = url;
|
|
reqresp.method = "GET";
|
|
reqresp.frameId = this.mainFrameId || undefined;
|
|
const fetcher = new NetworkLoadStreamAsyncFetcher({
|
|
reqresp,
|
|
recorder: this,
|
|
cdp,
|
|
networkId: "0",
|
|
});
|
|
void this.fetcherQ.add(() => fetcher.load());
|
|
// return true if successful
|
|
return true;
|
|
}
|
|
|
|
async blockPageResponse(
|
|
url: string,
|
|
reqresp: RequestResponseInfo,
|
|
responseHeaders?: Protocol.Fetch.HeaderEntry[],
|
|
): Promise<Protocol.Network.ErrorReason | undefined> {
|
|
if (reqresp.isRedirectStatus()) {
|
|
try {
|
|
let loc = this.getLocation(responseHeaders);
|
|
if (loc) {
|
|
loc = new URL(loc, url).href;
|
|
|
|
if (this.pageSeed && this.pageSeed.isExcluded(loc)) {
|
|
logger.warn(
|
|
"Skipping page that redirects to excluded URL",
|
|
{ newUrl: loc, origUrl: this.pageUrl },
|
|
"recorder",
|
|
);
|
|
|
|
return "BlockedByResponse";
|
|
}
|
|
}
|
|
} catch (e) {
|
|
// ignore
|
|
logger.debug("Redirect check error", e, "recorder");
|
|
}
|
|
}
|
|
}
|
|
|
|
startPage({ pageid, url }: { pageid: string; url: string }) {
|
|
this.pageid = pageid;
|
|
this.pageUrl = url;
|
|
this.logDetails = { page: url, workerid: this.workerid };
|
|
if (this.pendingRequests && this.pendingRequests.size) {
|
|
logger.debug(
|
|
"Interrupting timed out requests, moving to next page",
|
|
this.logDetails,
|
|
"recorder",
|
|
);
|
|
}
|
|
this.pendingRequests = new Map();
|
|
this.skipIds = new Set();
|
|
this.skipRangeUrls = new Map<string, number>();
|
|
this.pageFinished = false;
|
|
this.pageInfo = {
|
|
pageid,
|
|
urls: {},
|
|
url,
|
|
counts: { jsErrors: 0 },
|
|
tsStatus: 999,
|
|
};
|
|
this.mainFrameId = null;
|
|
}
|
|
|
|
addPageRecord(reqresp: RequestResponseInfo) {
|
|
if (this.isValidUrl(reqresp.url)) {
|
|
const { status, resourceType: type } = reqresp;
|
|
const mime = reqresp.getMimeType();
|
|
const info: PageInfoValue = { status, mime, type };
|
|
if (reqresp.errorText) {
|
|
info.error = reqresp.errorText;
|
|
}
|
|
//TODO: revisit if we want to record this later
|
|
// if (reqresp.fromCache) {
|
|
// info.fromBrowserCache = true;
|
|
// }
|
|
this.pageInfo.urls[reqresp.getCanonURL()] = info;
|
|
}
|
|
}
|
|
|
|
writePageInfoRecord() {
|
|
const text = JSON.stringify(this.pageInfo, null, 2);
|
|
|
|
const url = this.pageUrl;
|
|
|
|
this.writer.writeNewResourceRecord(
|
|
{
|
|
buffer: new TextEncoder().encode(text),
|
|
resourceType: "pageinfo",
|
|
contentType: "application/json",
|
|
url,
|
|
},
|
|
{ type: "pageinfo", url },
|
|
"recorder",
|
|
);
|
|
|
|
return this.pageInfo.ts;
|
|
}
|
|
|
|
async awaitPageResources() {
|
|
for (const [requestId, reqresp] of this.pendingRequests.entries()) {
|
|
if (reqresp.payload && !reqresp.asyncLoading) {
|
|
this.removeReqResp(requestId);
|
|
await this.serializeToWARC(reqresp);
|
|
// if no url, and not fetch intercept or async loading,
|
|
// drop this request, as it was not being loaded
|
|
} else if (
|
|
!reqresp.url ||
|
|
(!reqresp.intercepting && !reqresp.asyncLoading)
|
|
) {
|
|
logger.debug(
|
|
"Removing pending request that was never fetched",
|
|
{ requestId, url: reqresp.url, ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
this.removeReqResp(requestId);
|
|
}
|
|
}
|
|
|
|
let numPending = this.pendingRequests.size;
|
|
|
|
let pending = [];
|
|
while (
|
|
numPending &&
|
|
!this.pageFinished &&
|
|
!this.crawler.interruptReason &&
|
|
!this.crawler.postCrawling
|
|
) {
|
|
pending = [];
|
|
for (const [requestId, reqresp] of this.pendingRequests.entries()) {
|
|
const url = reqresp.url || "";
|
|
const entry: {
|
|
requestId: string;
|
|
url: string;
|
|
expectedSize?: number;
|
|
readSize?: number;
|
|
resourceType?: string;
|
|
} = { requestId, url };
|
|
if (reqresp.expectedSize) {
|
|
entry.expectedSize = reqresp.expectedSize;
|
|
}
|
|
if (reqresp.readSize) {
|
|
entry.readSize = reqresp.readSize;
|
|
}
|
|
if (reqresp.resourceType) {
|
|
entry.resourceType = reqresp.resourceType;
|
|
}
|
|
pending.push(entry);
|
|
}
|
|
|
|
logger.debug(
|
|
"Finishing pending requests for page",
|
|
{ numPending, pending, ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
await sleep(5.0);
|
|
numPending = this.pendingRequests.size;
|
|
}
|
|
|
|
if (this.pendingRequests.size) {
|
|
logger.warn(
|
|
"Dropping timed out requests",
|
|
{ numPending, pending, ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
for (const requestId of this.pendingRequests.keys()) {
|
|
this.removeReqResp(requestId);
|
|
}
|
|
}
|
|
}
|
|
|
|
async onClosePage() {
|
|
// Any page-specific handling before page is closed.
|
|
this.frameIdToExecId = null;
|
|
|
|
this.pageFinished = true;
|
|
}
|
|
|
|
async onDone(timeout: number) {
|
|
await this.crawlState.setStatus("pending-wait");
|
|
|
|
const finishFetch = async () => {
|
|
logger.debug("Finishing Fetcher Queue", this.logDetails, "recorder");
|
|
await this.fetcherQ.onIdle();
|
|
};
|
|
|
|
if (timeout > 0) {
|
|
await timedRun(
|
|
finishFetch(),
|
|
timeout,
|
|
"Finishing Fetch Timed Out",
|
|
this.logDetails,
|
|
"recorder",
|
|
);
|
|
}
|
|
|
|
logger.debug("Finishing WARC writing", this.logDetails, "recorder");
|
|
|
|
await this.writer.flush();
|
|
}
|
|
|
|
shouldSkip(
|
|
headers: Protocol.Network.Headers,
|
|
url?: string,
|
|
method?: string,
|
|
resourceType?: string,
|
|
) {
|
|
if (headers && !method) {
|
|
method = headers[":method"];
|
|
}
|
|
|
|
// only check if url is provided, since it is optional
|
|
if (url && !this.isValidUrl(url)) {
|
|
return true;
|
|
}
|
|
|
|
if (method === "OPTIONS" || method === "HEAD") {
|
|
return true;
|
|
}
|
|
|
|
if (["EventSource", "WebSocket", "Ping"].includes(resourceType || "")) {
|
|
return true;
|
|
}
|
|
|
|
// beacon
|
|
if (resourceType === "Other" && method === "POST") {
|
|
return true;
|
|
}
|
|
|
|
// skip eventsource, resourceType may not be set correctly
|
|
if (
|
|
headers &&
|
|
(headers["accept"] === MIME_EVENT_STREAM ||
|
|
headers["Accept"] === MIME_EVENT_STREAM)
|
|
) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
async rewriteResponse(reqresp: RequestResponseInfo, contentType: string) {
|
|
const { url, extraOpts, payload } = reqresp;
|
|
|
|
// don't rewrite if payload is missing or too big
|
|
if (!payload || !payload.length || payload.length > MAX_TEXT_REWRITE_SIZE) {
|
|
return false;
|
|
}
|
|
|
|
let newString = null;
|
|
let string = null;
|
|
|
|
switch (contentType) {
|
|
case "application/x-mpegURL":
|
|
case "application/vnd.apple.mpegurl":
|
|
string = payload.toString();
|
|
newString = rewriteHLS(string, { save: extraOpts });
|
|
break;
|
|
|
|
case "application/dash+xml":
|
|
string = payload.toString();
|
|
newString = rewriteDASH(string, { save: extraOpts });
|
|
break;
|
|
|
|
case "text/html":
|
|
case "application/json":
|
|
case "text/javascript":
|
|
case "application/javascript":
|
|
case "application/x-javascript": {
|
|
const rw = getCustomRewriter(url, isHTMLMime(contentType));
|
|
|
|
if (rw) {
|
|
string = payload.toString();
|
|
newString = rw.rewrite(string, { live: true, save: extraOpts });
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!newString) {
|
|
return false;
|
|
}
|
|
|
|
if (newString !== string) {
|
|
extraOpts.rewritten = 1;
|
|
logger.debug(
|
|
"Content Rewritten",
|
|
{ url, ...this.logDetails },
|
|
"recorder",
|
|
);
|
|
reqresp.payload = encoder.encode(newString);
|
|
reqresp.isRemoveRange = true;
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
isEssentialResource(resourceType: string, contentType: string) {
|
|
if (resourceType === "script" || resourceType === "stylesheet") {
|
|
return true;
|
|
}
|
|
|
|
if (RW_MIME_TYPES.includes(contentType)) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
shouldStream(
|
|
contentLength: number,
|
|
responseStatusCode: number,
|
|
resourceType: string,
|
|
mimeType: string,
|
|
) {
|
|
// if contentLength is too large even for rewriting, always stream, will not do rewriting
|
|
// even if text
|
|
if (contentLength > MAX_TEXT_REWRITE_SIZE) {
|
|
return true;
|
|
}
|
|
|
|
// if contentLength larger but is essential resource, do stream
|
|
// otherwise full fetch for rewriting
|
|
if (
|
|
contentLength > MAX_BROWSER_DEFAULT_FETCH_SIZE &&
|
|
!this.isEssentialResource(resourceType, mimeType)
|
|
) {
|
|
return true;
|
|
}
|
|
|
|
// if contentLength is unknown, also stream if its an essential resource and not 3xx / 4xx / 5xx
|
|
// status code, as these codes may have no content-length, and are likely small
|
|
if (
|
|
contentLength < 0 &&
|
|
!this.isEssentialResource(resourceType, mimeType) &&
|
|
responseStatusCode >= 200 &&
|
|
responseStatusCode < 300
|
|
) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
protected getMimeType(
|
|
headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
|
|
) {
|
|
if (!headers) {
|
|
return null;
|
|
}
|
|
for (const header of headers) {
|
|
if (header.name.toLowerCase() === "content-type") {
|
|
return header.value.split(";")[0];
|
|
}
|
|
}
|
|
|
|
return null;
|
|
}
|
|
|
|
protected getLocation(
|
|
headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
|
|
) {
|
|
if (!headers) {
|
|
return null;
|
|
}
|
|
for (const header of headers) {
|
|
if (header.name.toLowerCase() === "location") {
|
|
return header.value;
|
|
}
|
|
}
|
|
|
|
return null;
|
|
}
|
|
|
|
protected _getContentLen(headers?: Protocol.Fetch.HeaderEntry[]) {
|
|
if (!headers) {
|
|
return -1;
|
|
}
|
|
for (const header of headers) {
|
|
if (header.name.toLowerCase() === "content-length") {
|
|
return Number(header.value);
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
_getContentRange(headers?: Protocol.Fetch.HeaderEntry[]) {
|
|
if (!headers) {
|
|
return null;
|
|
}
|
|
for (const header of headers) {
|
|
if (header.name.toLowerCase() === "content-range") {
|
|
return header.value;
|
|
}
|
|
}
|
|
|
|
return null;
|
|
}
|
|
|
|
noResponseForStatus(status: number | undefined | null) {
|
|
return !status || status === 204 || (status >= 300 && status < 400);
|
|
}
|
|
|
|
isValidUrl(url?: string) {
|
|
return url && (url.startsWith("https:") || url.startsWith("http:"));
|
|
}
|
|
|
|
pendingReqResp(requestId: string, reuseOnly = false) {
|
|
if (!this.pendingRequests.has(requestId)) {
|
|
if (reuseOnly || !requestId) {
|
|
return null;
|
|
}
|
|
if (this.skipIds.has(requestId)) {
|
|
logNetwork("Skipping ignored id", { requestId });
|
|
return null;
|
|
}
|
|
const reqresp = new RequestResponseInfo(requestId);
|
|
this.pendingRequests.set(requestId, reqresp);
|
|
return reqresp;
|
|
} else {
|
|
const reqresp = this.pendingRequests.get(requestId);
|
|
if (reqresp && requestId !== reqresp.requestId) {
|
|
logger.warn(
|
|
"Invalid request id",
|
|
{ requestId, actualRequestId: reqresp.requestId },
|
|
"recorder",
|
|
);
|
|
}
|
|
return reqresp;
|
|
}
|
|
}
|
|
|
|
removeReqResp(requestId: string, allowReuse = false) {
|
|
const reqresp = this.pendingRequests.get(requestId);
|
|
if (reqresp) {
|
|
const { url, requestId } = reqresp;
|
|
logNetwork("Removing reqresp", { requestId, url });
|
|
}
|
|
this.pendingRequests.delete(requestId);
|
|
if (!allowReuse) {
|
|
this.skipIds.add(requestId);
|
|
}
|
|
return reqresp;
|
|
}
|
|
|
|
async serializeToWARC(reqresp: RequestResponseInfo) {
|
|
// always include in pageinfo record if going to serialize to WARC
|
|
// even if serialization does not happen
|
|
this.addPageRecord(reqresp);
|
|
|
|
const { url, method, status, payload, requestId } = reqresp;
|
|
|
|
// Specifically log skipping cached resources
|
|
if (reqresp.isCached()) {
|
|
logger.debug(
|
|
"Skipping cached resource, should be already recorded",
|
|
{ url, status },
|
|
"recorder",
|
|
);
|
|
return;
|
|
} else if (reqresp.shouldSkipSave()) {
|
|
logger.debug(
|
|
"Skipping writing request/response",
|
|
{
|
|
requestId,
|
|
url,
|
|
method,
|
|
status,
|
|
payloadLength: (payload && payload.length) || 0,
|
|
},
|
|
"recorder",
|
|
);
|
|
return;
|
|
}
|
|
|
|
if (
|
|
url &&
|
|
method === "GET" &&
|
|
!isRedirectStatus(status) &&
|
|
!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status))
|
|
) {
|
|
logNetwork("Skipping dupe", { url, status, ...this.logDetails });
|
|
return;
|
|
}
|
|
|
|
const responseRecord = createResponse(reqresp, this.pageid);
|
|
const requestRecord = createRequest(reqresp, responseRecord, this.pageid);
|
|
|
|
this.writer.writeRecordPair(responseRecord, requestRecord);
|
|
}
|
|
|
|
async directFetchCapture({
|
|
url,
|
|
headers,
|
|
cdp,
|
|
}: DirectFetchRequest): Promise<DirectFetchResponse> {
|
|
const reqresp = new RequestResponseInfo("0");
|
|
const ts = new Date();
|
|
|
|
const cookie = await this.getCookieString(cdp, url);
|
|
if (cookie) {
|
|
headers["Cookie"] = cookie;
|
|
}
|
|
|
|
reqresp.url = url;
|
|
reqresp.method = "GET";
|
|
reqresp.requestHeaders = headers;
|
|
reqresp.ts = ts;
|
|
|
|
let mime: string = "";
|
|
|
|
const filter = (resp: Response) => {
|
|
// only direct load 200 responses
|
|
if (resp.status !== 200) {
|
|
return false;
|
|
}
|
|
|
|
const ct = resp.headers.get("content-type");
|
|
if (ct) {
|
|
mime = ct.split(";")[0];
|
|
}
|
|
|
|
const result = !isHTMLMime(mime);
|
|
|
|
if (result) {
|
|
logger.info(
|
|
"Directly fetching page URL without browser",
|
|
{ url, ...this.logDetails },
|
|
"fetch",
|
|
);
|
|
}
|
|
|
|
return result;
|
|
};
|
|
|
|
// ignore dupes: if previous URL was not a page, still load as page. if previous was page,
|
|
// should not get here, as dupe pages tracked via seen list
|
|
const fetcher = new AsyncFetcher({
|
|
reqresp,
|
|
recorder: this,
|
|
networkId: "0",
|
|
filter,
|
|
ignoreDupe: true,
|
|
manualRedirect: true,
|
|
});
|
|
const res = await fetcher.load();
|
|
|
|
// if we get here, resource was not filtered out, has status code of 200
|
|
|
|
this.addPageRecord(reqresp);
|
|
|
|
const fetched = res === "fetched";
|
|
|
|
if (
|
|
url === this.pageUrl &&
|
|
fetched &&
|
|
(!this.pageInfo.ts || 200 < this.pageInfo.tsStatus)
|
|
) {
|
|
logger.debug("Setting page timestamp", { ts, url, status: 200 });
|
|
this.pageInfo.ts = ts;
|
|
this.pageInfo.tsStatus = 200;
|
|
}
|
|
|
|
return { fetched, mime, ts };
|
|
}
|
|
|
|
async getCookieString(cdp: CDPSession, url: string): Promise<string> {
|
|
try {
|
|
const cookieList: string[] = [];
|
|
const { cookies } = await cdp.send("Network.getCookies", { urls: [url] });
|
|
for (const { name, value } of cookies) {
|
|
cookieList.push(`${name}=${value}`);
|
|
}
|
|
|
|
return cookieList.join(";");
|
|
} catch (e) {
|
|
logger.warn("Error getting cookies", { page: url, e }, "recorder");
|
|
return "";
|
|
}
|
|
}
|
|
}
|
|
|
|
// =================================================================
|
|
class AsyncFetcher {
|
|
reqresp: RequestResponseInfo;
|
|
|
|
networkId: string;
|
|
filter?: (resp: Response) => boolean;
|
|
ignoreDupe = false;
|
|
|
|
maxFetchSize: number;
|
|
|
|
recorder: Recorder;
|
|
|
|
manualRedirect = false;
|
|
|
|
constructor({
|
|
reqresp,
|
|
expectedSize = -1,
|
|
recorder,
|
|
networkId,
|
|
filter = undefined,
|
|
ignoreDupe = false,
|
|
maxFetchSize = MAX_BROWSER_DEFAULT_FETCH_SIZE,
|
|
manualRedirect = false,
|
|
}: AsyncFetchOptions) {
|
|
this.reqresp = reqresp;
|
|
this.reqresp.expectedSize = expectedSize;
|
|
this.reqresp.asyncLoading = true;
|
|
|
|
this.networkId = networkId;
|
|
this.filter = filter;
|
|
this.ignoreDupe = ignoreDupe;
|
|
|
|
this.recorder = recorder;
|
|
|
|
this.maxFetchSize = maxFetchSize;
|
|
|
|
this.manualRedirect = manualRedirect;
|
|
}
|
|
|
|
async load() {
|
|
const { reqresp, recorder, networkId } = this;
|
|
const { url, status } = reqresp;
|
|
|
|
const { pageid, crawlState, gzip, logDetails } = recorder;
|
|
|
|
let fetched = "notfetched";
|
|
|
|
try {
|
|
if (
|
|
reqresp.method === "GET" &&
|
|
url &&
|
|
!(await crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url, status))
|
|
) {
|
|
if (!this.ignoreDupe) {
|
|
this.reqresp.asyncLoading = false;
|
|
return "dupe";
|
|
}
|
|
}
|
|
|
|
const body = await this._doFetch();
|
|
fetched = "fetched";
|
|
|
|
const responseRecord = createResponse(reqresp, pageid, body);
|
|
const requestRecord = createRequest(reqresp, responseRecord, pageid);
|
|
|
|
const serializer = new WARCSerializer(responseRecord, {
|
|
gzip,
|
|
maxMemSize: this.maxFetchSize,
|
|
});
|
|
|
|
try {
|
|
let readSize = await serializer.digestRecord();
|
|
if (serializer.httpHeadersBuff) {
|
|
readSize -= serializer.httpHeadersBuff.length;
|
|
}
|
|
reqresp.readSize = readSize;
|
|
// set truncated field and recompute header buff
|
|
if (reqresp.truncated) {
|
|
responseRecord.warcHeaders.headers.set(
|
|
"WARC-Truncated",
|
|
reqresp.truncated,
|
|
);
|
|
// todo: keep this internal in warcio after adding new header
|
|
serializer.warcHeadersBuff = encoder.encode(
|
|
responseRecord.warcHeaders.toString(),
|
|
);
|
|
}
|
|
} catch (e) {
|
|
logger.error(
|
|
"Error reading + digesting payload",
|
|
{ url, ...formatErr(e), ...logDetails },
|
|
"recorder",
|
|
);
|
|
}
|
|
|
|
if (
|
|
reqresp.readSize === reqresp.expectedSize ||
|
|
reqresp.expectedSize < 0
|
|
) {
|
|
logger.debug(
|
|
"Async fetch: streaming done",
|
|
{
|
|
size: reqresp.readSize,
|
|
expected: reqresp.expectedSize,
|
|
networkId,
|
|
url,
|
|
...logDetails,
|
|
},
|
|
"recorder",
|
|
);
|
|
} else {
|
|
logger.warn(
|
|
"Async fetch: possible response size mismatch",
|
|
{
|
|
type: this.constructor.name,
|
|
size: reqresp.readSize,
|
|
expected: reqresp.expectedSize,
|
|
url,
|
|
...logDetails,
|
|
},
|
|
"recorder",
|
|
);
|
|
if (status === 206 || status === 200) {
|
|
void serializer.externalBuffer?.purge();
|
|
await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status);
|
|
return "notfetched";
|
|
}
|
|
}
|
|
|
|
const externalBuffer: TempFileBuffer =
|
|
serializer.externalBuffer as TempFileBuffer;
|
|
|
|
if (externalBuffer) {
|
|
const { currSize, buffers, fh } = externalBuffer;
|
|
|
|
// if fully buffered in memory, then populate the payload to return to browser
|
|
if (buffers && buffers.length && !fh) {
|
|
reqresp.payload = Buffer.concat(buffers, currSize);
|
|
externalBuffer.buffers = [reqresp.payload];
|
|
} else if (fh) {
|
|
logger.debug(
|
|
"Large payload written to WARC, but not returned to browser (would require rereading into memory)",
|
|
{ url, actualSize: reqresp.readSize, maxSize: this.maxFetchSize },
|
|
"recorder",
|
|
);
|
|
}
|
|
}
|
|
|
|
if (Object.keys(reqresp.extraOpts).length) {
|
|
responseRecord.warcHeaders.headers.set(
|
|
"WARC-JSON-Metadata",
|
|
JSON.stringify(reqresp.extraOpts),
|
|
);
|
|
}
|
|
|
|
recorder.writer.writeRecordPair(
|
|
responseRecord,
|
|
requestRecord,
|
|
serializer,
|
|
);
|
|
|
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
} catch (e: any) {
|
|
await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url!, status);
|
|
if (e.message === "response-filtered-out") {
|
|
throw e;
|
|
}
|
|
logger.debug(
|
|
"Streaming Fetch Error",
|
|
{ url, networkId, ...formatErr(e), ...logDetails },
|
|
"recorder",
|
|
);
|
|
// indicate response is ultimately not valid
|
|
reqresp.status = 0;
|
|
reqresp.errorText = e.message;
|
|
} finally {
|
|
recorder.addPageRecord(reqresp);
|
|
// exclude direct fetch request with fake id
|
|
if (networkId !== "0") {
|
|
recorder.removeReqResp(networkId);
|
|
}
|
|
}
|
|
|
|
return fetched;
|
|
}
|
|
|
|
async _doFetch() {
|
|
const { reqresp } = this;
|
|
const { method, url } = reqresp;
|
|
logger.debug("Async started: fetch", { url }, "recorder");
|
|
|
|
const headers = reqresp.getRequestHeadersDict();
|
|
|
|
let dispatcher = getProxyDispatcher();
|
|
|
|
if (dispatcher) {
|
|
dispatcher = dispatcher.compose((dispatch) => {
|
|
return (opts, handler) => {
|
|
if (opts.headers) {
|
|
reqresp.requestHeaders = opts.headers as Record<string, string>;
|
|
}
|
|
return dispatch(opts, handler);
|
|
};
|
|
});
|
|
}
|
|
|
|
const resp = await fetch(url!, {
|
|
method,
|
|
headers,
|
|
body: reqresp.postData || undefined,
|
|
redirect: this.manualRedirect ? "manual" : "follow",
|
|
dispatcher,
|
|
});
|
|
|
|
if (this.filter && !this.filter(resp)) {
|
|
// if redirect and cancelled, read whole buffer to avoid possible node error event
|
|
if (resp.status >= 300 && resp.status < 400) {
|
|
await resp.arrayBuffer();
|
|
} else {
|
|
// otherwise, just cancel
|
|
resp.body?.cancel().catch(() => {});
|
|
}
|
|
throw new Error("response-filtered-out");
|
|
}
|
|
|
|
if (
|
|
reqresp.expectedSize < 0 &&
|
|
resp.headers.get("content-length") &&
|
|
!resp.headers.get("content-encoding")
|
|
) {
|
|
reqresp.expectedSize = Number(resp.headers.get("content-length") || -1);
|
|
}
|
|
|
|
if (reqresp.expectedSize === 0) {
|
|
reqresp.fillFetchResponse(resp);
|
|
reqresp.payload = new Uint8Array();
|
|
return;
|
|
} else if (!resp.body) {
|
|
throw new Error("fetch body missing, fetch aborted");
|
|
}
|
|
|
|
reqresp.fillFetchResponse(resp);
|
|
|
|
return this.takeReader(resp.body.getReader());
|
|
}
|
|
|
|
async *takeReader(reader: ReadableStreamDefaultReader<Uint8Array>) {
|
|
let size = 0;
|
|
try {
|
|
while (true) {
|
|
const { value, done } = await reader.read();
|
|
if (done) {
|
|
break;
|
|
}
|
|
|
|
size += value.length;
|
|
yield value;
|
|
}
|
|
} catch (e) {
|
|
logger.warn(
|
|
"takeReader interrupted",
|
|
{
|
|
size,
|
|
url: this.reqresp.url,
|
|
...formatErr(e),
|
|
...this.recorder.logDetails,
|
|
},
|
|
"recorder",
|
|
);
|
|
this.reqresp.truncated = "disconnect";
|
|
}
|
|
}
|
|
|
|
async *takeStreamIter(cdp: CDPSession, stream: Protocol.IO.StreamHandle) {
|
|
let size = 0;
|
|
try {
|
|
while (true) {
|
|
const { data, base64Encoded, eof } = await cdp.send("IO.read", {
|
|
handle: stream,
|
|
size: TAKE_STREAM_BUFF_SIZE,
|
|
});
|
|
const buff = Buffer.from(data, base64Encoded ? "base64" : "utf-8");
|
|
|
|
size += buff.length;
|
|
yield buff;
|
|
|
|
if (eof) {
|
|
break;
|
|
}
|
|
}
|
|
} catch (e) {
|
|
logger.warn(
|
|
"takeStream interrupted",
|
|
{
|
|
size,
|
|
url: this.reqresp.url,
|
|
...formatErr(e),
|
|
...this.recorder.logDetails,
|
|
},
|
|
"recorder",
|
|
);
|
|
this.reqresp.truncated = "disconnect";
|
|
}
|
|
}
|
|
}
|
|
|
|
// =================================================================
|
|
class ResponseStreamAsyncFetcher extends AsyncFetcher {
|
|
cdp: CDPSession;
|
|
requestId: string;
|
|
|
|
constructor(opts: ResponseStreamAsyncFetchOptions) {
|
|
super(opts);
|
|
this.cdp = opts.cdp;
|
|
this.requestId = opts.requestId;
|
|
}
|
|
|
|
async _doFetch() {
|
|
const { requestId, reqresp, cdp } = this;
|
|
const { url } = reqresp;
|
|
logger.debug("Async started: takeStream", { url }, "recorder");
|
|
|
|
const { stream } = await cdp.send("Fetch.takeResponseBodyAsStream", {
|
|
requestId,
|
|
});
|
|
|
|
return this.takeStreamIter(cdp, stream);
|
|
}
|
|
}
|
|
|
|
// =================================================================
|
|
class NetworkLoadStreamAsyncFetcher extends AsyncFetcher {
|
|
cdp: CDPSession;
|
|
|
|
constructor(opts: NetworkLoadAsyncFetchOptions) {
|
|
super(opts);
|
|
this.cdp = opts.cdp;
|
|
}
|
|
|
|
async _doFetch() {
|
|
const { reqresp, cdp } = this;
|
|
const { url } = reqresp;
|
|
logger.debug("Async started: loadNetworkResource", { url }, "recorder");
|
|
|
|
const options = { disableCache: false, includeCredentials: true };
|
|
|
|
let result = null;
|
|
|
|
try {
|
|
result = await cdp.send("Network.loadNetworkResource", {
|
|
frameId: reqresp.frameId,
|
|
url,
|
|
options,
|
|
});
|
|
} catch (e) {
|
|
logger.debug(
|
|
"Network.loadNetworkResource failed, attempting node fetch",
|
|
{ url, ...formatErr(e), ...this.recorder.logDetails },
|
|
"recorder",
|
|
);
|
|
return await super._doFetch();
|
|
}
|
|
|
|
const { stream, headers, httpStatusCode, success, netError, netErrorName } =
|
|
result.resource;
|
|
|
|
if (!success || !stream) {
|
|
//await this.recorder.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url);
|
|
logger.debug(
|
|
"Network.loadNetworkResource failed, attempting node fetch",
|
|
{
|
|
url,
|
|
netErrorName,
|
|
netError,
|
|
httpStatusCode,
|
|
...this.recorder.logDetails,
|
|
},
|
|
"recorder",
|
|
);
|
|
return await super._doFetch();
|
|
}
|
|
|
|
if (
|
|
reqresp.expectedSize < 0 &&
|
|
headers &&
|
|
headers["content-length"] &&
|
|
!headers["content-encoding"]
|
|
) {
|
|
reqresp.expectedSize = Number(headers["content-length"] || -1);
|
|
}
|
|
|
|
if (reqresp.expectedSize === 0) {
|
|
reqresp.payload = new Uint8Array();
|
|
return;
|
|
}
|
|
|
|
reqresp.setStatus(httpStatusCode || 200);
|
|
reqresp.responseHeaders = headers || {};
|
|
|
|
return this.takeStreamIter(cdp, stream);
|
|
}
|
|
}
|
|
|
|
// =================================================================
|
|
// response
|
|
function createResponse(
|
|
reqresp: RequestResponseInfo,
|
|
pageid: string,
|
|
contentIter?: AsyncIterable<Uint8Array> | Iterable<Uint8Array>,
|
|
) {
|
|
if (reqresp.isRemoveRange && reqresp.status === 206) {
|
|
reqresp.setStatus(200);
|
|
}
|
|
|
|
const url = reqresp.url;
|
|
const warcVersion = "WARC/1.1";
|
|
const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`;
|
|
const date = new Date(reqresp.ts).toISOString();
|
|
|
|
if (!reqresp.payload) {
|
|
reqresp.payload = new Uint8Array();
|
|
}
|
|
|
|
const httpHeaders = reqresp.getResponseHeadersDict(reqresp.payload.length);
|
|
|
|
const warcHeaders: Record<string, string> = {
|
|
"WARC-Page-ID": pageid,
|
|
};
|
|
|
|
if (reqresp.resourceType) {
|
|
warcHeaders["WARC-Resource-Type"] = reqresp.resourceType;
|
|
}
|
|
|
|
if (!contentIter) {
|
|
contentIter = [reqresp.payload] as Iterable<Uint8Array>;
|
|
}
|
|
|
|
if (Object.keys(reqresp.extraOpts).length) {
|
|
warcHeaders["WARC-JSON-Metadata"] = JSON.stringify(reqresp.extraOpts);
|
|
}
|
|
|
|
return WARCRecord.create(
|
|
{
|
|
url,
|
|
date,
|
|
warcVersion,
|
|
type: "response",
|
|
warcHeaders,
|
|
httpHeaders,
|
|
statusline,
|
|
},
|
|
contentIter,
|
|
);
|
|
}
|
|
|
|
// =================================================================
|
|
// request
|
|
function createRequest(
|
|
reqresp: RequestResponseInfo,
|
|
responseRecord: WARCRecord,
|
|
pageid: string,
|
|
) {
|
|
const url = reqresp.url;
|
|
const warcVersion = "WARC/1.1";
|
|
const method = reqresp.method;
|
|
|
|
const urlParsed = new URL(url);
|
|
|
|
const statusline = `${method} ${url.slice(urlParsed.origin.length)} HTTP/1.1`;
|
|
|
|
const requestBody = reqresp.postData
|
|
? [encoder.encode(reqresp.postData)]
|
|
: [];
|
|
|
|
const httpHeaders = reqresp.getRequestHeadersDict();
|
|
|
|
const warcHeaders: Record<string, string> = {
|
|
"WARC-Concurrent-To": responseRecord.warcHeader("WARC-Record-ID")!,
|
|
"WARC-Page-ID": pageid,
|
|
};
|
|
|
|
if (reqresp.resourceType) {
|
|
warcHeaders["WARC-Resource-Type"] = reqresp.resourceType;
|
|
}
|
|
|
|
const date = responseRecord.warcDate || undefined;
|
|
|
|
return WARCRecord.create(
|
|
{
|
|
url,
|
|
date,
|
|
warcVersion,
|
|
type: "request",
|
|
warcHeaders,
|
|
httpHeaders,
|
|
statusline,
|
|
},
|
|
requestBody,
|
|
);
|
|
}
|