import PQueue from "p-queue";

import { logger, formatErr } from "./logger.js";
import { sleep, timedRun } from "./timing.js";
import {
  RequestResponseInfo,
  isHTMLMime,
  isRedirectStatus,
} from "./reqresp.js";

import { fetch, Response } from "undici";

import {
  getCustomRewriter,
  removeRangeAsQuery,
  rewriteDASH,
  rewriteHLS,
} from "@webrecorder/wabac";

import { WARCRecord } from "warcio";
import { TempFileBuffer, WARCSerializer } from "warcio/node";
import { WARCWriter } from "./warcwriter.js";
import { RedisCrawlState, WorkerId } from "./state.js";
import { CDPSession, Protocol } from "puppeteer-core";
import { Crawler } from "../crawler.js";
import { getProxyDispatcher } from "./proxy.js";
import { ScopedSeed } from "./seeds.js";
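
// Size thresholds (bytes) used below: MAX_BROWSER_DEFAULT_FETCH_SIZE and
// MAX_TEXT_REWRITE_SIZE feed shouldStream() and rewriteResponse(), while
// MAX_NETWORK_LOAD_SIZE caps what addAsyncFetch() will route through the
// browser network stack.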
const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
const MAX_TEXT_REWRITE_SIZE = 25_000_000;

const MAX_NETWORK_LOAD_SIZE = 200_000_000;

const TAKE_STREAM_BUFF_SIZE = 1024 * 64;
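
// Redis keys apparently used to dedupe work across workers: one for async
// fetches already in flight, one for responses already written to WARC
// (an assumption based on their names, as the consuming code lives in the
// fetcher classes further down this file).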
const ASYNC_FETCH_DUPE_KEY = "s:fetchdupe";

const WRITE_DUPE_KEY = "s:writedupe";

const MIME_EVENT_STREAM = "text/event-stream";
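
// MIME types eligible for content rewriting; see rewriteResponse() and
// isEssentialResource() below.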
const RW_MIME_TYPES = [
  "application/x-mpegURL",
  "application/vnd.apple.mpegurl",
  "application/dash+xml",
  "text/html",
  "application/json",
  "text/javascript",
  "application/javascript",
  "application/x-javascript",
];

const encoder = new TextEncoder();

// =================================================================
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function logNetwork(msg: string, data: any) {
  logger.debug(msg, data, "recorderNetwork");
}

// =================================================================
export type PageInfoValue = {
  status: number;
  mime?: string;
  type?: string;
  error?: string;
  fromBrowserCache?: boolean;
};

// =================================================================
export type PageInfoRecord = {
  pageid: string;
  urls: Record<string, PageInfoValue>;
  url: string;
  ts?: Date;
  tsStatus: number;
  counts: {
    jsErrors: number;
  };
};

// =================================================================
export type AsyncFetchOptions = {
  reqresp: RequestResponseInfo;
  expectedSize?: number;
  // eslint-disable-next-line no-use-before-define
  recorder: Recorder;
  networkId: string;
  filter?: (resp: Response) => boolean;
  ignoreDupe?: boolean;
  maxFetchSize?: number;
  manualRedirect?: boolean;
};

// =================================================================
export type DirectFetchRequest = {
  url: string;
  headers: Record<string, string>;
  cdp: CDPSession;
};

// =================================================================
export type DirectFetchResponse = {
  fetched: boolean;
  mime: string;
  ts: Date;
};

// =================================================================
export type NetworkLoadAsyncFetchOptions = AsyncFetchOptions & {
  cdp: CDPSession;
};

// =================================================================
export type ResponseStreamAsyncFetchOptions = NetworkLoadAsyncFetchOptions & {
  requestId: string;
};
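
// The options types above mirror the fetcher hierarchy used later in this
// file: AsyncFetcher takes AsyncFetchOptions, NetworkLoadStreamAsyncFetcher
// adds a CDP session, and ResponseStreamAsyncFetcher additionally needs the
// paused Fetch requestId so it can stream the intercepted response.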

// =================================================================
export class Recorder {
  workerid: WorkerId;
  crawler: Crawler;
  crawlState: RedisCrawlState;

  fetcherQ: PQueue;

  pendingRequests!: Map<string, RequestResponseInfo>;
  skipIds!: Set<string>;
  pageInfo!: PageInfoRecord;
  mainFrameId: string | null = null;
  skipRangeUrls!: Map<string, number>;

  swTargetId?: string | null;
  swFrameIds = new Set<string>();
  swUrls = new Set<string>();

  // TODO: Fix this the next time the file is edited.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  logDetails: Record<string, any> = {};

  pageFinished = false;

  gzip = true;

  writer: WARCWriter;

  pageUrl!: string;
  pageid!: string;

  pageSeed?: ScopedSeed;

  frameIdToExecId: Map<string, number> | null;

  constructor({
    workerid,
    writer,
    crawler,
  }: {
    workerid: WorkerId;
    writer: WARCWriter;
    crawler: Crawler;
  }) {
    this.workerid = workerid;
    this.crawler = crawler;
    this.crawlState = crawler.crawlState;

    this.writer = writer;

    this.fetcherQ = new PQueue({ concurrency: 1 });

    this.frameIdToExecId = null;
  }
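
  // Typical lifecycle, as suggested by the method names below (a sketch,
  // not a contract): onCreatePage() wires up CDP listeners, startPage()
  // resets per-page state, awaitPageResources() drains pending requests,
  // onClosePage() marks the page done, and onDone() flushes the fetcher
  // queue and WARC writer at crawl end.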

  async onCreatePage({
    cdp,
    frameIdToExecId,
  }: {
    cdp: CDPSession;
    frameIdToExecId: Map<string, number>;
  }) {
    this.frameIdToExecId = frameIdToExecId;
    this.pageFinished = false;

    // Fetch
    cdp.on("Fetch.requestPaused", (params) => {
      void this.handleRequestPaused(params, cdp);
    });

    await cdp.send("Fetch.enable", {
      patterns: [{ urlPattern: "*", requestStage: "Response" }],
    });

    // Response
    cdp.on("Network.responseReceived", (params) =>
      this.handleResponseReceived(params),
    );

    cdp.on("Network.responseReceivedExtraInfo", (params) =>
      this.handleResponseReceivedExtraInfo(params),
    );

    // Cache
    cdp.on("Network.requestServedFromCache", (params) =>
      this.handleRequestServedFromCache(params),
    );

    // Request
    cdp.on("Network.requestWillBeSent", (params) =>
      this.handleRequestWillBeSent(params),
    );

    cdp.on("Network.requestWillBeSentExtraInfo", (params) =>
      this.handleRequestExtraInfo(params),
    );

    // Loading
    cdp.on("Network.loadingFinished", (params) =>
      this.handleLoadingFinished(params),
    );

    cdp.on("Network.loadingFailed", (params) =>
      this.handleLoadingFailed(params),
    );

    await cdp.send("Network.enable");

    // Target
    cdp.on("Target.attachedToTarget", async (params) => {
      const { url, type, targetId } = params.targetInfo;
      if (type === "service_worker") {
        this.swTargetId = targetId;
        this.swUrls.add(url);
      }
    });

    cdp.on("Target.detachedFromTarget", async (params) => {
      const { targetId } = params;
      if (this.swTargetId && targetId === this.swTargetId) {
        this.swUrls.clear();
        this.swFrameIds.clear();
        this.swTargetId = null;
      }
    });

    await cdp.send("Target.setAutoAttach", {
      autoAttach: true,
      waitForDebuggerOnStart: false,
      flatten: true,
    });

    // Console
    cdp.on("Console.messageAdded", (params) => {
      const { message } = params;
      const { source, level } = message;
      if (source === "console-api" && level === "error") {
        this.pageInfo.counts.jsErrors++;
      }
    });

    await cdp.send("Console.enable");
  }

  hasFrame(frameId: string) {
    return this.swFrameIds.has(frameId) || this.frameIdToExecId?.has(frameId);
  }

  handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) {
    const { requestId, response, type } = params;

    const { mimeType, url, headers } = response;

    logNetwork("Network.responseReceived", {
      requestId,
      url,
      ...this.logDetails,
    });

    if (mimeType === MIME_EVENT_STREAM) {
      return;
    }

    if (this.shouldSkip(headers, url, undefined, type)) {
      return;
    }

    const reqresp = this.pendingReqResp(requestId);
    if (!reqresp) {
      return;
    }

    reqresp.fillResponse(response, type);
  }

  handleResponseReceivedExtraInfo(
    params: Protocol.Network.ResponseReceivedExtraInfoEvent,
  ) {
    const { requestId } = params;

    logNetwork("Network.responseReceivedExtraInfo", {
      requestId,
      ...this.logDetails,
    });

    const reqresp = this.pendingReqResp(requestId, true);
    if (reqresp) {
      reqresp.fillResponseReceivedExtraInfo(params);
    }
  }

  handleRequestServedFromCache(
    params: Protocol.Network.RequestServedFromCacheEvent,
  ) {
    const { requestId } = params;

    const reqresp = this.pendingReqResp(requestId, true);

    const url = reqresp?.url;

    logNetwork("Network.requestServedFromCache", {
      requestId,
      url,
      ...this.logDetails,
    });

    if (reqresp) {
      reqresp.fromCache = true;
    }
  }

  handleRequestWillBeSent(params: Protocol.Network.RequestWillBeSentEvent) {
    const { redirectResponse, requestId, request, type } = params;

    const { headers, method, url } = request;

    logNetwork("Network.requestWillBeSent", {
      requestId,
      url,
      redirectResponse,
      ...this.logDetails,
    });

    // handling redirect here, committing last response in redirect chain
    // request data stored from requestPaused
    if (redirectResponse) {
      this.handleRedirectResponse(params);
    } else {
      if (!this.shouldSkip(headers, url, method, type)) {
        const reqresp = this.pendingReqResp(requestId);
        if (reqresp) {
          reqresp.fillRequest(request, type || "");
        }
      }
    }
  }

  handleRequestExtraInfo(
    params: Protocol.Network.RequestWillBeSentExtraInfoEvent,
  ) {
    const { requestId, headers } = params;

    logNetwork("Network.requestWillBeSentExtraInfo", {
      requestId,
      ...this.logDetails,
    });

    if (!this.shouldSkip(headers)) {
      const reqresp = this.pendingReqResp(requestId, true);
      if (reqresp) {
        reqresp.fillRequestExtraInfo(params);
      }
    }
  }

  handleRedirectResponse(params: Protocol.Network.RequestWillBeSentEvent) {
    const { requestId, redirectResponse, type } = params;

    // remove and serialize, but allow reusing requestId,
    // as redirect chain may reuse same requestId for subsequent request
    const reqresp = this.removeReqResp(requestId, true);
    if (!reqresp || !redirectResponse) {
      return;
    }

    reqresp.fillResponse(redirectResponse, type);

    if (reqresp.isSelfRedirect()) {
      logger.warn(
        "Skipping self redirect",
        { url: reqresp.url, status: reqresp.status, ...this.logDetails },
        "recorder",
      );
      return;
    }

    try {
      new URL(reqresp.url);
    } catch (e) {
      logger.warn(
        "Skipping invalid URL from redirect",
        { url: reqresp.url, status: reqresp.status, ...this.logDetails },
        "recorder",
      );
      return;
    }

    this.serializeToWARC(reqresp).catch((e) =>
      logger.warn("Error Serializing to WARC", e, "recorder"),
    );
  }

  handleLoadingFailed(params: Protocol.Network.LoadingFailedEvent) {
    const { errorText, type, requestId } = params;

    const reqresp = this.pendingReqResp(requestId, true);

    const url = reqresp?.url;

    logNetwork("Network.loadingFailed", {
      requestId,
      url,
      errorText,
      type,
      ...this.logDetails,
    });

    if (!reqresp) {
      return;
    }

    if (type) {
      reqresp.resourceType = type.toLowerCase();
    }

    switch (errorText) {
      case "net::ERR_BLOCKED_BY_CLIENT":
        logNetwork("Request blocked", { url, errorText, ...this.logDetails });
        break;

      case "net::ERR_ABORTED":
        // check if this is a false positive -- a valid download that's already been fetched:
        // the abort is just for the page, but the download will succeed
        if (
          (type === "Document" || type === "Media") &&
          reqresp.isValidBinary()
        ) {
          this.removeReqResp(requestId);
          return this.serializeToWARC(reqresp);
        } else if (url && reqresp.requestHeaders && type === "Media") {
          this.removeReqResp(requestId);
          logger.warn(
            "Attempt direct fetch of failed request",
            { url, ...this.logDetails },
            "recorder",
          );

          reqresp.deleteRange();
          reqresp.requestId = "0";

          const fetcher = new AsyncFetcher({
            reqresp,
            expectedSize: reqresp.expectedSize ? reqresp.expectedSize : -1,
            recorder: this,
            networkId: "0",
          });
          void this.fetcherQ.add(() => fetcher.load());
          return;
        }
        break;

      default:
        logger.warn(
          "Request failed",
          { url, errorText, type, status: reqresp.status, ...this.logDetails },
          "recorder",
        );
    }
    reqresp.status = 0;
    reqresp.errorText = errorText;
    this.addPageRecord(reqresp);

    this.removeReqResp(requestId);
  }

  handleLoadingFinished(params: Protocol.Network.LoadingFinishedEvent) {
    const { requestId } = params;

    const reqresp = this.pendingReqResp(requestId, true);

    const url = reqresp?.url;

    logNetwork("Network.loadingFinished", {
      requestId,
      url,
      ...this.logDetails,
    });

    if (!reqresp || reqresp.asyncLoading) {
      return;
    }

    this.removeReqResp(requestId);

    if (!this.isValidUrl(url)) {
      return;
    }

    this.serializeToWARC(reqresp).catch((e) =>
      logger.warn("Error Serializing to WARC", e, "recorder"),
    );
  }

  async handleRequestPaused(
    params: Protocol.Fetch.RequestPausedEvent,
    cdp: CDPSession,
    isBrowserContext = false,
  ) {
    const {
      requestId,
      request,
      responseStatusCode,
      responseErrorReason,
      resourceType,
      networkId,
    } = params;
    const { method, headers, url } = request;

    logNetwork("Fetch.requestPaused", {
      requestId,
      networkId,
      url,
      ...this.logDetails,
    });

    let continued = false;

    try {
      if (
        responseStatusCode &&
        !responseErrorReason &&
        !this.shouldSkip(headers, url, method, resourceType)
      ) {
        continued = await this.handleFetchResponse(
          params,
          cdp,
          isBrowserContext,
        );
      }
    } catch (e) {
      logger.error(
        "Error handling response, probably skipping URL",
        { url, ...formatErr(e), ...this.logDetails },
        "recorder",
      );
    }

    if (!continued) {
      try {
        await cdp.send("Fetch.continueResponse", { requestId });
      } catch (e) {
        logger.debug(
          "continueResponse failed",
          { requestId, networkId, url, ...formatErr(e), ...this.logDetails },
          "recorder",
        );
      }
    }
  }
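
  // Decide what to do with a paused response: dedupe or re-fetch partial
  // (206) responses, strip range-as-query URLs, optionally block and
  // timestamp the top-level page response, then either stream the body,
  // queue an async fetch, or read it directly via Fetch.getResponseBody
  // and fulfill the request (possibly rewritten).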

  async handleFetchResponse(
    params: Protocol.Fetch.RequestPausedEvent,
    cdp: CDPSession,
    isBrowserContext: boolean,
  ) {
    const { request } = params;
    const { url } = request;
    const {
      requestId,
      responseErrorReason,
      responseStatusCode,
      responseHeaders,
    } = params;

    const networkId = params.networkId || requestId;

    const reqresp = this.pendingReqResp(networkId);

    if (!reqresp) {
      return false;
    }

    if (responseErrorReason) {
      logger.warn(
        "Skipping failed response",
        { url, reason: responseErrorReason, ...this.logDetails },
        "recorder",
      );
      return false;
    }

    const contentLen = this._getContentLen(responseHeaders);

    if (responseStatusCode === 206) {
      const range = this._getContentRange(responseHeaders);
      if (range === `bytes 0-${contentLen - 1}/${contentLen}`) {
        logger.debug(
          "Keep 206 Response, Full Range",
          { range, contentLen, url, networkId, ...this.logDetails },
          "recorder",
        );
      } else if (range?.startsWith("bytes 0-")) {
        logger.debug(
          "Re-request 206 Response without range",
          { range, contentLen, url, ...this.logDetails },
          "recorder",
        );
        this.removeReqResp(networkId);

        if (!reqresp.fetchContinued) {
          const reqrespNew = new RequestResponseInfo("0");
          reqrespNew.fillRequest(params.request, params.resourceType);
          reqrespNew.deleteRange();
          reqrespNew.frameId = params.frameId;

          this.addAsyncFetch(
            {
              reqresp: reqrespNew,
              expectedSize: parseInt(range.split("/")[1]),
              recorder: this,
              networkId: "0",
              cdp,
            },
            contentLen,
          );
        }

        return false;
      } else {
        // logger.debug(
        //   "Skip 206 Response",
        //   { range, contentLen, url, ...this.logDetails },
        //   "recorder",
        // );
        this.removeReqResp(networkId);
        const count = this.skipRangeUrls.get(url) || 0;
        if (count > 2) {
          // just fail additional range requests to save bandwidth, as these are not being recorded
          await cdp.send("Fetch.failRequest", {
            requestId,
            errorReason: "BlockedByResponse",
          });
          return true;
        }
        this.skipRangeUrls.set(url, count + 1);
        return false;
      }
    } else {
      const filteredUrl = removeRangeAsQuery(url);
      if (filteredUrl) {
        this.removeReqResp(networkId);

        logger.debug(
          "Removed range in query, async fetching full URL",
          { url, ...this.logDetails },
          "recorder",
        );

        if (!reqresp.fetchContinued) {
          const reqrespNew = new RequestResponseInfo("0");
          reqrespNew.fillRequest(params.request, params.resourceType);
          reqrespNew.url = filteredUrl;
          reqrespNew.frameId = params.frameId;

          this.addAsyncFetch({
            reqresp: reqrespNew,
            recorder: this,
            networkId: "0",
            cdp,
          });
        }
        return false;
      }
    }

    // indicate that this is intercepted in the page context
    if (!isBrowserContext) {
      reqresp.inPageContext = true;
    }

    // Already being handled by a different handler
    if (reqresp.fetchContinued) {
      return false;
    }

    reqresp.fetchContinued = true;

    reqresp.fillFetchRequestPaused(params);

    if (
      url === this.pageUrl &&
      (!this.pageInfo.ts ||
        (responseStatusCode && responseStatusCode <= this.pageInfo.tsStatus))
    ) {
      const errorReason = await this.blockPageResponse(
        url,
        reqresp,
        responseHeaders,
      );

      if (errorReason) {
        await cdp.send("Fetch.failRequest", {
          requestId,
          errorReason,
        });
        return true;
      }

      logger.debug("Setting page timestamp", {
        ts: reqresp.ts,
        url,
        status: responseStatusCode,
      });
      this.pageInfo.ts = reqresp.ts;
      this.pageInfo.tsStatus = responseStatusCode!;
      this.mainFrameId = params.frameId;
    }

    if (this.noResponseForStatus(responseStatusCode)) {
      reqresp.payload = new Uint8Array();
      return false;
    }

    const mimeType = this.getMimeType(responseHeaders) || "";

    let streamingConsume = false;

    if (
      this.shouldStream(
        contentLen,
        responseStatusCode || 0,
        reqresp.resourceType || "",
        mimeType,
      )
    ) {
      const opts: ResponseStreamAsyncFetchOptions = {
        reqresp,
        expectedSize: contentLen,
        recorder: this,
        networkId,
        cdp,
        requestId,
      };

      // fetching using response stream as first attempt,
      // await here and then either call Fetch.fulfillRequest, or if dupe, return false
      const fetcher = new ResponseStreamAsyncFetcher(opts);
      const res = await fetcher.load();
      switch (res) {
        case "dupe":
          this.removeReqResp(networkId);
          return false;

        case "fetched":
          streamingConsume = true;
          break;
      }

      // if not consumed via takeStream, attempt async loading
      if (!streamingConsume) {
        this.addAsyncFetch(opts, contentLen);
        return false;
      }
    } else {
      try {
        logNetwork("Fetching response", {
          sizeExpected: this._getContentLen(responseHeaders),
          url,
          networkId,
          ...this.logDetails,
        });
        const { body, base64Encoded } = await cdp.send(
          "Fetch.getResponseBody",
          { requestId },
        );
        reqresp.payload = Buffer.from(body, base64Encoded ? "base64" : "utf-8");
        logNetwork("Fetch done", {
          size: reqresp.payload.length,
          url,
          networkId,
          ...this.logDetails,
        });
      } catch (e) {
        logger.warn(
          "Failed to load response body",
          { url, networkId, ...formatErr(e), ...this.logDetails },
          "recorder",
        );
        return false;
      }
    }

    const rewritten = await this.rewriteResponse(reqresp, mimeType);

    // if in browser context, and not also intercepted in page context,
    // serialize here, as won't be getting a loadingFinished message for it
    if (
      isBrowserContext &&
      !reqresp.inPageContext &&
      !reqresp.asyncLoading &&
      reqresp.payload
    ) {
      this.removeReqResp(networkId);
      await this.serializeToWARC(reqresp);
    }

    // not rewritten, and not streaming, return false to continue
    if (!rewritten && !streamingConsume) {
      if (!reqresp.payload) {
        logger.error(
          "Unable to get payload, skipping recording",
          { url, ...this.logDetails },
          "recorder",
        );
        this.removeReqResp(networkId);
      }
      return false;
    }

    // if has payload, encode it, otherwise return empty string
    const body =
      reqresp.payload && reqresp.payload.length
        ? Buffer.from(reqresp.payload).toString("base64")
        : "";

    try {
      await cdp.send("Fetch.fulfillRequest", {
        requestId,
        responseCode: responseStatusCode || 0,
        responseHeaders,
        body,
      });
    } catch (e) {
      const { resourceType } = reqresp;
      const msg =
        resourceType === "document"
          ? "document not loaded in browser, possibly other URLs missing"
          : "URL not loaded in browser";

      logger.debug(msg, { url, resourceType, e }, "recorder");
    }

    return true;
  }
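
  // Queue an out-of-band fetch. GET requests that originated in the page
  // context and fit within MAX_NETWORK_LOAD_SIZE go through the browser
  // network stack (NetworkLoadStreamAsyncFetcher); everything else falls
  // back to a plain node-side fetch (AsyncFetcher).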

  addAsyncFetch(opts: NetworkLoadAsyncFetchOptions, contentLen: number = -1) {
    let fetcher: AsyncFetcher;

    if (
      opts.reqresp.method !== "GET" ||
      contentLen > MAX_NETWORK_LOAD_SIZE ||
      !opts.reqresp.inPageContext
    ) {
      fetcher = new AsyncFetcher(opts);
    } else {
      fetcher = new NetworkLoadStreamAsyncFetcher(opts);
    }
    void this.fetcherQ.add(() => fetcher.load());
  }

  addExternalFetch(url: string, cdp: CDPSession) {
    logger.debug(
      "Handling fetch from behavior",
      { url, ...this.logDetails },
      "recorder",
    );
    const reqresp = new RequestResponseInfo("0");
    reqresp.url = url;
    reqresp.method = "GET";
    reqresp.frameId = this.mainFrameId || undefined;
    const fetcher = new NetworkLoadStreamAsyncFetcher({
      reqresp,
      recorder: this,
      cdp,
      networkId: "0",
    });
    void this.fetcherQ.add(() => fetcher.load());
    // return true if successful
    return true;
  }
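
  // If the top-level page response is a redirect whose target is excluded
  // by the page's seed scope, return an error reason so the caller can
  // fail the request instead of recording it.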

  async blockPageResponse(
    url: string,
    reqresp: RequestResponseInfo,
    responseHeaders?: Protocol.Fetch.HeaderEntry[],
  ): Promise<Protocol.Network.ErrorReason | undefined> {
    if (reqresp.isRedirectStatus()) {
      try {
        let loc = this.getLocation(responseHeaders);
        if (loc) {
          loc = new URL(loc, url).href;

          if (this.pageSeed && this.pageSeed.isExcluded(loc)) {
            logger.warn(
              "Skipping page that redirects to excluded URL",
              { newUrl: loc, origUrl: this.pageUrl },
              "recorder",
            );

            return "BlockedByResponse";
          }
        }
      } catch (e) {
        // ignore
        logger.debug("Redirect check error", e, "recorder");
      }
    }
  }

  startPage({ pageid, url }: { pageid: string; url: string }) {
    this.pageid = pageid;
    this.pageUrl = url;
    this.logDetails = { page: url, workerid: this.workerid };
    if (this.pendingRequests && this.pendingRequests.size) {
      logger.debug(
        "Interrupting timed out requests, moving to next page",
        this.logDetails,
        "recorder",
      );
    }
    this.pendingRequests = new Map();
    this.skipIds = new Set();
    this.skipRangeUrls = new Map<string, number>();
    this.pageFinished = false;
    this.pageInfo = {
      pageid,
      urls: {},
      url,
      counts: { jsErrors: 0 },
      tsStatus: 999,
    };
    this.mainFrameId = null;
  }

  addPageRecord(reqresp: RequestResponseInfo) {
    if (this.isValidUrl(reqresp.url)) {
      const { status, resourceType: type } = reqresp;
      const mime = reqresp.getMimeType();
      const info: PageInfoValue = { status, mime, type };
      if (reqresp.errorText) {
        info.error = reqresp.errorText;
      }
      // TODO: revisit if we want to record this later
      // if (reqresp.fromCache) {
      //   info.fromBrowserCache = true;
      // }
      this.pageInfo.urls[reqresp.getCanonURL()] = info;
    }
  }

  writePageInfoRecord() {
    const text = JSON.stringify(this.pageInfo, null, 2);

    const url = this.pageUrl;

    this.writer.writeNewResourceRecord(
      {
        buffer: new TextEncoder().encode(text),
        resourceType: "pageinfo",
        contentType: "application/json",
        url,
      },
      { type: "pageinfo", url },
      "recorder",
    );

    return this.pageInfo.ts;
  }
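
  // Drain pending requests after page load: serialize anything already
  // complete, drop requests that never started, then poll in 5-second
  // intervals until the remainder finish or the page/crawl is done,
  // dropping whatever is still pending at the end.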

  async awaitPageResources() {
    for (const [requestId, reqresp] of this.pendingRequests.entries()) {
      if (reqresp.payload && !reqresp.asyncLoading) {
        this.removeReqResp(requestId);
        await this.serializeToWARC(reqresp);
        // if no url, and not fetch intercept or async loading,
        // drop this request, as it was not being loaded
      } else if (
        !reqresp.url ||
        (!reqresp.intercepting && !reqresp.asyncLoading)
      ) {
        logger.debug(
          "Removing pending request that was never fetched",
          { requestId, url: reqresp.url, ...this.logDetails },
          "recorder",
        );
        this.removeReqResp(requestId);
      }
    }

    let numPending = this.pendingRequests.size;

    let pending = [];
    while (
      numPending &&
      !this.pageFinished &&
      !this.crawler.interruptReason &&
      !this.crawler.postCrawling
    ) {
      pending = [];
      for (const [requestId, reqresp] of this.pendingRequests.entries()) {
        const url = reqresp.url || "";
        const entry: {
          requestId: string;
          url: string;
          expectedSize?: number;
          readSize?: number;
          resourceType?: string;
        } = { requestId, url };
        if (reqresp.expectedSize) {
          entry.expectedSize = reqresp.expectedSize;
        }
        if (reqresp.readSize) {
          entry.readSize = reqresp.readSize;
        }
        if (reqresp.resourceType) {
          entry.resourceType = reqresp.resourceType;
        }
        pending.push(entry);
      }

      logger.debug(
        "Finishing pending requests for page",
        { numPending, pending, ...this.logDetails },
        "recorder",
      );
      await sleep(5.0);
      numPending = this.pendingRequests.size;
    }

    if (this.pendingRequests.size) {
      logger.warn(
        "Dropping timed out requests",
        { numPending, pending, ...this.logDetails },
        "recorder",
      );
      for (const requestId of this.pendingRequests.keys()) {
        this.removeReqResp(requestId);
      }
    }
  }

  async onClosePage() {
    // Any page-specific handling before page is closed.
    this.frameIdToExecId = null;

    this.pageFinished = true;
  }

  async onDone(timeout: number) {
    await this.crawlState.setStatus("pending-wait");

    const finishFetch = async () => {
      logger.debug("Finishing Fetcher Queue", this.logDetails, "recorder");
      await this.fetcherQ.onIdle();
    };

    if (timeout > 0) {
      await timedRun(
        finishFetch(),
        timeout,
        "Finishing Fetch Timed Out",
        this.logDetails,
        "recorder",
      );
    }

    logger.debug("Finishing WARC writing", this.logDetails, "recorder");

    await this.writer.flush();
  }

  shouldSkip(
    headers: Protocol.Network.Headers,
    url?: string,
    method?: string,
    resourceType?: string,
  ) {
    if (headers && !method) {
      method = headers[":method"];
    }

    // only check if url is provided, since it is optional
    if (url && !this.isValidUrl(url)) {
      return true;
    }

    if (method === "OPTIONS" || method === "HEAD") {
      return true;
    }

    if (["EventSource", "WebSocket", "Ping"].includes(resourceType || "")) {
      return true;
    }

    // beacon
    if (resourceType === "Other" && method === "POST") {
      return true;
    }

    // skip eventsource, resourceType may not be set correctly
    if (
      headers &&
      (headers["accept"] === MIME_EVENT_STREAM ||
        headers["Accept"] === MIME_EVENT_STREAM)
    ) {
      return true;
    }

    return false;
  }
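
  // Rewrite response bodies in place where needed: HLS and DASH manifests
  // always, and HTML/JS/JSON only when a custom rewriter matches the URL.
  // Returns true only if the payload actually changed.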

  async rewriteResponse(reqresp: RequestResponseInfo, contentType: string) {
    const { url, extraOpts, payload } = reqresp;

    // don't rewrite if payload is missing or too big
    if (!payload || !payload.length || payload.length > MAX_TEXT_REWRITE_SIZE) {
      return false;
    }

    let newString = null;
    let string = null;

    switch (contentType) {
      case "application/x-mpegURL":
      case "application/vnd.apple.mpegurl":
        string = payload.toString();
        newString = rewriteHLS(string, { save: extraOpts });
        break;

      case "application/dash+xml":
        string = payload.toString();
        newString = rewriteDASH(string, { save: extraOpts });
        break;

      case "text/html":
      case "application/json":
      case "text/javascript":
      case "application/javascript":
      case "application/x-javascript": {
        const rw = getCustomRewriter(url, isHTMLMime(contentType));

        if (rw) {
          string = payload.toString();
          newString = rw.rewrite(string, { live: true, save: extraOpts });
        }
        break;
      }
    }

    if (!newString) {
      return false;
    }

    if (newString !== string) {
      extraOpts.rewritten = 1;
      logger.debug(
        "Content Rewritten",
        { url, ...this.logDetails },
        "recorder",
      );
      reqresp.payload = encoder.encode(newString);
      reqresp.isRemoveRange = true;
      return true;
    } else {
      return false;
    }
  }

  isEssentialResource(resourceType: string, contentType: string) {
    if (resourceType === "script" || resourceType === "stylesheet") {
      return true;
    }

    if (RW_MIME_TYPES.includes(contentType)) {
      return true;
    }

    return false;
  }
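
  // Streaming decision, summarized (length in bytes, "essential" per
  // isEssentialResource):
  //   length > MAX_TEXT_REWRITE_SIZE                      -> stream (no rewriting)
  //   length > MAX_BROWSER_DEFAULT_FETCH_SIZE, not essential -> stream
  //   length unknown, not essential, 2xx status           -> stream
  //   otherwise                                           -> fetch fully in memory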

  shouldStream(
    contentLength: number,
    responseStatusCode: number,
    resourceType: string,
    mimeType: string,
  ) {
    // if contentLength is too large even for rewriting, always stream, will not do rewriting
    // even if text
    if (contentLength > MAX_TEXT_REWRITE_SIZE) {
      return true;
    }

    // if contentLength is larger and not an essential resource, stream;
    // essential resources get a full fetch so they can be rewritten
    if (
      contentLength > MAX_BROWSER_DEFAULT_FETCH_SIZE &&
      !this.isEssentialResource(resourceType, mimeType)
    ) {
      return true;
    }

    // if contentLength is unknown, also stream if it's not an essential
    // resource and the status is 2xx; 3xx / 4xx / 5xx responses may have no
    // content-length and are likely small, so fetch those fully
    if (
      contentLength < 0 &&
      !this.isEssentialResource(resourceType, mimeType) &&
      responseStatusCode >= 200 &&
      responseStatusCode < 300
    ) {
      return true;
    }

    return false;
  }
|
|
|
|
|
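  // Extract the bare MIME type from a Content-Type header entry,
  // e.g. "text/html; charset=utf-8" -> "text/html".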
  protected getMimeType(
    headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
  ) {
    if (!headers) {
      return null;
    }
    for (const header of headers) {
      if (header.name.toLowerCase() === "content-type") {
        return header.value.split(";")[0];
      }
    }

    return null;
  }
  protected getLocation(
    headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
  ) {
    if (!headers) {
      return null;
    }
    for (const header of headers) {
      if (header.name.toLowerCase() === "location") {
        return header.value;
      }
    }

    return null;
  }
  protected _getContentLen(headers?: Protocol.Fetch.HeaderEntry[]) {
    if (!headers) {
      return -1;
    }
    for (const header of headers) {
      if (header.name.toLowerCase() === "content-length") {
        return Number(header.value);
      }
    }

    return -1;
  }
  _getContentRange(headers?: Protocol.Fetch.HeaderEntry[]) {
    if (!headers) {
      return null;
    }
    for (const header of headers) {
      if (header.name.toLowerCase() === "content-range") {
        return header.value;
      }
    }

    return null;
  }
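  // True if this status implies no response body will follow: missing status,
  // 204 No Content, or a 3xx redirect.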
  noResponseForStatus(status: number | undefined | null) {
    return !status || status === 204 || (status >= 300 && status < 400);
  }
  isValidUrl(url?: string) {
    return url && (url.startsWith("https:") || url.startsWith("http:"));
  }
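  // Get the in-flight RequestResponseInfo for this request id, creating a
  // new one unless reuseOnly is set or the id has been explicitly skipped.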
  pendingReqResp(requestId: string, reuseOnly = false) {
    if (!this.pendingRequests.has(requestId)) {
      if (reuseOnly || !requestId) {
        return null;
      }
      if (this.skipIds.has(requestId)) {
        logNetwork("Skipping ignored id", { requestId });
        return null;
      }
      const reqresp = new RequestResponseInfo(requestId);
      this.pendingRequests.set(requestId, reqresp);
      return reqresp;
    } else {
      const reqresp = this.pendingRequests.get(requestId);
      if (reqresp && requestId !== reqresp.requestId) {
        logger.warn(
          "Invalid request id",
          { requestId, actualRequestId: reqresp.requestId },
          "recorder",
        );
      }
      return reqresp;
    }
  }
  removeReqResp(requestId: string, allowReuse = false) {
    const reqresp = this.pendingRequests.get(requestId);
    if (reqresp) {
      const { url, requestId } = reqresp;
      logNetwork("Removing reqresp", { requestId, url });
    }
    this.pendingRequests.delete(requestId);
    if (!allowReuse) {
      this.skipIds.add(requestId);
    }
    return reqresp;
  }
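  // Write this request/response pair to the WARC, unless it is cached,
  // marked to be skipped, or a duplicate of an already-written URL + status.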
  async serializeToWARC(reqresp: RequestResponseInfo) {
    // always include in pageinfo record if going to serialize to WARC,
    // even if serialization does not happen
    this.addPageRecord(reqresp);

    const { url, method, status, payload, requestId } = reqresp;

    // specifically log skipping of cached resources
    if (reqresp.isCached()) {
      logger.debug(
        "Skipping cached resource, should be already recorded",
        { url, status },
        "recorder",
      );
      return;
    } else if (reqresp.shouldSkipSave()) {
      logger.debug(
        "Skipping writing request/response",
        {
          requestId,
          url,
          method,
          status,
          payloadLength: (payload && payload.length) || 0,
        },
        "recorder",
      );
      return;
    }

    if (
      url &&
      method === "GET" &&
      !isRedirectStatus(status) &&
      !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status))
    ) {
      logNetwork("Skipping dupe", { url, status, ...this.logDetails });
      return;
    }

    const responseRecord = createResponse(reqresp, this.pageid);
    const requestRecord = createRequest(reqresp, responseRecord, this.pageid);

    this.writer.writeRecordPair(responseRecord, requestRecord);
  }
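  // Fetch a page URL directly (without the browser) when the response turns
  // out to be non-HTML, recording it via an AsyncFetcher with manual
  // redirect handling and the browser's cookies for the URL.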
  async directFetchCapture({
    url,
    headers,
    cdp,
  }: DirectFetchRequest): Promise<DirectFetchResponse> {
    const reqresp = new RequestResponseInfo("0");
    const ts = new Date();

    const cookie = await this.getCookieString(cdp, url);
    if (cookie) {
      headers["Cookie"] = cookie;
    }

    reqresp.url = url;
    reqresp.method = "GET";
    reqresp.requestHeaders = headers;
    reqresp.ts = ts;

    let mime: string = "";

    const filter = (resp: Response) => {
      // only direct load 200 responses
      if (resp.status !== 200) {
        return false;
      }

      const ct = resp.headers.get("content-type");
      if (ct) {
        mime = ct.split(";")[0];
      }

      const result = !isHTMLMime(mime);

      if (result) {
        logger.info(
          "Directly fetching page URL without browser",
          { url, ...this.logDetails },
          "fetch",
        );
      }

      return result;
    };

    // ignore dupes: if the previous URL was not a page, still load as a page;
    // if the previous was a page, we should not get here, as dupe pages are
    // tracked via the seen list
    const fetcher = new AsyncFetcher({
      reqresp,
      recorder: this,
      networkId: "0",
      filter,
      ignoreDupe: true,
      manualRedirect: true,
    });
    const res = await fetcher.load();

    // if we get here, the resource was not filtered out and had a 200 status code

    this.addPageRecord(reqresp);

    const fetched = res === "fetched";

    if (
      url === this.pageUrl &&
      fetched &&
      (!this.pageInfo.ts || 200 < this.pageInfo.tsStatus)
    ) {
      logger.debug("Setting page timestamp", { ts, url, status: 200 });
      this.pageInfo.ts = ts;
      this.pageInfo.tsStatus = 200;
    }

    return { fetched, mime, ts };
  }
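  // Build a Cookie header value for the given URL from the browser's cookie
  // jar via CDP Network.getCookies.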
  async getCookieString(cdp: CDPSession, url: string): Promise<string> {
    try {
      const cookieList: string[] = [];
      const { cookies } = await cdp.send("Network.getCookies", { urls: [url] });
      for (const { name, value } of cookies) {
        cookieList.push(`${name}=${value}`);
      }

      return cookieList.join(";");
    } catch (e) {
      logger.warn("Error getting cookies", { page: url, e }, "recorder");
      return "";
    }
  }
}
// =================================================================
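// AsyncFetcher fetches a resource with node fetch() (via undici), streams it
// into a WARC response/request record pair, and dedupes against URLs already
// fetched in the crawl state. Typical use (sketch, per directFetchCapture
// above):
//   const fetcher = new AsyncFetcher({ reqresp, recorder, networkId: "0" });
//   const result = await fetcher.load(); // "fetched" | "notfetched" | "dupe"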
class AsyncFetcher {
  reqresp: RequestResponseInfo;

  networkId: string;
  filter?: (resp: Response) => boolean;
  ignoreDupe = false;

  maxFetchSize: number;

  recorder: Recorder;

  manualRedirect = false;

  constructor({
    reqresp,
    expectedSize = -1,
    recorder,
    networkId,
    filter = undefined,
    ignoreDupe = false,
    maxFetchSize = MAX_BROWSER_DEFAULT_FETCH_SIZE,
    manualRedirect = false,
  }: AsyncFetchOptions) {
    this.reqresp = reqresp;
    this.reqresp.expectedSize = expectedSize;
    this.reqresp.asyncLoading = true;

    this.networkId = networkId;
    this.filter = filter;
    this.ignoreDupe = ignoreDupe;

    this.recorder = recorder;

    this.maxFetchSize = maxFetchSize;

    this.manualRedirect = manualRedirect;
  }
  async load() {
    const { reqresp, recorder, networkId } = this;
    const { url, status } = reqresp;

    const { pageid, crawlState, gzip, logDetails } = recorder;

    let fetched = "notfetched";

    try {
      if (
        reqresp.method === "GET" &&
        url &&
        !(await crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url, status))
      ) {
        if (!this.ignoreDupe) {
          this.reqresp.asyncLoading = false;
          return "dupe";
        }
      }

      const body = await this._doFetch();
      fetched = "fetched";

      const responseRecord = createResponse(reqresp, pageid, body);
      const requestRecord = createRequest(reqresp, responseRecord, pageid);

      const serializer = new WARCSerializer(responseRecord, {
        gzip,
        maxMemSize: this.maxFetchSize,
      });

      try {
        let readSize = await serializer.digestRecord();
        if (serializer.httpHeadersBuff) {
          readSize -= serializer.httpHeadersBuff.length;
        }
        reqresp.readSize = readSize;
        // set truncated field and recompute header buff
        if (reqresp.truncated) {
          responseRecord.warcHeaders.headers.set(
            "WARC-Truncated",
            reqresp.truncated,
          );
          // todo: keep this internal in warcio after adding new header
          serializer.warcHeadersBuff = encoder.encode(
            responseRecord.warcHeaders.toString(),
          );
        }
      } catch (e) {
        logger.error(
          "Error reading + digesting payload",
          { url, ...formatErr(e), ...logDetails },
          "recorder",
        );
      }

      if (
        reqresp.readSize === reqresp.expectedSize ||
        reqresp.expectedSize < 0
      ) {
        logger.debug(
          "Async fetch: streaming done",
          {
            size: reqresp.readSize,
            expected: reqresp.expectedSize,
            networkId,
            url,
            ...logDetails,
          },
          "recorder",
        );
      } else {
        logger.warn(
          "Async fetch: possible response size mismatch",
          {
            type: this.constructor.name,
            size: reqresp.readSize,
            expected: reqresp.expectedSize,
            url,
            ...logDetails,
          },
          "recorder",
        );
        // don't store truncated responses, even with a 200/206 status
        if (status === 206 || status === 200) {
          void serializer.externalBuffer?.purge();
          await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url, status);
          return "notfetched";
        }
      }

      const externalBuffer: TempFileBuffer =
        serializer.externalBuffer as TempFileBuffer;

      if (externalBuffer) {
        const { currSize, buffers, fh } = externalBuffer;

        // if fully buffered in memory, then populate the payload to return to browser
        if (buffers && buffers.length && !fh) {
          reqresp.payload = Buffer.concat(buffers, currSize);
          externalBuffer.buffers = [reqresp.payload];
        } else if (fh) {
          logger.debug(
            "Large payload written to WARC, but not returned to browser (would require rereading into memory)",
            { url, actualSize: reqresp.readSize, maxSize: this.maxFetchSize },
            "recorder",
          );
        }
      }

      if (Object.keys(reqresp.extraOpts).length) {
        responseRecord.warcHeaders.headers.set(
          "WARC-JSON-Metadata",
          JSON.stringify(reqresp.extraOpts),
        );
      }

      recorder.writer.writeRecordPair(
        responseRecord,
        requestRecord,
        serializer,
      );

      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    } catch (e: any) {
      await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url!, status);
      if (e.message === "response-filtered-out") {
        throw e;
      }
      logger.debug(
        "Streaming Fetch Error",
        { url, networkId, ...formatErr(e), ...logDetails },
        "recorder",
      );
      // indicate response is ultimately not valid
      reqresp.status = 0;
      reqresp.errorText = e.message;
    } finally {
      recorder.addPageRecord(reqresp);
      // exclude direct fetch request with fake id
      if (networkId !== "0") {
        recorder.removeReqResp(networkId);
      }
    }

    return fetched;
  }
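  // Fetch the URL with undici fetch(), optionally through the configured
  // proxy dispatcher, and return an async iterator over the body stream.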
  async _doFetch() {
    const { reqresp } = this;
    const { method, url } = reqresp;
    logger.debug("Async started: fetch", { url }, "recorder");

    const headers = reqresp.getRequestHeadersDict();

    let dispatcher = getProxyDispatcher();

    // capture the actual request headers sent by undici, so they can be
    // recorded in the WARC request record
    if (dispatcher) {
      dispatcher = dispatcher.compose((dispatch) => {
        return (opts, handler) => {
          if (opts.headers) {
            reqresp.requestHeaders = opts.headers as Record<string, string>;
          }
          return dispatch(opts, handler);
        };
      });
    }

    const resp = await fetch(url!, {
      method,
      headers,
      body: reqresp.postData || undefined,
      redirect: this.manualRedirect ? "manual" : "follow",
      dispatcher,
    });

    if (this.filter && !this.filter(resp)) {
      // if redirect and cancelled, read whole buffer to avoid possible node error event
      if (resp.status >= 300 && resp.status < 400) {
        await resp.arrayBuffer();
      } else {
        // otherwise, just cancel
        resp.body?.cancel().catch(() => {});
      }
      throw new Error("response-filtered-out");
    }

    if (
      reqresp.expectedSize < 0 &&
      resp.headers.get("content-length") &&
      !resp.headers.get("content-encoding")
    ) {
      reqresp.expectedSize = Number(resp.headers.get("content-length") || -1);
    }

    if (reqresp.expectedSize === 0) {
      reqresp.fillFetchResponse(resp);
      reqresp.payload = new Uint8Array();
      return;
    } else if (!resp.body) {
      throw new Error("fetch body missing, fetch aborted");
    }

    reqresp.fillFetchResponse(resp);

    return this.takeReader(resp.body.getReader());
  }
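  // Yield chunks from a WHATWG stream reader, marking the response as
  // truncated ("disconnect") if reading is interrupted.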
  async *takeReader(reader: ReadableStreamDefaultReader<Uint8Array>) {
    let size = 0;
    try {
      while (true) {
        const { value, done } = await reader.read();
        if (done) {
          break;
        }

        size += value.length;
        yield value;
      }
    } catch (e) {
      logger.warn(
        "takeReader interrupted",
        {
          size,
          url: this.reqresp.url,
          ...formatErr(e),
          ...this.recorder.logDetails,
        },
        "recorder",
      );
      this.reqresp.truncated = "disconnect";
    }
  }
  async *takeStreamIter(cdp: CDPSession, stream: Protocol.IO.StreamHandle) {
    let size = 0;
    try {
      while (true) {
        const { data, base64Encoded, eof } = await cdp.send("IO.read", {
          handle: stream,
          size: TAKE_STREAM_BUFF_SIZE,
        });
        const buff = Buffer.from(data, base64Encoded ? "base64" : "utf-8");

        size += buff.length;
        yield buff;

        if (eof) {
          break;
        }
      }
    } catch (e) {
      logger.warn(
        "takeStream interrupted",
        {
          size,
          url: this.reqresp.url,
          ...formatErr(e),
          ...this.recorder.logDetails,
        },
        "recorder",
      );
      this.reqresp.truncated = "disconnect";
    }
  }
}
// =================================================================
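// Streams a paused response body via CDP Fetch.takeResponseBodyAsStream.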
class ResponseStreamAsyncFetcher extends AsyncFetcher {
  cdp: CDPSession;
  requestId: string;

  constructor(opts: ResponseStreamAsyncFetchOptions) {
    super(opts);
    this.cdp = opts.cdp;
    this.requestId = opts.requestId;
  }

  async _doFetch() {
    const { requestId, reqresp, cdp } = this;
    const { url } = reqresp;
    logger.debug("Async started: takeStream", { url }, "recorder");

    const { stream } = await cdp.send("Fetch.takeResponseBodyAsStream", {
      requestId,
    });

    return this.takeStreamIter(cdp, stream);
  }
}
// =================================================================
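// Loads a resource via CDP Network.loadNetworkResource (using the browser's
// network stack), falling back to node fetch() if that fails.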
class NetworkLoadStreamAsyncFetcher extends AsyncFetcher {
  cdp: CDPSession;

  constructor(opts: NetworkLoadAsyncFetchOptions) {
    super(opts);
    this.cdp = opts.cdp;
  }

  async _doFetch() {
    const { reqresp, cdp } = this;
    const { url } = reqresp;
    logger.debug("Async started: loadNetworkResource", { url }, "recorder");

    const options = { disableCache: false, includeCredentials: true };

    let result = null;

    try {
      result = await cdp.send("Network.loadNetworkResource", {
        frameId: reqresp.frameId,
        url,
        options,
      });
    } catch (e) {
      logger.debug(
        "Network.loadNetworkResource failed, attempting node fetch",
        { url, ...formatErr(e), ...this.recorder.logDetails },
        "recorder",
      );
      return await super._doFetch();
    }

    const { stream, headers, httpStatusCode, success, netError, netErrorName } =
      result.resource;

    if (!success || !stream) {
      //await this.recorder.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url);
      logger.debug(
        "Network.loadNetworkResource failed, attempting node fetch",
        {
          url,
          netErrorName,
          netError,
          httpStatusCode,
          ...this.recorder.logDetails,
        },
        "recorder",
      );
      return await super._doFetch();
    }

    if (
      reqresp.expectedSize < 0 &&
      headers &&
      headers["content-length"] &&
      !headers["content-encoding"]
    ) {
      reqresp.expectedSize = Number(headers["content-length"] || -1);
    }

    if (reqresp.expectedSize === 0) {
      reqresp.payload = new Uint8Array();
      return;
    }

    reqresp.setStatus(httpStatusCode || 200);
    reqresp.responseHeaders = headers || {};

    return this.takeStreamIter(cdp, stream);
  }
}
// =================================================================
// response
function createResponse(
  reqresp: RequestResponseInfo,
  pageid: string,
  contentIter?: AsyncIterable<Uint8Array> | Iterable<Uint8Array>,
) {
  // if the range was removed via rewriting, store the 206 partial response
  // as a full 200 response
  if (reqresp.isRemoveRange && reqresp.status === 206) {
    reqresp.setStatus(200);
  }

  const url = reqresp.url;
  const warcVersion = "WARC/1.1";
  const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`;
  const date = new Date(reqresp.ts).toISOString();

  if (!reqresp.payload) {
    reqresp.payload = new Uint8Array();
  }

  const httpHeaders = reqresp.getResponseHeadersDict(reqresp.payload.length);

  const warcHeaders: Record<string, string> = {
    "WARC-Page-ID": pageid,
  };

  if (reqresp.resourceType) {
    warcHeaders["WARC-Resource-Type"] = reqresp.resourceType;
  }

  if (!contentIter) {
    contentIter = [reqresp.payload] as Iterable<Uint8Array>;
  }

  if (Object.keys(reqresp.extraOpts).length) {
    warcHeaders["WARC-JSON-Metadata"] = JSON.stringify(reqresp.extraOpts);
  }

  return WARCRecord.create(
    {
      url,
      date,
      warcVersion,
      type: "response",
      warcHeaders,
      httpHeaders,
      statusline,
    },
    contentIter,
  );
}
// =================================================================
// request
function createRequest(
  reqresp: RequestResponseInfo,
  responseRecord: WARCRecord,
  pageid: string,
) {
  const url = reqresp.url;
  const warcVersion = "WARC/1.1";
  const method = reqresp.method;

  const urlParsed = new URL(url);

  const statusline = `${method} ${url.slice(urlParsed.origin.length)} HTTP/1.1`;

  const requestBody = reqresp.postData
    ? [encoder.encode(reqresp.postData)]
    : [];

  const httpHeaders = reqresp.getRequestHeadersDict();

  const warcHeaders: Record<string, string> = {
    "WARC-Concurrent-To": responseRecord.warcHeader("WARC-Record-ID")!,
    "WARC-Page-ID": pageid,
  };

  if (reqresp.resourceType) {
    warcHeaders["WARC-Resource-Type"] = reqresp.resourceType;
  }

  const date = responseRecord.warcDate || undefined;

  return WARCRecord.create(
    {
      url,
      date,
      warcVersion,
      type: "request",
      warcHeaders,
      httpHeaders,
      statusline,
    },
    requestBody,
  );
}
|