mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Load non-HTML resources directly whenever possible (#583)
Optimize the direct loading of non-HTML pages. Previously, the behavior was: make a HEAD request first; make a direct fetch request only if the HEAD response is non-HTML with status 200; and use the fetch result only if it is non-HTML, has status 200, and does not set any cookies. This changes the behavior to: get cookies from the browser for the page URL; make a direct fetch request with those cookies, if present; and use the fetch result only if it is non-HTML with status 200. Also: ensures pageinfo is properly set with a timestamp for direct fetches, and removes obsolete Agent handling that is no longer used by the default (fetch) path. If the fetch request results in HTML, the response is aborted and browser loading is used instead.
This commit is contained in:
parent
089d901b9b
commit
a7d279cfbd
5 changed files with 114 additions and 94 deletions
|
@ -44,27 +44,18 @@ import { Browser } from "./util/browser.js";
|
|||
import {
|
||||
ADD_LINK_FUNC,
|
||||
BEHAVIOR_LOG_FUNC,
|
||||
HTML_TYPES,
|
||||
DEFAULT_SELECTORS,
|
||||
} from "./util/constants.js";
|
||||
|
||||
import { AdBlockRules, BlockRules } from "./util/blockrules.js";
|
||||
import { OriginOverride } from "./util/originoverride.js";
|
||||
|
||||
// to ignore HTTPS error for HEAD check
|
||||
import { Agent as HTTPAgent } from "http";
|
||||
import { Agent as HTTPSAgent } from "https";
|
||||
import { CDPSession, Frame, HTTPRequest, Page, Protocol } from "puppeteer-core";
|
||||
import { Recorder } from "./util/recorder.js";
|
||||
import { SitemapReader } from "./util/sitemapper.js";
|
||||
import { ScopedSeed } from "./util/seeds.js";
|
||||
import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
|
||||
|
||||
const HTTPS_AGENT = new HTTPSAgent({
|
||||
rejectUnauthorized: false,
|
||||
});
|
||||
|
||||
const HTTP_AGENT = new HTTPAgent();
|
||||
import { isHTMLContentType } from "./util/reqresp.js";
|
||||
|
||||
const behaviors = fs.readFileSync(
|
||||
new URL(
|
||||
|
@ -781,7 +772,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
async crawlPage(opts: WorkerState): Promise<void> {
|
||||
await this.writeStats();
|
||||
|
||||
const { page, data, workerid, callbacks, directFetchCapture } = opts;
|
||||
const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
|
||||
data.callbacks = callbacks;
|
||||
|
||||
const { url } = data;
|
||||
|
@ -790,35 +781,27 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
data.logDetails = logDetails;
|
||||
data.workerid = workerid;
|
||||
|
||||
data.isHTMLPage = await timedRun(
|
||||
this.isHTML(url, logDetails),
|
||||
FETCH_TIMEOUT_SECS,
|
||||
"HEAD request to determine if URL is HTML page timed out",
|
||||
logDetails,
|
||||
"fetch",
|
||||
true,
|
||||
);
|
||||
|
||||
if (!data.isHTMLPage && directFetchCapture) {
|
||||
if (directFetchCapture) {
|
||||
try {
|
||||
const { fetched, mime } = await timedRun(
|
||||
directFetchCapture(url),
|
||||
const { fetched, mime, ts } = await timedRun(
|
||||
directFetchCapture({ url, headers: this.headers, cdp }),
|
||||
FETCH_TIMEOUT_SECS,
|
||||
"Direct fetch capture attempt timed out",
|
||||
logDetails,
|
||||
"fetch",
|
||||
true,
|
||||
);
|
||||
if (mime) {
|
||||
data.mime = mime;
|
||||
data.isHTMLPage = isHTMLContentType(mime);
|
||||
}
|
||||
if (fetched) {
|
||||
data.loadState = LoadState.FULL_PAGE_LOADED;
|
||||
if (mime) {
|
||||
data.mime = mime;
|
||||
}
|
||||
data.status = 200;
|
||||
data.ts = new Date();
|
||||
data.ts = ts || new Date();
|
||||
logger.info(
|
||||
"Direct fetch successful",
|
||||
{ url, ...logDetails },
|
||||
{ url, mime, ...logDetails },
|
||||
"fetch",
|
||||
);
|
||||
return;
|
||||
|
@ -1752,7 +1735,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
const contentType = resp.headers()["content-type"];
|
||||
|
||||
isHTMLPage = this.isHTMLContentType(contentType);
|
||||
isHTMLPage = isHTMLContentType(contentType);
|
||||
|
||||
if (contentType) {
|
||||
data.mime = contentType.split(";")[0];
|
||||
|
@ -1878,7 +1861,9 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
"behavior",
|
||||
);
|
||||
try {
|
||||
await frame.evaluate("self.__bx_behaviors.awaitPageLoad();");
|
||||
await frame.evaluate(
|
||||
"self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
|
||||
);
|
||||
} catch (e) {
|
||||
logger.warn("Waiting for custom page load failed", e, "behavior");
|
||||
}
|
||||
|
@ -2191,49 +2176,6 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}
|
||||
}
|
||||
|
||||
resolveAgent(urlParsed: URL) {
|
||||
return urlParsed.protocol === "https:" ? HTTPS_AGENT : HTTP_AGENT;
|
||||
}
|
||||
|
||||
async isHTML(url: string, logDetails: LogDetails) {
|
||||
try {
|
||||
const resp = await fetch(url, {
|
||||
method: "HEAD",
|
||||
headers: this.headers,
|
||||
agent: this.resolveAgent,
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} as any);
|
||||
if (resp.status !== 200) {
|
||||
logger.debug("HEAD response code != 200, loading in browser", {
|
||||
status: resp.status,
|
||||
...logDetails,
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
||||
return this.isHTMLContentType(resp.headers.get("Content-Type"));
|
||||
} catch (e) {
|
||||
// can't confirm not html, so try in browser
|
||||
logger.debug("HEAD request failed", { ...formatErr(e), ...logDetails });
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
isHTMLContentType(contentType: string | null) {
|
||||
// just load if no content-type
|
||||
if (!contentType) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const mime = contentType.split(";")[0];
|
||||
|
||||
if (HTML_TYPES.includes(mime)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) {
|
||||
if (!sitemap) {
|
||||
return;
|
||||
|
|
|
@ -6,7 +6,7 @@ import PQueue from "p-queue";
|
|||
|
||||
import { logger, formatErr } from "./logger.js";
|
||||
import { sleep, timedRun, timestampNow } from "./timing.js";
|
||||
import { RequestResponseInfo } from "./reqresp.js";
|
||||
import { RequestResponseInfo, isHTMLContentType } from "./reqresp.js";
|
||||
|
||||
// @ts-expect-error TODO fill in why error is expected
|
||||
import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js";
|
||||
|
@ -75,11 +75,23 @@ export type AsyncFetchOptions = {
|
|||
filter?: (resp: Response) => boolean;
|
||||
ignoreDupe?: boolean;
|
||||
maxFetchSize?: number;
|
||||
manualRedirect?: boolean;
|
||||
};
|
||||
|
||||
// =================================================================
|
||||
export type ResponseStreamAsyncFetchOptions = AsyncFetchOptions & {
|
||||
export type DirectFetchRequest = {
|
||||
url: string;
|
||||
headers: Record<string, string>;
|
||||
cdp: CDPSession;
|
||||
};
|
||||
|
||||
// =================================================================
|
||||
export type NetworkLoadAsyncFetchOptions = AsyncFetchOptions & {
|
||||
cdp: CDPSession;
|
||||
};
|
||||
|
||||
// =================================================================
|
||||
export type ResponseStreamAsyncFetchOptions = NetworkLoadAsyncFetchOptions & {
|
||||
requestId: string;
|
||||
};
|
||||
|
||||
|
@ -1062,12 +1074,23 @@ export class Recorder {
|
|||
this.writer.writeRecordPair(responseRecord, requestRecord);
|
||||
}
|
||||
|
||||
async directFetchCapture(
|
||||
url: string,
|
||||
): Promise<{ fetched: boolean; mime: string }> {
|
||||
async directFetchCapture({ url, headers, cdp }: DirectFetchRequest): Promise<{
|
||||
fetched: boolean;
|
||||
mime: string;
|
||||
ts: Date;
|
||||
}> {
|
||||
const reqresp = new RequestResponseInfo("0");
|
||||
const ts = new Date();
|
||||
|
||||
const cookie = await this.getCookieString(cdp, url);
|
||||
if (cookie) {
|
||||
headers["Cookie"] = cookie;
|
||||
}
|
||||
|
||||
reqresp.url = url;
|
||||
reqresp.method = "GET";
|
||||
reqresp.requestHeaders = headers;
|
||||
reqresp.ts = ts;
|
||||
|
||||
logger.debug(
|
||||
"Directly fetching page URL without browser",
|
||||
|
@ -1075,8 +1098,21 @@ export class Recorder {
|
|||
"recorder",
|
||||
);
|
||||
|
||||
const filter = (resp: Response) =>
|
||||
resp.status === 200 && !resp.headers.get("set-cookie");
|
||||
let mime: string = "";
|
||||
|
||||
const filter = (resp: Response) => {
|
||||
// only direct load 200 responses
|
||||
if (resp.status !== 200) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const ct = resp.headers.get("content-type");
|
||||
if (ct) {
|
||||
mime = ct.split(";")[0];
|
||||
}
|
||||
|
||||
return !isHTMLContentType(mime);
|
||||
};
|
||||
|
||||
// ignore dupes: if previous URL was not a page, still load as page. if previous was page,
|
||||
// should not get here, as dupe pages tracked via seen list
|
||||
|
@ -1087,16 +1123,28 @@ export class Recorder {
|
|||
networkId: "0",
|
||||
filter,
|
||||
ignoreDupe: true,
|
||||
manualRedirect: true,
|
||||
});
|
||||
const res = await fetcher.load();
|
||||
|
||||
const mime =
|
||||
(reqresp.responseHeaders &&
|
||||
reqresp.responseHeaders["content-type"] &&
|
||||
reqresp.responseHeaders["content-type"].split(";")[0]) ||
|
||||
"";
|
||||
this.addPageRecord(reqresp);
|
||||
|
||||
return { fetched: res === "fetched", mime };
|
||||
if (url === this.pageUrl && !this.pageInfo.ts) {
|
||||
logger.debug("Setting page timestamp", { ts, url });
|
||||
this.pageInfo.ts = ts;
|
||||
}
|
||||
|
||||
return { fetched: res === "fetched", mime, ts };
|
||||
}
|
||||
|
||||
async getCookieString(cdp: CDPSession, url: string) {
|
||||
const cookieList: string[] = [];
|
||||
const { cookies } = await cdp.send("Network.getCookies", { urls: [url] });
|
||||
for (const { name, value } of cookies) {
|
||||
cookieList.push(`${name}=${value}`);
|
||||
}
|
||||
|
||||
return cookieList.join(";");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1115,6 +1163,8 @@ class AsyncFetcher {
|
|||
tempdir: string;
|
||||
filename: string;
|
||||
|
||||
manualRedirect = false;
|
||||
|
||||
constructor({
|
||||
tempdir,
|
||||
reqresp,
|
||||
|
@ -1124,6 +1174,7 @@ class AsyncFetcher {
|
|||
filter = undefined,
|
||||
ignoreDupe = false,
|
||||
maxFetchSize = MAX_BROWSER_DEFAULT_FETCH_SIZE,
|
||||
manualRedirect = false,
|
||||
}: AsyncFetchOptions) {
|
||||
this.reqresp = reqresp;
|
||||
this.reqresp.expectedSize = expectedSize;
|
||||
|
@ -1142,6 +1193,8 @@ class AsyncFetcher {
|
|||
);
|
||||
|
||||
this.maxFetchSize = maxFetchSize;
|
||||
|
||||
this.manualRedirect = manualRedirect;
|
||||
}
|
||||
|
||||
async load() {
|
||||
|
@ -1277,9 +1330,9 @@ class AsyncFetcher {
|
|||
reqresp.status = 0;
|
||||
reqresp.errorText = e.message;
|
||||
} finally {
|
||||
recorder.addPageRecord(reqresp);
|
||||
// exclude direct fetch request with fake id
|
||||
if (networkId !== "0") {
|
||||
recorder.addPageRecord(reqresp);
|
||||
recorder.removeReqResp(networkId);
|
||||
}
|
||||
}
|
||||
|
@ -1307,6 +1360,7 @@ class AsyncFetcher {
|
|||
headers,
|
||||
body: reqresp.postData || undefined,
|
||||
signal,
|
||||
redirect: this.manualRedirect ? "manual" : "follow",
|
||||
});
|
||||
|
||||
if (this.filter && !this.filter(resp) && abort) {
|
||||
|
@ -1323,6 +1377,7 @@ class AsyncFetcher {
|
|||
}
|
||||
|
||||
if (reqresp.expectedSize === 0) {
|
||||
reqresp.fillFetchResponse(resp);
|
||||
reqresp.payload = new Uint8Array();
|
||||
return;
|
||||
} else if (!resp.body) {
|
||||
|
@ -1422,7 +1477,7 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher {
|
|||
class NetworkLoadStreamAsyncFetcher extends AsyncFetcher {
|
||||
cdp: CDPSession;
|
||||
|
||||
constructor(opts: ResponseStreamAsyncFetchOptions) {
|
||||
constructor(opts: NetworkLoadAsyncFetchOptions) {
|
||||
super(opts);
|
||||
this.cdp = opts.cdp;
|
||||
}
|
||||
|
|
|
@ -3,6 +3,7 @@ import { getStatusText } from "@webrecorder/wabac/src/utils.js";
|
|||
|
||||
import { Protocol } from "puppeteer-core";
|
||||
import { postToGetUrl } from "warcio";
|
||||
import { HTML_TYPES } from "./constants.js";
|
||||
|
||||
const CONTENT_LENGTH = "content-length";
|
||||
const CONTENT_TYPE = "content-type";
|
||||
|
@ -148,10 +149,15 @@ export class RequestResponseInfo {
|
|||
}
|
||||
}
|
||||
|
||||
isRedirectStatus() {
|
||||
return this.status >= 300 && this.status < 400 && this.status !== 304;
|
||||
}
|
||||
|
||||
isSelfRedirect() {
|
||||
if (this.status < 300 || this.status >= 400 || this.status === 304) {
|
||||
if (!this.isRedirectStatus()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
const headers = new Headers(this.getResponseHeadersDict());
|
||||
const location = headers.get("location") || "";
|
||||
|
@ -362,3 +368,18 @@ export class RequestResponseInfo {
|
|||
return value.replace(/\n/g, ", ");
|
||||
}
|
||||
}
|
||||
|
||||
export function isHTMLContentType(contentType: string | null) {
|
||||
// just load if no content-type
|
||||
if (!contentType) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const mime = contentType.split(";")[0];
|
||||
|
||||
if (HTML_TYPES.includes(mime)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -66,7 +66,7 @@ export class PageState {
|
|||
|
||||
callbacks: PageCallbacks = {};
|
||||
|
||||
isHTMLPage?: boolean;
|
||||
isHTMLPage = true;
|
||||
text?: string;
|
||||
screenshotView?: Buffer;
|
||||
favicon?: string;
|
||||
|
|
|
@ -2,7 +2,7 @@ import os from "os";
|
|||
|
||||
import { logger, formatErr } from "./logger.js";
|
||||
import { sleep, timedRun } from "./timing.js";
|
||||
import { Recorder } from "./recorder.js";
|
||||
import { DirectFetchRequest, Recorder } from "./recorder.js";
|
||||
import { rxEscape } from "./seeds.js";
|
||||
import { CDPSession, Page } from "puppeteer-core";
|
||||
import { PageState, WorkerId } from "./state.js";
|
||||
|
@ -20,8 +20,10 @@ export type WorkerOpts = {
|
|||
workerid: WorkerId;
|
||||
// eslint-disable-next-line @typescript-eslint/ban-types
|
||||
callbacks: Record<string, Function>;
|
||||
directFetchCapture?:
|
||||
| ((url: string) => Promise<{ fetched: boolean; mime: string }>)
|
||||
directFetchCapture:
|
||||
| ((
|
||||
request: DirectFetchRequest,
|
||||
) => Promise<{ fetched: boolean; mime: string; ts: Date }>)
|
||||
| null;
|
||||
frameIdToExecId: Map<string, number>;
|
||||
};
|
||||
|
@ -171,7 +173,7 @@ export class PageWorker {
|
|||
this.cdp = cdp;
|
||||
this.callbacks = {};
|
||||
const directFetchCapture = this.recorder
|
||||
? (x: string) => this.recorder!.directFetchCapture(x)
|
||||
? (req: DirectFetchRequest) => this.recorder!.directFetchCapture(req)
|
||||
: null;
|
||||
this.opts = {
|
||||
page,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue