Load non-HTML resources directly whenever possible (#583)

Optimize the direct loading of non-HTML pages. Currently, the behavior
is:
- make a HEAD request first
- make a direct fetch request only if the HEAD response is non-HTML with status 200
- only use the fetch response if it is non-HTML, has status 200, and doesn't set any cookies

This changes the behavior to:
- get cookies from browser for page URL
- make a direct fetch request with cookies, if provided
- only use fetch request if non-HTML and 200
Also:
- ensures pageinfo is properly set with timestamp for direct fetch.
- remove obsolete HTTP/HTTPS Agent handling that is no longer used by the
default fetch-based implementation

If fetch request results in HTML, the response is aborted and browser
loading is used.
This commit is contained in:
Ilya Kreymer 2024-05-24 14:51:51 -07:00 committed by GitHub
parent 089d901b9b
commit a7d279cfbd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 114 additions and 94 deletions

View file

@ -44,27 +44,18 @@ import { Browser } from "./util/browser.js";
import {
ADD_LINK_FUNC,
BEHAVIOR_LOG_FUNC,
HTML_TYPES,
DEFAULT_SELECTORS,
} from "./util/constants.js";
import { AdBlockRules, BlockRules } from "./util/blockrules.js";
import { OriginOverride } from "./util/originoverride.js";
// to ignore HTTPS error for HEAD check
import { Agent as HTTPAgent } from "http";
import { Agent as HTTPSAgent } from "https";
import { CDPSession, Frame, HTTPRequest, Page, Protocol } from "puppeteer-core";
import { Recorder } from "./util/recorder.js";
import { SitemapReader } from "./util/sitemapper.js";
import { ScopedSeed } from "./util/seeds.js";
import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
const HTTPS_AGENT = new HTTPSAgent({
rejectUnauthorized: false,
});
const HTTP_AGENT = new HTTPAgent();
import { isHTMLContentType } from "./util/reqresp.js";
const behaviors = fs.readFileSync(
new URL(
@ -781,7 +772,7 @@ self.__bx_behaviors.selectMainBehavior();
async crawlPage(opts: WorkerState): Promise<void> {
await this.writeStats();
const { page, data, workerid, callbacks, directFetchCapture } = opts;
const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
data.callbacks = callbacks;
const { url } = data;
@ -790,35 +781,27 @@ self.__bx_behaviors.selectMainBehavior();
data.logDetails = logDetails;
data.workerid = workerid;
data.isHTMLPage = await timedRun(
this.isHTML(url, logDetails),
FETCH_TIMEOUT_SECS,
"HEAD request to determine if URL is HTML page timed out",
logDetails,
"fetch",
true,
);
if (!data.isHTMLPage && directFetchCapture) {
if (directFetchCapture) {
try {
const { fetched, mime } = await timedRun(
directFetchCapture(url),
const { fetched, mime, ts } = await timedRun(
directFetchCapture({ url, headers: this.headers, cdp }),
FETCH_TIMEOUT_SECS,
"Direct fetch capture attempt timed out",
logDetails,
"fetch",
true,
);
if (mime) {
data.mime = mime;
data.isHTMLPage = isHTMLContentType(mime);
}
if (fetched) {
data.loadState = LoadState.FULL_PAGE_LOADED;
if (mime) {
data.mime = mime;
}
data.status = 200;
data.ts = new Date();
data.ts = ts || new Date();
logger.info(
"Direct fetch successful",
{ url, ...logDetails },
{ url, mime, ...logDetails },
"fetch",
);
return;
@ -1752,7 +1735,7 @@ self.__bx_behaviors.selectMainBehavior();
const contentType = resp.headers()["content-type"];
isHTMLPage = this.isHTMLContentType(contentType);
isHTMLPage = isHTMLContentType(contentType);
if (contentType) {
data.mime = contentType.split(";")[0];
@ -1878,7 +1861,9 @@ self.__bx_behaviors.selectMainBehavior();
"behavior",
);
try {
await frame.evaluate("self.__bx_behaviors.awaitPageLoad();");
await frame.evaluate(
"self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
);
} catch (e) {
logger.warn("Waiting for custom page load failed", e, "behavior");
}
@ -2191,49 +2176,6 @@ self.__bx_behaviors.selectMainBehavior();
}
}
resolveAgent(urlParsed: URL) {
return urlParsed.protocol === "https:" ? HTTPS_AGENT : HTTP_AGENT;
}
async isHTML(url: string, logDetails: LogDetails) {
try {
const resp = await fetch(url, {
method: "HEAD",
headers: this.headers,
agent: this.resolveAgent,
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} as any);
if (resp.status !== 200) {
logger.debug("HEAD response code != 200, loading in browser", {
status: resp.status,
...logDetails,
});
return true;
}
return this.isHTMLContentType(resp.headers.get("Content-Type"));
} catch (e) {
// can't confirm not html, so try in browser
logger.debug("HEAD request failed", { ...formatErr(e), ...logDetails });
return true;
}
}
isHTMLContentType(contentType: string | null) {
// just load if no content-type
if (!contentType) {
return true;
}
const mime = contentType.split(";")[0];
if (HTML_TYPES.includes(mime)) {
return true;
}
return false;
}
async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) {
if (!sitemap) {
return;

View file

@ -6,7 +6,7 @@ import PQueue from "p-queue";
import { logger, formatErr } from "./logger.js";
import { sleep, timedRun, timestampNow } from "./timing.js";
import { RequestResponseInfo } from "./reqresp.js";
import { RequestResponseInfo, isHTMLContentType } from "./reqresp.js";
// @ts-expect-error TODO fill in why error is expected
import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js";
@ -75,11 +75,23 @@ export type AsyncFetchOptions = {
filter?: (resp: Response) => boolean;
ignoreDupe?: boolean;
maxFetchSize?: number;
manualRedirect?: boolean;
};
// =================================================================
export type ResponseStreamAsyncFetchOptions = AsyncFetchOptions & {
export type DirectFetchRequest = {
url: string;
headers: Record<string, string>;
cdp: CDPSession;
};
// =================================================================
export type NetworkLoadAsyncFetchOptions = AsyncFetchOptions & {
cdp: CDPSession;
};
// =================================================================
export type ResponseStreamAsyncFetchOptions = NetworkLoadAsyncFetchOptions & {
requestId: string;
};
@ -1062,12 +1074,23 @@ export class Recorder {
this.writer.writeRecordPair(responseRecord, requestRecord);
}
async directFetchCapture(
url: string,
): Promise<{ fetched: boolean; mime: string }> {
async directFetchCapture({ url, headers, cdp }: DirectFetchRequest): Promise<{
fetched: boolean;
mime: string;
ts: Date;
}> {
const reqresp = new RequestResponseInfo("0");
const ts = new Date();
const cookie = await this.getCookieString(cdp, url);
if (cookie) {
headers["Cookie"] = cookie;
}
reqresp.url = url;
reqresp.method = "GET";
reqresp.requestHeaders = headers;
reqresp.ts = ts;
logger.debug(
"Directly fetching page URL without browser",
@ -1075,8 +1098,21 @@ export class Recorder {
"recorder",
);
const filter = (resp: Response) =>
resp.status === 200 && !resp.headers.get("set-cookie");
let mime: string = "";
const filter = (resp: Response) => {
// only direct load 200 responses
if (resp.status !== 200) {
return false;
}
const ct = resp.headers.get("content-type");
if (ct) {
mime = ct.split(";")[0];
}
return !isHTMLContentType(mime);
};
// ignore dupes: if previous URL was not a page, still load as page. if previous was page,
// should not get here, as dupe pages tracked via seen list
@ -1087,16 +1123,28 @@ export class Recorder {
networkId: "0",
filter,
ignoreDupe: true,
manualRedirect: true,
});
const res = await fetcher.load();
const mime =
(reqresp.responseHeaders &&
reqresp.responseHeaders["content-type"] &&
reqresp.responseHeaders["content-type"].split(";")[0]) ||
"";
this.addPageRecord(reqresp);
return { fetched: res === "fetched", mime };
if (url === this.pageUrl && !this.pageInfo.ts) {
logger.debug("Setting page timestamp", { ts, url });
this.pageInfo.ts = ts;
}
return { fetched: res === "fetched", mime, ts };
}
async getCookieString(cdp: CDPSession, url: string) {
const cookieList: string[] = [];
const { cookies } = await cdp.send("Network.getCookies", { urls: [url] });
for (const { name, value } of cookies) {
cookieList.push(`${name}=${value}`);
}
return cookieList.join(";");
}
}
@ -1115,6 +1163,8 @@ class AsyncFetcher {
tempdir: string;
filename: string;
manualRedirect = false;
constructor({
tempdir,
reqresp,
@ -1124,6 +1174,7 @@ class AsyncFetcher {
filter = undefined,
ignoreDupe = false,
maxFetchSize = MAX_BROWSER_DEFAULT_FETCH_SIZE,
manualRedirect = false,
}: AsyncFetchOptions) {
this.reqresp = reqresp;
this.reqresp.expectedSize = expectedSize;
@ -1142,6 +1193,8 @@ class AsyncFetcher {
);
this.maxFetchSize = maxFetchSize;
this.manualRedirect = manualRedirect;
}
async load() {
@ -1277,9 +1330,9 @@ class AsyncFetcher {
reqresp.status = 0;
reqresp.errorText = e.message;
} finally {
recorder.addPageRecord(reqresp);
// exclude direct fetch request with fake id
if (networkId !== "0") {
recorder.addPageRecord(reqresp);
recorder.removeReqResp(networkId);
}
}
@ -1307,6 +1360,7 @@ class AsyncFetcher {
headers,
body: reqresp.postData || undefined,
signal,
redirect: this.manualRedirect ? "manual" : "follow",
});
if (this.filter && !this.filter(resp) && abort) {
@ -1323,6 +1377,7 @@ class AsyncFetcher {
}
if (reqresp.expectedSize === 0) {
reqresp.fillFetchResponse(resp);
reqresp.payload = new Uint8Array();
return;
} else if (!resp.body) {
@ -1422,7 +1477,7 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher {
class NetworkLoadStreamAsyncFetcher extends AsyncFetcher {
cdp: CDPSession;
constructor(opts: ResponseStreamAsyncFetchOptions) {
constructor(opts: NetworkLoadAsyncFetchOptions) {
super(opts);
this.cdp = opts.cdp;
}

View file

@ -3,6 +3,7 @@ import { getStatusText } from "@webrecorder/wabac/src/utils.js";
import { Protocol } from "puppeteer-core";
import { postToGetUrl } from "warcio";
import { HTML_TYPES } from "./constants.js";
const CONTENT_LENGTH = "content-length";
const CONTENT_TYPE = "content-type";
@ -148,10 +149,15 @@ export class RequestResponseInfo {
}
}
/**
 * True for actual redirect responses: any 3xx status except
 * 304 (Not Modified), which is a cache-revalidation response.
 */
isRedirectStatus() {
  const { status } = this;
  return status !== 304 && status >= 300 && status < 400;
}
isSelfRedirect() {
if (this.status < 300 || this.status >= 400 || this.status === 304) {
if (!this.isRedirectStatus()) {
return false;
}
try {
const headers = new Headers(this.getResponseHeadersDict());
const location = headers.get("location") || "";
@ -362,3 +368,18 @@ export class RequestResponseInfo {
return value.replace(/\n/g, ", ");
}
}
/**
 * Decide whether a Content-Type header value denotes an HTML page.
 * A missing/empty content-type is treated as HTML so the URL is
 * loaded in the browser rather than fetched directly.
 */
export function isHTMLContentType(contentType: string | null) {
  // strip parameters, e.g. "text/html; charset=utf-8" -> "text/html",
  // then test the bare mime type against the known HTML types
  return !contentType || HTML_TYPES.includes(contentType.split(";")[0]);
}

View file

@ -66,7 +66,7 @@ export class PageState {
callbacks: PageCallbacks = {};
isHTMLPage?: boolean;
isHTMLPage = true;
text?: string;
screenshotView?: Buffer;
favicon?: string;

View file

@ -2,7 +2,7 @@ import os from "os";
import { logger, formatErr } from "./logger.js";
import { sleep, timedRun } from "./timing.js";
import { Recorder } from "./recorder.js";
import { DirectFetchRequest, Recorder } from "./recorder.js";
import { rxEscape } from "./seeds.js";
import { CDPSession, Page } from "puppeteer-core";
import { PageState, WorkerId } from "./state.js";
@ -20,8 +20,10 @@ export type WorkerOpts = {
workerid: WorkerId;
// eslint-disable-next-line @typescript-eslint/ban-types
callbacks: Record<string, Function>;
directFetchCapture?:
| ((url: string) => Promise<{ fetched: boolean; mime: string }>)
directFetchCapture:
| ((
request: DirectFetchRequest,
) => Promise<{ fetched: boolean; mime: string; ts: Date }>)
| null;
frameIdToExecId: Map<string, number>;
};
@ -171,7 +173,7 @@ export class PageWorker {
this.cdp = cdp;
this.callbacks = {};
const directFetchCapture = this.recorder
? (x: string) => this.recorder!.directFetchCapture(x)
? (req: DirectFetchRequest) => this.recorder!.directFetchCapture(req)
: null;
this.opts = {
page,