browsertrix-crawler/src/util/originoverride.ts
Ilya Kreymer f7cbf9645b
Retry support and additional fixes (#743)
- retries: for failed pages, set retry to 5 in case multiple retries
may be needed.
- redirect: if page url is /path/ -> /path, don't add as extra seed
- proxy: don't use global dispatcher, pass dispatcher explicitly when
using proxy, as proxy may interfere with local network requests
- final exit flag: if crawl is done and also interrupted, ensure WACZ is
still written/uploaded by setting final exit to true
- hashtag only change force reload: if loading page with same URL but
different hashtag, eg. `https://example.com/#B` after
`https://example.com/#A`, do a full reload
2025-01-25 22:55:49 -08:00

76 lines
2.1 KiB
TypeScript

import { HTTPRequest, Page } from "puppeteer-core";
import { formatErr, logger } from "./logger.js";
import { Browser } from "./browser.js";
import { fetch } from "undici";
import { getProxyDispatcher } from "./proxy.js";
/** Content codings for which the fetched body is assumed to be already decoded. */
const DECODED_CONTENT_CODINGS = new Set(["gzip", "x-gzip", "deflate", "br"]);

/**
 * Converts response header entries into the plain object handed to
 * `request.respond()`.
 *
 * - A header name that occurs more than once (in practice `set-cookie`) is
 *   collected into a `string[]` instead of letting the last value overwrite
 *   the earlier ones.
 * - If `content-encoding` lists only codings from `DECODED_CONTENT_CODINGS`,
 *   that header and `content-length` are dropped, since they would describe
 *   the compressed payload rather than the decoded body being replayed.
 *   NOTE(review): assumes the fetch implementation decodes exactly these
 *   codings and that `respond()` fills in a missing content-length — confirm.
 *   Any other coding leaves both headers untouched.
 *
 * @param entries `[name, value]` pairs, e.g. a fetch `Headers` object.
 * @returns Header names (lower-cased) mapped to a value or a list of values.
 */
function buildResponseHeaders(
  entries: Iterable<[string, string]>,
): Record<string, string | string[]> {
  const collected = new Map<string, string | string[]>();

  for (const [rawName, value] of entries) {
    const name = rawName.toLowerCase();
    const existing = collected.get(name);
    if (existing === undefined) {
      collected.set(name, value);
    } else if (Array.isArray(existing)) {
      existing.push(value);
    } else {
      collected.set(name, [existing, value]);
    }
  }

  const encoding = collected.get("content-encoding");
  if (typeof encoding === "string") {
    const codings = encoding
      .split(",")
      .map((coding) => coding.trim().toLowerCase())
      .filter((coding) => coding.length > 0);
    const bodyWasDecoded =
      codings.length > 0 &&
      codings.every((coding) => DECODED_CONTENT_CODINGS.has(coding));
    if (bodyWasDecoded) {
      collected.delete("content-encoding");
      collected.delete("content-length");
    }
  }

  return Object.fromEntries(collected);
}

/**
 * Serves requests for configured origins from a different destination origin.
 *
 * Each override is given as `<origin URL>=<destination URL>`. Intercepted
 * requests whose URL belongs to an overridden origin are fetched from the
 * destination origin (same path, query and fragment) and the fetched response
 * is returned to the page; all other requests are continued unchanged.
 */
export class OriginOverride {
  originOverride: { origUrl: URL; destUrl: URL }[];

  /**
   * @param originOverride Entries of the form `<origin URL>=<destination URL>`.
   * @throws TypeError if an entry has no `=` separator or either side is not
   *   a valid URL.
   */
  constructor(originOverride: string[]) {
    this.originOverride = originOverride.map((override) => {
      const [orig, dest] = override.split("=");
      if (!orig || !dest) {
        throw new TypeError(
          `Invalid origin override "${override}", expected <origin>=<destination>`,
        );
      }
      const origUrl = new URL(orig);
      const destUrl = new URL(dest);
      return { origUrl, destUrl };
    });
  }

  /**
   * Finds the first override whose origin matches `url`.
   *
   * The origin must be followed by a URL boundary (`/`, `?`, `#` or end of
   * string). A bare prefix test would also match look-alike hosts such as
   * `https://example.com.evil.org` or a different port such as
   * `https://example.com:8443` for an override of `https://example.com`.
   *
   * @param url Absolute URL of the intercepted request.
   * @returns The rewritten URL and the matched original origin URL, or `null`
   *   if no override applies.
   */
  resolveOverride(url: string): { newUrl: string; orig: URL } | null {
    for (const { origUrl, destUrl } of this.originOverride) {
      const prefix = origUrl.origin;
      if (!url.startsWith(prefix)) {
        continue;
      }
      const boundary = url.charAt(prefix.length);
      if (
        boundary !== "" &&
        boundary !== "/" &&
        boundary !== "?" &&
        boundary !== "#"
      ) {
        continue;
      }
      return {
        newUrl: destUrl.origin + url.slice(prefix.length),
        orig: origUrl,
      };
    }
    return null;
  }

  /**
   * Registers a request interceptor on `page` that applies the overrides.
   *
   * @param browser Browser wrapper used to install the interceptor.
   * @param page Page whose requests should be intercepted.
   */
  async initPage(browser: Browser, page: Page) {
    const onRequest = async (request: HTTPRequest) => {
      try {
        const url = request.url();
        const match = this.resolveOverride(url);

        if (!match) {
          await request.continue({}, -1);
          return;
        }

        const { newUrl, orig } = match;

        // Present the request to the destination as if it were addressed to
        // the original origin.
        const headers = new Headers(request.headers());
        headers.set("host", orig.host);
        if (headers.get("origin")) {
          headers.set("origin", orig.origin);
        }

        // NOTE(review): only the URL and headers are forwarded; the request
        // method and body are not, so the fetch uses its default method —
        // confirm this is intended for non-GET requests.
        const resp = await fetch(newUrl, {
          headers,
          dispatcher: getProxyDispatcher(),
        });

        const body = Buffer.from(await resp.arrayBuffer());
        const respHeaders = buildResponseHeaders(resp.headers);
        const status = resp.status;

        logger.debug(
          "Origin overridden",
          { orig: url, dest: newUrl, status, body: body.length },
          "originOverride",
        );

        await request.respond({ body, headers: respHeaders, status }, -1);
      } catch (e) {
        logger.warn(
          "Error overriding origin",
          { ...formatErr(e), url: page.url() },
          "originOverride",
        );
        // Best effort: fall back to letting the request go to its original
        // destination.
        await request.continue({}, -1);
      }
    };

    browser.interceptRequest(page, onRequest);
  }
}