mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00

- retries: for failed pages, set retries to 5 in case multiple retries are needed.
- redirect: if page url is /path/ -> /path, don't add it as an extra seed.
- proxy: don't use the global dispatcher; pass the dispatcher explicitly when using a proxy, as the proxy may interfere with local network requests.
- final exit flag: if the crawl is done and also interrupted, ensure the WACZ is still written/uploaded by setting final exit to true.
- hashtag-only change forces reload: if loading a page with the same URL but a different hashtag, e.g. `https://example.com/#B` after `https://example.com/#A`, do a full reload.
76 lines
2.1 KiB
TypeScript
76 lines
2.1 KiB
TypeScript
import { HTTPRequest, Page } from "puppeteer-core";
|
|
import { formatErr, logger } from "./logger.js";
|
|
import { Browser } from "./browser.js";
|
|
|
|
import { fetch } from "undici";
|
|
import { getProxyDispatcher } from "./proxy.js";
|
|
|
|
/**
 * Serves requests addressed to one origin from a different origin.
 *
 * Each override pairs an original URL with a destination URL; only the
 * origins of the two are used. When an intercepted request targets an
 * original origin, the same path is fetched from the destination origin
 * (with the `host` header, and the `origin` header if present, set to the
 * original's values) and the fetched response is returned to the page.
 */
export class OriginOverride {
  /** Parsed override pairs, kept in the order they were supplied. */
  originOverride: { origUrl: URL; destUrl: URL }[];

  /**
   * @param originOverride Entries of the form `<origUrl>=<destUrl>`.
   * @throws TypeError If an entry has no `=` separator, has an empty side,
   *   or either side cannot be parsed as a URL.
   */
  constructor(originOverride: string[]) {
    this.originOverride = originOverride.map((override) => {
      // Split on the first "=" only, so a destination that itself contains
      // "=" (e.g. in a query string) is kept intact.
      const sep = override.indexOf("=");
      if (sep <= 0 || sep === override.length - 1) {
        throw new TypeError(
          `Invalid origin override "${override}", expected "<origUrl>=<destUrl>"`,
        );
      }

      const origUrl = new URL(override.slice(0, sep));
      const destUrl = new URL(override.slice(sep + 1));

      return { origUrl, destUrl };
    });
  }

  /**
   * Finds the first override whose original origin is the origin of `url`.
   *
   * A plain prefix test is not enough: "https://example.com" is a prefix of
   * "https://example.community/" and "https://example.com:8443/", so the
   * character following the origin must also be a URL boundary.
   *
   * @returns The rewritten URL and the matched original URL, or `null` if
   *   no override applies.
   */
  private findOverride(url: string): { newUrl: string; orig: URL } | null {
    for (const { origUrl, destUrl } of this.originOverride) {
      const origin = origUrl.origin;
      if (!url.startsWith(origin)) {
        continue;
      }

      const next = url.charAt(origin.length);
      if (next !== "" && next !== "/" && next !== "?" && next !== "#") {
        continue;
      }

      return { newUrl: destUrl.origin + url.slice(origin.length), orig: origUrl };
    }

    return null;
  }

  /**
   * Registers a request handler for `page` through
   * `browser.interceptRequest`.
   *
   * Requests that match no override are continued unchanged. If fetching or
   * responding fails, a warning is logged and the request is continued.
   */
  async initPage(browser: Browser, page: Page) {
    const onRequest = async (request: HTTPRequest) => {
      try {
        const url = request.url();
        const match = this.findOverride(url);

        if (!match) {
          await request.continue({}, -1);
          return;
        }

        const { newUrl, orig } = match;

        const headers = new Headers(request.headers());

        // The destination server should see the request as if it had been
        // addressed to the original origin.
        headers.set("host", orig.host);
        if (headers.get("origin")) {
          headers.set("origin", orig.origin);
        }

        // NOTE(review): only the URL and headers are forwarded; the request
        // method and body are not passed to fetch — confirm this is intended.
        const resp = await fetch(newUrl, {
          headers,
          dispatcher: getProxyDispatcher(),
        });

        const body = Buffer.from(await resp.arrayBuffer());
        const respHeaders = Object.fromEntries(resp.headers);
        const status = resp.status;

        logger.debug(
          "Origin overridden",
          { orig: url, dest: newUrl, status, body: body.length },
          "originOverride",
        );

        await request.respond({ body, headers: respHeaders, status }, -1);
      } catch (e) {
        logger.warn(
          "Error overriding origin",
          { ...formatErr(e), url: page.url() },
          "originOverride",
        );
        await request.continue({}, -1);
      }
    };

    browser.interceptRequest(page, onRequest);
  }
}
|