browsertrix-crawler/util/originoverride.js
Ilya Kreymer 71b618fe94
Switch back to Puppeteer from Playwright (#301)
- reduced memory usage, avoids memory leak issues caused by using playwright (see #298) 
- browser: split Browser into Browser and BaseBrowser
- browser: puppeteer-specific functions added to Browser for additional flexibility if we need to change again later
- browser: use defaultArgs from playwright
- browser: attempt to recover if initial target is gone
- logging: add debug logging from process.memoryUsage() after every page
- request interception: use priorities for cooperative request interception
- request interception: move to setupPage() to run once per page, enable if any of blockrules, adblockrules or originOverrides are used
- request interception: fix originOverrides enabled check, fix to work with catch-all request interception
- default args: set --waitUntil back to 'load,networkidle2'
- Update README with changes for puppeteer
- tests: fix extra hops depth test to ensure more than one page crawled

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-04-26 15:41:35 -07:00

51 lines
1.4 KiB
JavaScript

import { errJSON, logger } from "./logger.js";
/**
 * Serves requests for configured origins from replacement origins.
 *
 * Each override entry is a string of the form "<origURL>=<destURL>". Only the
 * origin (scheme, host, port) of each side is kept. When an intercepted
 * request targets one of the `orig` origins, the same path and query are
 * fetched from the matching `dest` origin and that response is handed back to
 * the page in place of the original one.
 */
export class OriginOverride {
  /**
   * @param {string[]} originOverride - entries of the form "<origURL>=<destURL>".
   * @throws {TypeError} if either side of an entry cannot be parsed as a URL.
   */
  constructor(originOverride) {
    this.originOverride = originOverride.map((override) => {
      const [origUrl, destUrl] = override.split("=");
      return {
        orig: new URL(origUrl).origin,
        dest: new URL(destUrl).origin,
      };
    });
  }

  /**
   * Map a request URL onto its replacement origin.
   *
   * @param {string} url - absolute URL of the intercepted request.
   * @returns {string|null} the URL on the replacement origin, or null when no
   *   configured origin matches.
   */
  rewriteUrl(url) {
    for (const { orig, dest } of this.originOverride) {
      if (!url.startsWith(orig)) {
        continue;
      }
      const rest = url.slice(orig.length);
      // A bare prefix match is not enough: "https://a.com" is also a prefix of
      // "https://a.com.evil.org/" and "https://a.com:8443/", which are
      // different origins. Only accept the match when the origin ends here.
      if (rest === "" || rest[0] === "/" || rest[0] === "?" || rest[0] === "#") {
        return dest + rest;
      }
    }
    return null;
  }

  /**
   * Register the origin-override request handler for a page.
   *
   * @param {object} browser - object exposing `interceptRequest(page, handler)`.
   * @param {object} page - page whose requests are intercepted; `page.url()`
   *   is included in warning logs.
   * @returns {Promise<void>} resolves once the handler has been registered.
   */
  async initPage(browser, page) {
    const onRequest = async (request) => {
      try {
        const url = request.url();
        const newUrl = this.rewriteUrl(url);

        if (!newUrl) {
          await request.continue({}, -1);
          return;
        }

        // Forward the original method and body so that non-GET requests are
        // not silently turned into GETs against the replacement origin.
        const method = request.method();
        const fetchOpts = { method, headers: request.headers() };
        const postData = request.postData();
        if (postData != null && method !== "GET" && method !== "HEAD") {
          fetchOpts.body = postData;
        }

        const resp = await fetch(newUrl, fetchOpts);

        const body = Buffer.from(await resp.arrayBuffer());
        const headers = Object.fromEntries(resp.headers);
        // fetch() hands back an already-decoded body, so the encoding and the
        // length advertised by the upstream server no longer describe `body`.
        if ("content-encoding" in headers) {
          delete headers["content-encoding"];
          delete headers["content-length"];
        }
        const status = resp.status;

        logger.debug("Origin overridden", {orig: url, dest: newUrl, status, body: body.length}, "originoverride");

        await request.respond({ body, headers, status }, -1);
      } catch (e) {
        logger.warn("Error overriding origin", {...errJSON(e), url: page.url(), reqUrl: request.url()}, "originoverride");
        // Best effort: let the request proceed unmodified. Guarded so that a
        // failure here cannot surface as an unhandled promise rejection.
        try {
          await request.continue({}, -1);
        } catch (continueErr) {
          logger.warn("Error continuing request after failed origin override", {...errJSON(continueErr), url: page.url(), reqUrl: request.url()}, "originoverride");
        }
      }
    };

    await browser.interceptRequest(page, onRequest);
  }
}