mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00

- reduced memory usage, avoids memory leak issues caused by using playwright (see #298) - browser: split Browser into Browser and BaseBrowser - browser: puppeteer-specific functions added to Browser for additional flexibility if need to change again later - browser: use defaultArgs from playwright - browser: attempt to recover if initial target is gone - logging: add debug logging from process.memoryUsage() after every page - request interception: use priorities for cooperative request interception - request interception: move to setupPage() to run once per page, enable if any of blockrules, adblockrules or originOverrides are used - request interception: fix originOverrides enabled check, fix to work with catch-all request interception - default args: set --waitUntil back to 'load,networkidle2' - Update README with changes for puppeteer - tests: fix extra hops depth test to ensure more than one page crawled --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
51 lines
1.4 KiB
JavaScript
51 lines
1.4 KiB
JavaScript
import { errJSON, logger } from "./logger.js";
|
|
|
|
/**
 * Serves requests for one origin from another origin.
 *
 * Each override is given as "<orig>=<dest>". A request whose URL belongs to
 * <orig> is fetched from the same path/query on <dest> instead, and the
 * fetched response is handed back to the browser.
 */
export class OriginOverride
{
  /**
   * @param {string[]} originOverride - entries of the form "<origUrl>=<destUrl>";
   *   only the origin (scheme + host + port) of each URL is kept.
   * @throws {TypeError} if an entry has no "=" or either side is not a valid URL.
   */
  constructor(originOverride) {
    this.originOverride = originOverride.map((override) => {
      // Split on the FIRST "=" only, so a "=" later in the destination
      // (e.g. inside a query string) does not truncate it.
      const sep = override.indexOf("=");
      if (sep < 0) {
        throw new TypeError(`Invalid origin override, expected "<orig>=<dest>": ${override}`);
      }

      const orig = new URL(override.slice(0, sep)).origin;
      const dest = new URL(override.slice(sep + 1)).origin;

      return {orig, dest};
    });
  }

  /**
   * Map a request URL onto its overridden destination.
   *
   * @param {string} url - absolute request URL.
   * @returns {string|null} the rewritten URL, or null when no override applies.
   */
  rewriteUrl(url) {
    for (const {orig, dest} of this.originOverride) {
      if (!url.startsWith(orig)) {
        continue;
      }

      const rest = url.slice(orig.length);

      // A plain prefix match is not enough: "https://example.com" is also a
      // prefix of "https://example.com.evil.org/" and "https://example.com:8443/".
      // Only accept the match when the origin ends exactly here.
      if (rest === "" || rest[0] === "/" || rest[0] === "?" || rest[0] === "#") {
        return dest + rest;
      }
    }

    return null;
  }

  /**
   * Register the override handler for a page via browser.interceptRequest().
   *
   * @param {object} browser - must provide interceptRequest(page, handler).
   * @param {object} page - page being set up; its url() is used in warning logs.
   * @returns {Promise<void>}
   */
  async initPage(browser, page) {
    const onRequest = async (request) => {
      try {
        const url = request.url();
        const newUrl = this.rewriteUrl(url);

        if (!newUrl) {
          // Not ours: let the request proceed. The -1 priority is passed through
          // unchanged (presumably so other cooperative handlers take precedence
          // -- NOTE(review): confirm against the interception setup).
          await request.continue({}, -1);
          return;
        }

        // NOTE(review): only the request headers are forwarded; method and
        // body are not, so the override fetch is always a GET.
        const resp = await fetch(newUrl, {headers: request.headers()});

        const body = Buffer.from(await resp.arrayBuffer());
        const headers = Object.fromEntries(resp.headers);
        const status = resp.status;

        logger.debug("Origin overridden", {orig: url, dest: newUrl, status, body: body.length}, "originoverride");

        // Awaited so that a rejection is handled by the catch below instead of
        // becoming an unhandled promise rejection.
        await request.respond({body, headers, status}, -1);

      } catch (e) {
        logger.warn("Error overriding origin", {...errJSON(e), url: page.url()}, "originoverride");
        // Best-effort fallback: let the original request go through.
        await request.continue({}, -1);
      }
    };

    await browser.interceptRequest(page, onRequest);
  }
}
|