Ensure the --proxyServer CLI flag is also supported by both direct fetch and the browser; simplify

proxy init
This commit is contained in:
Ilya Kreymer 2024-06-05 22:26:18 -07:00
parent 61628049cb
commit 4ee0ce620f
4 changed files with 30 additions and 13 deletions

View file

@ -56,7 +56,7 @@ import { SitemapReader } from "./util/sitemapper.js";
import { ScopedSeed } from "./util/seeds.js";
import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
import { isHTMLContentType } from "./util/reqresp.js";
import { initDispatcher } from "./util/proxy.js";
import { initProxy } from "./util/proxy.js";
const behaviors = fs.readFileSync(
new URL(
@ -171,6 +171,8 @@ export class Crawler {
maxHeapUsed = 0;
maxHeapTotal = 0;
proxyServer?: string;
driver!: (opts: {
page: Page;
data: PageState;
@ -437,7 +439,7 @@ export class Crawler {
async bootstrap() {
const subprocesses: ChildProcess[] = [];
await initDispatcher();
this.proxyServer = initProxy(this.params.proxyServer);
subprocesses.push(this.launchRedis());
@ -1292,7 +1294,7 @@ self.__bx_behaviors.selectMainBehavior();
emulateDevice: this.emulateDevice,
swOpt: this.params.serviceWorker,
chromeOptions: {
proxy: this.params.proxyServer,
proxy: this.proxyServer,
userAgent: this.emulateDevice.userAgent,
extraArgs: this.extraChromeArgs(),
},

View file

@ -20,7 +20,6 @@ import puppeteer, {
} from "puppeteer-core";
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
import { Recorder } from "./recorder.js";
import { getProxy } from "./proxy.js";
type BtrixChromeOpts = {
proxy?: string;
@ -236,7 +235,6 @@ export class Browser {
...extraArgs,
];
proxy = proxy || getProxy();
if (proxy) {
logger.info("Using proxy", { proxy }, "browser");
}

View file

@ -3,7 +3,7 @@ import { Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici";
import { socksDispatcher } from "fetch-socks";
import type { SocksProxyType } from "socks/typings/common/constants.js";
export function getProxy() {
export function getEnvProxyUrl() {
if (process.env.PROXY_SERVER) {
return process.env.PROXY_SERVER;
}
@ -16,15 +16,21 @@ export function getProxy() {
return "";
}
export function initDispatcher() {
const dispatcher = createDispatcher();
/**
 * Picks the proxy URL to use and registers a matching global dispatcher.
 *
 * @param proxy - Explicit proxy URL. When omitted or empty, the value
 *   returned by `getEnvProxyUrl()` is used instead.
 * @returns The proxy URL for which a dispatcher was registered, or `""`
 *   when no proxy URL is available or `createDispatcher()` yields nothing
 *   for it.
 */
export function initProxy(proxy?: string): string {
  // An explicit argument wins; otherwise fall back to the environment.
  const proxyUrl = proxy || getEnvProxyUrl();
  if (!proxyUrl) {
    return "";
  }

  const dispatcher = createDispatcher(proxyUrl);
  if (!dispatcher) {
    // No dispatcher could be built for this URL, so report "no proxy".
    return "";
  }

  setGlobalDispatcher(dispatcher);
  return proxyUrl;
}
export function createDispatcher(): Dispatcher | undefined {
const proxyUrl = getProxy();
export function createDispatcher(proxyUrl: string): Dispatcher | undefined {
if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) {
// HTTP PROXY does not support auth, as it's not supported in the browser
// so must drop username/password for consistency

View file

@ -94,7 +94,7 @@ test("http proxy, PDF, separate env vars", () => {
}
});
test("http proxy, error, not running, separate env vars", () => {
test("http proxy set, but not running, separate env vars", () => {
let status = 0;
try {
@ -105,4 +105,15 @@ test("http proxy, error, not running, separate env vars", () => {
expect(status).toBe(1);
});
// Passes the proxy through the --proxyServer CLI argument and expects the
// crawl to exit with status 1 (per the test name, no proxy is running on the
// chosen port).
test("http proxy set, but not running, cli arg", () => {
  let exitCode = 0;

  try {
    // ++globalPort selects a fresh port for this run.
    const command = `docker run --rm webrecorder/browsertrix-crawler crawl --proxyServer http://host.docker.internal:${++globalPort} --url ${PDF} ${extraArgs}`;
    child_process.execSync(command, { encoding: "utf-8" });
  } catch (err) {
    // execSync throws on a non-zero exit; the error carries the exit status.
    exitCode = err.status;
  }

  expect(exitCode).toBe(1);
});