diff --git a/src/crawler.ts b/src/crawler.ts index 6a6c2698..abddbbbc 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -56,7 +56,7 @@ import { SitemapReader } from "./util/sitemapper.js"; import { ScopedSeed } from "./util/seeds.js"; import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js"; import { isHTMLContentType } from "./util/reqresp.js"; -import { initDispatcher } from "./util/proxy.js"; +import { initProxy } from "./util/proxy.js"; const behaviors = fs.readFileSync( new URL( @@ -171,6 +171,8 @@ export class Crawler { maxHeapUsed = 0; maxHeapTotal = 0; + proxyServer?: string; + driver!: (opts: { page: Page; data: PageState; @@ -437,7 +439,7 @@ export class Crawler { async bootstrap() { const subprocesses: ChildProcess[] = []; - await initDispatcher(); + this.proxyServer = initProxy(this.params.proxyServer); subprocesses.push(this.launchRedis()); @@ -1292,7 +1294,7 @@ self.__bx_behaviors.selectMainBehavior(); emulateDevice: this.emulateDevice, swOpt: this.params.serviceWorker, chromeOptions: { - proxy: this.params.proxyServer, + proxy: this.proxyServer, userAgent: this.emulateDevice.userAgent, extraArgs: this.extraChromeArgs(), }, diff --git a/src/util/browser.ts b/src/util/browser.ts index e72f9a7d..73a74222 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -20,7 +20,6 @@ import puppeteer, { } from "puppeteer-core"; import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core"; import { Recorder } from "./recorder.js"; -import { getProxy } from "./proxy.js"; type BtrixChromeOpts = { proxy?: string; @@ -236,7 +235,6 @@ export class Browser { ...extraArgs, ]; - proxy = proxy || getProxy(); if (proxy) { logger.info("Using proxy", { proxy }, "browser"); } diff --git a/src/util/proxy.ts b/src/util/proxy.ts index 57c3fa18..c7fe2a85 100644 --- a/src/util/proxy.ts +++ b/src/util/proxy.ts @@ -3,7 +3,7 @@ import { Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici"; import { socksDispatcher } from "fetch-socks"; import type { SocksProxyType } from "socks/typings/common/constants.js"; -export function getProxy() { +export function getEnvProxyUrl() { if (process.env.PROXY_SERVER) { return process.env.PROXY_SERVER; } @@ -16,15 +16,21 @@ export function getProxy() { return ""; } -export function initDispatcher() { - const dispatcher = createDispatcher(); - if (dispatcher) { - setGlobalDispatcher(dispatcher); +export function initProxy(proxy?: string): string { + if (!proxy) { + proxy = getEnvProxyUrl(); } + if (proxy) { + const dispatcher = createDispatcher(proxy); + if (dispatcher) { + setGlobalDispatcher(dispatcher); + return proxy; + } + } + return ""; } -export function createDispatcher(): Dispatcher | undefined { - const proxyUrl = getProxy(); +export function createDispatcher(proxyUrl: string): Dispatcher | undefined { if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) { // HTTP PROXY does not support auth, as it's not supported in the browser // so must drop username/password for consistency diff --git a/tests/proxy.test.js b/tests/proxy.test.js index b2bdad96..49326bf8 100644 --- a/tests/proxy.test.js +++ b/tests/proxy.test.js @@ -94,7 +94,7 @@ test("http proxy, PDF, separate env vars", () => { } }); -test("http proxy, error, not running, separate env vars", () => { +test("http proxy set, but not running, separate env vars", () => { let status = 0; try { @@ -105,4 +105,15 @@ test("http proxy, error, not running, separate env vars", () => { expect(status).toBe(1); }); +test("http proxy set, but not running, cli arg", () => { + let status = 0; + + try { + child_process.execSync(`docker run --rm webrecorder/browsertrix-crawler crawl --proxyServer http://host.docker.internal:${++globalPort} --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(1); +}); +