mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Ensure the --proxyServer CLI flag is supported by both direct fetch and the browser; simplify
proxy init
This commit is contained in:
parent
61628049cb
commit
4ee0ce620f
4 changed files with 30 additions and 13 deletions
|
@ -56,7 +56,7 @@ import { SitemapReader } from "./util/sitemapper.js";
|
|||
import { ScopedSeed } from "./util/seeds.js";
|
||||
import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
|
||||
import { isHTMLContentType } from "./util/reqresp.js";
|
||||
import { initDispatcher } from "./util/proxy.js";
|
||||
import { initProxy } from "./util/proxy.js";
|
||||
|
||||
const behaviors = fs.readFileSync(
|
||||
new URL(
|
||||
|
@ -171,6 +171,8 @@ export class Crawler {
|
|||
maxHeapUsed = 0;
|
||||
maxHeapTotal = 0;
|
||||
|
||||
proxyServer?: string;
|
||||
|
||||
driver!: (opts: {
|
||||
page: Page;
|
||||
data: PageState;
|
||||
|
@ -437,7 +439,7 @@ export class Crawler {
|
|||
async bootstrap() {
|
||||
const subprocesses: ChildProcess[] = [];
|
||||
|
||||
await initDispatcher();
|
||||
this.proxyServer = initProxy(this.params.proxyServer);
|
||||
|
||||
subprocesses.push(this.launchRedis());
|
||||
|
||||
|
@ -1292,7 +1294,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
emulateDevice: this.emulateDevice,
|
||||
swOpt: this.params.serviceWorker,
|
||||
chromeOptions: {
|
||||
proxy: this.params.proxyServer,
|
||||
proxy: this.proxyServer,
|
||||
userAgent: this.emulateDevice.userAgent,
|
||||
extraArgs: this.extraChromeArgs(),
|
||||
},
|
||||
|
|
|
@ -20,7 +20,6 @@ import puppeteer, {
|
|||
} from "puppeteer-core";
|
||||
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
|
||||
import { Recorder } from "./recorder.js";
|
||||
import { getProxy } from "./proxy.js";
|
||||
|
||||
type BtrixChromeOpts = {
|
||||
proxy?: string;
|
||||
|
@ -236,7 +235,6 @@ export class Browser {
|
|||
...extraArgs,
|
||||
];
|
||||
|
||||
proxy = proxy || getProxy();
|
||||
if (proxy) {
|
||||
logger.info("Using proxy", { proxy }, "browser");
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@ import { Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici";
|
|||
import { socksDispatcher } from "fetch-socks";
|
||||
import type { SocksProxyType } from "socks/typings/common/constants.js";
|
||||
|
||||
export function getProxy() {
|
||||
export function getEnvProxyUrl() {
|
||||
if (process.env.PROXY_SERVER) {
|
||||
return process.env.PROXY_SERVER;
|
||||
}
|
||||
|
@ -16,15 +16,21 @@ export function getProxy() {
|
|||
return "";
|
||||
}
|
||||
|
||||
export function initDispatcher() {
|
||||
const dispatcher = createDispatcher();
|
||||
if (dispatcher) {
|
||||
setGlobalDispatcher(dispatcher);
|
||||
export function initProxy(proxy?: string): string {
|
||||
if (!proxy) {
|
||||
proxy = getEnvProxyUrl();
|
||||
}
|
||||
if (proxy) {
|
||||
const dispatcher = createDispatcher(proxy);
|
||||
if (dispatcher) {
|
||||
setGlobalDispatcher(dispatcher);
|
||||
return proxy;
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
export function createDispatcher(): Dispatcher | undefined {
|
||||
const proxyUrl = getProxy();
|
||||
export function createDispatcher(proxyUrl: string): Dispatcher | undefined {
|
||||
if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) {
|
||||
// HTTP PROXY does not support auth, as it's not supported in the browser
|
||||
// so must drop username/password for consistency
|
||||
|
|
|
@ -94,7 +94,7 @@ test("http proxy, PDF, separate env vars", () => {
|
|||
}
|
||||
});
|
||||
|
||||
test("http proxy, error, not running, separate env vars", () => {
|
||||
test("http proxy set, but not running, separate env vars", () => {
|
||||
let status = 0;
|
||||
|
||||
try {
|
||||
|
@ -105,4 +105,15 @@ test("http proxy, error, not running, separate env vars", () => {
|
|||
expect(status).toBe(1);
|
||||
});
|
||||
|
||||
test("http proxy set, but not running, cli arg", () => {
|
||||
let status = 0;
|
||||
|
||||
try {
|
||||
child_process.execSync(`docker run --rm webrecorder/browsertrix-crawler crawl --proxyServer http://host.docker.internal:${++globalPort} --url ${PDF} ${extraArgs}`, {encoding: "utf-8"});
|
||||
} catch (e) {
|
||||
status = e.status;
|
||||
}
|
||||
expect(status).toBe(1);
|
||||
});
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue