mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-08 06:09:48 +00:00
ensure the --proxyServer CLI flag is supported by both direct fetch and the browser; simplify
proxy init
This commit is contained in:
parent
61628049cb
commit
4ee0ce620f
4 changed files with 30 additions and 13 deletions
|
|
@ -56,7 +56,7 @@ import { SitemapReader } from "./util/sitemapper.js";
|
||||||
import { ScopedSeed } from "./util/seeds.js";
|
import { ScopedSeed } from "./util/seeds.js";
|
||||||
import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
|
import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
|
||||||
import { isHTMLContentType } from "./util/reqresp.js";
|
import { isHTMLContentType } from "./util/reqresp.js";
|
||||||
import { initDispatcher } from "./util/proxy.js";
|
import { initProxy } from "./util/proxy.js";
|
||||||
|
|
||||||
const behaviors = fs.readFileSync(
|
const behaviors = fs.readFileSync(
|
||||||
new URL(
|
new URL(
|
||||||
|
|
@ -171,6 +171,8 @@ export class Crawler {
|
||||||
maxHeapUsed = 0;
|
maxHeapUsed = 0;
|
||||||
maxHeapTotal = 0;
|
maxHeapTotal = 0;
|
||||||
|
|
||||||
|
proxyServer?: string;
|
||||||
|
|
||||||
driver!: (opts: {
|
driver!: (opts: {
|
||||||
page: Page;
|
page: Page;
|
||||||
data: PageState;
|
data: PageState;
|
||||||
|
|
@ -437,7 +439,7 @@ export class Crawler {
|
||||||
async bootstrap() {
|
async bootstrap() {
|
||||||
const subprocesses: ChildProcess[] = [];
|
const subprocesses: ChildProcess[] = [];
|
||||||
|
|
||||||
await initDispatcher();
|
this.proxyServer = initProxy(this.params.proxyServer);
|
||||||
|
|
||||||
subprocesses.push(this.launchRedis());
|
subprocesses.push(this.launchRedis());
|
||||||
|
|
||||||
|
|
@ -1292,7 +1294,7 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
emulateDevice: this.emulateDevice,
|
emulateDevice: this.emulateDevice,
|
||||||
swOpt: this.params.serviceWorker,
|
swOpt: this.params.serviceWorker,
|
||||||
chromeOptions: {
|
chromeOptions: {
|
||||||
proxy: this.params.proxyServer,
|
proxy: this.proxyServer,
|
||||||
userAgent: this.emulateDevice.userAgent,
|
userAgent: this.emulateDevice.userAgent,
|
||||||
extraArgs: this.extraChromeArgs(),
|
extraArgs: this.extraChromeArgs(),
|
||||||
},
|
},
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,6 @@ import puppeteer, {
|
||||||
} from "puppeteer-core";
|
} from "puppeteer-core";
|
||||||
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
|
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
|
||||||
import { Recorder } from "./recorder.js";
|
import { Recorder } from "./recorder.js";
|
||||||
import { getProxy } from "./proxy.js";
|
|
||||||
|
|
||||||
type BtrixChromeOpts = {
|
type BtrixChromeOpts = {
|
||||||
proxy?: string;
|
proxy?: string;
|
||||||
|
|
@ -236,7 +235,6 @@ export class Browser {
|
||||||
...extraArgs,
|
...extraArgs,
|
||||||
];
|
];
|
||||||
|
|
||||||
proxy = proxy || getProxy();
|
|
||||||
if (proxy) {
|
if (proxy) {
|
||||||
logger.info("Using proxy", { proxy }, "browser");
|
logger.info("Using proxy", { proxy }, "browser");
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ import { Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici";
|
||||||
import { socksDispatcher } from "fetch-socks";
|
import { socksDispatcher } from "fetch-socks";
|
||||||
import type { SocksProxyType } from "socks/typings/common/constants.js";
|
import type { SocksProxyType } from "socks/typings/common/constants.js";
|
||||||
|
|
||||||
export function getProxy() {
|
export function getEnvProxyUrl() {
|
||||||
if (process.env.PROXY_SERVER) {
|
if (process.env.PROXY_SERVER) {
|
||||||
return process.env.PROXY_SERVER;
|
return process.env.PROXY_SERVER;
|
||||||
}
|
}
|
||||||
|
|
@ -16,15 +16,21 @@ export function getProxy() {
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
export function initDispatcher() {
|
export function initProxy(proxy?: string): string {
|
||||||
const dispatcher = createDispatcher();
|
if (!proxy) {
|
||||||
if (dispatcher) {
|
proxy = getEnvProxyUrl();
|
||||||
setGlobalDispatcher(dispatcher);
|
|
||||||
}
|
}
|
||||||
|
if (proxy) {
|
||||||
|
const dispatcher = createDispatcher(proxy);
|
||||||
|
if (dispatcher) {
|
||||||
|
setGlobalDispatcher(dispatcher);
|
||||||
|
return proxy;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
export function createDispatcher(): Dispatcher | undefined {
|
export function createDispatcher(proxyUrl: string): Dispatcher | undefined {
|
||||||
const proxyUrl = getProxy();
|
|
||||||
if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) {
|
if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) {
|
||||||
// HTTP PROXY does not support auth, as it's not supported in the browser
|
// HTTP PROXY does not support auth, as it's not supported in the browser
|
||||||
// so must drop username/password for consistency
|
// so must drop username/password for consistency
|
||||||
|
|
|
||||||
|
|
@ -94,7 +94,7 @@ test("http proxy, PDF, separate env vars", () => {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
test("http proxy, error, not running, separate env vars", () => {
|
test("http proxy set, but not running, separate env vars", () => {
|
||||||
let status = 0;
|
let status = 0;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
|
@ -105,4 +105,15 @@ test("http proxy, error, not running, separate env vars", () => {
|
||||||
expect(status).toBe(1);
|
expect(status).toBe(1);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("http proxy set, but not running, cli arg", () => {
|
||||||
|
let status = 0;
|
||||||
|
|
||||||
|
try {
|
||||||
|
child_process.execSync(`docker run --rm webrecorder/browsertrix-crawler crawl --proxyServer http://host.docker.internal:${++globalPort} --url ${PDF} ${extraArgs}`, {encoding: "utf-8"});
|
||||||
|
} catch (e) {
|
||||||
|
status = e.status;
|
||||||
|
}
|
||||||
|
expect(status).toBe(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue