Support host-specific proxies with proxy config YAML (#837)

- Adds support for a YAML-based config for multiple proxies, containing a
'matchHosts' section that matches hosts by regex and a 'proxies' declaration,
allowing any number of hosts to be matched to any number of named proxies.
- Specified via --proxyServerConfig option passed to both crawl and
profile creation commands.
- Implemented internally by generating a proxy PAC script that does the
regex matching, and by running the browser with that proxy PAC script,
which is served by an internal HTTP server.
- Also supports matching different undici Agents by regex, for using
different proxies with direct fetching.
- Precedence: --proxyServerConfig takes precedence over --proxyServer /
PROXY_SERVER, unless --proxyServerPreferSingleProxy is also provided
- Updated proxies doc section with example
- Updated tests with sample bad and good auth examples of proxy config

Fixes #836

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
Ilya Kreymer 2025-08-20 16:07:29 -07:00 committed by GitHub
parent a6ad6a0e42
commit a42c0b926e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
19 changed files with 424 additions and 68 deletions

View file

@ -16,7 +16,7 @@ import { initStorage } from "./util/storage.js";
import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
import { getInfoString } from "./util/file_reader.js";
import { DISPLAY, ExitCodes } from "./util/constants.js";
import { initProxy } from "./util/proxy.js";
import { initProxy, loadProxyConfig } from "./util/proxy.js";
//import { sleep } from "./util/timing.js";
const profileHTML = fs.readFileSync(
@ -123,6 +123,12 @@ function initArgs() {
type: "string",
},
proxyServerConfig: {
describe:
"if set, path to yaml/json file that configures multiple path servers per URL regex",
type: "string",
},
sshProxyPrivateKeyFile: {
describe:
"path to SSH private key for SOCKS5 over SSH proxy connection",
@ -161,7 +167,9 @@ async function main() {
process.on("SIGTERM", () => handleTerminate("SIGTERM"));
const proxyServer = await initProxy(params, false);
loadProxyConfig(params);
const { proxyServer, proxyPacUrl } = await initProxy(params, false);
if (!params.headless) {
logger.debug("Launching XVFB");
@ -203,7 +211,8 @@ async function main() {
headless: params.headless,
signals: false,
chromeOptions: {
proxy: proxyServer,
proxyServer,
proxyPacUrl,
extraArgs: [
"--window-position=0,0",
`--window-size=${params.windowSize}`,