From a42c0b926e110c16f359c26a3fde228bed2c062b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 20 Aug 2025 16:07:29 -0700 Subject: [PATCH] Support host-specific proxies with proxy config YAML (#837) - Adds support for YAML-based config for multiple proxies, containing 'matchHosts' section by regex and 'proxies' declaration, allowing matching any number of hosts to any number of named proxies. - Specified via --proxyServerConfig option passed to both crawl and profile creation commands. - Implemented internally by generating a proxy PAC script which does regex matching and running browser with the specified proxy PAC script served by an internal http server. - Also support matching different undici Agents by regex, for using different proxies with direct fetching - Precedence: --proxyServerConfig takes precedence over --proxyServer / PROXY_SERVER, unless --proxyServerPreferSingleProxy is also provided - Updated proxies doc section with example - Updated tests with sample bad and good auth examples of proxy config Fixes #836 --------- Co-authored-by: Tessa Walsh --- docs/docs/user-guide/cli-options.md | 17 +- docs/docs/user-guide/proxies.md | 52 +++- package.json | 2 +- src/crawler.ts | 29 +- src/create-login-profile.ts | 15 +- src/util/argParser.ts | 16 ++ src/util/blockrules.ts | 6 +- src/util/browser.ts | 16 +- src/util/file_reader.ts | 2 +- src/util/originoverride.ts | 2 +- src/util/proxy.ts | 267 ++++++++++++++++-- src/util/recorder.ts | 6 +- src/util/sitemapper.ts | 2 +- src/util/worker.ts | 2 +- tests/fixtures/{ => proxies}/proxy-key | 0 tests/fixtures/{ => proxies}/proxy-key.pub | 0 .../fixtures/proxies/proxy-test-bad-auth.pac | 6 + .../fixtures/proxies/proxy-test-good-auth.pac | 5 + tests/proxy.test.js | 47 ++- 19 files changed, 424 insertions(+), 68 deletions(-) rename tests/fixtures/{ => proxies}/proxy-key (100%) rename tests/fixtures/{ => proxies}/proxy-key.pub (100%) create mode 100644 tests/fixtures/proxies/proxy-test-bad-auth.pac create mode 
100644 tests/fixtures/proxies/proxy-test-good-auth.pac diff --git a/docs/docs/user-guide/cli-options.md b/docs/docs/user-guide/cli-options.md index d37160b0..0ccb2b87 100644 --- a/docs/docs/user-guide/cli-options.md +++ b/docs/docs/user-guide/cli-options.md @@ -103,16 +103,16 @@ Options: , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt - atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy"] [default: - []] + atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"] + [default: []] --logExcludeContext Comma-separated list of contexts to NOT include in logs [array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer" , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt - atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy"] [default: - ["recorderNetwork","jsError","screencast"]] + atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"] + [default: ["recorderNetwork","jsError","screencast"]] --text Extract initial (default) or final t ext to pages.jsonl or WARC resource record(s) @@ -294,6 +294,13 @@ Options: --proxyServer if set, will use specified proxy ser ver. Takes precedence over any env v ar proxy settings [string] + --proxyServerPreferSingleProxy if set, and both proxyServer and pro + xyServerConfig are provided, the pro + xyServer value will be preferred + [boolean] [default: false] + --proxyServerConfig if set, path to yaml/json file that + configures multiple path servers per + URL regex [string] --dryRun If true, no archive data is written to disk, only pages and logs (and op tionally saved state). 
[boolean] @@ -343,6 +350,8 @@ Options: [number] [default: 7] --proxyServer if set, will use specified proxy server. Takes prece dence over any env var proxy settings [string] + --proxyServerConfig if set, path to yaml/json file that configures multi + ple path servers per URL regex [string] --sshProxyPrivateKeyFile path to SSH private key for SOCKS5 over SSH proxy co nnection [string] --sshProxyKnownHostsFile path to SSH known hosts file for SOCKS5 over SSH pro diff --git a/docs/docs/user-guide/proxies.md b/docs/docs/user-guide/proxies.md index 586f74cd..7bc395d9 100644 --- a/docs/docs/user-guide/proxies.md +++ b/docs/docs/user-guide/proxies.md @@ -80,7 +80,55 @@ The above proxy settings also apply to [Browser Profile Creation](browser-profil docker run -p 6080:6080 -p 9223:9223 -v $PWD/crawls/profiles:/crawls/profiles -v $PWD/my-proxy-private-key:/tmp/private-key -v $PWD/known_hosts:/tmp/known_hosts webrecorder/browsertrix-crawler create-login-profile --url https://example.com/ --proxyServer ssh://user@path-to-ssh-host.example.com --sshProxyPrivateKeyFile /tmp/private-key --sshProxyKnownHostsFile /tmp/known_hosts ``` - - +## Host-Specific Proxies + +With the 1.7.0 release, the crawler also supports running with multiple proxies, defined in a separate proxy YAML config file. The file contains a match hosts section, matching hosts by regex to named proxies. 
+ +For example, the following YAML file can be passed to the `--proxyServerConfig` option: + +```yaml +matchHosts: + # load all URLs from example.com through 'example-1-proxy' + example.com/.*: example-1-proxy + + # load all URLs from https://my-social.example.com/.*/posts/ through + # a different proxy + https://my-social.example.com/.*/posts/: social-proxy + + # optional default proxy + "": default-proxy + +proxies: + # SOCKS5 proxy just needs a URL + example-1-proxy: socks5://username:password@my-socks-5-proxy.example.com + + # SSH proxy also should have at least a 'privateKeyFile' + social-proxy: + url: ssh://user@my-social-proxy.example.com + privateKeyFile: /proxies/social-proxy-private-key + # optional + publicHostsFile: /proxies/social-proxy-public-hosts + + default-proxy: + url: ssh://user@my-social-proxy.example.com + privateKeyFile: /proxies/default-proxy-private-key +``` + +If the above config is stored in `./proxies/proxyConfig.yaml` along with the SSH private keys and known public hosts +files, the crawler can be started with: + +```sh +docker run -v $PWD/crawls:/crawls -v $PWD/proxies:/proxies -it webrecorder/browsertrix-crawler --url https://example.com/ --proxyServerConfig /proxies/proxyConfig.yaml +``` + +Note that if SSH proxies are provided, an SSH tunnel must be opened for each one before the crawl starts. +The crawl will not start if any of the SSH proxy connections fail, even if a host-specific proxy is not actually used. +SOCKS5 and HTTP proxy connections are attempted only on first use. + +The same `--proxyServerConfig` option can also be used in browser profile creation with the `create-login-profile` command in the same way. + +### Proxy Precedence + +If both `--proxyServerConfig` and `--proxyServer`/`PROXY_SERVER` env var are specified, the `--proxyServerConfig` option takes precedence on matching hosts. To have the single `--proxyServer` option always take precedence instead, pass the `--proxyServerPreferSingleProxy` option. 
diff --git a/package.json b/package.json index eb545f8e..6e7da0b7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.7.0", + "version": "1.8.0-beta.0", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", diff --git a/src/crawler.ts b/src/crawler.ts index 5adf42c0..884200af 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -186,6 +186,7 @@ export class Crawler { maxHeapTotal = 0; proxyServer?: string; + proxyPacUrl?: string; driver: | ((opts: { @@ -508,7 +509,9 @@ export class Crawler { setWARCInfo(this.infoString, this.params.warcInfo); logger.info(this.infoString); - this.proxyServer = await initProxy(this.params, RUN_DETACHED); + const res = await initProxy(this.params, RUN_DETACHED); + this.proxyServer = res.proxyServer; + this.proxyPacUrl = res.proxyPacUrl; this.seeds = await parseSeeds(this.params); this.numOriginalSeeds = this.seeds.length; @@ -1276,7 +1279,7 @@ self.__bx_behaviors.selectMainBehavior(); } } - async pageFinished(data: PageState) { + async pageFinished(data: PageState, lastErrorText = "") { // if page loaded, considered page finished successfully // (even if behaviors timed out) const { loadState, logDetails, depth, url, pageSkipped } = data; @@ -1311,11 +1314,28 @@ self.__bx_behaviors.selectMainBehavior(); await this.serializeConfig(); if (depth === 0 && this.params.failOnFailedSeed) { + let errorCode = ExitCodes.GenericError; + + switch (lastErrorText) { + case "net::ERR_SOCKS_CONNECTION_FAILED": + case "net::SOCKS_CONNECTION_HOST_UNREACHABLE": + case "net::ERR_PROXY_CONNECTION_FAILED": + case "net::ERR_TUNNEL_CONNECTION_FAILED": + errorCode = ExitCodes.ProxyError; + break; + + case "net::ERR_TIMED_OUT": + case "net::ERR_INVALID_AUTH_CREDENTIALS": + if (this.proxyServer || this.proxyPacUrl) { + errorCode = ExitCodes.ProxyError; + } + break; + } logger.fatal( "Seed Page Load Failed, failing crawl", {}, "general", - 
ExitCodes.GenericError, + errorCode, ); } } @@ -1703,7 +1723,8 @@ self.__bx_behaviors.selectMainBehavior(); emulateDevice: this.emulateDevice, swOpt: this.params.serviceWorker, chromeOptions: { - proxy: this.proxyServer, + proxyServer: this.proxyServer, + proxyPacUrl: this.proxyPacUrl, userAgent: this.emulateDevice.userAgent, extraArgs: this.extraChromeArgs(), }, diff --git a/src/create-login-profile.ts b/src/create-login-profile.ts index 07aba7c4..0dcbc608 100755 --- a/src/create-login-profile.ts +++ b/src/create-login-profile.ts @@ -16,7 +16,7 @@ import { initStorage } from "./util/storage.js"; import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core"; import { getInfoString } from "./util/file_reader.js"; import { DISPLAY, ExitCodes } from "./util/constants.js"; -import { initProxy } from "./util/proxy.js"; +import { initProxy, loadProxyConfig } from "./util/proxy.js"; //import { sleep } from "./util/timing.js"; const profileHTML = fs.readFileSync( @@ -123,6 +123,12 @@ function initArgs() { type: "string", }, + proxyServerConfig: { + describe: + "if set, path to yaml/json file that configures multiple path servers per URL regex", + type: "string", + }, + sshProxyPrivateKeyFile: { describe: "path to SSH private key for SOCKS5 over SSH proxy connection", @@ -161,7 +167,9 @@ async function main() { process.on("SIGTERM", () => handleTerminate("SIGTERM")); - const proxyServer = await initProxy(params, false); + loadProxyConfig(params); + + const { proxyServer, proxyPacUrl } = await initProxy(params, false); if (!params.headless) { logger.debug("Launching XVFB"); @@ -203,7 +211,8 @@ async function main() { headless: params.headless, signals: false, chromeOptions: { - proxy: proxyServer, + proxyServer, + proxyPacUrl, extraArgs: [ "--window-position=0,0", `--window-size=${params.windowSize}`, diff --git a/src/util/argParser.ts b/src/util/argParser.ts index 95ee3e1a..cd64e8fd 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -29,6 +29,7 
@@ import { logger, } from "./logger.js"; import { SaveState } from "./state.js"; +import { loadProxyConfig } from "./proxy.js"; // ============================================================================ export type CrawlerArgs = ReturnType & { @@ -641,6 +642,19 @@ class ArgParser { type: "string", }, + proxyServerPreferSingleProxy: { + describe: + "if set, and both proxyServer and proxyServerConfig are provided, the proxyServer value will be preferred", + type: "boolean", + default: false, + }, + + proxyServerConfig: { + describe: + "if set, path to yaml/json file that configures multiple path servers per URL regex", + type: "string", + }, + dryRun: { describe: "If true, no archive data is written to disk, only pages and logs (and optionally saved state).", @@ -778,6 +792,8 @@ class ArgParser { argv.emulateDevice = { viewport: null }; } + loadProxyConfig(argv); + if (argv.lang) { if (!ISO6391.validate(argv.lang)) { logger.fatal("Invalid ISO-639-1 country code for --lang: " + argv.lang); diff --git a/src/util/blockrules.ts b/src/util/blockrules.ts index 0e7fb511..49a7dda9 100644 --- a/src/util/blockrules.ts +++ b/src/util/blockrules.ts @@ -272,7 +272,9 @@ export class BlockRules { logDetails: Record, ) { try { - const res = await fetch(reqUrl, { dispatcher: getProxyDispatcher() }); + const res = await fetch(reqUrl, { + dispatcher: getProxyDispatcher(reqUrl), + }); const text = await res.text(); return !!text.match(frameTextMatch); @@ -303,7 +305,7 @@ export class BlockRules { method: "PUT", headers: { "Content-Type": "text/html" }, body, - dispatcher: getProxyDispatcher(), + dispatcher: getProxyDispatcher(putUrl.href), }); } } diff --git a/src/util/browser.ts b/src/util/browser.ts index 69f8ff27..6e2cb6c7 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -29,7 +29,8 @@ import { timedRun } from "./timing.js"; import assert from "node:assert"; type BtrixChromeOpts = { - proxy?: string; + proxyServer?: string; + proxyPacUrl?: string; userAgent?: 
string | null; extraArgs?: string[]; }; @@ -243,7 +244,8 @@ export class Browser { } chromeArgs({ - proxy = "", + proxyServer = "", + proxyPacUrl = "", userAgent = null, extraArgs = [], }: BtrixChromeOpts) { @@ -262,14 +264,14 @@ export class Browser { ...extraArgs, ]; - if (proxy) { - const proxyString = getSafeProxyString(proxy); + if (proxyServer) { + const proxyString = getSafeProxyString(proxyServer); logger.info("Using proxy", { proxy: proxyString }, "browser"); - } - if (proxy) { args.push("--ignore-certificate-errors"); - args.push(`--proxy-server=${proxy}`); + args.push(`--proxy-server=${proxyServer}`); + } else if (proxyPacUrl) { + args.push("--proxy-pac-url=" + proxyPacUrl); } return args; diff --git a/src/util/file_reader.ts b/src/util/file_reader.ts index f0908d12..2c2a65e9 100644 --- a/src/util/file_reader.ts +++ b/src/util/file_reader.ts @@ -41,7 +41,7 @@ async function writeUrlContentsToFile( pathPrefix: string, pathDefaultExt: string, ) { - const res = await fetch(url, { dispatcher: getProxyDispatcher() }); + const res = await fetch(url, { dispatcher: getProxyDispatcher(url) }); const fileContents = await res.text(); const filename = diff --git a/src/util/originoverride.ts b/src/util/originoverride.ts index 1b2b8c41..33c22aca 100644 --- a/src/util/originoverride.ts +++ b/src/util/originoverride.ts @@ -48,7 +48,7 @@ export class OriginOverride { const resp = await fetch(newUrl, { headers, - dispatcher: getProxyDispatcher(), + dispatcher: getProxyDispatcher(newUrl), }); const body = Buffer.from(await resp.arrayBuffer()); diff --git a/src/util/proxy.ts b/src/util/proxy.ts index 23401c37..c5a0c2f4 100644 --- a/src/util/proxy.ts +++ b/src/util/proxy.ts @@ -1,7 +1,9 @@ import net from "net"; -import { Agent, Dispatcher, ProxyAgent } from "undici"; - import child_process from "child_process"; +import fs from "fs"; + +import { Agent, Dispatcher, ProxyAgent } from "undici"; +import yaml from "js-yaml"; import { logger } from "./logger.js"; @@ -9,11 +11,40 
@@ import { socksDispatcher } from "fetch-socks"; import type { SocksProxyType } from "socks/typings/common/constants.js"; import { ExitCodes, FETCH_HEADERS_TIMEOUT_SECS } from "./constants.js"; +import http, { IncomingMessage, ServerResponse } from "http"; + const SSH_PROXY_LOCAL_PORT = 9722; const SSH_WAIT_TIMEOUT = 30000; -let proxyDispatcher: Dispatcher | undefined = undefined; +//let proxyDispatcher: Dispatcher | undefined = undefined; + +type ProxyEntry = { + proxyUrl: string; + dispatcher: Dispatcher; +}; + +export type ProxyServerConfig = { + matchHosts?: Record; + proxies?: Record< + string, + string | { url: string; privateKeyFile?: string; publicHostsFile?: string } + >; +}; + +export type ProxyCLIArgs = { + sshProxyPrivateKeyFile?: string; + sshProxyKnownHostsFile?: string; + sshProxyLocalPort?: number; + + proxyServer?: string; + proxyServerPreferSingleProxy?: boolean; + + proxyMap?: ProxyServerConfig; +}; + +const proxyMap = new Map(); +let defaultProxyEntry: ProxyEntry | null = null; export function getEnvProxyUrl() { if (process.env.PROXY_SERVER) { @@ -28,6 +59,27 @@ export function getEnvProxyUrl() { return ""; } +export function loadProxyConfig(params: { + proxyServerConfig?: string; + proxyMap?: ProxyServerConfig; +}) { + if (params.proxyServerConfig) { + const proxyServerConfig = params.proxyServerConfig; + try { + const proxies = yaml.load( + fs.readFileSync(proxyServerConfig, "utf8"), + // eslint-disable-next-line @typescript-eslint/no-explicit-any + ) as any; + params.proxyMap = proxies; + logger.debug("Proxy host match config loaded", { proxyServerConfig }); + } catch (e) { + logger.warn("Proxy host match config file not found, ignoring", { + proxyServerConfig, + }); + } + } +} + export function getSafeProxyString(proxyString: string): string { if (!proxyString) { return ""; @@ -54,31 +106,127 @@ export function getSafeProxyString(proxyString: string): string { } export async function initProxy( - // eslint-disable-next-line 
@typescript-eslint/no-explicit-any - params: Record, + params: ProxyCLIArgs, detached: boolean, -): Promise { - let proxy = params.proxyServer; +): Promise<{ proxyServer?: string; proxyPacUrl?: string }> { + const { sshProxyPrivateKeyFile, sshProxyKnownHostsFile, sshProxyLocalPort } = + params; + let localPort = sshProxyLocalPort || SSH_PROXY_LOCAL_PORT; - if (!proxy) { - proxy = getEnvProxyUrl(); + const singleProxy = params.proxyServer || getEnvProxyUrl(); + + if (singleProxy) { + defaultProxyEntry = await initSingleProxy( + singleProxy, + localPort++, + detached, + sshProxyPrivateKeyFile, + sshProxyKnownHostsFile, + ); + if (params.proxyServerPreferSingleProxy && defaultProxyEntry.proxyUrl) { + return { proxyServer: defaultProxyEntry.proxyUrl }; + } } - if (proxy && proxy.startsWith("ssh://")) { - proxy = await runSSHD(params, detached); + + if (!params.proxyMap?.matchHosts || !params.proxyMap?.proxies) { + if (defaultProxyEntry) { + logger.debug("Using Single Proxy", {}, "proxy"); + } + return { proxyServer: defaultProxyEntry?.proxyUrl }; + } + + const nameToProxy = new Map(); + + for (const [name, value] of Object.entries(params.proxyMap.proxies)) { + let proxyUrl = ""; + let privateKeyFile: string | undefined = ""; + let publicHostsFile: string | undefined = ""; + + if (typeof value === "string") { + proxyUrl = value; + } else { + proxyUrl = value.url; + privateKeyFile = value.privateKeyFile; + publicHostsFile = value.publicHostsFile; + } + + privateKeyFile = privateKeyFile || sshProxyPrivateKeyFile; + publicHostsFile = publicHostsFile || sshProxyKnownHostsFile; + + logger.debug("Initing proxy", { + url: getSafeProxyString(proxyUrl), + localPort, + privateKeyFile, + publicHostsFile, + }); + + const entry = await initSingleProxy( + proxyUrl, + localPort++, + detached, + privateKeyFile, + publicHostsFile, + ); + + nameToProxy.set(name, entry); + } + + for (const [rx, name] of Object.entries(params.proxyMap.matchHosts)) { + const entry = nameToProxy.get(name); + 
+ if (!entry) { + logger.fatal("Proxy specified but not found in proxies list: " + name); + return {}; + } + + if (rx) { + proxyMap.set(new RegExp(rx), entry); + } else { + defaultProxyEntry = entry; + } + } + + const p = new ProxyPacServer(); + + logger.debug("Using Proxy PAC script", {}, "proxy"); + + return { proxyPacUrl: `http://localhost:${p.port}/proxy.pac` }; +} + +export async function initSingleProxy( + proxyUrl: string, + localPort: number, + detached: boolean, + sshProxyPrivateKeyFile?: string, + sshProxyKnownHostsFile?: string, +): Promise<{ proxyUrl: string; dispatcher: Dispatcher }> { + if (proxyUrl && proxyUrl.startsWith("ssh://")) { + proxyUrl = await runSSHD( + proxyUrl, + localPort, + detached, + sshProxyPrivateKeyFile, + sshProxyKnownHostsFile, + ); } const agentOpts: Agent.Options = { headersTimeout: FETCH_HEADERS_TIMEOUT_SECS * 1000, }; - // set global fetch() dispatcher (with proxy, if any) - const dispatcher = createDispatcher(proxy, agentOpts); - proxyDispatcher = dispatcher; - return proxy; + const dispatcher = createDispatcher(proxyUrl, agentOpts); + return { proxyUrl, dispatcher }; } -export function getProxyDispatcher() { - return proxyDispatcher; +export function getProxyDispatcher(url: string) { + // find url match by regex first + for (const [rx, { dispatcher }] of proxyMap.entries()) { + if (rx && url.match(rx)) { + return dispatcher; + } + } + // if default proxy set, return default dispatcher, otherwise no dispatcher + return defaultProxyEntry ? 
defaultProxyEntry.dispatcher : undefined; } export function createDispatcher( @@ -113,9 +261,13 @@ export function createDispatcher( } } -// eslint-disable-next-line @typescript-eslint/no-explicit-any -export async function runSSHD(params: Record, detached: boolean) { - const { proxyServer } = params; +export async function runSSHD( + proxyServer: string, + localPort: number, + detached: boolean, + privateKey?: string, + publicKnownHost?: string, +) { if (!proxyServer || !proxyServer.startsWith("ssh://")) { return ""; } @@ -126,17 +278,14 @@ export async function runSSHD(params: Record, detached: boolean) { const host = proxyServerUrl.hostname.replace("[", "").replace("]", ""); const port = proxyServerUrl.port || 22; const user = proxyServerUrl.username || "root"; - const localPort = params.sshProxyLocalPort || SSH_PROXY_LOCAL_PORT; const proxyString = `socks5://localhost:${localPort}`; const args: string[] = [ user + "@" + host, "-p", - port, + port + "", "-D", - localPort, - "-i", - params.sshProxyPrivateKeyFile, + localPort + "", "-o", "IdentitiesOnly=yes", "-o", @@ -146,12 +295,17 @@ export async function runSSHD(params: Record, detached: boolean) { "-o", ]; - if (params.sshProxyKnownHostsFile) { - args.push(`UserKnownHostsFile=${params.sshProxyKnownHostsFile}`); + if (publicKnownHost) { + args.push(`UserKnownHostsFile=${publicKnownHost}`); } else { args.push("StrictHostKeyChecking=no"); } + if (privateKey) { + args.push("-i"); + args.push(privateKey); + } + args.push("-M", "0", "-N", "-T"); logger.info("Checking SSH connection for proxy...", {}, "proxy"); @@ -221,7 +375,7 @@ export async function runSSHD(params: Record, detached: boolean) { "proxy", ExitCodes.ProxyError, ); - return; + return ""; } logger.info( @@ -241,10 +395,61 @@ export async function runSSHD(params: Record, detached: boolean) { }, "proxy", ); - runSSHD(params, detached).catch((e) => - logger.error("proxy retry error", e, "proxy"), - ); + runSSHD( + proxyServer, + localPort, + detached, + 
privateKey, + publicKnownHost, + ).catch((e) => logger.error("proxy retry error", e, "proxy")); }); return proxyString; } + +class ProxyPacServer { + port = 20278; + + proxyPacText = ""; + + constructor() { + const httpServer = http.createServer((req, res) => + this.handleRequest(req, res), + ); + httpServer.listen(this.port); + this.generateProxyPac(); + } + + async handleRequest(request: IncomingMessage, response: ServerResponse) { + response.writeHead(200, { + "Content-Type": "application/x-ns-proxy-autoconfig", + }); + response.end(this.proxyPacText); + } + + generateProxyPac() { + const urlToProxy = (proxyUrl: string) => { + const url = new URL(proxyUrl); + const hostport = url.href.slice(url.protocol.length + 2); + const type = url.protocol.slice(0, -1).toUpperCase(); + return `"${type} ${hostport}"`; + }; + + this.proxyPacText = ` + +function FindProxyForURL(url, host) { + +`; + proxyMap.forEach(({ proxyUrl }, k) => { + this.proxyPacText += ` if (url.match(/${ + k.source + }/)) { return ${urlToProxy(proxyUrl)}; }\n`; + }); + + this.proxyPacText += `\n return ${ + defaultProxyEntry ? 
urlToProxy(defaultProxyEntry.proxyUrl) : `"DIRECT"` + }; +} +`; + } +} diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 5c2c96e0..477f0515 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -144,6 +144,8 @@ export class Recorder extends EventEmitter { pageFinished = false; + lastErrorText = ""; + gzip = true; writer: WARCWriter; @@ -481,6 +483,7 @@ export class Recorder extends EventEmitter { break; default: + this.lastErrorText = errorText; logger.warn( "Request failed", { url, errorText, type, status: reqresp.status, ...this.logDetails }, @@ -953,6 +956,7 @@ export class Recorder extends EventEmitter { this.pageid = pageid; this.pageUrl = url; this.finalPageUrl = this.pageUrl; + this.lastErrorText = ""; this.logDetails = { page: url, workerid: this.workerid }; if (this.pendingRequests && this.pendingRequests.size) { logger.debug( @@ -1735,7 +1739,7 @@ class AsyncFetcher { const headers = reqresp.getRequestHeadersDict(); - let dispatcher = getProxyDispatcher(); + let dispatcher = getProxyDispatcher(url); if (dispatcher) { dispatcher = dispatcher.compose((dispatch) => { diff --git a/src/util/sitemapper.ts b/src/util/sitemapper.ts index d3c1e627..145b8ad8 100644 --- a/src/util/sitemapper.ts +++ b/src/util/sitemapper.ts @@ -68,7 +68,7 @@ export class SitemapReader extends EventEmitter { while (true) { const resp = await fetch(url, { headers: this.headers, - dispatcher: getProxyDispatcher(), + dispatcher: getProxyDispatcher(url), }); if (resp.ok) { diff --git a/src/util/worker.ts b/src/util/worker.ts index dee9ceba..212b0703 100644 --- a/src/util/worker.ts +++ b/src/util/worker.ts @@ -311,7 +311,7 @@ export class PageWorker { } await timedRun( - this.crawler.pageFinished(data), + this.crawler.pageFinished(data, this.recorder?.lastErrorText), FINISHED_TIMEOUT, "Page Finished Timed Out", this.logDetails, diff --git a/tests/fixtures/proxy-key b/tests/fixtures/proxies/proxy-key similarity index 100% rename from tests/fixtures/proxy-key rename to 
tests/fixtures/proxies/proxy-key diff --git a/tests/fixtures/proxy-key.pub b/tests/fixtures/proxies/proxy-key.pub similarity index 100% rename from tests/fixtures/proxy-key.pub rename to tests/fixtures/proxies/proxy-key.pub diff --git a/tests/fixtures/proxies/proxy-test-bad-auth.pac b/tests/fixtures/proxies/proxy-test-bad-auth.pac new file mode 100644 index 00000000..af9a1d94 --- /dev/null +++ b/tests/fixtures/proxies/proxy-test-bad-auth.pac @@ -0,0 +1,6 @@ +matchHosts: + old.webrecorder.net: socks-proxy + +proxies: + socks-proxy: socks5://user:passw1rd@proxy-with-auth:1080 + diff --git a/tests/fixtures/proxies/proxy-test-good-auth.pac b/tests/fixtures/proxies/proxy-test-good-auth.pac new file mode 100644 index 00000000..2cbbc4c7 --- /dev/null +++ b/tests/fixtures/proxies/proxy-test-good-auth.pac @@ -0,0 +1,5 @@ +matchHosts: + old.webrecorder.net: socks-proxy + +proxies: + socks-proxy: socks5://user:passw0rd@proxy-with-auth:1080 diff --git a/tests/proxy.test.js b/tests/proxy.test.js index 1c8ccea7..811ed325 100644 --- a/tests/proxy.test.js +++ b/tests/proxy.test.js @@ -9,6 +9,8 @@ const SOCKS_PORT = "1080"; const HTTP_PORT = "3128"; const WRONG_PORT = "33130"; +const PROXY_EXIT_CODE = 21; + const SSH_PROXY_IMAGE = "linuxserver/openssh-server" const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf"; @@ -27,7 +29,7 @@ beforeAll(() => { proxyNoAuthId = execSync(`docker run -d --rm --network=proxy-test-net --name proxy-no-auth ${PROXY_IMAGE}`, {encoding: "utf-8"}); - proxySSHId = execSync(`docker run -d --rm -e DOCKER_MODS=linuxserver/mods:openssh-server-ssh-tunnel -e USER_NAME=user -e PUBLIC_KEY_FILE=/keys/proxy-key.pub -v $PWD/tests/fixtures/proxy-key.pub:/keys/proxy-key.pub --network=proxy-test-net --name ssh-proxy ${SSH_PROXY_IMAGE}`); + proxySSHId = execSync(`docker run -d --rm -e DOCKER_MODS=linuxserver/mods:openssh-server-ssh-tunnel -e USER_NAME=user -e PUBLIC_KEY_FILE=/keys/proxy-key.pub -v 
$PWD/tests/fixtures/proxies/proxy-key.pub:/keys/proxy-key.pub --network=proxy-test-net --name ssh-proxy ${SSH_PROXY_IMAGE}`); }); afterAll(async () => { @@ -66,7 +68,7 @@ describe("socks5 + https proxy tests", () => { status = e.status; } // auth supported only for SOCKS5 - expect(status).toBe(scheme === "socks5" ? 0 : 1); + expect(status).toBe(scheme === "socks5" ? 0 : PROXY_EXIT_CODE); }); test(`${scheme} proxy, ${type}, wrong auth`, () => { @@ -77,7 +79,7 @@ describe("socks5 + https proxy tests", () => { } catch (e) { status = e.status; } - expect(status).toBe(1); + expect(status).toBe(PROXY_EXIT_CODE); }); test(`${scheme} proxy, ${type}, wrong protocol`, () => { @@ -88,7 +90,8 @@ describe("socks5 + https proxy tests", () => { } catch (e) { status = e.status; } - expect(status).toBe(1); + // wrong protocol (socks5 for http) causes connection to hang, causes a timeout, so just errors with 1 + expect(status === PROXY_EXIT_CODE || status === 1).toBe(true); }); } @@ -100,7 +103,7 @@ describe("socks5 + https proxy tests", () => { } catch (e) { status = e.status; } - expect(status).toBe(1); + expect(status).toBe(PROXY_EXIT_CODE); }); } }); @@ -118,7 +121,7 @@ test("http proxy set, but not running, separate env vars", () => { } catch (e) { status = e.status; } - expect(status).toBe(1); + expect(status).toBe(PROXY_EXIT_CODE); }); test("http proxy set, but not running, cli arg", () => { @@ -129,12 +132,12 @@ test("http proxy set, but not running, cli arg", () => { } catch (e) { status = e.status; } - expect(status).toBe(1); + expect(status).toBe(PROXY_EXIT_CODE); }); test("ssh socks proxy with custom user", () => { - execSync(`docker run --rm --network=proxy-test-net -v $PWD/tests/fixtures/proxy-key:/keys/proxy-key webrecorder/browsertrix-crawler crawl --proxyServer ssh://user@ssh-proxy:2222 --sshProxyPrivateKeyFile /keys/proxy-key --url ${HTML} ${extraArgs}`, {encoding: "utf-8"}); + execSync(`docker run --rm --network=proxy-test-net -v 
$PWD/tests/fixtures/proxies/proxy-key:/keys/proxy-key webrecorder/browsertrix-crawler crawl --proxyServer ssh://user@ssh-proxy:2222 --sshProxyPrivateKeyFile /keys/proxy-key --url ${HTML} ${extraArgs}`, {encoding: "utf-8"}); }); @@ -146,7 +149,7 @@ test("ssh socks proxy, wrong user", () => { } catch (e) { status = e.status; } - expect(status).toBe(21); + expect(status).toBe(PROXY_EXIT_CODE); }); @@ -164,4 +167,30 @@ test("ensure logged proxy string does not include any credentials", () => { }); +test("proxy with config file, wrong auth or no match", () => { + let status = 0; + try { + execSync(`docker run --rm --network=proxy-test-net -v $PWD/tests/fixtures/proxies/:/proxies/ webrecorder/browsertrix-crawler crawl --proxyServerConfig /proxies/proxy-test-bad-auth.pac --url ${HTML} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(PROXY_EXIT_CODE); + // success, no match for PDF + execSync(`docker run --rm --network=proxy-test-net -v $PWD/tests/fixtures/proxies/:/proxies/ webrecorder/browsertrix-crawler crawl --proxyServerConfig /proxies/proxy-test-bad-auth.pac --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); +}); + + +test("proxy with config file, correct auth or no match", () => { + let status = 0; + try { + execSync(`docker run --rm --network=proxy-test-net -v $PWD/tests/fixtures/proxies/:/proxies/ webrecorder/browsertrix-crawler crawl --proxyServerConfig /proxies/proxy-test-good-auth.pac --url ${HTML} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(0); + + // success, no match for PDF + execSync(`docker run --rm --network=proxy-test-net -v $PWD/tests/fixtures/proxies/:/proxies/ webrecorder/browsertrix-crawler crawl --proxyServerConfig /proxies/proxy-test-good-auth.pac --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); + +});