proxy: support setting proxy via --proxyServer, PROXY_SERVER env var or PROXY_HOST + PROXY_PORT env vars (#589)

fixes #587 

The proxy env vars PROXY_HOST and PROXY_PORT were being ignored, as they
were hardcoded to obsolete values in the Dockerfile.

Proxy settings can now be set, in order of precedence, via:
- --proxyServer cli flag
- PROXY_SERVER env var
- PROXY_HOST and PROXY_PORT env vars, which set an HTTP proxy server
only (for backwards compatibility with 0.12.x); examples of each are shown below
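
For example, each form can be passed to the crawler container as follows (the proxy hostnames, ports, and crawl URL here are illustrative placeholders):

# cli flag, takes precedence over env vars
docker run webrecorder/browsertrix-crawler crawl --url https://example.com/ --proxyServer http://myproxy:3128

# PROXY_SERVER env var
docker run -e PROXY_SERVER=socks5://myproxy:1080 webrecorder/browsertrix-crawler crawl --url https://example.com/

# legacy PROXY_HOST + PROXY_PORT env vars (HTTP proxy only)
docker run -e PROXY_HOST=myproxy -e PROXY_PORT=3128 webrecorder/browsertrix-crawler crawl --url https://example.com/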

The --proxyServer / PROXY_SERVER setting is passed to the browser via
the --proxy-server flag.
The AsyncFetcher / direct fetch path also supports HTTP and SOCKS5 proxying (via an undici ProxyAgent or fetch-socks dispatcher).
Supported proxies are: HTTP without auth, SOCKS5 without auth, and SOCKS5 with auth
(SOCKS5 auth is supported in Brave, but not in Chrome!)
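
For example, a SOCKS5 proxy with auth could be specified like this (credentials and hostname are placeholders):

docker run -e PROXY_SERVER=socks5://user:passw0rd@myproxy:1080 webrecorder/browsertrix-crawler crawl --url https://example.com/

For HTTP proxies, any username/password in the proxy URL is dropped by the direct fetch dispatcher, since HTTP proxy auth is not supported in the browser.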

---------
Co-authored-by: Vinzenz Sinapius <Vinzenz.Sinapius@gmail.com>
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer 2024-06-10 13:11:00 -07:00 committed by GitHub
parent b83d1c58da
commit e2b4cc1844
13 changed files with 262 additions and 17 deletions

Dockerfile

@ -6,11 +6,7 @@ FROM ${BROWSER_IMAGE_BASE}
# needed to add args to main build stage
ARG BROWSER_VERSION
ENV PROXY_HOST=localhost \
PROXY_PORT=8080 \
PROXY_CA_URL=http://wsgiprox/download/pem \
PROXY_CA_FILE=/tmp/proxy-ca.pem \
DISPLAY=:99 \
ENV DISPLAY=:99 \
GEOMETRY=1360x1020x16 \
BROWSER_VERSION=${BROWSER_VERSION} \
BROWSER_BIN=google-chrome \

package.json

@ -21,6 +21,7 @@
"@webrecorder/wabac": "^2.16.12",
"browsertrix-behaviors": "^0.6.0",
"crc": "^4.3.2",
"fetch-socks": "^1.3.0",
"get-folder-size": "^4.0.0",
"husky": "^8.0.3",
"ioredis": "^5.3.2",
@ -34,6 +35,7 @@
"sax": "^1.3.0",
"sharp": "^0.32.6",
"tsc": "^2.0.4",
"undici": "^6.18.2",
"uuid": "8.3.2",
"warcio": "^2.2.1",
"ws": "^7.4.4",

src/crawler.ts

@ -56,6 +56,7 @@ import { SitemapReader } from "./util/sitemapper.js";
import { ScopedSeed } from "./util/seeds.js";
import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
import { isHTMLContentType } from "./util/reqresp.js";
import { initProxy } from "./util/proxy.js";
const behaviors = fs.readFileSync(
new URL(
@ -170,6 +171,8 @@ export class Crawler {
maxHeapUsed = 0;
maxHeapTotal = 0;
proxyServer?: string;
driver!: (opts: {
page: Page;
data: PageState;
@ -443,6 +446,8 @@ export class Crawler {
async bootstrap() {
const subprocesses: ChildProcess[] = [];
this.proxyServer = initProxy(this.params.proxyServer);
subprocesses.push(this.launchRedis());
await fsp.mkdir(this.logDir, { recursive: true });
@ -1303,7 +1308,7 @@ self.__bx_behaviors.selectMainBehavior();
emulateDevice: this.emulateDevice,
swOpt: this.params.serviceWorker,
chromeOptions: {
proxy: false,
proxy: this.proxyServer,
userAgent: this.emulateDevice.userAgent,
extraArgs: this.extraChromeArgs(),
},

src/create-login-profile.ts

@ -99,9 +99,10 @@ function cliOpts(): { [key: string]: Options } {
default: getDefaultWindowSize(),
},
proxy: {
type: "boolean",
default: false,
proxyServer: {
describe:
"if set, will use specified proxy server. Takes precedence over any env var proxy settings",
type: "string",
},
cookieDays: {
@ -179,7 +180,7 @@ async function main() {
headless: params.headless,
signals: false,
chromeOptions: {
proxy: false,
proxy: params.proxyServer,
extraArgs: [
"--window-position=0,0",
`--window-size=${params.windowSize}`,

src/util/argParser.ts

@ -545,6 +545,12 @@ class ArgParser {
default: "disabled",
},
proxyServer: {
describe:
"if set, will use specified proxy server. Takes precedence over any env var proxy settings",
type: "string",
},
dryRun: {
describe:
"If true, no archive data is written to disk, only pages and logs (and optionally saved state).",

src/util/blockrules.ts

@ -4,6 +4,8 @@ import { logger, formatErr } from "./logger.js";
import { HTTPRequest, Page } from "puppeteer-core";
import { Browser } from "./browser.js";
import { fetch } from "undici";
const RULE_TYPES = ["block", "allowOnly"];
const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];

src/util/browser.ts

@ -22,7 +22,7 @@ import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
import { Recorder } from "./recorder.js";
type BtrixChromeOpts = {
proxy?: boolean;
proxy?: string;
userAgent?: string | null;
extraArgs?: string[];
};
@ -115,7 +115,6 @@ export class Browser {
? undefined
: (target) => this.targetFilter(target),
};
await this._init(launchOpts, ondisconnect, recording);
}
@ -217,7 +216,7 @@ export class Browser {
}
chromeArgs({
proxy = true,
proxy = "",
userAgent = null,
extraArgs = [],
}: BtrixChromeOpts) {
@ -236,11 +235,13 @@ export class Browser {
...extraArgs,
];
if (proxy) {
logger.info("Using proxy", { proxy }, "browser");
}
if (proxy) {
args.push("--ignore-certificate-errors");
args.push(
`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
);
args.push(`--proxy-server=${proxy}`);
}
return args;

src/util/proxy.ts (new file, 60 lines)

@ -0,0 +1,60 @@
import { Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici";
import { socksDispatcher } from "fetch-socks";
import type { SocksProxyType } from "socks/typings/common/constants.js";
export function getEnvProxyUrl() {
if (process.env.PROXY_SERVER) {
return process.env.PROXY_SERVER;
}
// for backwards compatibility with 0.x proxy settings
if (process.env.PROXY_HOST && process.env.PROXY_PORT) {
return `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`;
}
return "";
}
export function initProxy(proxy?: string): string {
if (!proxy) {
proxy = getEnvProxyUrl();
}
if (proxy) {
const dispatcher = createDispatcher(proxy);
if (dispatcher) {
setGlobalDispatcher(dispatcher);
return proxy;
}
}
return "";
}
export function createDispatcher(proxyUrl: string): Dispatcher | undefined {
if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) {
// HTTP PROXY does not support auth, as it's not supported in the browser
// so must drop username/password for consistency
const url = new URL(proxyUrl);
url.username = "";
url.password = "";
return new ProxyAgent({ uri: url.href });
} else if (
proxyUrl.startsWith("socks://") ||
proxyUrl.startsWith("socks5://") ||
proxyUrl.startsWith("socks4://")
) {
// support auth as SOCKS5 auth *is* supported in Brave (though not in Chromium)
const url = new URL(proxyUrl);
const type: SocksProxyType = url.protocol === "socks4:" ? 4 : 5;
const params = {
type,
host: url.hostname,
port: parseInt(url.port),
userId: url.username || undefined,
password: url.password || undefined,
};
return socksDispatcher(params);
} else {
return undefined;
}
}

src/util/recorder.ts

@ -8,6 +8,8 @@ import { logger, formatErr } from "./logger.js";
import { sleep, timedRun, timestampNow } from "./timing.js";
import { RequestResponseInfo, isHTMLContentType } from "./reqresp.js";
import { fetch, Response } from "undici";
// @ts-expect-error TODO fill in why error is expected
import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js";
import {

src/util/reqresp.ts

@ -4,6 +4,7 @@ import { getStatusText } from "@webrecorder/wabac/src/utils.js";
import { Protocol } from "puppeteer-core";
import { postToGetUrl } from "warcio";
import { HTML_TYPES } from "./constants.js";
import { Response } from "undici";
const CONTENT_LENGTH = "content-length";
const CONTENT_TYPE = "content-type";

src/util/sitemapper.ts

@ -9,6 +9,8 @@ import { logger, formatErr } from "./logger.js";
import { DETECT_SITEMAP } from "./constants.js";
import { sleep } from "./timing.js";
import { fetch, Response } from "undici";
const SITEMAP_CONCURRENCY = 5;
const TEXT_CONTENT_TYPE = ["text/plain"];
@ -237,7 +239,8 @@ export class SitemapReader extends EventEmitter {
resp.headers.get("content-encoding") !== "gzip"
) {
const ds = new DecompressionStream("gzip");
stream = body.pipeThrough(ds);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
stream = body.pipeThrough(ds as any);
} else {
stream = body;
}

tests/proxy.test.js (new file, 127 lines)

@ -0,0 +1,127 @@
import { execSync, exec } from "child_process";
const sleep = (ms) => new Promise((res) => setTimeout(res, ms));
const PROXY_IMAGE = "tarampampam/3proxy:1.9.1";
const SOCKS_PORT = "1080";
const HTTP_PORT = "3128";
const WRONG_PORT = "33130";
const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf";
const HTML = "https://webrecorder.net/";
const extraArgs = "--limit 1 --failOnFailedSeed --timeout 10 --logging debug";
let proxyAuthId;
let proxyNoAuthId;
beforeAll(() => {
execSync("docker network create proxy-test-net");
proxyAuthId = execSync(`docker run -e PROXY_LOGIN=user -e PROXY_PASSWORD=passw0rd -d --rm --network=proxy-test-net --name proxy-with-auth ${PROXY_IMAGE}`, {encoding: "utf-8"});
proxyNoAuthId = execSync(`docker run -d --rm --network=proxy-test-net --name proxy-no-auth ${PROXY_IMAGE}`, {encoding: "utf-8"});
});
afterAll(async () => {
execSync(`docker kill -s SIGINT ${proxyAuthId}`);
execSync(`docker kill -s SIGINT ${proxyNoAuthId}`);
await sleep(3000);
execSync("docker network rm proxy-test-net");
});
describe("socks5 + https proxy tests", () => {
for (const scheme of ["socks5", "http"]) {
const port = scheme === "socks5" ? SOCKS_PORT : HTTP_PORT;
for (const type of ["HTML page", "PDF"]) {
const url = type === "PDF" ? PDF : HTML;
test(`${scheme} proxy, ${type}, no auth`, () => {
let status = 0;
try {
execSync(`docker run -e PROXY_SERVER=${scheme}://proxy-no-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
expect(status).toBe(0);
});
test(`${scheme} proxy, ${type}, with auth`, () => {
let status = 0;
try {
execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@proxy-with-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
// auth supported only for SOCKS5
expect(status).toBe(scheme === "socks5" ? 0 : 1);
});
test(`${scheme} proxy, ${type}, wrong auth`, () => {
let status = 0;
try {
execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw1rd@proxy-with-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
expect(status).toBe(1);
});
test(`${scheme} proxy, ${type}, wrong protocol`, () => {
let status = 0;
try {
execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw1rd@proxy-with-auth:${scheme === "socks5" ? HTTP_PORT : SOCKS_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
expect(status).toBe(1);
});
}
test(`${scheme} proxy, proxy missing error`, () => {
let status = 0;
try {
execSync(`docker run -e PROXY_SERVER=${scheme}://proxy-no-auth:${WRONG_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${HTML} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
expect(status).toBe(1);
});
}
});
test("http proxy, PDF, separate env vars", () => {
execSync(`docker run -e PROXY_HOST=proxy-no-auth -e PROXY_PORT=${HTTP_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"});
});
test("http proxy set, but not running, separate env vars", () => {
let status = 0;
try {
execSync(`docker run -e PROXY_HOST=proxy-no-auth -e PROXY_PORT=${WRONG_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
expect(status).toBe(1);
});
test("http proxy set, but not running, cli arg", () => {
let status = 0;
try {
execSync(`docker run --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --proxyServer http://proxy-no-auth:${WRONG_PORT} --url ${PDF} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
expect(status).toBe(1);
});

yarn.lock

@ -2386,6 +2386,14 @@ fd-slicer@~1.1.0:
dependencies:
pend "~1.2.0"
fetch-socks@^1.3.0:
version "1.3.0"
resolved "https://registry.yarnpkg.com/fetch-socks/-/fetch-socks-1.3.0.tgz#1f07b26924b5e7370aa23fd6e9332a5863736d1b"
integrity sha512-Cq7O53hoNiVeOs6u54f8M/H/w2yzhmnTQ3tcAJj9FNKYOeNGmt8qNU1zpWOzJD09f0uqfmBXxLbzWPsnT6GcRw==
dependencies:
socks "^2.8.1"
undici "^6.10.1"
file-entry-cache@^6.0.1:
version "6.0.1"
resolved "https://registry.yarnpkg.com/file-entry-cache/-/file-entry-cache-6.0.1.tgz#211b2dd9659cb0394b073e7323ac3c933d522027"
@ -2778,6 +2786,14 @@ ioredis@^5.3.2:
redis-parser "^3.0.0"
standard-as-callback "^2.1.0"
ip-address@^9.0.5:
version "9.0.5"
resolved "https://registry.yarnpkg.com/ip-address/-/ip-address-9.0.5.tgz#117a960819b08780c3bd1f14ef3c1cc1d3f3ea5a"
integrity sha512-zHtQzGojZXTwZTHQqra+ETKd4Sn3vgi7uBmlPoXVWZqYvuKmtI0l/VZTjqGmJY9x88GGOaZ9+G9ES8hC4T4X8g==
dependencies:
jsbn "1.1.0"
sprintf-js "^1.1.3"
ip@^1.1.8:
version "1.1.8"
resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.8.tgz#ae05948f6b075435ed3307acce04629da8cdbf48"
@ -3427,6 +3443,11 @@ js-yaml@^4.1.0:
dependencies:
argparse "^2.0.1"
jsbn@1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/jsbn/-/jsbn-1.1.0.tgz#b01307cb29b618a1ed26ec79e911f803c4da0040"
integrity sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A==
jsesc@^2.5.1:
version "2.5.2"
resolved "https://registry.yarnpkg.com/jsesc/-/jsesc-2.5.2.tgz#80564d2e483dacf6e8ef209650a67df3f0c283a4"
@ -4437,6 +4458,14 @@ socks@^2.7.1:
ip "^2.0.0"
smart-buffer "^4.2.0"
socks@^2.8.1:
version "2.8.3"
resolved "https://registry.yarnpkg.com/socks/-/socks-2.8.3.tgz#1ebd0f09c52ba95a09750afe3f3f9f724a800cb5"
integrity sha512-l5x7VUUWbjVFbafGLxPWkYsHIhEvmF85tbIeFZWc8ZPtoMyybuEhL7Jye/ooC4/d48FgOjSJXgsF/AJPYCW8Zw==
dependencies:
ip-address "^9.0.5"
smart-buffer "^4.2.0"
source-map-support@0.5.13:
version "0.5.13"
resolved "https://registry.yarnpkg.com/source-map-support/-/source-map-support-0.5.13.tgz#31b24a9c2e73c2de85066c0feb7d44767ed52932"
@ -4455,6 +4484,11 @@ split-on-first@^1.0.0:
resolved "https://registry.yarnpkg.com/split-on-first/-/split-on-first-1.1.0.tgz#f610afeee3b12bce1d0c30425e76398b78249a5f"
integrity sha512-43ZssAJaMusuKWL8sKUBQXHWOpq8d6CfN/u1p4gUzfJkM05C8rxTmYrkIPTXapZpORA6LkkzcUulJ8FqA7Uudw==
sprintf-js@^1.1.3:
version "1.1.3"
resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.1.3.tgz#4914b903a2f8b685d17fdf78a70e917e872e444a"
integrity sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==
sprintf-js@~1.0.2:
version "1.0.3"
resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.0.3.tgz#04e6926f662895354f3dd015203633b857297e2c"
@ -4842,6 +4876,11 @@ undici-types@~5.25.1:
resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-5.25.3.tgz#e044115914c85f0bcbb229f346ab739f064998c3"
integrity sha512-Ga1jfYwRn7+cP9v8auvEXN1rX3sWqlayd4HP7OKk4mZWylEmu3KzXDUGrQUN6Ol7qo1gPvB2e5gX6udnyEPgdA==
undici@^6.10.1, undici@^6.18.2:
version "6.18.2"
resolved "https://registry.yarnpkg.com/undici/-/undici-6.18.2.tgz#f662a5dc33cf654fc412a9912e5a07b138d75c97"
integrity sha512-o/MQLTwRm9IVhOqhZ0NQ9oXax1ygPjw6Vs+Vq/4QRjbOAC3B1GCHy7TYxxbExKlb7bzDRzt9vBWU6BDz0RFfYg==
unique-string@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/unique-string/-/unique-string-3.0.0.tgz#84a1c377aff5fd7a8bc6b55d8244b2bd90d75b9a"