diff --git a/Dockerfile b/Dockerfile index 03c41b7d..2cfd228c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,11 +6,7 @@ FROM ${BROWSER_IMAGE_BASE} # needed to add args to main build stage ARG BROWSER_VERSION -ENV PROXY_HOST=localhost \ - PROXY_PORT=8080 \ - PROXY_CA_URL=http://wsgiprox/download/pem \ - PROXY_CA_FILE=/tmp/proxy-ca.pem \ - DISPLAY=:99 \ +ENV DISPLAY=:99 \ GEOMETRY=1360x1020x16 \ BROWSER_VERSION=${BROWSER_VERSION} \ BROWSER_BIN=google-chrome \ diff --git a/package.json b/package.json index f05071eb..a5ca8728 100644 --- a/package.json +++ b/package.json @@ -21,6 +21,7 @@ "@webrecorder/wabac": "^2.16.12", "browsertrix-behaviors": "^0.6.0", "crc": "^4.3.2", + "fetch-socks": "^1.3.0", "get-folder-size": "^4.0.0", "husky": "^8.0.3", "ioredis": "^5.3.2", @@ -34,6 +35,7 @@ "sax": "^1.3.0", "sharp": "^0.32.6", "tsc": "^2.0.4", + "undici": "^6.18.2", "uuid": "8.3.2", "warcio": "^2.2.1", "ws": "^7.4.4", diff --git a/src/crawler.ts b/src/crawler.ts index b1c3ac63..f8f6ea73 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -56,6 +56,7 @@ import { SitemapReader } from "./util/sitemapper.js"; import { ScopedSeed } from "./util/seeds.js"; import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js"; import { isHTMLContentType } from "./util/reqresp.js"; +import { initProxy } from "./util/proxy.js"; const behaviors = fs.readFileSync( new URL( @@ -170,6 +171,8 @@ export class Crawler { maxHeapUsed = 0; maxHeapTotal = 0; + proxyServer?: string; + driver!: (opts: { page: Page; data: PageState; @@ -443,6 +446,8 @@ export class Crawler { async bootstrap() { const subprocesses: ChildProcess[] = []; + this.proxyServer = initProxy(this.params.proxyServer); + subprocesses.push(this.launchRedis()); await fsp.mkdir(this.logDir, { recursive: true }); @@ -1303,7 +1308,7 @@ self.__bx_behaviors.selectMainBehavior(); emulateDevice: this.emulateDevice, swOpt: this.params.serviceWorker, chromeOptions: { - proxy: false, + proxy: this.proxyServer, userAgent: this.emulateDevice.userAgent, extraArgs: this.extraChromeArgs(), }, diff --git a/src/create-login-profile.ts b/src/create-login-profile.ts index 047ed017..2f19f0ea 100755 --- a/src/create-login-profile.ts +++ b/src/create-login-profile.ts @@ -99,9 +99,10 @@ function cliOpts(): { [key: string]: Options } { default: getDefaultWindowSize(), }, - proxy: { - type: "boolean", - default: false, + proxyServer: { + describe: + "if set, will use specified proxy server. Takes precedence over any env var proxy settings", + type: "string", }, cookieDays: { @@ -179,7 +180,7 @@ async function main() { headless: params.headless, signals: false, chromeOptions: { - proxy: false, + proxy: params.proxyServer, extraArgs: [ "--window-position=0,0", `--window-size=${params.windowSize}`, diff --git a/src/util/argParser.ts b/src/util/argParser.ts index 011c101b..1608f2c5 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -545,6 +545,12 @@ class ArgParser { default: "disabled", }, + proxyServer: { + describe: + "if set, will use specified proxy server. Takes precedence over any env var proxy settings", + type: "string", + }, + dryRun: { describe: "If true, no archive data is written to disk, only pages and logs (and optionally saved state).", diff --git a/src/util/blockrules.ts b/src/util/blockrules.ts index a0fa1ebe..3f258b33 100644 --- a/src/util/blockrules.ts +++ b/src/util/blockrules.ts @@ -4,6 +4,8 @@ import { logger, formatErr } from "./logger.js"; import { HTTPRequest, Page } from "puppeteer-core"; import { Browser } from "./browser.js"; +import { fetch } from "undici"; + const RULE_TYPES = ["block", "allowOnly"]; const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"]; diff --git a/src/util/browser.ts b/src/util/browser.ts index 65500cb4..73a74222 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -22,7 +22,7 @@ import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core"; import { Recorder } from "./recorder.js"; type BtrixChromeOpts = { - proxy?: boolean; + proxy?: string; userAgent?: string | null; extraArgs?: string[]; }; @@ -115,7 +115,6 @@ export class Browser { ? undefined : (target) => this.targetFilter(target), }; - await this._init(launchOpts, ondisconnect, recording); } @@ -217,7 +216,7 @@ export class Browser { } chromeArgs({ - proxy = true, + proxy = "", userAgent = null, extraArgs = [], }: BtrixChromeOpts) { @@ -236,11 +235,13 @@ export class Browser { ...extraArgs, ]; + if (proxy) { + logger.info("Using proxy", { proxy }, "browser"); + } + if (proxy) { args.push("--ignore-certificate-errors"); - args.push( - `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`, - ); + args.push(`--proxy-server=${proxy}`); } return args; diff --git a/src/util/proxy.ts b/src/util/proxy.ts new file mode 100644 index 00000000..c7fe2a85 --- /dev/null +++ b/src/util/proxy.ts @@ -0,0 +1,60 @@ +import { Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici"; + +import { socksDispatcher } from "fetch-socks"; +import type { SocksProxyType } from "socks/typings/common/constants.js"; + +export function getEnvProxyUrl() { + if (process.env.PROXY_SERVER) { + return process.env.PROXY_SERVER; + } + + // for backwards compatibility with 0.x proxy settings + if (process.env.PROXY_HOST && process.env.PROXY_PORT) { + return `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`; + } + + return ""; +} + +export function initProxy(proxy?: string): string { + if (!proxy) { + proxy = getEnvProxyUrl(); + } + if (proxy) { + const dispatcher = createDispatcher(proxy); + if (dispatcher) { + setGlobalDispatcher(dispatcher); + return proxy; + } + } + return ""; +} + +export function createDispatcher(proxyUrl: string): Dispatcher | undefined { + if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) { + // HTTP PROXY does not support auth, as it's not supported in the browser + // so must drop username/password for consistency + const url = new URL(proxyUrl); + url.username = ""; + url.password = ""; + return new ProxyAgent({ uri: url.href }); + } else if ( + proxyUrl.startsWith("socks://") || + proxyUrl.startsWith("socks5://") || + proxyUrl.startsWith("socks4://") + ) { + // support auth as SOCKS5 auth *is* supported in Brave (though not in Chromium) + const url = new URL(proxyUrl); + const type: SocksProxyType = url.protocol === "socks4:" ? 4 : 5; + const params = { + type, + host: url.hostname, + port: parseInt(url.port), + userId: url.username || undefined, + password: url.password || undefined, + }; + return socksDispatcher(params); + } else { + return undefined; + } +} diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 182571ea..1a0fd7b1 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -8,6 +8,8 @@ import { logger, formatErr } from "./logger.js"; import { sleep, timedRun, timestampNow } from "./timing.js"; import { RequestResponseInfo, isHTMLContentType } from "./reqresp.js"; +import { fetch, Response } from "undici"; + // @ts-expect-error TODO fill in why error is expected import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js"; import { diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index 1fc9fbe5..2052c143 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -4,6 +4,7 @@ import { getStatusText } from "@webrecorder/wabac/src/utils.js"; import { Protocol } from "puppeteer-core"; import { postToGetUrl } from "warcio"; import { HTML_TYPES } from "./constants.js"; +import { Response } from "undici"; const CONTENT_LENGTH = "content-length"; const CONTENT_TYPE = "content-type"; diff --git a/src/util/sitemapper.ts b/src/util/sitemapper.ts index 5d8507b5..e34a9bf1 100644 --- a/src/util/sitemapper.ts +++ b/src/util/sitemapper.ts @@ -9,6 +9,8 @@ import { logger, formatErr } from "./logger.js"; import { DETECT_SITEMAP } from "./constants.js"; import { sleep } from "./timing.js"; +import { fetch, Response } from "undici"; + const SITEMAP_CONCURRENCY = 5; const TEXT_CONTENT_TYPE = ["text/plain"]; @@ -237,7 +239,8 @@ export class SitemapReader extends EventEmitter { resp.headers.get("content-encoding") !== "gzip" ) { const ds = new DecompressionStream("gzip"); - stream = body.pipeThrough(ds); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + stream = body.pipeThrough(ds as any); } else { stream = body; } diff --git a/tests/proxy.test.js b/tests/proxy.test.js new file mode 100644 index 00000000..1c162620 --- /dev/null +++ b/tests/proxy.test.js @@ -0,0 +1,127 @@ +import { execSync, exec } from "child_process"; + +const sleep = (ms) => new Promise((res) => setTimeout(res, ms)); + +const PROXY_IMAGE = "tarampampam/3proxy:1.9.1"; +const SOCKS_PORT = "1080"; +const HTTP_PORT = "3128"; +const WRONG_PORT = "33130"; + +const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf"; +const HTML = "https://webrecorder.net/"; + +const extraArgs = "--limit 1 --failOnFailedSeed --timeout 10 --logging debug"; + +let proxyAuthId; +let proxyNoAuthId; + +beforeAll(() => { + execSync("docker network create proxy-test-net"); + + proxyAuthId = execSync(`docker run -e PROXY_LOGIN=user -e PROXY_PASSWORD=passw0rd -d --rm --network=proxy-test-net --name proxy-with-auth ${PROXY_IMAGE}`, {encoding: "utf-8"}); + + proxyNoAuthId = execSync(`docker run -d --rm --network=proxy-test-net --name proxy-no-auth ${PROXY_IMAGE}`, {encoding: "utf-8"}); +}); + +afterAll(async () => { + execSync(`docker kill -s SIGINT ${proxyAuthId}`); + execSync(`docker kill -s SIGINT ${proxyNoAuthId}`); + await sleep(3000); + execSync("docker network rm proxy-test-net"); +}); + +describe("socks5 + https proxy tests", () => { + for (const scheme of ["socks5", "http"]) { + const port = scheme === "socks5" ? SOCKS_PORT : HTTP_PORT; + + for (const type of ["HTML page", "PDF"]) { + + const url = type === "PDF" ? PDF : HTML; + + test(`${scheme} proxy, ${type}, no auth`, () => { + let status = 0; + + try { + execSync(`docker run -e PROXY_SERVER=${scheme}://proxy-no-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(0); + }); + + test(`${scheme} proxy, ${type}, with auth`, () => { + let status = 0; + + try { + execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@proxy-with-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + // auth supported only for SOCKS5 + expect(status).toBe(scheme === "socks5" ? 0 : 1); + }); + + test(`${scheme} proxy, ${type}, wrong auth`, () => { + let status = 0; + + try { + execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw1rd@proxy-with-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(1); + }); + + test(`${scheme} proxy, ${type}, wrong protocol`, () => { + let status = 0; + + try { + execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw1rd@proxy-with-auth:${scheme === "socks5" ? HTTP_PORT : SOCKS_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(1); + }); + } + + test(`${scheme} proxy, proxy missing error`, () => { + let status = 0; + + try { + execSync(`docker run -e PROXY_SERVER=${scheme}://proxy-no-auth:${WRONG_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${HTML} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(1); + }); + } +}); + + +test("http proxy, PDF, separate env vars", () => { + execSync(`docker run -e PROXY_HOST=proxy-no-auth -e PROXY_PORT=${HTTP_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); +}); + +test("http proxy set, but not running, separate env vars", () => { + let status = 0; + + try { + execSync(`docker run -e PROXY_HOST=proxy-no-auth -e PROXY_PORT=${WRONG_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(1); +}); + +test("http proxy set, but not running, cli arg", () => { + let status = 0; + + try { + execSync(`docker run --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --proxyServer http://proxy-no-auth:${WRONG_PORT} --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(1); +}); + + diff --git a/yarn.lock b/yarn.lock index 07506669..097c0363 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2386,6 +2386,14 @@ fd-slicer@~1.1.0: dependencies: pend "~1.2.0" +fetch-socks@^1.3.0: + version "1.3.0" + resolved "https://registry.yarnpkg.com/fetch-socks/-/fetch-socks-1.3.0.tgz#1f07b26924b5e7370aa23fd6e9332a5863736d1b" + integrity sha512-Cq7O53hoNiVeOs6u54f8M/H/w2yzhmnTQ3tcAJj9FNKYOeNGmt8qNU1zpWOzJD09f0uqfmBXxLbzWPsnT6GcRw== + dependencies: + socks "^2.8.1" + undici "^6.10.1" + file-entry-cache@^6.0.1: version "6.0.1" resolved "https://registry.yarnpkg.com/file-entry-cache/-/file-entry-cache-6.0.1.tgz#211b2dd9659cb0394b073e7323ac3c933d522027" @@ -2778,6 +2786,14 @@ ioredis@^5.3.2: redis-parser "^3.0.0" standard-as-callback "^2.1.0" +ip-address@^9.0.5: + version "9.0.5" + resolved "https://registry.yarnpkg.com/ip-address/-/ip-address-9.0.5.tgz#117a960819b08780c3bd1f14ef3c1cc1d3f3ea5a" + integrity sha512-zHtQzGojZXTwZTHQqra+ETKd4Sn3vgi7uBmlPoXVWZqYvuKmtI0l/VZTjqGmJY9x88GGOaZ9+G9ES8hC4T4X8g== + dependencies: + jsbn "1.1.0" + sprintf-js "^1.1.3" + ip@^1.1.8: version "1.1.8" resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.8.tgz#ae05948f6b075435ed3307acce04629da8cdbf48" @@ -3427,6 +3443,11 @@ js-yaml@^4.1.0: dependencies: argparse "^2.0.1" +jsbn@1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/jsbn/-/jsbn-1.1.0.tgz#b01307cb29b618a1ed26ec79e911f803c4da0040" + integrity sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A== + jsesc@^2.5.1: version "2.5.2" resolved "https://registry.yarnpkg.com/jsesc/-/jsesc-2.5.2.tgz#80564d2e483dacf6e8ef209650a67df3f0c283a4" @@ -4437,6 +4458,14 @@ socks@^2.7.1: ip "^2.0.0" smart-buffer "^4.2.0" +socks@^2.8.1: + version "2.8.3" + resolved "https://registry.yarnpkg.com/socks/-/socks-2.8.3.tgz#1ebd0f09c52ba95a09750afe3f3f9f724a800cb5" + integrity sha512-l5x7VUUWbjVFbafGLxPWkYsHIhEvmF85tbIeFZWc8ZPtoMyybuEhL7Jye/ooC4/d48FgOjSJXgsF/AJPYCW8Zw== + dependencies: + ip-address "^9.0.5" + smart-buffer "^4.2.0" + source-map-support@0.5.13: version "0.5.13" resolved "https://registry.yarnpkg.com/source-map-support/-/source-map-support-0.5.13.tgz#31b24a9c2e73c2de85066c0feb7d44767ed52932" @@ -4455,6 +4484,11 @@ split-on-first@^1.0.0: resolved "https://registry.yarnpkg.com/split-on-first/-/split-on-first-1.1.0.tgz#f610afeee3b12bce1d0c30425e76398b78249a5f" integrity sha512-43ZssAJaMusuKWL8sKUBQXHWOpq8d6CfN/u1p4gUzfJkM05C8rxTmYrkIPTXapZpORA6LkkzcUulJ8FqA7Uudw== +sprintf-js@^1.1.3: + version "1.1.3" + resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.1.3.tgz#4914b903a2f8b685d17fdf78a70e917e872e444a" + integrity sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA== + sprintf-js@~1.0.2: version "1.0.3" resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.0.3.tgz#04e6926f662895354f3dd015203633b857297e2c" @@ -4842,6 +4876,11 @@ undici-types@~5.25.1: resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-5.25.3.tgz#e044115914c85f0bcbb229f346ab739f064998c3" integrity sha512-Ga1jfYwRn7+cP9v8auvEXN1rX3sWqlayd4HP7OKk4mZWylEmu3KzXDUGrQUN6Ol7qo1gPvB2e5gX6udnyEPgdA== +undici@^6.10.1, undici@^6.18.2: + version "6.18.2" + resolved "https://registry.yarnpkg.com/undici/-/undici-6.18.2.tgz#f662a5dc33cf654fc412a9912e5a07b138d75c97" + integrity sha512-o/MQLTwRm9IVhOqhZ0NQ9oXax1ygPjw6Vs+Vq/4QRjbOAC3B1GCHy7TYxxbExKlb7bzDRzt9vBWU6BDz0RFfYg== + unique-string@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/unique-string/-/unique-string-3.0.0.tgz#84a1c377aff5fd7a8bc6b55d8244b2bd90d75b9a"