proxy auth:

- support auth for SOCKS5 proxies (supported in Brave, though not in Chromium) but not for HTTP proxies (no browser supports HTTP proxy auth without an interactive prompt)
- tests: loop over socks5/http proxies and HTML/PDF pages instead of repeating each test by hand
This commit is contained in:
Ilya Kreymer 2024-06-05 22:02:55 -07:00
parent b6942786b1
commit 61628049cb
3 changed files with 89 additions and 36 deletions

View file

@ -26,12 +26,18 @@ export function initDispatcher() {
export function createDispatcher(): Dispatcher | undefined {
const proxyUrl = getProxy();
if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) {
return new ProxyAgent({ uri: proxyUrl });
// HTTP PROXY does not support auth, as it's not supported in the browser
// so must drop username/password for consistency
const url = new URL(proxyUrl);
url.username = "";
url.password = "";
return new ProxyAgent({ uri: url.href });
} else if (
proxyUrl.startsWith("socks://") ||
proxyUrl.startsWith("socks5://") ||
proxyUrl.startsWith("socks4://")
) {
// support auth as SOCKS5 auth *is* supported in Brave (though not in Chromium)
const url = new URL(proxyUrl);
const type: SocksProxyType = url.protocol === "socks4:" ? 4 : 5;
const params = {

View file

@ -3,7 +3,7 @@ import fs from "fs";
import path from "path";
import { WARCParser } from "warcio";
const PDF = "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf";
const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf";
test("ensure pdf is crawled", async () => {
child_process.execSync(

View file

@ -1,61 +1,108 @@
import child_process from "child_process";
// NOTE(review): `port` / `globalPort` and the two PROXY_IMAGE declarations look
// like removed/added line pairs from a diff rendered without +/- markers --
// confirm against the committed file which of each pair is current.
let port = 33080;
// Host port counter for the proxy containers; every use in this file increments it.
let globalPort = 33080;
const PROXY_IMAGE = "ghcr.io/tarampampam/3proxy:1.9.1"
// Docker image run as the test proxy.
const PROXY_IMAGE = "tarampampam/3proxy:1.9.1";
// Seed URLs passed to the crawler via --url for the PDF and HTML cases.
const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf";
const HTML = "https://webrecorder.net/";
// Crawler command-line flags appended to the crawl commands in the tests.
const extraArgs = "--limit 1 --failOnFailedSeed --timeout 10 --logging debug";
/**
 * Sends SIGINT to the given docker container via `docker kill`.
 * The call is synchronous: it returns once the docker CLI command has exited
 * and throws if that command exits with a non-zero status.
 *
 * @param id container id or name, interpolated into the command as-is
 */
function killContainer(id) {
  const killCmd = `docker kill -s SIGINT ${id}`;
  child_process.execSync(killCmd);
}
// Starts a detached (`-d --rm`) proxy container from PROXY_IMAGE and returns
// `{id, port}`: the text printed by `docker run` (used by callers as the
// container id) and the host port that was mapped to the proxy.
//   scheme - "socks5" maps the host port to container port 1080; any other
//            value maps it to 3128
//   user   - passed to the container as an env var (empty by default)
//   pass   - passed to the container as PROXY_PASSWORD (empty by default)
// NOTE(review): the two `docker run` statements below look like the removed
// and added versions of the same line from a diff (PROXY_USER + `port++` vs.
// PROXY_LOGIN + `globalPort++`) -- confirm which one is current.
function runSocksProxy(scheme, user="", pass="") {
const isSocks = scheme === "socks5";
const id = child_process.execSync(`docker run -d --rm -e PROXY_USER=${user} -e PROXY_PASSWORD=${pass} -p ${port++}:${isSocks ? "1080" : "3128"} ${PROXY_IMAGE}`, {encoding: "utf-8"});
// capture the port for this container before globalPort is incremented below
const port = globalPort;
const id = child_process.execSync(`docker run -e PROXY_LOGIN=${user} -e PROXY_PASSWORD=${pass} -d --rm -p ${globalPort++}:${isSocks ? "1080" : "3128"} ${PROXY_IMAGE}`, {encoding: "utf-8"});
return {id, port};
}
// Proxy integration tests. For each proxy scheme and page type a proxy
// container is started with runSocksProxy, the crawler image is run with
// PROXY_SERVER pointing at host.docker.internal on the mapped port, and the
// crawler's exit status is asserted.
// NOTE(review): this span looks like a rendered diff with removed and added
// lines interleaved (two `for` headers, tests using `mode`/`result` next to
// tests using `scheme`/`status`) -- confirm against the committed file before
// editing. The suite title says "https" while the loops cover "socks5" and
// "http"; presumably a typo, verify.
describe("socks5 + https proxy tests", () => {
for (const mode of ["socks5", "http"]) {
const scheme = mode;
for (const scheme of ["socks5", "http"]) {
for (const type of ["HTML page", "PDF"]) {
test(`${scheme} proxy, no auth`, async () => {
const {id, port} = runSocksProxy(mode);
const result = child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://host.docker.internal:${port} -d --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --logging debug`, {encoding: "utf-8"});
// seed URL for the current page type
const url = type === "PDF" ? PDF : HTML;
child_process.execSync(`docker kill -s SIGINT ${id}`);
test(`${scheme} proxy, ${type}, no auth`, () => {
const {id, port} = runSocksProxy(scheme);
let status = 0;
expect(!!result).toBe(true);
});
try {
child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
// execSync throws when the command exits non-zero; keep the exit code for the assertion
status = e.status;
} finally {
killContainer(id);
}
expect(status).toBe(0);
});
test(`${scheme} proxy, with auth`, async () => {
const {id, port} = runSocksProxy(mode, "user", "passw0rd");
const result = child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} -d --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --logging debug`, {encoding: "utf-8"});
test(`${scheme} proxy, ${type}, with auth`, () => {
const {id, port} = runSocksProxy(scheme, "user", "passw0rd");
let status = 0;
child_process.execSync(`docker kill -s SIGINT ${id}`);
try {
child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
} finally {
killContainer(id);
}
// auth supported only for SOCKS5
expect(status).toBe(scheme === "socks5" ? 0 : 1);
});
expect(!!result).toBe(true);
});
test(`${scheme} proxy, ${type}, wrong auth`, () => {
// proxy is started with password "passw1rd" while the crawler below is given "passw0rd"
const {id, port} = runSocksProxy(scheme, "user", "passw1rd");
let status = 0;
test(`${scheme} proxy, error, not running`, async () => {
try {
child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
} finally {
killContainer(id);
}
expect(status).toBe(1);
});
}
test(`${scheme} proxy, proxy missing error`, () => {
let status = 0;
try {
child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --failOnFailedSeed`, {encoding: "utf-8"});
// no proxy container is started in this test; ++globalPort advances the counter to a fresh port number
child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://host.docker.internal:${++globalPort} --rm webrecorder/browsertrix-crawler crawl --url ${HTML} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
expect(status).toBe(1);
});
test(`${scheme} proxy, error, wrong auth`, async () => {
const {id, port} = runSocksProxy(mode, "user", "passw1rd");
let status = 0;
try {
child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --failOnFailedSeed --timeout 10`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
expect(status).toBe(1);
child_process.execSync(`docker kill -s SIGINT ${id}`);
});
}
});
// Crawls the PDF through an unauthenticated HTTP proxy that is configured with
// the separate PROXY_HOST / PROXY_PORT env vars rather than PROXY_SERVER.
// There is no explicit expect(): execSync throws when the crawler exits
// non-zero, and that exception is what fails the test.
test("http proxy, PDF, separate env vars", () => {
  const proxy = runSocksProxy("http");

  try {
    const crawlCmd = `docker run -e PROXY_HOST=host.docker.internal -e PROXY_PORT=${proxy.port} --rm webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`;
    child_process.execSync(crawlCmd, {encoding: "utf-8"});
  } finally {
    // tear the proxy container down whether the crawl passed or threw
    killContainer(proxy.id);
  }
});
// Points the crawler at PROXY_HOST / PROXY_PORT without starting any proxy
// container, and expects the crawl to exit with status 1.
test("http proxy, error, not running, separate env vars", () => {
  // advance the shared counter first so the port number used here is a fresh one
  const unusedPort = ++globalPort;
  const crawlCmd = `docker run -e PROXY_HOST=host.docker.internal -e PROXY_PORT=${unusedPort} --rm webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`;
  let exitCode = 0;

  try {
    child_process.execSync(crawlCmd, {encoding: "utf-8"});
  } catch (err) {
    // execSync throws on a non-zero exit; the exit code is read from the thrown error
    exitCode = err.status;
  }

  expect(exitCode).toBe(1);
});