mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
proxy auth:
- support proxy auth for SOCKS5 (supported in Brave, though not in Chromium) but not for HTTP proxies (no browser supports HTTP proxy auth without an interactive prompt) - tests: update tests to check socks5/http and html/pdf in a loop
This commit is contained in:
parent
b6942786b1
commit
61628049cb
3 changed files with 89 additions and 36 deletions
|
@ -26,12 +26,18 @@ export function initDispatcher() {
|
|||
export function createDispatcher(): Dispatcher | undefined {
|
||||
const proxyUrl = getProxy();
|
||||
if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) {
|
||||
return new ProxyAgent({ uri: proxyUrl });
|
||||
// HTTP PROXY does not support auth, as it's not supported in the browser
|
||||
// so must drop username/password for consistency
|
||||
const url = new URL(proxyUrl);
|
||||
url.username = "";
|
||||
url.password = "";
|
||||
return new ProxyAgent({ uri: url.href });
|
||||
} else if (
|
||||
proxyUrl.startsWith("socks://") ||
|
||||
proxyUrl.startsWith("socks5://") ||
|
||||
proxyUrl.startsWith("socks4://")
|
||||
) {
|
||||
// support auth as SOCKS5 auth *is* supported in Brave (though not in Chromium)
|
||||
const url = new URL(proxyUrl);
|
||||
const type: SocksProxyType = url.protocol === "socks4:" ? 4 : 5;
|
||||
const params = {
|
||||
|
|
|
@ -3,7 +3,7 @@ import fs from "fs";
|
|||
import path from "path";
|
||||
import { WARCParser } from "warcio";
|
||||
|
||||
const PDF = "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf";
|
||||
const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf";
|
||||
|
||||
test("ensure pdf is crawled", async () => {
|
||||
child_process.execSync(
|
||||
|
|
|
@ -1,61 +1,108 @@
|
|||
import child_process from "child_process";
|
||||
|
||||
let port = 33080;
|
||||
let globalPort = 33080;
|
||||
|
||||
const PROXY_IMAGE = "ghcr.io/tarampampam/3proxy:1.9.1"
|
||||
const PROXY_IMAGE = "tarampampam/3proxy:1.9.1";
|
||||
|
||||
const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf";
|
||||
const HTML = "https://webrecorder.net/";
|
||||
|
||||
const extraArgs = "--limit 1 --failOnFailedSeed --timeout 10 --logging debug";
|
||||
|
||||
function killContainer(id) {
|
||||
child_process.execSync(`docker kill -s SIGINT ${id}`);
|
||||
}
|
||||
|
||||
function runSocksProxy(scheme, user="", pass="") {
|
||||
const isSocks = scheme === "socks5";
|
||||
const id = child_process.execSync(`docker run -d --rm -e PROXY_USER=${user} -e PROXY_PASSWORD=${pass} -p ${port++}:${isSocks ? "1080" : "3128"} ${PROXY_IMAGE}`, {encoding: "utf-8"});
|
||||
const port = globalPort;
|
||||
const id = child_process.execSync(`docker run -e PROXY_LOGIN=${user} -e PROXY_PASSWORD=${pass} -d --rm -p ${globalPort++}:${isSocks ? "1080" : "3128"} ${PROXY_IMAGE}`, {encoding: "utf-8"});
|
||||
return {id, port};
|
||||
}
|
||||
|
||||
describe("socks5 + https proxy tests", () => {
|
||||
for (const mode of ["socks5", "http"]) {
|
||||
const scheme = mode;
|
||||
for (const scheme of ["socks5", "http"]) {
|
||||
for (const type of ["HTML page", "PDF"]) {
|
||||
|
||||
test(`${scheme} proxy, no auth`, async () => {
|
||||
const {id, port} = runSocksProxy(mode);
|
||||
const result = child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://host.docker.internal:${port} -d --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --logging debug`, {encoding: "utf-8"});
|
||||
const url = type === "PDF" ? PDF : HTML;
|
||||
|
||||
child_process.execSync(`docker kill -s SIGINT ${id}`);
|
||||
|
||||
expect(!!result).toBe(true);
|
||||
});
|
||||
|
||||
test(`${scheme} proxy, with auth`, async () => {
|
||||
const {id, port} = runSocksProxy(mode, "user", "passw0rd");
|
||||
const result = child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} -d --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --logging debug`, {encoding: "utf-8"});
|
||||
|
||||
child_process.execSync(`docker kill -s SIGINT ${id}`);
|
||||
|
||||
expect(!!result).toBe(true);
|
||||
});
|
||||
|
||||
test(`${scheme} proxy, error, not running`, async () => {
|
||||
test(`${scheme} proxy, ${type}, no auth`, () => {
|
||||
const {id, port} = runSocksProxy(scheme);
|
||||
let status = 0;
|
||||
|
||||
try {
|
||||
child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --failOnFailedSeed`, {encoding: "utf-8"});
|
||||
child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
|
||||
} catch (e) {
|
||||
status = e.status;
|
||||
} finally {
|
||||
killContainer(id);
|
||||
}
|
||||
expect(status).toBe(0);
|
||||
});
|
||||
|
||||
test(`${scheme} proxy, ${type}, with auth`, () => {
|
||||
const {id, port} = runSocksProxy(scheme, "user", "passw0rd");
|
||||
let status = 0;
|
||||
|
||||
try {
|
||||
child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
|
||||
} catch (e) {
|
||||
status = e.status;
|
||||
} finally {
|
||||
killContainer(id);
|
||||
}
|
||||
// auth supported only for SOCKS5
|
||||
expect(status).toBe(scheme === "socks5" ? 0 : 1);
|
||||
});
|
||||
|
||||
test(`${scheme} proxy, ${type}, wrong auth`, () => {
|
||||
const {id, port} = runSocksProxy(scheme, "user", "passw1rd");
|
||||
let status = 0;
|
||||
|
||||
try {
|
||||
child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
|
||||
} catch (e) {
|
||||
status = e.status;
|
||||
} finally {
|
||||
killContainer(id);
|
||||
}
|
||||
expect(status).toBe(1);
|
||||
});
|
||||
}
|
||||
|
||||
test(`${scheme} proxy, proxy missing error`, () => {
|
||||
let status = 0;
|
||||
|
||||
try {
|
||||
child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://host.docker.internal:${++globalPort} --rm webrecorder/browsertrix-crawler crawl --url ${HTML} ${extraArgs}`, {encoding: "utf-8"});
|
||||
} catch (e) {
|
||||
status = e.status;
|
||||
}
|
||||
expect(status).toBe(1);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
test("http proxy, PDF, separate env vars", () => {
|
||||
const {id, port} = runSocksProxy("http");
|
||||
|
||||
try {
|
||||
child_process.execSync(`docker run -e PROXY_HOST=host.docker.internal -e PROXY_PORT=${port} --rm webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"});
|
||||
} finally {
|
||||
killContainer(id);
|
||||
}
|
||||
});
|
||||
|
||||
test("http proxy, error, not running, separate env vars", () => {
|
||||
let status = 0;
|
||||
|
||||
try {
|
||||
child_process.execSync(`docker run -e PROXY_HOST=host.docker.internal -e PROXY_PORT=${++globalPort} --rm webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"});
|
||||
} catch (e) {
|
||||
status = e.status;
|
||||
}
|
||||
expect(status).toBe(1);
|
||||
});
|
||||
|
||||
test(`${scheme} proxy, error, wrong auth`, async () => {
|
||||
const {id, port} = runSocksProxy(mode, "user", "passw1rd");
|
||||
|
||||
let status = 0;
|
||||
|
||||
try {
|
||||
child_process.execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@host.docker.internal:${port} --rm webrecorder/browsertrix-crawler crawl --url https://example.com/ --limit 1 --failOnFailedSeed --timeout 10`, {encoding: "utf-8"});
|
||||
} catch (e) {
|
||||
status = e.status;
|
||||
}
|
||||
expect(status).toBe(1);
|
||||
|
||||
child_process.execSync(`docker kill -s SIGINT ${id}`);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue