mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Support host-specific proxies with proxy config YAML (#837)
- Adds support for YAML-based config for multiple proxies, containing 'matchHosts' section by regex and 'proxies' declaration, allowing matching any number of hosts to any number of named proxies. - Specified via --proxyServerConfig option passed to both crawl and profile creation commands. - Implemented internally by generating a proxy PAC script which does regex matching and running browser with the specified proxy PAC script served by an internal http server. - Also support matching different undici Agents by regex, for using different proxies with direct fetching - Precedence: --proxyServerConfig takes precedence over --proxyServer / PROXY_SERVER, unless --proxyServerPreferSingleProxy is also provided - Updated proxies doc section with example - Updated tests with sample bad and good auth examples of proxy config Fixes #836 --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
parent
a6ad6a0e42
commit
a42c0b926e
19 changed files with 424 additions and 68 deletions
|
@ -103,16 +103,16 @@ Options:
|
||||||
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
|
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
|
||||||
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
|
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
|
||||||
orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
|
orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
|
||||||
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy"] [default:
|
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"]
|
||||||
[]]
|
[default: []]
|
||||||
--logExcludeContext Comma-separated list of contexts to
|
--logExcludeContext Comma-separated list of contexts to
|
||||||
NOT include in logs
|
NOT include in logs
|
||||||
[array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
|
[array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
|
||||||
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
|
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
|
||||||
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
|
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
|
||||||
orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
|
orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
|
||||||
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy"] [default:
|
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"]
|
||||||
["recorderNetwork","jsError","screencast"]]
|
[default: ["recorderNetwork","jsError","screencast"]]
|
||||||
--text Extract initial (default) or final t
|
--text Extract initial (default) or final t
|
||||||
ext to pages.jsonl or WARC resource
|
ext to pages.jsonl or WARC resource
|
||||||
record(s)
|
record(s)
|
||||||
|
@ -294,6 +294,13 @@ Options:
|
||||||
--proxyServer if set, will use specified proxy ser
|
--proxyServer if set, will use specified proxy ser
|
||||||
ver. Takes precedence over any env v
|
ver. Takes precedence over any env v
|
||||||
ar proxy settings [string]
|
ar proxy settings [string]
|
||||||
|
--proxyServerPreferSingleProxy if set, and both proxyServer and pro
|
||||||
|
xyServerConfig are provided, the pro
|
||||||
|
xyServer value will be preferred
|
||||||
|
[boolean] [default: false]
|
||||||
|
--proxyServerConfig if set, path to yaml/json file that
|
||||||
|
configures multiple path servers per
|
||||||
|
URL regex [string]
|
||||||
--dryRun If true, no archive data is written
|
--dryRun If true, no archive data is written
|
||||||
to disk, only pages and logs (and op
|
to disk, only pages and logs (and op
|
||||||
tionally saved state). [boolean]
|
tionally saved state). [boolean]
|
||||||
|
@ -343,6 +350,8 @@ Options:
|
||||||
[number] [default: 7]
|
[number] [default: 7]
|
||||||
--proxyServer if set, will use specified proxy server. Takes prece
|
--proxyServer if set, will use specified proxy server. Takes prece
|
||||||
dence over any env var proxy settings [string]
|
dence over any env var proxy settings [string]
|
||||||
|
--proxyServerConfig if set, path to yaml/json file that configures multi
|
||||||
|
ple path servers per URL regex [string]
|
||||||
--sshProxyPrivateKeyFile path to SSH private key for SOCKS5 over SSH proxy co
|
--sshProxyPrivateKeyFile path to SSH private key for SOCKS5 over SSH proxy co
|
||||||
nnection [string]
|
nnection [string]
|
||||||
--sshProxyKnownHostsFile path to SSH known hosts file for SOCKS5 over SSH pro
|
--sshProxyKnownHostsFile path to SSH known hosts file for SOCKS5 over SSH pro
|
||||||
|
|
|
@ -80,7 +80,55 @@ The above proxy settings also apply to [Browser Profile Creation](browser-profil
|
||||||
docker run -p 6080:6080 -p 9223:9223 -v $PWD/crawls/profiles:/crawls/profiles -v $PWD/my-proxy-private-key:/tmp/private-key -v $PWD/known_hosts:/tmp/known_hosts webrecorder/browsertrix-crawler create-login-profile --url https://example.com/ --proxyServer ssh://user@path-to-ssh-host.example.com --sshProxyPrivateKeyFile /tmp/private-key --sshProxyKnownHostsFile /tmp/known_hosts
|
docker run -p 6080:6080 -p 9223:9223 -v $PWD/crawls/profiles:/crawls/profiles -v $PWD/my-proxy-private-key:/tmp/private-key -v $PWD/known_hosts:/tmp/known_hosts webrecorder/browsertrix-crawler create-login-profile --url https://example.com/ --proxyServer ssh://user@path-to-ssh-host.example.com --sshProxyPrivateKeyFile /tmp/private-key --sshProxyKnownHostsFile /tmp/known_hosts
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Host-Specific Proxies
|
||||||
|
|
||||||
|
With the 1.8.0 release, the crawler also supports running with multiple proxies, defined in a separate proxy YAML config file. The file contains a `matchHosts` section, which maps URL regexes to named proxies, and a `proxies` section, which defines each named proxy.
|
||||||
|
|
||||||
|
For example, the following YAML file can be passed to the `--proxyServerConfig` option:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
matchHosts:
|
||||||
|
# load all URLs from example.com through 'example-1-proxy'
|
||||||
|
example.com/.*: example-1-proxy
|
||||||
|
|
||||||
|
# load all URLs from https://my-social.example.com/.*/posts/ through
|
||||||
|
# a different proxy
|
||||||
|
https://my-social.example.com/.*/posts/: social-proxy
|
||||||
|
|
||||||
|
# optional default proxy
|
||||||
|
"": default-proxy
|
||||||
|
|
||||||
|
proxies:
|
||||||
|
# SOCKS5 proxy just needs a URL
|
||||||
|
example-1-proxy: socks5://username:password@my-socks-5-proxy.example.com
|
||||||
|
|
||||||
|
# SSH proxy should also have at least a 'privateKeyFile'
|
||||||
|
social-proxy:
|
||||||
|
url: ssh://user@my-social-proxy.example.com
|
||||||
|
privateKeyFile: /proxies/social-proxy-private-key
|
||||||
|
# optional
|
||||||
|
publicHostsFile: /proxies/social-proxy-public-hosts
|
||||||
|
|
||||||
|
default-proxy:
|
||||||
|
url: ssh://user@my-social-proxy.example.com
|
||||||
|
privateKeyFile: /proxies/default-proxy-private-key
|
||||||
|
```
|
||||||
|
|
||||||
|
If the above config is stored in `./proxies/proxyConfig.yaml` along with the SSH private keys and known public hosts
|
||||||
|
files, the crawler can be started with:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
docker run -v $PWD/crawls:/crawls -v $PWD/proxies:/proxies -it webrecorder/browsertrix-crawler --url https://example.com/ --proxyServerConfig /proxies/proxyConfig.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that if SSH proxies are provided, an SSH tunnel must be opened for each one before the crawl starts.
|
||||||
|
The crawl will not start if any of the SSH proxy connections fail, even if a host-specific proxy is not actually used.
|
||||||
|
SOCKS5 and HTTP proxy connections are attempted only on first use.
|
||||||
|
|
||||||
|
The same `--proxyServerConfig` option can also be used in browser profile creation with the `create-login-profile` command in the same way.
|
||||||
|
|
||||||
|
### Proxy Precedence
|
||||||
|
|
||||||
|
If both `--proxyServerConfig` and `--proxyServer`/`PROXY_SERVER` env var are specified, the `--proxyServerConfig` option takes precedence on matching hosts. To have the single `--proxyServer` option always take precedence instead, pass the `--proxyServerPreferSingleProxy` option.
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "browsertrix-crawler",
|
"name": "browsertrix-crawler",
|
||||||
"version": "1.7.0",
|
"version": "1.8.0-beta.0",
|
||||||
"main": "browsertrix-crawler",
|
"main": "browsertrix-crawler",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||||
|
|
|
@ -186,6 +186,7 @@ export class Crawler {
|
||||||
maxHeapTotal = 0;
|
maxHeapTotal = 0;
|
||||||
|
|
||||||
proxyServer?: string;
|
proxyServer?: string;
|
||||||
|
proxyPacUrl?: string;
|
||||||
|
|
||||||
driver:
|
driver:
|
||||||
| ((opts: {
|
| ((opts: {
|
||||||
|
@ -508,7 +509,9 @@ export class Crawler {
|
||||||
setWARCInfo(this.infoString, this.params.warcInfo);
|
setWARCInfo(this.infoString, this.params.warcInfo);
|
||||||
logger.info(this.infoString);
|
logger.info(this.infoString);
|
||||||
|
|
||||||
this.proxyServer = await initProxy(this.params, RUN_DETACHED);
|
const res = await initProxy(this.params, RUN_DETACHED);
|
||||||
|
this.proxyServer = res.proxyServer;
|
||||||
|
this.proxyPacUrl = res.proxyPacUrl;
|
||||||
|
|
||||||
this.seeds = await parseSeeds(this.params);
|
this.seeds = await parseSeeds(this.params);
|
||||||
this.numOriginalSeeds = this.seeds.length;
|
this.numOriginalSeeds = this.seeds.length;
|
||||||
|
@ -1276,7 +1279,7 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async pageFinished(data: PageState) {
|
async pageFinished(data: PageState, lastErrorText = "") {
|
||||||
// if page loaded, considered page finished successfully
|
// if page loaded, considered page finished successfully
|
||||||
// (even if behaviors timed out)
|
// (even if behaviors timed out)
|
||||||
const { loadState, logDetails, depth, url, pageSkipped } = data;
|
const { loadState, logDetails, depth, url, pageSkipped } = data;
|
||||||
|
@ -1311,11 +1314,28 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
await this.serializeConfig();
|
await this.serializeConfig();
|
||||||
|
|
||||||
if (depth === 0 && this.params.failOnFailedSeed) {
|
if (depth === 0 && this.params.failOnFailedSeed) {
|
||||||
|
let errorCode = ExitCodes.GenericError;
|
||||||
|
|
||||||
|
switch (lastErrorText) {
|
||||||
|
case "net::ERR_SOCKS_CONNECTION_FAILED":
|
||||||
|
case "net::SOCKS_CONNECTION_HOST_UNREACHABLE":
|
||||||
|
case "net::ERR_PROXY_CONNECTION_FAILED":
|
||||||
|
case "net::ERR_TUNNEL_CONNECTION_FAILED":
|
||||||
|
errorCode = ExitCodes.ProxyError;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case "net::ERR_TIMED_OUT":
|
||||||
|
case "net::ERR_INVALID_AUTH_CREDENTIALS":
|
||||||
|
if (this.proxyServer || this.proxyPacUrl) {
|
||||||
|
errorCode = ExitCodes.ProxyError;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
logger.fatal(
|
logger.fatal(
|
||||||
"Seed Page Load Failed, failing crawl",
|
"Seed Page Load Failed, failing crawl",
|
||||||
{},
|
{},
|
||||||
"general",
|
"general",
|
||||||
ExitCodes.GenericError,
|
errorCode,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1703,7 +1723,8 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
emulateDevice: this.emulateDevice,
|
emulateDevice: this.emulateDevice,
|
||||||
swOpt: this.params.serviceWorker,
|
swOpt: this.params.serviceWorker,
|
||||||
chromeOptions: {
|
chromeOptions: {
|
||||||
proxy: this.proxyServer,
|
proxyServer: this.proxyServer,
|
||||||
|
proxyPacUrl: this.proxyPacUrl,
|
||||||
userAgent: this.emulateDevice.userAgent,
|
userAgent: this.emulateDevice.userAgent,
|
||||||
extraArgs: this.extraChromeArgs(),
|
extraArgs: this.extraChromeArgs(),
|
||||||
},
|
},
|
||||||
|
|
|
@ -16,7 +16,7 @@ import { initStorage } from "./util/storage.js";
|
||||||
import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
|
import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
|
||||||
import { getInfoString } from "./util/file_reader.js";
|
import { getInfoString } from "./util/file_reader.js";
|
||||||
import { DISPLAY, ExitCodes } from "./util/constants.js";
|
import { DISPLAY, ExitCodes } from "./util/constants.js";
|
||||||
import { initProxy } from "./util/proxy.js";
|
import { initProxy, loadProxyConfig } from "./util/proxy.js";
|
||||||
//import { sleep } from "./util/timing.js";
|
//import { sleep } from "./util/timing.js";
|
||||||
|
|
||||||
const profileHTML = fs.readFileSync(
|
const profileHTML = fs.readFileSync(
|
||||||
|
@ -123,6 +123,12 @@ function initArgs() {
|
||||||
type: "string",
|
type: "string",
|
||||||
},
|
},
|
||||||
|
|
||||||
|
proxyServerConfig: {
|
||||||
|
describe:
|
||||||
|
"if set, path to yaml/json file that configures multiple path servers per URL regex",
|
||||||
|
type: "string",
|
||||||
|
},
|
||||||
|
|
||||||
sshProxyPrivateKeyFile: {
|
sshProxyPrivateKeyFile: {
|
||||||
describe:
|
describe:
|
||||||
"path to SSH private key for SOCKS5 over SSH proxy connection",
|
"path to SSH private key for SOCKS5 over SSH proxy connection",
|
||||||
|
@ -161,7 +167,9 @@ async function main() {
|
||||||
|
|
||||||
process.on("SIGTERM", () => handleTerminate("SIGTERM"));
|
process.on("SIGTERM", () => handleTerminate("SIGTERM"));
|
||||||
|
|
||||||
const proxyServer = await initProxy(params, false);
|
loadProxyConfig(params);
|
||||||
|
|
||||||
|
const { proxyServer, proxyPacUrl } = await initProxy(params, false);
|
||||||
|
|
||||||
if (!params.headless) {
|
if (!params.headless) {
|
||||||
logger.debug("Launching XVFB");
|
logger.debug("Launching XVFB");
|
||||||
|
@ -203,7 +211,8 @@ async function main() {
|
||||||
headless: params.headless,
|
headless: params.headless,
|
||||||
signals: false,
|
signals: false,
|
||||||
chromeOptions: {
|
chromeOptions: {
|
||||||
proxy: proxyServer,
|
proxyServer,
|
||||||
|
proxyPacUrl,
|
||||||
extraArgs: [
|
extraArgs: [
|
||||||
"--window-position=0,0",
|
"--window-position=0,0",
|
||||||
`--window-size=${params.windowSize}`,
|
`--window-size=${params.windowSize}`,
|
||||||
|
|
|
@ -29,6 +29,7 @@ import {
|
||||||
logger,
|
logger,
|
||||||
} from "./logger.js";
|
} from "./logger.js";
|
||||||
import { SaveState } from "./state.js";
|
import { SaveState } from "./state.js";
|
||||||
|
import { loadProxyConfig } from "./proxy.js";
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
export type CrawlerArgs = ReturnType<typeof parseArgs> & {
|
export type CrawlerArgs = ReturnType<typeof parseArgs> & {
|
||||||
|
@ -641,6 +642,19 @@ class ArgParser {
|
||||||
type: "string",
|
type: "string",
|
||||||
},
|
},
|
||||||
|
|
||||||
|
proxyServerPreferSingleProxy: {
|
||||||
|
describe:
|
||||||
|
"if set, and both proxyServer and proxyServerConfig are provided, the proxyServer value will be preferred",
|
||||||
|
type: "boolean",
|
||||||
|
default: false,
|
||||||
|
},
|
||||||
|
|
||||||
|
proxyServerConfig: {
|
||||||
|
describe:
|
||||||
|
"if set, path to yaml/json file that configures multiple path servers per URL regex",
|
||||||
|
type: "string",
|
||||||
|
},
|
||||||
|
|
||||||
dryRun: {
|
dryRun: {
|
||||||
describe:
|
describe:
|
||||||
"If true, no archive data is written to disk, only pages and logs (and optionally saved state).",
|
"If true, no archive data is written to disk, only pages and logs (and optionally saved state).",
|
||||||
|
@ -778,6 +792,8 @@ class ArgParser {
|
||||||
argv.emulateDevice = { viewport: null };
|
argv.emulateDevice = { viewport: null };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
loadProxyConfig(argv);
|
||||||
|
|
||||||
if (argv.lang) {
|
if (argv.lang) {
|
||||||
if (!ISO6391.validate(argv.lang)) {
|
if (!ISO6391.validate(argv.lang)) {
|
||||||
logger.fatal("Invalid ISO-639-1 country code for --lang: " + argv.lang);
|
logger.fatal("Invalid ISO-639-1 country code for --lang: " + argv.lang);
|
||||||
|
|
|
@ -272,7 +272,9 @@ export class BlockRules {
|
||||||
logDetails: Record<string, any>,
|
logDetails: Record<string, any>,
|
||||||
) {
|
) {
|
||||||
try {
|
try {
|
||||||
const res = await fetch(reqUrl, { dispatcher: getProxyDispatcher() });
|
const res = await fetch(reqUrl, {
|
||||||
|
dispatcher: getProxyDispatcher(reqUrl),
|
||||||
|
});
|
||||||
const text = await res.text();
|
const text = await res.text();
|
||||||
|
|
||||||
return !!text.match(frameTextMatch);
|
return !!text.match(frameTextMatch);
|
||||||
|
@ -303,7 +305,7 @@ export class BlockRules {
|
||||||
method: "PUT",
|
method: "PUT",
|
||||||
headers: { "Content-Type": "text/html" },
|
headers: { "Content-Type": "text/html" },
|
||||||
body,
|
body,
|
||||||
dispatcher: getProxyDispatcher(),
|
dispatcher: getProxyDispatcher(putUrl.href),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,7 +29,8 @@ import { timedRun } from "./timing.js";
|
||||||
import assert from "node:assert";
|
import assert from "node:assert";
|
||||||
|
|
||||||
type BtrixChromeOpts = {
|
type BtrixChromeOpts = {
|
||||||
proxy?: string;
|
proxyServer?: string;
|
||||||
|
proxyPacUrl?: string;
|
||||||
userAgent?: string | null;
|
userAgent?: string | null;
|
||||||
extraArgs?: string[];
|
extraArgs?: string[];
|
||||||
};
|
};
|
||||||
|
@ -243,7 +244,8 @@ export class Browser {
|
||||||
}
|
}
|
||||||
|
|
||||||
chromeArgs({
|
chromeArgs({
|
||||||
proxy = "",
|
proxyServer = "",
|
||||||
|
proxyPacUrl = "",
|
||||||
userAgent = null,
|
userAgent = null,
|
||||||
extraArgs = [],
|
extraArgs = [],
|
||||||
}: BtrixChromeOpts) {
|
}: BtrixChromeOpts) {
|
||||||
|
@ -262,14 +264,14 @@ export class Browser {
|
||||||
...extraArgs,
|
...extraArgs,
|
||||||
];
|
];
|
||||||
|
|
||||||
if (proxy) {
|
if (proxyServer) {
|
||||||
const proxyString = getSafeProxyString(proxy);
|
const proxyString = getSafeProxyString(proxyServer);
|
||||||
logger.info("Using proxy", { proxy: proxyString }, "browser");
|
logger.info("Using proxy", { proxy: proxyString }, "browser");
|
||||||
}
|
|
||||||
|
|
||||||
if (proxy) {
|
|
||||||
args.push("--ignore-certificate-errors");
|
args.push("--ignore-certificate-errors");
|
||||||
args.push(`--proxy-server=${proxy}`);
|
args.push(`--proxy-server=${proxyServer}`);
|
||||||
|
} else if (proxyPacUrl) {
|
||||||
|
args.push("--proxy-pac-url=" + proxyPacUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
return args;
|
return args;
|
||||||
|
|
|
@ -41,7 +41,7 @@ async function writeUrlContentsToFile(
|
||||||
pathPrefix: string,
|
pathPrefix: string,
|
||||||
pathDefaultExt: string,
|
pathDefaultExt: string,
|
||||||
) {
|
) {
|
||||||
const res = await fetch(url, { dispatcher: getProxyDispatcher() });
|
const res = await fetch(url, { dispatcher: getProxyDispatcher(url) });
|
||||||
const fileContents = await res.text();
|
const fileContents = await res.text();
|
||||||
|
|
||||||
const filename =
|
const filename =
|
||||||
|
|
|
@ -48,7 +48,7 @@ export class OriginOverride {
|
||||||
|
|
||||||
const resp = await fetch(newUrl, {
|
const resp = await fetch(newUrl, {
|
||||||
headers,
|
headers,
|
||||||
dispatcher: getProxyDispatcher(),
|
dispatcher: getProxyDispatcher(newUrl),
|
||||||
});
|
});
|
||||||
|
|
||||||
const body = Buffer.from(await resp.arrayBuffer());
|
const body = Buffer.from(await resp.arrayBuffer());
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
import net from "net";
|
import net from "net";
|
||||||
import { Agent, Dispatcher, ProxyAgent } from "undici";
|
|
||||||
|
|
||||||
import child_process from "child_process";
|
import child_process from "child_process";
|
||||||
|
import fs from "fs";
|
||||||
|
|
||||||
|
import { Agent, Dispatcher, ProxyAgent } from "undici";
|
||||||
|
import yaml from "js-yaml";
|
||||||
|
|
||||||
import { logger } from "./logger.js";
|
import { logger } from "./logger.js";
|
||||||
|
|
||||||
|
@ -9,11 +11,40 @@ import { socksDispatcher } from "fetch-socks";
|
||||||
import type { SocksProxyType } from "socks/typings/common/constants.js";
|
import type { SocksProxyType } from "socks/typings/common/constants.js";
|
||||||
import { ExitCodes, FETCH_HEADERS_TIMEOUT_SECS } from "./constants.js";
|
import { ExitCodes, FETCH_HEADERS_TIMEOUT_SECS } from "./constants.js";
|
||||||
|
|
||||||
|
import http, { IncomingMessage, ServerResponse } from "http";
|
||||||
|
|
||||||
const SSH_PROXY_LOCAL_PORT = 9722;
|
const SSH_PROXY_LOCAL_PORT = 9722;
|
||||||
|
|
||||||
const SSH_WAIT_TIMEOUT = 30000;
|
const SSH_WAIT_TIMEOUT = 30000;
|
||||||
|
|
||||||
let proxyDispatcher: Dispatcher | undefined = undefined;
|
//let proxyDispatcher: Dispatcher | undefined = undefined;
|
||||||
|
|
||||||
|
type ProxyEntry = {
|
||||||
|
proxyUrl: string;
|
||||||
|
dispatcher: Dispatcher;
|
||||||
|
};
|
||||||
|
|
||||||
|
export type ProxyServerConfig = {
|
||||||
|
matchHosts?: Record<string, string>;
|
||||||
|
proxies?: Record<
|
||||||
|
string,
|
||||||
|
string | { url: string; privateKeyFile?: string; publicHostsFile?: string }
|
||||||
|
>;
|
||||||
|
};
|
||||||
|
|
||||||
|
export type ProxyCLIArgs = {
|
||||||
|
sshProxyPrivateKeyFile?: string;
|
||||||
|
sshProxyKnownHostsFile?: string;
|
||||||
|
sshProxyLocalPort?: number;
|
||||||
|
|
||||||
|
proxyServer?: string;
|
||||||
|
proxyServerPreferSingleProxy?: boolean;
|
||||||
|
|
||||||
|
proxyMap?: ProxyServerConfig;
|
||||||
|
};
|
||||||
|
|
||||||
|
const proxyMap = new Map<RegExp, ProxyEntry>();
|
||||||
|
let defaultProxyEntry: ProxyEntry | null = null;
|
||||||
|
|
||||||
export function getEnvProxyUrl() {
|
export function getEnvProxyUrl() {
|
||||||
if (process.env.PROXY_SERVER) {
|
if (process.env.PROXY_SERVER) {
|
||||||
|
@ -28,6 +59,27 @@ export function getEnvProxyUrl() {
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export function loadProxyConfig(params: {
|
||||||
|
proxyServerConfig?: string;
|
||||||
|
proxyMap?: ProxyServerConfig;
|
||||||
|
}) {
|
||||||
|
if (params.proxyServerConfig) {
|
||||||
|
const proxyServerConfig = params.proxyServerConfig;
|
||||||
|
try {
|
||||||
|
const proxies = yaml.load(
|
||||||
|
fs.readFileSync(proxyServerConfig, "utf8"),
|
||||||
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
) as any;
|
||||||
|
params.proxyMap = proxies;
|
||||||
|
logger.debug("Proxy host match config loaded", { proxyServerConfig });
|
||||||
|
} catch (e) {
|
||||||
|
logger.warn("Proxy host match config file not found, ignoring", {
|
||||||
|
proxyServerConfig,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
export function getSafeProxyString(proxyString: string): string {
|
export function getSafeProxyString(proxyString: string): string {
|
||||||
if (!proxyString) {
|
if (!proxyString) {
|
||||||
return "";
|
return "";
|
||||||
|
@ -54,31 +106,127 @@ export function getSafeProxyString(proxyString: string): string {
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function initProxy(
|
export async function initProxy(
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
params: ProxyCLIArgs,
|
||||||
params: Record<string, any>,
|
|
||||||
detached: boolean,
|
detached: boolean,
|
||||||
): Promise<string | undefined> {
|
): Promise<{ proxyServer?: string; proxyPacUrl?: string }> {
|
||||||
let proxy = params.proxyServer;
|
const { sshProxyPrivateKeyFile, sshProxyKnownHostsFile, sshProxyLocalPort } =
|
||||||
|
params;
|
||||||
|
let localPort = sshProxyLocalPort || SSH_PROXY_LOCAL_PORT;
|
||||||
|
|
||||||
if (!proxy) {
|
const singleProxy = params.proxyServer || getEnvProxyUrl();
|
||||||
proxy = getEnvProxyUrl();
|
|
||||||
|
if (singleProxy) {
|
||||||
|
defaultProxyEntry = await initSingleProxy(
|
||||||
|
singleProxy,
|
||||||
|
localPort++,
|
||||||
|
detached,
|
||||||
|
sshProxyPrivateKeyFile,
|
||||||
|
sshProxyKnownHostsFile,
|
||||||
|
);
|
||||||
|
if (params.proxyServerPreferSingleProxy && defaultProxyEntry.proxyUrl) {
|
||||||
|
return { proxyServer: defaultProxyEntry.proxyUrl };
|
||||||
}
|
}
|
||||||
if (proxy && proxy.startsWith("ssh://")) {
|
}
|
||||||
proxy = await runSSHD(params, detached);
|
|
||||||
|
if (!params.proxyMap?.matchHosts || !params.proxyMap?.proxies) {
|
||||||
|
if (defaultProxyEntry) {
|
||||||
|
logger.debug("Using Single Proxy", {}, "proxy");
|
||||||
|
}
|
||||||
|
return { proxyServer: defaultProxyEntry?.proxyUrl };
|
||||||
|
}
|
||||||
|
|
||||||
|
const nameToProxy = new Map<string, ProxyEntry>();
|
||||||
|
|
||||||
|
for (const [name, value] of Object.entries(params.proxyMap.proxies)) {
|
||||||
|
let proxyUrl = "";
|
||||||
|
let privateKeyFile: string | undefined = "";
|
||||||
|
let publicHostsFile: string | undefined = "";
|
||||||
|
|
||||||
|
if (typeof value === "string") {
|
||||||
|
proxyUrl = value;
|
||||||
|
} else {
|
||||||
|
proxyUrl = value.url;
|
||||||
|
privateKeyFile = value.privateKeyFile;
|
||||||
|
publicHostsFile = value.publicHostsFile;
|
||||||
|
}
|
||||||
|
|
||||||
|
privateKeyFile = privateKeyFile || sshProxyPrivateKeyFile;
|
||||||
|
publicHostsFile = publicHostsFile || sshProxyKnownHostsFile;
|
||||||
|
|
||||||
|
logger.debug("Initing proxy", {
|
||||||
|
url: getSafeProxyString(proxyUrl),
|
||||||
|
localPort,
|
||||||
|
privateKeyFile,
|
||||||
|
publicHostsFile,
|
||||||
|
});
|
||||||
|
|
||||||
|
const entry = await initSingleProxy(
|
||||||
|
proxyUrl,
|
||||||
|
localPort++,
|
||||||
|
detached,
|
||||||
|
privateKeyFile,
|
||||||
|
publicHostsFile,
|
||||||
|
);
|
||||||
|
|
||||||
|
nameToProxy.set(name, entry);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const [rx, name] of Object.entries(params.proxyMap.matchHosts)) {
|
||||||
|
const entry = nameToProxy.get(name);
|
||||||
|
|
||||||
|
if (!entry) {
|
||||||
|
logger.fatal("Proxy specified but not found in proxies list: " + name);
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rx) {
|
||||||
|
proxyMap.set(new RegExp(rx), entry);
|
||||||
|
} else {
|
||||||
|
defaultProxyEntry = entry;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const p = new ProxyPacServer();
|
||||||
|
|
||||||
|
logger.debug("Using Proxy PAC script", {}, "proxy");
|
||||||
|
|
||||||
|
return { proxyPacUrl: `http://localhost:${p.port}/proxy.pac` };
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function initSingleProxy(
|
||||||
|
proxyUrl: string,
|
||||||
|
localPort: number,
|
||||||
|
detached: boolean,
|
||||||
|
sshProxyPrivateKeyFile?: string,
|
||||||
|
sshProxyKnownHostsFile?: string,
|
||||||
|
): Promise<{ proxyUrl: string; dispatcher: Dispatcher }> {
|
||||||
|
if (proxyUrl && proxyUrl.startsWith("ssh://")) {
|
||||||
|
proxyUrl = await runSSHD(
|
||||||
|
proxyUrl,
|
||||||
|
localPort,
|
||||||
|
detached,
|
||||||
|
sshProxyPrivateKeyFile,
|
||||||
|
sshProxyKnownHostsFile,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const agentOpts: Agent.Options = {
|
const agentOpts: Agent.Options = {
|
||||||
headersTimeout: FETCH_HEADERS_TIMEOUT_SECS * 1000,
|
headersTimeout: FETCH_HEADERS_TIMEOUT_SECS * 1000,
|
||||||
};
|
};
|
||||||
|
|
||||||
// set global fetch() dispatcher (with proxy, if any)
|
const dispatcher = createDispatcher(proxyUrl, agentOpts);
|
||||||
const dispatcher = createDispatcher(proxy, agentOpts);
|
return { proxyUrl, dispatcher };
|
||||||
proxyDispatcher = dispatcher;
|
|
||||||
return proxy;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export function getProxyDispatcher() {
|
export function getProxyDispatcher(url: string) {
|
||||||
return proxyDispatcher;
|
// find url match by regex first
|
||||||
|
for (const [rx, { dispatcher }] of proxyMap.entries()) {
|
||||||
|
if (rx && url.match(rx)) {
|
||||||
|
return dispatcher;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// if default proxy set, return default dispatcher, otherwise no dispatcher
|
||||||
|
return defaultProxyEntry ? defaultProxyEntry.dispatcher : undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function createDispatcher(
|
export function createDispatcher(
|
||||||
|
@ -113,9 +261,13 @@ export function createDispatcher(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
export async function runSSHD(
|
||||||
export async function runSSHD(params: Record<string, any>, detached: boolean) {
|
proxyServer: string,
|
||||||
const { proxyServer } = params;
|
localPort: number,
|
||||||
|
detached: boolean,
|
||||||
|
privateKey?: string,
|
||||||
|
publicKnownHost?: string,
|
||||||
|
) {
|
||||||
if (!proxyServer || !proxyServer.startsWith("ssh://")) {
|
if (!proxyServer || !proxyServer.startsWith("ssh://")) {
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
@ -126,17 +278,14 @@ export async function runSSHD(params: Record<string, any>, detached: boolean) {
|
||||||
const host = proxyServerUrl.hostname.replace("[", "").replace("]", "");
|
const host = proxyServerUrl.hostname.replace("[", "").replace("]", "");
|
||||||
const port = proxyServerUrl.port || 22;
|
const port = proxyServerUrl.port || 22;
|
||||||
const user = proxyServerUrl.username || "root";
|
const user = proxyServerUrl.username || "root";
|
||||||
const localPort = params.sshProxyLocalPort || SSH_PROXY_LOCAL_PORT;
|
|
||||||
const proxyString = `socks5://localhost:${localPort}`;
|
const proxyString = `socks5://localhost:${localPort}`;
|
||||||
|
|
||||||
const args: string[] = [
|
const args: string[] = [
|
||||||
user + "@" + host,
|
user + "@" + host,
|
||||||
"-p",
|
"-p",
|
||||||
port,
|
port + "",
|
||||||
"-D",
|
"-D",
|
||||||
localPort,
|
localPort + "",
|
||||||
"-i",
|
|
||||||
params.sshProxyPrivateKeyFile,
|
|
||||||
"-o",
|
"-o",
|
||||||
"IdentitiesOnly=yes",
|
"IdentitiesOnly=yes",
|
||||||
"-o",
|
"-o",
|
||||||
|
@ -146,12 +295,17 @@ export async function runSSHD(params: Record<string, any>, detached: boolean) {
|
||||||
"-o",
|
"-o",
|
||||||
];
|
];
|
||||||
|
|
||||||
if (params.sshProxyKnownHostsFile) {
|
if (publicKnownHost) {
|
||||||
args.push(`UserKnownHostsFile=${params.sshProxyKnownHostsFile}`);
|
args.push(`UserKnownHostsFile=${publicKnownHost}`);
|
||||||
} else {
|
} else {
|
||||||
args.push("StrictHostKeyChecking=no");
|
args.push("StrictHostKeyChecking=no");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (privateKey) {
|
||||||
|
args.push("-i");
|
||||||
|
args.push(privateKey);
|
||||||
|
}
|
||||||
|
|
||||||
args.push("-M", "0", "-N", "-T");
|
args.push("-M", "0", "-N", "-T");
|
||||||
|
|
||||||
logger.info("Checking SSH connection for proxy...", {}, "proxy");
|
logger.info("Checking SSH connection for proxy...", {}, "proxy");
|
||||||
|
@ -221,7 +375,7 @@ export async function runSSHD(params: Record<string, any>, detached: boolean) {
|
||||||
"proxy",
|
"proxy",
|
||||||
ExitCodes.ProxyError,
|
ExitCodes.ProxyError,
|
||||||
);
|
);
|
||||||
return;
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
|
@ -241,10 +395,61 @@ export async function runSSHD(params: Record<string, any>, detached: boolean) {
|
||||||
},
|
},
|
||||||
"proxy",
|
"proxy",
|
||||||
);
|
);
|
||||||
runSSHD(params, detached).catch((e) =>
|
runSSHD(
|
||||||
logger.error("proxy retry error", e, "proxy"),
|
proxyServer,
|
||||||
);
|
localPort,
|
||||||
|
detached,
|
||||||
|
privateKey,
|
||||||
|
publicKnownHost,
|
||||||
|
).catch((e) => logger.error("proxy retry error", e, "proxy"));
|
||||||
});
|
});
|
||||||
|
|
||||||
return proxyString;
|
return proxyString;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Small HTTP server that serves a generated proxy auto-config (PAC) script.
 *
 * The script text is built once, during construction, from the module-level
 * `proxyMap` and `defaultProxyEntry`. Every incoming request, whatever its
 * path or method, receives that same text.
 */
class ProxyPacServer {
  // TCP port the PAC server listens on.
  port = 20278;

  // Complete text of the generated PAC script returned to clients.
  proxyPacText = "";

  /**
   * Creates the HTTP server, starts listening on `this.port` and generates
   * the PAC script text.
   */
  constructor() {
    const httpServer = http.createServer((req, res) =>
      this.handleRequest(req, res),
    );
    httpServer.listen(this.port);
    this.generateProxyPac();
  }

  /**
   * Answers any request with the current PAC script.
   *
   * @param request incoming request (not inspected)
   * @param response response that receives a 200 status, the PAC content
   *   type and the script text
   */
  async handleRequest(request: IncomingMessage, response: ServerResponse) {
    response.writeHead(200, {
      "Content-Type": "application/x-ns-proxy-autoconfig",
    });
    response.end(this.proxyPacText);
  }

  /**
   * Rebuilds `this.proxyPacText`: one `if (url.match(/.../))` rule per
   * `proxyMap` entry, followed by a fallback to `defaultProxyEntry` if set,
   * otherwise `"DIRECT"`.
   */
  generateProxyPac() {
    // Converts a proxy URL such as socks5://user:pass@host:1080 into a quoted
    // PAC return value such as "SOCKS5 user:pass@host:1080".
    const urlToProxy = (proxyUrl: string) => {
      const url = new URL(proxyUrl);
      const type = url.protocol.slice(0, -1).toUpperCase();

      // Build the authority from its parts instead of slicing url.href:
      // for http/https URLs href always ends in "/" (and would also carry any
      // path or query), which produced entries like "HTTP host:3128/".
      // Userinfo is serialized the same way href serializes it.
      const hasAuth = url.username !== "" || url.password !== "";
      const auth = hasAuth
        ? `${url.username}${url.password ? `:${url.password}` : ""}@`
        : "";

      return `"${type} ${auth}${url.host}"`;
    };

    this.proxyPacText = `

function FindProxyForURL(url, host) {

`;

    // Only the pattern source of each key is embedded, so any flags on the
    // key are not carried into the PAC script.
    // NOTE(review): keys are presumably RegExp objects — confirm where
    // proxyMap is declared.
    proxyMap.forEach(({ proxyUrl }, k) => {
      this.proxyPacText += ` if (url.match(/${
        k.source
      }/)) { return ${urlToProxy(proxyUrl)}; }\n`;
    });

    this.proxyPacText += `\n return ${
      defaultProxyEntry ? urlToProxy(defaultProxyEntry.proxyUrl) : `"DIRECT"`
    };
}
`;
  }
}
|
||||||
|
|
|
@ -144,6 +144,8 @@ export class Recorder extends EventEmitter {
|
||||||
|
|
||||||
pageFinished = false;
|
pageFinished = false;
|
||||||
|
|
||||||
|
lastErrorText = "";
|
||||||
|
|
||||||
gzip = true;
|
gzip = true;
|
||||||
|
|
||||||
writer: WARCWriter;
|
writer: WARCWriter;
|
||||||
|
@ -481,6 +483,7 @@ export class Recorder extends EventEmitter {
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
|
this.lastErrorText = errorText;
|
||||||
logger.warn(
|
logger.warn(
|
||||||
"Request failed",
|
"Request failed",
|
||||||
{ url, errorText, type, status: reqresp.status, ...this.logDetails },
|
{ url, errorText, type, status: reqresp.status, ...this.logDetails },
|
||||||
|
@ -953,6 +956,7 @@ export class Recorder extends EventEmitter {
|
||||||
this.pageid = pageid;
|
this.pageid = pageid;
|
||||||
this.pageUrl = url;
|
this.pageUrl = url;
|
||||||
this.finalPageUrl = this.pageUrl;
|
this.finalPageUrl = this.pageUrl;
|
||||||
|
this.lastErrorText = "";
|
||||||
this.logDetails = { page: url, workerid: this.workerid };
|
this.logDetails = { page: url, workerid: this.workerid };
|
||||||
if (this.pendingRequests && this.pendingRequests.size) {
|
if (this.pendingRequests && this.pendingRequests.size) {
|
||||||
logger.debug(
|
logger.debug(
|
||||||
|
@ -1735,7 +1739,7 @@ class AsyncFetcher {
|
||||||
|
|
||||||
const headers = reqresp.getRequestHeadersDict();
|
const headers = reqresp.getRequestHeadersDict();
|
||||||
|
|
||||||
let dispatcher = getProxyDispatcher();
|
let dispatcher = getProxyDispatcher(url);
|
||||||
|
|
||||||
if (dispatcher) {
|
if (dispatcher) {
|
||||||
dispatcher = dispatcher.compose((dispatch) => {
|
dispatcher = dispatcher.compose((dispatch) => {
|
||||||
|
|
|
@ -68,7 +68,7 @@ export class SitemapReader extends EventEmitter {
|
||||||
while (true) {
|
while (true) {
|
||||||
const resp = await fetch(url, {
|
const resp = await fetch(url, {
|
||||||
headers: this.headers,
|
headers: this.headers,
|
||||||
dispatcher: getProxyDispatcher(),
|
dispatcher: getProxyDispatcher(url),
|
||||||
});
|
});
|
||||||
|
|
||||||
if (resp.ok) {
|
if (resp.ok) {
|
||||||
|
|
|
@ -311,7 +311,7 @@ export class PageWorker {
|
||||||
}
|
}
|
||||||
|
|
||||||
await timedRun(
|
await timedRun(
|
||||||
this.crawler.pageFinished(data),
|
this.crawler.pageFinished(data, this.recorder?.lastErrorText),
|
||||||
FINISHED_TIMEOUT,
|
FINISHED_TIMEOUT,
|
||||||
"Page Finished Timed Out",
|
"Page Finished Timed Out",
|
||||||
this.logDetails,
|
this.logDetails,
|
||||||
|
|
6
tests/fixtures/proxies/proxy-test-bad-auth.pac
vendored
Normal file
6
tests/fixtures/proxies/proxy-test-bad-auth.pac
vendored
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
matchHosts:
|
||||||
|
old.webrecorder.net: socks-proxy
|
||||||
|
|
||||||
|
proxies:
|
||||||
|
socks-proxy: socks5://user:passw1rd@proxy-with-auth:1080
|
||||||
|
|
5
tests/fixtures/proxies/proxy-test-good-auth.pac
vendored
Normal file
5
tests/fixtures/proxies/proxy-test-good-auth.pac
vendored
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
matchHosts:
|
||||||
|
old.webrecorder.net: socks-proxy
|
||||||
|
|
||||||
|
proxies:
|
||||||
|
socks-proxy: socks5://user:passw0rd@proxy-with-auth:1080
|
|
@ -9,6 +9,8 @@ const SOCKS_PORT = "1080";
|
||||||
const HTTP_PORT = "3128";
|
const HTTP_PORT = "3128";
|
||||||
const WRONG_PORT = "33130";
|
const WRONG_PORT = "33130";
|
||||||
|
|
||||||
|
const PROXY_EXIT_CODE = 21;
|
||||||
|
|
||||||
const SSH_PROXY_IMAGE = "linuxserver/openssh-server"
|
const SSH_PROXY_IMAGE = "linuxserver/openssh-server"
|
||||||
|
|
||||||
const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf";
|
const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf";
|
||||||
|
@ -27,7 +29,7 @@ beforeAll(() => {
|
||||||
|
|
||||||
proxyNoAuthId = execSync(`docker run -d --rm --network=proxy-test-net --name proxy-no-auth ${PROXY_IMAGE}`, {encoding: "utf-8"});
|
proxyNoAuthId = execSync(`docker run -d --rm --network=proxy-test-net --name proxy-no-auth ${PROXY_IMAGE}`, {encoding: "utf-8"});
|
||||||
|
|
||||||
proxySSHId = execSync(`docker run -d --rm -e DOCKER_MODS=linuxserver/mods:openssh-server-ssh-tunnel -e USER_NAME=user -e PUBLIC_KEY_FILE=/keys/proxy-key.pub -v $PWD/tests/fixtures/proxy-key.pub:/keys/proxy-key.pub --network=proxy-test-net --name ssh-proxy ${SSH_PROXY_IMAGE}`);
|
proxySSHId = execSync(`docker run -d --rm -e DOCKER_MODS=linuxserver/mods:openssh-server-ssh-tunnel -e USER_NAME=user -e PUBLIC_KEY_FILE=/keys/proxy-key.pub -v $PWD/tests/fixtures/proxies/proxy-key.pub:/keys/proxy-key.pub --network=proxy-test-net --name ssh-proxy ${SSH_PROXY_IMAGE}`);
|
||||||
});
|
});
|
||||||
|
|
||||||
afterAll(async () => {
|
afterAll(async () => {
|
||||||
|
@ -66,7 +68,7 @@ describe("socks5 + https proxy tests", () => {
|
||||||
status = e.status;
|
status = e.status;
|
||||||
}
|
}
|
||||||
// auth supported only for SOCKS5
|
// auth supported only for SOCKS5
|
||||||
expect(status).toBe(scheme === "socks5" ? 0 : 1);
|
expect(status).toBe(scheme === "socks5" ? 0 : PROXY_EXIT_CODE);
|
||||||
});
|
});
|
||||||
|
|
||||||
test(`${scheme} proxy, ${type}, wrong auth`, () => {
|
test(`${scheme} proxy, ${type}, wrong auth`, () => {
|
||||||
|
@ -77,7 +79,7 @@ describe("socks5 + https proxy tests", () => {
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
status = e.status;
|
status = e.status;
|
||||||
}
|
}
|
||||||
expect(status).toBe(1);
|
expect(status).toBe(PROXY_EXIT_CODE);
|
||||||
});
|
});
|
||||||
|
|
||||||
test(`${scheme} proxy, ${type}, wrong protocol`, () => {
|
test(`${scheme} proxy, ${type}, wrong protocol`, () => {
|
||||||
|
@ -88,7 +90,8 @@ describe("socks5 + https proxy tests", () => {
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
status = e.status;
|
status = e.status;
|
||||||
}
|
}
|
||||||
expect(status).toBe(1);
|
// wrong protocol (socks5 for http) causes connection to hang, causes a timeout, so just errors with 1
|
||||||
|
expect(status === PROXY_EXIT_CODE || status === 1).toBe(true);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -100,7 +103,7 @@ describe("socks5 + https proxy tests", () => {
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
status = e.status;
|
status = e.status;
|
||||||
}
|
}
|
||||||
expect(status).toBe(1);
|
expect(status).toBe(PROXY_EXIT_CODE);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -118,7 +121,7 @@ test("http proxy set, but not running, separate env vars", () => {
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
status = e.status;
|
status = e.status;
|
||||||
}
|
}
|
||||||
expect(status).toBe(1);
|
expect(status).toBe(PROXY_EXIT_CODE);
|
||||||
});
|
});
|
||||||
|
|
||||||
test("http proxy set, but not running, cli arg", () => {
|
test("http proxy set, but not running, cli arg", () => {
|
||||||
|
@ -129,12 +132,12 @@ test("http proxy set, but not running, cli arg", () => {
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
status = e.status;
|
status = e.status;
|
||||||
}
|
}
|
||||||
expect(status).toBe(1);
|
expect(status).toBe(PROXY_EXIT_CODE);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
test("ssh socks proxy with custom user", () => {
|
test("ssh socks proxy with custom user", () => {
|
||||||
execSync(`docker run --rm --network=proxy-test-net -v $PWD/tests/fixtures/proxy-key:/keys/proxy-key webrecorder/browsertrix-crawler crawl --proxyServer ssh://user@ssh-proxy:2222 --sshProxyPrivateKeyFile /keys/proxy-key --url ${HTML} ${extraArgs}`, {encoding: "utf-8"});
|
execSync(`docker run --rm --network=proxy-test-net -v $PWD/tests/fixtures/proxies/proxy-key:/keys/proxy-key webrecorder/browsertrix-crawler crawl --proxyServer ssh://user@ssh-proxy:2222 --sshProxyPrivateKeyFile /keys/proxy-key --url ${HTML} ${extraArgs}`, {encoding: "utf-8"});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
@ -146,7 +149,7 @@ test("ssh socks proxy, wrong user", () => {
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
status = e.status;
|
status = e.status;
|
||||||
}
|
}
|
||||||
expect(status).toBe(21);
|
expect(status).toBe(PROXY_EXIT_CODE);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
@ -164,4 +167,30 @@ test("ensure logged proxy string does not include any credentials", () => {
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
// Crawls with the bad-credentials proxy config: a crawl that fails must exit
// with PROXY_EXIT_CODE, while a URL whose host is not listed under matchHosts
// must still succeed.
test("proxy with config file, wrong auth or no match", () => {
  // Runs one crawl of the given URL inside docker with the bad-auth config.
  const crawlWithBadAuthConfig = (url) =>
    execSync(`docker run --rm --network=proxy-test-net -v $PWD/tests/fixtures/proxies/:/proxies/ webrecorder/browsertrix-crawler crawl --proxyServerConfig /proxies/proxy-test-bad-auth.pac --url ${url} ${extraArgs}`, {encoding: "utf-8"});

  // presumably HTML is served from a host matched by the config, so the proxy
  // with wrong credentials gets used — confirm against the HTML constant
  let exitCode = 0;
  try {
    crawlWithBadAuthConfig(HTML);
  } catch (err) {
    exitCode = err.status;
  }
  expect(exitCode).toBe(PROXY_EXIT_CODE);

  // success, no match for PDF: execSync throws on a non-zero exit, which
  // would fail the test
  crawlWithBadAuthConfig(PDF);
});
|
||||||
|
|
||||||
|
|
||||||
|
// Crawls with the good-credentials proxy config: both the HTML crawl and the
// PDF crawl are expected to finish successfully.
test("proxy with config file, correct auth or no match", () => {
  // Runs one crawl of the given URL inside docker with the good-auth config.
  const crawlWithGoodAuthConfig = (url) =>
    execSync(`docker run --rm --network=proxy-test-net -v $PWD/tests/fixtures/proxies/:/proxies/ webrecorder/browsertrix-crawler crawl --proxyServerConfig /proxies/proxy-test-good-auth.pac --url ${url} ${extraArgs}`, {encoding: "utf-8"});

  // Record the exit status instead of letting execSync throw, so a failure
  // is reported through the expectation below.
  let exitCode = 0;
  try {
    crawlWithGoodAuthConfig(HTML);
  } catch (err) {
    exitCode = err.status;
  }
  expect(exitCode).toBe(0);

  // success, no match for PDF: execSync throws on a non-zero exit, which
  // would fail the test
  crawlWithGoodAuthConfig(PDF);
});
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue