Mirror of https://github.com/webrecorder/browsertrix-crawler.git
Synced 2025-10-19 14:33:17 +00:00

Compare commits: main ... v1.7.0-beta.0

No commits in common. "main" and "v1.7.0-beta.0" have entirely different histories.

38 changed files with 754 additions and 1209 deletions

Diff direction: lines prefixed `-` are on main; lines prefixed `+` are on v1.7.0-beta.0.
Dockerfile

@@ -1,4 +1,4 @@
-ARG BROWSER_VERSION=1.82.170
+ARG BROWSER_VERSION=1.80.113
 ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base:brave-${BROWSER_VERSION}
 
 FROM ${BROWSER_IMAGE_BASE}
@@ -39,7 +39,7 @@ ADD config/ /app/
 
 ADD html/ /app/html/
 
-ARG RWP_VERSION=2.3.19
+ARG RWP_VERSION=2.3.7
 ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/
 ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
 ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz
docs/docs/user-guide/behaviors.md

@@ -35,15 +35,14 @@ To disable all behaviors, use `--behaviors ""`.
 ## Behavior and Page Timeouts
 
 Browsertrix includes a number of timeouts, including before, during and after running behaviors.
 
 The timeouts are as follows:
 
-- `--pageLoadTimeout`: how long to wait for page to finish loading, *before* doing anything else.
+- `--waitUntil`: how long to wait for page to finish loading, *before* doing anything else.
 - `--postLoadDelay`: how long to wait *before* starting any behaviors, but after page has finished loading. A custom behavior can override this (see below).
 - `--behaviorTimeout`: maximum time to spend on running site-specific / Autoscroll behaviors (can be less if behavior finishes early).
 - `--pageExtraDelay`: how long to wait *after* finishing behaviors (or after `behaviorTimeout` has been reached) before moving on to next page.
 
-A site-specific behavior (or Autoscroll) will start after the page is loaded (at most after `--pageLoadTimeout` seconds) and exactly after `--postLoadDelay` seconds.
+A site-specific behavior (or Autoscroll) will start after the page is loaded (at most after `--waitUntil` seconds) and exactly after `--postLoadDelay` seconds.
 
 The behavior will then run until finished or at most until `--behaviorTimeout` is reached (90 seconds by default).
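As an illustration of how these flags combine on the command line (an editorial sketch, not part of the diff; the values are arbitrary):

```sh
# wait up to 90s for the page to load, 5s more before behaviors start,
# give behaviors up to 60s, then pause 10s before moving to the next page
docker run -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler crawl \
  --url https://example.com/ \
  --pageLoadTimeout 90 \
  --postLoadDelay 5 \
  --behaviorTimeout 60 \
  --pageExtraDelay 10
```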
@@ -266,29 +265,5 @@ Some of these functions which may be of use to behaviors authors are:
 - `scrollToOffset`: scroll to particular offset
 - `scrollIntoView`: smoothly scroll particular element into view
 - `getState`: increment a state counter and return all state counters + string message
-* `addLink`: add a given URL to the crawl queue
 
 More detailed references will be added in the future.
-
-## Fail On Content Check
-
-In Browsertrix Crawler 1.7.0 and higher, the `--failOnContentCheck` option will result in a crawl failing if a behavior detects the presence or absence of certain content on a page in its `awaitPageLoad()` callback. By default, this is used to fail a crawl if site-specific behaviors determine that the user is not logged in on the following sites:
-
-- Facebook
-- Instagram
-- TikTok
-- X
-
-It is also used to fail crawls with YouTube videos if one of the videos is found not to play.
-
-It is possible to add content checks to custom behaviors. To do so, include an `awaitPageLoad` method on the behavior and use the `ctx.Lib` function `assertContentValid` to check for content and fail the behavior with a specified reason if it is not found.
-
-For an example, see the following `awaitPageLoad` example from the site-specific behavior for X:
-
-```javascript
-async awaitPageLoad(ctx: any) {
-  const { sleep, assertContentValid } = ctx.Lib;
-  await sleep(5);
-  assertContentValid(() => !document.documentElement.outerHTML.match(/Log In/i), "not_logged_in");
-}
-```
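A minimal sketch of a custom behavior using these `ctx.Lib` helpers (illustrative only; the class shape follows the `TestBehavior` fixture elsewhere in this diff, the match origin and selector are hypothetical, and exact helper signatures may differ between versions):

```javascript
class LinkCollectorBehavior {
  static id = "LinkCollector";

  static isMatch() {
    return window.location.origin === "https://example.com"; // hypothetical site
  }

  async *run(ctx) {
    const { scrollIntoView, getState, addLink } = ctx.Lib;
    for (const el of document.querySelectorAll("a.next-page")) {
      scrollIntoView(el);                  // smoothly scroll the link into view
      await addLink(el.href);              // add the URL to the crawl queue
      yield getState(ctx, "links-added");  // bump a state counter and report it
    }
  }
}
```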
docs/docs/user-guide/cli-options.md

@@ -103,16 +103,16 @@ Options:
 , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
 ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
 orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
-atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"]
-                                                                 [default: []]
+atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy"] [default:
+                                                                           []]
   --logExcludeContext                  Comma-separated list of contexts to
                                        NOT include in logs
 [array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
 , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
 ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
 orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
-atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"]
-                         [default: ["recorderNetwork","jsError","screencast"]]
+atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy"] [default:
+                                  ["recorderNetwork","jsError","screencast"]]
   --text                               Extract initial (default) or final t
                                        ext to pages.jsonl or WARC resource
                                        record(s)
@@ -261,10 +261,6 @@ Options:
 ailOnFailedSeed may result in crawl
                                        failing due to non-200 responses
                                        [boolean] [default: false]
-  --failOnContentCheck                 If set, allows for behaviors to fail
-                                       a crawl with custom reason based on
-                                       content (e.g. logged out)
-                                       [boolean] [default: false]
   --customBehaviors                    Custom behavior files to inject. Val
                                        id values: URL to file, path to file
                                        , path to directory of behaviors, UR
@@ -276,10 +272,6 @@ Options:
                                        git+https://git.example.com/repo.git
                                        ?branch=dev&path=some/dir"
                                        [array] [default: []]
-  --saveStorage                        if set, will store the localStorage/
-                                       sessionStorage data for each page as
-                                       part of WARC-JSON-Metadata field
-                                       [boolean]
   --debugAccessRedis                   if set, runs internal redis without
                                        protected mode to allow external acc
                                        ess (for debugging) [boolean]
@@ -294,13 +286,6 @@ Options:
   --proxyServer                        if set, will use specified proxy ser
                                        ver. Takes precedence over any env v
                                        ar proxy settings [string]
-  --proxyServerPreferSingleProxy       if set, and both proxyServer and pro
-                                       xyServerConfig are provided, the pro
-                                       xyServer value will be preferred
-                                       [boolean] [default: false]
-  --proxyServerConfig                  if set, path to yaml/json file that
-                                       configures multiple path servers per
-                                       URL regex [string]
   --dryRun                             If true, no archive data is written
                                        to disk, only pages and logs (and op
                                        tionally saved state). [boolean]
@@ -350,8 +335,6 @@ Options:
                                 [number] [default: 7]
   --proxyServer                 if set, will use specified proxy server. Takes prece
                                 dence over any env var proxy settings [string]
-  --proxyServerConfig           if set, path to yaml/json file that configures multi
-                                ple path servers per URL regex [string]
   --sshProxyPrivateKeyFile      path to SSH private key for SOCKS5 over SSH proxy co
                                 nnection [string]
   --sshProxyKnownHostsFile      path to SSH known hosts file for SOCKS5 over SSH pro
docs/docs/user-guide/proxies.md

@@ -80,55 +80,7 @@ The above proxy settings also apply to [Browser Profile Creation](browser-profil
 docker run -p 6080:6080 -p 9223:9223 -v $PWD/crawls/profiles:/crawls/profiles -v $PWD/my-proxy-private-key:/tmp/private-key -v $PWD/known_hosts:/tmp/known_hosts webrecorder/browsertrix-crawler create-login-profile --url https://example.com/ --proxyServer ssh://user@path-to-ssh-host.example.com --sshProxyPrivateKeyFile /tmp/private-key --sshProxyKnownHostsFile /tmp/known_hosts
 ```
 
-## Host-Specific Proxies
-
-With the 1.7.0 release, the crawler also supports running with multiple proxies, defined in a separate proxy YAML config file. The file contains a match hosts section, matching hosts by regex to named proxies.
-
-For example, the following YAML file can be passed to the `--proxyServerConfig` option:
-
-```yaml
-matchHosts:
-  # load all URLs from example.com through 'example-1-proxy'
-  example.com/.*: example-1-proxy
-
-  # load all URLs from https://my-social.example.com/.*/posts/ through
-  # a different proxy
-  https://my-social.example.com/.*/posts/: social-proxy
-
-  # optional default proxy
-  "": default-proxy
-
-proxies:
-  # SOCKS5 proxy just needs a URL
-  example-1-proxy: socks5://username:password@my-socks-5-proxy.example.com
-
-  # an SSH proxy should also have at least a 'privateKeyFile'
-  social-proxy:
-    url: ssh://user@my-social-proxy.example.com
-    privateKeyFile: /proxies/social-proxy-private-key
-    # optional
-    publicHostsFile: /proxies/social-proxy-public-hosts
-
-  default-proxy:
-    url: ssh://user@my-social-proxy.example.com
-    privateKeyFile: /proxies/default-proxy-private-key
-```
-
-If the above config is stored in `./proxies/proxyConfig.yaml`, along with the SSH private keys and known public hosts files, the crawler can be started with:
-
-```sh
-docker run -v $PWD/crawls:/crawls -v $PWD/proxies:/proxies -it webrecorder/browsertrix-crawler --url https://example.com/ --proxyServerConfig /proxies/proxyConfig.yaml
-```
-
-Note that if SSH proxies are provided, an SSH tunnel must be opened for each one before the crawl starts. The crawl will not start if any of the SSH proxy connections fail, even if a host-specific proxy is not actually used. SOCKS5 and HTTP proxy connections are attempted only on first use.
-
-The same `--proxyServerConfig` option can also be used in browser profile creation with the `create-login-profile` command in the same way.
-
-### Proxy Precedence
-
-If both `--proxyServerConfig` and `--proxyServer`/`PROXY_SERVER` env var are specified, the `--proxyServerConfig` option takes precedence on matching hosts. To have the single `--proxyServer` option always take precedence instead, pass the `--proxyServerPreferSingleProxy` option.
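To illustrate the precedence rules above (an editorial sketch, not part of the diff; hosts, credentials, and paths are hypothetical):

```sh
# hosts matched in proxyConfig.yaml use their named proxies; everything else
# uses the single --proxyServer. Adding --proxyServerPreferSingleProxy makes
# the single proxy win even for matching hosts.
docker run -v $PWD/crawls:/crawls -v $PWD/proxies:/proxies webrecorder/browsertrix-crawler crawl \
  --url https://example.com/ \
  --proxyServer socks5://user:pass@fallback-proxy.example.com \
  --proxyServerConfig /proxies/proxyConfig.yaml \
  --proxyServerPreferSingleProxy
```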
package.json (14 lines changed)

@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "1.8.1",
+  "version": "1.7.0-beta.0",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
@@ -17,9 +17,9 @@
   },
   "dependencies": {
     "@novnc/novnc": "1.4.0",
-    "@puppeteer/replay": "^3.1.3",
-    "@webrecorder/wabac": "^2.24.1",
-    "browsertrix-behaviors": "^0.9.2",
+    "@puppeteer/replay": "^3.1.1",
+    "@webrecorder/wabac": "^2.23.6",
+    "browsertrix-behaviors": "^0.9.0",
     "client-zip": "^2.4.5",
     "css-selector-parser": "^3.0.5",
     "fetch-socks": "^1.3.0",
@@ -33,13 +33,13 @@
     "p-queue": "^7.3.4",
     "pixelmatch": "^5.3.0",
     "pngjs": "^7.0.0",
-    "puppeteer-core": "^24.22.0",
+    "puppeteer-core": "^24.7.2",
     "sax": "^1.3.0",
     "sharp": "^0.32.6",
     "tsc": "^2.0.4",
     "undici": "^6.18.2",
     "uuid": "8.3.2",
-    "warcio": "^2.4.7",
+    "warcio": "^2.4.4",
     "ws": "^7.4.4",
     "yargs": "^17.7.2"
   },
@@ -71,7 +71,7 @@
   },
   "resolutions": {
     "wrap-ansi": "7.0.0",
-    "warcio": "^2.4.7",
+    "warcio": "^2.4.4",
     "@novnc/novnc": "1.4.0"
   }
 }
src/crawler.ts (162 lines changed)

@@ -178,6 +178,7 @@ export class Crawler {
 
   customBehaviors = "";
   behaviorsChecked = false;
+  behaviorLastLine?: string;
 
   browser: Browser;
   storage: S3StorageSync | null = null;
@@ -186,7 +187,6 @@ export class Crawler {
   maxHeapTotal = 0;
 
   proxyServer?: string;
-  proxyPacUrl?: string;
 
   driver:
     | ((opts: {
@@ -509,9 +509,7 @@ export class Crawler {
     setWARCInfo(this.infoString, this.params.warcInfo);
     logger.info(this.infoString);
 
-    const res = await initProxy(this.params, RUN_DETACHED);
-    this.proxyServer = res.proxyServer;
-    this.proxyPacUrl = res.proxyPacUrl;
+    this.proxyServer = await initProxy(this.params, RUN_DETACHED);
 
     this.seeds = await parseSeeds(this.params);
     this.numOriginalSeeds = this.seeds.length;
@@ -669,6 +667,7 @@ export class Crawler {
     pageUrl: string,
     workerid: WorkerId,
   ) {
+    let behaviorLine;
     let message;
     let details;
 
@@ -712,7 +711,11 @@ export class Crawler {
 
     switch (type) {
       case "info":
-        logger.info(message, details, context);
+        behaviorLine = JSON.stringify(data);
+        if (behaviorLine !== this.behaviorLastLine) {
+          logger.info(message, details, context);
+          this.behaviorLastLine = behaviorLine;
+        }
         break;
 
       case "error":
@@ -852,34 +855,31 @@ self.__bx_behaviors.selectMainBehavior();
       await this.browser.addInitScript(page, initScript);
     }
 
-    // Handle JS dialogs:
-    // - Ensure off-page navigation is canceled while behavior is running
-    // - dismiss close all other dialogs if not blocking unload
-    page.on("dialog", async (dialog) => {
-      let accepted = true;
-      if (dialog.type() === "beforeunload") {
-        if (opts.pageBlockUnload) {
-          accepted = false;
-          await dialog.dismiss();
+    // only add if running with autoclick behavior
+    if (this.params.behaviors.includes("autoclick")) {
+      // Ensure off-page navigation is canceled while behavior is running
+      page.on("dialog", async (dialog) => {
+        let accepted = true;
+        if (dialog.type() === "beforeunload") {
+          if (opts.pageBlockUnload) {
+            accepted = false;
+            await dialog.dismiss();
+          } else {
+            await dialog.accept();
+          }
         } else {
           await dialog.accept();
         }
-      } else {
-        // other JS dialog, just dismiss
-        await dialog.dismiss();
-      }
-      logger.debug("JS Dialog", {
-        accepted,
-        blockingUnload: opts.pageBlockUnload,
-        message: dialog.message(),
-        type: dialog.type(),
-        page: page.url(),
-        workerid,
+        logger.debug("JS Dialog", {
+          accepted,
+          blockingUnload: opts.pageBlockUnload,
+          message: dialog.message(),
+          type: dialog.type(),
+          page: page.url(),
+          workerid,
+        });
       });
-    });
 
-    // only add if running with autoclick behavior
-    if (this.params.behaviors.includes("autoclick")) {
       // Close any windows opened during navigation from autoclick
       await cdp.send("Target.setDiscoverTargets", { discover: true });
@@ -1062,43 +1062,58 @@ self.__bx_behaviors.selectMainBehavior();
     data.logDetails = logDetails;
     data.workerid = workerid;
 
-    let result = false;
-
     if (recorder) {
       try {
         const headers = auth
           ? { Authorization: auth, ...this.headers }
           : this.headers;
 
-        result = await timedRun(
-          recorder.directFetchCapture({
-            url,
-            headers,
-            cdp,
-            state: data,
-            crawler: this,
-          }),
+        const result = await timedRun(
+          recorder.directFetchCapture({ url, headers, cdp }),
           this.params.pageLoadTimeout,
           "Direct fetch of page URL timed out",
           logDetails,
           "fetch",
         );
-      } catch (e) {
-        logger.error(
-          "Direct fetch of page URL failed",
-          { e, ...logDetails },
-          "fetch",
-        );
-      }
 
-      if (!result) {
-        logger.debug(
-          "Direct fetch response not accepted, continuing with browser fetch",
-          logDetails,
-          "fetch",
-        );
-      } else {
-        return;
+        // fetched timed out, already logged, don't retry in browser
+        if (!result) {
+          return;
+        }
+
+        const { fetched, mime, ts } = result;
+
+        if (mime) {
+          data.mime = mime;
+          data.isHTMLPage = isHTMLMime(mime);
+        }
+        if (fetched) {
+          data.loadState = LoadState.FULL_PAGE_LOADED;
+          data.status = 200;
+          data.ts = ts || new Date();
+          logger.info(
+            "Direct fetch successful",
+            { url, mime, ...logDetails },
+            "fetch",
+          );
+          return;
+        }
+      } catch (e) {
+        if (e instanceof Error && e.message === "response-filtered-out") {
+          // filtered out direct fetch
+          logger.debug(
+            "Direct fetch response not accepted, continuing with browser fetch",
+            logDetails,
+            "fetch",
+          );
+        } else {
+          logger.error(
+            "Direct fetch of page URL failed",
+            { e, ...logDetails },
+            "fetch",
+          );
+          return;
+        }
       }
     }
@@ -1267,11 +1282,7 @@ self.__bx_behaviors.selectMainBehavior();
     }
   }
 
-  async pageFinished(data: PageState, lastErrorText = "") {
-    // not yet finished
-    if (data.asyncLoading) {
-      return;
-    }
+  async pageFinished(data: PageState) {
     // if page loaded, considered page finished successfully
     // (even if behaviors timed out)
     const { loadState, logDetails, depth, url, pageSkipped } = data;
@@ -1306,28 +1317,11 @@ self.__bx_behaviors.selectMainBehavior();
     await this.serializeConfig();
 
     if (depth === 0 && this.params.failOnFailedSeed) {
-      let errorCode = ExitCodes.GenericError;
-
-      switch (lastErrorText) {
-        case "net::ERR_SOCKS_CONNECTION_FAILED":
-        case "net::SOCKS_CONNECTION_HOST_UNREACHABLE":
-        case "net::ERR_PROXY_CONNECTION_FAILED":
-        case "net::ERR_TUNNEL_CONNECTION_FAILED":
-          errorCode = ExitCodes.ProxyError;
-          break;
-
-        case "net::ERR_TIMED_OUT":
-        case "net::ERR_INVALID_AUTH_CREDENTIALS":
-          if (this.proxyServer || this.proxyPacUrl) {
-            errorCode = ExitCodes.ProxyError;
-          }
-          break;
-      }
       logger.fatal(
         "Seed Page Load Failed, failing crawl",
         {},
         "general",
-        errorCode,
+        ExitCodes.GenericError,
       );
     }
   }
@@ -1715,8 +1709,7 @@ self.__bx_behaviors.selectMainBehavior();
       emulateDevice: this.emulateDevice,
       swOpt: this.params.serviceWorker,
       chromeOptions: {
-        proxyServer: this.proxyServer,
-        proxyPacUrl: this.proxyPacUrl,
+        proxy: this.proxyServer,
         userAgent: this.emulateDevice.userAgent,
         extraArgs: this.extraChromeArgs(),
       },
@@ -1969,8 +1962,6 @@ self.__bx_behaviors.selectMainBehavior();
       logger.error("Error creating WACZ", e);
       if (!streaming) {
         logger.fatal("Unable to write WACZ successfully");
-      } else if (this.params.restartsOnError) {
-        await this.setStatusAndExit(ExitCodes.UploadFailed, "interrupted");
       }
     }
   }
@@ -2470,26 +2461,21 @@ self.__bx_behaviors.selectMainBehavior();
 
     switch (result) {
       case QueueState.ADDED:
-        logger.debug("Queued new page URL", { url, ...logDetails }, "links");
+        logger.debug("Queued new page url", { url, ...logDetails }, "links");
         return true;
 
      case QueueState.LIMIT_HIT:
        logger.debug(
-          "Page URL not queued, at page limit",
+          "Not queued page url, at page limit",
          { url, ...logDetails },
          "links",
        );
-        if (!this.limitHit && depth === 0) {
-          logger.error(
-            "Page limit reached when adding URL list, some URLs not crawled.",
-          );
-        }
        this.limitHit = true;
        return false;
 
      case QueueState.DUPE_URL:
        logger.debug(
-          "Page URL not queued, already seen",
+          "Not queued page url, already seen",
          { url, ...logDetails },
          "links",
        );
src/create-login-profile.ts

@@ -16,7 +16,7 @@ import { initStorage } from "./util/storage.js";
 import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
 import { getInfoString } from "./util/file_reader.js";
 import { DISPLAY, ExitCodes } from "./util/constants.js";
-import { initProxy, loadProxyConfig } from "./util/proxy.js";
+import { initProxy } from "./util/proxy.js";
 //import { sleep } from "./util/timing.js";
 
 const profileHTML = fs.readFileSync(
@@ -123,12 +123,6 @@ function initArgs() {
       type: "string",
     },
 
-    proxyServerConfig: {
-      describe:
-        "if set, path to yaml/json file that configures multiple path servers per URL regex",
-      type: "string",
-    },
-
     sshProxyPrivateKeyFile: {
       describe:
         "path to SSH private key for SOCKS5 over SSH proxy connection",
@@ -167,9 +161,7 @@ async function main() {
 
   process.on("SIGTERM", () => handleTerminate("SIGTERM"));
 
-  loadProxyConfig(params);
-
-  const { proxyServer, proxyPacUrl } = await initProxy(params, false);
+  const proxyServer = await initProxy(params, false);
 
   if (!params.headless) {
     logger.debug("Launching XVFB");
@@ -211,8 +203,7 @@ async function main() {
     headless: params.headless,
     signals: false,
     chromeOptions: {
-      proxyServer,
-      proxyPacUrl,
+      proxy: proxyServer,
       extraArgs: [
         "--window-position=0,0",
         `--window-size=${params.windowSize}`,
@@ -339,11 +330,7 @@ async function createProfile(
   cdp: CDPSession,
   targetFilename = "",
 ) {
-  try {
-    await cdp.send("Network.clearBrowserCache");
-  } catch (e) {
-    logger.warn("Error clearing cache", e, "browser");
-  }
+  await cdp.send("Network.clearBrowserCache");
 
   await browser.close();
 
@@ -550,8 +537,7 @@ class InteractiveBrowser {
       return;
     }
 
-    const cookies = await this.browser.getCookies();
-
+    const cookies = await this.browser.getCookies(this.page);
     for (const cookieOrig of cookies) {
       // eslint-disable-next-line @typescript-eslint/no-explicit-any
       const cookie = cookieOrig as any;
@@ -571,7 +557,7 @@ class InteractiveBrowser {
         cookie.url = url;
       }
     }
-    await this.browser.setCookies(cookies);
+    await this.browser.setCookies(this.page, cookies);
     // eslint-disable-next-line @typescript-eslint/no-explicit-any
   } catch (e: any) {
     logger.error("Save Cookie Error: ", e);
src/util/argParser.ts

@@ -29,7 +29,6 @@ import {
   logger,
 } from "./logger.js";
 import { SaveState } from "./state.js";
-import { loadProxyConfig } from "./proxy.js";
 
 // ============================================================================
 export type CrawlerArgs = ReturnType<typeof parseArgs> & {
@@ -642,19 +641,6 @@ class ArgParser {
         type: "string",
       },
 
-      proxyServerPreferSingleProxy: {
-        describe:
-          "if set, and both proxyServer and proxyServerConfig are provided, the proxyServer value will be preferred",
-        type: "boolean",
-        default: false,
-      },
-
-      proxyServerConfig: {
-        describe:
-          "if set, path to yaml/json file that configures multiple path servers per URL regex",
-        type: "string",
-      },
-
       dryRun: {
         describe:
           "If true, no archive data is written to disk, only pages and logs (and optionally saved state).",
@@ -792,8 +778,6 @@ class ArgParser {
       argv.emulateDevice = { viewport: null };
     }
 
-    loadProxyConfig(argv);
-
     if (argv.lang) {
       if (!ISO6391.validate(argv.lang)) {
        logger.fatal("Invalid ISO-639-1 country code for --lang: " + argv.lang);
src/util/blockrules.ts

@@ -272,9 +272,7 @@ export class BlockRules {
     logDetails: Record<string, any>,
   ) {
     try {
-      const res = await fetch(reqUrl, {
-        dispatcher: getProxyDispatcher(reqUrl),
-      });
+      const res = await fetch(reqUrl, { dispatcher: getProxyDispatcher() });
       const text = await res.text();
 
       return !!text.match(frameTextMatch);
@@ -305,7 +303,7 @@ export class BlockRules {
       method: "PUT",
       headers: { "Content-Type": "text/html" },
       body,
-      dispatcher: getProxyDispatcher(putUrl.href),
+      dispatcher: getProxyDispatcher(),
     });
   }
 }
src/util/browser.ts

@@ -22,7 +22,6 @@ import puppeteer, {
   Page,
   LaunchOptions,
   Viewport,
-  CookieData,
 } from "puppeteer-core";
 import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
 import { Recorder } from "./recorder.js";
@@ -30,8 +29,7 @@ import { timedRun } from "./timing.js";
 import assert from "node:assert";
 
 type BtrixChromeOpts = {
-  proxyServer?: string;
-  proxyPacUrl?: string;
+  proxy?: string;
   userAgent?: string | null;
   extraArgs?: string[];
 };
@@ -245,8 +243,7 @@ export class Browser {
   }
 
   chromeArgs({
-    proxyServer = "",
-    proxyPacUrl = "",
+    proxy = "",
     userAgent = null,
     extraArgs = [],
   }: BtrixChromeOpts) {
@@ -265,14 +262,14 @@ export class Browser {
       ...extraArgs,
     ];
 
-    if (proxyServer) {
-      const proxyString = getSafeProxyString(proxyServer);
+    if (proxy) {
+      const proxyString = getSafeProxyString(proxy);
       logger.info("Using proxy", { proxy: proxyString }, "browser");
     }
 
-    if (proxyServer) {
+    if (proxy) {
       args.push("--ignore-certificate-errors");
-      args.push(`--proxy-server=${proxyServer}`);
-    } else if (proxyPacUrl) {
-      args.push("--proxy-pac-url=" + proxyPacUrl);
+      args.push(`--proxy-server=${proxy}`);
     }
 
     return args;
@@ -617,12 +614,14 @@ export class Browser {
     await page.setViewport(params);
   }
 
-  async getCookies() {
-    return (await this.browser?.cookies()) || [];
+  async getCookies(page: Page) {
+    return await page.cookies();
   }
 
-  async setCookies(cookies: CookieData[]) {
-    return await this.browser?.setCookie(...cookies);
+  // TODO: Fix this the next time the file is edited.
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  async setCookies(page: Page, cookies: any) {
+    return await page.setCookie(...cookies);
   }
 }
src/util/constants.ts

@@ -81,7 +81,6 @@ export enum ExitCodes {
   DiskUtilization = 16,
   Fatal = 17,
   ProxyError = 21,
-  UploadFailed = 22,
 }
 
 export enum InterruptReason {
src/util/file_reader.ts

@@ -41,7 +41,7 @@ async function writeUrlContentsToFile(
   pathPrefix: string,
   pathDefaultExt: string,
 ) {
-  const res = await fetch(url, { dispatcher: getProxyDispatcher(url) });
+  const res = await fetch(url, { dispatcher: getProxyDispatcher() });
   const fileContents = await res.text();
 
   const filename =
src/util/flowbehavior.ts (the changes in these hunks are whitespace/chaining-order adjustments; the extraction collapsed the two sides, so the lines are shown once, unmarked)

@@ -368,7 +368,7 @@ class Flow {
       case StepType.DoubleClick:
         await locator(step)
           .setTimeout(timeout * 1000)
           .setEnsureElementIsInTheViewport(true)
           //.on('action', () => startWaitingForEvents())
           .click({
             count: 2,
             button: step.button && mouseButtonMap.get(step.button),
@@ -392,7 +392,7 @@ class Flow {
 
         await locator(step)
           .setTimeout(timeout * 1000)
           .setEnsureElementIsInTheViewport(true)
           //.on('action', () => startWaitingForEvents())
           .click({
             delay: step.duration,
             button: step.button && mouseButtonMap.get(step.button),
@@ -410,7 +410,7 @@ class Flow {
       case StepType.Hover:
         await locator(step)
           .setTimeout(timeout * 1000)
           .setEnsureElementIsInTheViewport(true)
           //.on('action', () => startWaitingForEvents())
           .hover();
         break;
 
@@ -426,14 +426,15 @@ class Flow {
 
       case StepType.Change:
         await locator(step)
           //.on('action', () => startWaitingForEvents())
           .setTimeout(timeout * 1000)
           .setEnsureElementIsInTheViewport(true)
           .fill(step.value);
         break;
 
       case StepType.Scroll: {
         if ("selectors" in step) {
           await locator(step)
             //.on('action', () => startWaitingForEvents())
             .setTimeout(timeout * 1000)
             .scroll({
               scrollLeft: step.x || 0,
src/util/logger.ts

@@ -56,13 +56,10 @@ export const LOG_CONTEXT_TYPES = [
   "wacz",
   "replay",
   "proxy",
-  "scope",
 ] as const;
 
 export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
 
-export type LogLevel = "debug" | "info" | "warn" | "error" | "fatal";
-
 export const DEFAULT_EXCLUDE_LOG_CONTEXTS: LogContext[] = [
   "recorderNetwork",
   "jsError",
@@ -121,7 +118,7 @@ class Logger {
     message: string,
     dataUnknown: unknown,
     context: LogContext,
-    logLevel: LogLevel,
+    logLevel = "info",
   ) {
     // eslint-disable-next-line @typescript-eslint/no-explicit-any
     const data: Record<string, any> = formatErr(dataUnknown);
@@ -185,7 +182,7 @@ class Logger {
   }
 
   info(message: string, data: unknown = {}, context: LogContext = "general") {
-    this.logAsJSON(message, data, context, "info");
+    this.logAsJSON(message, data, context);
   }
 
   error(message: string, data: unknown = {}, context: LogContext = "general") {
src/util/originoverride.ts

@@ -48,7 +48,7 @@ export class OriginOverride {
 
     const resp = await fetch(newUrl, {
       headers,
-      dispatcher: getProxyDispatcher(newUrl),
+      dispatcher: getProxyDispatcher(),
     });
 
     const body = Buffer.from(await resp.arrayBuffer());
src/util/proxy.ts

@@ -1,9 +1,7 @@
 import net from "net";
-import child_process from "child_process";
-import fs from "fs";
 
 import { Agent, Dispatcher, ProxyAgent } from "undici";
-import yaml from "js-yaml";
+
+import child_process from "child_process";
 
 import { logger } from "./logger.js";
 
@@ -11,40 +9,11 @@ import { socksDispatcher } from "fetch-socks";
 import type { SocksProxyType } from "socks/typings/common/constants.js";
 import { ExitCodes, FETCH_HEADERS_TIMEOUT_SECS } from "./constants.js";
 
-import http, { IncomingMessage, ServerResponse } from "http";
-
 const SSH_PROXY_LOCAL_PORT = 9722;
 
 const SSH_WAIT_TIMEOUT = 30000;
 
-//let proxyDispatcher: Dispatcher | undefined = undefined;
-
-type ProxyEntry = {
-  proxyUrl: string;
-  dispatcher: Dispatcher;
-};
-
-export type ProxyServerConfig = {
-  matchHosts?: Record<string, string>;
-  proxies?: Record<
-    string,
-    string | { url: string; privateKeyFile?: string; publicHostsFile?: string }
-  >;
-};
-
-export type ProxyCLIArgs = {
-  sshProxyPrivateKeyFile?: string;
-  sshProxyKnownHostsFile?: string;
-  sshProxyLocalPort?: number;
-
-  proxyServer?: string;
-  proxyServerPreferSingleProxy?: boolean;
-
-  proxyMap?: ProxyServerConfig;
-};
-
-const proxyMap = new Map<RegExp, ProxyEntry>();
-let defaultProxyEntry: ProxyEntry | null = null;
+let proxyDispatcher: Dispatcher | undefined = undefined;
 
 export function getEnvProxyUrl() {
   if (process.env.PROXY_SERVER) {
@@ -59,27 +28,6 @@ export function getEnvProxyUrl() {
   return "";
 }
 
-export function loadProxyConfig(params: {
-  proxyServerConfig?: string;
-  proxyMap?: ProxyServerConfig;
-}) {
-  if (params.proxyServerConfig) {
-    const proxyServerConfig = params.proxyServerConfig;
-    try {
-      const proxies = yaml.load(
-        fs.readFileSync(proxyServerConfig, "utf8"),
-        // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      ) as any;
-      params.proxyMap = proxies;
-      logger.debug("Proxy host match config loaded", { proxyServerConfig });
-    } catch (e) {
-      logger.warn("Proxy host match config file not found, ignoring", {
-        proxyServerConfig,
-      });
-    }
-  }
-}
-
 export function getSafeProxyString(proxyString: string): string {
   if (!proxyString) {
     return "";
@@ -106,127 +54,31 @@ export function getSafeProxyString(proxyString: string): string {
 }
 
 export async function initProxy(
-  params: ProxyCLIArgs,
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  params: Record<string, any>,
   detached: boolean,
-): Promise<{ proxyServer?: string; proxyPacUrl?: string }> {
-  const { sshProxyPrivateKeyFile, sshProxyKnownHostsFile, sshProxyLocalPort } =
-    params;
-  let localPort = sshProxyLocalPort || SSH_PROXY_LOCAL_PORT;
+): Promise<string | undefined> {
+  let proxy = params.proxyServer;
 
-  const singleProxy = params.proxyServer || getEnvProxyUrl();
-
-  if (singleProxy) {
-    defaultProxyEntry = await initSingleProxy(
-      singleProxy,
-      localPort++,
-      detached,
-      sshProxyPrivateKeyFile,
-      sshProxyKnownHostsFile,
-    );
-    if (params.proxyServerPreferSingleProxy && defaultProxyEntry.proxyUrl) {
-      return { proxyServer: defaultProxyEntry.proxyUrl };
-    }
+  if (!proxy) {
+    proxy = getEnvProxyUrl();
   }
 
-  if (!params.proxyMap?.matchHosts || !params.proxyMap?.proxies) {
-    if (defaultProxyEntry) {
-      logger.debug("Using Single Proxy", {}, "proxy");
-    }
-    return { proxyServer: defaultProxyEntry?.proxyUrl };
-  }
-
-  const nameToProxy = new Map<string, ProxyEntry>();
-
-  for (const [name, value] of Object.entries(params.proxyMap.proxies)) {
-    let proxyUrl = "";
-    let privateKeyFile: string | undefined = "";
-    let publicHostsFile: string | undefined = "";
-
-    if (typeof value === "string") {
-      proxyUrl = value;
-    } else {
-      proxyUrl = value.url;
-      privateKeyFile = value.privateKeyFile;
-      publicHostsFile = value.publicHostsFile;
-    }
-
-    privateKeyFile = privateKeyFile || sshProxyPrivateKeyFile;
-    publicHostsFile = publicHostsFile || sshProxyKnownHostsFile;
-
-    const entry = await initSingleProxy(
-      proxyUrl,
-      localPort++,
-      detached,
-      privateKeyFile,
-      publicHostsFile,
-    );
-
-    nameToProxy.set(name, entry);
-  }
-
-  for (const [rx, name] of Object.entries(params.proxyMap.matchHosts)) {
-    const entry = nameToProxy.get(name);
-
-    if (!entry) {
-      logger.fatal("Proxy specified but not found in proxies list: " + name);
-      return {};
-    }
-
-    if (rx) {
-      proxyMap.set(new RegExp(rx), entry);
-    } else {
-      defaultProxyEntry = entry;
-    }
-  }
-
-  const p = new ProxyPacServer();
-
-  logger.debug("Using Proxy PAC script", {}, "proxy");
-
-  return { proxyPacUrl: `http://localhost:${p.port}/proxy.pac` };
-}
-
-export async function initSingleProxy(
-  proxyUrl: string,
-  localPort: number,
-  detached: boolean,
-  sshProxyPrivateKeyFile?: string,
-  sshProxyKnownHostsFile?: string,
-): Promise<{ proxyUrl: string; dispatcher: Dispatcher }> {
-  logger.debug("Initing proxy", {
-    url: getSafeProxyString(proxyUrl),
-    localPort,
-    sshProxyPrivateKeyFile,
-    sshProxyKnownHostsFile,
-  });
-
-  if (proxyUrl && proxyUrl.startsWith("ssh://")) {
-    proxyUrl = await runSSHD(
-      proxyUrl,
-      localPort,
-      detached,
-      sshProxyPrivateKeyFile,
-      sshProxyKnownHostsFile,
-    );
+  if (proxy && proxy.startsWith("ssh://")) {
+    proxy = await runSSHD(params, detached);
   }
 
   const agentOpts: Agent.Options = {
     headersTimeout: FETCH_HEADERS_TIMEOUT_SECS * 1000,
   };
 
-  const dispatcher = createDispatcher(proxyUrl, agentOpts);
-  return { proxyUrl, dispatcher };
+  // set global fetch() dispatcher (with proxy, if any)
+  const dispatcher = createDispatcher(proxy, agentOpts);
+  proxyDispatcher = dispatcher;
+  return proxy;
 }
 
-export function getProxyDispatcher(url: string) {
-  // find url match by regex first
-  for (const [rx, { dispatcher }] of proxyMap.entries()) {
-    if (rx && url.match(rx)) {
-      return dispatcher;
-    }
-  }
-  // if default proxy set, return default dispatcher, otherwise no dispatcher
-  return defaultProxyEntry ? defaultProxyEntry.dispatcher : undefined;
+export function getProxyDispatcher() {
+  return proxyDispatcher;
 }
 
 export function createDispatcher(
@@ -261,13 +113,9 @@ export function createDispatcher(
   }
 }
 
-export async function runSSHD(
-  proxyServer: string,
-  localPort: number,
-  detached: boolean,
-  privateKey?: string,
-  publicKnownHost?: string,
-) {
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+export async function runSSHD(params: Record<string, any>, detached: boolean) {
+  const { proxyServer } = params;
   if (!proxyServer || !proxyServer.startsWith("ssh://")) {
     return "";
   }
@@ -278,14 +126,17 @@ export async function runSSHD(
   const host = proxyServerUrl.hostname.replace("[", "").replace("]", "");
   const port = proxyServerUrl.port || 22;
   const user = proxyServerUrl.username || "root";
+  const localPort = params.sshProxyLocalPort || SSH_PROXY_LOCAL_PORT;
   const proxyString = `socks5://localhost:${localPort}`;
 
   const args: string[] = [
     user + "@" + host,
     "-p",
-    port + "",
+    port,
     "-D",
-    localPort + "",
+    localPort,
+    "-i",
+    params.sshProxyPrivateKeyFile,
     "-o",
     "IdentitiesOnly=yes",
     "-o",
@@ -295,17 +146,12 @@ export async function runSSHD(
     "-o",
   ];
 
-  if (publicKnownHost) {
-    args.push(`UserKnownHostsFile=${publicKnownHost}`);
+  if (params.sshProxyKnownHostsFile) {
+    args.push(`UserKnownHostsFile=${params.sshProxyKnownHostsFile}`);
   } else {
     args.push("StrictHostKeyChecking=no");
   }
 
-  if (privateKey) {
-    args.push("-i");
-    args.push(privateKey);
-  }
-
   args.push("-M", "0", "-N", "-T");
 
   logger.info("Checking SSH connection for proxy...", {}, "proxy");
@@ -375,7 +221,7 @@ export async function runSSHD(
       "proxy",
       ExitCodes.ProxyError,
     );
-    return "";
+    return;
   }
 
   logger.info(
@@ -395,61 +241,10 @@ export async function runSSHD(
     },
     "proxy",
   );
-    runSSHD(
-      proxyServer,
-      localPort,
-      detached,
-      privateKey,
-      publicKnownHost,
-    ).catch((e) => logger.error("proxy retry error", e, "proxy"));
+    runSSHD(params, detached).catch((e) =>
+      logger.error("proxy retry error", e, "proxy"),
+    );
   });
 
   return proxyString;
 }
-
-class ProxyPacServer {
-  port = 20278;
-
-  proxyPacText = "";
-
-  constructor() {
-    const httpServer = http.createServer((req, res) =>
-      this.handleRequest(req, res),
-    );
-    httpServer.listen(this.port);
-    this.generateProxyPac();
-  }
-
-  async handleRequest(request: IncomingMessage, response: ServerResponse) {
-    response.writeHead(200, {
-      "Content-Type": "application/x-ns-proxy-autoconfig",
-    });
-    response.end(this.proxyPacText);
-  }
-
-  generateProxyPac() {
-    const urlToProxy = (proxyUrl: string) => {
-      const url = new URL(proxyUrl);
-      const hostport = url.href.slice(url.protocol.length + 2);
-      const type = url.protocol.slice(0, -1).toUpperCase();
-      return `"${type} ${hostport}"`;
-    };
-
-    this.proxyPacText = `
-
-function FindProxyForURL(url, host) {
-
-`;
-    proxyMap.forEach(({ proxyUrl }, k) => {
-      this.proxyPacText += `  if (url.match(/${
-        k.source
-      }/)) { return ${urlToProxy(proxyUrl)}; }\n`;
-    });
-
-    this.proxyPacText += `\n  return ${
-      defaultProxyEntry ? urlToProxy(defaultProxyEntry.proxyUrl) : `"DIRECT"`
-    };
-}
-`;
-  }
-}
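Tying this hunk back to the host-specific proxy documentation above: for the example `matchHosts` config, the `generateProxyPac()` logic on main would emit a PAC script along these lines (an editorial illustration, not part of the diff; the SSH proxies appear as local SOCKS5 tunnels on ports allocated upward from 9722, and the regex text follows `RegExp.source` escaping):

```javascript
// served at http://localhost:20278/proxy.pac
function FindProxyForURL(url, host) {
  if (url.match(/example.com\/.*/)) { return "SOCKS5 username:password@my-socks-5-proxy.example.com"; }
  if (url.match(/https:\/\/my-social.example.com\/.*\/posts\//)) { return "SOCKS5 localhost:9723"; }

  return "SOCKS5 localhost:9724";
}
```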
(File diff suppressed because it is too large.)

src/util/seeds.ts

@@ -342,7 +342,6 @@ export async function parseSeeds(params: CrawlerArgs): Promise<ScopedSeed[]> {
 
   for (const seed of seeds) {
     const newSeed = typeof seed === "string" ? { url: seed } : seed;
-    newSeed.url = removeQuotes(newSeed.url);
 
     try {
       scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
@@ -390,14 +389,3 @@ export function parseRx(
     return value.map((e) => (e instanceof RegExp ? e : new RegExp(e)));
   }
 }
-
-export function removeQuotes(url: string) {
-  url = url.trim();
-  if (
-    (url.startsWith(`"`) && url.endsWith(`"`)) ||
-    (url.startsWith(`'`) && url.endsWith(`'`))
-  ) {
-    url = url.slice(1, -1);
-  }
-  return url;
-}
src/util/sitemapper.ts

@@ -68,7 +68,7 @@ export class SitemapReader extends EventEmitter {
     while (true) {
       const resp = await fetch(url, {
         headers: this.headers,
-        dispatcher: getProxyDispatcher(url),
+        dispatcher: getProxyDispatcher(),
       });
 
       if (resp.ok) {
src/util/state.ts

@@ -85,7 +85,6 @@ export class PageState {
 
   skipBehaviors = false;
   pageSkipped = false;
-  asyncLoading = false;
   filteredFrames: Frame[] = [];
   loadState: LoadState = LoadState.FAILED;
   contentCheckAllowed = false;
@@ -459,10 +458,6 @@ return inx;
   }
 
   async trimToLimit(limit: number) {
-    if (limit === 0) {
-      return;
-    }
-
     const totalComplete =
       (await this.numPending()) +
       (await this.numDone()) +
src/util/worker.ts

@@ -311,7 +311,7 @@ export class PageWorker {
     }
 
     await timedRun(
-      this.crawler.pageFinished(data, this.recorder?.lastErrorText),
+      this.crawler.pageFinished(data),
       FINISHED_TIMEOUT,
       "Page Finished Timed Out",
       this.logDetails,
(tests: basic crawl)

@@ -8,7 +8,7 @@ const testIf = (condition, ...args) => condition ? test(...args) : test.skip(...
 
 test("ensure basic crawl run with docker run passes", async () => {
   child_process.execSync(
-    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --warcPrefix custom-prefix',
+    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --warcPrefix custom-prefix',
   );
 
   child_process.execSync(
(tests: custom behaviors)

@@ -1,21 +1,6 @@
 import child_process from "child_process";
 import Redis from "ioredis";
 
-let proc = null;
-
-const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
-const TEST_HOST = `http://${DOCKER_HOST_NAME}:31503`;
-
-beforeAll(() => {
-  proc = child_process.spawn("../../node_modules/.bin/http-server", ["-p", "31503"], {cwd: "tests/custom-behaviors/"});
-});
-
-afterAll(() => {
-  if (proc) {
-    proc.kill();
-  }
-});
-
 
 async function sleep(time) {
   await new Promise((resolve) => setTimeout(resolve, time));
@@ -24,7 +9,7 @@ async function sleep(time) {
 
 test("test custom behaviors from local filepath", async () => {
   const res = child_process.execSync(
-    "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example-com.webrecorder.net/page --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
+    "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
   );
 
   const log = res.toString();
@@ -36,10 +21,10 @@ test("test custom behaviors from local filepath", async () => {
     ) > 0,
   ).toBe(true);
 
-  // but not for example.com
+  // but not for example.org
   expect(
     log.indexOf(
-      '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://example-com.webrecorder.net/page","workerid":0}}',
+      '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://example.org","workerid":0}}',
     ) > 0,
   ).toBe(false);
 
@@ -52,7 +37,7 @@ test("test custom behaviors from local filepath", async () => {
 });
 
 test("test custom behavior from URL", async () => {
-  const res = child_process.execSync(`docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --customBehaviors ${TEST_HOST}/custom-2.js --scopeType page`);
+  const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --scopeType page");
 
   const log = res.toString();
 
@@ -66,7 +51,7 @@ test("test custom behavior from URL", async () => {
 });
 
 test("test mixed custom behavior sources", async () => {
-  const res = child_process.execSync(`docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors ${TEST_HOST}/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page`);
+  const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page");
 
   const log = res.toString();
 
@@ -89,7 +74,7 @@ test("test mixed custom behavior sources", async () => {
 
 test("test custom behaviors from git repo", async () => {
   const res = child_process.execSync(
-    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example-com.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors \"git+https://github.com/webrecorder/browsertrix-crawler.git?branch=main&path=tests/custom-behaviors\" --scopeType page",
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors \"git+https://github.com/webrecorder/browsertrix-crawler.git?branch=main&path=tests/custom-behaviors\" --scopeType page",
   );
 
   const log = res.toString();
@@ -101,10 +86,10 @@ test("test custom behaviors from git repo", async () => {
     ) > 0,
   ).toBe(true);
 
-  // but not for example.com
+  // but not for example.org
   expect(
     log.indexOf(
-      '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://example-com.webrecorder.net/","workerid":0}}',
+      '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://example.org/","workerid":0}}',
     ) > 0,
   ).toBe(false);
 
@@ -121,7 +106,7 @@ test("test invalid behavior exit", async () => {
 
   try {
     child_process.execSync(
-      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/invalid-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net.webrecorder.net/ --url https://example-com.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/invalid-export.js --scopeType page",
+      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/invalid-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/invalid-export.js --scopeType page",
     );
   } catch (e) {
     status = e.status;
@@ -136,7 +121,7 @@ test("test crawl exits if behavior not fetched from url", async () => {
 
   try {
     child_process.execSync(
-      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net --customBehaviors https://webrecorder.net/doesntexist/custombehavior.js --scopeType page",
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com --customBehaviors https://webrecorder.net/doesntexist/custombehavior.js --scopeType page",
    );
  } catch (e) {
    status = e.status;
@@ -151,7 +136,7 @@ test("test crawl exits if behavior not fetched from git repo", async () => {
 
   try {
     child_process.execSync(
-      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net --customBehaviors git+https://github.com/webrecorder/doesntexist --scopeType page",
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com --customBehaviors git+https://github.com/webrecorder/doesntexist --scopeType page",
    );
  } catch (e) {
    status = e.status;
@@ -166,7 +151,7 @@ test("test crawl exits if not custom behaviors collected from local path", async () => {
 
   try {
     child_process.execSync(
-      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net --customBehaviors /custom-behaviors/doesntexist --scopeType page",
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com --customBehaviors /custom-behaviors/doesntexist --scopeType page",
    );
  } catch (e) {
    status = e.status;
@@ -181,7 +166,7 @@ test("test pushing behavior logs to redis", async () => {
 
   const redisId = child_process.execSync("docker run --rm --network=crawl -p 36399:6379 --name redis -d redis");
 
-  const child = child_process.exec(`docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ -e CRAWL_ID=behavior-logs-redis-test --network=crawl --rm webrecorder/browsertrix-crawler crawl --debugAccessRedis --redisStoreUrl redis://redis:6379 --url https://specs.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors ${TEST_HOST}/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page --logBehaviorsToRedis`);
+  const child = child_process.exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ -e CRAWL_ID=behavior-logs-redis-test --network=crawl --rm webrecorder/browsertrix-crawler crawl --debugAccessRedis --redisStoreUrl redis://redis:6379 --url https://specs.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page --logBehaviorsToRedis");
 
   let resolve = null;
   const crawlFinished = new Promise(r => resolve = r);
(test fixture: recorded flow JSON)

@@ -28,7 +28,7 @@
     },
     {
       "type": "change",
-      "value": "https://example-com.webrecorder.net/",
+      "value": "https://example.com/",
       "selectors": [
         [
           "aria/[role=\"main\"]",
(tests: custom selectors and autoclick)

@@ -43,8 +43,8 @@ test("test custom selector crawls JS files as pages", async () => {
   ]);
 
   const expectedExtraPages = new Set([
-    "https://www.iana.org/static/_js/jquery.js",
-    "https://www.iana.org/static/_js/iana.js",
+    "https://www.iana.org/_js/jquery.js",
+    "https://www.iana.org/_js/iana.js",
   ]);
 
   expect(pages).toEqual(expectedPages);
@@ -71,7 +71,7 @@ test("test valid autoclick selector passes validation", async () => {
 
   try {
     child_process.execSync(
-      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --clickSelector button --scopeType page",
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --clickSelector button --scopeType page",
    );
  } catch (e) {
    failed = true;
@@ -87,7 +87,7 @@ test("test invalid autoclick selector fails validation, crawl fails", async () => {
 
   try {
     child_process.execSync(
-      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --clickSelector \",\" --scopeType page",
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --clickSelector \",\" --scopeType page",
    );
  } catch (e) {
    status = e.status;
(tests: exclusion applied on redirected URL)

@@ -6,7 +6,7 @@ import { execSync } from "child_process";
 
 test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
   execSync(
-    "docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1");
+    "docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --exclude help --collection redir-exclude-test --extraHops 1");
 
   // no entries besides header
   expect(
(test fixture: proxy config with auth; present on main only)

@@ -1,6 +0,0 @@
-matchHosts:
-  old.webrecorder.net: socks-proxy
-
-proxies:
-  socks-proxy: socks5://user:passw1rd@proxy-with-auth:1080
-

(test fixture: proxy config with auth; present on main only)

@@ -1,5 +0,0 @@
-matchHosts:
-  old.webrecorder.net: socks-proxy
-
-proxies:
-  socks-proxy: socks5://user:passw0rd@proxy-with-auth:1080
tests/fixtures/urlSeedFile.txt (vendored, 3 lines changed)

@@ -1,3 +1,2 @@
-https://old.webrecorder.net/about/
+https://webrecorder.net/about/
 https://specs.webrecorder.net/wacz/1.1.1/
-"https://old.webrecorder.net/faq"
tests/custom-behaviors/custom.js

@@ -10,7 +10,7 @@ export class TestBehavior {
   }
 
   static isMatch() {
-    return window.location.origin === "https://example-com.webrecorder.net";
+    return window.location.origin === "https://example.com";
   }
 
   async *run(ctx) {
@ -76,7 +76,7 @@ test("PDF: check that the pages.jsonl file entry contains status code and mime t
|
|||
expect(pageH.loadState).toBe(2);
|
||||
});
|
||||
|
||||
test("PDF: check that CDX contains data from two crawls: one pdf 200, one 301 and one 200, two pageinfo entries", () => {
|
||||
test("PDF: check that CDX contains one pdf 200, one 301 and one 200, two pageinfo entries", () => {
|
||||
const filedata = fs.readFileSync(
|
||||
"test-crawls/collections/crawl-pdf/indexes/index.cdxj",
|
||||
{ encoding: "utf-8" },
|
||||
|
@ -90,7 +90,6 @@ test("PDF: check that CDX contains data from two crawls: one pdf 200, one 301 an
|
|||
expect(cdxj[0].url).toBe(PDF_HTTP);
|
||||
expect(cdxj[0].status).toBe("301");
|
||||
|
||||
// this is duplicated as this is data from two crawls
|
||||
expect(cdxj[1].url).toBe(PDF);
|
||||
expect(cdxj[1].status).toBe("200");
|
||||
expect(cdxj[1].mime).toBe("application/pdf");
|
||||
|
@ -150,7 +149,7 @@ test("XML: check that CDX contains one xml 200, one 301 and one 200, two pageinf
|
|||
const lines = filedata.trim().split("\n");
|
||||
const cdxj = lines.map(line => JSON.parse(line.split(" ").slice(2).join(" "))).sort((a, b) => a.url < b.url ? -1 : 1);
|
||||
|
||||
expect(cdxj.length).toBe(5);
|
||||
expect(cdxj.length).toBe(6);
|
||||
|
||||
expect(cdxj[0].url).toBe("https://webrecorder.net/favicon.ico");
|
||||
|
||||
|
@@ -158,14 +157,18 @@ test("XML: check that CDX contains one xml 200, one 301 and one 200, two pageinf
   expect(cdxj[1].status).toBe("200");
   expect(cdxj[1].mime).toBe("application/xml");

-  expect(cdxj[2].url).toBe(XML_REDIR);
-  expect(cdxj[2].status).toBe("301");
+  expect(cdxj[2].url).toBe(XML);
+  expect(cdxj[2].status).toBe("200");
+  expect(cdxj[2].mime).toBe("application/xml");

-  expect(cdxj[3].url).toBe("urn:pageinfo:" + XML);
-  expect(cdxj[3].mime).toBe("application/json");
+  expect(cdxj[3].url).toBe(XML_REDIR);
+  expect(cdxj[3].status).toBe("301");

-  expect(cdxj[4].url).toBe("urn:pageinfo:" + XML_REDIR);
+  expect(cdxj[4].url).toBe("urn:pageinfo:" + XML);
   expect(cdxj[4].mime).toBe("application/json");

+  expect(cdxj[5].url).toBe("urn:pageinfo:" + XML_REDIR);
+  expect(cdxj[5].mime).toBe("application/json");
 });

@@ -118,9 +118,9 @@ function validateResourcesIndex(json) {
     { status: 200, mime: "text/css", type: "stylesheet" },
   "https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap":
     { status: 200, mime: "text/css", type: "stylesheet" },
-  "https://fonts.gstatic.com/s/sourcesanspro/v23/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
+  "https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
     { status: 200, mime: "font/woff2", type: "font" },
-  "https://fonts.gstatic.com/s/sourcesanspro/v23/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
+  "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
     { status: 200, mime: "font/woff2", type: "font" },
   "https://old.webrecorder.net/assets/favicon.ico": {
     status: 200,

@@ -161,9 +161,9 @@ function validateResourcesAbout(json) {
     mime: "image/svg+xml",
     type: "image",
   },
-  "https://fonts.gstatic.com/s/sourcesanspro/v23/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
+  "https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
     { status: 200, mime: "font/woff2", type: "font" },
-  "https://fonts.gstatic.com/s/sourcesanspro/v23/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
+  "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
     { status: 200, mime: "font/woff2", type: "font" },
   });
 }

@@ -9,8 +9,6 @@ const SOCKS_PORT = "1080";
 const HTTP_PORT = "3128";
 const WRONG_PORT = "33130";

-const PROXY_EXIT_CODE = 21;
-
 const SSH_PROXY_IMAGE = "linuxserver/openssh-server"

 const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf";

@@ -29,7 +27,7 @@ beforeAll(() => {

   proxyNoAuthId = execSync(`docker run -d --rm --network=proxy-test-net --name proxy-no-auth ${PROXY_IMAGE}`, {encoding: "utf-8"});

-  proxySSHId = execSync(`docker run -d --rm -e DOCKER_MODS=linuxserver/mods:openssh-server-ssh-tunnel -e USER_NAME=user -e PUBLIC_KEY_FILE=/keys/proxy-key.pub -v $PWD/tests/fixtures/proxies/proxy-key.pub:/keys/proxy-key.pub --network=proxy-test-net --name ssh-proxy ${SSH_PROXY_IMAGE}`);
+  proxySSHId = execSync(`docker run -d --rm -e DOCKER_MODS=linuxserver/mods:openssh-server-ssh-tunnel -e USER_NAME=user -e PUBLIC_KEY_FILE=/keys/proxy-key.pub -v $PWD/tests/fixtures/proxy-key.pub:/keys/proxy-key.pub --network=proxy-test-net --name ssh-proxy ${SSH_PROXY_IMAGE}`);
 });

 afterAll(async () => {

@@ -68,7 +66,7 @@ describe("socks5 + https proxy tests", () => {
       status = e.status;
     }
     // auth supported only for SOCKS5
-    expect(status).toBe(scheme === "socks5" ? 0 : PROXY_EXIT_CODE);
+    expect(status).toBe(scheme === "socks5" ? 0 : 1);
   });

   test(`${scheme} proxy, ${type}, wrong auth`, () => {

@@ -79,7 +77,7 @@ describe("socks5 + https proxy tests", () => {
     } catch (e) {
       status = e.status;
     }
-    expect(status).toBe(PROXY_EXIT_CODE);
+    expect(status).toBe(1);
   });

   test(`${scheme} proxy, ${type}, wrong protocol`, () => {

@@ -90,8 +88,7 @@ describe("socks5 + https proxy tests", () => {
     } catch (e) {
       status = e.status;
     }
-    // wrong protocol (socks5 for http) causes connection to hang, causes a timeout, so just errors with 1
-    expect(status === PROXY_EXIT_CODE || status === 1).toBe(true);
+    expect(status).toBe(1);
   });
 }

@@ -103,7 +100,7 @@ describe("socks5 + https proxy tests", () => {
     } catch (e) {
       status = e.status;
     }
-    expect(status).toBe(PROXY_EXIT_CODE);
+    expect(status).toBe(1);
   });
 }
 });

@@ -121,7 +118,7 @@ test("http proxy set, but not running, separate env vars", () => {
   } catch (e) {
     status = e.status;
   }
-  expect(status).toBe(PROXY_EXIT_CODE);
+  expect(status).toBe(1);
 });

 test("http proxy set, but not running, cli arg", () => {

@@ -132,12 +129,12 @@ test("http proxy set, but not running, cli arg", () => {
   } catch (e) {
     status = e.status;
   }
-  expect(status).toBe(PROXY_EXIT_CODE);
+  expect(status).toBe(1);
 });


 test("ssh socks proxy with custom user", () => {
-  execSync(`docker run --rm --network=proxy-test-net -v $PWD/tests/fixtures/proxies/proxy-key:/keys/proxy-key webrecorder/browsertrix-crawler crawl --proxyServer ssh://user@ssh-proxy:2222 --sshProxyPrivateKeyFile /keys/proxy-key --url ${HTML} ${extraArgs}`, {encoding: "utf-8"});
+  execSync(`docker run --rm --network=proxy-test-net -v $PWD/tests/fixtures/proxy-key:/keys/proxy-key webrecorder/browsertrix-crawler crawl --proxyServer ssh://user@ssh-proxy:2222 --sshProxyPrivateKeyFile /keys/proxy-key --url ${HTML} ${extraArgs}`, {encoding: "utf-8"});
 });

@@ -149,7 +146,7 @@ test("ssh socks proxy, wrong user", () => {
   } catch (e) {
     status = e.status;
   }
-  expect(status).toBe(PROXY_EXIT_CODE);
+  expect(status).toBe(21);
 });

@@ -167,30 +164,4 @@ test("ensure logged proxy string does not include any credentials", () => {
 });


-test("proxy with config file, wrong auth or no match", () => {
-  let status = 0;
-  try {
-    execSync(`docker run --rm --network=proxy-test-net -v $PWD/tests/fixtures/proxies/:/proxies/ webrecorder/browsertrix-crawler crawl --proxyServerConfig /proxies/proxy-test-bad-auth.pac --url ${HTML} ${extraArgs}`, {encoding: "utf-8"});
-  } catch (e) {
-    status = e.status;
-  }
-  expect(status).toBe(PROXY_EXIT_CODE);
-
-  // success, no match for PDF
-  execSync(`docker run --rm --network=proxy-test-net -v $PWD/tests/fixtures/proxies/:/proxies/ webrecorder/browsertrix-crawler crawl --proxyServerConfig /proxies/proxy-test-bad-auth.pac --url ${PDF} ${extraArgs}`, {encoding: "utf-8"});
-});
-
-test("proxy with config file, correct auth or no match", () => {
-  let status = 0;
-  try {
-    execSync(`docker run --rm --network=proxy-test-net -v $PWD/tests/fixtures/proxies/:/proxies/ webrecorder/browsertrix-crawler crawl --proxyServerConfig /proxies/proxy-test-good-auth.pac --url ${HTML} ${extraArgs}`, {encoding: "utf-8"});
-  } catch (e) {
-    status = e.status;
-  }
-  expect(status).toBe(0);
-
-  // success, no match for PDF
-  execSync(`docker run --rm --network=proxy-test-net -v $PWD/tests/fixtures/proxies/:/proxies/ webrecorder/browsertrix-crawler crawl --proxyServerConfig /proxies/proxy-test-good-auth.pac --url ${PDF} ${extraArgs}`, {encoding: "utf-8"});
-
-});

@@ -38,7 +38,7 @@ afterAll(() => {


 test("run crawl with retries for no response", async () => {
-  execSync(`docker run -d -v $PWD/test-crawls:/crawls -e CRAWL_ID=test -p 36387:6379 --rm webrecorder/browsertrix-crawler crawl --url http://invalid-host-x:31501 --url https://example-com.webrecorder.net/ --limit 2 --pageExtraDelay 10 --debugAccessRedis --collection retry-fail --retries 5`);
+  execSync(`docker run -d -v $PWD/test-crawls:/crawls -e CRAWL_ID=test -p 36387:6379 --rm webrecorder/browsertrix-crawler crawl --url http://invalid-host-x:31501 --url https://example.com/ --limit 2 --pageExtraDelay 10 --debugAccessRedis --collection retry-fail --retries 5`);

   const redis = new Redis("redis://127.0.0.1:36387/0", { lazyConnect: true, retryStrategy: () => null });

@ -90,7 +90,7 @@ test("run crawl with retries for 503, enough retries to succeed", async () => {
|
|||
requests = 0;
|
||||
success = false;
|
||||
|
||||
const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example-com.webrecorder.net/ --limit 2 --collection retry-fail-2 --retries 2 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);
|
||||
const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --collection retry-fail-2 --retries 2 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);
|
||||
|
||||
let status = 0;
|
||||
|
||||
|
@@ -117,7 +117,7 @@ test("run crawl with retries for 503, not enough retries, fail", async () => {
   requests = 0;
   success = false;

-  const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example-com.webrecorder.net/ --limit 2 --collection retry-fail-3 --retries 1 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);
+  const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --collection retry-fail-3 --retries 1 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);

   let status = 0;

@@ -143,7 +143,7 @@ test("run crawl with retries for 503, no retries, fail", async () => {
   requests = 0;
   success = false;

-  const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example-com.webrecorder.net/ --limit 2 --collection retry-fail-4 --retries 0 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);
+  const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --collection retry-fail-4 --retries 0 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);

   let status = 0;

@@ -1,30 +1,13 @@
 import util from "util";
-import { spawn, exec as execCallback } from "child_process";
+import { exec as execCallback } from "child_process";
 import fs from "fs";

 const exec = util.promisify(execCallback);

-let proc = null;
-
-const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
-const TEST_HOST = `http://${DOCKER_HOST_NAME}:31502`;
-
-beforeAll(() => {
-  proc = spawn("../../node_modules/.bin/http-server", ["-p", "31502"], {cwd: "tests/fixtures/"});
-});
-
-afterAll(() => {
-  if (proc) {
-    proc.kill();
-  }
-});
-
-
 test("check that URLs in seed-list are crawled", async () => {
   try {
     await exec(
-      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000 --scopeType page",
+      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000",
     );
   } catch (error) {
     console.log(error);

@@ -60,7 +43,7 @@ test("check that URLs in seed-list are crawled", async () => {
 test("check that URLs in seed-list hosted at URL are crawled", async () => {
   try {
     await exec(
-      `docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "${TEST_HOST}/urlSeedFile.txt" --timeout 90000 --scopeType page`,
+      'docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/fixtures/urlSeedFile.txt" --timeout 90000',
     );
   } catch (error) {
     console.log(error);

yarn.lock (133 changed lines)

@@ -772,17 +772,17 @@
     tslib "^2.7.0"
     tsyringe "^4.8.0"

-"@puppeteer/browsers@2.10.10":
-  version "2.10.10"
-  resolved "https://registry.yarnpkg.com/@puppeteer/browsers/-/browsers-2.10.10.tgz#f806f92d966918c931fb9c48052eba2db848beaa"
-  integrity sha512-3ZG500+ZeLql8rE0hjfhkycJjDj0pI/btEh3L9IkWUYcOrgP0xCNRq3HbtbqOPbvDhFaAWD88pDFtlLv8ns8gA==
+"@puppeteer/browsers@2.10.2":
+  version "2.10.2"
+  resolved "https://registry.yarnpkg.com/@puppeteer/browsers/-/browsers-2.10.2.tgz#c2a63cee699c6b5b971b9fcba9095098970f1648"
+  integrity sha512-i4Ez+s9oRWQbNjtI/3+jxr7OH508mjAKvza0ekPJem0ZtmsYHP3B5dq62+IaBHKaGCOuqJxXzvFLUhJvQ6jtsQ==
   dependencies:
-    debug "^4.4.3"
+    debug "^4.4.0"
     extract-zip "^2.0.1"
     progress "^2.0.3"
     proxy-agent "^6.5.0"
-    semver "^7.7.2"
-    tar-fs "^3.1.0"
+    semver "^7.7.1"
+    tar-fs "^3.0.8"
     yargs "^17.7.2"

 "@puppeteer/browsers@2.8.0":

@@ -798,10 +798,10 @@
     tar-fs "^3.0.8"
     yargs "^17.7.2"

-"@puppeteer/replay@^3.1.3":
-  version "3.1.3"
-  resolved "https://registry.yarnpkg.com/@puppeteer/replay/-/replay-3.1.3.tgz#24178c5aa28af1c1b47d39043d62dd722680b55e"
-  integrity sha512-chqKAKoVDtqXAFib93So2W+KHdd1RZ/yfOgXW+u0+BQaElTLVe+OpaLzEn+MIWfIkakhBHE5/tP0/CFQMVydQQ==
+"@puppeteer/replay@^3.1.1":
+  version "3.1.1"
+  resolved "https://registry.yarnpkg.com/@puppeteer/replay/-/replay-3.1.1.tgz#ada5412c5330ba22e3186ed4b622d26ac89bf564"
+  integrity sha512-8tW1APEoqkpPVH19wRPqePb+/wbGuSVxE2OeRySKeb2SX1VpL2TuADodETRVGYYe07gBbs8FucaUu09A0QI7+w==
   dependencies:
     cli-table3 "0.6.5"
     colorette "2.0.20"

@@ -1134,16 +1134,16 @@
   resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
   integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==

-"@webrecorder/wabac@^2.24.1":
-  version "2.24.1"
-  resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.24.1.tgz#4cf2423a8a593410eabc7cb84041331d39081a96"
-  integrity sha512-n3MwHpPNbU1LrwZjlax9UJVvYwfYAiYQDjzAQbeE6SrAU/YFGgD3BthLCaHP5YyIvFjIKtUpfxbsxHYRqNAyxg==
+"@webrecorder/wabac@^2.23.6":
+  version "2.23.6"
+  resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.6.tgz#cfcf9ef071732de0b71d49b0d3276711e88788a7"
+  integrity sha512-eyRew3ddm0PzzD81racFf1REwTQeoMHAQLreG5+B5OBWBdfFblzlbsUTp4KqiFKHMZ14WXjIxznFcYwCUpkA6w==
   dependencies:
     "@peculiar/asn1-ecc" "^2.3.4"
     "@peculiar/asn1-schema" "^2.3.3"
     "@peculiar/x509" "^1.9.2"
     "@types/js-levenshtein" "^1.1.3"
-    "@webrecorder/wombat" "^3.9.1"
+    "@webrecorder/wombat" "^3.8.13"
     acorn "^8.10.0"
     auto-js-ipfs "^2.1.1"
     base64-js "^1.5.1"

@@ -1151,6 +1151,7 @@
     buffer "^6.0.3"
     fast-xml-parser "^4.4.1"
     hash-wasm "^4.9.0"
+    http-link-header "^1.1.3"
     http-status-codes "^2.1.4"
     idb "^7.1.1"
     js-levenshtein "^1.1.6"

@@ -1161,14 +1162,14 @@
     path-parser "^6.1.0"
     process "^0.11.10"
     stream-browserify "^3.0.0"
-    warcio "^2.4.7"
+    warcio "^2.4.3"

-"@webrecorder/wombat@^3.9.1":
-  version "3.9.1"
-  resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.9.1.tgz#266135612e8063fa6b453f45d37d2c94e7be93d6"
-  integrity sha512-NX7vYQxulVRPgZk4ok9JbrUsf0dct2f34D/B1ZUCcB4M9aTKDhDAxwoIJbMha4DLhQlPcPp2wjH5/uJtPvtsXQ==
+"@webrecorder/wombat@^3.8.13":
+  version "3.8.13"
+  resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.13.tgz#264f639dd102dca415f5d01a649d6b95dfac9779"
+  integrity sha512-gg80bEpJE+2Wn0ZTbfCkt9+vTftJemBwAWe9TYXo7ErCX1v7RbIrZ5LfkjSWx3vCx6R4V31DxXk1mycsVrEapA==
   dependencies:
-    warcio "^2.4.7"
+    warcio "^2.4.0"

 "@zxing/text-encoding@0.9.0":
   version "0.9.0"

@@ -1594,10 +1595,10 @@ browserslist@^4.24.0:
     node-releases "^2.0.18"
     update-browserslist-db "^1.1.1"

-browsertrix-behaviors@^0.9.2:
-  version "0.9.2"
-  resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.9.2.tgz#b5bee47d15014a05a873d8cc6ea8917bfa61d5c8"
-  integrity sha512-d7rLNKXaiD83S4uXKBUf2x9UzmMjbrqKoO820KVqzWtlpzqnXFUsqN/wKvMSiNbDzmL1+G9Um7Gwb1AjD0djCw==
+browsertrix-behaviors@^0.9.0:
+  version "0.9.0"
+  resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.9.0.tgz#3789a07fdf43ca76b4cb4794119d082189338c6a"
+  integrity sha512-rfpgW7r9ApwwH20IbpQrJaWupsfbVgxQRRuARs4m8nzIdF/WKTv38fTHDbYci8wJulcdu8D/eAlzyIBPwhrkkA==
   dependencies:
     query-selector-shadow-dom "^1.0.1"

@@ -1711,10 +1712,10 @@ chromium-bidi@2.1.2:
     mitt "^3.0.1"
     zod "^3.24.1"

-chromium-bidi@8.0.0:
-  version "8.0.0"
-  resolved "https://registry.yarnpkg.com/chromium-bidi/-/chromium-bidi-8.0.0.tgz#d73c9beed40317adf2bcfeb9a47087003cd467ec"
-  integrity sha512-d1VmE0FD7lxZQHzcDUCKZSNRtRwISXDsdg4HjdTR5+Ll5nQ/vzU12JeNmupD6VWffrPSlrnGhEWlLESKH3VO+g==
+chromium-bidi@4.1.1:
+  version "4.1.1"
+  resolved "https://registry.yarnpkg.com/chromium-bidi/-/chromium-bidi-4.1.1.tgz#e1c34154ddd94473f180fd15158a24d36049e3d5"
+  integrity sha512-biR7t4vF3YluE6RlMSk9IWk+b9U+WWyzHp+N2pL9vRTk+UXHYRTVp7jTK58ZNzMLBgoLMHY4QyJMbeuw3eKxqg==
   dependencies:
     mitt "^3.0.1"
     zod "^3.24.1"

@@ -1946,13 +1947,6 @@ debug@^4.4.0:
   dependencies:
     ms "^2.1.3"

-debug@^4.4.3:
-  version "4.4.3"
-  resolved "https://registry.yarnpkg.com/debug/-/debug-4.4.3.tgz#c6ae432d9bd9662582fce08709b038c58e9e3d6a"
-  integrity sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==
-  dependencies:
-    ms "^2.1.3"
-
 decimal.js@^10.4.3:
   version "10.5.0"
   resolved "https://registry.yarnpkg.com/decimal.js/-/decimal.js-10.5.0.tgz#0f371c7cf6c4898ce0afb09836db73cd82010f22"

@@ -2042,16 +2036,16 @@ devtools-protocol@0.0.1413902:
   resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1413902.tgz#a0f00fe9eb25ab337a8f9656a29e0a1a69f42401"
   integrity sha512-yRtvFD8Oyk7C9Os3GmnFZLu53yAfsnyw1s+mLmHHUK0GQEc9zthHWvS1r67Zqzm5t7v56PILHIVZ7kmFMaL2yQ==

+devtools-protocol@0.0.1425554:
+  version "0.0.1425554"
+  resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1425554.tgz#51ed2fed1405f56783d24a393f7c75b6bbb58029"
+  integrity sha512-uRfxR6Nlzdzt0ihVIkV+sLztKgs7rgquY/Mhcv1YNCWDh5IZgl5mnn2aeEnW5stYTE0wwiF4RYVz8eMEpV1SEw==
+
 devtools-protocol@0.0.1436416:
   version "0.0.1436416"
   resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1436416.tgz#ce8af8a210b8bcac83c5c8f095b9f977a9570df0"
   integrity sha512-iGLhz2WOrlBLcTcoVsFy5dPPUqILG6cc8MITYd5lV6i38gWG14bMXRH/d8G5KITrWHBnbsOnWHfc9Qs4/jej9Q==

-devtools-protocol@0.0.1495869:
-  version "0.0.1495869"
-  resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1495869.tgz#f68daef77a48d5dcbcdd55dbfa3265a51989c91b"
-  integrity sha512-i+bkd9UYFis40RcnkW7XrOprCujXRAHg62IVh/Ah3G8MmNXpCGt1m0dTFhSdx/AVs8XEMbdOGRwdkR1Bcta8AA==
-
 diff-sequences@^29.6.3:
   version "29.6.3"
   resolved "https://registry.yarnpkg.com/diff-sequences/-/diff-sequences-29.6.3.tgz#4deaf894d11407c51efc8418012f9e70b84ea921"

@@ -2840,7 +2834,7 @@ html-escaper@^2.0.0:
   resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453"
   integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==

-http-link-header@^1.1.1:
+http-link-header@^1.1.1, http-link-header@^1.1.3:
   version "1.1.3"
   resolved "https://registry.yarnpkg.com/http-link-header/-/http-link-header-1.1.3.tgz#b367b7a0ad1cf14027953f31aa1df40bb433da2a"
   integrity sha512-3cZ0SRL8fb9MUlU3mKM61FcQvPfXx2dBrZW3Vbg5CXa8jFlK8OaEpePenLe1oEXQduhz8b0QjsqfS59QP4AJDQ==

@@ -4555,18 +4549,17 @@ puppeteer-core@24.4.0, puppeteer-core@^24.4.0:
     typed-query-selector "^2.12.0"
     ws "^8.18.1"

-puppeteer-core@^24.22.0:
-  version "24.22.0"
-  resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-24.22.0.tgz#4d576b1a2b7699c088d3f0e843c32d81df82c3a6"
-  integrity sha512-oUeWlIg0pMz8YM5pu0uqakM+cCyYyXkHBxx9di9OUELu9X9+AYrNGGRLK9tNME3WfN3JGGqQIH3b4/E9LGek/w==
+puppeteer-core@^24.7.2:
+  version "24.7.2"
+  resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-24.7.2.tgz#734e377a5634ce1e419fa3ce20ad297a7e1a99ff"
+  integrity sha512-P9pZyTmJqKODFCnkZgemCpoFA4LbAa8+NumHVQKyP5X9IgdNS1ZnAnIh1sMAwhF8/xEUGf7jt+qmNLlKieFw1Q==
   dependencies:
-    "@puppeteer/browsers" "2.10.10"
-    chromium-bidi "8.0.0"
-    debug "^4.4.3"
-    devtools-protocol "0.0.1495869"
+    "@puppeteer/browsers" "2.10.2"
+    chromium-bidi "4.1.1"
+    debug "^4.4.0"
+    devtools-protocol "0.0.1425554"
     typed-query-selector "^2.12.0"
-    webdriver-bidi-protocol "0.2.11"
-    ws "^8.18.3"
+    ws "^8.18.1"

 puppeteer@^24.4.0:
   version "24.4.0"

@@ -4841,11 +4834,6 @@ semver@^7.7.1:
   resolved "https://registry.yarnpkg.com/semver/-/semver-7.7.1.tgz#abd5098d82b18c6c81f6074ff2647fd3e7220c9f"
   integrity sha512-hlq8tAfn0m/61p4BVRcPzIGr6LKiMwo4VM6dGi6pt4qcRkmNzTcWq6eCEjEh+qXjkMDvPlOFFSGwQjoEa6gyMA==

-semver@^7.7.2:
-  version "7.7.2"
-  resolved "https://registry.yarnpkg.com/semver/-/semver-7.7.2.tgz#67d99fdcd35cec21e6f8b87a7fd515a33f982b58"
-  integrity sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==
-
 set-function-length@^1.2.1:
   version "1.2.2"
   resolved "https://registry.yarnpkg.com/set-function-length/-/set-function-length-1.2.2.tgz#aac72314198eaed975cf77b2c3b6b880695e5449"

@@ -5208,17 +5196,6 @@ tar-fs@^3.0.8:
     bare-fs "^4.0.1"
     bare-path "^3.0.0"

-tar-fs@^3.1.0:
-  version "3.1.1"
-  resolved "https://registry.yarnpkg.com/tar-fs/-/tar-fs-3.1.1.tgz#4f164e59fb60f103d472360731e8c6bb4a7fe9ef"
-  integrity sha512-LZA0oaPOc2fVo82Txf3gw+AkEd38szODlptMYejQUhndHMLQ9M059uXR+AfS7DNo0NpINvSqDsvyaCrBVkptWg==
-  dependencies:
-    pump "^3.0.0"
-    tar-stream "^3.1.5"
-  optionalDependencies:
-    bare-fs "^4.0.1"
-    bare-path "^3.0.0"
-
 tar-stream@^2.1.4:
   version "2.2.0"
   resolved "https://registry.yarnpkg.com/tar-stream/-/tar-stream-2.2.0.tgz#acad84c284136b060dc3faa64474aa9aebd77287"

@@ -5550,10 +5527,10 @@ walker@^1.0.8:
   dependencies:
     makeerror "1.0.12"

-warcio@^2.4.7:
-  version "2.4.7"
-  resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.7.tgz#7c3918463e550f62fe63df5f76a871424e74097a"
-  integrity sha512-WGRqvoUqSalAkx+uJ8xnrxiiSPZ7Ru/h7iKC2XmuMMSOUSnS917l4V+qpaN9thAsZkZ+8qJRtee3uyOjlq4Dgg==
+warcio@^2.4.0, warcio@^2.4.3, warcio@^2.4.4:
+  version "2.4.4"
+  resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.4.tgz#6c0c030bb55c0f0b824f854fa9e6718ca25d333d"
+  integrity sha512-FrWOhv1qLNhPBPGEMm24Yo+DtkipK5DxK3ckVGbOf0OJ/UqaxAhiiby74q+GW70dsJV0wF+RA1ToK6CKseTshA==
   dependencies:
     "@types/pako" "^1.0.7"
     "@types/stream-buffers" "^3.0.7"

|
@ -5573,11 +5550,6 @@ web-encoding@^1.1.5:
|
|||
optionalDependencies:
|
||||
"@zxing/text-encoding" "0.9.0"
|
||||
|
||||
webdriver-bidi-protocol@0.2.11:
|
||||
version "0.2.11"
|
||||
resolved "https://registry.yarnpkg.com/webdriver-bidi-protocol/-/webdriver-bidi-protocol-0.2.11.tgz#dba18d9b0a33aed33fab272dbd6e42411ac753cc"
|
||||
integrity sha512-Y9E1/oi4XMxcR8AT0ZC4OvYntl34SPgwjmELH+owjBr0korAX4jKgZULBWILGCVGdVCQ0dodTToIETozhG8zvA==
|
||||
|
||||
whatwg-encoding@^2.0.0:
|
||||
version "2.0.0"
|
||||
resolved "https://registry.yarnpkg.com/whatwg-encoding/-/whatwg-encoding-2.0.0.tgz#e7635f597fd87020858626805a2729fa7698ac53"
|
||||
|
@ -5690,11 +5662,6 @@ ws@^8.18.1:
|
|||
resolved "https://registry.yarnpkg.com/ws/-/ws-8.18.1.tgz#ea131d3784e1dfdff91adb0a4a116b127515e3cb"
|
||||
integrity sha512-RKW2aJZMXeMxVpnZ6bck+RswznaxmzdULiBr6KY7XkTnW8uvt0iT9H5DkHUChXrc+uurzwa0rVI16n/Xzjdz1w==
|
||||
|
||||
ws@^8.18.3:
|
||||
version "8.18.3"
|
||||
resolved "https://registry.yarnpkg.com/ws/-/ws-8.18.3.tgz#b56b88abffde62791c639170400c93dcb0c95472"
|
||||
integrity sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==
|
||||
|
||||
xdg-basedir@^4.0.0:
|
||||
version "4.0.0"
|
||||
resolved "https://registry.yarnpkg.com/xdg-basedir/-/xdg-basedir-4.0.0.tgz#4bc8d9984403696225ef83a1573cbbcb4e79db13"
|
||||
|