mirror of https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Merge branch 'main' into release/1.1.4
bump to 1.2.0-beta.1
commit f504effa51
25 changed files with 564 additions and 169 deletions
26  .github/workflows/make-draft-release.yaml  (vendored, new file)
@@ -0,0 +1,26 @@
name: Generate Draft Release

on:
  push:
    branches:
      - main
      - "*-release"

jobs:
  package_chart:
    runs-on: ubuntu-latest

    steps:
      - name: Check out Git repository
        uses: actions/checkout@v3

      - name: Get Version
        run: |
          echo "version=$(jq -r .version package.json)" >> "$GITHUB_ENV"

      - name: Make Draft Release
        uses: softprops/action-gh-release@v1
        with:
          name: "Browsertrix Crawler v${{ env.version }}"
          tag_name: v${{ env.version }}
          draft: true
16  Dockerfile
@@ -1,4 +1,4 @@
ARG BROWSER_VERSION=1.64.109
ARG BROWSER_VERSION=1.66.115
ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base:brave-${BROWSER_VERSION}

FROM ${BROWSER_IMAGE_BASE}
@@ -6,11 +6,7 @@ FROM ${BROWSER_IMAGE_BASE}
# needed to add args to main build stage
ARG BROWSER_VERSION

ENV PROXY_HOST=localhost \
    PROXY_PORT=8080 \
    PROXY_CA_URL=http://wsgiprox/download/pem \
    PROXY_CA_FILE=/tmp/proxy-ca.pem \
    DISPLAY=:99 \
ENV DISPLAY=:99 \
    GEOMETRY=1360x1020x16 \
    BROWSER_VERSION=${BROWSER_VERSION} \
    BROWSER_BIN=google-chrome \
@@ -28,9 +24,6 @@ ADD package.json /app/
# to allow forcing rebuilds from this stage
ARG REBUILD

# Prefetch tldextract so pywb is able to boot in environments with limited internet access
RUN tldextract --update

# Download and format ad host blocklist as JSON
RUN mkdir -p /tmp/ads && cd /tmp/ads && \
    curl -vs -o ad-hosts.txt https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts && \
@@ -64,8 +57,11 @@ WORKDIR /crawls
# enable to test custom behaviors build (from browsertrix-behaviors)
# COPY behaviors.js /app/node_modules/browsertrix-behaviors/dist/behaviors.js

# add brave/chromium group policies
RUN mkdir -p /etc/brave/policies/managed/
ADD config/policies /etc/brave/policies/managed/

ADD docker-entrypoint.sh /docker-entrypoint.sh
ENTRYPOINT ["/docker-entrypoint.sh"]

CMD ["crawl"]
6  config/policies/brave-default.json  (new file)
@@ -0,0 +1,6 @@
{
  "BraveRewardsDisabled": true,
  "BraveWalletDisabled": true,
  "BraveVPNDisabled": 1,
  "BraveAIChatEnabled": false
}
8  config/policies/lockdown-profilebrowser.json  (new file)
@@ -0,0 +1,8 @@
{
  "IncognitoModeAvailability": 1,
  "TorDisabled": true,
  "AllowFileSelectionDialogs": false,
  "URLBlocklist": [
    "file://*"
  ]
}
@@ -144,6 +144,11 @@ Options:
                                        age behavior will run on each page.
                                        If 0, a behavior can run until finis
                                        h.               [number] [default: 90]
  --postLoadDelay                       If >0, amount of time to sleep (in s
                                        econds) after page has loaded, befor
                                        e taking screenshots / getting text
                                        / running behaviors
                                                          [number] [default: 0]
  --pageExtraDelay, --delay             If >0, amount of time to sleep (in s
                                        econds) after behaviors before movin
                                        g on to next page
@@ -227,16 +232,19 @@ Options:
  --writePagesToRedis                   If set, write page objects to redis
                                                      [boolean] [default: false]
  --failOnFailedSeed                    If set, crawler will fail with exit
                                        code 1 if any seed fails
                                                      [boolean] [default: false]
                                        code 1 if any seed fails. When combi
                                        ned with --failOnInvalidStatus,will
                                        result in crawl failing with exit co
                                        de 1 if any seed has a 4xx/5xx respo
                                        nse           [boolean] [default: false]
  --failOnFailedLimit                   If set, save state and exit if numbe
                                        r of failed pages exceeds this value
                                                          [number] [default: 0]
  --failOnInvalidStatus                 If set, will treat pages with non-20
                                        0 response as failures. When combine
                                        d with --failOnFailedLimit or --fail
                                        OnFailedSeedmay result in crawl fail
                                        ing due to non-200 responses
  --failOnInvalidStatus                 If set, will treat pages with 4xx or
                                         5xx response as failures. When comb
                                        ined with --failOnFailedLimit or --f
                                        ailOnFailedSeed may result in crawl
                                        failing due to non-200 responses
                                                      [boolean] [default: false]
  --customBehaviors                     injects a custom behavior file or se
                                        t of behavior files in a directory
@@ -250,6 +258,8 @@ Options:
                                        nabled, or disabled with custom prof
                                        ile
   [choices: "disabled", "disabled-if-profile", "enabled"] [default: "disabled"]
  --dryRun                              If true, no data is written to disk,
                                         only logs                  [boolean]
  --qaSource                            Required for QA mode. Source (WACZ o
                                        r multi WACZ) for QA        [string]
  --qaDebugImageDiff                    if specified, will write crawl.png,
@@ -269,7 +279,8 @@ Options:
                    ted
  --password        The password for the login. If not specified, will be promp
                    ted (recommended)
  --filename        The filename for the profile tarball
  --filename        The filename for the profile tarball, stored within /crawls
                    /profiles if absolute path not provided
                                 [default: "/crawls/profiles/profile.tar.gz"]
  --debugScreenshot If specified, take a screenshot after login and save as thi
                    s filename
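For illustration only (not part of this diff): a minimal sketch combining the flags documented above, using the Docker image and seed URL that the tests in this changeset use. With --failOnFailedSeed plus --failOnInvalidStatus, the crawl exits with code 1 if the seed returns a 4xx/5xx response; --dryRun keeps it from writing archive data.

// sketch only: flag combination example, not part of the commit
import { execSync } from "child_process";

execSync(
  "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl " +
    "--url https://webrecorder.net/ --limit 2 --postLoadDelay 5 " +
    "--failOnFailedSeed --failOnInvalidStatus --dryRun",
  { encoding: "utf-8" },
);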
@@ -1,6 +1,6 @@
{
  "name": "browsertrix-crawler",
  "version": "1.1.4",
  "version": "1.2.0-beta.1",
  "main": "browsertrix-crawler",
  "type": "module",
  "repository": "https://github.com/webrecorder/browsertrix-crawler",
@@ -21,6 +21,7 @@
    "@webrecorder/wabac": "^2.16.12",
    "browsertrix-behaviors": "^0.6.0",
    "crc": "^4.3.2",
    "fetch-socks": "^1.3.0",
    "get-folder-size": "^4.0.0",
    "husky": "^8.0.3",
    "ioredis": "^5.3.2",
151  src/crawler.ts
@@ -16,8 +16,6 @@ import { parseArgs } from "./util/argParser.js";

import yaml from "js-yaml";

import * as warcio from "warcio";

import { HealthChecker } from "./util/healthcheck.js";
import { TextExtractViaSnapshot } from "./util/textextract.js";
import {
@@ -46,27 +44,19 @@ import { Browser } from "./util/browser.js";
import {
  ADD_LINK_FUNC,
  BEHAVIOR_LOG_FUNC,
  HTML_TYPES,
  DEFAULT_SELECTORS,
} from "./util/constants.js";

import { AdBlockRules, BlockRules } from "./util/blockrules.js";
import { OriginOverride } from "./util/originoverride.js";

// to ignore HTTPS error for HEAD check
import { Agent as HTTPAgent } from "http";
import { Agent as HTTPSAgent } from "https";
import { CDPSession, Frame, HTTPRequest, Page, Protocol } from "puppeteer-core";
import { Recorder } from "./util/recorder.js";
import { SitemapReader } from "./util/sitemapper.js";
import { ScopedSeed } from "./util/seeds.js";
import { WARCWriter } from "./util/warcwriter.js";

const HTTPS_AGENT = new HTTPSAgent({
  rejectUnauthorized: false,
});

const HTTP_AGENT = new HTTPAgent();
import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
import { isHTMLContentType } from "./util/reqresp.js";
import { initProxy } from "./util/proxy.js";

const behaviors = fs.readFileSync(
  new URL(
@@ -184,6 +174,8 @@ export class Crawler {
  maxHeapUsed = 0;
  maxHeapTotal = 0;

  proxyServer?: string;

  driver!: (opts: {
    page: Page;
    data: PageState;
@@ -191,7 +183,7 @@
    crawler: Crawler;
  }) => NonNullable<unknown>;

  recording = true;
  recording: boolean;

  constructor() {
    const args = this.parseArgs();
@@ -225,6 +217,13 @@ export class Crawler {

    logger.debug("Writing log to: " + this.logFilename, {}, "general");

    this.recording = !this.params.dryRun;
    if (this.params.dryRun) {
      logger.warn(
        "Dry run mode: no archived data stored, only pages and logging. Storage and archive creation related options will be ignored.",
      );
    }

    this.headers = {};

    // pages file
@@ -449,17 +448,23 @@
  async bootstrap() {
    const subprocesses: ChildProcess[] = [];

    this.proxyServer = initProxy(this.params.proxyServer);

    subprocesses.push(this.launchRedis());

    await fsp.mkdir(this.logDir, { recursive: true });

    if (!this.params.dryRun) {
      await fsp.mkdir(this.archivesDir, { recursive: true });
      await fsp.mkdir(this.tempdir, { recursive: true });
      await fsp.mkdir(this.tempCdxDir, { recursive: true });
    }

    this.logFH = fs.createWriteStream(this.logFilename, { flags: "a" });
    logger.setExternalLogStream(this.logFH);

    this.infoString = await getInfoString();
    setWARCInfo(this.infoString, this.params.warcInfo);
    logger.info(this.infoString);

    logger.info("Seeds", this.seeds);
@@ -515,10 +520,10 @@
      );
    }

    if (this.params.screenshot) {
    if (this.params.screenshot && !this.params.dryRun) {
      this.screenshotWriter = this.createExtraResourceWarcWriter("screenshots");
    }
    if (this.params.text) {
    if (this.params.text && !this.params.dryRun) {
      this.textWriter = this.createExtraResourceWarcWriter("text");
    }
  }
@@ -788,7 +793,7 @@ self.__bx_behaviors.selectMainBehavior();
  async crawlPage(opts: WorkerState): Promise<void> {
    await this.writeStats();

    const { page, data, workerid, callbacks, directFetchCapture } = opts;
    const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
    data.callbacks = callbacks;

    const { url } = data;
@@ -797,35 +802,27 @@ self.__bx_behaviors.selectMainBehavior();
    data.logDetails = logDetails;
    data.workerid = workerid;

    data.isHTMLPage = await timedRun(
      this.isHTML(url, logDetails),
      FETCH_TIMEOUT_SECS,
      "HEAD request to determine if URL is HTML page timed out",
      logDetails,
      "fetch",
      true,
    );

    if (!data.isHTMLPage && directFetchCapture) {
    if (directFetchCapture) {
      try {
        const { fetched, mime } = await timedRun(
          directFetchCapture(url),
        const { fetched, mime, ts } = await timedRun(
          directFetchCapture({ url, headers: this.headers, cdp }),
          FETCH_TIMEOUT_SECS,
          "Direct fetch capture attempt timed out",
          logDetails,
          "fetch",
          true,
        );
        if (fetched) {
          data.loadState = LoadState.FULL_PAGE_LOADED;
        if (mime) {
          data.mime = mime;
          data.isHTMLPage = isHTMLContentType(mime);
        }
        if (fetched) {
          data.loadState = LoadState.FULL_PAGE_LOADED;
          data.status = 200;
          data.ts = new Date();
          data.ts = ts || new Date();
          logger.info(
            "Direct fetch successful",
            { url, ...logDetails },
            { url, mime, ...logDetails },
            "fetch",
          );
          return;
@@ -1105,30 +1102,10 @@ self.__bx_behaviors.selectMainBehavior();
    return res ? frame : null;
  }

  async createWARCInfo(filename: string) {
    const warcVersion = "WARC/1.1";
    const type = "warcinfo";

    const info = {
      software: this.infoString,
      format: "WARC File Format 1.1",
    };

    const warcInfo = { ...info, ...this.params.warcInfo };
    const record = await warcio.WARCRecord.createWARCInfo(
      { filename, type, warcVersion },
      warcInfo,
    );
    const buffer = await warcio.WARCSerializer.serialize(record, {
      gzip: true,
    });
    return buffer;
  }

  async checkLimits() {
    let interrupt = false;

    const size = await getDirSize(this.archivesDir);
    const size = this.params.dryRun ? 0 : await getDirSize(this.archivesDir);

    await this.crawlState.setArchiveSize(size);

@@ -1153,7 +1130,11 @@ self.__bx_behaviors.selectMainBehavior();

    if (this.params.diskUtilization) {
      // Check that disk usage isn't already or soon to be above threshold
      const diskUtil = await checkDiskUtilization(this.params, size);
      const diskUtil = await checkDiskUtilization(
        this.collDir,
        this.params,
        size,
      );
      if (diskUtil.stop === true) {
        interrupt = true;
      }
@@ -1328,7 +1309,7 @@ self.__bx_behaviors.selectMainBehavior();
      emulateDevice: this.emulateDevice,
      swOpt: this.params.serviceWorker,
      chromeOptions: {
        proxy: false,
        proxy: this.proxyServer,
        userAgent: this.emulateDevice.userAgent,
        extraArgs: this.extraChromeArgs(),
      },
@@ -1424,11 +1405,11 @@ self.__bx_behaviors.selectMainBehavior();
  }

  async postCrawl() {
    if (this.params.combineWARC) {
    if (this.params.combineWARC && !this.params.dryRun) {
      await this.combineWARC();
    }

    if (this.params.generateCDX) {
    if (this.params.generateCDX && !this.params.dryRun) {
      logger.info("Generating CDX");
      await fsp.mkdir(path.join(this.collDir, "indexes"), { recursive: true });
      await this.crawlState.setStatus("generate-cdx");
@@ -1460,6 +1441,7 @@ self.__bx_behaviors.selectMainBehavior();

    if (
      this.params.generateWACZ &&
      !this.params.dryRun &&
      (!this.interrupted || this.finalExit || this.uploadAndDeleteLocal)
    ) {
      const uploaded = await this.generateWACZ();
@@ -1775,7 +1757,7 @@ self.__bx_behaviors.selectMainBehavior();

    const contentType = resp.headers()["content-type"];

    isHTMLPage = this.isHTMLContentType(contentType);
    isHTMLPage = isHTMLContentType(contentType);

    if (contentType) {
      data.mime = contentType.split(";")[0];
@@ -1923,7 +1905,9 @@ self.__bx_behaviors.selectMainBehavior();
      "behavior",
    );
    try {
      await frame.evaluate("self.__bx_behaviors.awaitPageLoad();");
      await frame.evaluate(
        "self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
      );
    } catch (e) {
      logger.warn("Waiting for custom page load failed", e, "behavior");
    }
@@ -2186,12 +2170,14 @@ self.__bx_behaviors.selectMainBehavior();
    let { ts } = state;
    if (!ts) {
      ts = new Date();
      if (!this.params.dryRun) {
        logger.warn(
          "Page date missing, setting to now",
          { url, ts },
          "pageStatus",
        );
      }
    }

    row.ts = ts.toISOString();
@@ -2241,49 +2227,6 @@ self.__bx_behaviors.selectMainBehavior();
    }
  }

  resolveAgent(urlParsed: URL) {
    return urlParsed.protocol === "https:" ? HTTPS_AGENT : HTTP_AGENT;
  }

  async isHTML(url: string, logDetails: LogDetails) {
    try {
      const resp = await fetch(url, {
        method: "HEAD",
        headers: this.headers,
        agent: this.resolveAgent,
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
      } as any);
      if (resp.status !== 200) {
        logger.debug("HEAD response code != 200, loading in browser", {
          status: resp.status,
          ...logDetails,
        });
        return true;
      }

      return this.isHTMLContentType(resp.headers.get("Content-Type"));
    } catch (e) {
      // can't confirm not html, so try in browser
      logger.debug("HEAD request failed", { ...formatErr(e), ...logDetails });
      return true;
    }
  }

  isHTMLContentType(contentType: string | null) {
    // just load if no content-type
    if (!contentType) {
      return true;
    }

    const mime = contentType.split(";")[0];

    if (HTML_TYPES.includes(mime)) {
      return true;
    }

    return false;
  }

  async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) {
    if (!sitemap) {
      return;
@@ -2441,7 +2384,7 @@ self.__bx_behaviors.selectMainBehavior();

      generatedCombinedWarcs.push(combinedWarcName);

      const warcBuffer = await this.createWARCInfo(combinedWarcName);
      const warcBuffer = await createWARCInfo(combinedWarcName);
      fh.write(warcBuffer);
    }
@@ -99,9 +99,10 @@ function cliOpts(): { [key: string]: Options } {
      default: getDefaultWindowSize(),
    },

    proxy: {
      type: "boolean",
      default: false,
    proxyServer: {
      describe:
        "if set, will use specified proxy server. Takes precedence over any env var proxy settings",
      type: "string",
    },

    cookieDays: {
@@ -179,7 +180,7 @@ async function main() {
    headless: params.headless,
    signals: false,
    chromeOptions: {
      proxy: false,
      proxy: params.proxyServer,
      extraArgs: [
        "--window-position=0,0",
        `--window-size=${params.windowSize}`,
@@ -545,6 +545,18 @@ class ArgParser {
        default: "disabled",
      },

      proxyServer: {
        describe:
          "if set, will use specified proxy server. Takes precedence over any env var proxy settings",
        type: "string",
      },

      dryRun: {
        describe:
          "If true, no archive data is written to disk, only pages and logs (and optionally saved state).",
        type: "boolean",
      },

      qaSource: {
        describe: "Required for QA mode. Source (WACZ or multi WACZ) for QA",
        type: "string",
@@ -4,6 +4,8 @@ import { logger, formatErr } from "./logger.js";
import { HTTPRequest, Page } from "puppeteer-core";
import { Browser } from "./browser.js";

import { fetch } from "undici";

const RULE_TYPES = ["block", "allowOnly"];

const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];
@@ -22,7 +22,7 @@ import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
import { Recorder } from "./recorder.js";

type BtrixChromeOpts = {
  proxy?: boolean;
  proxy?: string;
  userAgent?: string | null;
  extraArgs?: string[];
};
@@ -115,7 +115,6 @@ export class Browser {
        ? undefined
        : (target) => this.targetFilter(target),
    };

    await this._init(launchOpts, ondisconnect, recording);
  }

@@ -217,7 +216,7 @@ export class Browser {
  }

  chromeArgs({
    proxy = true,
    proxy = "",
    userAgent = null,
    extraArgs = [],
  }: BtrixChromeOpts) {
@@ -236,11 +235,13 @@ export class Browser {
      ...extraArgs,
    ];

    if (proxy) {
      logger.info("Using proxy", { proxy }, "browser");
    }

    if (proxy) {
      args.push("--ignore-certificate-errors");
      args.push(
        `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
      );
      args.push(`--proxy-server=${proxy}`);
    }

    return args;
60  src/util/proxy.ts  (new file)
@@ -0,0 +1,60 @@
import { Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici";

import { socksDispatcher } from "fetch-socks";
import type { SocksProxyType } from "socks/typings/common/constants.js";

export function getEnvProxyUrl() {
  if (process.env.PROXY_SERVER) {
    return process.env.PROXY_SERVER;
  }

  // for backwards compatibility with 0.x proxy settings
  if (process.env.PROXY_HOST && process.env.PROXY_PORT) {
    return `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`;
  }

  return "";
}

export function initProxy(proxy?: string): string {
  if (!proxy) {
    proxy = getEnvProxyUrl();
  }
  if (proxy) {
    const dispatcher = createDispatcher(proxy);
    if (dispatcher) {
      setGlobalDispatcher(dispatcher);
      return proxy;
    }
  }
  return "";
}

export function createDispatcher(proxyUrl: string): Dispatcher | undefined {
  if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) {
    // HTTP PROXY does not support auth, as it's not supported in the browser
    // so must drop username/password for consistency
    const url = new URL(proxyUrl);
    url.username = "";
    url.password = "";
    return new ProxyAgent({ uri: url.href });
  } else if (
    proxyUrl.startsWith("socks://") ||
    proxyUrl.startsWith("socks5://") ||
    proxyUrl.startsWith("socks4://")
  ) {
    // support auth as SOCKS5 auth *is* supported in Brave (though not in Chromium)
    const url = new URL(proxyUrl);
    const type: SocksProxyType = url.protocol === "socks4:" ? 4 : 5;
    const params = {
      type,
      host: url.hostname,
      port: parseInt(url.port),
      userId: url.username || undefined,
      password: url.password || undefined,
    };
    return socksDispatcher(params);
  } else {
    return undefined;
  }
}
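A usage sketch (not part of the commit) of the helpers above: initProxy() takes the --proxyServer value, falls back to PROXY_SERVER or PROXY_HOST/PROXY_PORT, installs a matching undici dispatcher globally, and returns the proxy URL, so later fetch() calls are routed through the proxy. The proxy URL below is an assumed example matching the containers in tests/proxy.test.js.

// sketch only: route undici fetch() through a SOCKS5 proxy via the new helper
import { fetch } from "undici";
import { initProxy } from "./util/proxy.js";

const proxy = initProxy("socks5://user:passw0rd@proxy-with-auth:1080");
if (!proxy) {
  console.warn("no usable proxy URL, requests will go out directly");
}

// the global dispatcher is now set, so this request is tunneled through the proxy
const resp = await fetch("https://webrecorder.net/");
console.log(resp.status);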
@@ -6,7 +6,7 @@ import PQueue from "p-queue";

import { logger, formatErr } from "./logger.js";
import { sleep, timedRun, timestampNow } from "./timing.js";
import { RequestResponseInfo } from "./reqresp.js";
import { RequestResponseInfo, isHTMLContentType } from "./reqresp.js";

import { fetch, Response } from "undici";
@@ -77,11 +77,23 @@ export type AsyncFetchOptions = {
  filter?: (resp: Response) => boolean;
  ignoreDupe?: boolean;
  maxFetchSize?: number;
  manualRedirect?: boolean;
};

// =================================================================
export type ResponseStreamAsyncFetchOptions = AsyncFetchOptions & {
export type DirectFetchRequest = {
  url: string;
  headers: Record<string, string>;
  cdp: CDPSession;
};

// =================================================================
export type NetworkLoadAsyncFetchOptions = AsyncFetchOptions & {
  cdp: CDPSession;
};

// =================================================================
export type ResponseStreamAsyncFetchOptions = NetworkLoadAsyncFetchOptions & {
  requestId: string;
};
@@ -1068,12 +1080,23 @@ export class Recorder {
    this.writer.writeRecordPair(responseRecord, requestRecord);
  }

  async directFetchCapture(
    url: string,
  ): Promise<{ fetched: boolean; mime: string }> {
  async directFetchCapture({ url, headers, cdp }: DirectFetchRequest): Promise<{
    fetched: boolean;
    mime: string;
    ts: Date;
  }> {
    const reqresp = new RequestResponseInfo("0");
    const ts = new Date();

    const cookie = await this.getCookieString(cdp, url);
    if (cookie) {
      headers["Cookie"] = cookie;
    }

    reqresp.url = url;
    reqresp.method = "GET";
    reqresp.requestHeaders = headers;
    reqresp.ts = ts;

    logger.debug(
      "Directly fetching page URL without browser",
@@ -1081,8 +1104,21 @@ export class Recorder {
      "recorder",
    );

    const filter = (resp: Response) =>
      resp.status === 200 && !resp.headers.get("set-cookie");
    let mime: string = "";

    const filter = (resp: Response) => {
      // only direct load 200 responses
      if (resp.status !== 200) {
        return false;
      }

      const ct = resp.headers.get("content-type");
      if (ct) {
        mime = ct.split(";")[0];
      }

      return !isHTMLContentType(mime);
    };

    // ignore dupes: if previous URL was not a page, still load as page. if previous was page,
    // should not get here, as dupe pages tracked via seen list
@@ -1093,16 +1129,28 @@ export class Recorder {
      networkId: "0",
      filter,
      ignoreDupe: true,
      manualRedirect: true,
    });
    const res = await fetcher.load();

    const mime =
      (reqresp.responseHeaders &&
        reqresp.responseHeaders["content-type"] &&
        reqresp.responseHeaders["content-type"].split(";")[0]) ||
      "";
    this.addPageRecord(reqresp);

    return { fetched: res === "fetched", mime };
    if (url === this.pageUrl && !this.pageInfo.ts) {
      logger.debug("Setting page timestamp", { ts, url });
      this.pageInfo.ts = ts;
    }

    return { fetched: res === "fetched", mime, ts };
  }

  async getCookieString(cdp: CDPSession, url: string) {
    const cookieList: string[] = [];
    const { cookies } = await cdp.send("Network.getCookies", { urls: [url] });
    for (const { name, value } of cookies) {
      cookieList.push(`${name}=${value}`);
    }

    return cookieList.join(";");
  }
}

@@ -1121,6 +1169,8 @@ class AsyncFetcher {
  tempdir: string;
  filename: string;

  manualRedirect = false;

  constructor({
    tempdir,
    reqresp,
@@ -1130,6 +1180,7 @@ class AsyncFetcher {
    filter = undefined,
    ignoreDupe = false,
    maxFetchSize = MAX_BROWSER_DEFAULT_FETCH_SIZE,
    manualRedirect = false,
  }: AsyncFetchOptions) {
    this.reqresp = reqresp;
    this.reqresp.expectedSize = expectedSize;
@@ -1148,6 +1199,8 @@ class AsyncFetcher {
    );

    this.maxFetchSize = maxFetchSize;

    this.manualRedirect = manualRedirect;
  }

  async load() {
@@ -1283,9 +1336,9 @@ class AsyncFetcher {
      reqresp.status = 0;
      reqresp.errorText = e.message;
    } finally {
      recorder.addPageRecord(reqresp);
      // exclude direct fetch request with fake id
      if (networkId !== "0") {
        recorder.addPageRecord(reqresp);
        recorder.removeReqResp(networkId);
      }
    }
@@ -1313,6 +1366,7 @@ class AsyncFetcher {
        headers,
        body: reqresp.postData || undefined,
        signal,
        redirect: this.manualRedirect ? "manual" : "follow",
      });

      if (this.filter && !this.filter(resp) && abort) {
@@ -1329,6 +1383,7 @@ class AsyncFetcher {
      }

      if (reqresp.expectedSize === 0) {
        reqresp.fillFetchResponse(resp);
        reqresp.payload = new Uint8Array();
        return;
      } else if (!resp.body) {
@@ -1428,7 +1483,7 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher {
class NetworkLoadStreamAsyncFetcher extends AsyncFetcher {
  cdp: CDPSession;

  constructor(opts: ResponseStreamAsyncFetchOptions) {
  constructor(opts: NetworkLoadAsyncFetchOptions) {
    super(opts);
    this.cdp = opts.cdp;
  }
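For orientation (not from the diff): a sketch of the reworked directFetchCapture() call shape after this change — the caller passes the URL, request headers, and the CDP session (used to copy the page's cookies), and gets back the capture timestamp along with fetched/mime. The empty headers object below stands in for the crawler's real request headers.

// sketch only: calling the new DirectFetchRequest-based API
import { CDPSession } from "puppeteer-core";
import { DirectFetchRequest, Recorder } from "./util/recorder.js";

async function fetchDirectly(recorder: Recorder, cdp: CDPSession, url: string) {
  const request: DirectFetchRequest = { url, headers: {}, cdp };
  const { fetched, mime, ts } = await recorder.directFetchCapture(request);
  return fetched ? { mime, ts } : null;
}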
@@ -3,7 +3,7 @@ import { getStatusText } from "@webrecorder/wabac/src/utils.js";

import { Protocol } from "puppeteer-core";
import { postToGetUrl } from "warcio";

import { HTML_TYPES } from "./constants.js";
import { Response } from "undici";

const CONTENT_LENGTH = "content-length";
@@ -150,10 +150,15 @@ export class RequestResponseInfo {
    }
  }

  isRedirectStatus() {
    return this.status >= 300 && this.status < 400 && this.status !== 304;
  }

  isSelfRedirect() {
    if (this.status < 300 || this.status >= 400 || this.status === 304) {
    if (!this.isRedirectStatus()) {
      return false;
    }

    try {
      const headers = new Headers(this.getResponseHeadersDict());
      const location = headers.get("location") || "";
@@ -365,3 +370,18 @@ export class RequestResponseInfo {
    return value.replace(/\n/g, ", ");
  }
}

export function isHTMLContentType(contentType: string | null) {
  // just load if no content-type
  if (!contentType) {
    return true;
  }

  const mime = contentType.split(";")[0];

  if (HTML_TYPES.includes(mime)) {
    return true;
  }

  return false;
}
@@ -9,6 +9,8 @@ import { logger, formatErr } from "./logger.js";
import { DETECT_SITEMAP } from "./constants.js";
import { sleep } from "./timing.js";

import { fetch, Response } from "undici";

const SITEMAP_CONCURRENCY = 5;

const TEXT_CONTENT_TYPE = ["text/plain"];
@@ -237,7 +239,8 @@ export class SitemapReader extends EventEmitter {
      resp.headers.get("content-encoding") !== "gzip"
    ) {
      const ds = new DecompressionStream("gzip");
      stream = body.pipeThrough(ds);
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      stream = body.pipeThrough(ds as any);
    } else {
      stream = body;
    }
@@ -66,7 +66,7 @@ export class PageState {

  callbacks: PageCallbacks = {};

  isHTMLPage?: boolean;
  isHTMLPage = true;
  text?: string;
  screenshotView?: Buffer;
  favicon?: string;
@@ -202,6 +202,7 @@ export async function getDirSize(dir: string) {
}

export async function checkDiskUtilization(
  collDir: string,
  // TODO: Fix this the next time the file is edited.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  params: Record<string, any>,
@@ -209,7 +210,7 @@
  dfOutput = null,
) {
  const diskUsage: Record<string, string> = await getDiskUsage(
    "/crawls",
    collDir,
    dfOutput,
  );
  const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1));
@@ -11,6 +11,8 @@ import PQueue from "p-queue";

const DEFAULT_ROLLOVER_SIZE = 1_000_000_000;

let warcInfo = {};

export type ResourceRecordData = {
  buffer: Uint8Array;
  resourceType: string;
@@ -117,6 +119,8 @@ export class WARCWriter implements IndexerOffsetLength {
      );
    }

    fh.write(await createWARCInfo(this.filename));

    return fh;
  }

@@ -310,6 +314,33 @@ export class WARCWriter implements IndexerOffsetLength {
  }
}

// =================================================================
export function setWARCInfo(
  software: string,
  otherParams?: Record<string, string>,
) {
  warcInfo = {
    software,
    format: "WARC File Format 1.1",
    ...otherParams,
  };
}

// =================================================================
export async function createWARCInfo(filename: string) {
  const warcVersion = "WARC/1.1";
  const type = "warcinfo";

  const record = await WARCRecord.createWARCInfo(
    { filename, type, warcVersion },
    warcInfo,
  );
  const buffer = await WARCSerializer.serialize(record, {
    gzip: true,
  });
  return buffer;
}

// =================================================================
export function streamFinish(fh: Writable) {
  const p = new Promise<void>((resolve) => {
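A brief sketch (not part of the diff) of the new module-level warcinfo flow: setWARCInfo() is called once at startup (crawler.ts now calls it with the version string and the --warcInfo params), and createWARCInfo() then serializes a gzipped warcinfo record for each WARC filename. The software string and filename below are placeholders.

// sketch only: using the helpers exported from warcwriter.ts
import { setWARCInfo, createWARCInfo } from "./util/warcwriter.js";

setWARCInfo("Browsertrix-Crawler 1.2.0-beta.1 (with warcio.js 2.0.0)", {
  operator: "test",
});

// gzipped warcinfo record, written at the head of each WARC file
const buffer = await createWARCInfo("example-0.warc.gz");
console.log(buffer.length);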
@@ -2,7 +2,7 @@ import os from "os";

import { logger, formatErr } from "./logger.js";
import { sleep, timedRun } from "./timing.js";
import { Recorder } from "./recorder.js";
import { DirectFetchRequest, Recorder } from "./recorder.js";
import { rxEscape } from "./seeds.js";
import { CDPSession, Page } from "puppeteer-core";
import { PageState, WorkerId } from "./state.js";
@@ -20,8 +20,10 @@ export type WorkerOpts = {
  workerid: WorkerId;
  // eslint-disable-next-line @typescript-eslint/ban-types
  callbacks: Record<string, Function>;
  directFetchCapture?:
    | ((url: string) => Promise<{ fetched: boolean; mime: string }>)
  directFetchCapture:
    | ((
        request: DirectFetchRequest,
      ) => Promise<{ fetched: boolean; mime: string; ts: Date }>)
    | null;
  frameIdToExecId: Map<string, number>;
};
@@ -171,7 +173,7 @@ export class PageWorker {
    this.cdp = cdp;
    this.callbacks = {};
    const directFetchCapture = this.recorder
      ? (x: string) => this.recorder!.directFetchCapture(x)
      ? (req: DirectFetchRequest) => this.recorder!.directFetchCapture(req)
      : null;
    this.opts = {
      page,
18  tests/dryrun.test.js  (new file)
@@ -0,0 +1,18 @@
import child_process from "child_process";
import fs from "fs";

test("ensure dryRun crawl only writes pages and logs", async () => {
  child_process.execSync(
    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --collection dry-run-wr-net --combineWARC --rolloverSize 10000 --limit 2 --title "test title" --description "test description" --warcPrefix custom-prefix --dryRun',
  );

  const files = fs.readdirSync("test-crawls/collections/dry-run-wr-net").sort();
  expect(files.length).toBe(2);
  expect(files[0]).toBe("logs");
  expect(files[1]).toBe("pages");
});
@@ -3,7 +3,7 @@ import fs from "fs";
import path from "path";
import { WARCParser } from "warcio";

const PDF = "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf";
const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf";

test("ensure pdf is crawled", async () => {
  child_process.execSync(
127  tests/proxy.test.js  (new file)
@@ -0,0 +1,127 @@
import { execSync, exec } from "child_process";

const sleep = (ms) => new Promise((res) => setTimeout(res, ms));

const PROXY_IMAGE = "tarampampam/3proxy:1.9.1";
const SOCKS_PORT = "1080";
const HTTP_PORT = "3128";
const WRONG_PORT = "33130";

const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf";
const HTML = "https://webrecorder.net/";

const extraArgs = "--limit 1 --failOnFailedSeed --timeout 10 --logging debug";

let proxyAuthId;
let proxyNoAuthId;

beforeAll(() => {
  execSync("docker network create proxy-test-net");

  proxyAuthId = execSync(`docker run -e PROXY_LOGIN=user -e PROXY_PASSWORD=passw0rd -d --rm --network=proxy-test-net --name proxy-with-auth ${PROXY_IMAGE}`, {encoding: "utf-8"});

  proxyNoAuthId = execSync(`docker run -d --rm --network=proxy-test-net --name proxy-no-auth ${PROXY_IMAGE}`, {encoding: "utf-8"});
});

afterAll(async () => {
  execSync(`docker kill -s SIGINT ${proxyAuthId}`);
  execSync(`docker kill -s SIGINT ${proxyNoAuthId}`);
  await sleep(3000);
  execSync("docker network rm proxy-test-net");
});

describe("socks5 + https proxy tests", () => {
  for (const scheme of ["socks5", "http"]) {
    const port = scheme === "socks5" ? SOCKS_PORT : HTTP_PORT;

    for (const type of ["HTML page", "PDF"]) {

      const url = type === "PDF" ? PDF : HTML;

      test(`${scheme} proxy, ${type}, no auth`, () => {
        let status = 0;

        try {
          execSync(`docker run -e PROXY_SERVER=${scheme}://proxy-no-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
        } catch (e) {
          status = e.status;
        }
        expect(status).toBe(0);
      });

      test(`${scheme} proxy, ${type}, with auth`, () => {
        let status = 0;

        try {
          execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@proxy-with-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
        } catch (e) {
          status = e.status;
        }
        // auth supported only for SOCKS5
        expect(status).toBe(scheme === "socks5" ? 0 : 1);
      });

      test(`${scheme} proxy, ${type}, wrong auth`, () => {
        let status = 0;

        try {
          execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw1rd@proxy-with-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
        } catch (e) {
          status = e.status;
        }
        expect(status).toBe(1);
      });

      test(`${scheme} proxy, ${type}, wrong protocol`, () => {
        let status = 0;

        try {
          execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw1rd@proxy-with-auth:${scheme === "socks5" ? HTTP_PORT : SOCKS_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
        } catch (e) {
          status = e.status;
        }
        expect(status).toBe(1);
      });
    }

    test(`${scheme} proxy, proxy missing error`, () => {
      let status = 0;

      try {
        execSync(`docker run -e PROXY_SERVER=${scheme}://proxy-no-auth:${WRONG_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${HTML} ${extraArgs}`, {encoding: "utf-8"});
      } catch (e) {
        status = e.status;
      }
      expect(status).toBe(1);
    });
  }
});


test("http proxy, PDF, separate env vars", () => {
  execSync(`docker run -e PROXY_HOST=proxy-no-auth -e PROXY_PORT=${HTTP_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"});
});

test("http proxy set, but not running, separate env vars", () => {
  let status = 0;

  try {
    execSync(`docker run -e PROXY_HOST=proxy-no-auth -e PROXY_PORT=${WRONG_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"});
  } catch (e) {
    status = e.status;
  }
  expect(status).toBe(1);
});

test("http proxy set, but not running, cli arg", () => {
  let status = 0;

  try {
    execSync(`docker run --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --proxyServer http://proxy-no-auth:${WRONG_PORT} --url ${PDF} ${extraArgs}`, {encoding: "utf-8"});
  } catch (e) {
    status = e.status;
  }
  expect(status).toBe(1);
});
@@ -29,6 +29,7 @@ grpcfuse 1000000 285000 715000 28% /crawls`;
  // with combineWARC + generateWACZ, projected is 285k + 4 * 5k = 310k = 31%
  // does not exceed 90% threshold
  const returnValue = await checkDiskUtilization(
    '/crawls',
    params,
    5000 * 1024,
    mockDfOutput,
@@ -55,6 +56,7 @@ grpcfuse 100000 85000 15000 85% /crawls`;
  // with generateWACZ, projected is 85k + 3k x 2 = 91k = 91%
  // exceeds 90% threshold
  const returnValue = await checkDiskUtilization(
    '/crawls',
    params,
    3000 * 1024,
    mockDfOutput,
@@ -1,8 +1,11 @@
import fs from "fs";
import zlib from "zlib";
import path from "path";
import child_process from "child_process";

test("check that the warcinfo file works as expected on the command line", async () => {
test("run crawl", async() => {
  let success = false;

  try {
    const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
    const proc = child_process.execSync(
@@ -11,10 +14,42 @@ test("check that the warcinfo file works as expected on the command line", async
    );

    console.log(proc);
    success = true;
  } catch (error) {
    console.log(error);
  }

  expect(success).toBe(true);
});

test("check that the warcinfo for individual WARC is as expected", async () => {

  const warcs = fs.readdirSync("test-crawls/collections/warcinfo/archive/");

  let filename = "";

  for (const name of warcs) {
    if (name.startsWith("rec-")) {
      filename = path.join("test-crawls/collections/warcinfo/archive/", name);
      break;
    }
  }

  const warcData = fs.readFileSync(filename);

  const data = zlib.gunzipSync(warcData);

  const string = data.toString("utf8");

  expect(string.indexOf("operator: test")).toBeGreaterThan(-1);
  expect(string.indexOf("host: hostname")).toBeGreaterThan(-1);
  expect(
    string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/),
  ).not.toEqual(null);
  expect(string.indexOf("format: WARC File Format 1.1")).toBeGreaterThan(-1);
});

test("check that the warcinfo for combined WARC file is as expected", async () => {
  const warcData = fs.readFileSync(
    "test-crawls/collections/warcinfo/warcinfo_0.warc.gz",
  );
36  yarn.lock
@@ -2386,6 +2386,14 @@ fd-slicer@~1.1.0:
  dependencies:
    pend "~1.2.0"

fetch-socks@^1.3.0:
  version "1.3.0"
  resolved "https://registry.yarnpkg.com/fetch-socks/-/fetch-socks-1.3.0.tgz#1f07b26924b5e7370aa23fd6e9332a5863736d1b"
  integrity sha512-Cq7O53hoNiVeOs6u54f8M/H/w2yzhmnTQ3tcAJj9FNKYOeNGmt8qNU1zpWOzJD09f0uqfmBXxLbzWPsnT6GcRw==
  dependencies:
    socks "^2.8.1"
    undici "^6.10.1"

file-entry-cache@^6.0.1:
  version "6.0.1"
  resolved "https://registry.yarnpkg.com/file-entry-cache/-/file-entry-cache-6.0.1.tgz#211b2dd9659cb0394b073e7323ac3c933d522027"
@@ -2778,6 +2786,14 @@ ioredis@^5.3.2:
    redis-parser "^3.0.0"
    standard-as-callback "^2.1.0"

ip-address@^9.0.5:
  version "9.0.5"
  resolved "https://registry.yarnpkg.com/ip-address/-/ip-address-9.0.5.tgz#117a960819b08780c3bd1f14ef3c1cc1d3f3ea5a"
  integrity sha512-zHtQzGojZXTwZTHQqra+ETKd4Sn3vgi7uBmlPoXVWZqYvuKmtI0l/VZTjqGmJY9x88GGOaZ9+G9ES8hC4T4X8g==
  dependencies:
    jsbn "1.1.0"
    sprintf-js "^1.1.3"

ip@^1.1.8:
  version "1.1.8"
  resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.8.tgz#ae05948f6b075435ed3307acce04629da8cdbf48"
@@ -3427,6 +3443,11 @@ js-yaml@^4.1.0:
  dependencies:
    argparse "^2.0.1"

jsbn@1.1.0:
  version "1.1.0"
  resolved "https://registry.yarnpkg.com/jsbn/-/jsbn-1.1.0.tgz#b01307cb29b618a1ed26ec79e911f803c4da0040"
  integrity sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A==

jsesc@^2.5.1:
  version "2.5.2"
  resolved "https://registry.yarnpkg.com/jsesc/-/jsesc-2.5.2.tgz#80564d2e483dacf6e8ef209650a67df3f0c283a4"
@@ -4437,6 +4458,14 @@ socks@^2.7.1:
    ip "^2.0.0"
    smart-buffer "^4.2.0"

socks@^2.8.1:
  version "2.8.3"
  resolved "https://registry.yarnpkg.com/socks/-/socks-2.8.3.tgz#1ebd0f09c52ba95a09750afe3f3f9f724a800cb5"
  integrity sha512-l5x7VUUWbjVFbafGLxPWkYsHIhEvmF85tbIeFZWc8ZPtoMyybuEhL7Jye/ooC4/d48FgOjSJXgsF/AJPYCW8Zw==
  dependencies:
    ip-address "^9.0.5"
    smart-buffer "^4.2.0"

source-map-support@0.5.13:
  version "0.5.13"
  resolved "https://registry.yarnpkg.com/source-map-support/-/source-map-support-0.5.13.tgz#31b24a9c2e73c2de85066c0feb7d44767ed52932"
@@ -4455,6 +4484,11 @@ split-on-first@^1.0.0:
  resolved "https://registry.yarnpkg.com/split-on-first/-/split-on-first-1.1.0.tgz#f610afeee3b12bce1d0c30425e76398b78249a5f"
  integrity sha512-43ZssAJaMusuKWL8sKUBQXHWOpq8d6CfN/u1p4gUzfJkM05C8rxTmYrkIPTXapZpORA6LkkzcUulJ8FqA7Uudw==

sprintf-js@^1.1.3:
  version "1.1.3"
  resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.1.3.tgz#4914b903a2f8b685d17fdf78a70e917e872e444a"
  integrity sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==

sprintf-js@~1.0.2:
  version "1.0.3"
  resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.0.3.tgz#04e6926f662895354f3dd015203633b857297e2c"
@@ -4842,7 +4876,7 @@ undici-types@~5.25.1:
  resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-5.25.3.tgz#e044115914c85f0bcbb229f346ab739f064998c3"
  integrity sha512-Ga1jfYwRn7+cP9v8auvEXN1rX3sWqlayd4HP7OKk4mZWylEmu3KzXDUGrQUN6Ol7qo1gPvB2e5gX6udnyEPgdA==

undici@^6.18.2:
undici@^6.10.1, undici@^6.18.2:
  version "6.18.2"
  resolved "https://registry.yarnpkg.com/undici/-/undici-6.18.2.tgz#f662a5dc33cf654fc412a9912e5a07b138d75c97"
  integrity sha512-o/MQLTwRm9IVhOqhZ0NQ9oXax1ygPjw6Vs+Vq/4QRjbOAC3B1GCHy7TYxxbExKlb7bzDRzt9vBWU6BDz0RFfYg==