Merge branch 'main' into release/1.1.4

bump to 1.2.0-beta.1
Ilya Kreymer 2024-06-13 19:28:25 -07:00
commit f504effa51
25 changed files with 564 additions and 169 deletions

@@ -0,0 +1,26 @@
name: Generate Draft Release
on:
push:
branches:
- main
- "*-release"
jobs:
package_chart:
runs-on: ubuntu-latest
steps:
- name: Check out Git repository
uses: actions/checkout@v3
- name: Get Version
run: |
echo "version=$(jq -r .version package.json)" >> "$GITHUB_ENV"
- name: Make Draft Release
uses: softprops/action-gh-release@v1
with:
name: "Browsertrix Crawler v${{ env.version }}"
tag_name: v${{ env.version }}
draft: true

@@ -1,4 +1,4 @@
ARG BROWSER_VERSION=1.64.109
ARG BROWSER_VERSION=1.66.115
ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base:brave-${BROWSER_VERSION}
FROM ${BROWSER_IMAGE_BASE}
@@ -6,11 +6,7 @@ FROM ${BROWSER_IMAGE_BASE}
# needed to add args to main build stage
ARG BROWSER_VERSION
ENV PROXY_HOST=localhost \
PROXY_PORT=8080 \
PROXY_CA_URL=http://wsgiprox/download/pem \
PROXY_CA_FILE=/tmp/proxy-ca.pem \
DISPLAY=:99 \
ENV DISPLAY=:99 \
GEOMETRY=1360x1020x16 \
BROWSER_VERSION=${BROWSER_VERSION} \
BROWSER_BIN=google-chrome \
@@ -28,9 +24,6 @@ ADD package.json /app/
# to allow forcing rebuilds from this stage
ARG REBUILD
# Prefetch tldextract so pywb is able to boot in environments with limited internet access
RUN tldextract --update
# Download and format ad host blocklist as JSON
RUN mkdir -p /tmp/ads && cd /tmp/ads && \
curl -vs -o ad-hosts.txt https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts && \
@@ -64,8 +57,11 @@ WORKDIR /crawls
# enable to test custom behaviors build (from browsertrix-behaviors)
# COPY behaviors.js /app/node_modules/browsertrix-behaviors/dist/behaviors.js
# add brave/chromium group policies
RUN mkdir -p /etc/brave/policies/managed/
ADD config/policies /etc/brave/policies/managed/
ADD docker-entrypoint.sh /docker-entrypoint.sh
ENTRYPOINT ["/docker-entrypoint.sh"]
CMD ["crawl"]

@@ -0,0 +1,6 @@
{
"BraveRewardsDisabled": true,
"BraveWalletDisabled": true,
"BraveVPNDisabled": 1,
"BraveAIChatEnabled": false
}

@@ -0,0 +1,8 @@
{
"IncognitoModeAvailability": 1,
"TorDisabled": true,
"AllowFileSelectionDialogs": false,
"URLBlocklist": [
"file://*"
]
}

@@ -144,6 +144,11 @@ Options:
age behavior will run on each page.
If 0, a behavior can run until finis
h. [number] [default: 90]
--postLoadDelay If >0, amount of time to sleep (in s
econds) after page has loaded, befor
e taking screenshots / getting text
/ running behaviors
[number] [default: 0]
--pageExtraDelay, --delay If >0, amount of time to sleep (in s
econds) after behaviors before movin
g on to next page
@@ -227,16 +232,19 @@ Options:
--writePagesToRedis If set, write page objects to redis
[boolean] [default: false]
--failOnFailedSeed If set, crawler will fail with exit
code 1 if any seed fails
[boolean] [default: false]
code 1 if any seed fails. When combi
ned with --failOnInvalidStatus, will
result in crawl failing with exit co
de 1 if any seed has a 4xx/5xx respo
nse [boolean] [default: false]
--failOnFailedLimit If set, save state and exit if numbe
r of failed pages exceeds this value
[number] [default: 0]
--failOnInvalidStatus If set, will treat pages with non-20
0 response as failures. When combine
d with --failOnFailedLimit or --fail
OnFailedSeedmay result in crawl fail
ing due to non-200 responses
--failOnInvalidStatus If set, will treat pages with 4xx or
5xx response as failures. When comb
ined with --failOnFailedLimit or --f
ailOnFailedSeed may result in crawl
failing due to non-200 responses
[boolean] [default: false]
--customBehaviors injects a custom behavior file or se
t of behavior files in a directory
@@ -250,6 +258,8 @@ Options:
nabled, or disabled with custom prof
ile
[choices: "disabled", "disabled-if-profile", "enabled"] [default: "disabled"]
--dryRun If true, no data is written to disk,
only logs [boolean]
--qaSource Required for QA mode. Source (WACZ o
r multi WACZ) for QA [string]
--qaDebugImageDiff if specified, will write crawl.png,
@@ -269,7 +279,8 @@ Options:
ted
--password The password for the login. If not specified, will be promp
ted (recommended)
--filename The filename for the profile tarball
--filename The filename for the profile tarball, stored within /crawls
/profiles if absolute path not provided
[default: "/crawls/profiles/profile.tar.gz"]
--debugScreenshot If specified, take a screenshot after login and save as thi
s filename
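For a concrete sense of the new flags, a hypothetical invocation in the style of the repo's test suite (URL, flags combination, and limit are illustrative, not from this commit):

import child_process from "child_process";

// Sketch: sleep 5 seconds after each page load before screenshots/text/behaviors,
// and exit with code 1 if the seed returns a 4xx/5xx response.
child_process.execSync(
  "docker run --rm webrecorder/browsertrix-crawler crawl " +
    "--url https://example.com/ --postLoadDelay 5 " +
    "--failOnFailedSeed --failOnInvalidStatus --limit 1",
);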

@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.1.4",
"version": "1.2.0-beta.1",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
@@ -21,6 +21,7 @@
"@webrecorder/wabac": "^2.16.12",
"browsertrix-behaviors": "^0.6.0",
"crc": "^4.3.2",
"fetch-socks": "^1.3.0",
"get-folder-size": "^4.0.0",
"husky": "^8.0.3",
"ioredis": "^5.3.2",

@@ -16,8 +16,6 @@ import { parseArgs } from "./util/argParser.js";
import yaml from "js-yaml";
import * as warcio from "warcio";
import { HealthChecker } from "./util/healthcheck.js";
import { TextExtractViaSnapshot } from "./util/textextract.js";
import {
@@ -46,27 +44,19 @@ import { Browser } from "./util/browser.js";
import {
ADD_LINK_FUNC,
BEHAVIOR_LOG_FUNC,
HTML_TYPES,
DEFAULT_SELECTORS,
} from "./util/constants.js";
import { AdBlockRules, BlockRules } from "./util/blockrules.js";
import { OriginOverride } from "./util/originoverride.js";
// to ignore HTTPS error for HEAD check
import { Agent as HTTPAgent } from "http";
import { Agent as HTTPSAgent } from "https";
import { CDPSession, Frame, HTTPRequest, Page, Protocol } from "puppeteer-core";
import { Recorder } from "./util/recorder.js";
import { SitemapReader } from "./util/sitemapper.js";
import { ScopedSeed } from "./util/seeds.js";
import { WARCWriter } from "./util/warcwriter.js";
const HTTPS_AGENT = new HTTPSAgent({
rejectUnauthorized: false,
});
const HTTP_AGENT = new HTTPAgent();
import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js";
import { isHTMLContentType } from "./util/reqresp.js";
import { initProxy } from "./util/proxy.js";
const behaviors = fs.readFileSync(
new URL(
@@ -184,6 +174,8 @@ export class Crawler {
maxHeapUsed = 0;
maxHeapTotal = 0;
proxyServer?: string;
driver!: (opts: {
page: Page;
data: PageState;
@@ -191,7 +183,7 @@
crawler: Crawler;
}) => NonNullable<unknown>;
recording = true;
recording: boolean;
constructor() {
const args = this.parseArgs();
@@ -225,6 +217,13 @@
logger.debug("Writing log to: " + this.logFilename, {}, "general");
this.recording = !this.params.dryRun;
if (this.params.dryRun) {
logger.warn(
"Dry run mode: no archived data stored, only pages and logging. Storage and archive creation related options will be ignored.",
);
}
this.headers = {};
// pages file
@@ -449,17 +448,23 @@
async bootstrap() {
const subprocesses: ChildProcess[] = [];
this.proxyServer = initProxy(this.params.proxyServer);
subprocesses.push(this.launchRedis());
await fsp.mkdir(this.logDir, { recursive: true });
if (!this.params.dryRun) {
await fsp.mkdir(this.archivesDir, { recursive: true });
await fsp.mkdir(this.tempdir, { recursive: true });
await fsp.mkdir(this.tempCdxDir, { recursive: true });
}
this.logFH = fs.createWriteStream(this.logFilename, { flags: "a" });
logger.setExternalLogStream(this.logFH);
this.infoString = await getInfoString();
setWARCInfo(this.infoString, this.params.warcInfo);
logger.info(this.infoString);
logger.info("Seeds", this.seeds);
@@ -515,10 +520,10 @@
);
}
if (this.params.screenshot) {
if (this.params.screenshot && !this.params.dryRun) {
this.screenshotWriter = this.createExtraResourceWarcWriter("screenshots");
}
if (this.params.text) {
if (this.params.text && !this.params.dryRun) {
this.textWriter = this.createExtraResourceWarcWriter("text");
}
}
@@ -788,7 +793,7 @@ self.__bx_behaviors.selectMainBehavior();
async crawlPage(opts: WorkerState): Promise<void> {
await this.writeStats();
const { page, data, workerid, callbacks, directFetchCapture } = opts;
const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts;
data.callbacks = callbacks;
const { url } = data;
@@ -797,35 +802,27 @@ self.__bx_behaviors.selectMainBehavior();
data.logDetails = logDetails;
data.workerid = workerid;
data.isHTMLPage = await timedRun(
this.isHTML(url, logDetails),
FETCH_TIMEOUT_SECS,
"HEAD request to determine if URL is HTML page timed out",
logDetails,
"fetch",
true,
);
if (!data.isHTMLPage && directFetchCapture) {
if (directFetchCapture) {
try {
const { fetched, mime } = await timedRun(
directFetchCapture(url),
const { fetched, mime, ts } = await timedRun(
directFetchCapture({ url, headers: this.headers, cdp }),
FETCH_TIMEOUT_SECS,
"Direct fetch capture attempt timed out",
logDetails,
"fetch",
true,
);
if (fetched) {
data.loadState = LoadState.FULL_PAGE_LOADED;
if (mime) {
data.mime = mime;
data.isHTMLPage = isHTMLContentType(mime);
}
if (fetched) {
data.loadState = LoadState.FULL_PAGE_LOADED;
data.status = 200;
data.ts = new Date();
data.ts = ts || new Date();
logger.info(
"Direct fetch successful",
{ url, ...logDetails },
{ url, mime, ...logDetails },
"fetch",
);
return;
@@ -1105,30 +1102,10 @@ self.__bx_behaviors.selectMainBehavior();
return res ? frame : null;
}
async createWARCInfo(filename: string) {
const warcVersion = "WARC/1.1";
const type = "warcinfo";
const info = {
software: this.infoString,
format: "WARC File Format 1.1",
};
const warcInfo = { ...info, ...this.params.warcInfo };
const record = await warcio.WARCRecord.createWARCInfo(
{ filename, type, warcVersion },
warcInfo,
);
const buffer = await warcio.WARCSerializer.serialize(record, {
gzip: true,
});
return buffer;
}
async checkLimits() {
let interrupt = false;
const size = await getDirSize(this.archivesDir);
const size = this.params.dryRun ? 0 : await getDirSize(this.archivesDir);
await this.crawlState.setArchiveSize(size);
@@ -1153,7 +1130,11 @@ self.__bx_behaviors.selectMainBehavior();
if (this.params.diskUtilization) {
// Check that disk usage isn't already or soon to be above threshold
const diskUtil = await checkDiskUtilization(this.params, size);
const diskUtil = await checkDiskUtilization(
this.collDir,
this.params,
size,
);
if (diskUtil.stop === true) {
interrupt = true;
}
@@ -1328,7 +1309,7 @@ self.__bx_behaviors.selectMainBehavior();
emulateDevice: this.emulateDevice,
swOpt: this.params.serviceWorker,
chromeOptions: {
proxy: false,
proxy: this.proxyServer,
userAgent: this.emulateDevice.userAgent,
extraArgs: this.extraChromeArgs(),
},
@@ -1424,11 +1405,11 @@ self.__bx_behaviors.selectMainBehavior();
}
async postCrawl() {
if (this.params.combineWARC) {
if (this.params.combineWARC && !this.params.dryRun) {
await this.combineWARC();
}
if (this.params.generateCDX) {
if (this.params.generateCDX && !this.params.dryRun) {
logger.info("Generating CDX");
await fsp.mkdir(path.join(this.collDir, "indexes"), { recursive: true });
await this.crawlState.setStatus("generate-cdx");
@@ -1460,6 +1441,7 @@ self.__bx_behaviors.selectMainBehavior();
if (
this.params.generateWACZ &&
!this.params.dryRun &&
(!this.interrupted || this.finalExit || this.uploadAndDeleteLocal)
) {
const uploaded = await this.generateWACZ();
@@ -1775,7 +1757,7 @@ self.__bx_behaviors.selectMainBehavior();
const contentType = resp.headers()["content-type"];
isHTMLPage = this.isHTMLContentType(contentType);
isHTMLPage = isHTMLContentType(contentType);
if (contentType) {
data.mime = contentType.split(";")[0];
@@ -1923,7 +1905,9 @@ self.__bx_behaviors.selectMainBehavior();
"behavior",
);
try {
await frame.evaluate("self.__bx_behaviors.awaitPageLoad();");
await frame.evaluate(
"self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();",
);
} catch (e) {
logger.warn("Waiting for custom page load failed", e, "behavior");
}
@@ -2186,12 +2170,14 @@ self.__bx_behaviors.selectMainBehavior();
let { ts } = state;
if (!ts) {
ts = new Date();
if (!this.params.dryRun) {
logger.warn(
"Page date missing, setting to now",
{ url, ts },
"pageStatus",
);
}
}
row.ts = ts.toISOString();
@@ -2241,49 +2227,6 @@ self.__bx_behaviors.selectMainBehavior();
}
}
resolveAgent(urlParsed: URL) {
return urlParsed.protocol === "https:" ? HTTPS_AGENT : HTTP_AGENT;
}
async isHTML(url: string, logDetails: LogDetails) {
try {
const resp = await fetch(url, {
method: "HEAD",
headers: this.headers,
agent: this.resolveAgent,
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} as any);
if (resp.status !== 200) {
logger.debug("HEAD response code != 200, loading in browser", {
status: resp.status,
...logDetails,
});
return true;
}
return this.isHTMLContentType(resp.headers.get("Content-Type"));
} catch (e) {
// can't confirm not html, so try in browser
logger.debug("HEAD request failed", { ...formatErr(e), ...logDetails });
return true;
}
}
isHTMLContentType(contentType: string | null) {
// just load if no content-type
if (!contentType) {
return true;
}
const mime = contentType.split(";")[0];
if (HTML_TYPES.includes(mime)) {
return true;
}
return false;
}
async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) {
if (!sitemap) {
return;
@@ -2441,7 +2384,7 @@ self.__bx_behaviors.selectMainBehavior();
generatedCombinedWarcs.push(combinedWarcName);
const warcBuffer = await this.createWARCInfo(combinedWarcName);
const warcBuffer = await createWARCInfo(combinedWarcName);
fh.write(warcBuffer);
}

@@ -99,9 +99,10 @@ function cliOpts(): { [key: string]: Options } {
default: getDefaultWindowSize(),
},
proxy: {
type: "boolean",
default: false,
proxyServer: {
describe:
"if set, will use specified proxy server. Takes precedence over any env var proxy settings",
type: "string",
},
cookieDays: {
@@ -179,7 +180,7 @@ async function main() {
headless: params.headless,
signals: false,
chromeOptions: {
proxy: false,
proxy: params.proxyServer,
extraArgs: [
"--window-position=0,0",
`--window-size=${params.windowSize}`,

@@ -545,6 +545,18 @@ class ArgParser {
default: "disabled",
},
proxyServer: {
describe:
"if set, will use specified proxy server. Takes precedence over any env var proxy settings",
type: "string",
},
dryRun: {
describe:
"If true, no archive data is written to disk, only pages and logs (and optionally saved state).",
type: "boolean",
},
qaSource: {
describe: "Required for QA mode. Source (WACZ or multi WACZ) for QA",
type: "string",

@@ -4,6 +4,8 @@ import { logger, formatErr } from "./logger.js";
import { HTTPRequest, Page } from "puppeteer-core";
import { Browser } from "./browser.js";
import { fetch } from "undici";
const RULE_TYPES = ["block", "allowOnly"];
const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];

@@ -22,7 +22,7 @@ import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
import { Recorder } from "./recorder.js";
type BtrixChromeOpts = {
proxy?: boolean;
proxy?: string;
userAgent?: string | null;
extraArgs?: string[];
};
@@ -115,7 +115,6 @@ export class Browser {
? undefined
: (target) => this.targetFilter(target),
};
await this._init(launchOpts, ondisconnect, recording);
}
@@ -217,7 +216,7 @@
}
chromeArgs({
proxy = true,
proxy = "",
userAgent = null,
extraArgs = [],
}: BtrixChromeOpts) {
@@ -236,11 +235,13 @@
...extraArgs,
];
if (proxy) {
logger.info("Using proxy", { proxy }, "browser");
}
if (proxy) {
args.push("--ignore-certificate-errors");
args.push(
`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
);
args.push(`--proxy-server=${proxy}`);
}
return args;
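To make the new pass-through concrete: a crawl started with --proxyServer socks5://proxy-host:1080 now hands that URL to the browser verbatim (a sketch, assuming an existing Browser instance; proxy-host is hypothetical):

// chromeArgs() with a proxy URL from --proxyServer or PROXY_SERVER:
const args = browser.chromeArgs({
  proxy: "socks5://proxy-host:1080", // hypothetical proxy URL
  userAgent: null,
  extraArgs: [],
});
// args now includes "--proxy-server=socks5://proxy-host:1080" rather than the
// old hardcoded pywb address http://${PROXY_HOST}:${PROXY_PORT}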

src/util/proxy.ts (new file)

@@ -0,0 +1,60 @@
import { Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici";
import { socksDispatcher } from "fetch-socks";
import type { SocksProxyType } from "socks/typings/common/constants.js";
export function getEnvProxyUrl() {
if (process.env.PROXY_SERVER) {
return process.env.PROXY_SERVER;
}
// for backwards compatibility with 0.x proxy settings
if (process.env.PROXY_HOST && process.env.PROXY_PORT) {
return `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`;
}
return "";
}
export function initProxy(proxy?: string): string {
if (!proxy) {
proxy = getEnvProxyUrl();
}
if (proxy) {
const dispatcher = createDispatcher(proxy);
if (dispatcher) {
setGlobalDispatcher(dispatcher);
return proxy;
}
}
return "";
}
export function createDispatcher(proxyUrl: string): Dispatcher | undefined {
if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) {
// HTTP PROXY does not support auth, as it's not supported in the browser
// so must drop username/password for consistency
const url = new URL(proxyUrl);
url.username = "";
url.password = "";
return new ProxyAgent({ uri: url.href });
} else if (
proxyUrl.startsWith("socks://") ||
proxyUrl.startsWith("socks5://") ||
proxyUrl.startsWith("socks4://")
) {
// support auth as SOCKS5 auth *is* supported in Brave (though not in Chromium)
const url = new URL(proxyUrl);
const type: SocksProxyType = url.protocol === "socks4:" ? 4 : 5;
const params = {
type,
host: url.hostname,
port: parseInt(url.port),
userId: url.username || undefined,
password: url.password || undefined,
};
return socksDispatcher(params);
} else {
return undefined;
}
}
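A usage sketch for the new module (not part of the diff): the crawler calls initProxy() once during bootstrap(), after which every undici fetch() in the process goes through the global dispatcher, and the same URL is handed to the browser via --proxy-server.

import { fetch } from "undici";
import { initProxy } from "./util/proxy.js";

// Hypothetical standalone usage; the proxy URL is illustrative.
// With no argument, initProxy() falls back to PROXY_SERVER,
// then to the legacy PROXY_HOST/PROXY_PORT pair.
const proxyUrl = initProxy("socks5://user:passw0rd@proxy-host:1080");

if (proxyUrl) {
  // This request is routed through the SOCKS5 dispatcher installed above.
  const resp = await fetch("https://example.com/");
  console.log(resp.status, "fetched via", proxyUrl);
}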

@@ -6,7 +6,7 @@ import PQueue from "p-queue";
import { logger, formatErr } from "./logger.js";
import { sleep, timedRun, timestampNow } from "./timing.js";
import { RequestResponseInfo } from "./reqresp.js";
import { RequestResponseInfo, isHTMLContentType } from "./reqresp.js";
import { fetch, Response } from "undici";
@@ -77,11 +77,23 @@ export type AsyncFetchOptions = {
filter?: (resp: Response) => boolean;
ignoreDupe?: boolean;
maxFetchSize?: number;
manualRedirect?: boolean;
};
// =================================================================
export type ResponseStreamAsyncFetchOptions = AsyncFetchOptions & {
export type DirectFetchRequest = {
url: string;
headers: Record<string, string>;
cdp: CDPSession;
};
// =================================================================
export type NetworkLoadAsyncFetchOptions = AsyncFetchOptions & {
cdp: CDPSession;
};
// =================================================================
export type ResponseStreamAsyncFetchOptions = NetworkLoadAsyncFetchOptions & {
requestId: string;
};
@@ -1068,12 +1080,23 @@ export class Recorder {
this.writer.writeRecordPair(responseRecord, requestRecord);
}
async directFetchCapture(
url: string,
): Promise<{ fetched: boolean; mime: string }> {
async directFetchCapture({ url, headers, cdp }: DirectFetchRequest): Promise<{
fetched: boolean;
mime: string;
ts: Date;
}> {
const reqresp = new RequestResponseInfo("0");
const ts = new Date();
const cookie = await this.getCookieString(cdp, url);
if (cookie) {
headers["Cookie"] = cookie;
}
reqresp.url = url;
reqresp.method = "GET";
reqresp.requestHeaders = headers;
reqresp.ts = ts;
logger.debug(
"Directly fetching page URL without browser",
@@ -1081,8 +1104,21 @@
"recorder",
);
const filter = (resp: Response) =>
resp.status === 200 && !resp.headers.get("set-cookie");
let mime: string = "";
const filter = (resp: Response) => {
// only direct load 200 responses
if (resp.status !== 200) {
return false;
}
const ct = resp.headers.get("content-type");
if (ct) {
mime = ct.split(";")[0];
}
return !isHTMLContentType(mime);
};
// ignore dupes: if previous URL was not a page, still load as page. if previous was page,
// should not get here, as dupe pages tracked via seen list
@@ -1093,16 +1129,28 @@
networkId: "0",
filter,
ignoreDupe: true,
manualRedirect: true,
});
const res = await fetcher.load();
const mime =
(reqresp.responseHeaders &&
reqresp.responseHeaders["content-type"] &&
reqresp.responseHeaders["content-type"].split(";")[0]) ||
"";
this.addPageRecord(reqresp);
return { fetched: res === "fetched", mime };
if (url === this.pageUrl && !this.pageInfo.ts) {
logger.debug("Setting page timestamp", { ts, url });
this.pageInfo.ts = ts;
}
return { fetched: res === "fetched", mime, ts };
}
async getCookieString(cdp: CDPSession, url: string) {
const cookieList: string[] = [];
const { cookies } = await cdp.send("Network.getCookies", { urls: [url] });
for (const { name, value } of cookies) {
cookieList.push(`${name}=${value}`);
}
return cookieList.join(";");
}
}
@@ -1121,6 +1169,8 @@ class AsyncFetcher {
tempdir: string;
filename: string;
manualRedirect = false;
constructor({
tempdir,
reqresp,
@@ -1130,6 +1180,7 @@ class AsyncFetcher {
filter = undefined,
ignoreDupe = false,
maxFetchSize = MAX_BROWSER_DEFAULT_FETCH_SIZE,
manualRedirect = false,
}: AsyncFetchOptions) {
this.reqresp = reqresp;
this.reqresp.expectedSize = expectedSize;
@@ -1148,6 +1199,8 @@
);
this.maxFetchSize = maxFetchSize;
this.manualRedirect = manualRedirect;
}
async load() {
@@ -1283,9 +1336,9 @@
reqresp.status = 0;
reqresp.errorText = e.message;
} finally {
recorder.addPageRecord(reqresp);
// exclude direct fetch request with fake id
if (networkId !== "0") {
recorder.addPageRecord(reqresp);
recorder.removeReqResp(networkId);
}
}
@@ -1313,6 +1366,7 @@
headers,
body: reqresp.postData || undefined,
signal,
redirect: this.manualRedirect ? "manual" : "follow",
});
if (this.filter && !this.filter(resp) && abort) {
@@ -1329,6 +1383,7 @@
}
if (reqresp.expectedSize === 0) {
reqresp.fillFetchResponse(resp);
reqresp.payload = new Uint8Array();
return;
} else if (!resp.body) {
@@ -1428,7 +1483,7 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher {
class NetworkLoadStreamAsyncFetcher extends AsyncFetcher {
cdp: CDPSession;
constructor(opts: ResponseStreamAsyncFetchOptions) {
constructor(opts: NetworkLoadAsyncFetchOptions) {
super(opts);
this.cdp = opts.cdp;
}
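The reworked direct-fetch API reads most clearly from the caller's side; a sketch of how a worker now invokes it (see the worker.ts diff below; the URL is illustrative):

// Given a Recorder instance and the page's CDPSession:
const { fetched, mime, ts } = await recorder.directFetchCapture({
  url: "https://example.com/report.pdf", // illustrative non-HTML URL
  headers: this.headers, // crawler-wide headers; browser cookies are added via CDP
  cdp,
});
// fetched is true only for a 200, non-HTML response; redirects are no longer
// followed (manualRedirect), so a 3xx falls through to a normal browser load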

@@ -3,7 +3,7 @@ import { getStatusText } from "@webrecorder/wabac/src/utils.js";
import { Protocol } from "puppeteer-core";
import { postToGetUrl } from "warcio";
import { HTML_TYPES } from "./constants.js";
import { Response } from "undici";
const CONTENT_LENGTH = "content-length";
@@ -150,10 +150,15 @@ export class RequestResponseInfo {
}
}
isRedirectStatus() {
return this.status >= 300 && this.status < 400 && this.status !== 304;
}
isSelfRedirect() {
if (this.status < 300 || this.status >= 400 || this.status === 304) {
if (!this.isRedirectStatus()) {
return false;
}
try {
const headers = new Headers(this.getResponseHeadersDict());
const location = headers.get("location") || "";
@@ -365,3 +370,18 @@ export class RequestResponseInfo {
return value.replace(/\n/g, ", ");
}
}
export function isHTMLContentType(contentType: string | null) {
// just load if no content-type
if (!contentType) {
return true;
}
const mime = contentType.split(";")[0];
if (HTML_TYPES.includes(mime)) {
return true;
}
return false;
}
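A few illustrative calls, based on the behavior above (HTML_TYPES includes "text/html"):

isHTMLContentType("text/html; charset=utf-8"); // true: charset parameter is stripped first
isHTMLContentType("application/pdf"); // false: candidate for direct fetch
isHTMLContentType(null); // true: no content-type, so load in the browser to be safe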

@@ -9,6 +9,8 @@ import { logger, formatErr } from "./logger.js";
import { DETECT_SITEMAP } from "./constants.js";
import { sleep } from "./timing.js";
import { fetch, Response } from "undici";
const SITEMAP_CONCURRENCY = 5;
const TEXT_CONTENT_TYPE = ["text/plain"];
@@ -237,7 +239,8 @@ export class SitemapReader extends EventEmitter {
resp.headers.get("content-encoding") !== "gzip"
) {
const ds = new DecompressionStream("gzip");
stream = body.pipeThrough(ds);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
stream = body.pipeThrough(ds as any);
} else {
stream = body;
}

@@ -66,7 +66,7 @@ export class PageState {
callbacks: PageCallbacks = {};
isHTMLPage?: boolean;
isHTMLPage = true;
text?: string;
screenshotView?: Buffer;
favicon?: string;

@@ -202,6 +202,7 @@ export async function getDirSize(dir: string) {
}
export async function checkDiskUtilization(
collDir: string,
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
params: Record<string, any>,
@@ -209,7 +210,7 @@
dfOutput = null,
) {
const diskUsage: Record<string, string> = await getDiskUsage(
"/crawls",
collDir,
dfOutput,
);
const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1));

@@ -11,6 +11,8 @@ import PQueue from "p-queue";
const DEFAULT_ROLLOVER_SIZE = 1_000_000_000;
let warcInfo = {};
export type ResourceRecordData = {
buffer: Uint8Array;
resourceType: string;
@@ -117,6 +119,8 @@ export class WARCWriter implements IndexerOffsetLength {
);
}
fh.write(await createWARCInfo(this.filename));
return fh;
}
@@ -310,6 +314,33 @@
}
}
// =================================================================
export function setWARCInfo(
software: string,
otherParams?: Record<string, string>,
) {
warcInfo = {
software,
format: "WARC File Format 1.1",
...otherParams,
};
}
// =================================================================
export async function createWARCInfo(filename: string) {
const warcVersion = "WARC/1.1";
const type = "warcinfo";
const record = await WARCRecord.createWARCInfo(
{ filename, type, warcVersion },
warcInfo,
);
const buffer = await WARCSerializer.serialize(record, {
gzip: true,
});
return buffer;
}
// =================================================================
export function streamFinish(fh: Writable) {
const p = new Promise<void>((resolve) => {
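To tie the two new module-level helpers together, a minimal sketch of the intended call order (the version string, extra field, and filename are illustrative; in the crawler, setWARCInfo() is called once from bootstrap() with getInfoString() and the --warcInfo params, as shown in the crawler.ts diff above):

import { setWARCInfo, createWARCInfo } from "./util/warcwriter.js";

// Once at startup: record the software string plus user-supplied warcinfo fields.
setWARCInfo("Browsertrix-Crawler 1.2.0-beta.1 (with warcio.js 2.2.1)", {
  operator: "test", // hypothetical extra field, as exercised in warcinfo.test.js
});

// Later, once per WARC file: a gzipped warcinfo record naming that file.
const buffer = await createWARCInfo("rec-worker0.warc.gz");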

@@ -2,7 +2,7 @@ import os from "os";
import { logger, formatErr } from "./logger.js";
import { sleep, timedRun } from "./timing.js";
import { Recorder } from "./recorder.js";
import { DirectFetchRequest, Recorder } from "./recorder.js";
import { rxEscape } from "./seeds.js";
import { CDPSession, Page } from "puppeteer-core";
import { PageState, WorkerId } from "./state.js";
@@ -20,8 +20,10 @@ export type WorkerOpts = {
workerid: WorkerId;
// eslint-disable-next-line @typescript-eslint/ban-types
callbacks: Record<string, Function>;
directFetchCapture?:
| ((url: string) => Promise<{ fetched: boolean; mime: string }>)
directFetchCapture:
| ((
request: DirectFetchRequest,
) => Promise<{ fetched: boolean; mime: string; ts: Date }>)
| null;
frameIdToExecId: Map<string, number>;
};
@@ -171,7 +173,7 @@ export class PageWorker {
this.cdp = cdp;
this.callbacks = {};
const directFetchCapture = this.recorder
? (x: string) => this.recorder!.directFetchCapture(x)
? (req: DirectFetchRequest) => this.recorder!.directFetchCapture(req)
: null;
this.opts = {
page,

tests/dryrun.test.js (new file)

@@ -0,0 +1,18 @@
import child_process from "child_process";
import fs from "fs";
test("ensure dryRun crawl only writes pages and logs", async () => {
child_process.execSync(
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --collection dry-run-wr-net --combineWARC --rolloverSize 10000 --limit 2 --title "test title" --description "test description" --warcPrefix custom-prefix --dryRun',
);
const files = fs.readdirSync("test-crawls/collections/dry-run-wr-net").sort();
expect(files.length).toBe(2);
expect(files[0]).toBe("logs");
expect(files[1]).toBe("pages");
});

@@ -3,7 +3,7 @@ import fs from "fs";
import path from "path";
import { WARCParser } from "warcio";
const PDF = "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf";
const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf";
test("ensure pdf is crawled", async () => {
child_process.execSync(

tests/proxy.test.js (new file)

@@ -0,0 +1,127 @@
import { execSync, exec } from "child_process";
const sleep = (ms) => new Promise((res) => setTimeout(res, ms));
const PROXY_IMAGE = "tarampampam/3proxy:1.9.1";
const SOCKS_PORT = "1080";
const HTTP_PORT = "3128";
const WRONG_PORT = "33130";
const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf";
const HTML = "https://webrecorder.net/";
const extraArgs = "--limit 1 --failOnFailedSeed --timeout 10 --logging debug";
let proxyAuthId;
let proxyNoAuthId;
beforeAll(() => {
execSync("docker network create proxy-test-net");
proxyAuthId = execSync(`docker run -e PROXY_LOGIN=user -e PROXY_PASSWORD=passw0rd -d --rm --network=proxy-test-net --name proxy-with-auth ${PROXY_IMAGE}`, {encoding: "utf-8"});
proxyNoAuthId = execSync(`docker run -d --rm --network=proxy-test-net --name proxy-no-auth ${PROXY_IMAGE}`, {encoding: "utf-8"});
});
afterAll(async () => {
execSync(`docker kill -s SIGINT ${proxyAuthId}`);
execSync(`docker kill -s SIGINT ${proxyNoAuthId}`);
await sleep(3000);
execSync("docker network rm proxy-test-net");
});
describe("socks5 + https proxy tests", () => {
for (const scheme of ["socks5", "http"]) {
const port = scheme === "socks5" ? SOCKS_PORT : HTTP_PORT;
for (const type of ["HTML page", "PDF"]) {
const url = type === "PDF" ? PDF : HTML;
test(`${scheme} proxy, ${type}, no auth`, () => {
let status = 0;
try {
execSync(`docker run -e PROXY_SERVER=${scheme}://proxy-no-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
expect(status).toBe(0);
});
test(`${scheme} proxy, ${type}, with auth`, () => {
let status = 0;
try {
execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@proxy-with-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
// auth supported only for SOCKS5
expect(status).toBe(scheme === "socks5" ? 0 : 1);
});
test(`${scheme} proxy, ${type}, wrong auth`, () => {
let status = 0;
try {
execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw1rd@proxy-with-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
expect(status).toBe(1);
});
test(`${scheme} proxy, ${type}, wrong protocol`, () => {
let status = 0;
try {
execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw1rd@proxy-with-auth:${scheme === "socks5" ? HTTP_PORT : SOCKS_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
expect(status).toBe(1);
});
}
test(`${scheme} proxy, proxy missing error`, () => {
let status = 0;
try {
execSync(`docker run -e PROXY_SERVER=${scheme}://proxy-no-auth:${WRONG_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${HTML} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
expect(status).toBe(1);
});
}
});
test("http proxy, PDF, separate env vars", () => {
execSync(`docker run -e PROXY_HOST=proxy-no-auth -e PROXY_PORT=${HTTP_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"});
});
test("http proxy set, but not running, separate env vars", () => {
let status = 0;
try {
execSync(`docker run -e PROXY_HOST=proxy-no-auth -e PROXY_PORT=${WRONG_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
expect(status).toBe(1);
});
test("http proxy set, but not running, cli arg", () => {
let status = 0;
try {
execSync(`docker run --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --proxyServer http://proxy-no-auth:${WRONG_PORT} --url ${PDF} ${extraArgs}`, {encoding: "utf-8"});
} catch (e) {
status = e.status;
}
expect(status).toBe(1);
});

@@ -29,6 +29,7 @@ grpcfuse 1000000 285000 715000 28% /crawls`;
// with combineWARC + generateWACZ, projected is 285k + 4 * 5k = 310k = 31%
// does not exceed 90% threshold
const returnValue = await checkDiskUtilization(
'/crawls',
params,
5000 * 1024,
mockDfOutput,
@@ -55,6 +56,7 @@ grpcfuse 100000 85000 15000 85% /crawls`;
// with generateWACZ, projected is 85k + 3k x 2 = 91k = 91%
// exceeds 90% threshold
const returnValue = await checkDiskUtilization(
'/crawls',
params,
3000 * 1024,
mockDfOutput,

@@ -1,8 +1,11 @@
import fs from "fs";
import zlib from "zlib";
import path from "path";
import child_process from "child_process";
test("check that the warcinfo file works as expected on the command line", async () => {
test("run crawl", async() => {
let success = false;
try {
const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
const proc = child_process.execSync(
@@ -11,10 +14,42 @@ test("check that the warcinfo file works as expected on the command line", async
);
console.log(proc);
success = true;
} catch (error) {
console.log(error);
}
expect(success).toBe(true);
});
test("check that the warcinfo for individual WARC is as expected", async () => {
const warcs = fs.readdirSync("test-crawls/collections/warcinfo/archive/");
let filename = "";
for (const name of warcs) {
if (name.startsWith("rec-")) {
filename = path.join("test-crawls/collections/warcinfo/archive/", name);
break;
}
}
const warcData = fs.readFileSync(filename);
const data = zlib.gunzipSync(warcData);
const string = data.toString("utf8");
expect(string.indexOf("operator: test")).toBeGreaterThan(-1);
expect(string.indexOf("host: hostname")).toBeGreaterThan(-1);
expect(
string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/),
).not.toEqual(null);
expect(string.indexOf("format: WARC File Format 1.1")).toBeGreaterThan(-1);
});
test("check that the warcinfo for combined WARC file is as expected", async () => {
const warcData = fs.readFileSync(
"test-crawls/collections/warcinfo/warcinfo_0.warc.gz",
);

@@ -2386,6 +2386,14 @@ fd-slicer@~1.1.0:
dependencies:
pend "~1.2.0"
fetch-socks@^1.3.0:
version "1.3.0"
resolved "https://registry.yarnpkg.com/fetch-socks/-/fetch-socks-1.3.0.tgz#1f07b26924b5e7370aa23fd6e9332a5863736d1b"
integrity sha512-Cq7O53hoNiVeOs6u54f8M/H/w2yzhmnTQ3tcAJj9FNKYOeNGmt8qNU1zpWOzJD09f0uqfmBXxLbzWPsnT6GcRw==
dependencies:
socks "^2.8.1"
undici "^6.10.1"
file-entry-cache@^6.0.1:
version "6.0.1"
resolved "https://registry.yarnpkg.com/file-entry-cache/-/file-entry-cache-6.0.1.tgz#211b2dd9659cb0394b073e7323ac3c933d522027"
@@ -2778,6 +2786,14 @@ ioredis@^5.3.2:
redis-parser "^3.0.0"
standard-as-callback "^2.1.0"
ip-address@^9.0.5:
version "9.0.5"
resolved "https://registry.yarnpkg.com/ip-address/-/ip-address-9.0.5.tgz#117a960819b08780c3bd1f14ef3c1cc1d3f3ea5a"
integrity sha512-zHtQzGojZXTwZTHQqra+ETKd4Sn3vgi7uBmlPoXVWZqYvuKmtI0l/VZTjqGmJY9x88GGOaZ9+G9ES8hC4T4X8g==
dependencies:
jsbn "1.1.0"
sprintf-js "^1.1.3"
ip@^1.1.8:
version "1.1.8"
resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.8.tgz#ae05948f6b075435ed3307acce04629da8cdbf48"
@@ -3427,6 +3443,11 @@ js-yaml@^4.1.0:
dependencies:
argparse "^2.0.1"
jsbn@1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/jsbn/-/jsbn-1.1.0.tgz#b01307cb29b618a1ed26ec79e911f803c4da0040"
integrity sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A==
jsesc@^2.5.1:
version "2.5.2"
resolved "https://registry.yarnpkg.com/jsesc/-/jsesc-2.5.2.tgz#80564d2e483dacf6e8ef209650a67df3f0c283a4"
@@ -4437,6 +4458,14 @@ socks@^2.7.1:
ip "^2.0.0"
smart-buffer "^4.2.0"
socks@^2.8.1:
version "2.8.3"
resolved "https://registry.yarnpkg.com/socks/-/socks-2.8.3.tgz#1ebd0f09c52ba95a09750afe3f3f9f724a800cb5"
integrity sha512-l5x7VUUWbjVFbafGLxPWkYsHIhEvmF85tbIeFZWc8ZPtoMyybuEhL7Jye/ooC4/d48FgOjSJXgsF/AJPYCW8Zw==
dependencies:
ip-address "^9.0.5"
smart-buffer "^4.2.0"
source-map-support@0.5.13:
version "0.5.13"
resolved "https://registry.yarnpkg.com/source-map-support/-/source-map-support-0.5.13.tgz#31b24a9c2e73c2de85066c0feb7d44767ed52932"
@@ -4455,6 +4484,11 @@ split-on-first@^1.0.0:
resolved "https://registry.yarnpkg.com/split-on-first/-/split-on-first-1.1.0.tgz#f610afeee3b12bce1d0c30425e76398b78249a5f"
integrity sha512-43ZssAJaMusuKWL8sKUBQXHWOpq8d6CfN/u1p4gUzfJkM05C8rxTmYrkIPTXapZpORA6LkkzcUulJ8FqA7Uudw==
sprintf-js@^1.1.3:
version "1.1.3"
resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.1.3.tgz#4914b903a2f8b685d17fdf78a70e917e872e444a"
integrity sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==
sprintf-js@~1.0.2:
version "1.0.3"
resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.0.3.tgz#04e6926f662895354f3dd015203633b857297e2c"
@@ -4842,7 +4876,7 @@ undici-types@~5.25.1:
resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-5.25.3.tgz#e044115914c85f0bcbb229f346ab739f064998c3"
integrity sha512-Ga1jfYwRn7+cP9v8auvEXN1rX3sWqlayd4HP7OKk4mZWylEmu3KzXDUGrQUN6Ol7qo1gPvB2e5gX6udnyEPgdA==
undici@^6.18.2:
undici@^6.10.1, undici@^6.18.2:
version "6.18.2"
resolved "https://registry.yarnpkg.com/undici/-/undici-6.18.2.tgz#f662a5dc33cf654fc412a9912e5a07b138d75c97"
integrity sha512-o/MQLTwRm9IVhOqhZ0NQ9oXax1ygPjw6Vs+Vq/4QRjbOAC3B1GCHy7TYxxbExKlb7bzDRzt9vBWU6BDz0RFfYg==