diff --git a/.github/workflows/make-draft-release.yaml b/.github/workflows/make-draft-release.yaml new file mode 100644 index 00000000..e44ae848 --- /dev/null +++ b/.github/workflows/make-draft-release.yaml @@ -0,0 +1,26 @@ +name: Generate Draft Release + +on: + push: + branches: + - main + - "*-release" + +jobs: + package_chart: + runs-on: ubuntu-latest + + steps: + - name: Check out Git repository + uses: actions/checkout@v3 + + - name: Get Version + run: | + echo "version=$(jq -r .version package.json)" >> "$GITHUB_ENV" + + - name: Make Draft Release + uses: softprops/action-gh-release@v1 + with: + name: "Browsertrix Crawler v${{ env.version }}" + tag_name: v${{ env.version }} + draft: true diff --git a/Dockerfile b/Dockerfile index 1f407fa6..19e7dddf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -ARG BROWSER_VERSION=1.64.109 +ARG BROWSER_VERSION=1.66.115 ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base:brave-${BROWSER_VERSION} FROM ${BROWSER_IMAGE_BASE} @@ -6,11 +6,7 @@ FROM ${BROWSER_IMAGE_BASE} # needed to add args to main build stage ARG BROWSER_VERSION -ENV PROXY_HOST=localhost \ - PROXY_PORT=8080 \ - PROXY_CA_URL=http://wsgiprox/download/pem \ - PROXY_CA_FILE=/tmp/proxy-ca.pem \ - DISPLAY=:99 \ +ENV DISPLAY=:99 \ GEOMETRY=1360x1020x16 \ BROWSER_VERSION=${BROWSER_VERSION} \ BROWSER_BIN=google-chrome \ @@ -28,9 +24,6 @@ ADD package.json /app/ # to allow forcing rebuilds from this stage ARG REBUILD -# Prefetch tldextract so pywb is able to boot in environments with limited internet access -RUN tldextract --update - # Download and format ad host blocklist as JSON RUN mkdir -p /tmp/ads && cd /tmp/ads && \ curl -vs -o ad-hosts.txt https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts && \ @@ -64,8 +57,11 @@ WORKDIR /crawls # enable to test custom behaviors build (from browsertrix-behaviors) # COPY behaviors.js /app/node_modules/browsertrix-behaviors/dist/behaviors.js +# add brave/chromium group policies +RUN mkdir -p /etc/brave/policies/managed/ +ADD config/policies /etc/brave/policies/managed/ + ADD docker-entrypoint.sh /docker-entrypoint.sh ENTRYPOINT ["/docker-entrypoint.sh"] CMD ["crawl"] - diff --git a/config/policies/brave-default.json b/config/policies/brave-default.json new file mode 100644 index 00000000..aac2fc24 --- /dev/null +++ b/config/policies/brave-default.json @@ -0,0 +1,6 @@ +{ + "BraveRewardsDisabled": true, + "BraveWalletDisabled": true, + "BraveVPNDisabled": 1, + "BraveAIChatEnabled": false +} diff --git a/config/policies/lockdown-profilebrowser.json b/config/policies/lockdown-profilebrowser.json new file mode 100644 index 00000000..ecfd4994 --- /dev/null +++ b/config/policies/lockdown-profilebrowser.json @@ -0,0 +1,8 @@ +{ + "IncognitoModeAvailability": 1, + "TorDisabled": true, + "AllowFileSelectionDialogs": false, + "URLBlocklist": [ + "file://*" + ] +} diff --git a/docs/docs/user-guide/cli-options.md b/docs/docs/user-guide/cli-options.md index accda6e2..1d97210d 100644 --- a/docs/docs/user-guide/cli-options.md +++ b/docs/docs/user-guide/cli-options.md @@ -144,6 +144,11 @@ Options: age behavior will run on each page. If 0, a behavior can run until finis h. 
[number] [default: 90] + --postLoadDelay If >0, amount of time to sleep (in s + econds) after page has loaded, befor + e taking screenshots / getting text + / running behaviors + [number] [default: 0] --pageExtraDelay, --delay If >0, amount of time to sleep (in s econds) after behaviors before movin g on to next page @@ -227,16 +232,19 @@ Options: --writePagesToRedis If set, write page objects to redis [boolean] [default: false] --failOnFailedSeed If set, crawler will fail with exit - code 1 if any seed fails - [boolean] [default: false] + code 1 if any seed fails. When combi + ned with --failOnInvalidStatus,will + result in crawl failing with exit co + de 1 if any seed has a 4xx/5xx respo + nse [boolean] [default: false] --failOnFailedLimit If set, save state and exit if numbe r of failed pages exceeds this value [number] [default: 0] - --failOnInvalidStatus If set, will treat pages with non-20 - 0 response as failures. When combine - d with --failOnFailedLimit or --fail - OnFailedSeedmay result in crawl fail - ing due to non-200 responses + --failOnInvalidStatus If set, will treat pages with 4xx or + 5xx response as failures. When comb + ined with --failOnFailedLimit or --f + ailOnFailedSeed may result in crawl + failing due to non-200 responses [boolean] [default: false] --customBehaviors injects a custom behavior file or se t of behavior files in a directory @@ -250,6 +258,8 @@ Options: nabled, or disabled with custom prof ile [choices: "disabled", "disabled-if-profile", "enabled"] [default: "disabled"] + --dryRun If true, no data is written to disk, + only logs [boolean] --qaSource Required for QA mode. Source (WACZ o r multi WACZ) for QA [string] --qaDebugImageDiff if specified, will write crawl.png, @@ -269,7 +279,8 @@ Options: ted --password The password for the login. 
If not specified, will be promp ted (recommended) - --filename The filename for the profile tarball + --filename The filename for the profile tarball, stored within /crawls + /profiles if absolute path not provided [default: "/crawls/profiles/profile.tar.gz"] --debugScreenshot If specified, take a screenshot after login and save as thi s filename diff --git a/package.json b/package.json index 1c53a828..7cb8c9e0 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.1.4", + "version": "1.2.0-beta.1", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", @@ -21,6 +21,7 @@ "@webrecorder/wabac": "^2.16.12", "browsertrix-behaviors": "^0.6.0", "crc": "^4.3.2", + "fetch-socks": "^1.3.0", "get-folder-size": "^4.0.0", "husky": "^8.0.3", "ioredis": "^5.3.2", diff --git a/src/crawler.ts b/src/crawler.ts index 37d8e54f..6b061b62 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -16,8 +16,6 @@ import { parseArgs } from "./util/argParser.js"; import yaml from "js-yaml"; -import * as warcio from "warcio"; - import { HealthChecker } from "./util/healthcheck.js"; import { TextExtractViaSnapshot } from "./util/textextract.js"; import { @@ -46,27 +44,19 @@ import { Browser } from "./util/browser.js"; import { ADD_LINK_FUNC, BEHAVIOR_LOG_FUNC, - HTML_TYPES, DEFAULT_SELECTORS, } from "./util/constants.js"; import { AdBlockRules, BlockRules } from "./util/blockrules.js"; import { OriginOverride } from "./util/originoverride.js"; -// to ignore HTTPS error for HEAD check -import { Agent as HTTPAgent } from "http"; -import { Agent as HTTPSAgent } from "https"; import { CDPSession, Frame, HTTPRequest, Page, Protocol } from "puppeteer-core"; import { Recorder } from "./util/recorder.js"; import { SitemapReader } from "./util/sitemapper.js"; import { ScopedSeed } from "./util/seeds.js"; -import { WARCWriter } from "./util/warcwriter.js"; - -const HTTPS_AGENT = new HTTPSAgent({ - rejectUnauthorized: false, -}); - -const HTTP_AGENT = new HTTPAgent(); +import { WARCWriter, createWARCInfo, setWARCInfo } from "./util/warcwriter.js"; +import { isHTMLContentType } from "./util/reqresp.js"; +import { initProxy } from "./util/proxy.js"; const behaviors = fs.readFileSync( new URL( @@ -184,6 +174,8 @@ export class Crawler { maxHeapUsed = 0; maxHeapTotal = 0; + proxyServer?: string; + driver!: (opts: { page: Page; data: PageState; @@ -191,7 +183,7 @@ export class Crawler { crawler: Crawler; }) => NonNullable; - recording = true; + recording: boolean; constructor() { const args = this.parseArgs(); @@ -225,6 +217,13 @@ export class Crawler { logger.debug("Writing log to: " + this.logFilename, {}, "general"); + this.recording = !this.params.dryRun; + if (this.params.dryRun) { + logger.warn( + "Dry run mode: no archived data stored, only pages and logging. 
Storage and archive creation related options will be ignored.", + ); + } + this.headers = {}; // pages file @@ -449,17 +448,23 @@ export class Crawler { async bootstrap() { const subprocesses: ChildProcess[] = []; + this.proxyServer = initProxy(this.params.proxyServer); + subprocesses.push(this.launchRedis()); await fsp.mkdir(this.logDir, { recursive: true }); - await fsp.mkdir(this.archivesDir, { recursive: true }); - await fsp.mkdir(this.tempdir, { recursive: true }); - await fsp.mkdir(this.tempCdxDir, { recursive: true }); + + if (!this.params.dryRun) { + await fsp.mkdir(this.archivesDir, { recursive: true }); + await fsp.mkdir(this.tempdir, { recursive: true }); + await fsp.mkdir(this.tempCdxDir, { recursive: true }); + } this.logFH = fs.createWriteStream(this.logFilename, { flags: "a" }); logger.setExternalLogStream(this.logFH); this.infoString = await getInfoString(); + setWARCInfo(this.infoString, this.params.warcInfo); logger.info(this.infoString); logger.info("Seeds", this.seeds); @@ -515,10 +520,10 @@ export class Crawler { ); } - if (this.params.screenshot) { + if (this.params.screenshot && !this.params.dryRun) { this.screenshotWriter = this.createExtraResourceWarcWriter("screenshots"); } - if (this.params.text) { + if (this.params.text && !this.params.dryRun) { this.textWriter = this.createExtraResourceWarcWriter("text"); } } @@ -788,7 +793,7 @@ self.__bx_behaviors.selectMainBehavior(); async crawlPage(opts: WorkerState): Promise { await this.writeStats(); - const { page, data, workerid, callbacks, directFetchCapture } = opts; + const { page, cdp, data, workerid, callbacks, directFetchCapture } = opts; data.callbacks = callbacks; const { url } = data; @@ -797,35 +802,27 @@ self.__bx_behaviors.selectMainBehavior(); data.logDetails = logDetails; data.workerid = workerid; - data.isHTMLPage = await timedRun( - this.isHTML(url, logDetails), - FETCH_TIMEOUT_SECS, - "HEAD request to determine if URL is HTML page timed out", - logDetails, - "fetch", - true, - ); - - if (!data.isHTMLPage && directFetchCapture) { + if (directFetchCapture) { try { - const { fetched, mime } = await timedRun( - directFetchCapture(url), + const { fetched, mime, ts } = await timedRun( + directFetchCapture({ url, headers: this.headers, cdp }), FETCH_TIMEOUT_SECS, "Direct fetch capture attempt timed out", logDetails, "fetch", true, ); + if (mime) { + data.mime = mime; + data.isHTMLPage = isHTMLContentType(mime); + } if (fetched) { data.loadState = LoadState.FULL_PAGE_LOADED; - if (mime) { - data.mime = mime; - } data.status = 200; - data.ts = new Date(); + data.ts = ts || new Date(); logger.info( "Direct fetch successful", - { url, ...logDetails }, + { url, mime, ...logDetails }, "fetch", ); return; @@ -1105,30 +1102,10 @@ self.__bx_behaviors.selectMainBehavior(); return res ? frame : null; } - async createWARCInfo(filename: string) { - const warcVersion = "WARC/1.1"; - const type = "warcinfo"; - - const info = { - software: this.infoString, - format: "WARC File Format 1.1", - }; - - const warcInfo = { ...info, ...this.params.warcInfo }; - const record = await warcio.WARCRecord.createWARCInfo( - { filename, type, warcVersion }, - warcInfo, - ); - const buffer = await warcio.WARCSerializer.serialize(record, { - gzip: true, - }); - return buffer; - } - async checkLimits() { let interrupt = false; - const size = await getDirSize(this.archivesDir); + const size = this.params.dryRun ? 
0 : await getDirSize(this.archivesDir); await this.crawlState.setArchiveSize(size); @@ -1153,7 +1130,11 @@ self.__bx_behaviors.selectMainBehavior(); if (this.params.diskUtilization) { // Check that disk usage isn't already or soon to be above threshold - const diskUtil = await checkDiskUtilization(this.params, size); + const diskUtil = await checkDiskUtilization( + this.collDir, + this.params, + size, + ); if (diskUtil.stop === true) { interrupt = true; } @@ -1328,7 +1309,7 @@ self.__bx_behaviors.selectMainBehavior(); emulateDevice: this.emulateDevice, swOpt: this.params.serviceWorker, chromeOptions: { - proxy: false, + proxy: this.proxyServer, userAgent: this.emulateDevice.userAgent, extraArgs: this.extraChromeArgs(), }, @@ -1424,11 +1405,11 @@ self.__bx_behaviors.selectMainBehavior(); } async postCrawl() { - if (this.params.combineWARC) { + if (this.params.combineWARC && !this.params.dryRun) { await this.combineWARC(); } - if (this.params.generateCDX) { + if (this.params.generateCDX && !this.params.dryRun) { logger.info("Generating CDX"); await fsp.mkdir(path.join(this.collDir, "indexes"), { recursive: true }); await this.crawlState.setStatus("generate-cdx"); @@ -1460,6 +1441,7 @@ self.__bx_behaviors.selectMainBehavior(); if ( this.params.generateWACZ && + !this.params.dryRun && (!this.interrupted || this.finalExit || this.uploadAndDeleteLocal) ) { const uploaded = await this.generateWACZ(); @@ -1775,7 +1757,7 @@ self.__bx_behaviors.selectMainBehavior(); const contentType = resp.headers()["content-type"]; - isHTMLPage = this.isHTMLContentType(contentType); + isHTMLPage = isHTMLContentType(contentType); if (contentType) { data.mime = contentType.split(";")[0]; @@ -1923,7 +1905,9 @@ self.__bx_behaviors.selectMainBehavior(); "behavior", ); try { - await frame.evaluate("self.__bx_behaviors.awaitPageLoad();"); + await frame.evaluate( + "self.__bx_behaviors && self.__bx_behaviors.awaitPageLoad();", + ); } catch (e) { logger.warn("Waiting for custom page load failed", e, "behavior"); } @@ -2186,11 +2170,13 @@ self.__bx_behaviors.selectMainBehavior(); let { ts } = state; if (!ts) { ts = new Date(); - logger.warn( - "Page date missing, setting to now", - { url, ts }, - "pageStatus", - ); + if (!this.params.dryRun) { + logger.warn( + "Page date missing, setting to now", + { url, ts }, + "pageStatus", + ); + } } row.ts = ts.toISOString(); @@ -2241,49 +2227,6 @@ self.__bx_behaviors.selectMainBehavior(); } } - resolveAgent(urlParsed: URL) { - return urlParsed.protocol === "https:" ? 
HTTPS_AGENT : HTTP_AGENT; - } - - async isHTML(url: string, logDetails: LogDetails) { - try { - const resp = await fetch(url, { - method: "HEAD", - headers: this.headers, - agent: this.resolveAgent, - // eslint-disable-next-line @typescript-eslint/no-explicit-any - } as any); - if (resp.status !== 200) { - logger.debug("HEAD response code != 200, loading in browser", { - status: resp.status, - ...logDetails, - }); - return true; - } - - return this.isHTMLContentType(resp.headers.get("Content-Type")); - } catch (e) { - // can't confirm not html, so try in browser - logger.debug("HEAD request failed", { ...formatErr(e), ...logDetails }); - return true; - } - } - - isHTMLContentType(contentType: string | null) { - // just load if no content-type - if (!contentType) { - return true; - } - - const mime = contentType.split(";")[0]; - - if (HTML_TYPES.includes(mime)) { - return true; - } - - return false; - } - async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) { if (!sitemap) { return; @@ -2441,7 +2384,7 @@ self.__bx_behaviors.selectMainBehavior(); generatedCombinedWarcs.push(combinedWarcName); - const warcBuffer = await this.createWARCInfo(combinedWarcName); + const warcBuffer = await createWARCInfo(combinedWarcName); fh.write(warcBuffer); } diff --git a/src/create-login-profile.ts b/src/create-login-profile.ts index 047ed017..2f19f0ea 100755 --- a/src/create-login-profile.ts +++ b/src/create-login-profile.ts @@ -99,9 +99,10 @@ function cliOpts(): { [key: string]: Options } { default: getDefaultWindowSize(), }, - proxy: { - type: "boolean", - default: false, + proxyServer: { + describe: + "if set, will use specified proxy server. Takes precedence over any env var proxy settings", + type: "string", }, cookieDays: { @@ -179,7 +180,7 @@ async function main() { headless: params.headless, signals: false, chromeOptions: { - proxy: false, + proxy: params.proxyServer, extraArgs: [ "--window-position=0,0", `--window-size=${params.windowSize}`, diff --git a/src/util/argParser.ts b/src/util/argParser.ts index c330e820..8967d272 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -545,6 +545,18 @@ class ArgParser { default: "disabled", }, + proxyServer: { + describe: + "if set, will use specified proxy server. Takes precedence over any env var proxy settings", + type: "string", + }, + + dryRun: { + describe: + "If true, no archive data is written to disk, only pages and logs (and optionally saved state).", + type: "boolean", + }, + qaSource: { describe: "Required for QA mode. Source (WACZ or multi WACZ) for QA", type: "string", diff --git a/src/util/blockrules.ts b/src/util/blockrules.ts index a0fa1ebe..3f258b33 100644 --- a/src/util/blockrules.ts +++ b/src/util/blockrules.ts @@ -4,6 +4,8 @@ import { logger, formatErr } from "./logger.js"; import { HTTPRequest, Page } from "puppeteer-core"; import { Browser } from "./browser.js"; +import { fetch } from "undici"; + const RULE_TYPES = ["block", "allowOnly"]; const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"]; diff --git a/src/util/browser.ts b/src/util/browser.ts index 65500cb4..73a74222 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -22,7 +22,7 @@ import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core"; import { Recorder } from "./recorder.js"; type BtrixChromeOpts = { - proxy?: boolean; + proxy?: string; userAgent?: string | null; extraArgs?: string[]; }; @@ -115,7 +115,6 @@ export class Browser { ? 
undefined : (target) => this.targetFilter(target), }; - await this._init(launchOpts, ondisconnect, recording); } @@ -217,7 +216,7 @@ export class Browser { } chromeArgs({ - proxy = true, + proxy = "", userAgent = null, extraArgs = [], }: BtrixChromeOpts) { @@ -236,11 +235,13 @@ export class Browser { ...extraArgs, ]; + if (proxy) { + logger.info("Using proxy", { proxy }, "browser"); + } + if (proxy) { args.push("--ignore-certificate-errors"); - args.push( - `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`, - ); + args.push(`--proxy-server=${proxy}`); } return args; diff --git a/src/util/proxy.ts b/src/util/proxy.ts new file mode 100644 index 00000000..c7fe2a85 --- /dev/null +++ b/src/util/proxy.ts @@ -0,0 +1,60 @@ +import { Dispatcher, ProxyAgent, setGlobalDispatcher } from "undici"; + +import { socksDispatcher } from "fetch-socks"; +import type { SocksProxyType } from "socks/typings/common/constants.js"; + +export function getEnvProxyUrl() { + if (process.env.PROXY_SERVER) { + return process.env.PROXY_SERVER; + } + + // for backwards compatibility with 0.x proxy settings + if (process.env.PROXY_HOST && process.env.PROXY_PORT) { + return `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`; + } + + return ""; +} + +export function initProxy(proxy?: string): string { + if (!proxy) { + proxy = getEnvProxyUrl(); + } + if (proxy) { + const dispatcher = createDispatcher(proxy); + if (dispatcher) { + setGlobalDispatcher(dispatcher); + return proxy; + } + } + return ""; +} + +export function createDispatcher(proxyUrl: string): Dispatcher | undefined { + if (proxyUrl.startsWith("http://") || proxyUrl.startsWith("https://")) { + // HTTP PROXY does not support auth, as it's not supported in the browser + // so must drop username/password for consistency + const url = new URL(proxyUrl); + url.username = ""; + url.password = ""; + return new ProxyAgent({ uri: url.href }); + } else if ( + proxyUrl.startsWith("socks://") || + proxyUrl.startsWith("socks5://") || + proxyUrl.startsWith("socks4://") + ) { + // support auth as SOCKS5 auth *is* supported in Brave (though not in Chromium) + const url = new URL(proxyUrl); + const type: SocksProxyType = url.protocol === "socks4:" ? 
4 : 5; + const params = { + type, + host: url.hostname, + port: parseInt(url.port), + userId: url.username || undefined, + password: url.password || undefined, + }; + return socksDispatcher(params); + } else { + return undefined; + } +} diff --git a/src/util/recorder.ts b/src/util/recorder.ts index e6789edd..bc501de4 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -6,7 +6,7 @@ import PQueue from "p-queue"; import { logger, formatErr } from "./logger.js"; import { sleep, timedRun, timestampNow } from "./timing.js"; -import { RequestResponseInfo } from "./reqresp.js"; +import { RequestResponseInfo, isHTMLContentType } from "./reqresp.js"; import { fetch, Response } from "undici"; @@ -77,11 +77,23 @@ export type AsyncFetchOptions = { filter?: (resp: Response) => boolean; ignoreDupe?: boolean; maxFetchSize?: number; + manualRedirect?: boolean; }; // ================================================================= -export type ResponseStreamAsyncFetchOptions = AsyncFetchOptions & { +export type DirectFetchRequest = { + url: string; + headers: Record; cdp: CDPSession; +}; + +// ================================================================= +export type NetworkLoadAsyncFetchOptions = AsyncFetchOptions & { + cdp: CDPSession; +}; + +// ================================================================= +export type ResponseStreamAsyncFetchOptions = NetworkLoadAsyncFetchOptions & { requestId: string; }; @@ -1068,12 +1080,23 @@ export class Recorder { this.writer.writeRecordPair(responseRecord, requestRecord); } - async directFetchCapture( - url: string, - ): Promise<{ fetched: boolean; mime: string }> { + async directFetchCapture({ url, headers, cdp }: DirectFetchRequest): Promise<{ + fetched: boolean; + mime: string; + ts: Date; + }> { const reqresp = new RequestResponseInfo("0"); + const ts = new Date(); + + const cookie = await this.getCookieString(cdp, url); + if (cookie) { + headers["Cookie"] = cookie; + } + reqresp.url = url; reqresp.method = "GET"; + reqresp.requestHeaders = headers; + reqresp.ts = ts; logger.debug( "Directly fetching page URL without browser", @@ -1081,8 +1104,21 @@ export class Recorder { "recorder", ); - const filter = (resp: Response) => - resp.status === 200 && !resp.headers.get("set-cookie"); + let mime: string = ""; + + const filter = (resp: Response) => { + // only direct load 200 responses + if (resp.status !== 200) { + return false; + } + + const ct = resp.headers.get("content-type"); + if (ct) { + mime = ct.split(";")[0]; + } + + return !isHTMLContentType(mime); + }; // ignore dupes: if previous URL was not a page, still load as page. 
if previous was page, // should not get here, as dupe pages tracked via seen list @@ -1093,16 +1129,28 @@ export class Recorder { networkId: "0", filter, ignoreDupe: true, + manualRedirect: true, }); const res = await fetcher.load(); - const mime = - (reqresp.responseHeaders && - reqresp.responseHeaders["content-type"] && - reqresp.responseHeaders["content-type"].split(";")[0]) || - ""; + this.addPageRecord(reqresp); - return { fetched: res === "fetched", mime }; + if (url === this.pageUrl && !this.pageInfo.ts) { + logger.debug("Setting page timestamp", { ts, url }); + this.pageInfo.ts = ts; + } + + return { fetched: res === "fetched", mime, ts }; + } + + async getCookieString(cdp: CDPSession, url: string) { + const cookieList: string[] = []; + const { cookies } = await cdp.send("Network.getCookies", { urls: [url] }); + for (const { name, value } of cookies) { + cookieList.push(`${name}=${value}`); + } + + return cookieList.join(";"); } } @@ -1121,6 +1169,8 @@ class AsyncFetcher { tempdir: string; filename: string; + manualRedirect = false; + constructor({ tempdir, reqresp, @@ -1130,6 +1180,7 @@ class AsyncFetcher { filter = undefined, ignoreDupe = false, maxFetchSize = MAX_BROWSER_DEFAULT_FETCH_SIZE, + manualRedirect = false, }: AsyncFetchOptions) { this.reqresp = reqresp; this.reqresp.expectedSize = expectedSize; @@ -1148,6 +1199,8 @@ class AsyncFetcher { ); this.maxFetchSize = maxFetchSize; + + this.manualRedirect = manualRedirect; } async load() { @@ -1283,9 +1336,9 @@ class AsyncFetcher { reqresp.status = 0; reqresp.errorText = e.message; } finally { + recorder.addPageRecord(reqresp); // exclude direct fetch request with fake id if (networkId !== "0") { - recorder.addPageRecord(reqresp); recorder.removeReqResp(networkId); } } @@ -1313,6 +1366,7 @@ class AsyncFetcher { headers, body: reqresp.postData || undefined, signal, + redirect: this.manualRedirect ? 
"manual" : "follow", }); if (this.filter && !this.filter(resp) && abort) { @@ -1329,6 +1383,7 @@ class AsyncFetcher { } if (reqresp.expectedSize === 0) { + reqresp.fillFetchResponse(resp); reqresp.payload = new Uint8Array(); return; } else if (!resp.body) { @@ -1428,7 +1483,7 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher { class NetworkLoadStreamAsyncFetcher extends AsyncFetcher { cdp: CDPSession; - constructor(opts: ResponseStreamAsyncFetchOptions) { + constructor(opts: NetworkLoadAsyncFetchOptions) { super(opts); this.cdp = opts.cdp; } diff --git a/src/util/reqresp.ts b/src/util/reqresp.ts index 53adce4c..606de66c 100644 --- a/src/util/reqresp.ts +++ b/src/util/reqresp.ts @@ -3,7 +3,7 @@ import { getStatusText } from "@webrecorder/wabac/src/utils.js"; import { Protocol } from "puppeteer-core"; import { postToGetUrl } from "warcio"; - +import { HTML_TYPES } from "./constants.js"; import { Response } from "undici"; const CONTENT_LENGTH = "content-length"; @@ -150,10 +150,15 @@ export class RequestResponseInfo { } } + isRedirectStatus() { + return this.status >= 300 && this.status < 400 && this.status !== 304; + } + isSelfRedirect() { - if (this.status < 300 || this.status >= 400 || this.status === 304) { + if (!this.isRedirectStatus()) { return false; } + try { const headers = new Headers(this.getResponseHeadersDict()); const location = headers.get("location") || ""; @@ -365,3 +370,18 @@ export class RequestResponseInfo { return value.replace(/\n/g, ", "); } } + +export function isHTMLContentType(contentType: string | null) { + // just load if no content-type + if (!contentType) { + return true; + } + + const mime = contentType.split(";")[0]; + + if (HTML_TYPES.includes(mime)) { + return true; + } + + return false; +} diff --git a/src/util/sitemapper.ts b/src/util/sitemapper.ts index 5d8507b5..e34a9bf1 100644 --- a/src/util/sitemapper.ts +++ b/src/util/sitemapper.ts @@ -9,6 +9,8 @@ import { logger, formatErr } from "./logger.js"; import { DETECT_SITEMAP } from "./constants.js"; import { sleep } from "./timing.js"; +import { fetch, Response } from "undici"; + const SITEMAP_CONCURRENCY = 5; const TEXT_CONTENT_TYPE = ["text/plain"]; @@ -237,7 +239,8 @@ export class SitemapReader extends EventEmitter { resp.headers.get("content-encoding") !== "gzip" ) { const ds = new DecompressionStream("gzip"); - stream = body.pipeThrough(ds); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + stream = body.pipeThrough(ds as any); } else { stream = body; } diff --git a/src/util/state.ts b/src/util/state.ts index e25fa01e..bb8379f2 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -66,7 +66,7 @@ export class PageState { callbacks: PageCallbacks = {}; - isHTMLPage?: boolean; + isHTMLPage = true; text?: string; screenshotView?: Buffer; favicon?: string; diff --git a/src/util/storage.ts b/src/util/storage.ts index f89d3860..b37f2aaf 100644 --- a/src/util/storage.ts +++ b/src/util/storage.ts @@ -202,6 +202,7 @@ export async function getDirSize(dir: string) { } export async function checkDiskUtilization( + collDir: string, // TODO: Fix this the next time the file is edited. 
// eslint-disable-next-line @typescript-eslint/no-explicit-any params: Record, @@ -209,7 +210,7 @@ export async function checkDiskUtilization( dfOutput = null, ) { const diskUsage: Record = await getDiskUsage( - "/crawls", + collDir, dfOutput, ); const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1)); diff --git a/src/util/warcwriter.ts b/src/util/warcwriter.ts index 107b30ce..2a14552f 100644 --- a/src/util/warcwriter.ts +++ b/src/util/warcwriter.ts @@ -11,6 +11,8 @@ import PQueue from "p-queue"; const DEFAULT_ROLLOVER_SIZE = 1_000_000_000; +let warcInfo = {}; + export type ResourceRecordData = { buffer: Uint8Array; resourceType: string; @@ -117,6 +119,8 @@ export class WARCWriter implements IndexerOffsetLength { ); } + fh.write(await createWARCInfo(this.filename)); + return fh; } @@ -310,6 +314,33 @@ export class WARCWriter implements IndexerOffsetLength { } } +// ================================================================= +export function setWARCInfo( + software: string, + otherParams?: Record, +) { + warcInfo = { + software, + format: "WARC File Format 1.1", + ...otherParams, + }; +} + +// ================================================================= +export async function createWARCInfo(filename: string) { + const warcVersion = "WARC/1.1"; + const type = "warcinfo"; + + const record = await WARCRecord.createWARCInfo( + { filename, type, warcVersion }, + warcInfo, + ); + const buffer = await WARCSerializer.serialize(record, { + gzip: true, + }); + return buffer; +} + // ================================================================= export function streamFinish(fh: Writable) { const p = new Promise((resolve) => { diff --git a/src/util/worker.ts b/src/util/worker.ts index 27904c66..86f96bc5 100644 --- a/src/util/worker.ts +++ b/src/util/worker.ts @@ -2,7 +2,7 @@ import os from "os"; import { logger, formatErr } from "./logger.js"; import { sleep, timedRun } from "./timing.js"; -import { Recorder } from "./recorder.js"; +import { DirectFetchRequest, Recorder } from "./recorder.js"; import { rxEscape } from "./seeds.js"; import { CDPSession, Page } from "puppeteer-core"; import { PageState, WorkerId } from "./state.js"; @@ -20,8 +20,10 @@ export type WorkerOpts = { workerid: WorkerId; // eslint-disable-next-line @typescript-eslint/ban-types callbacks: Record; - directFetchCapture?: - | ((url: string) => Promise<{ fetched: boolean; mime: string }>) + directFetchCapture: + | (( + request: DirectFetchRequest, + ) => Promise<{ fetched: boolean; mime: string; ts: Date }>) | null; frameIdToExecId: Map; }; @@ -171,7 +173,7 @@ export class PageWorker { this.cdp = cdp; this.callbacks = {}; const directFetchCapture = this.recorder - ? (x: string) => this.recorder!.directFetchCapture(x) + ? 
(req: DirectFetchRequest) => this.recorder!.directFetchCapture(req) : null; this.opts = { page, diff --git a/tests/dryrun.test.js b/tests/dryrun.test.js new file mode 100644 index 00000000..ffb2f216 --- /dev/null +++ b/tests/dryrun.test.js @@ -0,0 +1,18 @@ +import child_process from "child_process"; +import fs from "fs"; + +test("ensure dryRun crawl only writes pages and logs", async () => { + child_process.execSync( + 'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --collection dry-run-wr-net --combineWARC --rolloverSize 10000 --limit 2 --title "test title" --description "test description" --warcPrefix custom-prefix --dryRun', + ); + + const files = fs.readdirSync("test-crawls/collections/dry-run-wr-net").sort(); + expect(files.length).toBe(2); + expect(files[0]).toBe("logs"); + expect(files[1]).toBe("pages"); +}); + + + + + diff --git a/tests/pdf-crawl.test.js b/tests/pdf-crawl.test.js index 00c314d7..3bc6c077 100644 --- a/tests/pdf-crawl.test.js +++ b/tests/pdf-crawl.test.js @@ -3,7 +3,7 @@ import fs from "fs"; import path from "path"; import { WARCParser } from "warcio"; -const PDF = "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf"; +const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf"; test("ensure pdf is crawled", async () => { child_process.execSync( diff --git a/tests/proxy.test.js b/tests/proxy.test.js new file mode 100644 index 00000000..1c162620 --- /dev/null +++ b/tests/proxy.test.js @@ -0,0 +1,127 @@ +import { execSync, exec } from "child_process"; + +const sleep = (ms) => new Promise((res) => setTimeout(res, ms)); + +const PROXY_IMAGE = "tarampampam/3proxy:1.9.1"; +const SOCKS_PORT = "1080"; +const HTTP_PORT = "3128"; +const WRONG_PORT = "33130"; + +const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf"; +const HTML = "https://webrecorder.net/"; + +const extraArgs = "--limit 1 --failOnFailedSeed --timeout 10 --logging debug"; + +let proxyAuthId; +let proxyNoAuthId; + +beforeAll(() => { + execSync("docker network create proxy-test-net"); + + proxyAuthId = execSync(`docker run -e PROXY_LOGIN=user -e PROXY_PASSWORD=passw0rd -d --rm --network=proxy-test-net --name proxy-with-auth ${PROXY_IMAGE}`, {encoding: "utf-8"}); + + proxyNoAuthId = execSync(`docker run -d --rm --network=proxy-test-net --name proxy-no-auth ${PROXY_IMAGE}`, {encoding: "utf-8"}); +}); + +afterAll(async () => { + execSync(`docker kill -s SIGINT ${proxyAuthId}`); + execSync(`docker kill -s SIGINT ${proxyNoAuthId}`); + await sleep(3000); + execSync("docker network rm proxy-test-net"); +}); + +describe("socks5 + https proxy tests", () => { + for (const scheme of ["socks5", "http"]) { + const port = scheme === "socks5" ? SOCKS_PORT : HTTP_PORT; + + for (const type of ["HTML page", "PDF"]) { + + const url = type === "PDF" ? 
PDF : HTML; + + test(`${scheme} proxy, ${type}, no auth`, () => { + let status = 0; + + try { + execSync(`docker run -e PROXY_SERVER=${scheme}://proxy-no-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(0); + }); + + test(`${scheme} proxy, ${type}, with auth`, () => { + let status = 0; + + try { + execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw0rd@proxy-with-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + // auth supported only for SOCKS5 + expect(status).toBe(scheme === "socks5" ? 0 : 1); + }); + + test(`${scheme} proxy, ${type}, wrong auth`, () => { + let status = 0; + + try { + execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw1rd@proxy-with-auth:${port} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(1); + }); + + test(`${scheme} proxy, ${type}, wrong protocol`, () => { + let status = 0; + + try { + execSync(`docker run -e PROXY_SERVER=${scheme}://user:passw1rd@proxy-with-auth:${scheme === "socks5" ? HTTP_PORT : SOCKS_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${url} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(1); + }); + } + + test(`${scheme} proxy, proxy missing error`, () => { + let status = 0; + + try { + execSync(`docker run -e PROXY_SERVER=${scheme}://proxy-no-auth:${WRONG_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${HTML} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(1); + }); + } +}); + + +test("http proxy, PDF, separate env vars", () => { + execSync(`docker run -e PROXY_HOST=proxy-no-auth -e PROXY_PORT=${HTTP_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); +}); + +test("http proxy set, but not running, separate env vars", () => { + let status = 0; + + try { + execSync(`docker run -e PROXY_HOST=proxy-no-auth -e PROXY_PORT=${WRONG_PORT} --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(1); +}); + +test("http proxy set, but not running, cli arg", () => { + let status = 0; + + try { + execSync(`docker run --rm --network=proxy-test-net webrecorder/browsertrix-crawler crawl --proxyServer http://proxy-no-auth:${WRONG_PORT} --url ${PDF} ${extraArgs}`, {encoding: "utf-8"}); + } catch (e) { + status = e.status; + } + expect(status).toBe(1); +}); + + diff --git a/tests/storage.test.js b/tests/storage.test.js index a5c7f783..215f22c4 100644 --- a/tests/storage.test.js +++ b/tests/storage.test.js @@ -29,6 +29,7 @@ grpcfuse 1000000 285000 715000 28% /crawls`; // with combineWARC + generateWACZ, projected is 285k + 4 * 5k = 310k = 31% // does not exceed 90% threshold const returnValue = await checkDiskUtilization( + '/crawls', params, 5000 * 1024, mockDfOutput, @@ -55,6 +56,7 @@ grpcfuse 100000 85000 15000 85% /crawls`; // with generateWACZ, projected is 85k + 3k x 2 = 91k = 91% // exceeds 90% threshold const returnValue = await checkDiskUtilization( + '/crawls', params, 3000 * 1024, 
mockDfOutput, diff --git a/tests/warcinfo.test.js b/tests/warcinfo.test.js index 529a8379..da24c448 100644 --- a/tests/warcinfo.test.js +++ b/tests/warcinfo.test.js @@ -1,8 +1,11 @@ import fs from "fs"; import zlib from "zlib"; +import path from "path"; import child_process from "child_process"; -test("check that the warcinfo file works as expected on the command line", async () => { +test("run crawl", async() => { + let success = false; + try { const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8"); const proc = child_process.execSync( @@ -11,10 +14,42 @@ test("check that the warcinfo file works as expected on the command line", async ); console.log(proc); + success = true; } catch (error) { console.log(error); } + expect(success).toBe(true); +}); + +test("check that the warcinfo for individual WARC is as expected", async () => { + + const warcs = fs.readdirSync("test-crawls/collections/warcinfo/archive/"); + + let filename = ""; + + for (const name of warcs) { + if (name.startsWith("rec-")) { + filename = path.join("test-crawls/collections/warcinfo/archive/", name); + break; + } + } + + const warcData = fs.readFileSync(filename); + + const data = zlib.gunzipSync(warcData); + + const string = data.toString("utf8"); + + expect(string.indexOf("operator: test")).toBeGreaterThan(-1); + expect(string.indexOf("host: hostname")).toBeGreaterThan(-1); + expect( + string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/), + ).not.toEqual(null); + expect(string.indexOf("format: WARC File Format 1.1")).toBeGreaterThan(-1); +}); + +test("check that the warcinfo for combined WARC file is as expected", async () => { const warcData = fs.readFileSync( "test-crawls/collections/warcinfo/warcinfo_0.warc.gz", ); diff --git a/yarn.lock b/yarn.lock index 856543b5..097c0363 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2386,6 +2386,14 @@ fd-slicer@~1.1.0: dependencies: pend "~1.2.0" +fetch-socks@^1.3.0: + version "1.3.0" + resolved "https://registry.yarnpkg.com/fetch-socks/-/fetch-socks-1.3.0.tgz#1f07b26924b5e7370aa23fd6e9332a5863736d1b" + integrity sha512-Cq7O53hoNiVeOs6u54f8M/H/w2yzhmnTQ3tcAJj9FNKYOeNGmt8qNU1zpWOzJD09f0uqfmBXxLbzWPsnT6GcRw== + dependencies: + socks "^2.8.1" + undici "^6.10.1" + file-entry-cache@^6.0.1: version "6.0.1" resolved "https://registry.yarnpkg.com/file-entry-cache/-/file-entry-cache-6.0.1.tgz#211b2dd9659cb0394b073e7323ac3c933d522027" @@ -2778,6 +2786,14 @@ ioredis@^5.3.2: redis-parser "^3.0.0" standard-as-callback "^2.1.0" +ip-address@^9.0.5: + version "9.0.5" + resolved "https://registry.yarnpkg.com/ip-address/-/ip-address-9.0.5.tgz#117a960819b08780c3bd1f14ef3c1cc1d3f3ea5a" + integrity sha512-zHtQzGojZXTwZTHQqra+ETKd4Sn3vgi7uBmlPoXVWZqYvuKmtI0l/VZTjqGmJY9x88GGOaZ9+G9ES8hC4T4X8g== + dependencies: + jsbn "1.1.0" + sprintf-js "^1.1.3" + ip@^1.1.8: version "1.1.8" resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.8.tgz#ae05948f6b075435ed3307acce04629da8cdbf48" @@ -3427,6 +3443,11 @@ js-yaml@^4.1.0: dependencies: argparse "^2.0.1" +jsbn@1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/jsbn/-/jsbn-1.1.0.tgz#b01307cb29b618a1ed26ec79e911f803c4da0040" + integrity sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A== + jsesc@^2.5.1: version "2.5.2" resolved "https://registry.yarnpkg.com/jsesc/-/jsesc-2.5.2.tgz#80564d2e483dacf6e8ef209650a67df3f0c283a4" @@ -4437,6 +4458,14 @@ socks@^2.7.1: ip "^2.0.0" smart-buffer "^4.2.0" +socks@^2.8.1: + version "2.8.3" + resolved 
"https://registry.yarnpkg.com/socks/-/socks-2.8.3.tgz#1ebd0f09c52ba95a09750afe3f3f9f724a800cb5" + integrity sha512-l5x7VUUWbjVFbafGLxPWkYsHIhEvmF85tbIeFZWc8ZPtoMyybuEhL7Jye/ooC4/d48FgOjSJXgsF/AJPYCW8Zw== + dependencies: + ip-address "^9.0.5" + smart-buffer "^4.2.0" + source-map-support@0.5.13: version "0.5.13" resolved "https://registry.yarnpkg.com/source-map-support/-/source-map-support-0.5.13.tgz#31b24a9c2e73c2de85066c0feb7d44767ed52932" @@ -4455,6 +4484,11 @@ split-on-first@^1.0.0: resolved "https://registry.yarnpkg.com/split-on-first/-/split-on-first-1.1.0.tgz#f610afeee3b12bce1d0c30425e76398b78249a5f" integrity sha512-43ZssAJaMusuKWL8sKUBQXHWOpq8d6CfN/u1p4gUzfJkM05C8rxTmYrkIPTXapZpORA6LkkzcUulJ8FqA7Uudw== +sprintf-js@^1.1.3: + version "1.1.3" + resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.1.3.tgz#4914b903a2f8b685d17fdf78a70e917e872e444a" + integrity sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA== + sprintf-js@~1.0.2: version "1.0.3" resolved "https://registry.yarnpkg.com/sprintf-js/-/sprintf-js-1.0.3.tgz#04e6926f662895354f3dd015203633b857297e2c" @@ -4842,7 +4876,7 @@ undici-types@~5.25.1: resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-5.25.3.tgz#e044115914c85f0bcbb229f346ab739f064998c3" integrity sha512-Ga1jfYwRn7+cP9v8auvEXN1rX3sWqlayd4HP7OKk4mZWylEmu3KzXDUGrQUN6Ol7qo1gPvB2e5gX6udnyEPgdA== -undici@^6.18.2: +undici@^6.10.1, undici@^6.18.2: version "6.18.2" resolved "https://registry.yarnpkg.com/undici/-/undici-6.18.2.tgz#f662a5dc33cf654fc412a9912e5a07b138d75c97" integrity sha512-o/MQLTwRm9IVhOqhZ0NQ9oXax1ygPjw6Vs+Vq/4QRjbOAC3B1GCHy7TYxxbExKlb7bzDRzt9vBWU6BDz0RFfYg==