diff --git a/Dockerfile b/Dockerfile
index ca8cb670..6ba5fe8b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-ARG BROWSER_VERSION=1.80.125
+ARG BROWSER_VERSION=1.82.170
 ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base:brave-${BROWSER_VERSION}

 FROM ${BROWSER_IMAGE_BASE}
@@ -39,7 +39,7 @@ ADD config/ /app/
 ADD html/ /app/html/

-ARG RWP_VERSION=2.3.17
+ARG RWP_VERSION=2.3.19
 ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/
 ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
 ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz
diff --git a/docs/docs/user-guide/behaviors.md b/docs/docs/user-guide/behaviors.md
index b0c34a61..f585ef18 100644
--- a/docs/docs/user-guide/behaviors.md
+++ b/docs/docs/user-guide/behaviors.md
@@ -266,6 +266,7 @@ Some of these functions which may be of use to behaviors authors are:
 - `scrollToOffset`: scroll to particular offset
 - `scrollIntoView`: smoothly scroll particular element into view
 - `getState`: increment a state counter and return all state counters + string message
+- `addLink`: add a given URL to the crawl queue

 More detailed references will be added in the future.
diff --git a/package.json b/package.json
index fd4c263d..debd1ec9 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "1.8.0-beta.0",
+  "version": "1.8.1",
   "main": "browsertrix-crawler",
   "type": "module",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
@@ -17,8 +17,8 @@
   },
   "dependencies": {
     "@novnc/novnc": "1.4.0",
-    "@puppeteer/replay": "^3.1.1",
-    "@webrecorder/wabac": "^2.23.11",
+    "@puppeteer/replay": "^3.1.3",
+    "@webrecorder/wabac": "^2.24.1",
     "browsertrix-behaviors": "^0.9.2",
     "client-zip": "^2.4.5",
     "css-selector-parser": "^3.0.5",
@@ -33,13 +33,13 @@
     "p-queue": "^7.3.4",
     "pixelmatch": "^5.3.0",
     "pngjs": "^7.0.0",
-    "puppeteer-core": "^24.7.2",
+    "puppeteer-core": "^24.22.0",
     "sax": "^1.3.0",
     "sharp": "^0.32.6",
     "tsc": "^2.0.4",
     "undici": "^6.18.2",
     "uuid": "8.3.2",
-    "warcio": "^2.4.5",
+    "warcio": "^2.4.7",
     "ws": "^7.4.4",
     "yargs": "^17.7.2"
   },
@@ -71,7 +71,7 @@
   },
   "resolutions": {
     "wrap-ansi": "7.0.0",
-    "warcio": "^2.4.5",
+    "warcio": "^2.4.7",
     "@novnc/novnc": "1.4.0"
   }
 }
diff --git a/src/crawler.ts b/src/crawler.ts
index 39b1b2f5..d815c3fd 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -129,8 +129,6 @@ export class Crawler {
   limitHit = false;
   pageLimit: number;

-  dupeSeedsFound = false;
-
   saveStateFiles: string[] = [];
   lastSaveTime: number;

@@ -854,31 +852,34 @@ self.__bx_behaviors.selectMainBehavior();
       await this.browser.addInitScript(page, initScript);
     }

-    // only add if running with autoclick behavior
-    if (this.params.behaviors.includes("autoclick")) {
-      // Ensure off-page navigation is canceled while behavior is running
-      page.on("dialog", async (dialog) => {
-        let accepted = true;
-        if (dialog.type() === "beforeunload") {
-          if (opts.pageBlockUnload) {
-            accepted = false;
-            await dialog.dismiss();
-          } else {
-            await dialog.accept();
-          }
+    // Handle JS dialogs:
+    // - Ensure off-page navigation is canceled while behavior is running
+    // - dismiss/close all other dialogs if not blocking unload
+    page.on("dialog", async (dialog) => {
+      let accepted = true;
+      if (dialog.type() === "beforeunload") {
+        if (opts.pageBlockUnload) {
+          accepted = false;
+          await dialog.dismiss();
         } else {
           await dialog.accept();
         }
-        logger.debug("JS Dialog", {
-          accepted,
-          blockingUnload: opts.pageBlockUnload,
-          message: dialog.message(),
-          type: dialog.type(),
-          page: page.url(),
-          workerid,
-        });
+      } else {
+        // other JS dialog, just dismiss
+        await dialog.dismiss();
+      }
+      logger.debug("JS Dialog", {
+        accepted,
+        blockingUnload: opts.pageBlockUnload,
+        message: dialog.message(),
+        type: dialog.type(),
+        page: page.url(),
+        workerid,
       });
+    });

+    // only add if running with autoclick behavior
+    if (this.params.behaviors.includes("autoclick")) {
       // Close any windows opened during navigation from autoclick
       await cdp.send("Target.setDiscoverTargets", { discover: true });
@@ -2487,10 +2488,6 @@ self.__bx_behaviors.selectMainBehavior();
       return false;

     case QueueState.DUPE_URL:
-      if (!this.dupeSeedsFound && depth === 0) {
-        logger.error("Duplicate seed URLs found and skipped");
-        this.dupeSeedsFound = true;
-      }
       logger.debug(
         "Page URL not queued, already seen",
         { url, ...logDetails },
diff --git a/src/create-login-profile.ts b/src/create-login-profile.ts
index 0dcbc608..66728c3d 100755
--- a/src/create-login-profile.ts
+++ b/src/create-login-profile.ts
@@ -339,7 +339,11 @@ async function createProfile(
   cdp: CDPSession,
   targetFilename = "",
 ) {
-  await cdp.send("Network.clearBrowserCache");
+  try {
+    await cdp.send("Network.clearBrowserCache");
+  } catch (e) {
+    logger.warn("Error clearing cache", e, "browser");
+  }

   await browser.close();

@@ -546,7 +550,8 @@ class InteractiveBrowser {
         return;
       }

-      const cookies = await this.browser.getCookies(this.page);
+      const cookies = await this.browser.getCookies();
+
       for (const cookieOrig of cookies) {
         // eslint-disable-next-line @typescript-eslint/no-explicit-any
         const cookie = cookieOrig as any;
@@ -566,7 +571,7 @@ class InteractiveBrowser {
           cookie.url = url;
         }
       }
-      await this.browser.setCookies(this.page, cookies);
+      await this.browser.setCookies(cookies);
       // eslint-disable-next-line @typescript-eslint/no-explicit-any
     } catch (e: any) {
       logger.error("Save Cookie Error: ", e);
diff --git a/src/util/browser.ts b/src/util/browser.ts
index 6e2cb6c7..73102d0d 100644
--- a/src/util/browser.ts
+++ b/src/util/browser.ts
@@ -22,6 +22,7 @@ import puppeteer, {
   Page,
   LaunchOptions,
   Viewport,
+  CookieData,
 } from "puppeteer-core";
 import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
 import { Recorder } from "./recorder.js";
@@ -616,14 +617,12 @@ export class Browser {
     await page.setViewport(params);
   }

-  async getCookies(page: Page) {
-    return await page.cookies();
+  async getCookies() {
+    return (await this.browser?.cookies()) || [];
   }

-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  async setCookies(page: Page, cookies: any) {
-    return await page.setCookie(...cookies);
+  async setCookies(cookies: CookieData[]) {
+    return await this.browser?.setCookie(...cookies);
   }
 }
diff --git a/src/util/flowbehavior.ts b/src/util/flowbehavior.ts
index 898bf232..82c502a5 100644
--- a/src/util/flowbehavior.ts
+++ b/src/util/flowbehavior.ts
@@ -368,7 +368,7 @@ class Flow {
       case StepType.DoubleClick:
         await locator(step)
           .setTimeout(timeout * 1000)
-          //.on('action', () => startWaitingForEvents())
+          .setEnsureElementIsInTheViewport(true)
           .click({
             count: 2,
             button: step.button && mouseButtonMap.get(step.button),
@@ -392,7 +392,7 @@

         await locator(step)
           .setTimeout(timeout * 1000)
-          //.on('action', () => startWaitingForEvents())
+          .setEnsureElementIsInTheViewport(true)
           .click({
             delay: step.duration,
             button: step.button && mouseButtonMap.get(step.button),
@@ -410,7 +410,7 @@
       case StepType.Hover:
         await locator(step)
           .setTimeout(timeout * 1000)
-          //.on('action', () => startWaitingForEvents())
+          .setEnsureElementIsInTheViewport(true)
           .hover();
         break;
@@ -426,15 +426,14 @@

       case StepType.Change:
         await locator(step)
-          //.on('action', () => startWaitingForEvents())
           .setTimeout(timeout * 1000)
+          .setEnsureElementIsInTheViewport(true)
           .fill(step.value);
         break;

       case StepType.Scroll: {
         if ("selectors" in step) {
           await locator(step)
-            //.on('action', () => startWaitingForEvents())
             .setTimeout(timeout * 1000)
             .scroll({
               scrollLeft: step.x || 0,
diff --git a/src/util/proxy.ts b/src/util/proxy.ts
index c5a0c2f4..93abc151 100644
--- a/src/util/proxy.ts
+++ b/src/util/proxy.ts
@@ -153,13 +153,6 @@ export async function initProxy(
     privateKeyFile = privateKeyFile || sshProxyPrivateKeyFile;
     publicHostsFile = publicHostsFile || sshProxyKnownHostsFile;

-    logger.debug("Initing proxy", {
-      url: getSafeProxyString(proxyUrl),
-      localPort,
-      privateKeyFile,
-      publicHostsFile,
-    });
-
     const entry = await initSingleProxy(
       proxyUrl,
       localPort++,
@@ -200,6 +193,13 @@ export async function initSingleProxy(
   sshProxyPrivateKeyFile?: string,
   sshProxyKnownHostsFile?: string,
 ): Promise<{ proxyUrl: string; dispatcher: Dispatcher }> {
+  logger.debug("Initing proxy", {
+    url: getSafeProxyString(proxyUrl),
+    localPort,
+    sshProxyPrivateKeyFile,
+    sshProxyKnownHostsFile,
+  });
+
   if (proxyUrl && proxyUrl.startsWith("ssh://")) {
     proxyUrl = await runSSHD(
       proxyUrl,
diff --git a/tests/basic_crawl.test.js b/tests/basic_crawl.test.js
index 6fc00af6..940967ab 100644
--- a/tests/basic_crawl.test.js
+++ b/tests/basic_crawl.test.js
@@ -8,7 +8,7 @@ const testIf = (condition, ...args) => condition ? test(...args) : test.skip(...
test("ensure basic crawl run with docker run passes", async () => { child_process.execSync( - 'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --warcPrefix custom-prefix', + 'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --warcPrefix custom-prefix', ); child_process.execSync( diff --git a/tests/custom-behavior.test.js b/tests/custom-behavior.test.js index b49ea1d8..2a7f5fc7 100644 --- a/tests/custom-behavior.test.js +++ b/tests/custom-behavior.test.js @@ -1,6 +1,21 @@ import child_process from "child_process"; import Redis from "ioredis"; +let proc = null; + +const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal"; +const TEST_HOST = `http://${DOCKER_HOST_NAME}:31503`; + +beforeAll(() => { + proc = child_process.spawn("../../node_modules/.bin/http-server", ["-p", "31503"], {cwd: "tests/custom-behaviors/"}); +}); + +afterAll(() => { + if (proc) { + proc.kill(); + } +}); + async function sleep(time) { await new Promise((resolve) => setTimeout(resolve, time)); @@ -9,7 +24,7 @@ async function sleep(time) { test("test custom behaviors from local filepath", async () => { const res = child_process.execSync( - "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page", + "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example-com.webrecorder.net/page --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page", ); const log = res.toString(); @@ -21,10 +36,10 @@ test("test custom behaviors from local filepath", async () => { ) > 0, ).toBe(true); - // but not for example.org + // but not for example.com expect( log.indexOf( - '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://example.org","workerid":0}}', + '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://example-com.webrecorder.net/page","workerid":0}}', ) > 0, ).toBe(false); @@ -37,7 +52,7 @@ test("test custom behaviors from local filepath", async () => { }); test("test custom behavior from URL", async () => { - const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --scopeType page"); + const res = child_process.execSync(`docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --customBehaviors ${TEST_HOST}/custom-2.js --scopeType page`); const log = res.toString(); @@ -51,7 +66,7 @@ test("test custom behavior from URL", async () => { }); test("test mixed custom behavior sources", async 
-  const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page");
+  const res = child_process.execSync(`docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors ${TEST_HOST}/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page`);

   const log = res.toString();
@@ -74,7 +89,7 @@ test("test mixed custom behavior sources", async () => {

 test("test custom behaviors from git repo", async () => {
   const res = child_process.execSync(
-    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors \"git+https://github.com/webrecorder/browsertrix-crawler.git?branch=main&path=tests/custom-behaviors\" --scopeType page",
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://example-com.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors \"git+https://github.com/webrecorder/browsertrix-crawler.git?branch=main&path=tests/custom-behaviors\" --scopeType page",
   );

   const log = res.toString();
@@ -86,10 +101,10 @@ test("test custom behaviors from git repo", async () => {
     ) > 0,
   ).toBe(true);

-  // but not for example.org
+  // but not for example.com
   expect(
     log.indexOf(
-      '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://example.org/","workerid":0}}',
+      '"logLevel":"info","context":"behaviorScriptCustom","message":"test-stat","details":{"state":{},"behavior":"TestBehavior","page":"https://example-com.webrecorder.net/","workerid":0}}',
     ) > 0,
   ).toBe(false);

@@ -106,7 +121,7 @@ test("test invalid behavior exit", async () => {

   try {
     child_process.execSync(
-      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/invalid-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/invalid-export.js --scopeType page",
+      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/invalid-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net.webrecorder.net/ --url https://example-com.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors /custom-behaviors/invalid-export.js --scopeType page",
     );
   } catch (e) {
     status = e.status;
@@ -121,7 +136,7 @@ test("test crawl exits if behavior not fetched from url", async () => {

   try {
     child_process.execSync(
-      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com --customBehaviors https://webrecorder.net/doesntexist/custombehavior.js --scopeType page",
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net --customBehaviors https://webrecorder.net/doesntexist/custombehavior.js --scopeType page",
     );
   } catch (e) {
     status = e.status;
@@ -136,7 +151,7 @@ test("test crawl exits if behavior not fetched from git repo", async () => {

   try {
     child_process.execSync(
-      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com --customBehaviors git+https://github.com/webrecorder/doesntexist --scopeType page",
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net --customBehaviors git+https://github.com/webrecorder/doesntexist --scopeType page",
     );
   } catch (e) {
     status = e.status;
@@ -151,7 +166,7 @@ test("test crawl exits if not custom behaviors collected from local path", async

   try {
     child_process.execSync(
-      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com --customBehaviors /custom-behaviors/doesntexist --scopeType page",
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net --customBehaviors /custom-behaviors/doesntexist --scopeType page",
     );
   } catch (e) {
     status = e.status;
@@ -166,7 +181,7 @@ test("test pushing behavior logs to redis", async () => {

   const redisId = child_process.execSync("docker run --rm --network=crawl -p 36399:6379 --name redis -d redis");

-  const child = child_process.exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ -e CRAWL_ID=behavior-logs-redis-test --network=crawl --rm webrecorder/browsertrix-crawler crawl --debugAccessRedis --redisStoreUrl redis://redis:6379 --url https://specs.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page --logBehaviorsToRedis");
+  const child = child_process.exec(`docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ -e CRAWL_ID=behavior-logs-redis-test --network=crawl --rm webrecorder/browsertrix-crawler crawl --debugAccessRedis --redisStoreUrl redis://redis:6379 --url https://specs.webrecorder.net/ --url https://old.webrecorder.net/ --customBehaviors ${TEST_HOST}/custom-2.js --customBehaviors /custom-behaviors/custom.js --scopeType page --logBehaviorsToRedis`);

   let resolve = null;
   const crawlFinished = new Promise(r => resolve = r);
diff --git a/tests/custom-behaviors/custom-flow.json b/tests/custom-behaviors/custom-flow.json
index 9a55b6f5..2b0f2e48 100644
--- a/tests/custom-behaviors/custom-flow.json
+++ b/tests/custom-behaviors/custom-flow.json
@@ -28,7 +28,7 @@
     },
     {
       "type": "change",
-      "value": "https://example.com/",
+      "value": "https://example-com.webrecorder.net/",
       "selectors": [
         [
           "aria/[role=\"main\"]",
diff --git a/tests/custom_selector.test.js b/tests/custom_selector.test.js
index c2516d6e..fff71427 100644
--- a/tests/custom_selector.test.js
+++ b/tests/custom_selector.test.js
@@ -43,8 +43,8 @@ test("test custom selector crawls JS files as pages", async () => {
   ]);

   const expectedExtraPages = new Set([
-    "https://www.iana.org/_js/jquery.js",
-    "https://www.iana.org/_js/iana.js",
+    "https://www.iana.org/static/_js/jquery.js",
+    "https://www.iana.org/static/_js/iana.js",
   ]);

   expect(pages).toEqual(expectedPages);
@@ -71,7 +71,7 @@ test("test valid autoclick selector passes validation", async () => {

   try {
     child_process.execSync(
-      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --clickSelector button --scopeType page",
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --clickSelector button --scopeType page",
     );
   } catch (e) {
     failed = true;
@@ -87,7 +87,7 @@ test("test invalid autoclick selector fails validation, crawl fails", async () =

   try {
     child_process.execSync(
-      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --clickSelector \",\" --scopeType page",
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --clickSelector \",\" --scopeType page",
     );
   } catch (e) {
     status = e.status;
diff --git a/tests/exclude-redirected.test.js b/tests/exclude-redirected.test.js
index aaa9decf..b81a0ef8 100644
--- a/tests/exclude-redirected.test.js
+++ b/tests/exclude-redirected.test.js
@@ -6,7 +6,7 @@ import { execSync } from "child_process";

 test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
   execSync(
-    "docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --exclude help --collection redir-exclude-test --extraHops 1");
+    "docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1");

   // no entries besides header
   expect(
diff --git a/tests/invalid-behaviors/invalid-export.js b/tests/invalid-behaviors/invalid-export.js
index 061f6ff4..d458b910 100644
--- a/tests/invalid-behaviors/invalid-export.js
+++ b/tests/invalid-behaviors/invalid-export.js
@@ -10,7 +10,7 @@ export class TestBehavior {
   }

   static isMatch() {
-    return window.location.origin === "https://example.com";
+    return window.location.origin === "https://example-com.webrecorder.net";
   }

   async *run(ctx) {
diff --git a/tests/retry-failed.test.js b/tests/retry-failed.test.js
index a1c21bdf..b914ad34 100644
--- a/tests/retry-failed.test.js
+++ b/tests/retry-failed.test.js
@@ -38,7 +38,7 @@ afterAll(() => {

 test("run crawl with retries for no response", async () => {
-  execSync(`docker run -d -v $PWD/test-crawls:/crawls -e CRAWL_ID=test -p 36387:6379 --rm webrecorder/browsertrix-crawler crawl --url http://invalid-host-x:31501 --url https://example.com/ --limit 2 --pageExtraDelay 10 --debugAccessRedis --collection retry-fail --retries 5`);
+  execSync(`docker run -d -v $PWD/test-crawls:/crawls -e CRAWL_ID=test -p 36387:6379 --rm webrecorder/browsertrix-crawler crawl --url http://invalid-host-x:31501 --url https://example-com.webrecorder.net/ --limit 2 --pageExtraDelay 10 --debugAccessRedis --collection retry-fail --retries 5`);

   const redis = new Redis("redis://127.0.0.1:36387/0", { lazyConnect: true, retryStrategy: () => null });

@@ -90,7 +90,7 @@ test("run crawl with retries for 503, enough retries to succeed", async () => {
   requests = 0;
   success = false;

-  const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --collection retry-fail-2 --retries 2 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);
+  const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example-com.webrecorder.net/ --limit 2 --collection retry-fail-2 --retries 2 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);

   let status = 0;

@@ -117,7 +117,7 @@ test("run crawl with retries for 503, not enough retries, fail", async () => {
   requests = 0;
   success = false;

-  const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --collection retry-fail-3 --retries 1 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);
+  const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example-com.webrecorder.net/ --limit 2 --collection retry-fail-3 --retries 1 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);

   let status = 0;

@@ -143,7 +143,7 @@ test("run crawl with retries for 503, no retries, fail", async () => {
   requests = 0;
   success = false;

-  const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --collection retry-fail-4 --retries 0 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);
+  const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example-com.webrecorder.net/ --limit 2 --collection retry-fail-4 --retries 0 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);

   let status = 0;

diff --git a/yarn.lock b/yarn.lock
index 49ca9c85..99848597 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -772,17 +772,17 @@
     tslib "^2.7.0"
     tsyringe "^4.8.0"

-"@puppeteer/browsers@2.10.2":
-  version "2.10.2"
-  resolved "https://registry.yarnpkg.com/@puppeteer/browsers/-/browsers-2.10.2.tgz#c2a63cee699c6b5b971b9fcba9095098970f1648"
-  integrity sha512-i4Ez+s9oRWQbNjtI/3+jxr7OH508mjAKvza0ekPJem0ZtmsYHP3B5dq62+IaBHKaGCOuqJxXzvFLUhJvQ6jtsQ==
+"@puppeteer/browsers@2.10.10":
+  version "2.10.10"
+  resolved "https://registry.yarnpkg.com/@puppeteer/browsers/-/browsers-2.10.10.tgz#f806f92d966918c931fb9c48052eba2db848beaa"
+  integrity sha512-3ZG500+ZeLql8rE0hjfhkycJjDj0pI/btEh3L9IkWUYcOrgP0xCNRq3HbtbqOPbvDhFaAWD88pDFtlLv8ns8gA==
   dependencies:
-    debug "^4.4.0"
+    debug "^4.4.3"
     extract-zip "^2.0.1"
     progress "^2.0.3"
     proxy-agent "^6.5.0"
-    semver "^7.7.1"
-    tar-fs "^3.0.8"
+    semver "^7.7.2"
+    tar-fs "^3.1.0"
     yargs "^17.7.2"

 "@puppeteer/browsers@2.8.0":
@@ -798,10 +798,10 @@
     tar-fs "^3.0.8"
     yargs "^17.7.2"

-"@puppeteer/replay@^3.1.1":
-  version "3.1.1"
-  resolved "https://registry.yarnpkg.com/@puppeteer/replay/-/replay-3.1.1.tgz#ada5412c5330ba22e3186ed4b622d26ac89bf564"
-  integrity sha512-8tW1APEoqkpPVH19wRPqePb+/wbGuSVxE2OeRySKeb2SX1VpL2TuADodETRVGYYe07gBbs8FucaUu09A0QI7+w==
+"@puppeteer/replay@^3.1.3":
+  version "3.1.3"
+  resolved "https://registry.yarnpkg.com/@puppeteer/replay/-/replay-3.1.3.tgz#24178c5aa28af1c1b47d39043d62dd722680b55e"
+  integrity sha512-chqKAKoVDtqXAFib93So2W+KHdd1RZ/yfOgXW+u0+BQaElTLVe+OpaLzEn+MIWfIkakhBHE5/tP0/CFQMVydQQ==
   dependencies:
     cli-table3 "0.6.5"
     colorette "2.0.20"
@@ -1134,16 +1134,16 @@
   resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
   integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==

-"@webrecorder/wabac@^2.23.11":
-  version "2.23.11"
"https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.11.tgz#945da06e08b6d093b525e6e5bfd6a8f17beb995b" - integrity sha512-rsBAkcYvgX+0HgwhgvSb3cBCBp0rVnHGQS/K5A9aJwOmfymHt0C2vInH/lmKV/5H38rJu29c2cvRX962h+lUiw== +"@webrecorder/wabac@^2.24.1": + version "2.24.1" + resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.24.1.tgz#4cf2423a8a593410eabc7cb84041331d39081a96" + integrity sha512-n3MwHpPNbU1LrwZjlax9UJVvYwfYAiYQDjzAQbeE6SrAU/YFGgD3BthLCaHP5YyIvFjIKtUpfxbsxHYRqNAyxg== dependencies: "@peculiar/asn1-ecc" "^2.3.4" "@peculiar/asn1-schema" "^2.3.3" "@peculiar/x509" "^1.9.2" "@types/js-levenshtein" "^1.1.3" - "@webrecorder/wombat" "^3.8.14" + "@webrecorder/wombat" "^3.9.1" acorn "^8.10.0" auto-js-ipfs "^2.1.1" base64-js "^1.5.1" @@ -1161,14 +1161,14 @@ path-parser "^6.1.0" process "^0.11.10" stream-browserify "^3.0.0" - warcio "^2.4.5" + warcio "^2.4.7" -"@webrecorder/wombat@^3.8.14": - version "3.8.14" - resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.14.tgz#fde951519ed9ab8271107a013fc1abd6a9997424" - integrity sha512-1CaL8Oel02V321SS+wOomV+cSDo279eVEAuiamO9jl9YoijRsGL9z/xZKE6sz6npLltE3YYziEBYO81xnaeTcA== +"@webrecorder/wombat@^3.9.1": + version "3.9.1" + resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.9.1.tgz#266135612e8063fa6b453f45d37d2c94e7be93d6" + integrity sha512-NX7vYQxulVRPgZk4ok9JbrUsf0dct2f34D/B1ZUCcB4M9aTKDhDAxwoIJbMha4DLhQlPcPp2wjH5/uJtPvtsXQ== dependencies: - warcio "^2.4.0" + warcio "^2.4.7" "@zxing/text-encoding@0.9.0": version "0.9.0" @@ -1711,10 +1711,10 @@ chromium-bidi@2.1.2: mitt "^3.0.1" zod "^3.24.1" -chromium-bidi@4.1.1: - version "4.1.1" - resolved "https://registry.yarnpkg.com/chromium-bidi/-/chromium-bidi-4.1.1.tgz#e1c34154ddd94473f180fd15158a24d36049e3d5" - integrity sha512-biR7t4vF3YluE6RlMSk9IWk+b9U+WWyzHp+N2pL9vRTk+UXHYRTVp7jTK58ZNzMLBgoLMHY4QyJMbeuw3eKxqg== +chromium-bidi@8.0.0: + version "8.0.0" + resolved "https://registry.yarnpkg.com/chromium-bidi/-/chromium-bidi-8.0.0.tgz#d73c9beed40317adf2bcfeb9a47087003cd467ec" + integrity sha512-d1VmE0FD7lxZQHzcDUCKZSNRtRwISXDsdg4HjdTR5+Ll5nQ/vzU12JeNmupD6VWffrPSlrnGhEWlLESKH3VO+g== dependencies: mitt "^3.0.1" zod "^3.24.1" @@ -1946,6 +1946,13 @@ debug@^4.4.0: dependencies: ms "^2.1.3" +debug@^4.4.3: + version "4.4.3" + resolved "https://registry.yarnpkg.com/debug/-/debug-4.4.3.tgz#c6ae432d9bd9662582fce08709b038c58e9e3d6a" + integrity sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA== + dependencies: + ms "^2.1.3" + decimal.js@^10.4.3: version "10.5.0" resolved "https://registry.yarnpkg.com/decimal.js/-/decimal.js-10.5.0.tgz#0f371c7cf6c4898ce0afb09836db73cd82010f22" @@ -2035,16 +2042,16 @@ devtools-protocol@0.0.1413902: resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1413902.tgz#a0f00fe9eb25ab337a8f9656a29e0a1a69f42401" integrity sha512-yRtvFD8Oyk7C9Os3GmnFZLu53yAfsnyw1s+mLmHHUK0GQEc9zthHWvS1r67Zqzm5t7v56PILHIVZ7kmFMaL2yQ== -devtools-protocol@0.0.1425554: - version "0.0.1425554" - resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1425554.tgz#51ed2fed1405f56783d24a393f7c75b6bbb58029" - integrity sha512-uRfxR6Nlzdzt0ihVIkV+sLztKgs7rgquY/Mhcv1YNCWDh5IZgl5mnn2aeEnW5stYTE0wwiF4RYVz8eMEpV1SEw== - devtools-protocol@0.0.1436416: version "0.0.1436416" resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1436416.tgz#ce8af8a210b8bcac83c5c8f095b9f977a9570df0" integrity 
   integrity sha512-iGLhz2WOrlBLcTcoVsFy5dPPUqILG6cc8MITYd5lV6i38gWG14bMXRH/d8G5KITrWHBnbsOnWHfc9Qs4/jej9Q==

+devtools-protocol@0.0.1495869:
+  version "0.0.1495869"
+  resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1495869.tgz#f68daef77a48d5dcbcdd55dbfa3265a51989c91b"
+  integrity sha512-i+bkd9UYFis40RcnkW7XrOprCujXRAHg62IVh/Ah3G8MmNXpCGt1m0dTFhSdx/AVs8XEMbdOGRwdkR1Bcta8AA==
+
 diff-sequences@^29.6.3:
   version "29.6.3"
   resolved "https://registry.yarnpkg.com/diff-sequences/-/diff-sequences-29.6.3.tgz#4deaf894d11407c51efc8418012f9e70b84ea921"
@@ -4548,17 +4555,18 @@ puppeteer-core@24.4.0, puppeteer-core@^24.4.0:
     typed-query-selector "^2.12.0"
     ws "^8.18.1"

-puppeteer-core@^24.7.2:
-  version "24.7.2"
-  resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-24.7.2.tgz#734e377a5634ce1e419fa3ce20ad297a7e1a99ff"
-  integrity sha512-P9pZyTmJqKODFCnkZgemCpoFA4LbAa8+NumHVQKyP5X9IgdNS1ZnAnIh1sMAwhF8/xEUGf7jt+qmNLlKieFw1Q==
+puppeteer-core@^24.22.0:
+  version "24.22.0"
+  resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-24.22.0.tgz#4d576b1a2b7699c088d3f0e843c32d81df82c3a6"
+  integrity sha512-oUeWlIg0pMz8YM5pu0uqakM+cCyYyXkHBxx9di9OUELu9X9+AYrNGGRLK9tNME3WfN3JGGqQIH3b4/E9LGek/w==
   dependencies:
-    "@puppeteer/browsers" "2.10.2"
-    chromium-bidi "4.1.1"
-    debug "^4.4.0"
-    devtools-protocol "0.0.1425554"
+    "@puppeteer/browsers" "2.10.10"
+    chromium-bidi "8.0.0"
+    debug "^4.4.3"
+    devtools-protocol "0.0.1495869"
     typed-query-selector "^2.12.0"
-    ws "^8.18.1"
+    webdriver-bidi-protocol "0.2.11"
+    ws "^8.18.3"

 puppeteer@^24.4.0:
   version "24.4.0"
@@ -4833,6 +4841,11 @@ semver@^7.7.1:
   resolved "https://registry.yarnpkg.com/semver/-/semver-7.7.1.tgz#abd5098d82b18c6c81f6074ff2647fd3e7220c9f"
   integrity sha512-hlq8tAfn0m/61p4BVRcPzIGr6LKiMwo4VM6dGi6pt4qcRkmNzTcWq6eCEjEh+qXjkMDvPlOFFSGwQjoEa6gyMA==

+semver@^7.7.2:
+  version "7.7.2"
+  resolved "https://registry.yarnpkg.com/semver/-/semver-7.7.2.tgz#67d99fdcd35cec21e6f8b87a7fd515a33f982b58"
+  integrity sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==
+
 set-function-length@^1.2.1:
   version "1.2.2"
   resolved "https://registry.yarnpkg.com/set-function-length/-/set-function-length-1.2.2.tgz#aac72314198eaed975cf77b2c3b6b880695e5449"
@@ -5195,6 +5208,17 @@ tar-fs@^3.0.8:
     bare-fs "^4.0.1"
     bare-path "^3.0.0"

+tar-fs@^3.1.0:
+  version "3.1.1"
+  resolved "https://registry.yarnpkg.com/tar-fs/-/tar-fs-3.1.1.tgz#4f164e59fb60f103d472360731e8c6bb4a7fe9ef"
+  integrity sha512-LZA0oaPOc2fVo82Txf3gw+AkEd38szODlptMYejQUhndHMLQ9M059uXR+AfS7DNo0NpINvSqDsvyaCrBVkptWg==
+  dependencies:
+    pump "^3.0.0"
+    tar-stream "^3.1.5"
+  optionalDependencies:
+    bare-fs "^4.0.1"
+    bare-path "^3.0.0"
+
 tar-stream@^2.1.4:
   version "2.2.0"
   resolved "https://registry.yarnpkg.com/tar-stream/-/tar-stream-2.2.0.tgz#acad84c284136b060dc3faa64474aa9aebd77287"
@@ -5526,10 +5550,10 @@ walker@^1.0.8:
   dependencies:
     makeerror "1.0.12"

-warcio@^2.4.0, warcio@^2.4.5:
-  version "2.4.5"
-  resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.5.tgz#ba39c38e433491ab9016282813b9cf6539c3d808"
-  integrity sha512-b6R/aIsR4fXzrpY/Zud7LqHFi2Bt8Ov5VLOnruHQ10rk129e9d0KOCZlyRmPD6ENTcV7yze5rXvJ5WSNS8R1zw==
+warcio@^2.4.7:
+  version "2.4.7"
+  resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.7.tgz#7c3918463e550f62fe63df5f76a871424e74097a"
+  integrity sha512-WGRqvoUqSalAkx+uJ8xnrxiiSPZ7Ru/h7iKC2XmuMMSOUSnS917l4V+qpaN9thAsZkZ+8qJRtee3uyOjlq4Dgg==
   dependencies:
     "@types/pako" "^1.0.7"
     "@types/stream-buffers" "^3.0.7"
@@ -5549,6 +5573,11 @@ web-encoding@^1.1.5:
   optionalDependencies:
     "@zxing/text-encoding" "0.9.0"

+webdriver-bidi-protocol@0.2.11:
+  version "0.2.11"
+  resolved "https://registry.yarnpkg.com/webdriver-bidi-protocol/-/webdriver-bidi-protocol-0.2.11.tgz#dba18d9b0a33aed33fab272dbd6e42411ac753cc"
+  integrity sha512-Y9E1/oi4XMxcR8AT0ZC4OvYntl34SPgwjmELH+owjBr0korAX4jKgZULBWILGCVGdVCQ0dodTToIETozhG8zvA==
+
 whatwg-encoding@^2.0.0:
   version "2.0.0"
   resolved "https://registry.yarnpkg.com/whatwg-encoding/-/whatwg-encoding-2.0.0.tgz#e7635f597fd87020858626805a2729fa7698ac53"
@@ -5661,6 +5690,11 @@ ws@^8.18.1:
   resolved "https://registry.yarnpkg.com/ws/-/ws-8.18.1.tgz#ea131d3784e1dfdff91adb0a4a116b127515e3cb"
   integrity sha512-RKW2aJZMXeMxVpnZ6bck+RswznaxmzdULiBr6KY7XkTnW8uvt0iT9H5DkHUChXrc+uurzwa0rVI16n/Xzjdz1w==

+ws@^8.18.3:
+  version "8.18.3"
+  resolved "https://registry.yarnpkg.com/ws/-/ws-8.18.3.tgz#b56b88abffde62791c639170400c93dcb0c95472"
+  integrity sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==
+
 xdg-basedir@^4.0.0:
   version "4.0.0"
   resolved "https://registry.yarnpkg.com/xdg-basedir/-/xdg-basedir-4.0.0.tgz#4bc8d9984403696225ef83a1573cbbcb4e79db13"