From f7a080fe83b7501f3e6af19c7f2eac7955f94585 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 25 Sep 2025 10:42:02 -0700 Subject: [PATCH 1/7] version: bump to 1.8.0 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 4aabdfb8..76285477 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.8.0-beta.1", + "version": "1.8.0", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", From cc2d89091605d469d97f6eb9c3fcfcbd7c88fecf Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Thu, 2 Oct 2025 15:45:55 -0400 Subject: [PATCH 2/7] Add addLink doc (#890) It's helpful to know this function is there! --- docs/docs/user-guide/behaviors.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/docs/user-guide/behaviors.md b/docs/docs/user-guide/behaviors.md index b0c34a61..f585ef18 100644 --- a/docs/docs/user-guide/behaviors.md +++ b/docs/docs/user-guide/behaviors.md @@ -266,6 +266,7 @@ Some of these functions which may be of use to behaviors authors are: - `scrollToOffset`: scroll to particular offset - `scrollIntoView`: smoothly scroll particular element into view - `getState`: increment a state counter and return all state counters + string message +- `addLink`: add a given URL to the crawl queue More detailed references will be added in the future. From fd49041f63535cf6094a2e3c7e8e0e135bdc0f36 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 7 Oct 2025 08:17:56 -0700 Subject: [PATCH 3/7] flow behaviors: add scrolling into view (#892) Some page elements don't quite respond correctly if the element is not in view, so we should add the setEnsureElementIsInTheViewport() to click, doubleclick, hover and change step locators. 
--- src/util/flowbehavior.ts | 9 ++++----- tests/custom_selector.test.js | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/util/flowbehavior.ts b/src/util/flowbehavior.ts index 898bf232..82c502a5 100644 --- a/src/util/flowbehavior.ts +++ b/src/util/flowbehavior.ts @@ -368,7 +368,7 @@ class Flow { case StepType.DoubleClick: await locator(step) .setTimeout(timeout * 1000) - //.on('action', () => startWaitingForEvents()) + .setEnsureElementIsInTheViewport(true) .click({ count: 2, button: step.button && mouseButtonMap.get(step.button), @@ -392,7 +392,7 @@ class Flow { await locator(step) .setTimeout(timeout * 1000) - //.on('action', () => startWaitingForEvents()) + .setEnsureElementIsInTheViewport(true) .click({ delay: step.duration, button: step.button && mouseButtonMap.get(step.button), @@ -410,7 +410,7 @@ class Flow { case StepType.Hover: await locator(step) .setTimeout(timeout * 1000) - //.on('action', () => startWaitingForEvents()) + .setEnsureElementIsInTheViewport(true) .hover(); break; @@ -426,15 +426,14 @@ class Flow { case StepType.Change: await locator(step) - //.on('action', () => startWaitingForEvents()) .setTimeout(timeout * 1000) + .setEnsureElementIsInTheViewport(true) .fill(step.value); break; case StepType.Scroll: { if ("selectors" in step) { await locator(step) - //.on('action', () => startWaitingForEvents()) .setTimeout(timeout * 1000) .scroll({ scrollLeft: step.x || 0, diff --git a/tests/custom_selector.test.js b/tests/custom_selector.test.js index 4b180bb0..fff71427 100644 --- a/tests/custom_selector.test.js +++ b/tests/custom_selector.test.js @@ -43,8 +43,8 @@ test("test custom selector crawls JS files as pages", async () => { ]); const expectedExtraPages = new Set([ - "https://www.iana.org/_js/jquery.js", - "https://www.iana.org/_js/iana.js", + "https://www.iana.org/static/_js/jquery.js", + "https://www.iana.org/static/_js/iana.js", ]); expect(pages).toEqual(expectedPages); From 2270964996e699451ec984d39c925566f358addd 
Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 7 Oct 2025 08:18:22 -0700 Subject: [PATCH 4/7] logging: remove duplicate seeds found error (#893) Per discussion, the message is unnecessary / confusing (doesn't provide enough info) and can also happen on crawler restart. --- src/crawler.ts | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index 39b1b2f5..e405cefe 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -129,8 +129,6 @@ export class Crawler { limitHit = false; pageLimit: number; - dupeSeedsFound = false; - saveStateFiles: string[] = []; lastSaveTime: number; @@ -2487,10 +2485,6 @@ self.__bx_behaviors.selectMainBehavior(); return false; case QueueState.DUPE_URL: - if (!this.dupeSeedsFound && depth === 0) { - logger.error("Duplicate seed URLs found and skipped"); - this.dupeSeedsFound = true; - } logger.debug( "Page URL not queued, already seen", { url, ...logDetails }, From 002feb287b719dccd0b291b0024009c39b3a7a37 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 8 Oct 2025 14:57:52 -0700 Subject: [PATCH 5/7] dismiss js dialog popups (#895) move the JS dialog handler to not be only for autoclick, dismiss all JS dialogs (alert(), prompt()) to avoid blocking page fixes #891 --- src/crawler.ts | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index e405cefe..d815c3fd 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -852,31 +852,34 @@ self.__bx_behaviors.selectMainBehavior(); await this.browser.addInitScript(page, initScript); } - // only add if running with autoclick behavior - if (this.params.behaviors.includes("autoclick")) { - // Ensure off-page navigation is canceled while behavior is running - page.on("dialog", async (dialog) => { - let accepted = true; - if (dialog.type() === "beforeunload") { - if (opts.pageBlockUnload) { - accepted = false; - await dialog.dismiss(); - } else { - await dialog.accept(); - } + 
// Handle JS dialogs: // - Ensure off-page navigation is canceled while behavior is running // - dismiss/close all other dialogs if not blocking unload page.on("dialog", async (dialog) => { let accepted = true; if (dialog.type() === "beforeunload") { if (opts.pageBlockUnload) { accepted = false; await dialog.dismiss(); } else { await dialog.accept(); } } else { // other JS dialog, just dismiss await dialog.dismiss(); } logger.debug("JS Dialog", { accepted, blockingUnload: opts.pageBlockUnload, message: dialog.message(), type: dialog.type(), page: page.url(), workerid, }); }); // only add if running with autoclick behavior if (this.params.behaviors.includes("autoclick")) { // Close any windows opened during navigation from autoclick await cdp.send("Target.setDiscoverTargets", { discover: true }); From 4f234040cec59a1b238b1b8b3a1b1619760cf663 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 8 Oct 2025 17:09:20 -0700 Subject: [PATCH 6/7] Profile Saving Improvements (#894) fix some observed errors that occur when saving profile: - use browser.cookies instead of page.cookies to get all cookies, not just from page - catch exception when clearing cache and ignore - logging: log when proxy init is happening on all paths, in case of an error in the proxy connection --- src/create-login-profile.ts | 11 ++++++++--- src/util/browser.ts | 11 +++++------ src/util/proxy.ts | 14 +++++++------- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/create-login-profile.ts b/src/create-login-profile.ts index 0dcbc608..66728c3d 100755 --- a/src/create-login-profile.ts +++ b/src/create-login-profile.ts @@ -339,7 +339,11 @@ async function createProfile( cdp: CDPSession, targetFilename = "", ) { - await cdp.send("Network.clearBrowserCache"); + try { + await 
cdp.send("Network.clearBrowserCache"); + } catch (e) { + logger.warn("Error clearing cache", e, "browser"); + } await browser.close(); @@ -546,7 +550,8 @@ class InteractiveBrowser { return; } - const cookies = await this.browser.getCookies(this.page); + const cookies = await this.browser.getCookies(); + for (const cookieOrig of cookies) { // eslint-disable-next-line @typescript-eslint/no-explicit-any const cookie = cookieOrig as any; @@ -566,7 +571,7 @@ class InteractiveBrowser { cookie.url = url; } } - await this.browser.setCookies(this.page, cookies); + await this.browser.setCookies(cookies); // eslint-disable-next-line @typescript-eslint/no-explicit-any } catch (e: any) { logger.error("Save Cookie Error: ", e); diff --git a/src/util/browser.ts b/src/util/browser.ts index 6e2cb6c7..73102d0d 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -22,6 +22,7 @@ import puppeteer, { Page, LaunchOptions, Viewport, + CookieData, } from "puppeteer-core"; import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core"; import { Recorder } from "./recorder.js"; @@ -616,14 +617,12 @@ export class Browser { await page.setViewport(params); } - async getCookies(page: Page) { - return await page.cookies(); + async getCookies() { + return (await this.browser?.cookies()) || []; } - // TODO: Fix this the next time the file is edited. 
- // eslint-disable-next-line @typescript-eslint/no-explicit-any - async setCookies(page: Page, cookies: any) { - return await page.setCookie(...cookies); + async setCookies(cookies: CookieData[]) { + return await this.browser?.setCookie(...cookies); } } diff --git a/src/util/proxy.ts b/src/util/proxy.ts index c5a0c2f4..93abc151 100644 --- a/src/util/proxy.ts +++ b/src/util/proxy.ts @@ -153,13 +153,6 @@ export async function initProxy( privateKeyFile = privateKeyFile || sshProxyPrivateKeyFile; publicHostsFile = publicHostsFile || sshProxyKnownHostsFile; - logger.debug("Initing proxy", { - url: getSafeProxyString(proxyUrl), - localPort, - privateKeyFile, - publicHostsFile, - }); - const entry = await initSingleProxy( proxyUrl, localPort++, @@ -200,6 +193,13 @@ export async function initSingleProxy( sshProxyPrivateKeyFile?: string, sshProxyKnownHostsFile?: string, ): Promise<{ proxyUrl: string; dispatcher: Dispatcher }> { + logger.debug("Initing proxy", { + url: getSafeProxyString(proxyUrl), + localPort, + sshProxyPrivateKeyFile, + sshProxyKnownHostsFile, + }); + if (proxyUrl && proxyUrl.startsWith("ssh://")) { proxyUrl = await runSSHD( proxyUrl, From 6f26148a9b7aea09c83cdbcb62c1f8ab6560f07a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 8 Oct 2025 17:10:26 -0700 Subject: [PATCH 7/7] bump version to 1.8.1 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 76285477..debd1ec9 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.8.0", + "version": "1.8.1", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler",