diff --git a/docs/docs/user-guide/behaviors.md b/docs/docs/user-guide/behaviors.md index b0c34a61..f585ef18 100644 --- a/docs/docs/user-guide/behaviors.md +++ b/docs/docs/user-guide/behaviors.md @@ -266,6 +266,7 @@ Some of these functions which may be of use to behaviors authors are: - `scrollToOffset`: scroll to particular offset - `scrollIntoView`: smoothly scroll particular element into view - `getState`: increment a state counter and return all state counters + string message +- `addLink`: add a given URL to the crawl queue More detailed references will be added in the future. diff --git a/package.json b/package.json index 4aabdfb8..debd1ec9 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "1.8.0-beta.1", + "version": "1.8.1", "main": "browsertrix-crawler", "type": "module", "repository": "https://github.com/webrecorder/browsertrix-crawler", diff --git a/src/crawler.ts b/src/crawler.ts index 39b1b2f5..d815c3fd 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -129,8 +129,6 @@ export class Crawler { limitHit = false; pageLimit: number; - dupeSeedsFound = false; - saveStateFiles: string[] = []; lastSaveTime: number; @@ -854,31 +852,34 @@ self.__bx_behaviors.selectMainBehavior(); await this.browser.addInitScript(page, initScript); } - // only add if running with autoclick behavior - if (this.params.behaviors.includes("autoclick")) { - // Ensure off-page navigation is canceled while behavior is running - page.on("dialog", async (dialog) => { - let accepted = true; - if (dialog.type() === "beforeunload") { - if (opts.pageBlockUnload) { - accepted = false; - await dialog.dismiss(); - } else { - await dialog.accept(); - } + // Handle JS dialogs: + // - Ensure off-page navigation is canceled while behavior is running + // - Dismiss (close) all other JS dialogs + page.on("dialog", async (dialog) => { + let accepted = true; + if (dialog.type() === "beforeunload") { + if (opts.pageBlockUnload) { 
+ accepted = false; + await dialog.dismiss(); } else { await dialog.accept(); } - logger.debug("JS Dialog", { - accepted, - blockingUnload: opts.pageBlockUnload, - message: dialog.message(), - type: dialog.type(), - page: page.url(), - workerid, - }); + } else { + accepted = false; // other JS dialog, just dismiss + await dialog.dismiss(); + } + logger.debug("JS Dialog", { + accepted, + blockingUnload: opts.pageBlockUnload, + message: dialog.message(), + type: dialog.type(), + page: page.url(), + workerid, }); + }); + // only add if running with autoclick behavior + if (this.params.behaviors.includes("autoclick")) { // Close any windows opened during navigation from autoclick await cdp.send("Target.setDiscoverTargets", { discover: true }); @@ -2487,10 +2488,6 @@ self.__bx_behaviors.selectMainBehavior(); return false; case QueueState.DUPE_URL: - if (!this.dupeSeedsFound && depth === 0) { - logger.error("Duplicate seed URLs found and skipped"); - this.dupeSeedsFound = true; - } logger.debug( "Page URL not queued, already seen", { url, ...logDetails }, diff --git a/src/create-login-profile.ts b/src/create-login-profile.ts index 0dcbc608..66728c3d 100755 --- a/src/create-login-profile.ts +++ b/src/create-login-profile.ts @@ -339,7 +339,11 @@ async function createProfile( cdp: CDPSession, targetFilename = "", ) { - await cdp.send("Network.clearBrowserCache"); + try { + await cdp.send("Network.clearBrowserCache"); + } catch (e) { + logger.warn("Error clearing cache", e, "browser"); + } await browser.close(); @@ -546,7 +550,8 @@ class InteractiveBrowser { return; } - const cookies = await this.browser.getCookies(this.page); + const cookies = await this.browser.getCookies(); + for (const cookieOrig of cookies) { // eslint-disable-next-line @typescript-eslint/no-explicit-any const cookie = cookieOrig as any; @@ -566,7 +571,7 @@ class InteractiveBrowser { cookie.url = url; } } - await this.browser.setCookies(this.page, cookies); + await this.browser.setCookies(cookies); // 
eslint-disable-next-line @typescript-eslint/no-explicit-any } catch (e: any) { logger.error("Save Cookie Error: ", e); diff --git a/src/util/browser.ts b/src/util/browser.ts index 6e2cb6c7..73102d0d 100644 --- a/src/util/browser.ts +++ b/src/util/browser.ts @@ -22,6 +22,7 @@ import puppeteer, { Page, LaunchOptions, Viewport, + CookieData, } from "puppeteer-core"; import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core"; import { Recorder } from "./recorder.js"; @@ -616,14 +617,12 @@ export class Browser { await page.setViewport(params); } - async getCookies(page: Page) { - return await page.cookies(); + async getCookies() { + return (await this.browser?.cookies()) || []; } - // TODO: Fix this the next time the file is edited. - // eslint-disable-next-line @typescript-eslint/no-explicit-any - async setCookies(page: Page, cookies: any) { - return await page.setCookie(...cookies); + async setCookies(cookies: CookieData[]) { + return await this.browser?.setCookie(...cookies); } } diff --git a/src/util/flowbehavior.ts b/src/util/flowbehavior.ts index 898bf232..82c502a5 100644 --- a/src/util/flowbehavior.ts +++ b/src/util/flowbehavior.ts @@ -368,7 +368,7 @@ class Flow { case StepType.DoubleClick: await locator(step) .setTimeout(timeout * 1000) - //.on('action', () => startWaitingForEvents()) + .setEnsureElementIsInTheViewport(true) .click({ count: 2, button: step.button && mouseButtonMap.get(step.button), @@ -392,7 +392,7 @@ class Flow { await locator(step) .setTimeout(timeout * 1000) - //.on('action', () => startWaitingForEvents()) + .setEnsureElementIsInTheViewport(true) .click({ delay: step.duration, button: step.button && mouseButtonMap.get(step.button), @@ -410,7 +410,7 @@ class Flow { case StepType.Hover: await locator(step) .setTimeout(timeout * 1000) - //.on('action', () => startWaitingForEvents()) + .setEnsureElementIsInTheViewport(true) .hover(); break; @@ -426,15 +426,14 @@ class Flow { case StepType.Change: await locator(step) - 
//.on('action', () => startWaitingForEvents()) .setTimeout(timeout * 1000) + .setEnsureElementIsInTheViewport(true) .fill(step.value); break; case StepType.Scroll: { if ("selectors" in step) { await locator(step) - //.on('action', () => startWaitingForEvents()) .setTimeout(timeout * 1000) .scroll({ scrollLeft: step.x || 0, diff --git a/src/util/proxy.ts b/src/util/proxy.ts index c5a0c2f4..93abc151 100644 --- a/src/util/proxy.ts +++ b/src/util/proxy.ts @@ -153,13 +153,6 @@ export async function initProxy( privateKeyFile = privateKeyFile || sshProxyPrivateKeyFile; publicHostsFile = publicHostsFile || sshProxyKnownHostsFile; - logger.debug("Initing proxy", { - url: getSafeProxyString(proxyUrl), - localPort, - privateKeyFile, - publicHostsFile, - }); - const entry = await initSingleProxy( proxyUrl, localPort++, @@ -200,6 +193,13 @@ export async function initSingleProxy( sshProxyPrivateKeyFile?: string, sshProxyKnownHostsFile?: string, ): Promise<{ proxyUrl: string; dispatcher: Dispatcher }> { + logger.debug("Initing proxy", { + url: getSafeProxyString(proxyUrl), + localPort, + sshProxyPrivateKeyFile, + sshProxyKnownHostsFile, + }); + if (proxyUrl && proxyUrl.startsWith("ssh://")) { proxyUrl = await runSSHD( proxyUrl, diff --git a/tests/custom_selector.test.js b/tests/custom_selector.test.js index 4b180bb0..fff71427 100644 --- a/tests/custom_selector.test.js +++ b/tests/custom_selector.test.js @@ -43,8 +43,8 @@ test("test custom selector crawls JS files as pages", async () => { ]); const expectedExtraPages = new Set([ - "https://www.iana.org/_js/jquery.js", - "https://www.iana.org/_js/iana.js", + "https://www.iana.org/static/_js/jquery.js", + "https://www.iana.org/static/_js/iana.js", ]); expect(pages).toEqual(expectedPages);