Compare commits

...

6 commits
v1.8.0 ... main

Author SHA1 Message Date
Ilya Kreymer
6f26148a9b bump version to 1.8.1 2025-10-08 17:11:04 -07:00
Ilya Kreymer
4f234040ce
Profile Saving Improvements (#894)
fix some observed errors that occur when saving profile:
- use browser.cookies instead of page.cookies to get all cookies, not
just from page
- catch exception when clearing cache and ignore
- logging: log proxy initialization on all code paths, in case of an
error in the proxy connection
2025-10-08 17:09:20 -07:00
Ilya Kreymer
002feb287b
dismiss js dialog popups (#895)
move the JS dialog handler to not be only for autoclick, dismiss all JS
dialogs (alert(), prompt()) to avoid blocking page
fixes #891
2025-10-08 14:57:52 -07:00
Ilya Kreymer
2270964996
logging: remove duplicate seeds found error (#893)
Per discussion, the message is unnecessary / confusing (doesn't provide
enough info) and can also happen on crawler restart.
2025-10-07 08:18:22 -07:00
Ilya Kreymer
fd49041f63
flow behaviors: add scrolling into view (#892)
Some page elements don't respond correctly if the element is not
in view, so add setEnsureElementIsInTheViewport() to the click,
doubleclick, hover and change step locators.
2025-10-07 08:17:56 -07:00
Ed Summers
cc2d890916
Add addLink doc (#890)
It's helpful to know this function is there!
2025-10-02 15:45:55 -04:00
8 changed files with 51 additions and 50 deletions

View file

@ -266,6 +266,7 @@ Some of these functions which may be of use to behaviors authors are:
- `scrollToOffset`: scroll to particular offset - `scrollToOffset`: scroll to particular offset
- `scrollIntoView`: smoothly scroll particular element into view - `scrollIntoView`: smoothly scroll particular element into view
- `getState`: increment a state counter and return all state counters + string message - `getState`: increment a state counter and return all state counters + string message
- `addLink`: add a given URL to the crawl queue
More detailed references will be added in the future. More detailed references will be added in the future.

View file

@ -1,6 +1,6 @@
{ {
"name": "browsertrix-crawler", "name": "browsertrix-crawler",
"version": "1.8.0", "version": "1.8.1",
"main": "browsertrix-crawler", "main": "browsertrix-crawler",
"type": "module", "type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler", "repository": "https://github.com/webrecorder/browsertrix-crawler",

View file

@ -129,8 +129,6 @@ export class Crawler {
limitHit = false; limitHit = false;
pageLimit: number; pageLimit: number;
dupeSeedsFound = false;
saveStateFiles: string[] = []; saveStateFiles: string[] = [];
lastSaveTime: number; lastSaveTime: number;
@ -854,31 +852,34 @@ self.__bx_behaviors.selectMainBehavior();
await this.browser.addInitScript(page, initScript); await this.browser.addInitScript(page, initScript);
} }
// only add if running with autoclick behavior // Handle JS dialogs:
if (this.params.behaviors.includes("autoclick")) { // - Ensure off-page navigation is canceled while behavior is running
// Ensure off-page navigation is canceled while behavior is running // - dismiss all other dialogs if not blocking unload
page.on("dialog", async (dialog) => { page.on("dialog", async (dialog) => {
let accepted = true; let accepted = true;
if (dialog.type() === "beforeunload") { if (dialog.type() === "beforeunload") {
if (opts.pageBlockUnload) { if (opts.pageBlockUnload) {
accepted = false; accepted = false;
await dialog.dismiss(); await dialog.dismiss();
} else {
await dialog.accept();
}
} else { } else {
await dialog.accept(); await dialog.accept();
} }
logger.debug("JS Dialog", { } else {
accepted, // other JS dialog, just dismiss
blockingUnload: opts.pageBlockUnload, await dialog.dismiss();
message: dialog.message(), }
type: dialog.type(), logger.debug("JS Dialog", {
page: page.url(), accepted,
workerid, blockingUnload: opts.pageBlockUnload,
}); message: dialog.message(),
type: dialog.type(),
page: page.url(),
workerid,
}); });
});
// only add if running with autoclick behavior
if (this.params.behaviors.includes("autoclick")) {
// Close any windows opened during navigation from autoclick // Close any windows opened during navigation from autoclick
await cdp.send("Target.setDiscoverTargets", { discover: true }); await cdp.send("Target.setDiscoverTargets", { discover: true });
@ -2487,10 +2488,6 @@ self.__bx_behaviors.selectMainBehavior();
return false; return false;
case QueueState.DUPE_URL: case QueueState.DUPE_URL:
if (!this.dupeSeedsFound && depth === 0) {
logger.error("Duplicate seed URLs found and skipped");
this.dupeSeedsFound = true;
}
logger.debug( logger.debug(
"Page URL not queued, already seen", "Page URL not queued, already seen",
{ url, ...logDetails }, { url, ...logDetails },

View file

@ -339,7 +339,11 @@ async function createProfile(
cdp: CDPSession, cdp: CDPSession,
targetFilename = "", targetFilename = "",
) { ) {
await cdp.send("Network.clearBrowserCache"); try {
await cdp.send("Network.clearBrowserCache");
} catch (e) {
logger.warn("Error clearing cache", e, "browser");
}
await browser.close(); await browser.close();
@ -546,7 +550,8 @@ class InteractiveBrowser {
return; return;
} }
const cookies = await this.browser.getCookies(this.page); const cookies = await this.browser.getCookies();
for (const cookieOrig of cookies) { for (const cookieOrig of cookies) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
const cookie = cookieOrig as any; const cookie = cookieOrig as any;
@ -566,7 +571,7 @@ class InteractiveBrowser {
cookie.url = url; cookie.url = url;
} }
} }
await this.browser.setCookies(this.page, cookies); await this.browser.setCookies(cookies);
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) { } catch (e: any) {
logger.error("Save Cookie Error: ", e); logger.error("Save Cookie Error: ", e);

View file

@ -22,6 +22,7 @@ import puppeteer, {
Page, Page,
LaunchOptions, LaunchOptions,
Viewport, Viewport,
CookieData,
} from "puppeteer-core"; } from "puppeteer-core";
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core"; import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
import { Recorder } from "./recorder.js"; import { Recorder } from "./recorder.js";
@ -616,14 +617,12 @@ export class Browser {
await page.setViewport(params); await page.setViewport(params);
} }
async getCookies(page: Page) { async getCookies() {
return await page.cookies(); return (await this.browser?.cookies()) || [];
} }
// TODO: Fix this the next time the file is edited. async setCookies(cookies: CookieData[]) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any return await this.browser?.setCookie(...cookies);
async setCookies(page: Page, cookies: any) {
return await page.setCookie(...cookies);
} }
} }

View file

@ -368,7 +368,7 @@ class Flow {
case StepType.DoubleClick: case StepType.DoubleClick:
await locator(step) await locator(step)
.setTimeout(timeout * 1000) .setTimeout(timeout * 1000)
//.on('action', () => startWaitingForEvents()) .setEnsureElementIsInTheViewport(true)
.click({ .click({
count: 2, count: 2,
button: step.button && mouseButtonMap.get(step.button), button: step.button && mouseButtonMap.get(step.button),
@ -392,7 +392,7 @@ class Flow {
await locator(step) await locator(step)
.setTimeout(timeout * 1000) .setTimeout(timeout * 1000)
//.on('action', () => startWaitingForEvents()) .setEnsureElementIsInTheViewport(true)
.click({ .click({
delay: step.duration, delay: step.duration,
button: step.button && mouseButtonMap.get(step.button), button: step.button && mouseButtonMap.get(step.button),
@ -410,7 +410,7 @@ class Flow {
case StepType.Hover: case StepType.Hover:
await locator(step) await locator(step)
.setTimeout(timeout * 1000) .setTimeout(timeout * 1000)
//.on('action', () => startWaitingForEvents()) .setEnsureElementIsInTheViewport(true)
.hover(); .hover();
break; break;
@ -426,15 +426,14 @@ class Flow {
case StepType.Change: case StepType.Change:
await locator(step) await locator(step)
//.on('action', () => startWaitingForEvents())
.setTimeout(timeout * 1000) .setTimeout(timeout * 1000)
.setEnsureElementIsInTheViewport(true)
.fill(step.value); .fill(step.value);
break; break;
case StepType.Scroll: { case StepType.Scroll: {
if ("selectors" in step) { if ("selectors" in step) {
await locator(step) await locator(step)
//.on('action', () => startWaitingForEvents())
.setTimeout(timeout * 1000) .setTimeout(timeout * 1000)
.scroll({ .scroll({
scrollLeft: step.x || 0, scrollLeft: step.x || 0,

View file

@ -153,13 +153,6 @@ export async function initProxy(
privateKeyFile = privateKeyFile || sshProxyPrivateKeyFile; privateKeyFile = privateKeyFile || sshProxyPrivateKeyFile;
publicHostsFile = publicHostsFile || sshProxyKnownHostsFile; publicHostsFile = publicHostsFile || sshProxyKnownHostsFile;
logger.debug("Initing proxy", {
url: getSafeProxyString(proxyUrl),
localPort,
privateKeyFile,
publicHostsFile,
});
const entry = await initSingleProxy( const entry = await initSingleProxy(
proxyUrl, proxyUrl,
localPort++, localPort++,
@ -200,6 +193,13 @@ export async function initSingleProxy(
sshProxyPrivateKeyFile?: string, sshProxyPrivateKeyFile?: string,
sshProxyKnownHostsFile?: string, sshProxyKnownHostsFile?: string,
): Promise<{ proxyUrl: string; dispatcher: Dispatcher }> { ): Promise<{ proxyUrl: string; dispatcher: Dispatcher }> {
logger.debug("Initing proxy", {
url: getSafeProxyString(proxyUrl),
localPort,
sshProxyPrivateKeyFile,
sshProxyKnownHostsFile,
});
if (proxyUrl && proxyUrl.startsWith("ssh://")) { if (proxyUrl && proxyUrl.startsWith("ssh://")) {
proxyUrl = await runSSHD( proxyUrl = await runSSHD(
proxyUrl, proxyUrl,

View file

@ -43,8 +43,8 @@ test("test custom selector crawls JS files as pages", async () => {
]); ]);
const expectedExtraPages = new Set([ const expectedExtraPages = new Set([
"https://www.iana.org/_js/jquery.js", "https://www.iana.org/static/_js/jquery.js",
"https://www.iana.org/_js/iana.js", "https://www.iana.org/static/_js/iana.js",
]); ]);
expect(pages).toEqual(expectedPages); expect(pages).toEqual(expectedPages);