mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Compare commits
No commits in common. "main" and "v1.8.0" have entirely different histories.
8 changed files with 50 additions and 51 deletions
|
@ -266,7 +266,6 @@ Some of these functions which may be of use to behaviors authors are:
|
|||
- `scrollToOffset`: scroll to particular offset
|
||||
- `scrollIntoView`: smoothly scroll particular element into view
|
||||
- `getState`: increment a state counter and return all state counters + string message
|
||||
- `addLink`: add a given URL to the crawl queue
|
||||
|
||||
More detailed references will be added in the future.
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "browsertrix-crawler",
|
||||
"version": "1.8.1",
|
||||
"version": "1.8.0",
|
||||
"main": "browsertrix-crawler",
|
||||
"type": "module",
|
||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||
|
|
|
@ -129,6 +129,8 @@ export class Crawler {
|
|||
limitHit = false;
|
||||
pageLimit: number;
|
||||
|
||||
dupeSeedsFound = false;
|
||||
|
||||
saveStateFiles: string[] = [];
|
||||
lastSaveTime: number;
|
||||
|
||||
|
@ -852,34 +854,31 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
await this.browser.addInitScript(page, initScript);
|
||||
}
|
||||
|
||||
// Handle JS dialogs:
|
||||
// - Ensure off-page navigation is canceled while behavior is running
|
||||
// - dismiss (close) all other dialogs if not blocking unload
|
||||
page.on("dialog", async (dialog) => {
|
||||
let accepted = true;
|
||||
if (dialog.type() === "beforeunload") {
|
||||
if (opts.pageBlockUnload) {
|
||||
accepted = false;
|
||||
await dialog.dismiss();
|
||||
// only add if running with autoclick behavior
|
||||
if (this.params.behaviors.includes("autoclick")) {
|
||||
// Ensure off-page navigation is canceled while behavior is running
|
||||
page.on("dialog", async (dialog) => {
|
||||
let accepted = true;
|
||||
if (dialog.type() === "beforeunload") {
|
||||
if (opts.pageBlockUnload) {
|
||||
accepted = false;
|
||||
await dialog.dismiss();
|
||||
} else {
|
||||
await dialog.accept();
|
||||
}
|
||||
} else {
|
||||
await dialog.accept();
|
||||
}
|
||||
} else {
|
||||
// other JS dialog, just dismiss
|
||||
await dialog.dismiss();
|
||||
}
|
||||
logger.debug("JS Dialog", {
|
||||
accepted,
|
||||
blockingUnload: opts.pageBlockUnload,
|
||||
message: dialog.message(),
|
||||
type: dialog.type(),
|
||||
page: page.url(),
|
||||
workerid,
|
||||
logger.debug("JS Dialog", {
|
||||
accepted,
|
||||
blockingUnload: opts.pageBlockUnload,
|
||||
message: dialog.message(),
|
||||
type: dialog.type(),
|
||||
page: page.url(),
|
||||
workerid,
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
// only add if running with autoclick behavior
|
||||
if (this.params.behaviors.includes("autoclick")) {
|
||||
// Close any windows opened during navigation from autoclick
|
||||
await cdp.send("Target.setDiscoverTargets", { discover: true });
|
||||
|
||||
|
@ -2488,6 +2487,10 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
return false;
|
||||
|
||||
case QueueState.DUPE_URL:
|
||||
if (!this.dupeSeedsFound && depth === 0) {
|
||||
logger.error("Duplicate seed URLs found and skipped");
|
||||
this.dupeSeedsFound = true;
|
||||
}
|
||||
logger.debug(
|
||||
"Page URL not queued, already seen",
|
||||
{ url, ...logDetails },
|
||||
|
|
|
@ -339,11 +339,7 @@ async function createProfile(
|
|||
cdp: CDPSession,
|
||||
targetFilename = "",
|
||||
) {
|
||||
try {
|
||||
await cdp.send("Network.clearBrowserCache");
|
||||
} catch (e) {
|
||||
logger.warn("Error clearing cache", e, "browser");
|
||||
}
|
||||
await cdp.send("Network.clearBrowserCache");
|
||||
|
||||
await browser.close();
|
||||
|
||||
|
@ -550,8 +546,7 @@ class InteractiveBrowser {
|
|||
return;
|
||||
}
|
||||
|
||||
const cookies = await this.browser.getCookies();
|
||||
|
||||
const cookies = await this.browser.getCookies(this.page);
|
||||
for (const cookieOrig of cookies) {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const cookie = cookieOrig as any;
|
||||
|
@ -571,7 +566,7 @@ class InteractiveBrowser {
|
|||
cookie.url = url;
|
||||
}
|
||||
}
|
||||
await this.browser.setCookies(cookies);
|
||||
await this.browser.setCookies(this.page, cookies);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
logger.error("Save Cookie Error: ", e);
|
||||
|
|
|
@ -22,7 +22,6 @@ import puppeteer, {
|
|||
Page,
|
||||
LaunchOptions,
|
||||
Viewport,
|
||||
CookieData,
|
||||
} from "puppeteer-core";
|
||||
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
|
||||
import { Recorder } from "./recorder.js";
|
||||
|
@ -617,12 +616,14 @@ export class Browser {
|
|||
await page.setViewport(params);
|
||||
}
|
||||
|
||||
async getCookies() {
|
||||
return (await this.browser?.cookies()) || [];
|
||||
async getCookies(page: Page) {
|
||||
return await page.cookies();
|
||||
}
|
||||
|
||||
async setCookies(cookies: CookieData[]) {
|
||||
return await this.browser?.setCookie(...cookies);
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
async setCookies(page: Page, cookies: any) {
|
||||
return await page.setCookie(...cookies);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -368,7 +368,7 @@ class Flow {
|
|||
case StepType.DoubleClick:
|
||||
await locator(step)
|
||||
.setTimeout(timeout * 1000)
|
||||
.setEnsureElementIsInTheViewport(true)
|
||||
//.on('action', () => startWaitingForEvents())
|
||||
.click({
|
||||
count: 2,
|
||||
button: step.button && mouseButtonMap.get(step.button),
|
||||
|
@ -392,7 +392,7 @@ class Flow {
|
|||
|
||||
await locator(step)
|
||||
.setTimeout(timeout * 1000)
|
||||
.setEnsureElementIsInTheViewport(true)
|
||||
//.on('action', () => startWaitingForEvents())
|
||||
.click({
|
||||
delay: step.duration,
|
||||
button: step.button && mouseButtonMap.get(step.button),
|
||||
|
@ -410,7 +410,7 @@ class Flow {
|
|||
case StepType.Hover:
|
||||
await locator(step)
|
||||
.setTimeout(timeout * 1000)
|
||||
.setEnsureElementIsInTheViewport(true)
|
||||
//.on('action', () => startWaitingForEvents())
|
||||
.hover();
|
||||
break;
|
||||
|
||||
|
@ -426,14 +426,15 @@ class Flow {
|
|||
|
||||
case StepType.Change:
|
||||
await locator(step)
|
||||
//.on('action', () => startWaitingForEvents())
|
||||
.setTimeout(timeout * 1000)
|
||||
.setEnsureElementIsInTheViewport(true)
|
||||
.fill(step.value);
|
||||
break;
|
||||
|
||||
case StepType.Scroll: {
|
||||
if ("selectors" in step) {
|
||||
await locator(step)
|
||||
//.on('action', () => startWaitingForEvents())
|
||||
.setTimeout(timeout * 1000)
|
||||
.scroll({
|
||||
scrollLeft: step.x || 0,
|
||||
|
|
|
@ -153,6 +153,13 @@ export async function initProxy(
|
|||
privateKeyFile = privateKeyFile || sshProxyPrivateKeyFile;
|
||||
publicHostsFile = publicHostsFile || sshProxyKnownHostsFile;
|
||||
|
||||
logger.debug("Initing proxy", {
|
||||
url: getSafeProxyString(proxyUrl),
|
||||
localPort,
|
||||
privateKeyFile,
|
||||
publicHostsFile,
|
||||
});
|
||||
|
||||
const entry = await initSingleProxy(
|
||||
proxyUrl,
|
||||
localPort++,
|
||||
|
@ -193,13 +200,6 @@ export async function initSingleProxy(
|
|||
sshProxyPrivateKeyFile?: string,
|
||||
sshProxyKnownHostsFile?: string,
|
||||
): Promise<{ proxyUrl: string; dispatcher: Dispatcher }> {
|
||||
logger.debug("Initing proxy", {
|
||||
url: getSafeProxyString(proxyUrl),
|
||||
localPort,
|
||||
sshProxyPrivateKeyFile,
|
||||
sshProxyKnownHostsFile,
|
||||
});
|
||||
|
||||
if (proxyUrl && proxyUrl.startsWith("ssh://")) {
|
||||
proxyUrl = await runSSHD(
|
||||
proxyUrl,
|
||||
|
|
|
@ -43,8 +43,8 @@ test("test custom selector crawls JS files as pages", async () => {
|
|||
]);
|
||||
|
||||
const expectedExtraPages = new Set([
|
||||
"https://www.iana.org/static/_js/jquery.js",
|
||||
"https://www.iana.org/static/_js/iana.js",
|
||||
"https://www.iana.org/_js/jquery.js",
|
||||
"https://www.iana.org/_js/iana.js",
|
||||
]);
|
||||
|
||||
expect(pages).toEqual(expectedPages);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue