mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Compare commits
6 commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
6f26148a9b | ||
![]() |
4f234040ce | ||
![]() |
002feb287b | ||
![]() |
2270964996 | ||
![]() |
fd49041f63 | ||
![]() |
cc2d890916 |
8 changed files with 51 additions and 50 deletions
|
@ -266,6 +266,7 @@ Some of these functions which may be of use to behaviors authors are:
|
||||||
- `scrollToOffset`: scroll to particular offset
|
- `scrollToOffset`: scroll to particular offset
|
||||||
- `scrollIntoView`: smoothly scroll particular element into view
|
- `scrollIntoView`: smoothly scroll particular element into view
|
||||||
- `getState`: increment a state counter and return all state counters + string message
|
- `getState`: increment a state counter and return all state counters + string message
|
||||||
|
- `addLink`: add a given URL to the crawl queue
|
||||||
|
|
||||||
More detailed references will be added in the future.
|
More detailed references will be added in the future.
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "browsertrix-crawler",
|
"name": "browsertrix-crawler",
|
||||||
"version": "1.8.0",
|
"version": "1.8.1",
|
||||||
"main": "browsertrix-crawler",
|
"main": "browsertrix-crawler",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||||
|
|
|
@ -129,8 +129,6 @@ export class Crawler {
|
||||||
limitHit = false;
|
limitHit = false;
|
||||||
pageLimit: number;
|
pageLimit: number;
|
||||||
|
|
||||||
dupeSeedsFound = false;
|
|
||||||
|
|
||||||
saveStateFiles: string[] = [];
|
saveStateFiles: string[] = [];
|
||||||
lastSaveTime: number;
|
lastSaveTime: number;
|
||||||
|
|
||||||
|
@ -854,31 +852,34 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
await this.browser.addInitScript(page, initScript);
|
await this.browser.addInitScript(page, initScript);
|
||||||
}
|
}
|
||||||
|
|
||||||
// only add if running with autoclick behavior
|
// Handle JS dialogs:
|
||||||
if (this.params.behaviors.includes("autoclick")) {
|
// - Ensure off-page navigation is canceled while behavior is running
|
||||||
// Ensure off-page navigation is canceled while behavior is running
|
// - dismiss (close) all other dialogs that are not beforeunload prompts
|
||||||
page.on("dialog", async (dialog) => {
|
page.on("dialog", async (dialog) => {
|
||||||
let accepted = true;
|
let accepted = true;
|
||||||
if (dialog.type() === "beforeunload") {
|
if (dialog.type() === "beforeunload") {
|
||||||
if (opts.pageBlockUnload) {
|
if (opts.pageBlockUnload) {
|
||||||
accepted = false;
|
accepted = false;
|
||||||
await dialog.dismiss();
|
await dialog.dismiss();
|
||||||
} else {
|
|
||||||
await dialog.accept();
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
await dialog.accept();
|
await dialog.accept();
|
||||||
}
|
}
|
||||||
logger.debug("JS Dialog", {
|
} else {
|
||||||
accepted,
|
// other JS dialog, just dismiss
|
||||||
blockingUnload: opts.pageBlockUnload,
|
await dialog.dismiss();
|
||||||
message: dialog.message(),
|
}
|
||||||
type: dialog.type(),
|
logger.debug("JS Dialog", {
|
||||||
page: page.url(),
|
accepted,
|
||||||
workerid,
|
blockingUnload: opts.pageBlockUnload,
|
||||||
});
|
message: dialog.message(),
|
||||||
|
type: dialog.type(),
|
||||||
|
page: page.url(),
|
||||||
|
workerid,
|
||||||
});
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// only add if running with autoclick behavior
|
||||||
|
if (this.params.behaviors.includes("autoclick")) {
|
||||||
// Close any windows opened during navigation from autoclick
|
// Close any windows opened during navigation from autoclick
|
||||||
await cdp.send("Target.setDiscoverTargets", { discover: true });
|
await cdp.send("Target.setDiscoverTargets", { discover: true });
|
||||||
|
|
||||||
|
@ -2487,10 +2488,6 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
case QueueState.DUPE_URL:
|
case QueueState.DUPE_URL:
|
||||||
if (!this.dupeSeedsFound && depth === 0) {
|
|
||||||
logger.error("Duplicate seed URLs found and skipped");
|
|
||||||
this.dupeSeedsFound = true;
|
|
||||||
}
|
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Page URL not queued, already seen",
|
"Page URL not queued, already seen",
|
||||||
{ url, ...logDetails },
|
{ url, ...logDetails },
|
||||||
|
|
|
@ -339,7 +339,11 @@ async function createProfile(
|
||||||
cdp: CDPSession,
|
cdp: CDPSession,
|
||||||
targetFilename = "",
|
targetFilename = "",
|
||||||
) {
|
) {
|
||||||
await cdp.send("Network.clearBrowserCache");
|
try {
|
||||||
|
await cdp.send("Network.clearBrowserCache");
|
||||||
|
} catch (e) {
|
||||||
|
logger.warn("Error clearing cache", e, "browser");
|
||||||
|
}
|
||||||
|
|
||||||
await browser.close();
|
await browser.close();
|
||||||
|
|
||||||
|
@ -546,7 +550,8 @@ class InteractiveBrowser {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const cookies = await this.browser.getCookies(this.page);
|
const cookies = await this.browser.getCookies();
|
||||||
|
|
||||||
for (const cookieOrig of cookies) {
|
for (const cookieOrig of cookies) {
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
const cookie = cookieOrig as any;
|
const cookie = cookieOrig as any;
|
||||||
|
@ -566,7 +571,7 @@ class InteractiveBrowser {
|
||||||
cookie.url = url;
|
cookie.url = url;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
await this.browser.setCookies(this.page, cookies);
|
await this.browser.setCookies(cookies);
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
} catch (e: any) {
|
} catch (e: any) {
|
||||||
logger.error("Save Cookie Error: ", e);
|
logger.error("Save Cookie Error: ", e);
|
||||||
|
|
|
@ -22,6 +22,7 @@ import puppeteer, {
|
||||||
Page,
|
Page,
|
||||||
LaunchOptions,
|
LaunchOptions,
|
||||||
Viewport,
|
Viewport,
|
||||||
|
CookieData,
|
||||||
} from "puppeteer-core";
|
} from "puppeteer-core";
|
||||||
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
|
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
|
||||||
import { Recorder } from "./recorder.js";
|
import { Recorder } from "./recorder.js";
|
||||||
|
@ -616,14 +617,12 @@ export class Browser {
|
||||||
await page.setViewport(params);
|
await page.setViewport(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
async getCookies(page: Page) {
|
async getCookies() {
|
||||||
return await page.cookies();
|
return (await this.browser?.cookies()) || [];
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Fix this the next time the file is edited.
|
async setCookies(cookies: CookieData[]) {
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
return await this.browser?.setCookie(...cookies);
|
||||||
async setCookies(page: Page, cookies: any) {
|
|
||||||
return await page.setCookie(...cookies);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -368,7 +368,7 @@ class Flow {
|
||||||
case StepType.DoubleClick:
|
case StepType.DoubleClick:
|
||||||
await locator(step)
|
await locator(step)
|
||||||
.setTimeout(timeout * 1000)
|
.setTimeout(timeout * 1000)
|
||||||
//.on('action', () => startWaitingForEvents())
|
.setEnsureElementIsInTheViewport(true)
|
||||||
.click({
|
.click({
|
||||||
count: 2,
|
count: 2,
|
||||||
button: step.button && mouseButtonMap.get(step.button),
|
button: step.button && mouseButtonMap.get(step.button),
|
||||||
|
@ -392,7 +392,7 @@ class Flow {
|
||||||
|
|
||||||
await locator(step)
|
await locator(step)
|
||||||
.setTimeout(timeout * 1000)
|
.setTimeout(timeout * 1000)
|
||||||
//.on('action', () => startWaitingForEvents())
|
.setEnsureElementIsInTheViewport(true)
|
||||||
.click({
|
.click({
|
||||||
delay: step.duration,
|
delay: step.duration,
|
||||||
button: step.button && mouseButtonMap.get(step.button),
|
button: step.button && mouseButtonMap.get(step.button),
|
||||||
|
@ -410,7 +410,7 @@ class Flow {
|
||||||
case StepType.Hover:
|
case StepType.Hover:
|
||||||
await locator(step)
|
await locator(step)
|
||||||
.setTimeout(timeout * 1000)
|
.setTimeout(timeout * 1000)
|
||||||
//.on('action', () => startWaitingForEvents())
|
.setEnsureElementIsInTheViewport(true)
|
||||||
.hover();
|
.hover();
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -426,15 +426,14 @@ class Flow {
|
||||||
|
|
||||||
case StepType.Change:
|
case StepType.Change:
|
||||||
await locator(step)
|
await locator(step)
|
||||||
//.on('action', () => startWaitingForEvents())
|
|
||||||
.setTimeout(timeout * 1000)
|
.setTimeout(timeout * 1000)
|
||||||
|
.setEnsureElementIsInTheViewport(true)
|
||||||
.fill(step.value);
|
.fill(step.value);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case StepType.Scroll: {
|
case StepType.Scroll: {
|
||||||
if ("selectors" in step) {
|
if ("selectors" in step) {
|
||||||
await locator(step)
|
await locator(step)
|
||||||
//.on('action', () => startWaitingForEvents())
|
|
||||||
.setTimeout(timeout * 1000)
|
.setTimeout(timeout * 1000)
|
||||||
.scroll({
|
.scroll({
|
||||||
scrollLeft: step.x || 0,
|
scrollLeft: step.x || 0,
|
||||||
|
|
|
@ -153,13 +153,6 @@ export async function initProxy(
|
||||||
privateKeyFile = privateKeyFile || sshProxyPrivateKeyFile;
|
privateKeyFile = privateKeyFile || sshProxyPrivateKeyFile;
|
||||||
publicHostsFile = publicHostsFile || sshProxyKnownHostsFile;
|
publicHostsFile = publicHostsFile || sshProxyKnownHostsFile;
|
||||||
|
|
||||||
logger.debug("Initing proxy", {
|
|
||||||
url: getSafeProxyString(proxyUrl),
|
|
||||||
localPort,
|
|
||||||
privateKeyFile,
|
|
||||||
publicHostsFile,
|
|
||||||
});
|
|
||||||
|
|
||||||
const entry = await initSingleProxy(
|
const entry = await initSingleProxy(
|
||||||
proxyUrl,
|
proxyUrl,
|
||||||
localPort++,
|
localPort++,
|
||||||
|
@ -200,6 +193,13 @@ export async function initSingleProxy(
|
||||||
sshProxyPrivateKeyFile?: string,
|
sshProxyPrivateKeyFile?: string,
|
||||||
sshProxyKnownHostsFile?: string,
|
sshProxyKnownHostsFile?: string,
|
||||||
): Promise<{ proxyUrl: string; dispatcher: Dispatcher }> {
|
): Promise<{ proxyUrl: string; dispatcher: Dispatcher }> {
|
||||||
|
logger.debug("Initing proxy", {
|
||||||
|
url: getSafeProxyString(proxyUrl),
|
||||||
|
localPort,
|
||||||
|
sshProxyPrivateKeyFile,
|
||||||
|
sshProxyKnownHostsFile,
|
||||||
|
});
|
||||||
|
|
||||||
if (proxyUrl && proxyUrl.startsWith("ssh://")) {
|
if (proxyUrl && proxyUrl.startsWith("ssh://")) {
|
||||||
proxyUrl = await runSSHD(
|
proxyUrl = await runSSHD(
|
||||||
proxyUrl,
|
proxyUrl,
|
||||||
|
|
|
@ -43,8 +43,8 @@ test("test custom selector crawls JS files as pages", async () => {
|
||||||
]);
|
]);
|
||||||
|
|
||||||
const expectedExtraPages = new Set([
|
const expectedExtraPages = new Set([
|
||||||
"https://www.iana.org/_js/jquery.js",
|
"https://www.iana.org/static/_js/jquery.js",
|
||||||
"https://www.iana.org/_js/iana.js",
|
"https://www.iana.org/static/_js/iana.js",
|
||||||
]);
|
]);
|
||||||
|
|
||||||
expect(pages).toEqual(expectedPages);
|
expect(pages).toEqual(expectedPages);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue