mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
parent
9293375790
commit
9d577dac57
2 changed files with 5 additions and 4 deletions
|
@ -8,7 +8,7 @@ and [puppeteer](https://github.com/puppeteer/puppeteer) to control one or more b
|
|||
Thus far, Browsertrix Crawler supports:
|
||||
|
||||
- Single-container, browser based crawling with multiple headless/headful browsers.
|
||||
- Support for custom browser behaviors, ysing [Browsertix Behaviors](https://github.com/webrecorder/browsertrix-behaviors) including autoscroll, video autoplay and site-specific behaviors.
|
||||
- Support for custom browser behaviors, using [Browsertrix Behaviors](https://github.com/webrecorder/browsertrix-behaviors), including autoscroll, video autoplay and site-specific behaviors.
|
||||
- Optimized (non-browser) capture of non-HTML resources.
|
||||
- Extensible Puppeteer driver script for customizing behavior per crawl or page.
|
||||
- Ability to create and reuse browser profiles with user/password login.
|
||||
|
|
|
@ -617,13 +617,14 @@ class Crawler {
|
|||
}
|
||||
|
||||
async extractLinks(page, selector = "a[href]") {
|
||||
let results = null;
|
||||
let results = [];
|
||||
|
||||
try {
|
||||
results = await page.evaluate((selector) => {
|
||||
await Promise.allSettled(page.frames().map(frame => frame.evaluate((selector) => {
|
||||
/* eslint-disable-next-line no-undef */
|
||||
return [...document.querySelectorAll(selector)].map(elem => elem.href);
|
||||
}, selector);
|
||||
}, selector))).then((linkResults) => {
|
||||
linkResults.forEach((linkResult) => {linkResult.value.forEach(link => results.push(link));});});
|
||||
} catch (e) {
|
||||
console.warn("Link Extraction failed", e);
|
||||
return;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue