Extract links from all frames attached to a page, fixes #45 (#48)

Sebastian Nagel 2021-04-30 17:41:00 +02:00 committed by GitHub
parent 9293375790
commit 9d577dac57
2 changed files with 5 additions and 4 deletions

README.md

@@ -8,7 +8,7 @@ and [puppeteer](https://github.com/puppeteer/puppeteer) to control one or more b
 Thus far, Browsertrix Crawler supports:
 - Single-container, browser based crawling with multiple headless/headful browsers.
-- Support for custom browser behaviors, ysing [Browsertix Behaviors](https://github.com/webrecorder/browsertrix-behaviors) including autoscroll, video autoplay and site-specific behaviors.
+- Support for custom browser behaviors, using [Browsertix Behaviors](https://github.com/webrecorder/browsertrix-behaviors) including autoscroll, video autoplay and site-specific behaviors.
 - Optimized (non-browser) capture of non-HTML resources.
 - Extensible Puppeteer driver script for customizing behavior per crawl or page.
 - Ability to create and reuse browser profiles with user/password login

crawler.js

@@ -617,13 +617,14 @@ class Crawler {
   }
 
   async extractLinks(page, selector = "a[href]") {
-    let results = null;
+    let results = [];
 
     try {
-      results = await page.evaluate((selector) => {
+      await Promise.allSettled(page.frames().map(frame => frame.evaluate((selector) => {
         /* eslint-disable-next-line no-undef */
         return [...document.querySelectorAll(selector)].map(elem => elem.href);
-      }, selector);
+      }, selector))).then((linkResults) => {
+        linkResults.forEach((linkResult) => {linkResult.value.forEach(link => results.push(link));});});
     } catch (e) {
       console.warn("Link Extraction failed", e);
       return;
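
For context, here is a minimal standalone sketch of the multi-frame pattern this commit introduces, written against Puppeteer's public API (page.frames() and frame.evaluate()); the collectLinks wrapper name and the explicit status check are illustrative, not code from the crawler:

async function collectLinks(page, selector = "a[href]") {
  const results = [];
  // page.frames() returns the main frame plus all attached child frames,
  // so links inside iframes are collected, not just those on the top-level page.
  const settled = await Promise.allSettled(page.frames().map(frame =>
    frame.evaluate((selector) => {
      return [...document.querySelectorAll(selector)].map(elem => elem.href);
    }, selector)
  ));
  // Promise.allSettled never rejects: a frame that fails to evaluate
  // (e.g. because it detached during navigation) yields a "rejected"
  // entry here, which is skipped instead of aborting the whole extraction.
  for (const result of settled) {
    if (result.status === "fulfilled") {
      results.push(...result.value);
    }
  }
  return results;
}

One difference worth noting: the committed code reads linkResult.value without checking status, so a rejected entry (whose value is undefined) would make the forEach throw; it appears the surrounding try/catch is what absorbs that case and logs the "Link Extraction failed" warning.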