Extract links from all frames attached to a page, fixes #45 (#48)

This commit is contained in:
Sebastian Nagel 2021-04-30 17:41:00 +02:00 committed by GitHub
parent 9293375790
commit 9d577dac57
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 5 additions and 4 deletions

View file

@ -8,7 +8,7 @@ and [puppeteer](https://github.com/puppeteer/puppeteer) to control one or more b
Thus far, Browsertrix Crawler supports: Thus far, Browsertrix Crawler supports:
- Single-container, browser based crawling with multiple headless/headful browsers. - Single-container, browser based crawling with multiple headless/headful browsers.
- Support for custom browser behaviors, ysing [Browsertrix Behaviors](https://github.com/webrecorder/browsertrix-behaviors) including autoscroll, video autoplay and site-specific behaviors. - Support for custom browser behaviors, using [Browsertrix Behaviors](https://github.com/webrecorder/browsertrix-behaviors) including autoscroll, video autoplay and site-specific behaviors.
- Optimized (non-browser) capture of non-HTML resources. - Optimized (non-browser) capture of non-HTML resources.
- Extensible Puppeteer driver script for customizing behavior per crawl or page. - Extensible Puppeteer driver script for customizing behavior per crawl or page.
- Ability to create and reuse browser profiles with user/password login - Ability to create and reuse browser profiles with user/password login

View file

@ -617,13 +617,14 @@ class Crawler {
} }
async extractLinks(page, selector = "a[href]") { async extractLinks(page, selector = "a[href]") {
let results = null; let results = [];
try { try {
results = await page.evaluate((selector) => { await Promise.allSettled(page.frames().map(frame => frame.evaluate((selector) => {
/* eslint-disable-next-line no-undef */ /* eslint-disable-next-line no-undef */
return [...document.querySelectorAll(selector)].map(elem => elem.href); return [...document.querySelectorAll(selector)].map(elem => elem.href);
}, selector); }, selector))).then((linkResults) => {
linkResults.forEach((linkResult) => {linkResult.value.forEach(link => results.push(link));});});
} catch (e) { } catch (e) {
console.warn("Link Extraction failed", e); console.warn("Link Extraction failed", e);
return; return;