Extract links from all frames attached to a page, fixes #45 (#48)

Sebastian Nagel 2021-04-30 17:41:00 +02:00 committed by GitHub
parent 9293375790
commit 9d577dac57
2 changed files with 5 additions and 4 deletions

README.md

@@ -8,7 +8,7 @@ and [puppeteer](https://github.com/puppeteer/puppeteer) to control one or more b
 Thus far, Browsertrix Crawler supports:
 - Single-container, browser based crawling with multiple headless/headful browsers.
-- Support for custom browser behaviors, ysing [Browsertix Behaviors](https://github.com/webrecorder/browsertrix-behaviors) including autoscroll, video autoplay and site-specific behaviors.
+- Support for custom browser behaviors, using [Browsertix Behaviors](https://github.com/webrecorder/browsertrix-behaviors) including autoscroll, video autoplay and site-specific behaviors.
 - Optimized (non-browser) capture of non-HTML resources.
 - Extensible Puppeteer driver script for customizing behavior per crawl or page.
 - Ability to create and reuse browser profiles with user/password login

crawler.js

@@ -617,13 +617,14 @@ class Crawler {
   }
 
   async extractLinks(page, selector = "a[href]") {
-    let results = null;
+    let results = [];
 
     try {
-      results = await page.evaluate((selector) => {
+      await Promise.allSettled(page.frames().map(frame => frame.evaluate((selector) => {
         /* eslint-disable-next-line no-undef */
         return [...document.querySelectorAll(selector)].map(elem => elem.href);
-      }, selector);
+      }, selector))).then((linkResults) => {
+        linkResults.forEach((linkResult) => {linkResult.value.forEach(link => results.push(link));});});
     } catch (e) {
       console.warn("Link Extraction failed", e);
       return;
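
For context, here is a minimal standalone sketch of the multi-frame pattern this commit introduces, written against Puppeteer's public API (page.frames() and frame.evaluate()); the collectLinks wrapper name and the explicit status check are illustrative, not code from the crawler:

async function collectLinks(page, selector = "a[href]") {
  const results = [];
  // page.frames() returns the main frame plus all attached child frames,
  // so links inside iframes are collected, not just those on the top-level page.
  const settled = await Promise.allSettled(page.frames().map(frame =>
    frame.evaluate((selector) => {
      return [...document.querySelectorAll(selector)].map(elem => elem.href);
    }, selector)
  ));
  // Promise.allSettled never rejects: a frame that fails to evaluate
  // (e.g. because it detached during navigation) yields a "rejected"
  // entry here, which is skipped instead of aborting the whole extraction.
  for (const result of settled) {
    if (result.status === "fulfilled") {
      results.push(...result.value);
    }
  }
  return results;
}

One difference worth noting: the committed code reads linkResult.value without checking status, so a rejected entry (whose value is undefined) would make the forEach throw; it appears the surrounding try/catch is what absorbs that case and logs the "Link Extraction failed" warning.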