factor out behaviors to browsertrix-behaviors: (#32)

- inject built 'behaviors.js' from browsertrix-behaviors, init with options and run
- remove bgbehaviors
- move textextract to root for now
- add requirements.txt for python dependencies
- remove obsolete --scroll option; it is now part of the behaviors system

logging:
- configure logging options via --logging param, can include 'stats' (default), 'pywb', 'behaviors', and 'behaviors-debug'
- inject custom logging function for behaviors to call if either behaviors or behaviors-debug is set
- 'behaviors-debug' prints all debug messages from behaviors, while regular 'behaviors' prints main behavior messages (useful for verification)

dockerfile: add 'rebuild' arg to facilitate rebuilding the image from a specific step

bump to 0.3.0-beta.0
This commit is contained in:
Ilya Kreymer 2021-03-13 16:48:31 -08:00 committed by GitHub
parent 9ef3f25416
commit bc7f1badf3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 3541 additions and 4042 deletions

View file

@ -4,6 +4,8 @@ FROM oldwebtoday/chrome:${BROWSER_VERSION} as chrome
FROM nikolaik/python-nodejs:python3.8-nodejs14
RUN curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add -
RUN apt-get update -y \
&& apt-get install --no-install-recommends -qqy fonts-stix locales-all redis-server xvfb \
&& apt-get clean \
@ -24,18 +26,21 @@ COPY --from=chrome /app/libpepflashplayer.so /app/libpepflashplayer.so
RUN dpkg -i /deb/*.deb; apt-get update; apt-get install -fqqy && \
rm -rf /var/lib/opts/lists/*
RUN pip install pywb>=2.5.0 uwsgi wacz
WORKDIR /app
ADD requirements.txt /app/
RUN pip install -r requirements.txt
ADD package.json /app/
# to allow forcing rebuilds from this stage
ARG REBUILD
RUN yarn install
ADD config.yaml /app/
ADD uwsgi.ini /app/
ADD *.js /app/
ADD behaviors/ /app/behaviors/
RUN ln -s /app/main.js /usr/bin/crawl

View file

@ -1,44 +0,0 @@
const AutoPlayBehavior = require("./global/autoplay");
const AutoFetchBehavior = require("./global/autofetcher");
const AutoScrollBehavior = require("./global/autoscroll");
// ===========================================================================
// Runs the enabled background behaviors (auto-fetch, auto-play, auto-scroll)
// for each crawled page.
class BackgroundBehaviors
{
  /**
   * Record which behaviors are enabled.
   * @param {string[]} bgbehaviors - behavior names, e.g. ["auto-fetch", "auto-play"]
   */
  constructor(bgbehaviors) {
    const enabled = new Set(bgbehaviors);
    this.doAutoFetch = enabled.has("auto-fetch");
    this.doAutoPlay = enabled.has("auto-play");
    this.doAutoScroll = enabled.has("auto-scroll");
  }

  /**
   * Instantiate the enabled behaviors and run their beforeLoad hooks.
   * Setup errors are logged and do not abort the crawl.
   * @returns {Function} thunk that runs all afterLoad hooks
   */
  async setup(page, crawler) {
    const active = [];

    try {
      if (this.doAutoFetch) {
        active.push(new AutoFetchBehavior());
      }
      if (this.doAutoPlay) {
        active.push(new AutoPlayBehavior());
      }
      if (this.doAutoScroll) {
        active.push(new AutoScrollBehavior());
      }

      await Promise.all(active.map((behavior) => behavior.beforeLoad(page, crawler)));
    } catch (err) {
      // best-effort: a failed behavior setup should not stop the page crawl
      console.log(err);
    }

    return () => Promise.all(active.map((behavior) => behavior.afterLoad(page, crawler)));
  }
}

module.exports = BackgroundBehaviors;

View file

@ -1,233 +0,0 @@
// AutoFetcher script
// extract and fetch all urls from srcsets, from images as well as audio/video
// also extract any urls from media query stylesheets that have not necessarily been loaded
// (May not work for cross-origin stylesheets)
function autofetcher() {
  // Elements whose src/srcset (or lazy-load data-src/data-srcset) attributes
  // may reference resources the browser has not fetched yet.
  const SRC_SET_SELECTOR = 'img[srcset], img[data-srcset], img[data-src], ' +
    'video[srcset], video[data-srcset], video[data-src], audio[srcset], audio[data-srcset], audio[data-src], ' +
    'picture > source[srcset], picture > source[data-srcset], picture > source[data-src], ' +
    'video > source[srcset], video > source[data-srcset], video > source[data-src], ' +
    'audio > source[srcset], audio > source[data-srcset], audio > source[data-src]';

  // splits a srcset attribute value into its individual candidate entries
  const SRCSET_REGEX = /\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/; // eslint-disable-line no-useless-escape

  // match url(...) references and @import statements in CSS text
  const STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
  const IMPORT_REGEX = /(@import\s*[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;

  // ===========================================================================
  // Discovers resource URLs in the live DOM (srcset/data-src attributes and
  // media-query stylesheet rules) and fetches them so they are captured,
  // even if the browser itself would never load them.
  class AutoFetcher
  {
    constructor() {
      this.urlSet = new Set();  // URLs already queued, for dedup
      this.urlqueue = [];       // URLs waiting to be fetched
      this.numPending = 0;      // fetches currently in flight
    }

    // Run a first extraction pass once the page has loaded, then keep
    // watching for DOM changes.
    init() {
      console.log("init autofetch");
      window.addEventListener("load", () => {
        this.run();
        this.initObserver();
      });
    }

    // Extract once per crawler phase: loops while the crawler exposes a
    // __crawler_nextPhase hook, otherwise performs a single pass.
    async run() {
      /*eslint no-constant-condition: ["error", { "checkLoops": false }]*/
      while (true) {
        this.extractSrcSrcSetAll(document);
        this.extractStyleSheets();
        if (window.__crawler_nextPhase) {
          await window.__crawler_nextPhase();
        } else {
          break;
        }
      }
    }

    isValidUrl(url) {
      return url && (url.startsWith("http:") || url.startsWith("https:"));
    }

    // Resolve a URL against the document base, validate and dedupe it,
    // then schedule it for fetching.
    queueUrl(url) {
      try {
        url = new URL(url, document.baseURI).href;
      } catch (e) {
        return;
      }

      if (!this.isValidUrl(url)) {
        return;
      }

      if (this.urlSet.has(url)) {
        return;
      }

      this.urlSet.add(url);

      this.doFetch(url);
    }

    // Fetch queued URLs serially; the numPending check limits how many
    // concurrent drain loops can be running at once.
    async doFetch(url) {
      this.urlqueue.push(url);
      if (this.numPending <= 6) {
        while (this.urlqueue.length > 0) {
          // distinct name: avoid shadowing the 'url' parameter
          const nextUrl = this.urlqueue.shift();
          try {
            this.numPending++;
            console.log("AutoFetching: " + nextUrl);
            const resp = await fetch(nextUrl);
            // consume the body so the request fully completes
            await resp.blob();
          } catch (e) {
            console.log(e);
          }
          this.numPending--;
        }
      }
    }

    // Watch the whole document for added nodes and srcset attribute changes.
    initObserver() {
      this.mutobz = new MutationObserver((changes) => this.observeChange(changes));

      this.mutobz.observe(document.documentElement, {
        characterData: false,
        characterDataOldValue: false,
        attributes: true,
        attributeOldValue: true,
        subtree: true,
        childList: true,
        attributeFilter: ['srcset']
      });
    }

    // Dispatch on node type: changed srcset attribute, inline <style> text,
    // or a (possibly newly added) element subtree.
    processChangedNode(target) {
      switch (target.nodeType) {
        case Node.ATTRIBUTE_NODE:
          if (target.nodeName === "srcset") {
            this.extractSrcSetAttr(target.nodeValue);
          }
          break;

        case Node.TEXT_NODE:
          if (target.parentNode && target.parentNode.tagName === "STYLE") {
            this.extractStyleText(target.nodeValue);
          }
          break;

        case Node.ELEMENT_NODE:
          if (target.sheet) {
            this.extractStyleSheet(target.sheet);
          }
          this.extractSrcSrcSet(target);
          // re-scan descendants after a delay, in case they are populated lazily
          setTimeout(() => this.extractSrcSrcSetAll(target), 1000);
          break;
      }
    }

    observeChange(changes) {
      for (const change of changes) {
        this.processChangedNode(change.target);

        if (change.type === "childList") {
          for (const node of change.addedNodes) {
            this.processChangedNode(node);
          }
        }
      }
    }

    // Extract from every element matching SRC_SET_SELECTOR under root.
    extractSrcSrcSetAll(root) {
      const elems = root.querySelectorAll(SRC_SET_SELECTOR);

      for (const elem of elems) {
        //console.log(elem);
        this.extractSrcSrcSet(elem);
      }
    }

    // Queue an element's src/data-src URL and parse its srcset/data-srcset.
    extractSrcSrcSet(elem) {
      if (!elem || elem.nodeType !== Node.ELEMENT_NODE) {
        console.warn("No elem to extract from");
        return;
      }

      const src = elem.src || elem.getAttribute("data-src");

      if (src) {
        this.queueUrl(src);
      }

      const srcset = elem.srcset || elem.getAttribute("data-srcset");

      if (srcset) {
        this.extractSrcSetAttr(srcset);
      }
    }

    // Queue the URL portion of each srcset candidate ("url 2x" -> "url").
    extractSrcSetAttr(srcset) {
      for (const v of srcset.split(SRCSET_REGEX)) {
        if (v) {
          const parts = v.trim().split(" ");
          this.queueUrl(parts[0]);
        }
      }
    }

    extractStyleSheets(root) {
      root = root || document;

      for (const sheet of root.styleSheets) {
        this.extractStyleSheet(sheet);
      }
    }

    // Extract URLs from @media rules; cross-origin sheets may be unreadable
    // and are skipped with a log message.
    extractStyleSheet(sheet) {
      let rules;

      try {
        rules = sheet.cssRules || sheet.rules;
      } catch (e) {
        console.log("Can't access stylesheet");
        return;
      }

      for (const rule of rules) {
        if (rule.type === CSSRule.MEDIA_RULE) {
          this.extractStyleText(rule.cssText);
        }
      }
    }

    // Queue every url(...) and @import target found in the CSS text.
    extractStyleText(text) {
      const urlExtractor = (m, n1, n2, n3) => {
        this.queueUrl(n2);
        return n1 + n2 + n3;
      };

      text.replace(STYLE_REGEX, urlExtractor).replace(IMPORT_REGEX, urlExtractor);
    }
  }

  new AutoFetcher().init();
}
// ===========================================================================
// Wraps the in-page autofetcher() script for use by the crawler.
class AutoFetchBehavior
{
  // Serialize autofetcher() as an IIFE and inject it into every new document.
  async beforeLoad(page) {
    const injected = `(${autofetcher.toString()})();`;
    await page.evaluateOnNewDocument(injected);
  }

  // All work happens in-page via the injected script; nothing to do after load.
  async afterLoad() {
  }
}

module.exports = AutoFetchBehavior;

View file

@ -1,139 +0,0 @@
// ===========================================================================
// Injected into each page: rewrites known media-embed URLs (SoundCloud,
// Vimeo, YouTube) into their autoplay form, and periodically reports or
// plays <video>/<audio> elements found on the page.
function autoplay() {
  function run() {
    // only attempt the autoplay URL rewrite once per document
    if (self.navigator.__crawler_autoplay) {
      return;
    }

    function loadAutoplay(url) {
      if (self.__crawler_autoplayLoad) {
        self.__crawler_autoplayLoad(url);
      }

      // delay to allow splash image to load
      setTimeout(() => self.location.href = url, 1000);
    }

    //console.log("checking autoplay for " + document.location.href);

    self.navigator.__crawler_autoplay = true;

    // Per-site rules: 'rx' matches the embed host, 'check' tells whether the
    // URL already requests autoplay, 'handle' rewrites and reloads it.
    const specialActions = [
      {
        rx: /w\.soundcloud\.com/,
        check(url) {
          const autoplay = url.searchParams.get('auto_play');
          return autoplay === 'true';
        },
        handle(url) {
          url.searchParams.set('auto_play', 'true');
          // set continuous_play to true in order to handle
          // a playlist etc
          url.searchParams.set('continuous_play', 'true');
          loadAutoplay(url.href);
        },
      },
      {
        rx: [/player\.vimeo\.com/, /youtube(?:-nocookie)?\.com\/embed\//],
        check(url) {
          const autoplay = url.searchParams.get('autoplay');
          return autoplay === '1';
        },
        handle(url) {
          url.searchParams.set('autoplay', '1');
          loadAutoplay(url.href);
        },
      },
    ];

    const url = new URL(self.location.href);

    for (const action of specialActions) {
      // normalize: 'rx' may be a single regex or a list of regexes
      const patterns = Array.isArray(action.rx) ? action.rx : [action.rx];

      for (const rx of patterns) {
        if (url.href.search(rx) >= 0) {
          // already in autoplay form: nothing to do
          if (action.check(url)) {
            return;
          }
          return action.handle(url);
        }
      }
    }
  }

  self.document.addEventListener("readystatechange", run);

  if (self.document.readyState === "complete") {
    run();
  }

  // Periodically report newly seen media source URLs back to the crawler,
  // and call play() on media elements that have no src attribute set.
  const mediaSet = new Set();

  setInterval(() => {
    for (const media of self.document.querySelectorAll("video, audio")) {
      try {
        if (media.src && !mediaSet.has(media.src)) {
          if (self.__crawler_queueUrls && (media.src.startsWith("http:") || media.src.startsWith("https:"))) {
            self.__crawler_queueUrls(media.src);
          }
          mediaSet.add(media.src);
        } else if (!media.src) {
          media.play();
        }
      } catch(e) {
        console.log(e);
      }
    }
  }, 3000);
}
// ===========================================================================
// Puppeteer-side driver for the in-page autoplay() script: captures media
// URLs reported by the page and waits for video loading when needed.
class AutoPlayBehavior
{
  constructor() {
    this.mediaPromises = [];   // pending direct captures of reported media URLs
    this.waitForVideo = false; // set when an autoplay URL is being loaded
  }

  /**
   * Expose the callbacks used by the injected autoplay() script, then
   * inject the script to run in every new document. Errors are logged
   * and do not abort the crawl.
   * @param {object} page - puppeteer page
   * @param {object} crawler - crawler providing directFetchCapture()
   */
  async beforeLoad(page, crawler) {
    try {
      await page.exposeFunction("__crawler_queueUrls", async (url) => {
        this.mediaPromises.push(crawler.directFetchCapture(url));
      });

      await page.exposeFunction("__crawler_autoplayLoad", (url) => {
        console.log("*** Loading autoplay URL: " + url);
        this.waitForVideo = true;
      });

      const iife = `(${autoplay.toString()})();`;
      await page.evaluateOnNewDocument(iife);
    } catch(err) {
      console.log(err);
    }
  }

  /**
   * Wait for all queued media captures to finish, plus an extra 15s for
   * video loading if an autoplay URL was navigated to.
   */
  async afterLoad(page, crawler) {
    try {
      await Promise.all(this.mediaPromises);
    } catch (e) {
      console.log("Error loading media URLs", e);
    }

    if (this.waitForVideo) {
      console.log("Extra wait 15s for video loading");
      await crawler.sleep(15000);
    }
  }
}

module.exports = AutoPlayBehavior;

View file

@ -1,37 +0,0 @@
// Scroll the page downward in 250px steps every 500ms until the viewport
// bottom reaches the bottom of the document. Runs in the page context.
async function autoScroll() {
  // total scrollable height: the largest of the body/documentElement metrics
  const pageHeight = () =>
    Math.max(
      self.document.body.scrollHeight,
      self.document.body.offsetHeight,
      self.document.documentElement.clientHeight,
      self.document.documentElement.scrollHeight,
      self.document.documentElement.offsetHeight
    );

  while (self.scrollY + self.innerHeight < pageHeight()) {
    self.scrollBy({ top: 250, left: 0, behavior: "auto" });
    await new Promise((resolve) => setTimeout(resolve, 500));
  }
}
// ===========================================================================
// Runs the autoScroll() script in the page after load, bounded by a timeout.
class AutoScrollBehavior
{
  // Nothing to inject before navigation; scrolling happens after load.
  async beforeLoad() {
  }

  /**
   * Scroll the page to the bottom, giving up after 30 seconds.
   * Failures are logged and do not abort the crawl.
   */
  async afterLoad(page, crawler) {
    try {
      const deadline = crawler.sleep(30000);
      await Promise.race([page.evaluate(autoScroll), deadline]);
    } catch (e) {
      console.warn("Autoscroll Behavior Failed", e);
    }
  }
}

module.exports = AutoScrollBehavior;

View file

@ -7,13 +7,15 @@ const path = require("path");
const fs = require("fs");
const Sitemapper = require("sitemapper");
const { v4: uuidv4 } = require("uuid");
const TextExtract = require("./behaviors/global/textextract");
const BackgroundBehaviors = require("./behaviors/bgbehaviors");
const TextExtract = require("./textextract");
const behaviors = fs.readFileSync("/app/node_modules/browsertrix-behaviors/dist/behaviors.js", "utf-8");
const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
const BEHAVIOR_LOG_FUNC = "__bx_log";
const CHROME_PATH = "google-chrome";
// to ignore HTTPS error for HEAD check
@ -39,10 +41,8 @@ class Crawler {
// was the limit hit?
this.limitHit = false;
this.monitor = true;
this.userAgent = "";
this.headers = {};
this.behaviorsLogDebug = false;
const params = require("yargs")
.usage("browsertrix-crawler [options]")
@ -64,9 +64,6 @@ class Crawler {
// pages file
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
// background behaviors
this.bgbehaviors = new BackgroundBehaviors(this.params.bgbehaviors || []);
}
configureUA() {
@ -108,7 +105,7 @@ class Crawler {
bootstrap() {
let opts = {}
if (this.params.pywb_log) {
if (this.params.logging.includes("pywb")) {
opts = {stdio: "inherit", cwd: this.params.cwd};
}
else{
@ -189,12 +186,6 @@ class Crawler {
describe: "Regex of page URLs that should be excluded from the crawl."
},
"scroll": {
describe: "If set, will autoscroll to bottom of the page",
type: "boolean",
default: false,
},
"collection": {
alias: "c",
describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
@ -228,12 +219,12 @@ class Crawler {
default: false,
},
"pywb-log": {
describe: "If set, generate pywb log file",
type: "boolean",
default: false,
"logging": {
describe: "Logging options for crawler, can include: stats, pywb, behaviors",
type: "string",
default: "stats",
},
"text": {
describe: "If set, extract text to the pages.jsonl file",
type: "boolean",
@ -269,9 +260,9 @@ class Crawler {
describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
},
"bgbehaviors": {
"behaviors": {
describe: "Which background behaviors to enable on each page",
default: "auto-play,auto-fetch",
default: "autoplay,autofetch,siteSpecific",
type: "string",
},
};
@ -313,8 +304,19 @@ class Crawler {
}
}
// log options
argv.logging = argv.logging.split(",");
// background behaviors to apply
argv.bgbehaviors = argv.bgbehaviors.split(",");
const behaviorOpts = {};
argv.behaviors.split(",").forEach((x) => behaviorOpts[x] = true);
if (argv.logging.includes("behaviors")) {
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
} else if (argv.logging.includes("behaviors-debug")) {
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
this.behaviorsLogDebug = true;
}
this.behaviorOpts = JSON.stringify(behaviorOpts);
if (!argv.newContext) {
argv.newContext = "page";
@ -421,15 +423,33 @@ class Crawler {
await page.emulate(this.emulateDevice);
}
const bgbehavior = await this.bgbehaviors.setup(page, this);
if (this.behaviorOpts) {
await page.exposeFunction(BEHAVIOR_LOG_FUNC, ({data, type}) => {
switch (type) {
case "info":
console.log(JSON.stringify(data));
break;
case "debug":
default:
if (this.behaviorsLogDebug) {
console.log("behavior debug: " + JSON.stringify(data));
}
}
});
await page.evaluateOnNewDocument(behaviors + `
self.__bx_behaviors.init(${this.behaviorOpts});
`);
}
// run custom driver here
await this.driver({page, data, crawler: this});
const title = await page.title();
var text = ''
if (this.params.text){
let text = '';
if (this.params.text) {
const client = await page.target().createCDPSession();
const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
text = await new TextExtract(result).parseTextFromDom();
@ -437,8 +457,8 @@ class Crawler {
this.writePage(data.url, title, this.params.text, text);
if (bgbehavior) {
await bgbehavior();
if (this.behaviorOpts) {
await Promise.allSettled(page.frames().map(frame => frame.evaluate("self.__bx_behaviors.run();")));
}
this.writeStats();
@ -464,7 +484,7 @@ class Crawler {
timeout: this.params.timeout * 2,
puppeteerOptions: this.puppeteerArgs,
puppeteer,
monitor: this.monitor
monitor: this.params.logging.includes("stats")
});
this.cluster.task((opts) => this.crawlPage(opts));
@ -669,8 +689,8 @@ class Crawler {
});
if (resp.status >= 400) {
console.log(`Skipping ${url}, invalid status ${resp.status}`);
return false;
console.log(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
return true;
}
const contentType = resp.headers.get("Content-Type");

View file

@ -1,5 +1,3 @@
//const autoplayScript = require("/app/autoplay.js");
/* eslint-disable no-undef */
module.exports = async ({data, page, crawler}) => {

View file

@ -2,7 +2,7 @@ version: '3.5'
services:
crawler:
image: webrecorder/browsertrix-crawler:0.2.1-beta.0
image: webrecorder/browsertrix-crawler:0.3.0-beta.0
build:
context: ./

View file

@ -1,12 +1,13 @@
{
"name": "browsertrix-crawler",
"version": "0.2.1-beta.0",
"version": "0.3.0-beta.0",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
"license": "MIT",
"dependencies": {
"abort-controller": "^3.0.0",
"browsertrix-behaviors": "github:webrecorder/browsertrix-behaviors",
"node-fetch": "^2.6.1",
"puppeteer-cluster": "^0.22.0",
"puppeteer-core": "^5.3.1",

3
requirements.txt Normal file
View file

@ -0,0 +1,3 @@
pywb>=2.5.0
uwsgi
wacz>=0.2.1

7027
yarn.lock

File diff suppressed because it is too large Load diff