Customizable extract selectors + typo fix (0.4.2) (#72)

* fix typo in setting crawler.capturePrefix which caused directFetchCapture() to fail, resulting in non-HTML URLs not being captured.
- wrap directFetchCapture() in a try/catch so the crawler falls back to browser-based loading if the direct fetch fails

* custom link extraction improvements (for #25)
- extractLinks() now returns a list of link URLs, allowing more flexibility in custom drivers
- rename queueUrls() to queueInScopeUrls() to indicate that scope filtering is performed
- loadPage() accepts a list of selector options {selector, extract, isAttribute} and defaults to {"a[href]", "href", false} (see the sketch below)
- tests: add test for custom driver which uses custom selector
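
For illustration, a custom driver using the new selector options might look like the sketch below (modeled on the tests/fixtures/driver-1.js fixture added in this commit; the extra "a[href]" entry and the commented filtering variant are illustrative only):

```js
// sketch of a custom driver passing custom selector options to loadPage()
module.exports = async ({data, page, crawler}) => {
  // load the page, then extract and queue both regular links and script URLs;
  // scope filtering is applied by queueInScopeUrls() inside loadPage()
  await crawler.loadPage(page, data, [
    {selector: "a[href]", extract: "href", isAttribute: false},
    {selector: "script[src]", extract: "src", isAttribute: false}
  ]);

  // alternatively, a driver can extract links itself and filter before queueing:
  // const links = await crawler.extractLinks(page, {selector: "a[href]", extract: "href", isAttribute: false});
  // crawler.queueInScopeUrls(data.seedId, links.filter((url) => !url.includes("/login")), data.depth);
};
```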

* tests
- tests: all tests use 'test-crawls' instead of 'crawls'
- consolidation: combine the initial crawl + rollover, combined WARC, and text tests into basic_crawl.test.js
- add custom driver test and fixture to test custom link extraction

* add to CHANGES, bump to 0.4.2
Ilya Kreymer 2021-07-23 18:31:43 -07:00 committed by GitHub
parent 36ac3cb905
commit 0e0b85d7c3
18 changed files with 167 additions and 108 deletions

View file

@@ -20,7 +20,7 @@ jobs:
- name: install requirements
run: yarn install
- name: run linter
run: yarn run eslint .
run: yarn lint
build:
@@ -40,12 +40,6 @@ jobs:
run: yarn install
- name: build docker
run: docker-compose build
- name: run crawl
run: docker-compose run crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2
- name: validate existing wacz
run: docker-compose run crawler wacz validate --file collections/wr-net/wr-net.wacz
- name: unzip wacz
run: sudo unzip crawls/collections/wr-net/wr-net.wacz -d crawls/collections/wr-net/wacz
- name: run jest
run: sudo yarn jest

View file

@@ -1,5 +1,12 @@
## CHANGES
v0.4.2
- Compose/docs: Build latest image by default, update README to refer to latest image
- Fix typo in `crawler.capturePrefix` that resulted in `directFetchCapture()` always failing
- Tests: Update all tests to use `test-crawls` directory
- extractLinks() just extracts links from default selectors, allows custom driver to filter results
- loadPage() accepts a list of selector options with selector, extract, and isAttribute settings for further customization of link extraction
v0.4.1
- BlockRules Optimizations: don't intercept requests if no blockRules
- Profile Creation: Support extending existing profile by passing a --profile param to load on startup

View file

@@ -28,7 +28,7 @@ const { parseArgs } = require("./util/argParser");
const { getBrowserExe, loadProfile } = require("./util/browser");
const { BEHAVIOR_LOG_FUNC, HTML_TYPES } = require("./util/constants");
const { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } = require("./util/constants");
const { BlockRules } = require("./util/blockrules");
@@ -67,7 +67,7 @@ class Crawler {
this.debugLog("Seeds", this.params.scopedSeeds);
this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
this.capturePrefix = this.captureBasePrerix + "/id_/";
this.capturePrefix = this.captureBasePrefix + "/id_/";
this.gotoOpts = {
waitUntil: this.params.waitUntil,
@@ -405,12 +405,16 @@ class Crawler {
}
}
async loadPage(page, urlData, selector = "a[href]") {
async loadPage(page, urlData, selectorOptsList = DEFAULT_SELECTORS) {
const {url, seedId, depth} = urlData;
if (!await this.isHTML(url)) {
try {
await this.directFetchCapture(url);
return;
} catch (e) {
// ignore failed direct fetch attempt, do browser-based capture
}
}
if (this.blockRules) {
@@ -423,36 +427,38 @@ class Crawler {
console.warn(`Load timeout for ${url}`, e);
}
if (selector) {
await this.extractLinks(page, seedId, depth, selector);
}
}
async extractLinks(page, seedId, depth, selector = "a[href]", prop = "href", isAttribute = false) {
const results = [];
const seed = this.params.scopedSeeds[seedId];
// skip extraction if at max depth
if (seed.isAtMaxDepth(depth)) {
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
return;
}
const loadProp = (selector, prop) => {
return [...document.querySelectorAll(selector)].map(elem => elem[prop]);
for (const opts of selectorOptsList) {
const links = await this.extractLinks(page, opts);
this.queueInScopeUrls(seedId, links, depth);
}
}
async extractLinks(page, {selector = "a[href]", extract = "href", isAttribute = false} = {}) {
const results = [];
const loadProp = (selector, extract) => {
return [...document.querySelectorAll(selector)].map(elem => elem[extract]);
};
const loadAttr = (selector, attr) => {
return [...document.querySelectorAll(selector)].map(elem => elem.getAttribute(attr));
const loadAttr = (selector, extract) => {
return [...document.querySelectorAll(selector)].map(elem => elem.getAttribute(extract));
};
const loadFunc = isAttribute ? loadAttr : loadProp;
try {
const linkResults = await Promise.allSettled(page.frames().map(frame => frame.evaluate(loadFunc, selector, prop)));
const linkResults = await Promise.allSettled(page.frames().map(frame => frame.evaluate(loadFunc, selector, extract)));
if (linkResults) {
for (const linkResult of linkResults) {
if (!linkResult.value) continue;
for (const link of linkResult.value) {
results.push(link);
}
@@ -461,12 +467,11 @@
} catch (e) {
console.warn("Link Extraction failed", e);
return;
}
this.queueUrls(seedId, results, depth);
return results;
}
queueUrls(seedId, urls, depth) {
queueInScopeUrls(seedId, urls, depth) {
try {
depth += 1;
const seed = this.params.scopedSeeds[seedId];
@@ -619,7 +624,7 @@ class Crawler {
try {
const { sites } = await sitemapper.fetch();
this.queueUrls(seedId, sites, 0);
this.queueInScopeUrls(seedId, sites, 0);
} catch(e) {
console.warn(e);
}

View file

@@ -1,12 +1,12 @@
{
"name": "browsertrix-crawler",
"version": "0.4.1",
"version": "0.4.2",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
"license": "MIT",
"scripts": {
"lint": "eslint *.js util/*.js"
"lint": "eslint *.js util/*.js tests/*.test.js"
},
"dependencies": {
"abort-controller": "^3.0.0",

tests/basic_crawl.test.js (new file)
View file

@@ -0,0 +1,63 @@
const child_process = require("child_process");
const fs = require("fs");
const path = require("path");
const md5 = require("md5");
test("ensure basic crawl run with docker run passes", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2");
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz");
child_process.execSync("unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz");
});
test("check that a combined warc file exists in the archive folder", () => {
const warcLists = fs.readdirSync("test-crawls/collections/wr-net");
var captureFound = 0;
for (var i = 0; i < warcLists.length; i++) {
if (warcLists[i].endsWith("_0.warc.gz")){
captureFound = 1;
}
}
expect(captureFound).toEqual(1);
});
test("check that a combined warc file is under the rolloverSize", () => {
const warcLists = fs.readdirSync(path.join("test-crawls/collections/wr-net/wacz", "archive"));
let rolloverSize = 0;
function getFileSize(filename) {
return fs.statSync(filename).size;
}
for (let i = 0; i < warcLists.length; i++) {
const size = getFileSize(path.join("test-crawls/collections/wr-net/wacz/archive/", warcLists[i]));
if (size < 10000){
rolloverSize = 1;
}
}
expect(rolloverSize).toEqual(1);
});
test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
expect(fs.existsSync("test-crawls/collections/wr-net/pages/pages.jsonl")).toBe(true);
});
test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
expect(fs.existsSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl")).toBe(true);
});
test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
const crawl_hash = md5(JSON.parse(fs.readFileSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
const wacz_hash = md5(JSON.parse(fs.readFileSync("test-crawls/collections/wr-net/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
const fixture_hash = md5(JSON.parse(fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1])["text"]);
expect(wacz_hash).toEqual(fixture_hash);
expect(wacz_hash).toEqual(crawl_hash);
});

View file

@ -6,7 +6,7 @@ test("check that the collection name is properly validated", async () => {
let passed = "";
try{
await exec("docker-compose run crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid");
await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid");
passed = true;
}
catch (error) {
@ -21,7 +21,7 @@ test("check that the collection name is not accepted if it doesn't meets our sta
let passed = "";
try{
await exec("docker-compose run crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid");
await exec("docker run webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid");
passed = true;
}
catch(e){

View file

@@ -1,13 +0,0 @@
const fs = require("fs");
test("check that a combined warc file exists in the archive folder", () => {
const warcLists = fs.readdirSync("crawls/collections/wr-net");
var captureFound = 0;
for (var i = 0; i < warcLists.length; i++) {
if (warcLists[i].endsWith("_0.warc.gz")){
captureFound = 1;
}
}
expect(captureFound).toEqual(1);
});

View file

@ -8,13 +8,13 @@ test("check yaml config file with seed list is used", async () => {
try{
await exec("docker-compose run -v $PWD/tests/fixtures:/tests/fixtures crawler crawl --collection configtest --config /tests/fixtures/crawl-1.yaml --depth 0");
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --config /tests/fixtures/crawl-1.yaml --depth 0");
}
catch (error) {
console.log(error);
}
const crawledPages = fs.readFileSync("crawls/collections/configtest/pages/pages.jsonl", "utf8");
const crawledPages = fs.readFileSync("test-crawls/collections/configtest/pages/pages.jsonl", "utf8");
const pages = new Set();
for (const line of crawledPages.trim().split("\n")) {
@ -36,7 +36,7 @@ test("check yaml config file with seed list is used", async () => {
}
expect(foundAllSeeds).toBe(true);
expect(fs.existsSync("crawls/collections/configtest/configtest.wacz")).toBe(true);
expect(fs.existsSync("test-crawls/collections/configtest/configtest.wacz")).toBe(true);
});
@ -45,22 +45,23 @@ test("check yaml config file will be overwritten by command line", async () => {
try{
await exec("docker-compose run -v $PWD/crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures crawler crawl --collection configtest --config /tests/fixtures/crawl-1.yaml --url https://www.example.com --timeout 20000");
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://www.example.com --timeout 20000");
}
catch (error) {
console.log(error);
}
const crawledPages = fs.readFileSync("crawls/collections/configtest/pages/pages.jsonl", "utf8");
const crawledPages = fs.readFileSync("test-crawls/collections/configtest-2/pages/pages.jsonl", "utf8");
const pages = new Set();
for (const line of crawledPages.trim().split("\n")) {
pages.add(JSON.parse(line).url);
const url = JSON.parse(line).url;
if (url) {
pages.add(url);
}
}
expect(pages.has("https://www.example.com/")).toBe(true);
expect(pages.size).toBe(1);
});

View file

@ -9,7 +9,7 @@ test("pass config file via stdin", async () => {
const config = yaml.load(configYaml);
try {
const proc = child_process.execSync("docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202", {input: configYaml, stdin: "inherit", encoding: "utf8"});
const proc = child_process.execSync("docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202", {input: configYaml, stdin: "inherit", encoding: "utf8"});
console.log(proc);
}
@ -17,7 +17,7 @@ test("pass config file via stdin", async () => {
console.log(error);
}
const crawledPages = fs.readFileSync("crawls/collections/config-stdin/pages/pages.jsonl", "utf8");
const crawledPages = fs.readFileSync("test-crawls/collections/config-stdin/pages/pages.jsonl", "utf8");
const pages = new Set();
for (const line of crawledPages.trim().split("\n")) {
@ -39,6 +39,6 @@ test("pass config file via stdin", async () => {
}
expect(foundAllSeeds).toBe(true);
expect(fs.existsSync("crawls/collections/config-stdin/config-stdin.wacz")).toBe(true);
expect(fs.existsSync("test-crawls/collections/config-stdin/config-stdin.wacz")).toBe(true);
});

View file

@@ -0,0 +1,34 @@
const child_process = require("child_process");
const fs = require("fs");
test("ensure custom driver with custom selector crawls JS files as pages", async () => {
jest.setTimeout(30000);
try {
child_process.execSync("docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.js");
}
catch (error) {
console.log(error);
}
const crawledPages = fs.readFileSync("test-crawls/collections/custom-driver-1/pages/pages.jsonl", "utf8");
const pages = new Set();
for (const line of crawledPages.trim().split("\n")) {
const url = JSON.parse(line).url;
if (!url) {
continue;
}
pages.add(url);
}
const expectedPages = new Set([
"https://www.iana.org/",
"https://www.iana.org/_js/2013.1/jquery.js",
"https://www.iana.org/_js/2013.1/iana.js"
]);
expect(pages).toEqual(expectedPages);
});

View file

@@ -1,5 +1,5 @@
name: crawl-test-1
collection: configtest
seeds:
- https://www.example.org
- https://www.iana.org/

tests/fixtures/driver-1.js (new file)
View file

@@ -0,0 +1,4 @@
module.exports = async ({data, page, crawler}) => {
await crawler.loadPage(page, data, [{selector: "script[src]", extract: "src", isAttribute: false}]);
};

View file

@@ -1,20 +0,0 @@
const fs = require("fs");
const path = require("path");
function getFileSize(filename) {
var stats = fs.statSync(filename);
return stats.size;
}
test("check that a combined warc file is under the rolloverSize", () => {
const warcLists = fs.readdirSync(path.join("crawls/collections/wr-net/wacz", "archive"));
var rolloverSize = 0;
for (var i = 0; i < warcLists.length; i++) {
var size = getFileSize(path.join("crawls/collections/wr-net/wacz/archive/", warcLists[i]));
if (size < 10000){
rolloverSize = 1;
}
}
expect(rolloverSize).toEqual(1);
});

View file

@@ -6,13 +6,13 @@ function getSeeds(config) {
const orig = fs.readFileSync;
fs.readFileSync = (name, ...args) => {
if (name.endsWith("/configtest")) {
if (name.endsWith("/stdinconfig")) {
return config;
}
return orig(name, ...args);
};
return parseArgs(["node", "crawler", "--config", "configtest"]).scopedSeeds;
return parseArgs(["node", "crawler", "--config", "stdinconfig"]).scopedSeeds;
}
test("default scope", async () => {

View file

@@ -1,22 +0,0 @@
const fs = require("fs");
const md5 = require("md5");
test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
expect(fs.existsSync("crawls/collections/wr-net/pages/pages.jsonl")).toBe(true);
});
test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
expect(fs.existsSync("crawls/collections/wr-net/wacz/pages/pages.jsonl")).toBe(true);
});
test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
const crawl_hash = md5(JSON.parse(fs.readFileSync("crawls/collections/wr-net/wacz/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
const wacz_hash = md5(JSON.parse(fs.readFileSync("crawls/collections/wr-net/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
const fixture_hash = md5(JSON.parse(fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1])["text"]);
expect(wacz_hash).toEqual(fixture_hash);
expect(wacz_hash).toEqual(crawl_hash);
});

View file

@ -7,13 +7,13 @@ test("check that all urls in a file list are crawled when the filelisturl param
try{
await exec("docker-compose run -v $PWD/tests/fixtures:/tests/fixtures crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000");
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000");
}
catch (error) {
console.log(error);
}
let crawled_pages = fs.readFileSync("crawls/collections/filelisttest/pages/pages.jsonl", "utf8");
let crawled_pages = fs.readFileSync("test-crawls/collections/filelisttest/pages/pages.jsonl", "utf8");
let seed_file = fs.readFileSync("tests/fixtures/urlSeedFile.txt", "utf8").split("\n").sort();
let seed_file_list = [];

View file

@ -7,7 +7,7 @@ test("check that the warcinfo file works as expected on the command line", async
try{
const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
const proc = child_process.execSync("docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC", {input: configYaml, stdin: "inherit", encoding: "utf8"});
const proc = child_process.execSync("docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC", {input: configYaml, stdin: "inherit", encoding: "utf8"});
console.log(proc);
}
@ -15,7 +15,7 @@ test("check that the warcinfo file works as expected on the command line", async
console.log(error);
}
const warcData = fs.readFileSync("crawls/collections/warcinfo/warcinfo_0.warc.gz");
const warcData = fs.readFileSync("test-crawls/collections/warcinfo/warcinfo_0.warc.gz");
const data = zlib.gunzipSync(warcData);

View file

@@ -3,3 +3,9 @@ module.exports.HTML_TYPES = ["text/html", "application/xhtml", "application/xhtm
module.exports.WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
module.exports.BEHAVIOR_LOG_FUNC = "__bx_log";
module.exports.DEFAULT_SELECTORS = [{
selector: "a[href]",
extract: "href",
isAttribute: false
}];