Scope Handling Improvements + Tests (#66)

* scope fixes:
  - remove default prefix scopeType, ensure scope include and exclude take precedence
  - add new 'custom' scopeType, when include or exclude are used
  - use --scopeIncludeRx and --scopeExcludeRx for better consistency for scope include and exclude (also allow --include/--exclude)
  - ensure per-seed scope include/exclude are used when present, and scopeType is set to 'custom'
  - ensure default scope is set to 'prefix' if no scopeType and no include/exclude regexes are specified
  - rename --type to --scopeType in seed to maintain consistency
  - add sitemap param as alias for useSitemap

  tests:
  - add seed scope resolution tests for argParse, testing per-scope seed resolution, inheritance and overrides
  - fix screencaster to use relative paths to work with tests
  - ci: use yarn instead of npm

* update README with new flags

* bump version to 0.4.0-beta.3
2025-10-19 14:33:17 +00:00 · 2021-07-06 20:22:27 -07:00 · 2021-07-06 20:22:27 -07:00 · 473de8c49f
commit 473de8c49f
parent ef7d5e50d8
9 changed files with 216 additions and 36 deletions
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -18,7 +18,7 @@ jobs:
      with:
        node-version: ${{ matrix.node-version }}
    - name: install requirements
-      run: npm install
+      run: yarn install
    - name: run linter
      run: yarn run eslint .
  
@@ -37,7 +37,7 @@ jobs:
      with:
        node-version: ${{ matrix.node-version }}
    - name: install requirements
-      run: npm install
+      run: yarn install
    - name: build docker
      run: docker-compose build
    - name: run crawl
--- a/README.md
+++ b/README.md
@@ -73,13 +73,15 @@ Options:
                                                           [number] [default: 0]
      --timeout                             Timeout for each page to load (in
                                            seconds)      [number] [default: 90]
-      --scope                               Regex of page URLs that should be
+      --scopeType                           Predefined for which URLs to crawl,
+                                            can be: prefix, page, host, any, or
+                                            custom, to use the
+                                            scopeIncludeRx/scopeExcludeRx
+                                                                        [string]
+      --scopeIncludeRx, --include           Regex of page URLs that should be
                                            included in the crawl (defaults to
                                            the immediate directory of URL)
-      --scopeType                           Simplified scope for which URLs to
-                                            crawl, can be: prefix, page, host,
-                                            any     [string] [default: "prefix"]
-      --exclude                             Regex of page URLs that should be
+      --scopeExcludeRx, --exclude           Regex of page URLs that should be
                                            excluded from the crawl.
      --allowHashUrls                       Allow Hashtag URLs, useful for
                                            single-page-application crawling or
@@ -88,7 +90,7 @@ Options:
  -c, --collection                          Collection name to crawl to (replay
                                            will be accessible under this name
                                            in pywb preview)
-                               [string] [default: "capture-2021-06-26T19-38-10"]
+                               [string] [default: "capture-2021-07-01T23-43-26"]
      --headless                            Run in headless mode, otherwise
                                            start xvfb[boolean] [default: false]
      --driver                              JS driver for the crawler
@@ -122,7 +124,7 @@ Options:
      --userAgentSuffix                     Append suffix to existing browser
                                            user-agent (ex: +MyCrawler,
                                            info@example.com)           [string]
-      --useSitemap                          If enabled, check for sitemaps at
+      --useSitemap, --sitemap               If enabled, check for sitemaps at
                                            /sitemap.xml, or custom URL if URL
                                            is specified
      --statsFilename                       If set, output stats as JSON to this
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,7 +2,7 @@ version: '3.5'
  
 services:
    crawler:
-        image: webrecorder/browsertrix-crawler:0.4.0-beta.2
+        image: webrecorder/browsertrix-crawler:0.4.0-beta.3
        build:
          context: ./

--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "browsertrix-crawler",
-  "version": "0.4.0-beta.2",
+  "version": "0.4.0-beta.3",
  "main": "browsertrix-crawler",
  "repository": "https://github.com/webrecorder/browsertrix-crawler",
  "author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
--- a/tests/config_stdin.test.js
+++ b/tests/config_stdin.test.js
@@ -10,7 +10,7 @@ test("pass config file via stdin", async () => {

  try {
    const version = require("../package.json").version;
-    const proc = child_process.execSync(`docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler:${version} crawl --config stdin --exclude webrecorder.net/202`, {input: configYaml, stdin: "inherit", encoding: "utf8"});
+    const proc = child_process.execSync(`docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler:${version} crawl --config stdin --scopeExcludeRx webrecorder.net/202`, {input: configYaml, stdin: "inherit", encoding: "utf8"});

    console.log(proc);
  }
--- a/tests/scopes.test.js
+++ b/tests/scopes.test.js
@@ -0,0 +1,166 @@
+const { parseArgs } = require("../util/argParser");
+
+const fs = require("fs");
+
+function getSeeds(config) {
+  const orig = fs.readFileSync;
+
+  fs.readFileSync = (name, ...args) => {
+    if (name.endsWith("/configtest")) {
+      return config;
+    }
+    return orig(name, ...args);
+  };
+
+  return parseArgs(null, ["node", "crawler", "--config", "configtest"]).scopedSeeds;
+}
+
+test("default scope", async () => {
+  const seeds = getSeeds(`
+seeds:
+   - https://example.com/
+
+`);
+
+
+  expect(seeds.length).toEqual(1);
+  expect(seeds[0].scopeType).toEqual("prefix");
+  expect(seeds[0].include).toEqual([/^https:\/\/example\.com\//]);
+  expect(seeds[0].exclude).toEqual([]);
+
+});
+
+test("custom scope", async () => {
+  const seeds = getSeeds(`
+seeds:
+   - url: https://example.com/
+     include: https://example.com/(path|other)
+     exclude: https://example.com/pathexclude
+`);
+
+
+  expect(seeds.length).toEqual(1);
+  expect(seeds[0].scopeType).toEqual("custom");
+  expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
+  expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
+});
+
+
+test("inherit scope", async () => {
+  const seeds = getSeeds(`
+
+seeds:
+   - url: https://example.com/1
+   - url: https://example.com/2
+
+include: https://example.com/(path|other)
+exclude: https://example.com/pathexclude
+`);
+
+
+  expect(seeds.length).toEqual(2);
+
+  expect(seeds[0].scopeType).toEqual("custom");
+  expect(seeds[0].url).toEqual("https://example.com/1");
+  expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
+  expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
+
+  expect(seeds[1].scopeType).toEqual("custom");
+  expect(seeds[1].url).toEqual("https://example.com/2");
+  expect(seeds[1].include).toEqual([/https:\/\/example.com\/(path|other)/]);
+  expect(seeds[1].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
+
+});
+
+
+test("override scope", async () => {
+  const seeds = getSeeds(`
+
+seeds:
+   - url: https://example.com/1
+     include: https://example.com/(path|other)
+
+   - https://example.com/2
+
+   - url: https://example.com/subpath/file.html
+     scopeType: prefix
+
+include: https://example.com/onlythispath
+`);
+
+  expect(seeds.length).toEqual(3);
+
+  expect(seeds[0].scopeType).toEqual("custom");
+  expect(seeds[0].url).toEqual("https://example.com/1");
+  expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
+  expect(seeds[0].exclude).toEqual([]);
+
+  expect(seeds[1].scopeType).toEqual("custom");
+  expect(seeds[1].url).toEqual("https://example.com/2");
+  expect(seeds[1].include).toEqual([/https:\/\/example.com\/onlythispath/]);
+  expect(seeds[1].exclude).toEqual([]);
+
+  expect(seeds[2].scopeType).toEqual("prefix");
+  expect(seeds[2].url).toEqual("https://example.com/subpath/file.html");
+  expect(seeds[2].include).toEqual([/^https:\/\/example\.com\/subpath\//]);
+  expect(seeds[2].exclude).toEqual([]);
+
+});
+
+
+test("override scope with exclude", async () => {
+  const seeds = getSeeds(`
+
+seeds:
+   - url: https://example.com/1
+     scopeType: page
+
+   - url: https://example.com/subpath/file.html
+     scopeType: prefix
+
+   - url: https://example.com/2
+     scopeType: any
+
+   - url: https://example.com/3
+     scopeType: none
+
+   - url: https://example.com/4
+     scopeType: none
+     exclude: ''
+
+exclude:
+  - /search\\?
+  - q\\?
+
+`);
+
+  expect(seeds.length).toEqual(5);
+  const excludeRxs = [/\/search\?/, /q\?/];
+
+  expect(seeds[0].scopeType).toEqual("page");
+  expect(seeds[0].url).toEqual("https://example.com/1");
+  expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\/1#.+/]);
+  expect(seeds[0].exclude).toEqual(excludeRxs);
+
+  expect(seeds[1].scopeType).toEqual("prefix");
+  expect(seeds[1].url).toEqual("https://example.com/subpath/file.html");
+  expect(seeds[1].include).toEqual([/^https:\/\/example\.com\/subpath\//]);
+  expect(seeds[1].exclude).toEqual(excludeRxs);
+
+  expect(seeds[2].scopeType).toEqual("any");
+  expect(seeds[2].url).toEqual("https://example.com/2");
+  expect(seeds[2].include).toEqual([/.*/]);
+  expect(seeds[2].exclude).toEqual(excludeRxs);
+
+  expect(seeds[3].scopeType).toEqual("none");
+  expect(seeds[3].url).toEqual("https://example.com/3");
+  expect(seeds[3].include).toEqual([]);
+  expect(seeds[3].exclude).toEqual(excludeRxs);
+
+  expect(seeds[4].scopeType).toEqual("none");
+  expect(seeds[4].url).toEqual("https://example.com/4");
+  expect(seeds[4].include).toEqual([]);
+  expect(seeds[4].exclude).toEqual([]);
+
+});
+
--- a/util/argParser.js
+++ b/util/argParser.js
@@ -71,17 +71,18 @@ class ArgParser {
        type: "number",
      },

-      "scope": {
+      "scopeType": {
+        describe: "Predefined for which URLs to crawl, can be: prefix, page, host, any, or custom, to use the scopeIncludeRx/scopeExcludeRx",
+        type: "string",
+      },
+
+      "scopeIncludeRx": {
+        alias: "include",
        describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
      },

-      "scopeType": {
-        describe: "Simplified scope for which URLs to crawl, can be: prefix, page, host, any",
-        type: "string",
-        default: "prefix",
-      },
-
-      "exclude": {
+      "scopeExcludeRx": {
+        alias: "exclude",
        describe: "Regex of page URLs that should be excluded from the crawl."
      },

@@ -169,6 +170,7 @@ class ArgParser {
      },

      "useSitemap": {
+        alias: "sitemap",
        describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
      },

@@ -300,18 +302,21 @@ class ArgParser {
      }
    }

+    if (argv.include || argv.exclude) {
+      if (argv.scopeType && argv.scopeType !== "custom") {
+        console.warn("You've specified a --scopeType and a --scopeIncludeRx or --scopeExcludeRx regex. The custom scope regex will take precedence, overriding the scopeType");
+        argv.scopeType = "custom";
+      }
+    }
+
    const scopeOpts = {
-      type: argv.scopeType,
-      sitemap: argv.useSitemap,
-      include: argv.scope,
+      scopeType: argv.scopeType,
+      sitemap: argv.sitemap,
+      include: argv.include,
      exclude: argv.exclude,
      depth: argv.depth,
    };

-    if (argv.scope && argv.scopeType) {
-      console.warn("You've specified a --scopeType and a --scope regex. The custom scope regex will take precedence, overriding the scopeType");
-    }
-
    argv.scopedSeeds = [];

    for (let seed of argv.seeds) {
--- a/util/screencaster.js
+++ b/util/screencaster.js
@@ -2,10 +2,11 @@ const ws = require("ws");
 const http = require("http");
 const url = require("url");
 const fs = require("fs");
+const path = require("path");

 const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;

-const indexHTML = fs.readFileSync("/app/screencast/index.html", {encoding: "utf8"});
+const indexHTML = fs.readFileSync(path.join(__dirname, "..", "screencast", "index.html"), {encoding: "utf8"});


 // ===========================================================================
--- a/util/seeds.js
+++ b/util/seeds.js
@@ -1,14 +1,20 @@
 class ScopedSeed
 {
-  constructor({url, type, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) {
+  constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) {
    const parsedUrl = this.parseUrl(url);
    this.url = parsedUrl.href;
-    this.type = type;
-    if (type) {
-      [include, allowHash] = this.scopeFromType(type, parsedUrl);
-    }
    this.include = this.parseRx(include);
    this.exclude = this.parseRx(exclude);
+
+    if (!scopeType) {
+      scopeType = (this.include.length || this.exclude.length) ? "custom" : "prefix";
+    }
+
+    this.scopeType = scopeType;
+
+    if (this.scopeType !== "custom") {
+      [this.include, allowHash] = this.scopeFromType(this.scopeType, parsedUrl);
+    }
    this.sitemap = this.resolveSiteMap(sitemap);
    this.allowHash = allowHash;
    this.maxDepth = depth < 0 ? 99999 : depth;
@@ -44,11 +50,11 @@ class ScopedSeed
    return sitemap;
  }

-  scopeFromType(type, parsedUrl) {
+  scopeFromType(scopeType, parsedUrl) {
    let include;
    let allowHash = false;

-    switch (type) {
+    switch (scopeType) {
    case "page":
      // allow scheme-agnostic URLS as likely redirects
      include = [new RegExp("^" + rxEscape(parsedUrl.href).replace(parsedUrl.protocol, "https?:") + "#.+")];
@@ -72,7 +78,7 @@ class ScopedSeed
      break;

    default:
-      throw new Error(`Invalid scope type "${type}" specified, valid types are: page, prefix, host`);
+      throw new Error(`Invalid scope type "${scopeType}" specified, valid types are: page, prefix, host`);
    }

    return [include, allowHash];