Support for per-URL conditional Block Rules (#68)

- Support block rules, specified in the YAML config, that exclude URLs matching a regex; a rule can also be negated by specifying `allowOnly` to instead allow URLs matching a given regex.
- Support conditional blocking of iframes based on the iframe's text content, specified via the `frameTextMatch` regex.
- Support restricting block rules based on the containing frame's URL, specified via the `inFrameURL` param.
- Add tests for various `blockRules` configurations.
- Fixes #15 ("Support URL-level WARC-writing inclusion/exclusion lists").
- Add an optional message to record when a URL is blocked, specified via `blockMessage`.
- Update README for `blockRules`.
- Bump pywb dependency to 2.5.0b4.
2025-10-19 14:33:17 +00:00 · 2021-07-19 15:49:43 -07:00 · 2021-07-19 15:49:43 -07:00 · 6dbdff9656
commit 6dbdff9656
parent 838e1fa1bd
8 changed files with 363 additions and 11 deletions
--- a/util/argParser.js
+++ b/util/argParser.js
@@ -90,6 +90,17 @@ class ArgParser {
        describe: "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
      },

+      "blockRules": {
+        describe: "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
+        type: "array",
+        default: [],
+      },
+
+      "blockMessage": {
+        describe: "If specified, when a URL is blocked, a record with this error message is added instead",
+        type: "string",
+      },
+
      "collection": {
        alias: "c",
        describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",