mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Support for per-URL conditional Block Rules (#68)
- Support for block rules specified in YAML config to exclude URLs based on regex, and also negate a rule by specifying `allowOnly` to allow URLs based on certain regex. - Support for conditional blocking for iframes, based on content of iframe text, specified via frameTextMatch regex. - Support for restricting block rules based on containing frame URL, specified via inFrameURL param. - Testing for various blockRules configurations - Fixes Support URL-level WARC-writing inclusion/exclusion lists #15 - optional message to add when a URL is blocked, specified via 'blockMessage' - update README for blockRules - bump to pywb dependency 2.5.0b4
This commit is contained in:
parent
838e1fa1bd
commit
6dbdff9656
8 changed files with 363 additions and 11 deletions
|
@ -90,6 +90,17 @@ class ArgParser {
|
|||
describe: "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
|
||||
},
|
||||
|
||||
"blockRules": {
|
||||
describe: "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
|
||||
type: "array",
|
||||
default: [],
|
||||
},
|
||||
|
||||
"blockMessage": {
|
||||
describe: "If specified, when a URL is blocked, a record with this error message is added instead",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"collection": {
|
||||
alias: "c",
|
||||
describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue