diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 7eb2b1bb..1a356611 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -18,7 +18,7 @@ jobs: with: node-version: ${{ matrix.node-version }} - name: install requirements - run: npm install + run: yarn install - name: run linter run: yarn run eslint . @@ -37,7 +37,7 @@ jobs: with: node-version: ${{ matrix.node-version }} - name: install requirements - run: npm install + run: yarn install - name: build docker run: docker-compose build - name: run crawl diff --git a/README.md b/README.md index 14a6f602..6e9580bd 100644 --- a/README.md +++ b/README.md @@ -73,13 +73,15 @@ Options: [number] [default: 0] --timeout Timeout for each page to load (in seconds) [number] [default: 90] - --scope Regex of page URLs that should be + --scopeType Predefined for which URLs to crawl, + can be: prefix, page, host, any, or + custom, to use the + scopeIncludeRx/scopeExcludeRx + [string] + --scopeIncludeRx, --include Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL) - --scopeType Simplified scope for which URLs to - crawl, can be: prefix, page, host, - any [string] [default: "prefix"] - --exclude Regex of page URLs that should be + --scopeExcludeRx, --exclude Regex of page URLs that should be excluded from the crawl. --allowHashUrls Allow Hashtag URLs, useful for single-page-application crawling or @@ -88,7 +90,7 @@ Options: -c, --collection Collection name to crawl to (replay will be accessible under this name in pywb preview) - [string] [default: "capture-2021-06-26T19-38-10"] + [string] [default: "capture-2021-07-01T23-43-26"] --headless Run in headless mode, otherwise start xvfb[boolean] [default: false] --driver JS driver for the crawler @@ -122,7 +124,7 @@ Options: --userAgentSuffix Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com) [string] - --useSitemap If enabled, check for sitemaps at + --useSitemap, --sitemap If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified --statsFilename If set, output stats as JSON to this diff --git a/docker-compose.yml b/docker-compose.yml index 28a28538..a0590b05 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,7 +2,7 @@ version: '3.5' services: crawler: - image: webrecorder/browsertrix-crawler:0.4.0-beta.2 + image: webrecorder/browsertrix-crawler:0.4.0-beta.3 build: context: ./ diff --git a/package.json b/package.json index d735b921..5438b4d6 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "browsertrix-crawler", - "version": "0.4.0-beta.2", + "version": "0.4.0-beta.3", "main": "browsertrix-crawler", "repository": "https://github.com/webrecorder/browsertrix-crawler", "author": "Ilya Kreymer , Webrecorder Software", diff --git a/tests/config_stdin.test.js b/tests/config_stdin.test.js index 59bc652d..c286597a 100644 --- a/tests/config_stdin.test.js +++ b/tests/config_stdin.test.js @@ -10,7 +10,7 @@ test("pass config file via stdin", async () => { try { const version = require("../package.json").version; - const proc = child_process.execSync(`docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler:${version} crawl --config stdin --exclude webrecorder.net/202`, {input: configYaml, stdin: "inherit", encoding: "utf8"}); + const proc = child_process.execSync(`docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler:${version} crawl --config stdin --scopeExcludeRx webrecorder.net/202`, {input: configYaml, stdin: "inherit", encoding: "utf8"}); console.log(proc); } diff --git a/tests/scopes.test.js b/tests/scopes.test.js new file mode 100644 index 00000000..266fc142 --- /dev/null +++ b/tests/scopes.test.js @@ -0,0 +1,166 @@ +const { parseArgs } = require("../util/argParser"); + +const fs = require("fs"); + +function getSeeds(config) { + const orig = fs.readFileSync; + + fs.readFileSync = (name, ...args) => { + if (name.endsWith("/configtest")) { + return config; + } + return orig(name, ...args); + }; + + return parseArgs(null, ["node", "crawler", "--config", "configtest"]).scopedSeeds; +} + +test("default scope", async () => { + const seeds = getSeeds(` +seeds: + - https://example.com/ + +`); + + + expect(seeds.length).toEqual(1); + expect(seeds[0].scopeType).toEqual("prefix"); + expect(seeds[0].include).toEqual([/^https:\/\/example\.com\//]); + expect(seeds[0].exclude).toEqual([]); + +}); + +test("custom scope", async () => { + const seeds = getSeeds(` +seeds: + - url: https://example.com/ + include: https://example.com/(path|other) + exclude: https://example.com/pathexclude +`); + + + expect(seeds.length).toEqual(1); + expect(seeds[0].scopeType).toEqual("custom"); + expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]); + expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]); +}); + + +test("inherit scope", async () => { + const seeds = getSeeds(` + +seeds: + - url: https://example.com/1 + - url: https://example.com/2 + +include: https://example.com/(path|other) +exclude: https://example.com/pathexclude +`); + + + expect(seeds.length).toEqual(2); + + expect(seeds[0].scopeType).toEqual("custom"); + expect(seeds[0].url).toEqual("https://example.com/1"); + expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]); + expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]); + + expect(seeds[1].scopeType).toEqual("custom"); + expect(seeds[1].url).toEqual("https://example.com/2"); + expect(seeds[1].include).toEqual([/https:\/\/example.com\/(path|other)/]); + expect(seeds[1].exclude).toEqual([/https:\/\/example.com\/pathexclude/]); + +}); + + +test("override scope", async () => { + const seeds = getSeeds(` + +seeds: + - url: https://example.com/1 + include: https://example.com/(path|other) + + - https://example.com/2 + + - url: https://example.com/subpath/file.html + scopeType: prefix + +include: https://example.com/onlythispath +`); + + expect(seeds.length).toEqual(3); + + expect(seeds[0].scopeType).toEqual("custom"); + expect(seeds[0].url).toEqual("https://example.com/1"); + expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]); + expect(seeds[0].exclude).toEqual([]); + + expect(seeds[1].scopeType).toEqual("custom"); + expect(seeds[1].url).toEqual("https://example.com/2"); + expect(seeds[1].include).toEqual([/https:\/\/example.com\/onlythispath/]); + expect(seeds[1].exclude).toEqual([]); + + expect(seeds[2].scopeType).toEqual("prefix"); + expect(seeds[2].url).toEqual("https://example.com/subpath/file.html"); + expect(seeds[2].include).toEqual([/^https:\/\/example\.com\/subpath\//]); + expect(seeds[2].exclude).toEqual([]); + +}); + + +test("override scope with exclude", async () => { + const seeds = getSeeds(` + +seeds: + - url: https://example.com/1 + scopeType: page + + - url: https://example.com/subpath/file.html + scopeType: prefix + + - url: https://example.com/2 + scopeType: any + + - url: https://example.com/3 + scopeType: none + + - url: https://example.com/4 + scopeType: none + exclude: '' + +exclude: + - /search\\? + - q\\? + +`); + + expect(seeds.length).toEqual(5); + const excludeRxs = [/\/search\?/, /q\?/]; + + expect(seeds[0].scopeType).toEqual("page"); + expect(seeds[0].url).toEqual("https://example.com/1"); + expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\/1#.+/]); + expect(seeds[0].exclude).toEqual(excludeRxs); + + expect(seeds[1].scopeType).toEqual("prefix"); + expect(seeds[1].url).toEqual("https://example.com/subpath/file.html"); + expect(seeds[1].include).toEqual([/^https:\/\/example\.com\/subpath\//]); + expect(seeds[1].exclude).toEqual(excludeRxs); + + expect(seeds[2].scopeType).toEqual("any"); + expect(seeds[2].url).toEqual("https://example.com/2"); + expect(seeds[2].include).toEqual([/.*/]); + expect(seeds[2].exclude).toEqual(excludeRxs); + + expect(seeds[3].scopeType).toEqual("none"); + expect(seeds[3].url).toEqual("https://example.com/3"); + expect(seeds[3].include).toEqual([]); + expect(seeds[3].exclude).toEqual(excludeRxs); + + expect(seeds[4].scopeType).toEqual("none"); + expect(seeds[4].url).toEqual("https://example.com/4"); + expect(seeds[4].include).toEqual([]); + expect(seeds[4].exclude).toEqual([]); + +}); + diff --git a/util/argParser.js b/util/argParser.js index f292c89f..b42c9afe 100644 --- a/util/argParser.js +++ b/util/argParser.js @@ -71,17 +71,18 @@ class ArgParser { type: "number", }, - "scope": { + "scopeType": { + describe: "Predefined for which URLs to crawl, can be: prefix, page, host, any, or custom, to use the scopeIncludeRx/scopeExcludeRx", + type: "string", + }, + + "scopeIncludeRx": { + alias: "include", describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)", }, - "scopeType": { - describe: "Simplified scope for which URLs to crawl, can be: prefix, page, host, any", - type: "string", - default: "prefix", - }, - - "exclude": { + "scopeExcludeRx": { + alias: "exclude", describe: "Regex of page URLs that should be excluded from the crawl." }, @@ -169,6 +170,7 @@ class ArgParser { }, "useSitemap": { + alias: "sitemap", describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified", }, @@ -300,18 +302,21 @@ class ArgParser { } } + if (argv.include || argv.exclude) { + if (argv.scopeType && argv.scopeType !== "custom") { + console.warn("You've specified a --scopeType and a --scopeIncludeRx or --scopeExcludeRx regex. The custom scope regex will take precedence, overriding the scopeType"); + argv.scopeType = "custom"; + } + } + const scopeOpts = { - type: argv.scopeType, - sitemap: argv.useSitemap, - include: argv.scope, + scopeType: argv.scopeType, + sitemap: argv.sitemap, + include: argv.include, exclude: argv.exclude, depth: argv.depth, }; - if (argv.scope && argv.scopeType) { - console.warn("You've specified a --scopeType and a --scope regex. The custom scope regex will take precedence, overriding the scopeType"); - } - argv.scopedSeeds = []; for (let seed of argv.seeds) { diff --git a/util/screencaster.js b/util/screencaster.js index 625afd89..9fc93891 100644 --- a/util/screencaster.js +++ b/util/screencaster.js @@ -2,10 +2,11 @@ const ws = require("ws"); const http = require("http"); const url = require("url"); const fs = require("fs"); +const path = require("path"); const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default; -const indexHTML = fs.readFileSync("/app/screencast/index.html", {encoding: "utf8"}); +const indexHTML = fs.readFileSync(path.join(__dirname, "..", "screencast", "index.html"), {encoding: "utf8"}); // =========================================================================== diff --git a/util/seeds.js b/util/seeds.js index 89f64c8e..6acc6adf 100644 --- a/util/seeds.js +++ b/util/seeds.js @@ -1,14 +1,20 @@ class ScopedSeed { - constructor({url, type, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) { + constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false} = {}) { const parsedUrl = this.parseUrl(url); this.url = parsedUrl.href; - this.type = type; - if (type) { - [include, allowHash] = this.scopeFromType(type, parsedUrl); - } this.include = this.parseRx(include); this.exclude = this.parseRx(exclude); + + if (!scopeType) { + scopeType = (this.include.length || this.exclude.length) ? "custom" : "prefix"; + } + + this.scopeType = scopeType; + + if (this.scopeType !== "custom") { + [this.include, allowHash] = this.scopeFromType(this.scopeType, parsedUrl); + } this.sitemap = this.resolveSiteMap(sitemap); this.allowHash = allowHash; this.maxDepth = depth < 0 ? 99999 : depth; @@ -44,11 +50,11 @@ class ScopedSeed return sitemap; } - scopeFromType(type, parsedUrl) { + scopeFromType(scopeType, parsedUrl) { let include; let allowHash = false; - switch (type) { + switch (scopeType) { case "page": // allow scheme-agnostic URLS as likely redirects include = [new RegExp("^" + rxEscape(parsedUrl.href).replace(parsedUrl.protocol, "https?:") + "#.+")]; @@ -72,7 +78,7 @@ class ScopedSeed break; default: - throw new Error(`Invalid scope type "${type}" specified, valid types are: page, prefix, host`); + throw new Error(`Invalid scope type "${scopeType}" specified, valid types are: page, prefix, host`); } return [include, allowHash];