// browsertrix-crawler/tests/scopes.test.js

import { parseArgs } from "../util/argParser.js";
import fs from "fs";
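
// getSeeds() stubs fs.readFileSync so that parseArgs() reads the YAML string
// passed in here as if it were the "stdinconfig" file named on the command line.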
function getSeeds(config) {
  const orig = fs.readFileSync;

  fs.readFileSync = (name, ...args) => {
    if (name.endsWith("/stdinconfig")) {
      return config;
    }
    return orig(name, ...args);
  };

  const res = parseArgs(["node", "crawler", "--config", "stdinconfig"]);
  return res.parsed.scopedSeeds;
}
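
// The default scopeType is "prefix": include only URLs that start with the
// seed URL, with http and https treated interchangeably.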
test("default scope", async () => {
const seeds = getSeeds(`
seeds:
- https://example.com/
`);
expect(seeds.length).toEqual(1);
expect(seeds[0].scopeType).toEqual("prefix");
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
expect(seeds[0].exclude).toEqual([]);
});
test("default scope + exclude", async () => {
const seeds = getSeeds(`
seeds:
- https://example.com/
exclude: https://example.com/pathexclude
`);
expect(seeds.length).toEqual(1);
expect(seeds[0].scopeType).toEqual("prefix");
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});
test("prefix scope global + exclude", async () => {
const seeds = getSeeds(`
seeds:
- https://example.com/
scopeType: prefix
exclude: https://example.com/pathexclude
`);
expect(seeds.length).toEqual(1);
expect(seeds[0].scopeType).toEqual("prefix");
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});
test("prefix scope per seed + exclude", async () => {
const seeds = getSeeds(`
seeds:
- url: https://example.com/
scopeType: prefix

exclude: https://example.com/pathexclude
`);

  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("prefix");
  expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
  expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});
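
// "domain" scope matches the seed's domain plus any of its subdomains;
// "host" scope matches only the exact host.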
test("host scope and domain scope", async () => {
const seeds = getSeeds(`
seeds:
- url: https://example.com/
scopeType: domain
- url: https://example.org/
scopeType: host
`);
expect(seeds.length).toEqual(2);
expect(seeds[0].scopeType).toEqual("domain");
expect(seeds[0].include).toEqual([/^https?:\/\/([^/]+\.)*example\.com\//]);
expect(!!seeds[0].include[0].exec("https://example.com/")).toEqual(true);
expect(!!seeds[0].include[0].exec("https://example.com/path")).toEqual(true);
expect(!!seeds[0].include[0].exec("https://sub.example.com/path")).toEqual(true);
expect(!!seeds[0].include[0].exec("https://sub.domain.example.com/path")).toEqual(true);
expect(!!seeds[0].include[0].exec("https://notsub.domainexample.com/path")).toEqual(false);
expect(seeds[1].scopeType).toEqual("host");
expect(seeds[1].include).toEqual([/^https?:\/\/example\.org\//]);
expect(!!seeds[1].include[0].exec("https://example.org/")).toEqual(true);
expect(!!seeds[1].include[0].exec("https://example.org/path")).toEqual(true);
expect(!!seeds[1].include[0].exec("https://sub.example.com/path")).toEqual(false);
});
test("domain scope drop www.", async () => {
const seeds = getSeeds(`
seeds:
- url: https://www.example.com/
scopeType: domain
`);
expect(seeds.length).toEqual(1);
expect(seeds[0].scopeType).toEqual("domain");
expect(seeds[0].include).toEqual([/^https?:\/\/([^/]+\.)*example\.com\//]);
});
test("custom scope", async () => {
const seeds = getSeeds(`
seeds:
- url: https://example.com/
include: https?://example.com/(path|other)
exclude: https?://example.com/pathexclude
`);
expect(seeds.length).toEqual(1);
expect(seeds[0].scopeType).toEqual("custom");
expect(seeds[0].include).toEqual([/https?:\/\/example.com\/(path|other)/]);
expect(seeds[0].exclude).toEqual([/https?:\/\/example.com\/pathexclude/]);
});
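
// Top-level include/exclude apply to any seed that doesn't override them;
// supplying an include list switches that seed's scopeType to "custom".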
test("inherit scope", async () => {
const seeds = getSeeds(`
seeds:
- url: https://example.com/1
- url: https://example.com/2
include: https?://example.com/(path|other)
exclude: https://example.com/pathexclude
`);
expect(seeds.length).toEqual(2);
expect(seeds[0].scopeType).toEqual("custom");
expect(seeds[0].url).toEqual("https://example.com/1");
expect(seeds[0].include).toEqual([/https?:\/\/example.com\/(path|other)/]);
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
expect(seeds[1].scopeType).toEqual("custom");
expect(seeds[1].url).toEqual("https://example.com/2");
expect(seeds[1].include).toEqual([/https?:\/\/example.com\/(path|other)/]);
expect(seeds[1].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});
test("override scope", async () => {
const seeds = getSeeds(`
seeds:
- url: https://example.com/1
include: https://example.com/(path|other)
- https://example.com/2
- url: https://example.com/subpath/file.html
scopeType: prefix
include: https://example.com/onlythispath
`);
expect(seeds.length).toEqual(3);
expect(seeds[0].scopeType).toEqual("custom");
expect(seeds[0].url).toEqual("https://example.com/1");
expect(seeds[0].include).toEqual([/https:\/\/example.com\/(path|other)/]);
expect(seeds[0].exclude).toEqual([]);
expect(seeds[1].scopeType).toEqual("custom");
expect(seeds[1].url).toEqual("https://example.com/2");
expect(seeds[1].include).toEqual([/https:\/\/example.com\/onlythispath/]);
expect(seeds[1].exclude).toEqual([]);
expect(seeds[2].scopeType).toEqual("prefix");
expect(seeds[2].url).toEqual("https://example.com/subpath/file.html");
expect(seeds[2].include).toEqual([/^https?:\/\/example\.com\/subpath\//]);
expect(seeds[2].exclude).toEqual([]);
});
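
// A per-seed exclude (here the empty string on the last seed) overrides the
// top-level exclude list, leaving that seed with no exclusions.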
test("override scope with exclude", async () => {
const seeds = getSeeds(`
seeds:
- url: https://example.com/1
scopeType: page-spa
- url: https://example.com/subpath/file.html
scopeType: prefix
- url: https://example.com/2
scopeType: any
- url: https://example.com/3
scopeType: page
- url: https://example.com/4
scopeType: page
exclude: ''
exclude:
- /search\\?
- q\\?
`);
expect(seeds.length).toEqual(5);
const excludeRxs = [/\/search\?/, /q\?/];
expect(seeds[0].scopeType).toEqual("page-spa");
expect(seeds[0].url).toEqual("https://example.com/1");
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\/1#.+/]);
expect(seeds[0].exclude).toEqual(excludeRxs);
expect(seeds[1].scopeType).toEqual("prefix");
expect(seeds[1].url).toEqual("https://example.com/subpath/file.html");
expect(seeds[1].include).toEqual([/^https?:\/\/example\.com\/subpath\//]);
expect(seeds[1].exclude).toEqual(excludeRxs);
expect(seeds[2].scopeType).toEqual("any");
expect(seeds[2].url).toEqual("https://example.com/2");
expect(seeds[2].include).toEqual([/.*/]);
expect(seeds[2].exclude).toEqual(excludeRxs);
expect(seeds[3].scopeType).toEqual("page");
expect(seeds[3].url).toEqual("https://example.com/3");
expect(seeds[3].include).toEqual([]);
expect(seeds[3].exclude).toEqual(excludeRxs);
expect(seeds[4].scopeType).toEqual("page");
expect(seeds[4].url).toEqual("https://example.com/4");
expect(seeds[4].include).toEqual([]);
expect(seeds[4].exclude).toEqual([]);
});
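
// A minimal sketch of how the include/exclude regexes built above are
// typically applied (a hypothetical helper, not the crawler's actual API):
// a URL is in scope if it matches at least one include pattern and no
// exclude pattern. For "page" scope, include is empty, so only the seed
// URL itself would be crawled.
function inScope(seed, url) {
  return (
    seed.include.some((rx) => rx.test(url)) &&
    !seed.exclude.some((rx) => rx.test(url))
  );
}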