browsertrix-crawler/src/util/seeds.ts


import { logger } from "./logger.js";
import { MAX_DEPTH } from "./constants.js";
type ScopeType =
| "prefix"
| "host"
| "domain"
| "page"
| "page-spa"
| "any"
| "custom";
export class ScopedSeed {
url: string;
scopeType: ScopeType;
include: RegExp[];
exclude: RegExp[] = [];
allowHash = false;
depth = -1;
sitemap?: string | null;
extraHops = 0;
maxExtraHops = 0;
maxDepth = 0;
constructor({
url,
scopeType,
include,
exclude = [],
allowHash = false,
depth = -1,
sitemap = false,
extraHops = 0,
}: {
url: string;
scopeType: ScopeType;
include: string[];
exclude?: string[];
allowHash?: boolean;
depth?: number;
sitemap?: string | boolean | null;
extraHops?: number;
}) {
const parsedUrl = this.parseUrl(url);
if (!parsedUrl) {
throw new Error("Invalid URL");
}
this.url = parsedUrl.href;
this.include = this.parseRx(include);
this.exclude = this.parseRx(exclude);
this.scopeType = scopeType;
if (!this.scopeType) {
this.scopeType = this.include.length ? "custom" : "prefix";
}
if (this.scopeType !== "custom") {
const [includeNew, allowHashNew] = this.scopeFromType(
this.scopeType,
parsedUrl,
);
this.include = [...includeNew, ...this.include];
allowHash = allowHashNew;
}
// for page scope, the depth is set to extraHops, as no other
// crawling is done
if (this.scopeType === "page") {
depth = extraHops;
}
this.sitemap = this.resolveSiteMap(sitemap);
this.allowHash = allowHash;
this.maxExtraHops = extraHops;
this.maxDepth = depth < 0 ? MAX_DEPTH : depth;
}
// Normalize a string, string[], or RegExp[] (or an empty value) into RegExp[].
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
parseRx(value: any) {
if (value === null || value === undefined || value === "") {
return [];
} else if (!(value instanceof Array)) {
return [new RegExp(value)];
} else {
return value.map((e) => (e instanceof RegExp ? e : new RegExp(e)));
}
}
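// Add an exclusion regex (strings are compiled to RegExp); falsy values are ignored.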
addExclusion(value: string | RegExp) {
if (!value) {
return;
}
if (!(value instanceof RegExp)) {
value = new RegExp(value);
}
this.exclude.push(value);
}
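// Remove the first exclusion whose string form matches the given value;
// returns true if one was removed.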
removeExclusion(value: string) {
for (let i = 0; i < this.exclude.length; i++) {
if (this.exclude[i].toString() === value.toString()) {
this.exclude.splice(i, 1);
return true;
}
}
return false;
}
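// Parse and validate a URL, logging a warning and returning null if it is
// malformed or not http(s).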
parseUrl(url: string, logDetails = {}) {
let parsedUrl = null;
try {
parsedUrl = new URL(url.trim());
} catch (e) {
logger.warn("Invalid Page - not a valid URL", { url, ...logDetails });
return null;
}
if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
logger.warn("Invalid Page - URL must start with http:// or https://", {
url,
...logDetails,
});
parsedUrl = null;
}
return parsedUrl;
}
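// Resolve the sitemap option: `true` uses /sitemap.xml on the seed URL,
// a string is resolved as a URL relative to the seed, anything else yields null.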
resolveSiteMap(sitemap: boolean | string | null): string | null {
if (sitemap === true) {
const url = new URL(this.url);
url.pathname = "/sitemap.xml";
return url.href;
} else if (typeof sitemap === "string") {
const url = new URL(sitemap, this.url);
return url.href;
}
return null;
}
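// Build the default include regexes (and whether URL hashes are preserved)
// for a built-in scope type. Roughly, for a seed of
// https://example.com/docs/intro.html (regex escaping omitted for readability):
//   page     -> no include regex (only the seed page itself)
//   page-spa -> ^https?://example.com/docs/intro.html#.+  (hashes allowed)
//   prefix   -> ^https?://example.com/docs/
//   host     -> ^https?://example.com/
//   domain   -> ^https?://([^/]+.)*example.com/  (also strips a leading "www.")
//   any      -> .*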
scopeFromType(scopeType: ScopeType, parsedUrl: URL): [RegExp[], boolean] {
let include: RegExp[] = [];
let allowHash = false;
switch (scopeType) {
case "page":
include = [];
break;
case "page-spa":
// allow scheme-agnostic URLs as likely redirects
include = [
new RegExp("^" + urlRxEscape(parsedUrl.href, parsedUrl) + "#.+"),
];
allowHash = true;
break;
case "prefix":
include = [
new RegExp(
"^" +
urlRxEscape(
parsedUrl.origin +
parsedUrl.pathname.slice(
0,
parsedUrl.pathname.lastIndexOf("/") + 1,
),
parsedUrl,
),
),
];
break;
case "host":
include = [
new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl)),
];
break;
case "domain":
if (parsedUrl.hostname.startsWith("www.")) {
parsedUrl.hostname = parsedUrl.hostname.replace("www.", "");
}
include = [
new RegExp(
"^" +
urlRxEscape(parsedUrl.origin + "/", parsedUrl).replace(
"\\/\\/",
"\\/\\/([^/]+\\.)*",
),
),
];
break;
case "any":
include = [/.*/];
break;
default:
logger.fatal(
`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`,
);
}
return [include, allowHash];
}
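// True once the given depth has reached the seed's maximum depth.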
isAtMaxDepth(depth: number) {
return depth >= this.maxDepth;
}
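// Check whether a URL is in scope for this seed. Returns false if the URL is
// past maxDepth, invalid, excluded, or out of scope with no extra hops left;
// otherwise returns { url, isOOS }, where isOOS marks URLs outside the include
// scope that are still allowed via extraHops.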
isIncluded(url: string, depth: number, extraHops = 0, logDetails = {}) {
if (depth > this.maxDepth) {
return false;
}
const urlParsed = this.parseUrl(url, logDetails);
if (!urlParsed) {
return false;
}
if (!this.allowHash) {
// remove hashtag
urlParsed.hash = "";
}
url = urlParsed.href;
if (url === this.url) {
return true;
}
// skip already crawled
// if (this.seenList.has(url)) {
// return false;
//}
let inScope = false;
// check scopes
for (const s of this.include) {
if (s.test(url)) {
inScope = true;
break;
}
}
let isOOS = false;
if (!inScope) {
if (this.maxExtraHops && extraHops <= this.maxExtraHops) {
isOOS = true;
} else {
//console.log(`Not in scope ${url} ${this.include}`);
return false;
}
}
// check exclusions
for (const e of this.exclude) {
if (e.test(url)) {
//console.log(`Skipping ${url} excluded by ${e}`);
return false;
}
}
return { url, isOOS };
}
}
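// Escape regex metacharacters in a string so it can be used literally in a RegExp.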
export function rxEscape(string: string) {
return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
}
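// Escape a URL for use in a RegExp and make its scheme match either http or https.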
export function urlRxEscape(url: string, parsedUrl: URL) {
return rxEscape(url).replace(parsedUrl.protocol, "https?:");
}