const puppeteer = require("puppeteer-core");
const { Cluster } = require("puppeteer-cluster");
const child_process = require("child_process");
const fetch = require("node-fetch");
const AbortController = require("abort-controller");
const path = require("path");
const fs = require("fs");
const Sitemapper = require("sitemapper");
const { v4: uuidv4 } = require("uuid");

const TextExtract = require("./behaviors/global/textextract");
const BackgroundBehaviors = require("./behaviors/bgbehaviors");

const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];

const CHROME_PATH = "google-chrome";

// to ignore HTTPS error for HEAD check
const HTTPS_AGENT = require("https").Agent({
  rejectUnauthorized: false,
});

const HTTP_AGENT = require("http").Agent();


// ============================================================================
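// Crawler drives a puppeteer-cluster crawl through the local pywb recording
// proxy, queueing in-scope links and writing page metadata to pages.jsonl.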
class Crawler {
  constructor() {
    this.headers = {};

    this.seenList = new Set();

    this.emulateDevice = null;

    // links crawled counter
    this.numLinks = 0;

    // was the limit hit?
    this.limitHit = false;

    this.monitor = true;

    this.userAgent = "";

    const params = require("yargs")
      .usage("browsertrix-crawler [options]")
      .option(this.cliOpts)
      .check((argv) => this.validateArgs(argv)).argv;

    console.log("Exclusion Regexes: ", params.exclude);
    console.log("Scope Regexes: ", params.scope);

    this.params = params;
    this.capturePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record/id_/`;

    // root collections dir
    this.collDir = path.join(this.params.cwd, "collections", this.params.collection);

    // pages directory
    this.pagesDir = path.join(this.collDir, "pages");

    // pages file
    this.pagesFile = path.join(this.pagesDir, "pages.jsonl");

    // background behaviors
    this.bgbehaviors = new BackgroundBehaviors(this.params.bgbehaviors || []);
  }
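
  // resolve the browser user-agent: an explicit --userAgent wins, then the
  // emulated device UA, then a UA built from the installed Chrome version;
  // --userAgentSuffix is appended to whichever is chosen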
  configureUA() {
    // override userAgent
    if (this.params.userAgent) {
      if (this.emulateDevice) {
        this.emulateDevice.userAgent = this.params.userAgent;
      }

      this.userAgent = this.params.userAgent;
      return;
    }

    // if device set, it overrides the default Chrome UA
    if (this.emulateDevice) {
      this.userAgent = this.emulateDevice.userAgent;
    } else {
      let version = process.env.BROWSER_VERSION;

      try {
        version = child_process.execFileSync("google-chrome", ["--product-version"], {encoding: "utf8"}).trim();
      } catch(e) {
        console.log(e);
      }

      this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
    }

    // suffix to append to default userAgent
    if (this.params.userAgentSuffix) {
      this.userAgent += " " + this.params.userAgentSuffix;

      if (this.emulateDevice) {
        this.emulateDevice.userAgent += " " + this.params.userAgentSuffix;
      }
    }
  }
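
  // start support services: redis, the pywb collection + uwsgi recording
  // proxy, and Xvfb when not running headless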
  bootstrap() {
    const opts = {stdio: "ignore", cwd: this.params.cwd};

    this.configureUA();

    this.headers = {"User-Agent": this.userAgent};

    child_process.spawn("redis-server", {...opts, cwd: "/tmp/"});

    child_process.spawnSync("wb-manager", ["init", this.params.collection], opts);

    opts.env = {...process.env, COLL: this.params.collection};

    child_process.spawn("uwsgi", [path.join(__dirname, "uwsgi.ini")], opts);

    if (!this.params.headless) {
      child_process.spawn("Xvfb", [
        process.env.DISPLAY,
        "-listen",
        "tcp",
        "-screen",
        "0",
        process.env.GEOMETRY,
        "-ac",
        "+extension",
        "RANDR"
      ]);
    }
  }
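
  // yargs option definitions for the crawler CLI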
  get cliOpts() {
    return {
      "url": {
        alias: "u",
        describe: "The URL to start crawling from",
        type: "string",
        demandOption: true,
      },

      "workers": {
        alias: "w",
        describe: "The number of workers to run in parallel",
        default: 1,
        type: "number",
      },

      "newContext": {
        describe: "The context for each new capture; can be a new page, session or browser",
        default: "page",
        type: "string"
      },

      "waitUntil": {
        describe: "Puppeteer page.goto() condition to wait for before continuing; can be multiple values separated by ','",
        default: "load,networkidle0",
      },

      "limit": {
        describe: "Limit crawl to this number of pages",
        default: 0,
        type: "number",
      },

      "timeout": {
        describe: "Timeout for each page to load (in seconds)",
        default: 90,
        type: "number",
      },

      "scope": {
        describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
      },

      "exclude": {
        describe: "Regex of page URLs that should be excluded from the crawl",
      },

      "scroll": {
        describe: "If set, will autoscroll to bottom of the page",
        type: "boolean",
        default: false,
      },

      "collection": {
        alias: "c",
        describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
        type: "string",
        default: `capture-${new Date().toISOString().slice(0, 18)}`.replace(/:/g, "-")
      },

      "headless": {
        describe: "Run in headless mode, otherwise start xvfb",
        type: "boolean",
        default: false,
      },

      "driver": {
        describe: "JS driver for the crawler",
        type: "string",
        default: path.join(__dirname, "defaultDriver.js"),
      },

      "generateCDX": {
        alias: ["generatecdx", "generateCdx"],
        describe: "If set, generate index (CDXJ) for use with pywb after crawl is done",
        type: "boolean",
        default: false,
      },

      "generateWACZ": {
        alias: ["generatewacz", "generateWacz"],
        describe: "If set, generate a WACZ file when the crawl is done",
        type: "boolean",
        default: false,
      },

      "text": {
        describe: "If set, extract text to the pages.jsonl file",
        type: "boolean",
        default: false,
      },

      "cwd": {
        describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
        type: "string",
        default: process.cwd(),
      },

      "mobileDevice": {
        describe: "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
        type: "string",
      },

      "userAgent": {
        describe: "Override user-agent with specified string",
        type: "string",
      },

      "userAgentSuffix": {
        describe: "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
        type: "string",
      },

      "useSitemap": {
        describe: "If enabled, check for a sitemap at /sitemap.xml, or at a custom URL if one is specified",
      },

      "statsFilename": {
        describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
      },

      "bgbehaviors": {
        describe: "Which background behaviors to enable on each page",
        default: "auto-play,auto-fetch",
        type: "string",
      },
    };
  }
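
  // parse and validate the seed URL, returning its normalized href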
  validateUserUrl(url) {
    url = new URL(url);

    if (url.protocol !== "http:" && url.protocol !== "https:") {
      throw new Error("URL must start with http:// or https://");
    }

    return url.href;
  }
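
  // yargs check() callback: normalizes and validates all parsed options in place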
  validateArgs(argv) {
    if (argv.url) {
      // Scope for crawl, default to the domain of the URL
      // ensure valid url is used (adds trailing slash if missing)
      //argv.seeds = [Crawler.validateUserUrl(argv.url)];
      argv.url = this.validateUserUrl(argv.url);
    }

    if (!argv.scope) {
      //argv.scope = url.href.slice(0, url.href.lastIndexOf("/") + 1);
      argv.scope = [new RegExp("^" + this.rxEscape(argv.url.slice(0, argv.url.lastIndexOf("/") + 1)))];
    }

    argv.timeout *= 1000;

    // waitUntil condition must be: load, domcontentloaded, networkidle0, networkidle2
    // can be multiple values separated by comma
    // (see: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options)
    argv.waitUntil = argv.waitUntil.split(",");

    for (const opt of argv.waitUntil) {
      if (!WAIT_UNTIL_OPTS.includes(opt)) {
        throw new Error("Invalid waitUntil option, must be one of: " + WAIT_UNTIL_OPTS.join(","));
      }
    }

    // background behaviors to apply
    argv.bgbehaviors = argv.bgbehaviors.split(",");

    if (!argv.newContext) {
      argv.newContext = "page";
    }

    switch (argv.newContext) {
    case "page":
      argv.newContext = Cluster.CONCURRENCY_PAGE;
      break;

    case "session":
      argv.newContext = Cluster.CONCURRENCY_CONTEXT;
      break;

    case "browser":
      argv.newContext = Cluster.CONCURRENCY_BROWSER;
      break;

    default:
      throw new Error("Invalid newContext, must be one of: page, session, browser");
    }

    if (argv.mobileDevice) {
      this.emulateDevice = puppeteer.devices[argv.mobileDevice];
      if (!this.emulateDevice) {
        throw new Error("Unknown device: " + argv.mobileDevice);
      }
    }

    if (argv.useSitemap === true) {
      const url = new URL(argv.url);
      url.pathname = "/sitemap.xml";
      argv.useSitemap = url.href;
    }

    // Support one or multiple excludes
    if (argv.exclude) {
      if (typeof(argv.exclude) === "string") {
        argv.exclude = [new RegExp(argv.exclude)];
      } else {
        argv.exclude = argv.exclude.map(e => new RegExp(e));
      }
    } else {
      argv.exclude = [];
    }

    // Support one or multiple scopes
    if (argv.scope) {
      if (typeof(argv.scope) === "string") {
        argv.scope = [new RegExp(argv.scope)];
      } else {
        argv.scope = argv.scope.map(e => new RegExp(e));
      }
    } else {
      argv.scope = [];
    }

    // Resolve statsFilename relative to the crawl working directory
    if (argv.statsFilename) {
      argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
    }

    return true;
  }

  get chromeArgs() {
    // Chrome Flags, including proxy server
    return [
      "--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
      `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
      "--no-sandbox",
      "--disable-background-media-suspend",
      "--autoplay-policy=no-user-gesture-required",
      "--disable-features=IsolateOrigins,site-per-process",
    ];
  }

  get puppeteerArgs() {
    // Puppeteer options
    return {
      headless: this.params.headless,
      executablePath: CHROME_PATH,
      ignoreHTTPSErrors: true,
      args: this.chromeArgs
    };
  }
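
  // entrypoint: start services, run the crawl, and exit with a status code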
  async run() {
    this.bootstrap();

    try {
      await this.crawl();
      process.exit(0);
    } catch(e) {
      console.error("Crawl failed");
      console.error(e);
      process.exit(1);
    }
  }
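
  // per-page task run by the cluster: emulate the device if set, run the
  // driver and background behaviors, then record the page entry and stats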
  async crawlPage({page, data}) {
    try {
      if (this.emulateDevice) {
        await page.emulate(this.emulateDevice);
      }

      const bgbehavior = await this.bgbehaviors.setup(page, this);

      // run custom driver here
      await this.driver({page, data, crawler: this});

      const title = await page.title();

      let text = "";

      if (this.params.text) {
        const client = await page.target().createCDPSession();
        const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
        text = await new TextExtract(result).parseTextFromDom();
      }

      this.writePage(data.url, title, this.params.text, text);

      if (bgbehavior) {
        await bgbehavior();
      }

      this.writeStats();

    } catch (e) {
      console.warn(e);
    }
  }
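
  // main crawl: load the driver, launch the cluster, queue the seed (and
  // sitemap URLs if enabled), then wait for completion and post-process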
  async crawl() {
    try {
      this.driver = require(this.params.driver);
    } catch(e) {
      console.log(e);
      return;
    }

    // Puppeteer Cluster init and options
    this.cluster = await Cluster.launch({
      concurrency: this.params.newContext,
      maxConcurrency: this.params.workers,
      skipDuplicateUrls: true,
      timeout: this.params.timeout * 2,
      puppeteerOptions: this.puppeteerArgs,
      puppeteer,
      monitor: this.monitor
    });

    this.cluster.task((opts) => this.crawlPage(opts));

    this.initPages();

    this.queueUrl(this.params.url);

    if (this.params.useSitemap) {
      await this.parseSitemap(this.params.useSitemap);
    }

    await this.cluster.idle();
    await this.cluster.close();

    this.writeStats();

    // extra wait for all resources to land into WARCs
    console.log("Waiting 5s to ensure WARCs are finished");
    await this.sleep(5000);

    if (this.params.generateCDX) {
      console.log("Generating CDX");

      child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd});
    }

    // aliases are declared in cliOpts, so yargs populates all casings of generateWACZ
    if (this.params.generateWACZ) {
      console.log("Generating WACZ");

      const archiveDir = path.join(this.collDir, "archive");

      // Get a list of the WARCs inside
      const warcFileList = fs.readdirSync(archiveDir);

      // Build the argument list to pass to the wacz create command
      const waczFilename = this.params.collection.concat(".wacz");
      const waczPath = path.join(this.collDir, waczFilename);
      const argument_list = ["create", "-o", waczPath, "--pages", this.pagesFile, "-f"];
      warcFileList.forEach((val) => argument_list.push(path.join(archiveDir, val)));

      // Run the wacz create command
      child_process.spawnSync("wacz", argument_list);
      console.log(`WACZ successfully generated and saved to: ${waczPath}`);
    }
  }
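
  // if --statsFilename is set, write current crawl progress as JSON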
  writeStats() {
    if (this.params.statsFilename) {
      const total = this.cluster.allTargetCount;
      const workersRunning = this.cluster.workersBusy.length;
      const numCrawled = total - this.cluster.jobQueue.size() - workersRunning;
      const limit = {max: this.params.limit || 0, hit: this.limitHit};
      const stats = {numCrawled, workersRunning, total, limit};

      try {
        fs.writeFileSync(this.params.statsFilename, JSON.stringify(stats, null, 2));
      } catch (err) {
        console.warn("Stats output failed", err);
      }
    }
  }
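
  // extract link hrefs from the page and queue any that are in scope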
  async extractLinks(page, selector = "a[href]") {
    let results = null;

    try {
      results = await page.evaluate((selector) => {
        /* eslint-disable-next-line no-undef */
        return [...document.querySelectorAll(selector)].map(elem => elem.href);
      }, selector);
    } catch (e) {
      console.warn("Link Extraction failed", e);
      return;
    }

    this.queueUrls(results);
  }

  queueUrls(urls) {
    try {
      for (const url of urls) {
        const captureUrl = this.shouldCrawl(url);
        if (captureUrl) {
          if (!this.queueUrl(captureUrl)) {
            break;
          }
        }
      }
    } catch (e) {
      console.log("Queuing Error: ", e);
    }
  }
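
  // mark url as seen and queue it; returns false once the page limit is hit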
  queueUrl(url) {
    this.seenList.add(url);
    if (this.numLinks >= this.params.limit && this.params.limit > 0) {
      this.limitHit = true;
      return false;
    }
    this.numLinks++;
    this.cluster.queue({url});
    return true;
  }
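
  // the pages.jsonl header is "json-pages-1.0"; hasText records whether
  // entries will carry full page text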
  initPages() {
    try {
      // create pages dir if it doesn't exist and write pages.jsonl header
      if (!fs.existsSync(this.pagesDir)) {
        fs.mkdirSync(this.pagesDir);
        const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};

        if (this.params.text) {
          console.log("creating pages with full text");
          header["hasText"] = true;
        } else {
          console.log("creating pages without full text");
          header["hasText"] = false;
        }

        const header_formatted = JSON.stringify(header).concat("\n");
        fs.writeFileSync(this.pagesFile, header_formatted);
      }
    } catch(err) {
      console.log("pages/pages.jsonl creation failed", err);
    }
  }
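
  // append a single page entry {id, url, title[, text]} to pages.jsonl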
  writePage(url, title, text, text_content) {
    const id = uuidv4();
    const row = {"id": id, "url": url, "title": title};

    if (text) {
      row["text"] = text_content;
    }

    const processedRow = JSON.stringify(row).concat("\n");
    try {
      fs.appendFileSync(this.pagesFile, processedRow);
    } catch (err) {
      console.warn("pages/pages.jsonl append failed", err);
    }
  }
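
  // decide whether a discovered URL should be queued: returns the normalized
  // URL if it passes protocol, dedup, scope and exclusion checks, else false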
  shouldCrawl(url) {
    try {
      url = new URL(url);
    } catch(e) {
      return false;
    }

    // remove hashtag
    url.hash = "";

    // only queue http/https URLs
    if (url.protocol !== "http:" && url.protocol !== "https:") {
      return false;
    }

    url = url.href;

    // skip already crawled
    if (this.seenList.has(url)) {
      return false;
    }

    let inScope = false;

    // check scopes
    for (const s of this.params.scope) {
      if (s.exec(url)) {
        inScope = true;
        break;
      }
    }

    if (!inScope) {
      //console.log(`Not in scope ${url} ${scope}`);
      return false;
    }

    // check exclusions
    for (const e of this.params.exclude) {
      if (e.exec(url)) {
        //console.log(`Skipping ${url} excluded by ${e}`);
        return false;
      }
    }

    return url;
  }

  resolveAgent(urlParsed) {
    return urlParsed.protocol === "https:" ? HTTPS_AGENT : HTTP_AGENT;
  }
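
  // HEAD request to check status and content-type before loading a URL in
  // the browser; on any fetch error, assume HTML and load it anyway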
  async isHTML(url) {
    try {
      const resp = await fetch(url, {
        method: "HEAD",
        headers: this.headers,
        agent: this.resolveAgent
      });

      if (resp.status >= 400) {
        console.log(`Skipping ${url}, invalid status ${resp.status}`);
        return false;
      }

      const contentType = resp.headers.get("Content-Type");

      // just load if no content-type
      if (!contentType) {
        return true;
      }

      const mime = contentType.split(";")[0];

      if (HTML_TYPES.includes(mime)) {
        return true;
      }

      return false;
    } catch(e) {
      console.log("HTML Check error", e);
      // can't confirm not html, so try in browser
      return true;
    }
  }
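
  // fetch a URL through the pywb recording endpoint so it is captured,
  // aborting the local response body, which is not needed here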
  async directFetchCapture(url) {
    //console.log(`Direct capture: ${this.capturePrefix}${url}`);
    const abort = new AbortController();
    const signal = abort.signal;
    await fetch(this.capturePrefix + url, {signal, headers: this.headers});
    abort.abort();
  }

  sleep(time) {
    return new Promise(resolve => setTimeout(resolve, time));
  }

  rxEscape(string) {
    return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
  }
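
  // fetch and parse the sitemap, queueing every URL it lists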
  async parseSitemap(url) {
    const sitemapper = new Sitemapper({
      url,
      timeout: 15000,
      requestHeaders: this.headers
    });

    try {
      const { sites } = await sitemapper.fetch();

      this.queueUrls(sites);

    } catch(e) {
      console.log(e);
    }
  }
}

module.exports.Crawler = Crawler;
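
// A minimal usage sketch (an assumption, not part of this file): an
// entrypoint script would construct the crawler and run it, roughly:
//
//   const { Crawler } = require("./crawler");
//   new Crawler().run();
//
// PROXY_HOST/PROXY_PORT (and DISPLAY/GEOMETRY when not headless) must be
// set in the environment before Crawler is constructed.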