mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
move to src/util
This commit is contained in:
parent
5e5b4de79b
commit
e5fa61d4cf
14 changed files with 417 additions and 258 deletions
|
@ -30,6 +30,7 @@
|
|||
"yargs": "^17.7.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^20.8.7",
|
||||
"eslint": "^8.37.0",
|
||||
"eslint-plugin-react": "^7.22.0",
|
||||
"jest": "^29.2.1",
|
||||
|
|
163
src/logger.js
163
src/logger.js
|
@ -1,163 +0,0 @@
|
|||
"use strict";
|
||||
// Helper backing down-levelled `async` functions (appears to be the standard
// TypeScript-emitted `__awaiter`). It starts `generator` with `thisArg` /
// `_arguments` and returns a promise (of type `P`, defaulting to `Promise`) that
// resolves with the generator's final value, or rejects if stepping it throws.
// Each yielded value is adopted into a `P` and its outcome is fed back into the
// generator through `next` (fulfilled) or `throw` (rejected).
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
||||
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
||||
return new (P || (P = Promise))(function (resolve, reject) {
|
||||
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
||||
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
||||
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
||||
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
||||
});
|
||||
};
|
||||
// Helper emulating a generator object for down-levelled code (appears to be the
// standard TypeScript-emitted `__generator`). It returns an object exposing
// `next`, `throw` and `return` (plus `Symbol.iterator` when available); each call
// runs `step`, which repeatedly invokes `body(thisArg, _)` and interprets the
// `[opcode, value]` pair it returns. `_.label` is the resume point inside `body`,
// `_.sent()` yields the last value passed in (or throws it), and `f` guards
// against re-entrant execution.
var __generator = (this && this.__generator) || function (thisArg, body) {
|
||||
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
|
||||
return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
|
||||
function verb(n) { return function (v) { return step([n, v]); }; }
|
||||
function step(op) {
|
||||
if (f) throw new TypeError("Generator is already executing.");
|
||||
while (_) try {
|
||||
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
|
||||
if (y = 0, t) op = [op[0] & 2, t.value];
|
||||
switch (op[0]) {
|
||||
case 0: case 1: t = op; break;
|
||||
// opcode 4: hand a value out to the caller and advance the resume label
|
||||
case 4: _.label++; return { value: op[1], done: false };
|
||||
case 5: _.label++; y = op[1]; op = [0]; continue;
|
||||
case 7: op = _.ops.pop(); _.trys.pop(); continue;
|
||||
default:
|
||||
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
|
||||
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
|
||||
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
|
||||
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
|
||||
if (t[2]) _.ops.pop();
|
||||
_.trys.pop(); continue;
|
||||
}
|
||||
op = body.call(thisArg, _);
|
||||
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
|
||||
// opcode bits 1|4 set here mean an error is pending: rethrow it; otherwise the generator is finished
|
||||
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
|
||||
}
|
||||
};
|
||||
exports.__esModule = true;
|
||||
exports.logger = exports.errJSON = void 0;
|
||||
// ===========================================================================
|
||||
// Give RegExp a toJSON() so that JSON.stringify() emits the regex's string form
// (its toString() output) when a regex appears in logged data.
|
||||
// Object.defineProperty is used with only `value`, so the property is
// non-enumerable, non-writable and non-configurable, unlike the plain assignment
// `RegExp.prototype.toJSON = RegExp.prototype.toString`.
|
||||
Object.defineProperty(RegExp.prototype, "toJSON", { value: RegExp.prototype.toString });
|
||||
// ===========================================================================
|
||||
/**
 * Convert an error-like value into a plain object suitable for JSON logging.
 *
 * @param {{message: *, stack: *}} e - value whose `message` and `stack` are copied.
 * @returns {{type: string, message: *, stack: *}} record tagged with type "exception".
 */
function errJSON(e) {
    var record = { "type": "exception" };
    record["message"] = e.message;
    record["stack"] = e.stack;
    return record;
}
|
||||
exports.errJSON = errJSON;
|
||||
// ===========================================================================
|
||||
/**
 * Structured JSON logger.
 *
 * Every accepted entry is serialized as one JSON line and written to
 * `console.log`, to the optional external log stream, and (for "error" and
 * "fatal" entries, when enabled) to the crawl state via `crawlState.logError`.
 * Entries can be filtered by log level and by context.
 */
var Logger = /** @class */ (function () {
    function Logger() {
        this.logStream = null;
        this.debugLogging = false;
        this.logErrorsToRedis = false;
        this.logLevels = [];
        this.contexts = [];
        this.crawlState = null;
    }
    /** Set a writable stream (anything with `write(string)`) that also receives each log line. */
    Logger.prototype.setExternalLogStream = function (logFH) {
        this.logStream = logFH;
    };
    /** Enable or disable output from `debug()`. */
    Logger.prototype.setDebugLogging = function (debugLog) {
        this.debugLogging = debugLog;
    };
    /** Enable or disable forwarding of "error"/"fatal" lines to `crawlState.logError`. */
    Logger.prototype.setLogErrorsToRedis = function (logErrorsToRedis) {
        this.logErrorsToRedis = logErrorsToRedis;
    };
    /** Restrict output to the given log levels; an empty list allows all levels. */
    Logger.prototype.setLogLevel = function (logLevels) {
        this.logLevels = logLevels;
    };
    /** Restrict output to the given contexts; an empty list allows all contexts. */
    Logger.prototype.setContext = function (contexts) {
        this.contexts = contexts;
    };
    /** Set the crawl state object used by `fatal()` and by error forwarding. */
    Logger.prototype.setCrawlState = function (crawlState) {
        this.crawlState = crawlState;
    };
    /**
     * Build, filter and emit a single log entry.
     *
     * @param message  Human readable message.
     * @param data     Extra details: an Error (converted with errJSON), an object
     *                 (used as-is), or a primitive (wrapped as `{message: String}`).
     *                 Falsy values are logged as `{}`.
     * @param context  Context label, checked against `setContext()` filters.
     * @param logLevel Level label (default "info"), checked against `setLogLevel()`.
     */
    Logger.prototype.logAsJSON = function (message, data, context, logLevel) {
        if (logLevel === void 0) { logLevel = "info"; }
        if (data instanceof Error) {
            data = errJSON(data);
        }
        else if (data !== undefined && typeof data !== "object") {
            // `undefined` has no toString(); it falls through and is logged as {} below
            data = { "message": data.toString() };
        }
        if (this.logLevels.length && this.logLevels.indexOf(logLevel) < 0) {
            return;
        }
        if (this.contexts.length && this.contexts.indexOf(context) < 0) {
            return;
        }
        var dataToLog = {
            "timestamp": new Date().toISOString(),
            "logLevel": logLevel,
            "context": context,
            "message": message,
            "details": data ? data : {}
        };
        var string = JSON.stringify(dataToLog);
        console.log(string);
        if (this.logStream) {
            this.logStream.write(string + "\n");
        }
        var toLogToRedis = ["error", "fatal"];
        // guard on crawlState so enabling error forwarding before a state is set cannot crash logging
        if (this.logErrorsToRedis && this.crawlState && toLogToRedis.includes(logLevel)) {
            this.crawlState.logError(string);
        }
    };
    /** Log at "info" level. */
    Logger.prototype.info = function (message, data, context) {
        if (data === void 0) { data = {}; }
        if (context === void 0) { context = "general"; }
        this.logAsJSON(message, data, context);
    };
    /** Log at "error" level. */
    Logger.prototype.error = function (message, data, context) {
        if (data === void 0) { data = {}; }
        if (context === void 0) { context = "general"; }
        this.logAsJSON(message, data, context, "error");
    };
    /** Log at "warn" level. */
    Logger.prototype.warn = function (message, data, context) {
        if (data === void 0) { data = {}; }
        if (context === void 0) { context = "general"; }
        this.logAsJSON(message, data, context, "warn");
    };
    /** Log at "debug" level, only when debug logging is enabled. */
    Logger.prototype.debug = function (message, data, context) {
        if (data === void 0) { data = {}; }
        if (context === void 0) { context = "general"; }
        if (this.debugLogging) {
            this.logAsJSON(message, data, context, "debug");
        }
    };
    /**
     * Log at "fatal" level and terminate the process with `exitCode` (default 17).
     *
     * When a crawl state is set, it is first marked "failed" and its end time is
     * recorded; the process exits only after both calls have settled (whether
     * they succeed or fail). Without a crawl state the process exits immediately.
     */
    Logger.prototype.fatal = function (message, data, context, exitCode) {
        if (data === void 0) { data = {}; }
        if (context === void 0) { context = "general"; }
        if (exitCode === void 0) { exitCode = 17; }
        this.logAsJSON("".concat(message, ". Quitting"), data, context, "fatal");
        var crawlState = this.crawlState;
        if (!crawlState) {
            process.exit(exitCode);
            return;
        }
        // setStatus() is invoked synchronously inside the executor so that a
        // synchronous throw becomes a rejection instead of escaping fatal().
        new Promise(function (resolve) { resolve(crawlState.setStatus("failed")); })
            .then(function () { return crawlState.setEndTime(); })
            // finally() must receive a callback: passing process.exit(exitCode)
            // directly would exit before the state updates above had run.
            .finally(function () { process.exit(exitCode); });
    };
    return Logger;
}());
|
||||
exports.logger = new Logger();
|
94
src/redis.js
94
src/redis.js
|
@ -1,94 +0,0 @@
|
|||
"use strict";
|
||||
// Helper backing down-levelled `async` functions (appears to be the standard
// TypeScript-emitted `__awaiter`). It starts `generator` with `thisArg` /
// `_arguments` and returns a promise (of type `P`, defaulting to `Promise`) that
// resolves with the generator's final value, or rejects if stepping it throws.
// Each yielded value is adopted into a `P` and its outcome is fed back into the
// generator through `next` (fulfilled) or `throw` (rejected).
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
||||
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
||||
return new (P || (P = Promise))(function (resolve, reject) {
|
||||
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
||||
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
||||
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
||||
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
||||
});
|
||||
};
|
||||
// Helper emulating a generator object for down-levelled code (appears to be the
// standard TypeScript-emitted `__generator`). It returns an object exposing
// `next`, `throw` and `return` (plus `Symbol.iterator` when available); each call
// runs `step`, which repeatedly invokes `body(thisArg, _)` and interprets the
// `[opcode, value]` pair it returns. `_.label` is the resume point inside `body`,
// `_.sent()` yields the last value passed in (or throws it), and `f` guards
// against re-entrant execution.
var __generator = (this && this.__generator) || function (thisArg, body) {
|
||||
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
|
||||
return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
|
||||
function verb(n) { return function (v) { return step([n, v]); }; }
|
||||
function step(op) {
|
||||
if (f) throw new TypeError("Generator is already executing.");
|
||||
while (_) try {
|
||||
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
|
||||
if (y = 0, t) op = [op[0] & 2, t.value];
|
||||
switch (op[0]) {
|
||||
case 0: case 1: t = op; break;
|
||||
// opcode 4: hand a value out to the caller and advance the resume label
|
||||
case 4: _.label++; return { value: op[1], done: false };
|
||||
case 5: _.label++; y = op[1]; op = [0]; continue;
|
||||
case 7: op = _.ops.pop(); _.trys.pop(); continue;
|
||||
default:
|
||||
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
|
||||
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
|
||||
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
|
||||
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
|
||||
if (t[2]) _.ops.pop();
|
||||
_.trys.pop(); continue;
|
||||
}
|
||||
op = body.call(thisArg, _);
|
||||
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
|
||||
// opcode bits 1|4 set here mean an error is pending: rethrow it; otherwise the generator is finished
|
||||
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
|
||||
}
|
||||
};
|
||||
// Helper for down-levelled array spread (appears to be the standard
// TypeScript-emitted `__spreadArray`): returns `to` concatenated with the
// elements of `from`. When `pack` is truthy, or only two arguments were passed,
// holes in `from` are filled in by copying it into a dense array first;
// otherwise `from` is copied with Array.prototype.slice.
var __spreadArray = (this && this.__spreadArray) || function (to, from, pack) {
|
||||
if (pack || arguments.length === 2) for (var i = 0, l = from.length, ar; i < l; i++) {
|
||||
if (ar || !(i in from)) {
|
||||
// first hole found: start the dense copy from the elements seen so far
|
||||
if (!ar) ar = Array.prototype.slice.call(from, 0, i);
|
||||
ar[i] = from[i];
|
||||
}
|
||||
}
|
||||
return to.concat(ar || Array.prototype.slice.call(from));
|
||||
};
|
||||
exports.__esModule = true;
|
||||
exports.setExitOnRedisError = exports.initRedis = void 0;
|
||||
var ioredis_1 = require("ioredis");
|
||||
var logger_js_1 = require("./logger.js");
|
||||
// Original console.error, used to pass through everything that is not an
// ioredis "Unhandled error event" message.
var error = console.error;
// Date.now() timestamp of the last ioredis error that was logged (0 = none yet).
var lastLogTime = 0;
// When true, a repeated ioredis error (after the first logged one) is fatal.
var exitOnError = false;
// Log ioredis errors at most once per this interval. The value is in
// milliseconds (10 seconds), because it is compared against Date.now() deltas.
var REDIS_ERROR_LOG_INTERVAL_MS = 10000;
/**
 * console.error replacement that throttles ioredis "Unhandled error event"
 * messages and routes them through the structured logger.
 *
 * - Messages whose first argument is a string starting with
 *   "[ioredis] Unhandled error event" are logged as a warning at most once per
 *   REDIS_ERROR_LOG_INTERVAL_MS and are never printed directly.
 * - If exit-on-error is enabled and an earlier ioredis error was already logged,
 *   logger.fatal is called before the warning.
 * - Every other call is forwarded unchanged to the original console.error.
 */
console.error = function () {
    var args = Array.prototype.slice.call(arguments);
    var isRedisError = typeof args[0] === "string" &&
        args[0].indexOf("[ioredis] Unhandled error event") === 0;
    if (!isRedisError) {
        error.apply(console, args);
        return;
    }
    var now = Date.now();
    if ((now - lastLogTime) > REDIS_ERROR_LOG_INTERVAL_MS) {
        // lastLogTime is 0 until the first error, so the first error never triggers the fatal path
        if (lastLogTime && exitOnError) {
            logger_js_1.logger.fatal("Crawl interrupted, redis gone, exiting", {}, "redis");
        }
        logger_js_1.logger.warn("ioredis error", { error: args[0] }, "redis");
        lastLogTime = now;
    }
};
|
||||
// Create an ioredis client for `url` with `lazyConnect: true`, explicitly call
// connect() on it, and return a promise that resolves with the client once
// connect() has resolved. This is the down-levelled form of an async function:
// label 0 constructs the client and yields on connect(), label 1 returns it.
function initRedis(url) {
|
||||
return __awaiter(this, void 0, void 0, function () {
|
||||
var redis;
|
||||
return __generator(this, function (_a) {
|
||||
switch (_a.label) {
|
||||
case 0:
|
||||
redis = new ioredis_1["default"](url, { lazyConnect: true });
|
||||
return [4 /*yield*/, redis.connect()];
|
||||
case 1:
|
||||
// _a.sent() rethrows here if connect() rejected
|
||||
_a.sent();
|
||||
return [2 /*return*/, redis];
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
exports.initRedis = initRedis;
|
||||
// Enable exit-on-error mode: once set, the console.error override in this module
// calls logger.fatal when an ioredis error is logged after an earlier one.
// There is no corresponding way to turn the flag off again in this module.
function setExitOnRedisError() {
|
||||
exitOnError = true;
|
||||
}
|
||||
exports.setExitOnRedisError = setExitOnRedisError;
|
33
src/util/file_reader.ts
Normal file
33
src/util/file_reader.ts
Normal file
|
@ -0,0 +1,33 @@
|
|||
import fs from "fs";
|
||||
import path from "path";
|
||||
|
||||
// Maximum recursion depth: the starting path is depth 0, so only the starting
// directory's direct children (depth 1) are ever read.
const MAX_DEPTH = 2;

/**
 * Collect the contents of a file, or of the matching files in a directory.
 *
 * Each returned string is the file contents prefixed with a
 * `/* src: <absolute path> *\/` banner comment.
 *
 * @param fileOrDir Path to a file or directory (resolved against the cwd).
 * @param ext       File extension to keep, including the dot (e.g. ".js");
 *                  `null` keeps every file.
 * @param depth     Current recursion depth; callers normally omit it.
 * @returns Array of banner-prefixed sources. Always an array: empty when the
 *          path does not match, or when MAX_DEPTH has been reached.
 * @throws Whatever `fs.statSync` / `fs.readFileSync` / `fs.readdirSync` throw,
 *         e.g. when the path does not exist.
 */
export function collectAllFileSources(fileOrDir, ext = null, depth = 0) {
  const resolvedPath = path.resolve(fileOrDir);

  if (depth >= MAX_DEPTH) {
    console.warn(`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`);
    return [];
  }

  const stat = fs.statSync(resolvedPath);

  // isFile / isDirectory are methods: referencing them without calling them is
  // always truthy and would send directories to readFileSync.
  if (stat.isFile()) {
    if (ext === null || path.extname(resolvedPath) === ext) {
      const contents = fs.readFileSync(resolvedPath);
      return [`/* src: ${resolvedPath} */\n\n${contents}`];
    }
  } else if (stat.isDirectory()) {
    return fs.readdirSync(resolvedPath).flatMap((next) =>
      collectAllFileSources(path.join(fileOrDir, next), ext, depth + 1));
  }

  // Only complain about the path the caller passed in, not about every
  // non-matching entry found while walking a directory.
  if (depth === 0) {
    console.warn(`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`);
  }

  // Always return an array so the recursive caller can safely flatten it.
  return [];
}
|
|
@ -12,6 +12,12 @@ const indexHTML = fs.readFileSync(new URL("../html/screencast.html", import.meta
|
|||
// ===========================================================================
|
||||
class WSTransport
|
||||
{
|
||||
allWS = new Set<WebSocket>();
|
||||
caster?: ScreenCaster;
|
||||
wss: ws.Server;
|
||||
httpServer: any;
|
||||
|
||||
|
||||
constructor(port) {
|
||||
this.allWS = new Set();
|
||||
|
||||
|
@ -87,8 +93,13 @@ class WSTransport
|
|||
// ===========================================================================
|
||||
class RedisPubSubTransport
|
||||
{
|
||||
numConnections: number = 0;
|
||||
castChannel: string;
|
||||
caster?: ScreenCaster;
|
||||
ctrlChannel: string;
|
||||
redis: any;
|
||||
|
||||
constructor(redisUrl, crawlId) {
|
||||
this.numConnections = 0;
|
||||
this.castChannel = `c:${crawlId}:cast`;
|
||||
this.ctrlChannel = `c:${crawlId}:ctrl`;
|
||||
|
||||
|
@ -143,6 +154,14 @@ class RedisPubSubTransport
|
|||
// ===========================================================================
|
||||
class ScreenCaster
|
||||
{
|
||||
transport: WSTransport;
|
||||
caches = new Map();
|
||||
urls = new Map();
|
||||
cdps = new Map();
|
||||
maxWidth = 640;
|
||||
maxHeight = 480;
|
||||
initMsg: {[key: string]: any};
|
||||
|
||||
constructor(transport, numWorkers) {
|
||||
this.transport = transport;
|
||||
this.transport.caster = this;
|
102
src/util/screenshots.ts
Normal file
102
src/util/screenshots.ts
Normal file
|
@ -0,0 +1,102 @@
|
|||
import fs from "fs";
|
||||
import path from "path";
|
||||
import * as warcio from "warcio";
|
||||
import sharp from "sharp";
|
||||
|
||||
import { logger, errJSON } from "./logger.js";
|
||||
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Screenshot options keyed by screenshot type name. Each entry is passed
 * unchanged to `page.screenshot()`, and its `type` is also used as the image
 * subtype of the stored record's Content-Type.
 */
export const screenshotTypes = {
  // the current viewport, as PNG
  view: { type: "png", omitBackground: true, fullPage: false },
  // the current viewport, as JPEG
  thumbnail: { type: "jpeg", omitBackground: true, fullPage: false },
  // the whole page, as PNG
  fullPage: { type: "png", omitBackground: true, fullPage: true },
};
|
||||
|
||||
|
||||
/**
 * Takes screenshots of a page and appends them, as gzipped WARC "resource"
 * records, to `<directory>/screenshots.warc.gz`.
 *
 * Failures while taking or writing a screenshot are logged and swallowed, so
 * `take()`, `takeFullPage()` and `takeThumbnail()` never reject because of them.
 */
export class Screenshots {
  browser: any;
  page: any;
  url: string;
  directory: string;
  warcName: string;
  date: Date;

  /**
   * @param browser   object providing `setViewport(page, {width, height})`
   * @param page      page object providing `screenshot(options)`
   * @param url       URL of the page, used in record URLs and log messages
   * @param date      record date; defaults to the time of construction
   * @param directory directory in which screenshots.warc.gz is written
   */
  constructor({browser, page, url, date, directory}) {
    this.browser = browser;
    this.page = page;
    this.url = url;
    this.directory = directory;
    this.warcName = path.join(this.directory, "screenshots.warc.gz");
    this.date = date || new Date();
  }

  /**
   * Take a screenshot of the given type (a key of `screenshotTypes`) and store it.
   * The viewport is forced to 1920x1080 for every type except "fullPage".
   */
  async take(screenshotType="view") {
    const useFixedViewport = screenshotType !== "fullPage";
    try {
      if (useFixedViewport) {
        await this.browser.setViewport(this.page, {width: 1920, height: 1080});
      }
      const options = screenshotTypes[screenshotType];
      const imageBuffer = await this.page.screenshot(options);
      await this.writeBufferToWARC(imageBuffer, screenshotType, options.type);
      logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
    } catch (e) {
      logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
    }
  }

  /** Shorthand for `take("fullPage")`. */
  async takeFullPage() {
    await this.take("fullPage");
  }

  /**
   * Take a 1920x1080 viewport screenshot, scale it down to a 640x360 (16:9)
   * thumbnail and store it.
   */
  async takeThumbnail() {
    const screenshotType = "thumbnail";
    try {
      await this.browser.setViewport(this.page, {width: 1920, height: 1080});
      const options = screenshotTypes[screenshotType];
      const fullSizeBuffer = await this.page.screenshot(options);
      // 16:9 thumbnail
      const resizer = sharp(fullSizeBuffer).resize(640, 360);
      const thumbnailBuffer = await resizer.toBuffer();
      await this.writeBufferToWARC(thumbnailBuffer, screenshotType, options.type);
      logger.info(`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`);
    } catch (e) {
      logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
    }
  }

  /** Wrap the image in a WARC record, gzip-serialize it and append it to the WARC file. */
  async writeBufferToWARC(screenshotBuffer, screenshotType, imageType) {
    const record = await this.wrap(screenshotBuffer, screenshotType, imageType);
    const serialized = await warcio.WARCSerializer.serialize(record, {gzip: true});
    fs.appendFileSync(this.warcName, serialized);
  }

  /**
   * Build a WARC/1.1 "resource" record for the image, addressed as
   * `urn:<screenshotType>:<page url>` with Content-Type `image/<imageType>`.
   */
  async wrap(buffer, screenshotType="screenshot", imageType="png") {
    const warcVersion = "WARC/1.1";
    const warcHeaders = {"Content-Type": `image/${imageType}`};

    // the record payload is supplied as an async iterable yielding the single buffer
    const content = async function* () {
      yield buffer;
    };

    const recordOptions = {
      url: `urn:${screenshotType}:` + this.url,
      date: this.date.toISOString(),
      type: "resource",
      warcVersion,
      warcHeaders,
    };

    return warcio.WARCRecord.create(recordOptions, content());
  }
}
|
212
src/util/seeds.ts
Normal file
212
src/util/seeds.ts
Normal file
|
@ -0,0 +1,212 @@
|
|||
import { logger } from "./logger.js";
|
||||
import { MAX_DEPTH } from "./constants.js";
|
||||
|
||||
// Names of the supported crawl scopes. As implemented by ScopedSeed.scopeFromType:
//   "prefix"   - URLs under the seed's origin + directory path
//   "host"     - URLs on the seed's origin
//   "domain"   - URLs on the seed's host (leading "www." dropped) and its subdomains
//   "page"     - no include rules are generated; max depth is set to extraHops
//   "page-spa" - the seed URL followed by a hash fragment (hashes are kept)
//   "any"      - every URL
//   "custom"   - only the caller-supplied include rules are used
// Generated rules for "page-spa", "prefix", "host" and "domain" accept both
// http and https.
type ScopeType =
|
||||
| "prefix"
|
||||
| "host"
|
||||
| "domain"
|
||||
| "page"
|
||||
| "page-spa"
|
||||
| "any"
|
||||
| "custom";
|
||||
|
||||
/**
 * A crawl seed URL together with the rules that decide which discovered URLs
 * are in scope for it.
 *
 * Scope is expressed as a list of `include` regexes (generated from the scope
 * type and/or supplied by the caller) and a list of `exclude` regexes, plus a
 * maximum link depth and a maximum number of out-of-scope "extra hops".
 */
export class ScopedSeed
{
  url: string;
  scopeType: ScopeType;
  include: RegExp[];
  exclude?: RegExp[] = [];
  allowHash = false;
  depth = -1;
  sitemap?: string;
  extraHops = 0;

  maxExtraHops = 0;
  maxDepth = 0;

  /**
   * @param url       seed URL; must be a valid http(s) URL
   * @param scopeType scope type; when falsy, "custom" is used if include rules
   *                  were given and "prefix" otherwise
   * @param include   extra include rules (appended after the generated ones)
   * @param exclude   exclusion rules
   * @param allowHash keep URL hashes when matching; overridden by the scope
   *                  type for every type except "custom"
   * @param depth     max link depth; negative means MAX_DEPTH. For "page"
   *                  scope the depth is replaced by `extraHops`
   * @param sitemap   `true` to use `/sitemap.xml` on the seed's host, or an
   *                  explicit value which is stored as given
   * @param extraHops max number of out-of-scope hops to follow
   * @throws Error("Invalid URL") when the seed URL cannot be used
   */
  constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} :
  {url: string, scopeType: ScopeType, include: string[], exclude?: string[], allowHash?: boolean, depth?: number, sitemap?: boolean, extraHops?: number}) {
    const parsedUrl = this.parseUrl(url);
    if (!parsedUrl) {
      throw new Error("Invalid URL");
    }
    this.url = parsedUrl.href;
    this.include = this.parseRx(include);
    this.exclude = this.parseRx(exclude);
    this.scopeType = scopeType;

    if (!this.scopeType) {
      this.scopeType = this.include.length ? "custom" : "prefix";
    }

    if (this.scopeType !== "custom") {
      const [includeNew, allowHashNew] = this.scopeFromType(this.scopeType, parsedUrl);
      this.include = [...includeNew, ...this.include];
      allowHash = allowHashNew;
    }

    // for page scope, the depth is set to extraHops, as no other
    // crawling is done
    if (this.scopeType === "page") {
      depth = extraHops;
    }

    this.sitemap = this.resolveSiteMap(sitemap);
    this.allowHash = allowHash;
    this.maxExtraHops = extraHops;
    this.maxDepth = depth < 0 ? MAX_DEPTH : depth;
  }

  /**
   * Normalize include/exclude rules to an array of RegExp.
   * Accepts null/undefined/"" (-> []), a single pattern, or an array of
   * patterns; array entries that are already RegExp instances are kept as-is.
   */
  parseRx(value) {
    if (value === null || value === undefined || value === "") {
      return [];
    } else if (!(value instanceof Array)) {
      return [new RegExp(value)];
    } else {
      return value.map(e => (e instanceof RegExp) ? e : new RegExp(e));
    }
  }

  /**
   * Parse `url` (after trimming) and check that it is http(s).
   * Returns the URL object, or null (after logging a warning) when the string
   * is not a valid URL or uses another protocol.
   */
  parseUrl(url, logDetails = {}) {
    let parsedUrl = null;
    try {
      parsedUrl = new URL(url.trim());
    } catch (e) {
      logger.warn("Invalid Page - not a valid URL", {url, ...logDetails});
      return null;
    }

    if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
      logger.warn("Invalid Page - URL must start with http:// or https://", {url, ...logDetails});
      parsedUrl = null;
    }

    return parsedUrl;
  }

  /**
   * Resolve the sitemap setting: `true` becomes `/sitemap.xml` on the seed's
   * host; any other value is returned unchanged.
   */
  resolveSiteMap(sitemap) {
    if (sitemap === true) {
      const url = new URL(this.url);
      url.pathname = "/sitemap.xml";
      return url.href;
    }

    return sitemap;
  }

  /**
   * Build the include rules implied by a scope type.
   * Returns `[include, allowHash]`. For an unknown scope type `logger.fatal`
   * is called; should it return, the include list is empty rather than undefined.
   */
  scopeFromType(scopeType, parsedUrl) : [RegExp[], boolean] {
    // initialized so the default branch can never hand back `undefined`
    let include : RegExp[] = [];
    let allowHash = false;

    switch (scopeType) {
    case "page":
      include = [];
      break;

    case "page-spa":
      // allow scheme-agnostic URLS as likely redirects
      include = [new RegExp("^" + urlRxEscape(parsedUrl.href, parsedUrl) + "#.+")];
      allowHash = true;
      break;

    case "prefix":
      include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + parsedUrl.pathname.slice(0, parsedUrl.pathname.lastIndexOf("/") + 1), parsedUrl))];
      break;

    case "host":
      include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl))];
      break;

    case "domain": {
      // work on a copy so the caller's URL object is not modified
      const domainUrl = new URL(parsedUrl.href);
      if (domainUrl.hostname.startsWith("www.")) {
        domainUrl.hostname = domainUrl.hostname.replace("www.", "");
      }
      // allow any number of subdomain labels between "//" and the host
      include = [new RegExp("^" + urlRxEscape(domainUrl.origin + "/", domainUrl).replace("\\/\\/", "\\/\\/([^/]+\\.)*"))];
      break;
    }

    case "any":
      include = [/.*/];
      break;

    default:
      logger.fatal(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`);
    }

    return [include, allowHash];
  }

  /** True when `depth` has reached (or passed) this seed's maximum depth. */
  isAtMaxDepth(depth) {
    return depth >= this.maxDepth;
  }

  /**
   * Decide whether `url`, found at `depth`, should be crawled for this seed.
   *
   * @returns `false` when the URL is too deep, invalid, out of scope (and not
   *          allowed as an extra hop) or excluded. Otherwise `true` when the
   *          normalized URL equals the seed URL, or `{url, isOOS}` with the
   *          normalized URL and whether it is only allowed as an
   *          out-of-scope extra hop.
   */
  isIncluded(url, depth, extraHops = 0, logDetails = {}) {
    if (depth > this.maxDepth) {
      return false;
    }

    url = this.parseUrl(url, logDetails);
    if (!url) {
      return false;
    }

    if (!this.allowHash) {
      // remove hashtag
      url.hash = "";
    }

    url = url.href;

    if (url === this.url) {
      return true;
    }

    // check scopes
    const inScope = this.include.some((s) => s.test(url));

    let isOOS = false;

    if (!inScope) {
      if (this.maxExtraHops && extraHops <= this.maxExtraHops) {
        isOOS = true;
      } else {
        return false;
      }
    }

    // check exclusions
    for (const e of this.exclude) {
      if (e.test(url)) {
        return false;
      }
    }

    return {url, isOOS};
  }
}
|
||||
|
||||
/**
 * Escape every regular-expression metacharacter in `string` (including "-"
 * and "/") by prefixing it with a backslash, so the result can be embedded in
 * a RegExp source and match the input literally.
 */
export function rxEscape(string) {
  const metaChars = /[-/\\^$*+?.()|[\]{}]/g;
  return string.replace(metaChars, (ch) => "\\" + ch);
}
|
||||
|
||||
/**
 * Regex-escape `url` and make it scheme-agnostic: the first occurrence of
 * `parsedUrl.protocol` in the escaped text is replaced with "https?:" so the
 * resulting pattern matches both the http and https form.
 */
export function urlRxEscape(url, parsedUrl) {
  const escapedUrl = rxEscape(url);
  const { protocol } = parsedUrl;
  return escapedUrl.replace(protocol, "https?:");
}
|
||||
|
||||
|
||||
|
||||
|
37
src/util/timing.ts
Normal file
37
src/util/timing.ts
Normal file
|
@ -0,0 +1,37 @@
|
|||
import { logger } from "./logger.js";
|
||||
|
||||
/**
 * Return a promise that resolves (with no value) after `seconds` seconds.
 */
export function sleep(seconds) {
  const delayMs = seconds * 1000;
  return new Promise((wake) => {
    setTimeout(wake, delayMs);
  });
}
|
||||
|
||||
/**
 * Wait for `promise`, but for no longer than `seconds` seconds.
 *
 * @param promise    promise (or value) to wait for
 * @param seconds    timeout in seconds
 * @param message    log message used when the timeout is hit
 * @param logDetails extra fields merged into the logged details
 * @param context    log context
 * @param isWarn     log the timeout with logger.warn instead of logger.error
 * @returns a promise that resolves with the value of `promise`, or with
 *          `undefined` (after logging) if the timeout is reached first. It
 *          rejects only if `promise` itself rejects before the timeout.
 */
export function timedRun(promise, seconds, message="Promise timed out", logDetails={}, context="general", isWarn=false) {
  const timeoutMs = seconds * 1000;

  // Unique marker, so a rejection produced by `promise` itself can never be
  // mistaken for the timeout, whatever value it rejects with.
  const TIMED_OUT = Symbol("timeout reached");

  let timer;
  const timeoutPromise = new Promise((resolve, reject) => {
    timer = setTimeout(() => reject(TIMED_OUT), timeoutMs);
  });

  return Promise.race([promise, timeoutPromise])
    .catch((err) => {
      if (err !== TIMED_OUT) {
        throw err;
      }
      const logFunc = isWarn ? logger.warn : logger.error;
      logFunc.call(logger, message, {"seconds": seconds, ...logDetails}, context);
    })
    // Always release the timer: otherwise it stays pending (and keeps the
    // event loop alive) for the full timeout after `promise` has settled.
    .finally(() => clearTimeout(timer));
}
|
||||
|
||||
/**
 * Seconds between `startTime` (epoch milliseconds, or anything that coerces to
 * them) and `nowDate`, which defaults to the current time.
 */
export function secondsElapsed(startTime, nowDate = null) {
  const end = nowDate ? nowDate : new Date();
  const elapsedMs = end.getTime() - startTime;
  return elapsedMs / 1000;
}
|
||||
|
||||
/**
 * Current UTC time as a digits-only timestamp: the ISO-8601 string from
 * `Date.prototype.toISOString()` with every non-digit character removed.
 */
export function timestampNow() {
  const isoNow = new Date().toISOString();
  const nonDigits = /[^\d]/g;
  return isoNow.replace(nonDigits, "");
}
|
12
yarn.lock
12
yarn.lock
|
@ -934,6 +934,13 @@
|
|||
resolved "https://registry.yarnpkg.com/@types/node/-/node-15.3.0.tgz#d6fed7d6bc6854306da3dea1af9f874b00783e26"
|
||||
integrity sha512-8/bnjSZD86ZfpBsDlCIkNXIvm+h6wi9g7IqL+kmFkQ+Wvu3JrasgLElfiPgoo8V8vVfnEi0QVS12gbl94h9YsQ==
|
||||
|
||||
"@types/node@^20.8.7":
|
||||
version "20.8.7"
|
||||
resolved "https://registry.yarnpkg.com/@types/node/-/node-20.8.7.tgz#ad23827850843de973096edfc5abc9e922492a25"
|
||||
integrity sha512-21TKHHh3eUHIi2MloeptJWALuCu5H7HQTdTrWIFReA8ad+aggoX+lRes3ex7/FtpC+sVUpFMQ+QTfYr74mruiQ==
|
||||
dependencies:
|
||||
undici-types "~5.25.1"
|
||||
|
||||
"@types/prettier@^2.1.5":
|
||||
version "2.7.1"
|
||||
resolved "https://registry.yarnpkg.com/@types/prettier/-/prettier-2.7.1.tgz#dfd20e2dc35f027cdd6c1908e80a5ddc7499670e"
|
||||
|
@ -4714,6 +4721,11 @@ unbzip2-stream@1.4.3:
|
|||
buffer "^5.2.1"
|
||||
through "^2.3.8"
|
||||
|
||||
undici-types@~5.25.1:
|
||||
version "5.25.3"
|
||||
resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-5.25.3.tgz#e044115914c85f0bcbb229f346ab739f064998c3"
|
||||
integrity sha512-Ga1jfYwRn7+cP9v8auvEXN1rX3sWqlayd4HP7OKk4mZWylEmu3KzXDUGrQUN6Ol7qo1gPvB2e5gX6udnyEPgdA==
|
||||
|
||||
unique-string@^3.0.0:
|
||||
version "3.0.0"
|
||||
resolved "https://registry.yarnpkg.com/unique-string/-/unique-string-3.0.0.tgz#84a1c377aff5fd7a8bc6b55d8244b2bd90d75b9a"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue