move to src/util

This commit is contained in:
Ilya Kreymer 2023-10-21 21:39:38 -07:00
parent 5e5b4de79b
commit e5fa61d4cf
14 changed files with 417 additions and 258 deletions

View file

@ -30,6 +30,7 @@
"yargs": "^17.7.2"
},
"devDependencies": {
"@types/node": "^20.8.7",
"eslint": "^8.37.0",
"eslint-plugin-react": "^7.22.0",
"jest": "^29.2.1",

View file

@ -1,163 +0,0 @@
"use strict";
// TypeScript-emitted helper backing compiled `async` functions.
// Reuses an existing `this.__awaiter` when one is present; otherwise defines it here.
// It calls `generator` with `thisArg`/`_arguments` and returns a promise built from
// `P` (falling back to the global Promise when `P` is falsy) that settles when the
// generator finishes or throws.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap non-P values in a P so `.then` can always be called on a yielded value.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the fulfilled value; a synchronous throw rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Inject the rejection reason into the generator via its "throw" method.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Either finish with the generator's final value or wait on the yielded value.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator and kick off the first step.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
// TypeScript-emitted helper that emulates a generator for ES5 targets.
// `body` is called repeatedly with the state object `_` and returns an
// [opcode, value] pair; the returned object `g` exposes next/throw/return.
// The compiled call sites in this file annotate opcode 4 as "yield" and 2 as "return".
var __generator = (this && this.__generator) || function (thisArg, body) {
// `_`: label = current position in `body`, sent() = last value sent in (throws it when
// the pending op has bit 1 set), trys/ops = stacks used by the default branch below.
// f = "currently executing" flag, y = delegated iterator, t = scratch, g = result object.
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
// next/throw/return map to opcodes 0/1/2; the object is its own iterator when Symbol exists.
return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
function verb(n) { return function (v) { return step([n, v]); }; }
function step(op) {
// Guard against re-entrant calls while a step is in progress.
if (f) throw new TypeError("Generator is already executing.");
// Loop until `_` is cleared (set to 0 below) or a value is returned.
while (_) try {
// While a delegated iterator `y` is set, forward the op to it and return its result until it is done.
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
if (y = 0, t) op = [op[0] & 2, t.value];
switch (op[0]) {
case 0: case 1: t = op; break;
// Opcode 4: advance the label and hand the value out as a not-done result.
case 4: _.label++; return { value: op[1], done: false };
// Opcode 5: start delegating to op[1] (presumably a `yield*` target -- confirm against tslib docs).
case 5: _.label++; y = op[1]; op = [0]; continue;
case 7: op = _.ops.pop(); _.trys.pop(); continue;
default:
// With no active entry on the trys stack, opcodes 6 and 2 terminate the loop.
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
if (t[2]) _.ops.pop();
_.trys.pop(); continue;
}
op = body.call(thisArg, _);
// An exception thrown by `body` becomes opcode 6 carrying the error.
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
// Opcodes with bit 1 or bit 4 set rethrow their payload; anything else completes the iteration.
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
}
};
exports.__esModule = true;
exports.logger = exports.errJSON = void 0;
// ===========================================================================
// to fix serialization of regexes for logging purposes
// RegExp.prototype.toJSON = RegExp.prototype.toString;
Object.defineProperty(RegExp.prototype, "toJSON", { value: RegExp.prototype.toString });
// ===========================================================================
/**
 * Convert an error-like value into a plain, JSON-serializable record.
 *
 * @param {{message: *, stack: *}} e - value whose `message` and `stack` are copied.
 * @returns {{type: string, message: *, stack: *}} record tagged with type "exception".
 */
function errJSON(e) {
    var record = { "type": "exception" };
    record["message"] = e.message;
    record["stack"] = e.stack;
    return record;
}
exports.errJSON = errJSON;
// ===========================================================================
/**
 * JSON-lines logger.
 *
 * Every accepted record is printed with console.log and, when configured, is
 * also written to an external stream and (for "error"/"fatal") forwarded to
 * `crawlState.logError`. Records can be filtered by log level and by context.
 */
var Logger = /** @class */ (function () {
    function Logger() {
        this.logStream = null;
        this.debugLogging = false;
        this.logErrorsToRedis = false;
        this.logLevels = [];
        this.contexts = [];
        this.crawlState = null;
    }
    /** Set an additional sink; only its `write(string)` method is used. */
    Logger.prototype.setExternalLogStream = function (logFH) {
        this.logStream = logFH;
    };
    /** Enable or disable output from `debug()`. */
    Logger.prototype.setDebugLogging = function (debugLog) {
        this.debugLogging = debugLog;
    };
    /** Enable or disable forwarding of error/fatal records to `crawlState.logError`. */
    Logger.prototype.setLogErrorsToRedis = function (logErrorsToRedis) {
        this.logErrorsToRedis = logErrorsToRedis;
    };
    /** Restrict output to the given log levels; an empty array allows all levels. */
    Logger.prototype.setLogLevel = function (logLevels) {
        this.logLevels = logLevels;
    };
    /** Restrict output to the given contexts; an empty array allows all contexts. */
    Logger.prototype.setContext = function (contexts) {
        this.contexts = contexts;
    };
    /** Set the object used by `fatal()` (setStatus/setEndTime) and by error forwarding (logError). */
    Logger.prototype.setCrawlState = function (crawlState) {
        this.crawlState = crawlState;
    };
    /**
     * Serialize one record and emit it to every configured sink.
     *
     * @param {string} message - log message.
     * @param {*} data - Error (converted with errJSON), object (used as-is),
     *   null/undefined (logged as {}), or any other value (logged as {message: String(value)}).
     * @param {string} context - context label, checked against the context filter.
     * @param {string} [logLevel="info"] - level, checked against the level filter.
     */
    Logger.prototype.logAsJSON = function (message, data, context, logLevel) {
        if (logLevel === void 0) { logLevel = "info"; }
        // Apply the filters first so rejected records cost nothing to normalize.
        if (this.logLevels.length && this.logLevels.indexOf(logLevel) < 0) {
            return;
        }
        if (this.contexts.length && this.contexts.indexOf(context) < 0) {
            return;
        }
        if (data instanceof Error) {
            data = errJSON(data);
        }
        else if (data === undefined || data === null) {
            // Previously `undefined` reached data.toString() and threw.
            data = {};
        }
        else if (typeof data !== "object") {
            data = { "message": String(data) };
        }
        var dataToLog = {
            "timestamp": new Date().toISOString(),
            "logLevel": logLevel,
            "context": context,
            "message": message,
            "details": data
        };
        var string = JSON.stringify(dataToLog);
        console.log(string);
        if (this.logStream) {
            this.logStream.write(string + "\n");
        }
        var isErrorLevel = logLevel === "error" || logLevel === "fatal";
        // Guard on crawlState: forwarding may be enabled before a crawl state is attached.
        if (this.logErrorsToRedis && isErrorLevel && this.crawlState) {
            this.crawlState.logError(string);
        }
    };
    /** Log at "info" level. */
    Logger.prototype.info = function (message, data, context) {
        if (data === void 0) { data = {}; }
        if (context === void 0) { context = "general"; }
        this.logAsJSON(message, data, context);
    };
    /** Log at "error" level. */
    Logger.prototype.error = function (message, data, context) {
        if (data === void 0) { data = {}; }
        if (context === void 0) { context = "general"; }
        this.logAsJSON(message, data, context, "error");
    };
    /** Log at "warn" level. */
    Logger.prototype.warn = function (message, data, context) {
        if (data === void 0) { data = {}; }
        if (context === void 0) { context = "general"; }
        this.logAsJSON(message, data, context, "warn");
    };
    /** Log at "debug" level, only when debug logging is enabled. */
    Logger.prototype.debug = function (message, data, context) {
        if (data === void 0) { data = {}; }
        if (context === void 0) { context = "general"; }
        if (this.debugLogging) {
            this.logAsJSON(message, data, context, "debug");
        }
    };
    /**
     * Log at "fatal" level and terminate the process.
     *
     * When a crawl state is attached, it is marked "failed" and its end time is
     * set BEFORE the process exits.
     *
     * @param {string} message - message; ". Quitting" is appended.
     * @param {*} [data={}] - details, as for logAsJSON.
     * @param {string} [context="general"] - context label.
     * @param {number} [exitCode=17] - process exit code.
     */
    Logger.prototype.fatal = function (message, data, context, exitCode) {
        if (data === void 0) { data = {}; }
        if (context === void 0) { context = "general"; }
        if (exitCode === void 0) { exitCode = 17; }
        this.logAsJSON("".concat(message, ". Quitting"), data, context, "fatal");
        // setStatus runs synchronously inside the executor, so a synchronous throw
        // becomes a rejection; setEndTime runs once setStatus has settled successfully.
        function markFailedAndEnd(crawlState) {
            return new Promise(function (resolve) {
                resolve(crawlState.setStatus("failed"));
            }).then(function () {
                return crawlState.setEndTime();
            });
        }
        if (this.crawlState) {
            // Pass a callback: `finally(process.exit(exitCode))` would exit immediately,
            // before the status updates above had a chance to run.
            markFailedAndEnd(this.crawlState)["finally"](function () {
                process.exit(exitCode);
            });
        }
        else {
            process.exit(exitCode);
        }
    };
    return Logger;
}());
exports.logger = new Logger();

View file

@ -1,94 +0,0 @@
"use strict";
// TypeScript-emitted helper backing compiled `async` functions.
// Reuses an existing `this.__awaiter` when one is present; otherwise defines it here.
// It calls `generator` with `thisArg`/`_arguments` and returns a promise built from
// `P` (falling back to the global Promise when `P` is falsy) that settles when the
// generator finishes or throws.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap non-P values in a P so `.then` can always be called on a yielded value.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the fulfilled value; a synchronous throw rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Inject the rejection reason into the generator via its "throw" method.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Either finish with the generator's final value or wait on the yielded value.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator and kick off the first step.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
// TypeScript-emitted helper that emulates a generator for ES5 targets.
// `body` is called repeatedly with the state object `_` and returns an
// [opcode, value] pair; the returned object `g` exposes next/throw/return.
// The compiled call site in initRedis annotates opcode 4 as "yield" and 2 as "return".
var __generator = (this && this.__generator) || function (thisArg, body) {
// `_`: label = current position in `body`, sent() = last value sent in (throws it when
// the pending op has bit 1 set), trys/ops = stacks used by the default branch below.
// f = "currently executing" flag, y = delegated iterator, t = scratch, g = result object.
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
// next/throw/return map to opcodes 0/1/2; the object is its own iterator when Symbol exists.
return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
function verb(n) { return function (v) { return step([n, v]); }; }
function step(op) {
// Guard against re-entrant calls while a step is in progress.
if (f) throw new TypeError("Generator is already executing.");
// Loop until `_` is cleared (set to 0 below) or a value is returned.
while (_) try {
// While a delegated iterator `y` is set, forward the op to it and return its result until it is done.
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
if (y = 0, t) op = [op[0] & 2, t.value];
switch (op[0]) {
case 0: case 1: t = op; break;
// Opcode 4: advance the label and hand the value out as a not-done result.
case 4: _.label++; return { value: op[1], done: false };
// Opcode 5: start delegating to op[1] (presumably a `yield*` target -- confirm against tslib docs).
case 5: _.label++; y = op[1]; op = [0]; continue;
case 7: op = _.ops.pop(); _.trys.pop(); continue;
default:
// With no active entry on the trys stack, opcodes 6 and 2 terminate the loop.
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
if (t[2]) _.ops.pop();
_.trys.pop(); continue;
}
op = body.call(thisArg, _);
// An exception thrown by `body` becomes opcode 6 carrying the error.
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
// Opcodes with bit 1 or bit 4 set rethrow their payload; anything else completes the iteration.
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
}
};
// TypeScript-emitted helper for array spread on ES5 targets: returns `to`
// concatenated with the elements of `from`.
// When `pack` is truthy (or only two arguments are passed), indices missing from
// `from` are copied explicitly into a fresh array `ar`, so they become own
// entries holding `from[i]` rather than staying absent.
var __spreadArray = (this && this.__spreadArray) || function (to, from, pack) {
if (pack || arguments.length === 2) for (var i = 0, l = from.length, ar; i < l; i++) {
// `ar` is created lazily at the first missing index; from then on every element is copied.
if (ar || !(i in from)) {
if (!ar) ar = Array.prototype.slice.call(from, 0, i);
ar[i] = from[i];
}
}
// If no packed copy was needed, concatenate a plain slice of `from`.
return to.concat(ar || Array.prototype.slice.call(from));
};
exports.__esModule = true;
exports.setExitOnRedisError = exports.initRedis = void 0;
var ioredis_1 = require("ioredis");
var logger_js_1 = require("./logger.js");
// Original console.error, used for every message that is not ioredis noise.
var error = console.error;
// Date.now() value of the last logged ioredis error (0 = none logged yet).
var lastLogTime = 0;
// When true, a repeated ioredis error is treated as fatal (see setExitOnRedisError).
var exitOnError = false;
// log only once every 10 seconds
// (the value is compared against Date.now() differences, i.e. milliseconds)
var REDIS_ERROR_LOG_INTERVAL_SECS = 10000;
/**
 * console.error replacement that intercepts ioredis "Unhandled error event"
 * messages and routes them through the logger at a throttled rate.
 * All other messages are forwarded unchanged to the original console.error.
 */
console.error = function () {
    var args = Array.prototype.slice.call(arguments);
    var first = args[0];
    var isRedisError = typeof first === "string" &&
        first.indexOf("[ioredis] Unhandled error event") === 0;
    if (!isRedisError) {
        error.apply(console, args);
        return;
    }
    var now = Date.now();
    var sinceLast = now - lastLogTime;
    if (sinceLast > REDIS_ERROR_LOG_INTERVAL_SECS) {
        // A previous error was already logged and exit-on-error is on: give up.
        if (lastLogTime && exitOnError) {
            logger_js_1.logger.fatal("Crawl interrupted, redis gone, exiting", {}, "redis");
        }
        logger_js_1.logger.warn("ioredis error", { error: first }, "redis");
        lastLogTime = now;
    }
};
/**
 * Create a lazily-connecting ioredis client for `url`, connect it, and
 * resolve with the client once the connection attempt has succeeded.
 *
 * @param {string} url - redis connection URL passed to the ioredis constructor.
 * @returns {Promise<object>} promise for the connected client; rejects if
 *   construction or `connect()` fails.
 */
function initRedis(url) {
    // Construct inside the executor so a synchronous constructor error
    // surfaces as a rejection rather than a thrown exception.
    return new Promise(function (resolve) {
        var client = new ioredis_1["default"](url, { lazyConnect: true });
        var connected = Promise.resolve(client.connect());
        resolve(connected.then(function () {
            return client;
        }));
    });
}
exports.initRedis = initRedis;
/**
 * Turn on exit-on-error mode: once set, the console.error override above calls
 * logger.fatal when an ioredis error recurs after an earlier one was logged.
 * There is no way to turn the flag back off from this module.
 */
function setExitOnRedisError() {
exitOnError = true;
}
exports.setExitOnRedisError = setExitOnRedisError;

33
src/util/file_reader.ts Normal file
View file

@ -0,0 +1,33 @@
import fs from "fs";
import path from "path";
// Recursion stops once `depth` reaches this value: the starting path is depth 0,
// so only the starting path and its direct children are ever read.
const MAX_DEPTH = 2;

/**
 * Read a file, or every matching file in a directory, into an array of
 * source strings. Each entry is prefixed with a comment naming its absolute path.
 *
 * @param {string} fileOrDir - path to a file or directory.
 * @param {string|null} [ext=null] - required extension including the dot
 *   (e.g. ".js"); `null` accepts every file.
 * @param {number} [depth=0] - current recursion depth (internal).
 * @returns {string[]} file contents in traversal order; always an array,
 *   empty when nothing matched or the depth limit was hit.
 * @throws {Error} whatever fs.statSync / readFileSync / readdirSync throw
 *   (e.g. ENOENT for a missing path).
 */
export function collectAllFileSources(fileOrDir, ext = null, depth = 0) {
  const resolvedPath = path.resolve(fileOrDir);

  if (depth >= MAX_DEPTH) {
    console.warn(`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`);
    return [];
  }

  const stat = fs.statSync(resolvedPath);

  // isFile/isDirectory are methods; referencing them without calling is always truthy,
  // which previously sent directories to readFileSync and files to readdirSync.
  if (stat.isFile() && (ext === null || path.extname(resolvedPath) === ext)) {
    const contents = fs.readFileSync(resolvedPath);
    return [`/* src: ${resolvedPath} */\n\n${contents}`];
  }

  if (stat.isDirectory()) {
    return fs.readdirSync(resolvedPath).flatMap((entry) =>
      collectAllFileSources(path.join(fileOrDir, entry), ext, depth + 1));
  }

  // Only complain about the path the caller passed in, not about skipped children.
  if (depth === 0) {
    console.warn(`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`);
  }

  // Always return an array so the recursive caller can flatten the result.
  return [];
}

View file

@ -12,6 +12,12 @@ const indexHTML = fs.readFileSync(new URL("../html/screencast.html", import.meta
// ===========================================================================
class WSTransport
{
allWS = new Set<WebSocket>();
caster?: ScreenCaster;
wss: ws.Server;
httpServer: any;
constructor(port) {
this.allWS = new Set();
@ -87,8 +93,13 @@ class WSTransport
// ===========================================================================
class RedisPubSubTransport
{
numConnections: number = 0;
castChannel: string;
caster?: ScreenCaster;
ctrlChannel: string;
redis: any;
constructor(redisUrl, crawlId) {
this.numConnections = 0;
this.castChannel = `c:${crawlId}:cast`;
this.ctrlChannel = `c:${crawlId}:ctrl`;
@ -143,6 +154,14 @@ class RedisPubSubTransport
// ===========================================================================
class ScreenCaster
{
transport: WSTransport;
caches = new Map();
urls = new Map();
cdps = new Map();
maxWidth = 640;
maxHeight = 480;
initMsg: {[key: string]: any};
constructor(transport, numWorkers) {
this.transport = transport;
this.transport.caster = this;

102
src/util/screenshots.ts Normal file
View file

@ -0,0 +1,102 @@
import fs from "fs";
import path from "path";
import * as warcio from "warcio";
import sharp from "sharp";
import { logger, errJSON } from "./logger.js";
// ============================================================================
// Screenshot presets keyed by screenshot type name. The Screenshots class below
// looks an entry up by name and passes it as the options object to
// `page.screenshot()`; the entry's `type` is also used as the image subtype
// in the WARC record's Content-Type.
export const screenshotTypes = {
// PNG of the current view only (not the full page).
"view": {
type: "png",
omitBackground: true,
fullPage: false
},
// JPEG of the current view; takeThumbnail() resizes the capture afterwards.
"thumbnail": {
type: "jpeg",
omitBackground: true,
fullPage: false
},
// PNG of the full page.
"fullPage": {
type: "png",
omitBackground: true,
fullPage: true
}
};
/**
 * Captures screenshots of a page and appends them, as gzipped WARC
 * "resource" records, to `screenshots.warc.gz` inside `directory`.
 */
export class Screenshots {
  browser: any;
  page: any;
  url: string;
  directory: string;
  warcName: string;
  date: Date;

  /**
   * @param browser - object exposing `setViewport(page, size)`.
   * @param page - object exposing `screenshot(options)`.
   * @param url - page URL, used in log messages and the record URN.
   * @param date - record date; defaults to the construction time when falsy.
   * @param directory - directory holding the output WARC.
   */
  constructor({browser, page, url, date, directory}) {
    this.browser = browser;
    this.page = page;
    this.url = url;
    this.directory = directory;
    this.warcName = path.join(this.directory, "screenshots.warc.gz");
    this.date = date || new Date();
  }

  /**
   * Take one screenshot of the given type and write it to the WARC.
   * Failures are logged, never thrown.
   */
  async take(screenshotType="view") {
    const needsFixedViewport = screenshotType !== "fullPage";
    try {
      if (needsFixedViewport) {
        await this.browser.setViewport(this.page, {width: 1920, height: 1080});
      }
      const options = screenshotTypes[screenshotType];
      const image = await this.page.screenshot(options);
      await this.writeBufferToWARC(image, screenshotType, options.type);
      logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
    } catch (e) {
      const details = {"page": this.url, type: screenshotType, ...errJSON(e)};
      logger.error("Taking screenshot failed", details, "screenshots");
    }
  }

  /** Shorthand for `take("fullPage")`. */
  async takeFullPage() {
    await this.take("fullPage");
  }

  /**
   * Capture the view at 1920x1080, downscale it to a 640x360 thumbnail and
   * write it to the WARC. Failures are logged, never thrown.
   */
  async takeThumbnail() {
    const screenshotType = "thumbnail";
    try {
      await this.browser.setViewport(this.page, {width: 1920, height: 1080});
      const options = screenshotTypes[screenshotType];
      const fullSize = await this.page.screenshot(options);
      // 16:9 thumbnail
      const resizer = sharp(fullSize).resize(640, 360);
      const thumbnail = await resizer.toBuffer();
      await this.writeBufferToWARC(thumbnail, screenshotType, options.type);
      logger.info(`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`);
    } catch (e) {
      const details = {"page": this.url, type: screenshotType, ...errJSON(e)};
      logger.error("Taking screenshot failed", details, "screenshots");
    }
  }

  /** Wrap the image in a WARC record, gzip-serialize it and append it to the WARC file. */
  async writeBufferToWARC(screenshotBuffer, screenshotType, imageType) {
    const record = await this.wrap(screenshotBuffer, screenshotType, imageType);
    const serialized = await warcio.WARCSerializer.serialize(record, {gzip: true});
    fs.appendFileSync(this.warcName, serialized);
  }

  /**
   * Build a WARC/1.1 "resource" record whose URL is `urn:<screenshotType>:<page url>`
   * and whose payload is `buffer`.
   */
  async wrap(buffer, screenshotType="screenshot", imageType="png") {
    const urnPrefix = `urn:${screenshotType}:`;

    // The record payload is supplied as an async iterable yielding the single buffer.
    async function* content() {
      yield buffer;
    }

    const recordInfo = {
      url: urnPrefix + this.url,
      date: this.date.toISOString(),
      type: "resource",
      warcVersion: "WARC/1.1",
      warcHeaders: {"Content-Type": `image/${imageType}`}
    };

    return warcio.WARCRecord.create(recordInfo, content());
  }
}

212
src/util/seeds.ts Normal file
View file

@ -0,0 +1,212 @@
import { logger } from "./logger.js";
import { MAX_DEPTH } from "./constants.js";
// Names of the crawl scope modes accepted by ScopedSeed.
// All except "custom" are translated into include rules by
// ScopedSeed.scopeFromType; "custom" uses only the caller-supplied include list.
type ScopeType =
| "prefix"
| "host"
| "domain"
| "page"
| "page-spa"
| "any"
| "custom";
/**
 * A crawl seed URL together with the rules that decide which discovered
 * URLs are in scope for it (include/exclude patterns, depth and extra hops).
 */
export class ScopedSeed
{
  url: string;
  scopeType: ScopeType;
  include: RegExp[];
  exclude?: RegExp[] = [];
  allowHash = false;
  depth = -1;
  sitemap?: string;
  extraHops = 0;
  maxExtraHops = 0;
  maxDepth = 0;

  /**
   * @throws Error("Invalid URL") when `url` is not a valid http(s) URL.
   */
  constructor({url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} :
  {url: string, scopeType: ScopeType, include: string[], exclude?: string[], allowHash?: boolean, depth?: number, sitemap?: boolean, extraHops?: number}) {
    const seedUrl = this.parseUrl(url);
    if (!seedUrl) {
      throw new Error("Invalid URL");
    }

    this.url = seedUrl.href;
    this.include = this.parseRx(include);
    this.exclude = this.parseRx(exclude);

    // With no explicit scope type: "custom" if include rules were given, else "prefix".
    this.scopeType = scopeType;
    if (!this.scopeType) {
      this.scopeType = this.include.length ? "custom" : "prefix";
    }

    // Non-custom scopes contribute their own include rules (placed first)
    // and override the allowHash setting.
    let hashAllowed = allowHash;
    if (this.scopeType !== "custom") {
      const [typeInclude, typeAllowHash] = this.scopeFromType(this.scopeType, seedUrl);
      this.include = [...typeInclude, ...this.include];
      hashAllowed = typeAllowHash;
    }

    // for page scope, the depth is set to extraHops, as no other
    // crawling is done
    const effectiveDepth = this.scopeType === "page" ? extraHops : depth;

    this.sitemap = this.resolveSiteMap(sitemap);
    this.allowHash = hashAllowed;
    this.maxExtraHops = extraHops;
    this.maxDepth = effectiveDepth < 0 ? MAX_DEPTH : effectiveDepth;
  }

  /**
   * Normalize a pattern spec into an array of RegExp:
   * null/undefined/"" -> [], a single value -> [RegExp], an array -> one
   * RegExp per element (existing RegExp objects are kept as-is).
   */
  parseRx(value) {
    const isEmpty = value === null || value === undefined || value === "";
    if (isEmpty) {
      return [];
    }
    if (value instanceof Array) {
      return value.map(entry => (entry instanceof RegExp) ? entry : new RegExp(entry));
    }
    return [new RegExp(value)];
  }

  /**
   * Parse `url` (trimmed) into a URL object.
   * Logs a warning and returns null when it is unparseable or not http/https.
   */
  parseUrl(url, logDetails = {}) {
    let candidate = null;
    try {
      candidate = new URL(url.trim());
    } catch (e) {
      logger.warn("Invalid Page - not a valid URL", {url, ...logDetails});
      return null;
    }

    const isHttp = candidate.protocol === "http:" || candidate.protocol === "https:";
    if (!isHttp) {
      logger.warn("Invalid Page - URL must start with http:// or https://", {url, ...logDetails});
      return null;
    }

    return candidate;
  }

  /**
   * `true` resolves to `/sitemap.xml` on the seed's origin; any other value
   * is returned unchanged.
   */
  resolveSiteMap(sitemap) {
    if (sitemap !== true) {
      return sitemap;
    }
    const sitemapUrl = new URL(this.url);
    sitemapUrl.pathname = "/sitemap.xml";
    return sitemapUrl.href;
  }

  /**
   * Translate a scope type into `[includeRules, allowHash]` for `parsedUrl`.
   * Note: the "domain" case strips a leading "www." from `parsedUrl.hostname`
   * in place. Unknown types are reported via logger.fatal.
   */
  scopeFromType(scopeType, parsedUrl) : [RegExp[], boolean] {
    let include : RegExp[];
    let allowHash = false;

    // Every generated rule is anchored at the start of the URL.
    const anchored = (source) => new RegExp("^" + source);

    switch (scopeType) {
    case "page":
      include = [];
      break;

    case "page-spa":
      // allow scheme-agnostic URLS as likely redirects
      include = [anchored(urlRxEscape(parsedUrl.href, parsedUrl) + "#.+")];
      allowHash = true;
      break;

    case "prefix": {
      // Everything up to and including the last "/" of the path.
      const dirPath = parsedUrl.pathname.slice(0, parsedUrl.pathname.lastIndexOf("/") + 1);
      include = [anchored(urlRxEscape(parsedUrl.origin + dirPath, parsedUrl))];
      break;
    }

    case "host":
      include = [anchored(urlRxEscape(parsedUrl.origin + "/", parsedUrl))];
      break;

    case "domain": {
      if (parsedUrl.hostname.startsWith("www.")) {
        parsedUrl.hostname = parsedUrl.hostname.replace("www.", "");
      }
      // Allow any number of subdomain labels in front of the host.
      const hostRx = urlRxEscape(parsedUrl.origin + "/", parsedUrl).replace("\\/\\/", "\\/\\/([^/]+\\.)*");
      include = [anchored(hostRx)];
      break;
    }

    case "any":
      include = [/.*/];
      break;

    default:
      logger.fatal(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`);
    }

    return [include, allowHash];
  }

  /** True when `depth` has reached (or passed) this seed's maximum depth. */
  isAtMaxDepth(depth) {
    return depth >= this.maxDepth;
  }

  /**
   * Decide whether `url`, found at `depth`, should be crawled for this seed.
   *
   * @returns `false` when out of scope or invalid, `true` when the URL is
   *   the seed itself, otherwise `{url, isOOS}` with the normalized URL and a
   *   flag marking it as out-of-scope-but-allowed via extra hops.
   */
  isIncluded(url, depth, extraHops = 0, logDetails = {}) {
    if (depth > this.maxDepth) {
      return false;
    }

    const parsed = this.parseUrl(url, logDetails);
    if (!parsed) {
      return false;
    }

    if (!this.allowHash) {
      // remove hashtag
      parsed.hash = "";
    }

    const href = parsed.href;
    if (href === this.url) {
      return true;
    }

    // check scopes
    const inScope = this.include.some((rule) => rule.test(href));

    let isOOS = false;
    if (!inScope) {
      const withinExtraHops = this.maxExtraHops && extraHops <= this.maxExtraHops;
      if (!withinExtraHops) {
        return false;
      }
      isOOS = true;
    }

    // check exclusions
    if (this.exclude.some((rule) => rule.test(href))) {
      return false;
    }

    return {url: href, isOOS};
  }
}
/**
 * Escape every regular-expression metacharacter in `string` with a backslash
 * so the result matches the input literally when used in a RegExp.
 *
 * @param {string} string - text to escape.
 * @returns {string} escaped text.
 */
export function rxEscape(string) {
  const metaChars = /[-/\\^$*+?.()|[\]{}]/g;
  const escaped = string.replace(metaChars, "\\$&");
  return escaped;
}
/**
 * Regex-escape `url` and make its scheme match both http and https by
 * replacing the first occurrence of `parsedUrl.protocol` with "https?:".
 *
 * @param {string} url - URL text to escape.
 * @param {{protocol: string}} parsedUrl - parsed form supplying the protocol to replace.
 * @returns {string} regex source text.
 */
export function urlRxEscape(url, parsedUrl) {
  const escapedUrl = rxEscape(url);
  const scheme = parsedUrl.protocol;
  return escapedUrl.replace(scheme, "https?:");
}

37
src/util/timing.ts Normal file
View file

@ -0,0 +1,37 @@
import { logger } from "./logger.js";
/**
 * Return a promise that resolves (with no value) after `seconds` seconds.
 *
 * @param {number} seconds - delay in seconds; converted to milliseconds for setTimeout.
 * @returns {Promise<void>}
 */
export function sleep(seconds) {
  const delayMs = seconds * 1000;
  return new Promise((done) => {
    setTimeout(done, delayMs);
  });
}
/**
 * Wait for `promise`, but give up after `seconds` seconds.
 *
 * On timeout nothing is thrown: a message is logged (as a warning when
 * `isWarn` is true, otherwise as an error) and the result is `undefined`.
 * The wrapped promise itself is not cancelled.
 *
 * @param {Promise<*>|*} promise - value or promise to wait for.
 * @param {number} seconds - timeout in seconds.
 * @param {string} [message="Promise timed out"] - log message used on timeout.
 * @param {object} [logDetails={}] - extra fields merged into the log details.
 * @param {string} [context="general"] - logger context.
 * @param {boolean} [isWarn=false] - log with logger.warn instead of logger.error.
 * @returns {Promise<*>} the promise's value, or `undefined` on timeout.
 *   Rejections of `promise` are passed through unchanged.
 */
export function timedRun(promise, seconds, message="Promise timed out", logDetails={}, context="general", isWarn=false) {
  const timeoutMs = seconds * 1000;

  // Unique sentinel: unlike the string "timeout reached" it cannot collide
  // with a value the wrapped promise settles with.
  const TIMED_OUT = Symbol("timeout reached");

  let timer;
  const timeoutPromise = new Promise((resolve) => {
    timer = setTimeout(() => resolve(TIMED_OUT), timeoutMs);
  });

  return Promise.race([promise, timeoutPromise])
    .then((result) => {
      if (result !== TIMED_OUT) {
        return result;
      }
      const logFunc = isWarn ? logger.warn : logger.error;
      logFunc.call(logger, message, {"seconds": seconds, ...logDetails}, context);
      return undefined;
    })
    // Always release the timer so it does not stay live after the promise has settled.
    .finally(() => clearTimeout(timer));
}
/**
 * Seconds between `startTime` and `nowDate`.
 *
 * @param {number} startTime - start time, in the same millisecond units as Date#getTime().
 * @param {Date|null} [nowDate=null] - end time; the current time is used when falsy.
 * @returns {number} elapsed time in seconds (may be fractional).
 */
export function secondsElapsed(startTime, nowDate = null) {
  const end = nowDate ? nowDate : new Date();
  const elapsedMs = end.getTime() - startTime;
  return elapsedMs / 1000;
}
/**
 * Current UTC time as a digits-only string: the ISO-8601 timestamp with
 * every non-digit character removed (year through milliseconds).
 *
 * @returns {string} e.g. "20231021213938123".
 */
export function timestampNow() {
  const isoNow = new Date().toISOString();
  const digitsOnly = isoNow.replace(/[^\d]/g, "");
  return digitsOnly;
}

View file

@ -934,6 +934,13 @@
resolved "https://registry.yarnpkg.com/@types/node/-/node-15.3.0.tgz#d6fed7d6bc6854306da3dea1af9f874b00783e26"
integrity sha512-8/bnjSZD86ZfpBsDlCIkNXIvm+h6wi9g7IqL+kmFkQ+Wvu3JrasgLElfiPgoo8V8vVfnEi0QVS12gbl94h9YsQ==
"@types/node@^20.8.7":
version "20.8.7"
resolved "https://registry.yarnpkg.com/@types/node/-/node-20.8.7.tgz#ad23827850843de973096edfc5abc9e922492a25"
integrity sha512-21TKHHh3eUHIi2MloeptJWALuCu5H7HQTdTrWIFReA8ad+aggoX+lRes3ex7/FtpC+sVUpFMQ+QTfYr74mruiQ==
dependencies:
undici-types "~5.25.1"
"@types/prettier@^2.1.5":
version "2.7.1"
resolved "https://registry.yarnpkg.com/@types/prettier/-/prettier-2.7.1.tgz#dfd20e2dc35f027cdd6c1908e80a5ddc7499670e"
@ -4714,6 +4721,11 @@ unbzip2-stream@1.4.3:
buffer "^5.2.1"
through "^2.3.8"
undici-types@~5.25.1:
version "5.25.3"
resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-5.25.3.tgz#e044115914c85f0bcbb229f346ab739f064998c3"
integrity sha512-Ga1jfYwRn7+cP9v8auvEXN1rX3sWqlayd4HP7OKk4mZWylEmu3KzXDUGrQUN6Ol7qo1gPvB2e5gX6udnyEPgdA==
unique-string@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/unique-string/-/unique-string-3.0.0.tgz#84a1c377aff5fd7a8bc6b55d8244b2bd90d75b9a"