browsertrix-crawler/util/timing.js
Ilya Kreymer 877d9f5b44
Use new browser-based archiving mechanism instead of pywb proxy (#424)
Major refactoring of Browsertrix Crawler to natively capture network traffic to WARC files
via the Chrome DevTools Protocol (CDP). Allows for more flexibility and accuracy when dealing
with HTTP/2.x sites and avoids a MITM proxy. Addresses #343 

Changes include:
- Recorder class for capturing CDP network traffic for each page.
- Handling requests from service workers via matching active frames, skipping unrelated requests outside the page (from background pages, etc.)
- WARC writing support via TS-based warcio.js library.
- Generates single WARC file per worker (still need to add size rollover).
- Request interception via Fetch.requestPaused
- Rule-based rewriting response support (via wabac.js), using Fetch.getResponseBody() / Fetch.fulfillRequest()
- Streaming responses via three methods: inline response fetch via Fetch.takeResponseBodyAsStream, 
async loading via browser network stack with Network.loadNetworkResource() and node-based async fetch
via fetch()
- Direct async fetch() capture of non-HTML URLs
- Waiting for all requests to finish before moving on to the next page, up to the page timeout.
- Experimental: generate CDXJ on-the-fly as WARC is being written (not yet in use).
- removed pywb, using cdxj-indexer for --generateCDX option.
2023-11-07 21:38:50 -08:00

37 lines
1.2 KiB
JavaScript

import { logger } from "./logger.js";
/**
 * Pause for a given duration.
 *
 * @param {number} seconds - how long to wait, in seconds (converted to
 *   milliseconds for setTimeout)
 * @returns {Promise<void>} promise that resolves, with no value, once the
 *   timer fires
 */
export function sleep(seconds) {
  const delayMs = seconds * 1000;
  return new Promise((done) => {
    setTimeout(done, delayMs);
  });
}
/**
 * Race a promise against a timeout.
 *
 * If `promise` settles first, its outcome is passed through: the resolved
 * value is returned and a rejection is rethrown to the caller. If the timeout
 * elapses first, the timeout is logged via `logger` and the returned promise
 * resolves with `undefined` (it does not reject). The underlying `promise` is
 * not cancelled on timeout; it is simply no longer waited for.
 *
 * @param {Promise<*>} promise - the operation to wait for
 * @param {number} seconds - maximum time to wait, in seconds
 * @param {string} [message="Promise timed out"] - log message used on timeout
 * @param {object} [logDetails={}] - extra fields merged into the logged details
 *   (alongside `seconds`)
 * @param {string} [context="general"] - log context passed to the logger
 * @param {boolean} [isWarn=false] - log with `logger.warn` instead of
 *   `logger.error`
 * @returns {Promise<*>} the value of `promise`, or `undefined` on timeout
 */
export function timedRun(promise, seconds, message="Promise timed out", logDetails={}, context="general", isWarn=false) {
  const timeout = seconds * 1000;

  // Handle of the pending timeout, kept so it can be cancelled once the race
  // is decided; otherwise the timer would stay scheduled for the full duration
  // even after `promise` has already settled.
  let timer;

  const rejectPromiseOnTimeout = new Promise((resolve, reject) => {
    timer = setTimeout(() => reject("timeout reached"), timeout);
  });

  return Promise.race([promise, rejectPromiseOnTimeout])
    .catch((err) => {
      if (err === "timeout reached") {
        const logFunc = isWarn ? logger.warn : logger.error;
        logFunc.call(logger, message, {"seconds": seconds, ...logDetails}, context);
      } else {
        // Not a timeout: the wrapped promise failed, so propagate unchanged.
        throw err;
      }
    })
    .finally(() => clearTimeout(timer));
}
/**
 * Compute how many seconds lie between a start time and a reference date.
 *
 * @param {number|Date} startTime - start time; subtracted from the reference
 *   date's epoch milliseconds
 * @param {?Date} [nowDate=null] - reference date; when falsy, the current
 *   time is used
 * @returns {number} elapsed time in seconds (may be fractional)
 */
export function secondsElapsed(startTime, nowDate = null) {
  const reference = nowDate ? nowDate : new Date();
  const elapsedMs = reference.getTime() - startTime;
  return elapsedMs / 1000;
}
/**
 * Build a digits-only timestamp for the current moment.
 *
 * Takes the current time's ISO-8601 string and removes every non-digit
 * character from it.
 *
 * @returns {string} the current time as a string of digits
 */
export function timestampNow() {
  const isoNow = new Date().toISOString();
  return isoNow.replace(/[^\d]/g, "");
}