various edge-case loading optimizations (#709)

- rework 'should stream' logic (see the sketch after this list):
* ensure 206 responses (or any response) larger than 25M are streamed
* responses between 5M and 25M are read into memory if text/css/js, as they may be rewritten
* responses <5M are read into memory
* responses of unknown size are streamed if 2xx, otherwise read into memory, assuming error responses may lack a content-length but are otherwise small
- likely fix for issues in #706
- if too many range requests are being made for the same URL, skip/fail
them right away to reduce load
- assume the main browser context is used for more than just service
workers; always enable browser-context request interception
- check for false-positive 'net::ERR_ABORTED' errors that may actually
be OK for media as well as documents
- improve logging
- interrupt any pending requests (which may be loading via the browser
context) after the page timeout, and log the dropped requests
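
For concreteness, here is a minimal sketch of the resulting size-band
decision in TypeScript. The real implementation is shouldStream() in the
recorder.ts diff below; the literal constant values and the
mayBeRewritten() helper (standing in for the existing isEssentialResource()
check) are assumptions based on the 25M/5M figures above.

const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5_000_000;
const MAX_TEXT_REWRITE_SIZE = 25_000_000;

// stand-in for isEssentialResource(): text/css/js may be rewritten
function mayBeRewritten(resourceType: string, mimeType: string): boolean {
  return (
    resourceType === "script" ||
    resourceType === "stylesheet" ||
    mimeType.startsWith("text/")
  );
}

function shouldStreamSketch(
  contentLength: number, // -1 if unknown
  status: number,
  resourceType: string,
  mimeType: string,
): boolean {
  // >25M: always stream, too large to rewrite even if text
  if (contentLength > MAX_TEXT_REWRITE_SIZE) {
    return true;
  }
  // 5M-25M: stream unless text/css/js, which is buffered for rewriting
  if (
    contentLength > MAX_BROWSER_DEFAULT_FETCH_SIZE &&
    !mayBeRewritten(resourceType, mimeType)
  ) {
    return true;
  }
  // unknown size: stream only non-rewritable 2xx responses; error
  // responses often lack a content-length but are likely small
  if (
    contentLength < 0 &&
    !mayBeRewritten(resourceType, mimeType) &&
    status >= 200 &&
    status < 300
  ) {
    return true;
  }
  // <5M known size (or rewritable, or non-2xx unknown): read into memory
  return false;
}

In short: stream anything too large (or of unknown size on a 2xx), and
fully buffer anything that may need rewriting.
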
---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer 2024-10-31 14:06:17 -07:00 committed by GitHub
parent 5c00bca2b4
commit e5bab8e7c8
6 changed files with 149 additions and 80 deletions

package.json

@@ -30,7 +30,7 @@
     "p-queue": "^7.3.4",
     "pixelmatch": "^5.3.0",
     "pngjs": "^7.0.0",
-    "puppeteer-core": "^23.5.1",
+    "puppeteer-core": "^23.6.0",
     "sax": "^1.3.0",
     "sharp": "^0.32.6",
     "tsc": "^2.0.4",

src/crawler.ts

@@ -175,6 +175,7 @@ export class Crawler {
   finalExit = false;
   uploadAndDeleteLocal = false;
   done = false;
+  postCrawling = false;
   textInPages = false;
@@ -1536,12 +1537,13 @@ self.__bx_behaviors.selectMainBehavior();
   }

   async postCrawl() {
+    this.postCrawling = true;
+    logger.info("Crawling done");
+
     if (this.params.combineWARC && !this.params.dryRun) {
       await this.combineWARC();
     }

-    logger.info("Crawling done");
-
     if (
       (this.params.generateCDX || this.params.generateWACZ) &&
       !this.params.dryRun

src/util/browser.ts

@@ -6,7 +6,7 @@ import { Readable } from "node:stream";
 import os from "os";
 import path from "path";

-import { LogContext, logger } from "./logger.js";
+import { formatErr, LogContext, logger } from "./logger.js";
 import { initStorage } from "./storage.js";

 import { DISPLAY, type ServiceWorkerOpt } from "./constants.js";
@@ -126,7 +126,7 @@ export class Browser {
         ? undefined
         : (target) => this.targetFilter(target),
     };
-    await this._init(launchOpts, ondisconnect, recording);
+    await this._init(launchOpts, ondisconnect);
   }

   targetFilter(target: Target) {
@@ -392,7 +392,6 @@
     launchOpts: PuppeteerLaunchOptions,
     // eslint-disable-next-line @typescript-eslint/ban-types
     ondisconnect: Function | null = null,
-    recording: boolean,
   ) {
     this.browser = await puppeteer.launch(launchOpts);
@@ -400,9 +399,7 @@
     this.firstCDP = await target.createCDPSession();

-    if (recording) {
-      await this.serviceWorkerFetch();
-    }
+    await this.browserContextFetch();

     if (ondisconnect) {
       this.browser.on("disconnected", (err) => ondisconnect(err));
@@ -479,35 +476,24 @@ export class Browser {
     return { page, cdp };
   }

-  async serviceWorkerFetch() {
+  async browserContextFetch() {
     if (!this.firstCDP) {
       return;
     }

     this.firstCDP.on("Fetch.requestPaused", async (params) => {
-      const { frameId, requestId, networkId, request } = params;
+      const { frameId, requestId, request } = params;
+      const { url } = request;

       if (!this.firstCDP) {
         throw new Error("CDP missing");
       }

-      if (networkId) {
-        try {
-          await this.firstCDP.send("Fetch.continueResponse", { requestId });
-        } catch (e) {
-          logger.warn(
-            "continueResponse failed",
-            { url: request.url },
-            "recorder",
-          );
-        }
-        return;
-      }
-
       let foundRecorder = null;

       for (const recorder of this.recorders) {
-        if (recorder.swUrls.has(request.url)) {
+        if (recorder.swUrls.has(url)) {
           recorder.swFrameIds.add(frameId);
         }
@@ -520,16 +506,16 @@
       if (!foundRecorder) {
         logger.warn(
           "Skipping URL from unknown frame",
-          { url: request.url, frameId },
+          { url, frameId },
           "recorder",
         );

         try {
           await this.firstCDP.send("Fetch.continueResponse", { requestId });
         } catch (e) {
-          logger.warn(
+          logger.debug(
             "continueResponse failed",
-            { url: request.url },
+            { url, ...formatErr(e), from: "serviceWorker" },
             "recorder",
           );
         }

src/util/recorder.ts

@@ -122,6 +122,7 @@ export class Recorder {
   pendingRequests!: Map<string, RequestResponseInfo>;
   skipIds!: Set<string>;
   pageInfo!: PageInfoRecord;
+  skipRangeUrls!: Map<string, number>;

   swTargetId?: string | null;
   swFrameIds = new Set<string>();
@@ -130,7 +131,8 @@
   // TODO: Fix this the next time the file is edited.
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
   logDetails: Record<string, any> = {};
-  skipping = false;
+
+  pageFinished = false;

   gzip = true;
@@ -169,6 +171,7 @@
     frameIdToExecId: Map<string, number>;
   }) {
     this.frameIdToExecId = frameIdToExecId;
+    this.pageFinished = false;

     // Fetch
     cdp.on("Fetch.requestPaused", (params) => {
@@ -407,6 +410,8 @@
       logNetwork("Network.loadingFailed", {
         requestId,
         url,
+        errorText,
+        type,
         ...this.logDetails,
       });
@@ -426,15 +431,14 @@
       case "net::ERR_ABORTED":
         // check if this is a false positive -- a valid download that's already been fetched
         // the abort is just for page, but download will succeed
-        if (type === "Document" && reqresp.isValidBinary()) {
+        if (
+          (type === "Document" || type === "Media") &&
+          reqresp.isValidBinary()
+        ) {
           this.removeReqResp(requestId);
           return this.serializeToWARC(reqresp);
-        } else if (
-          url &&
-          reqresp.requestHeaders &&
-          reqresp.requestHeaders["x-browsertrix-fetch"]
-        ) {
-          delete reqresp.requestHeaders["x-browsertrix-fetch"];
+        } else if (url && reqresp.requestHeaders && type === "Media") {
+          this.removeReqResp(requestId);
           logger.warn(
             "Attempt direct fetch of failed request",
             { url, ...this.logDetails },
@@ -453,7 +457,7 @@
       default:
         logger.warn(
           "Request failed",
-          { url, errorText, ...this.logDetails },
+          { url, errorText, type, status: reqresp.status, ...this.logDetails },
           "recorder",
         );
     }
@@ -495,7 +499,7 @@
   async handleRequestPaused(
     params: Protocol.Fetch.RequestPausedEvent,
     cdp: CDPSession,
-    isSWorker = false,
+    isBrowserContext = false,
   ) {
     const {
       requestId,
@@ -520,10 +524,13 @@
       if (
         responseStatusCode &&
         !responseErrorReason &&
-        !this.shouldSkip(headers, url, method, resourceType) &&
-        !(isSWorker && networkId)
+        !this.shouldSkip(headers, url, method, resourceType)
       ) {
-        continued = await this.handleFetchResponse(params, cdp, isSWorker);
+        continued = await this.handleFetchResponse(
+          params,
+          cdp,
+          isBrowserContext,
+        );
       }
     } catch (e) {
       logger.error(
@@ -549,7 +556,7 @@
   async handleFetchResponse(
     params: Protocol.Fetch.RequestPausedEvent,
     cdp: CDPSession,
-    isSWorker: boolean,
+    isBrowserContext: boolean,
   ) {
     const { request } = params;
     const { url } = request;
@@ -610,21 +617,44 @@
         return false;
       } else {
-        logger.debug(
-          "Skip 206 Response",
-          { range, contentLen, url, ...this.logDetails },
-          "recorder",
-        );
+        // logger.debug(
+        //   "Skip 206 Response",
+        //   { range, contentLen, url, ...this.logDetails },
+        //   "recorder",
+        // );
         this.removeReqResp(networkId);
+        const count = this.skipRangeUrls.get(url) || 0;
+        if (count > 2) {
+          // just fail additional range requests to save bandwidth, as these are not being recorded
+          await cdp.send("Fetch.failRequest", {
+            requestId,
+            errorReason: "BlockedByResponse",
+          });
+          return true;
+        }
+        this.skipRangeUrls.set(url, count + 1);
         return false;
       }
     }

     const reqresp = this.pendingReqResp(networkId);
     if (!reqresp) {
       return false;
     }

+    // indicate that this is intercepted in the page context
+    if (!isBrowserContext) {
+      reqresp.inPageContext = true;
+    }
+
+    // Already being handled by a different handler
+    if (reqresp.fetchContinued) {
+      return false;
+    }
+
+    reqresp.fetchContinued = true;
+
     if (
       url === this.pageUrl &&
       (!this.pageInfo.ts ||
@@ -643,12 +673,6 @@
     if (this.noResponseForStatus(responseStatusCode)) {
       reqresp.payload = new Uint8Array();

-      if (isSWorker) {
-        this.removeReqResp(networkId);
-        await this.serializeToWARC(reqresp);
-      }
-
       return false;
     }
@@ -656,13 +680,13 @@
     let streamingConsume = false;

-    // if contentLength is large or unknown, do streaming, unless its an essential resource
-    // in which case, need to do a full fetch either way
-    // don't count non-200 responses which may not have content-length
     if (
-      (contentLen < 0 || contentLen > MAX_BROWSER_DEFAULT_FETCH_SIZE) &&
-      responseStatusCode === 200 &&
-      !this.isEssentialResource(reqresp.resourceType, mimeType)
+      this.shouldStream(
+        contentLen,
+        responseStatusCode || 0,
+        reqresp.resourceType || "",
+        mimeType,
+      )
     ) {
       const opts: ResponseStreamAsyncFetchOptions = {
         reqresp,
@@ -724,9 +748,9 @@
     const rewritten = await this.rewriteResponse(reqresp, mimeType);

-    // if in service worker, serialize here
-    // as won't be getting a loadingFinished message
-    if (isSWorker && reqresp.payload) {
+    // if in browser context, and not also intercepted in page context
+    // serialize here, as won't be getting a loadingFinished message for it
+    if (isBrowserContext && !reqresp.inPageContext && reqresp.payload) {
       this.removeReqResp(networkId);
       await this.serializeToWARC(reqresp);
     }
@@ -794,7 +818,8 @@
     }
     this.pendingRequests = new Map();
     this.skipIds = new Set();
-    this.skipping = false;
+    this.skipRangeUrls = new Map<string, number>();
+    this.pageFinished = false;
     this.pageInfo = {
       pageid,
       urls: {},
@@ -861,8 +886,14 @@
     let numPending = this.pendingRequests.size;

-    while (numPending && !this.crawler.interrupted) {
-      const pending = [];
+    let pending = [];
+
+    while (
+      numPending &&
+      !this.pageFinished &&
+      !this.crawler.interrupted &&
+      !this.crawler.postCrawling
+    ) {
+      pending = [];
       for (const [requestId, reqresp] of this.pendingRequests.entries()) {
         const url = reqresp.url || "";
         const entry: {
@@ -892,11 +923,24 @@
       await sleep(5.0);
       numPending = this.pendingRequests.size;
     }

+    if (this.pendingRequests.size) {
+      logger.warn(
+        "Dropping timed out requests",
+        { numPending, pending, ...this.logDetails },
+        "recorder",
+      );
+      for (const requestId of this.pendingRequests.keys()) {
+        this.removeReqResp(requestId);
+      }
+    }
+
   }

   async onClosePage() {
     // Any page-specific handling before page is closed.
     this.frameIdToExecId = null;
+    this.pageFinished = true;
   }

   async onDone(timeout: number) {
@@ -1019,7 +1063,7 @@
     }
   }

-  isEssentialResource(resourceType: string | undefined, contentType: string) {
+  isEssentialResource(resourceType: string, contentType: string) {
     if (resourceType === "script" || resourceType === "stylesheet") {
       return true;
     }
@@ -1031,6 +1075,41 @@
     return false;
   }

+  shouldStream(
+    contentLength: number,
+    responseStatusCode: number,
+    resourceType: string,
+    mimeType: string,
+  ) {
+    // if contentLength is too large even for rewriting, always stream,
+    // will not do rewriting even if text
+    if (contentLength > MAX_TEXT_REWRITE_SIZE) {
+      return true;
+    }
+
+    // if contentLength is larger and not an essential resource, do stream
+    // otherwise full fetch for rewriting
+    if (
+      contentLength > MAX_BROWSER_DEFAULT_FETCH_SIZE &&
+      !this.isEssentialResource(resourceType, mimeType)
+    ) {
+      return true;
+    }
+
+    // if contentLength is unknown, also stream if it's not an essential resource
+    // and not a 3xx / 4xx / 5xx status code, as those codes may have no
+    // content-length and are likely small
+    if (
+      contentLength < 0 &&
+      !this.isEssentialResource(resourceType, mimeType) &&
+      responseStatusCode >= 200 &&
+      responseStatusCode < 300
+    ) {
+      return true;
+    }
+
+    return false;
+  }
+
   protected getMimeType(
     headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
   ) {
@@ -1089,10 +1168,6 @@
       logNetwork("Skipping ignored id", { requestId });
       return null;
     }
-    if (this.skipping) {
-      //logger.debug("Skipping request, page already finished", this.logDetails, "recorder");
-      return null;
-    }
     const reqresp = new RequestResponseInfo(requestId);
     this.pendingRequests.set(requestId, reqresp);
     return reqresp;
@@ -1395,7 +1470,7 @@ class AsyncFetcher {
         reqresp.payload = Buffer.concat(buffers, currSize);
         externalBuffer.buffers = [reqresp.payload];
       } else if (fh) {
-        logger.warn(
+        logger.debug(
           "Large payload written to WARC, but not returned to browser (would require rereading into memory)",
           { url, actualSize: reqresp.readSize, maxSize: this.maxFetchSize },
           "recorder",

src/util/reqresp.ts

@@ -49,6 +49,12 @@ export class RequestResponseInfo {
   payload?: Uint8Array;
   isRemoveRange = false;

+  // fetchContinued - avoid duplicate fetch response handling
+  fetchContinued = false;
+
+  // is handled in page context
+  inPageContext = false;
+
   // misc
   fromServiceWorker = false;
   fromCache = false;

yarn.lock

@@ -2075,10 +2075,10 @@ detect-newline@^3.0.0:
   resolved "https://registry.yarnpkg.com/detect-newline/-/detect-newline-3.1.0.tgz#576f5dfc63ae1a192ff192d8ad3af6308991b651"
   integrity sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA==

-devtools-protocol@0.0.1342118:
-  version "0.0.1342118"
-  resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1342118.tgz#ea136fc1701572c0830233dcb414dc857e582e0a"
-  integrity sha512-75fMas7PkYNDTmDyb6PRJCH7ILmHLp+BhrZGeMsa4bCh40DTxgCz2NRy5UDzII4C5KuD0oBMZ9vXKhEl6UD/3w==
+devtools-protocol@0.0.1354347:
+  version "0.0.1354347"
+  resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1354347.tgz#5cb509610b8f61fc69a31e5c810d5bed002d85ea"
+  integrity sha512-BlmkSqV0V84E2WnEnoPnwyix57rQxAM5SKJjf4TbYOCGLAWtz8CDH8RIaGOjPgPCXo2Mce3kxSY497OySidY3Q==

 diff-sequences@^29.6.3:
   version "29.6.3"
@@ -4375,15 +4375,15 @@ punycode@^2.1.0:
   resolved "https://registry.yarnpkg.com/punycode/-/punycode-2.1.1.tgz#b58b010ac40c22c5657616c8d2c2c02c7bf479ec"
   integrity sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==

-puppeteer-core@^23.5.1:
-  version "23.5.1"
-  resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-23.5.1.tgz#fac4268820c35d3172e783a1f1a39773b2c0f7c6"
-  integrity sha512-We6xKCSZaZ23+GAYckeNfeDeJIVuhxOBsh/gZkbULu/XLFJ3umSiiQ8Ey927h3g/XrCCr8CnSZ5fvP5v2vB5Yw==
+puppeteer-core@^23.6.0:
+  version "23.6.0"
+  resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-23.6.0.tgz#a3e1e09c05f47fb8ca2bc9d4ca200d18e3704303"
+  integrity sha512-se1bhgUpR9C529SgHGr/eyT92mYyQPAhA2S9pGtGrVG2xob9qE6Pbp7TlqiSPlnnY1lINqhn6/67EwkdzOmKqQ==
   dependencies:
     "@puppeteer/browsers" "2.4.0"
     chromium-bidi "0.8.0"
     debug "^4.3.7"
-    devtools-protocol "0.0.1342118"
+    devtools-protocol "0.0.1354347"
     typed-query-selector "^2.12.0"
     ws "^8.18.0"