Mirror of https://github.com/webrecorder/browsertrix-crawler.git
various edge-case loading optimizations (#709)

- rework 'should stream' logic:
  * ensure 206 responses (or any response) greater than 25MB are streamed
  * responses between 5MB and 25MB are read into memory if text/css/js, as they may be rewritten
  * responses under 5MB are read into memory
  * responses of unknown size are streamed if 2xx, otherwise read into memory, since error responses may lack a content-length but are otherwise small
- likely fix for issues in #706
- if too many range requests are being made for the same URL, skip/fail them right away to reduce load
- assume the main browser context is used for more than just service workers; always enable its fetch handler
- check for false-positive 'net::ERR_ABORTED' errors that may actually be OK for media as well as documents
- improve logging
- interrupt any pending requests (which may still be loading via the browser context) after the page timeout, and log the dropped requests

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
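In effect, the reworked logic chooses between streaming a response straight to WARC and buffering it in memory, based on size, status code, and whether the resource may need rewriting. A minimal sketch of that decision tree, assuming the ~25MB and ~5MB thresholds from the message above (the constant values and the mayBeRewritten() helper are illustrative stand-ins, not the exact Recorder API):

    // Sketch of the new streaming decision; constants and helper are assumed,
    // based on the commit message and the shouldStream() diff below.
    const MAX_BROWSER_DEFAULT_FETCH_SIZE = 5 * 1024 * 1024; // ~5MB (assumed)
    const MAX_TEXT_REWRITE_SIZE = 25 * 1024 * 1024; // ~25MB (assumed)

    // Stand-in for Recorder.isEssentialResource(): text/css/js may be rewritten.
    function mayBeRewritten(resourceType: string, mimeType: string): boolean {
      return (
        resourceType === "script" ||
        resourceType === "stylesheet" ||
        mimeType === "text/html" ||
        mimeType === "text/css" ||
        mimeType.includes("javascript")
      );
    }

    function shouldStreamSketch(
      contentLength: number, // -1 when unknown
      status: number,
      resourceType: string,
      mimeType: string,
    ): boolean {
      // >25MB: always stream, too large to rewrite even if text
      if (contentLength > MAX_TEXT_REWRITE_SIZE) {
        return true;
      }
      // 5MB-25MB: stream unless the resource may be rewritten
      if (
        contentLength > MAX_BROWSER_DEFAULT_FETCH_SIZE &&
        !mayBeRewritten(resourceType, mimeType)
      ) {
        return true;
      }
      // unknown size: stream only non-rewritable 2xx responses; error responses
      // often lack a content-length but are small, so read them into memory
      if (
        contentLength < 0 &&
        !mayBeRewritten(resourceType, mimeType) &&
        status >= 200 &&
        status < 300
      ) {
        return true;
      }
      // known size under 5MB (or rewritable): read into memory
      return false;
    }

    // e.g. shouldStreamSketch(30_000_000, 206, "media", "video/mp4") => true
    // e.g. shouldStreamSketch(10_000_000, 200, "script", "text/javascript") => false (buffered for rewriting)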
parent 5c00bca2b4
commit e5bab8e7c8

6 changed files with 149 additions and 80 deletions
package.json

@@ -30,7 +30,7 @@
   "p-queue": "^7.3.4",
   "pixelmatch": "^5.3.0",
   "pngjs": "^7.0.0",
-  "puppeteer-core": "^23.5.1",
+  "puppeteer-core": "^23.6.0",
   "sax": "^1.3.0",
   "sharp": "^0.32.6",
   "tsc": "^2.0.4",
src/crawler.ts

@@ -175,6 +175,7 @@ export class Crawler {
   finalExit = false;
   uploadAndDeleteLocal = false;
   done = false;
+  postCrawling = false;

   textInPages = false;

@@ -1536,12 +1537,13 @@ self.__bx_behaviors.selectMainBehavior();
   }

   async postCrawl() {
+    this.postCrawling = true;
+    logger.info("Crawling done");
+
     if (this.params.combineWARC && !this.params.dryRun) {
       await this.combineWARC();
     }

-    logger.info("Crawling done");
-
     if (
       (this.params.generateCDX || this.params.generateWACZ) &&
       !this.params.dryRun
src/util/browser.ts

@@ -6,7 +6,7 @@ import { Readable } from "node:stream";
 import os from "os";
 import path from "path";

-import { LogContext, logger } from "./logger.js";
+import { formatErr, LogContext, logger } from "./logger.js";
 import { initStorage } from "./storage.js";

 import { DISPLAY, type ServiceWorkerOpt } from "./constants.js";
@@ -126,7 +126,7 @@ export class Browser {
         ? undefined
         : (target) => this.targetFilter(target),
     };
-    await this._init(launchOpts, ondisconnect, recording);
+    await this._init(launchOpts, ondisconnect);
   }

   targetFilter(target: Target) {
@@ -392,7 +392,6 @@ export class Browser {
     launchOpts: PuppeteerLaunchOptions,
     // eslint-disable-next-line @typescript-eslint/ban-types
     ondisconnect: Function | null = null,
-    recording: boolean,
   ) {
     this.browser = await puppeteer.launch(launchOpts);

@@ -400,9 +399,7 @@ export class Browser {

     this.firstCDP = await target.createCDPSession();

-    if (recording) {
-      await this.serviceWorkerFetch();
-    }
+    await this.browserContextFetch();

     if (ondisconnect) {
       this.browser.on("disconnected", (err) => ondisconnect(err));
@@ -479,35 +476,24 @@ export class Browser {
     return { page, cdp };
   }

-  async serviceWorkerFetch() {
+  async browserContextFetch() {
     if (!this.firstCDP) {
       return;
     }

     this.firstCDP.on("Fetch.requestPaused", async (params) => {
-      const { frameId, requestId, networkId, request } = params;
+      const { frameId, requestId, request } = params;
+
+      const { url } = request;

       if (!this.firstCDP) {
         throw new Error("CDP missing");
       }

-      if (networkId) {
-        try {
-          await this.firstCDP.send("Fetch.continueResponse", { requestId });
-        } catch (e) {
-          logger.warn(
-            "continueResponse failed",
-            { url: request.url },
-            "recorder",
-          );
-        }
-        return;
-      }
-
       let foundRecorder = null;

       for (const recorder of this.recorders) {
-        if (recorder.swUrls.has(request.url)) {
+        if (recorder.swUrls.has(url)) {
           recorder.swFrameIds.add(frameId);
         }

@@ -520,16 +506,16 @@ export class Browser {
       if (!foundRecorder) {
         logger.warn(
           "Skipping URL from unknown frame",
-          { url: request.url, frameId },
+          { url, frameId },
           "recorder",
         );

         try {
           await this.firstCDP.send("Fetch.continueResponse", { requestId });
         } catch (e) {
-          logger.warn(
+          logger.debug(
             "continueResponse failed",
-            { url: request.url },
+            { url, ...formatErr(e), from: "serviceWorker" },
             "recorder",
           );
         }
src/util/recorder.ts

@@ -122,6 +122,7 @@ export class Recorder {
   pendingRequests!: Map<string, RequestResponseInfo>;
   skipIds!: Set<string>;
   pageInfo!: PageInfoRecord;
+  skipRangeUrls!: Map<string, number>;

   swTargetId?: string | null;
   swFrameIds = new Set<string>();
@@ -130,7 +131,8 @@ export class Recorder {
   // TODO: Fix this the next time the file is edited.
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
   logDetails: Record<string, any> = {};
   skipping = false;
-
+  pageFinished = false;
+
   gzip = true;

@@ -169,6 +171,7 @@ export class Recorder {
     frameIdToExecId: Map<string, number>;
   }) {
     this.frameIdToExecId = frameIdToExecId;
+    this.pageFinished = false;

     // Fetch
     cdp.on("Fetch.requestPaused", (params) => {
@@ -407,6 +410,8 @@ export class Recorder {
     logNetwork("Network.loadingFailed", {
       requestId,
+      url,
       errorText,
+      type,
       ...this.logDetails,
     });

@@ -426,15 +431,14 @@ export class Recorder {
       case "net::ERR_ABORTED":
         // check if this is a false positive -- a valid download that's already been fetched
         // the abort is just for page, but download will succeed
-        if (type === "Document" && reqresp.isValidBinary()) {
+        if (
+          (type === "Document" || type === "Media") &&
+          reqresp.isValidBinary()
+        ) {
           this.removeReqResp(requestId);
           return this.serializeToWARC(reqresp);
-        } else if (
-          url &&
-          reqresp.requestHeaders &&
-          reqresp.requestHeaders["x-browsertrix-fetch"]
-        ) {
-          delete reqresp.requestHeaders["x-browsertrix-fetch"];
+        } else if (url && reqresp.requestHeaders && type === "Media") {
+          this.removeReqResp(requestId);
           logger.warn(
             "Attempt direct fetch of failed request",
             { url, ...this.logDetails },
@@ -453,7 +457,7 @@ export class Recorder {
       default:
         logger.warn(
           "Request failed",
-          { url, errorText, ...this.logDetails },
+          { url, errorText, type, status: reqresp.status, ...this.logDetails },
           "recorder",
         );
     }
@@ -495,7 +499,7 @@ export class Recorder {
   async handleRequestPaused(
     params: Protocol.Fetch.RequestPausedEvent,
     cdp: CDPSession,
-    isSWorker = false,
+    isBrowserContext = false,
   ) {
     const {
       requestId,
@@ -520,10 +524,13 @@ export class Recorder {
       if (
         responseStatusCode &&
         !responseErrorReason &&
-        !this.shouldSkip(headers, url, method, resourceType) &&
-        !(isSWorker && networkId)
+        !this.shouldSkip(headers, url, method, resourceType)
       ) {
-        continued = await this.handleFetchResponse(params, cdp, isSWorker);
+        continued = await this.handleFetchResponse(
+          params,
+          cdp,
+          isBrowserContext,
+        );
       }
     } catch (e) {
       logger.error(
@@ -549,7 +556,7 @@ export class Recorder {
   async handleFetchResponse(
     params: Protocol.Fetch.RequestPausedEvent,
     cdp: CDPSession,
-    isSWorker: boolean,
+    isBrowserContext: boolean,
   ) {
     const { request } = params;
     const { url } = request;
@@ -610,21 +617,44 @@ export class Recorder {

         return false;
       } else {
-        logger.debug(
-          "Skip 206 Response",
-          { range, contentLen, url, ...this.logDetails },
-          "recorder",
-        );
+        // logger.debug(
+        //   "Skip 206 Response",
+        //   { range, contentLen, url, ...this.logDetails },
+        //   "recorder",
+        // );
         this.removeReqResp(networkId);
+        const count = this.skipRangeUrls.get(url) || 0;
+        if (count > 2) {
+          // just fail additional range requests to save bandwidth, as these are not being recorded
+          await cdp.send("Fetch.failRequest", {
+            requestId,
+            errorReason: "BlockedByResponse",
+          });
+          return true;
+        }
+        this.skipRangeUrls.set(url, count + 1);
         return false;
       }
     }

     const reqresp = this.pendingReqResp(networkId);

     if (!reqresp) {
       return false;
     }

+    // indicate that this is intercepted in the page context
+    if (!isBrowserContext) {
+      reqresp.inPageContext = true;
+    }
+
+    // Already being handled by a different handler
+    if (reqresp.fetchContinued) {
+      return false;
+    }
+
+    reqresp.fetchContinued = true;
+
     if (
       url === this.pageUrl &&
       (!this.pageInfo.ts ||
@@ -643,12 +673,6 @@ export class Recorder {

     if (this.noResponseForStatus(responseStatusCode)) {
       reqresp.payload = new Uint8Array();
-
-      if (isSWorker) {
-        this.removeReqResp(networkId);
-        await this.serializeToWARC(reqresp);
-      }
-
       return false;
     }

@@ -656,13 +680,13 @@ export class Recorder {

     let streamingConsume = false;

-    // if contentLength is large or unknown, do streaming, unless its an essential resource
-    // in which case, need to do a full fetch either way
-    // don't count non-200 responses which may not have content-length
     if (
-      (contentLen < 0 || contentLen > MAX_BROWSER_DEFAULT_FETCH_SIZE) &&
-      responseStatusCode === 200 &&
-      !this.isEssentialResource(reqresp.resourceType, mimeType)
+      this.shouldStream(
+        contentLen,
+        responseStatusCode || 0,
+        reqresp.resourceType || "",
+        mimeType,
+      )
     ) {
       const opts: ResponseStreamAsyncFetchOptions = {
         reqresp,
@@ -724,9 +748,9 @@ export class Recorder {

     const rewritten = await this.rewriteResponse(reqresp, mimeType);

-    // if in service worker, serialize here
-    // as won't be getting a loadingFinished message
-    if (isSWorker && reqresp.payload) {
+    // if in browser context, and not also intercepted in page context
+    // serialize here, as won't be getting a loadingFinished message for it
+    if (isBrowserContext && !reqresp.inPageContext && reqresp.payload) {
       this.removeReqResp(networkId);
       await this.serializeToWARC(reqresp);
     }
@@ -794,7 +818,8 @@ export class Recorder {
     }
     this.pendingRequests = new Map();
     this.skipIds = new Set();
-    this.skipping = false;
+    this.skipRangeUrls = new Map<string, number>();
+    this.pageFinished = false;
     this.pageInfo = {
       pageid,
       urls: {},
@@ -861,8 +886,14 @@ export class Recorder {

     let numPending = this.pendingRequests.size;

-    while (numPending && !this.crawler.interrupted) {
-      const pending = [];
+    let pending = [];
+    while (
+      numPending &&
+      !this.pageFinished &&
+      !this.crawler.interrupted &&
+      !this.crawler.postCrawling
+    ) {
+      pending = [];
       for (const [requestId, reqresp] of this.pendingRequests.entries()) {
         const url = reqresp.url || "";
         const entry: {
@@ -892,11 +923,24 @@ export class Recorder {
       await sleep(5.0);
       numPending = this.pendingRequests.size;
     }
+
+    if (this.pendingRequests.size) {
+      logger.warn(
+        "Dropping timed out requests",
+        { numPending, pending, ...this.logDetails },
+        "recorder",
+      );
+      for (const requestId of this.pendingRequests.keys()) {
+        this.removeReqResp(requestId);
+      }
+    }
   }

   async onClosePage() {
     // Any page-specific handling before page is closed.
     this.frameIdToExecId = null;
+
+    this.pageFinished = true;
   }

   async onDone(timeout: number) {
@@ -1019,7 +1063,7 @@ export class Recorder {
     }
   }

-  isEssentialResource(resourceType: string | undefined, contentType: string) {
+  isEssentialResource(resourceType: string, contentType: string) {
     if (resourceType === "script" || resourceType === "stylesheet") {
       return true;
     }
@@ -1031,6 +1075,41 @@ export class Recorder {
     return false;
   }

+  shouldStream(
+    contentLength: number,
+    responseStatusCode: number,
+    resourceType: string,
+    mimeType: string,
+  ) {
+    // if contentLength is too large even for rewriting, always stream, will not do rewriting
+    // even if text
+    if (contentLength > MAX_TEXT_REWRITE_SIZE) {
+      return true;
+    }
+
+    // if contentLength is larger but not an essential resource, stream
+    // otherwise, do a full fetch for rewriting
+    if (
+      contentLength > MAX_BROWSER_DEFAULT_FETCH_SIZE &&
+      !this.isEssentialResource(resourceType, mimeType)
+    ) {
+      return true;
+    }
+
+    // if contentLength is unknown, also stream if it's not an essential resource and not a
+    // 3xx / 4xx / 5xx status code, as those may have no content-length and are likely small
+    if (
+      contentLength < 0 &&
+      !this.isEssentialResource(resourceType, mimeType) &&
+      responseStatusCode >= 200 &&
+      responseStatusCode < 300
+    ) {
+      return true;
+    }
+
+    return false;
+  }
+
   protected getMimeType(
     headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
   ) {
@@ -1089,10 +1168,6 @@ export class Recorder {
       logNetwork("Skipping ignored id", { requestId });
       return null;
     }
-    if (this.skipping) {
-      //logger.debug("Skipping request, page already finished", this.logDetails, "recorder");
-      return null;
-    }
     const reqresp = new RequestResponseInfo(requestId);
     this.pendingRequests.set(requestId, reqresp);
     return reqresp;
@@ -1395,7 +1470,7 @@ class AsyncFetcher {
         reqresp.payload = Buffer.concat(buffers, currSize);
         externalBuffer.buffers = [reqresp.payload];
       } else if (fh) {
-        logger.warn(
+        logger.debug(
           "Large payload written to WARC, but not returned to browser (would require rereading into memory)",
           { url, actualSize: reqresp.readSize, maxSize: this.maxFetchSize },
           "recorder",
src/util/reqresp.ts

@@ -49,6 +49,12 @@ export class RequestResponseInfo {
   payload?: Uint8Array;
   isRemoveRange = false;

+  // fetchContinued - avoid duplicate fetch response handling
+  fetchContinued = false;
+
+  // is handled in page context
+  inPageContext = false;
+
   // misc
   fromServiceWorker = false;
   fromCache = false;
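Together, the two flags added above guard against double-handling now that both the page-level and the always-on browser-context interceptors may pause the same response. A rough sketch of the intended flow, with simplified names (handleFetchResponseSketch() is illustrative, not the actual Recorder.handleFetchResponse() signature):

    // Illustrative sketch of how fetchContinued and inPageContext interact.
    class ReqRespSketch {
      fetchContinued = false; // set by whichever interceptor handles the response first
      inPageContext = false; // set when the page-level interceptor saw the request
    }

    // Both the page-context and browser-context CDP sessions may pause the
    // same response; returns whether this caller should serialize to WARC.
    function handleFetchResponseSketch(
      reqresp: ReqRespSketch,
      isBrowserContext: boolean,
    ): boolean {
      if (!isBrowserContext) {
        reqresp.inPageContext = true; // loadingFinished will trigger serialization later
      }
      if (reqresp.fetchContinued) {
        return false; // already being handled by the other interceptor
      }
      reqresp.fetchContinued = true;
      // ...continue or fulfill the paused response here...
      // Only the browser-context path serializes directly, and only when the
      // page context never saw the request (no loadingFinished will arrive).
      return isBrowserContext && !reqresp.inPageContext;
    }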
yarn.lock
@@ -2075,10 +2075,10 @@ detect-newline@^3.0.0:
   resolved "https://registry.yarnpkg.com/detect-newline/-/detect-newline-3.1.0.tgz#576f5dfc63ae1a192ff192d8ad3af6308991b651"
   integrity sha512-TLz+x/vEXm/Y7P7wn1EJFNLxYpUD4TgMosxY6fAVJUnJMbupHBOncxyWUG9OpTaH9EBD7uFI5LfEgmMOc54DsA==

-devtools-protocol@0.0.1342118:
-  version "0.0.1342118"
-  resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1342118.tgz#ea136fc1701572c0830233dcb414dc857e582e0a"
-  integrity sha512-75fMas7PkYNDTmDyb6PRJCH7ILmHLp+BhrZGeMsa4bCh40DTxgCz2NRy5UDzII4C5KuD0oBMZ9vXKhEl6UD/3w==
+devtools-protocol@0.0.1354347:
+  version "0.0.1354347"
+  resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1354347.tgz#5cb509610b8f61fc69a31e5c810d5bed002d85ea"
+  integrity sha512-BlmkSqV0V84E2WnEnoPnwyix57rQxAM5SKJjf4TbYOCGLAWtz8CDH8RIaGOjPgPCXo2Mce3kxSY497OySidY3Q==

 diff-sequences@^29.6.3:
   version "29.6.3"
@@ -4375,15 +4375,15 @@ punycode@^2.1.0:
   resolved "https://registry.yarnpkg.com/punycode/-/punycode-2.1.1.tgz#b58b010ac40c22c5657616c8d2c2c02c7bf479ec"
   integrity sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==

-puppeteer-core@^23.5.1:
-  version "23.5.1"
-  resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-23.5.1.tgz#fac4268820c35d3172e783a1f1a39773b2c0f7c6"
-  integrity sha512-We6xKCSZaZ23+GAYckeNfeDeJIVuhxOBsh/gZkbULu/XLFJ3umSiiQ8Ey927h3g/XrCCr8CnSZ5fvP5v2vB5Yw==
+puppeteer-core@^23.6.0:
+  version "23.6.0"
+  resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-23.6.0.tgz#a3e1e09c05f47fb8ca2bc9d4ca200d18e3704303"
+  integrity sha512-se1bhgUpR9C529SgHGr/eyT92mYyQPAhA2S9pGtGrVG2xob9qE6Pbp7TlqiSPlnnY1lINqhn6/67EwkdzOmKqQ==
   dependencies:
     "@puppeteer/browsers" "2.4.0"
     chromium-bidi "0.8.0"
     debug "^4.3.7"
-    devtools-protocol "0.0.1342118"
+    devtools-protocol "0.0.1354347"
     typed-query-selector "^2.12.0"
     ws "^8.18.0"