Fix 206 response + general video handling (#646)

Refactors handling of 206 responses:
- If a 206 response is encountered, and it's actually the full range,
convert to 200 and rename the range and content-range headers to
x-orig-range and x-orig-content-range. This is to support rewriting of
206 responses for DASH manifests.
- If a partial 206 response starts with `0-`, do a full async fetch
separately.
- If a partial 206 response does not start with `0-`, just ignore it (very
likely a duplicate picked up when handling the `0-` response)
- Don't stream content-types that can be rewritten, since streaming
prevents rewriting. Fixes rewriting on DASH/HLS manifests which have no
content-length and don't get properly rewritten.
- Overall, adds missing rewriting of DASH/HLS manifests that have no
content-length and are served as 206.
- Update to latest wabac.js which fixes rewriting of DASH manifest to
avoid duplicate '<?xml' prefix, webrecorder/wabac.js#192
- Fixes #645
This commit is contained in:
Ilya Kreymer 2024-07-17 13:24:25 -07:00 committed by GitHub
parent 01666b4474
commit 88a2fbd0a0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 128 additions and 59 deletions

View file

@ -42,7 +42,7 @@ ADD config/ /app/
ADD html/ /app/html/
ARG RWP_VERSION=2.1.1
ARG RWP_VERSION=2.1.2
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.2.4",
"version": "1.2.5",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
@ -18,7 +18,7 @@
"dependencies": {
"@novnc/novnc": "^1.4.0",
"@types/sax": "^1.2.7",
"@webrecorder/wabac": "^2.19.1",
"@webrecorder/wabac": "^2.19.4",
"browsertrix-behaviors": "^0.6.2",
"crc": "^4.3.2",
"fetch-socks": "^1.3.0",

View file

@ -20,7 +20,7 @@ export function formatErr(e: unknown): Record<string, any> {
} else if (typeof e === "object") {
return e || {};
} else {
return { message: (e as object).toString() };
return { message: (e as object) + "" };
}
}

View file

@ -45,6 +45,17 @@ const WRITE_DUPE_KEY = "s:writedupe";
const MIME_EVENT_STREAM = "text/event-stream";
// MIME types whose responses the recorder treats as rewritable.
// isEssentialResource() returns true for any of these, which keeps such
// responses out of the streaming path so the full body can be fetched.
// NOTE(review): matching uses a case-sensitive Array.includes() against the
// value returned by getMimeType() -- confirm that value's casing agrees with
// mixed-case entries such as "application/x-mpegURL".
const RW_MIME_TYPES = [
"application/x-mpegURL",
"application/vnd.apple.mpegurl",
"application/dash+xml",
"text/html",
"application/json",
"text/javascript",
"application/javascript",
"application/x-javascript",
];
const encoder = new TextEncoder();
// =================================================================
@ -76,7 +87,6 @@ export type PageInfoRecord = {
// =================================================================
export type AsyncFetchOptions = {
tempdir: string;
reqresp: RequestResponseInfo;
expectedSize?: number;
// eslint-disable-next-line no-use-before-define
@ -135,8 +145,6 @@ export class Recorder {
logDetails: Record<string, any> = {};
skipping = false;
allowFull206 = false;
tempdir: string;
gzip = true;
@ -439,7 +447,6 @@ export class Recorder {
"recorder",
);
const fetcher = new AsyncFetcher({
tempdir: this.tempdir,
reqresp,
recorder: this,
networkId: requestId,
@ -572,15 +579,40 @@ export class Recorder {
if (responseStatusCode === 206) {
const range = this._getContentRange(responseHeaders);
if (
this.allowFull206 &&
range === `bytes 0-${contentLen - 1}/${contentLen}`
) {
if (range === `bytes 0-${contentLen - 1}/${contentLen}`) {
logger.debug(
"Keep 206 Response, Full Range",
{ range, contentLen, url, networkId, ...this.logDetails },
"recorder",
);
} else if (range?.startsWith("bytes 0-")) {
logger.debug(
"Re-request 206 Response without range",
{ range, contentLen, url, ...this.logDetails },
"recorder",
);
this.removeReqResp(networkId);
const reqresp = new RequestResponseInfo("0");
reqresp.fillRequest(params.request, params.resourceType);
if (reqresp.requestHeaders) {
delete reqresp.requestHeaders["range"];
delete reqresp.requestHeaders["Range"];
}
reqresp.frameId = params.frameId;
this.addAsyncFetch(
{
reqresp,
expectedSize: parseInt(range.split("/")[1]),
recorder: this,
networkId: "0",
cdp,
},
contentLen,
);
return false;
} else {
logger.debug(
"Skip 206 Response",
@ -624,16 +656,17 @@ export class Recorder {
return false;
}
const mimeType = this.getMimeType(responseHeaders) || "";
let streamingConsume = false;
// if contentLength is large or unknown, do streaming, unless it's an essential resource
// in which case, need to do a full fetch either way
if (
(contentLen < 0 || contentLen > MAX_BROWSER_DEFAULT_FETCH_SIZE) &&
!this.isEssentialResource(reqresp.resourceType)
!this.isEssentialResource(reqresp.resourceType, mimeType)
) {
const opts: ResponseStreamAsyncFetchOptions = {
tempdir: this.tempdir,
reqresp,
expectedSize: contentLen,
recorder: this,
@ -659,14 +692,7 @@ export class Recorder {
// if not consumed via takeStream, attempt async loading
if (!streamingConsume) {
let fetcher: AsyncFetcher;
if (reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE) {
fetcher = new AsyncFetcher(opts);
} else {
fetcher = new NetworkLoadStreamAsyncFetcher(opts);
}
this.fetcherQ.add(() => fetcher.load());
this.addAsyncFetch(opts, contentLen);
return false;
}
} else {
@ -698,7 +724,7 @@ export class Recorder {
}
}
const rewritten = await this.rewriteResponse(reqresp, responseHeaders);
const rewritten = await this.rewriteResponse(reqresp, mimeType);
// if in service worker, serialize here
// as won't be getting a loadingFinished message
@ -746,6 +772,17 @@ export class Recorder {
return true;
}
/**
 * Queue an asynchronous fetch for the request described by `opts`.
 *
 * A plain AsyncFetcher is used when the request method is not GET or when
 * `contentLen` exceeds MAX_NETWORK_LOAD_SIZE; every other case uses a
 * NetworkLoadStreamAsyncFetcher. The chosen fetcher's load() is added to
 * the fetcher queue rather than being run inline.
 *
 * @param opts - fetch options, including the request/response info
 * @param contentLen - content length used for the size comparison
 */
addAsyncFetch(opts: NetworkLoadAsyncFetchOptions, contentLen: number) {
  const needsPlainFetch =
    opts.reqresp.method !== "GET" || contentLen > MAX_NETWORK_LOAD_SIZE;

  const fetcher: AsyncFetcher = needsPlainFetch
    ? new AsyncFetcher(opts)
    : new NetworkLoadStreamAsyncFetcher(opts);

  this.fetcherQ.add(() => fetcher.load());
}
startPage({ pageid, url }: { pageid: string; url: string }) {
this.pageid = pageid;
this.pageUrl = url;
@ -927,10 +964,7 @@ export class Recorder {
return false;
}
async rewriteResponse(
reqresp: RequestResponseInfo,
responseHeaders?: Protocol.Fetch.HeaderEntry[],
) {
async rewriteResponse(reqresp: RequestResponseInfo, contentType: string) {
const { url, extraOpts, payload } = reqresp;
// don't rewrite if payload is missing or too big
@ -941,8 +975,6 @@ export class Recorder {
let newString = null;
let string = null;
const contentType = this._getContentType(responseHeaders);
switch (contentType) {
case "application/x-mpegURL":
case "application/vnd.apple.mpegurl":
@ -983,17 +1015,26 @@ export class Recorder {
"recorder",
);
reqresp.payload = encoder.encode(newString);
reqresp.isRemoveRange = true;
return true;
} else {
return false;
}
}
isEssentialResource(resourceType: string | undefined) {
return ["document", "stylesheet", "script"].includes(resourceType || "");
/**
 * Report whether a response counts as essential.
 *
 * Returns true when `resourceType` is "document", "stylesheet" or "script",
 * or when `contentType` is one of the entries in RW_MIME_TYPES; otherwise
 * returns false. An undefined `resourceType` is compared as the empty string.
 *
 * @param resourceType - resource type of the request, if known
 * @param contentType - MIME type of the response
 */
isEssentialResource(resourceType: string | undefined, contentType: string) {
  const essentialResourceTypes = ["document", "stylesheet", "script"];
  return (
    essentialResourceTypes.includes(resourceType || "") ||
    RW_MIME_TYPES.includes(contentType)
  );
}
_getContentType(
protected getMimeType(
headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
) {
if (!headers) {
@ -1008,7 +1049,7 @@ export class Recorder {
return null;
}
_getContentLen(headers?: Protocol.Fetch.HeaderEntry[]) {
protected _getContentLen(headers?: Protocol.Fetch.HeaderEntry[]) {
if (!headers) {
return -1;
}
@ -1120,7 +1161,7 @@ export class Recorder {
!isRedirectStatus(status) &&
!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, url, status))
) {
logNetwork("Skipping dupe", { url });
logNetwork("Skipping dupe", { url, status, ...this.logDetails });
return;
}
@ -1173,7 +1214,6 @@ export class Recorder {
// ignore dupes: if previous URL was not a page, still load as page. if previous was page,
// should not get here, as dupe pages tracked via seen list
const fetcher = new AsyncFetcher({
tempdir: this.tempdir,
reqresp,
recorder: this,
networkId: "0",
@ -1231,7 +1271,6 @@ class AsyncFetcher {
manualRedirect = false;
constructor({
tempdir,
reqresp,
expectedSize = -1,
recorder,
@ -1251,7 +1290,7 @@ class AsyncFetcher {
this.recorder = recorder;
this.tempdir = tempdir;
this.tempdir = recorder.tempdir;
this.filename = path.join(
this.tempdir,
`${timestampNow()}-${uuidv4()}.data`,
@ -1604,7 +1643,7 @@ class NetworkLoadStreamAsyncFetcher extends AsyncFetcher {
return;
}
reqresp.status = httpStatusCode || 0;
reqresp.setStatus(httpStatusCode || 200);
reqresp.responseHeaders = headers || {};
return this.takeStreamIter(cdp, stream);
@ -1618,6 +1657,10 @@ function createResponse(
pageid: string,
contentIter?: AsyncIterable<Uint8Array> | Iterable<Uint8Array>,
) {
if (reqresp.isRemoveRange && reqresp.status === 206) {
reqresp.setStatus(200);
}
const url = reqresp.url;
const warcVersion = "WARC/1.1";
const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`;

View file

@ -7,6 +7,8 @@ import { HTML_TYPES } from "./constants.js";
import { Response } from "undici";
const CONTENT_LENGTH = "content-length";
const CONTENT_RANGE = "content-range";
const RANGE = "range";
const CONTENT_TYPE = "content-type";
const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"];
@ -46,6 +48,7 @@ export class RequestResponseInfo {
responseHeadersText?: string;
payload?: Uint8Array;
isRemoveRange = false;
// misc
fromServiceWorker = false;
@ -76,11 +79,17 @@ export class RequestResponseInfo {
this.requestId = requestId;
}
/**
 * Set the response status code together with its status text.
 *
 * @param status - HTTP status code to store
 * @param statusText - status text to store; when missing or empty, the
 *   text is taken from getStatusText() for the given code
 */
setStatus(status: number, statusText?: string) {
  this.status = status;
  this.statusText = statusText ? statusText : getStatusText(status);
}
fillFetchRequestPaused(params: Protocol.Fetch.RequestPausedEvent) {
this.fillRequest(params.request, params.resourceType);
this.status = params.responseStatusCode || 0;
this.statusText = params.responseStatusText || getStatusText(this.status);
if (params.responseStatusCode) {
this.setStatus(params.responseStatusCode, params.responseStatusText);
}
this.responseHeadersList = params.responseHeaders;
@ -116,8 +125,7 @@ export class RequestResponseInfo {
this.url = response.url.split("#")[0];
this.status = response.status;
this.statusText = response.statusText || getStatusText(this.status);
this.setStatus(response.status, response.statusText);
this.protocol = response.protocol;
@ -182,8 +190,7 @@ export class RequestResponseInfo {
fillFetchResponse(response: Response) {
this.responseHeaders = Object.fromEntries(response.headers);
this.status = response.status;
this.statusText = response.statusText || getStatusText(this.status);
this.setStatus(response.status, response.statusText);
}
fillRequestExtraInfo(
@ -240,7 +247,11 @@ export class RequestResponseInfo {
headersDict[headerName] = "" + actualContentLength;
continue;
}
if (EXCLUDE_HEADERS.includes(headerName)) {
if (
EXCLUDE_HEADERS.includes(headerName) ||
(this.isRemoveRange &&
(headerName === CONTENT_RANGE || headerName === RANGE))
) {
headerName = "x-orig-" + headerName;
}
headersDict[headerName] = this._encodeHeaderValue(header.value);
@ -263,7 +274,11 @@ export class RequestResponseInfo {
}
const value = this._encodeHeaderValue(headersDict[key]);
if (EXCLUDE_HEADERS.includes(keyLower)) {
if (
EXCLUDE_HEADERS.includes(keyLower) ||
(this.isRemoveRange &&
(keyLower === CONTENT_RANGE || keyLower === RANGE))
) {
headersDict["x-orig-" + key] = value;
delete headersDict[key];
} else {
@ -316,11 +331,11 @@ export class RequestResponseInfo {
}
shouldSkipSave() {
// skip cached, OPTIONS/HEAD responses, and 304 or 206 responses
// skip cached, OPTIONS/HEAD responses, and 304 responses
if (
this.fromCache ||
(this.method && ["OPTIONS", "HEAD"].includes(this.method)) ||
[206, 304].includes(this.status)
this.status == 304
) {
return true;
}
@ -330,6 +345,17 @@ export class RequestResponseInfo {
return true;
}
if (this.status === 206) {
const headers = new Headers(this.getResponseHeadersDict());
const contentLength: number = parseInt(
headers.get(CONTENT_LENGTH) || "0",
);
const contentRange = headers.get(CONTENT_RANGE);
if (contentRange !== `bytes 0-${contentLength - 1}/${contentLength}`) {
return false;
}
}
return false;
}

View file

@ -1300,21 +1300,21 @@
resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==
"@webrecorder/wabac@^2.19.1":
version "2.19.1"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.1.tgz#ce0d609f9e90c708af99945e1fa338be0ba2b5f9"
integrity sha512-m8Fi70OkhzkicbcbN5TrrBpj5D/EZKzVp5905kGPoC2F2zLqxUDMzx1FOHt2sTO/1b9NMvBmw9Pk1JQyYEm6rA==
"@webrecorder/wabac@^2.19.4":
version "2.19.4"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.4.tgz#6c91a65928413b8394f17b57f57a803dcb111dbe"
integrity sha512-USWUoreSfgyeYYrC2/o2YYr4dCUSwgOSzbpdapqh90VQ4Fb0fjwPAiessBCH4rA5yd9QpOgWdkapDmXvLx6Bww==
dependencies:
"@peculiar/asn1-ecc" "^2.3.4"
"@peculiar/asn1-schema" "^2.3.3"
"@peculiar/x509" "^1.9.2"
"@webrecorder/wombat" "^3.7.8"
"@webrecorder/wombat" "^3.7.11"
acorn "^8.10.0"
auto-js-ipfs "^2.1.1"
base64-js "^1.5.1"
brotli "^1.3.3"
buffer "^6.0.3"
fast-xml-parser "^4.2.5"
fast-xml-parser "^4.4.0"
hash-wasm "^4.9.0"
http-link-header "^1.1.3"
http-status-codes "^2.1.4"
@ -1329,10 +1329,10 @@
stream-browserify "^3.0.0"
warcio "^2.2.1"
"@webrecorder/wombat@^3.7.8":
version "3.7.8"
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.8.tgz#a414278b6fbd99bc02a97e384f0373307e60d9fa"
integrity sha512-BmEHrvGLHPQtECmCK9Oz7G3p2StsyaFOlNmAMDSNK/GjqPH+UWZOqDryBkWryTh+pFZXKblqyotLtvR4YxVyeQ==
"@webrecorder/wombat@^3.7.11":
version "3.7.11"
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.11.tgz#27539f52317b2d80af4f28d971d59b53bc0f2b96"
integrity sha512-WlGpKjHUpP2aZo/OrY5aduNX/TVdo+hSkzu9as/63wSQ4ZFWIqZ+pxYXci43hjV5oVjcMP4KALLq+V+Fuo8qSA==
dependencies:
warcio "^2.2.0"
@ -2431,7 +2431,7 @@ fast-xml-parser@^4.2.2:
dependencies:
strnum "^1.0.5"
fast-xml-parser@^4.2.5:
fast-xml-parser@^4.4.0:
version "4.4.0"
resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.4.0.tgz#341cc98de71e9ba9e651a67f41f1752d1441a501"
integrity sha512-kLY3jFlwIYwBNDojclKsNAC12sfD6NwW74QB2CoNGPvtVxjliYehVunB3HYyNi+n4Tt1dAcgwYvmKF/Z18flqg==