1.2.8 updates: (#668)

- rewriting: update wabac.js, use getCustomRewriter(), don't truncate
POST request bodies for URLs that use a custom rewriter
- browser: --enable-automation is disabled (that flag is what sets
navigator.webdriver = true), so the webdriver override is no longer needed
- deps: update puppeteer-core, necessary changes for latest puppeteer
This commit is contained in:
Ilya Kreymer 2024-08-13 23:38:55 -07:00 committed by GitHub
parent bb34c5ef47
commit 8d7fb1e084
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 53 additions and 50 deletions

View file

@ -42,7 +42,7 @@ ADD config/ /app/
ADD html/ /app/html/
ARG RWP_VERSION=2.1.3
ARG RWP_VERSION=2.1.4
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz

View file

@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.2.7",
"version": "1.2.8",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
@ -18,8 +18,8 @@
"dependencies": {
"@novnc/novnc": "^1.4.0",
"@types/sax": "^1.2.7",
"@webrecorder/wabac": "^2.19.4",
"browsertrix-behaviors": "^0.6.3",
"@webrecorder/wabac": "^2.19.7",
"browsertrix-behaviors": "^0.6.4",
"fetch-socks": "^1.3.0",
"get-folder-size": "^4.0.0",
"husky": "^8.0.3",
@ -30,7 +30,7 @@
"p-queue": "^7.3.4",
"pixelmatch": "^5.3.0",
"pngjs": "^7.0.0",
"puppeteer-core": "^22.14.0",
"puppeteer-core": "^23.0.2",
"sax": "^1.3.0",
"sharp": "^0.32.6",
"tsc": "^2.0.4",

View file

@ -113,7 +113,7 @@ export class Browser {
headless,
executablePath: this.getBrowserExe(),
ignoreDefaultArgs: ["--enable-automation", "--hide-scrollbars"],
ignoreHTTPSErrors: true,
acceptInsecureCerts: true,
handleSIGHUP: signals,
handleSIGINT: signals,
handleSIGTERM: signals,
@ -140,11 +140,6 @@ export class Browser {
}
async setupPage({ page }: { page: Page; cdp: CDPSession }) {
await this.addInitScript(
page,
'Object.defineProperty(navigator, "webdriver", {value: false});',
);
switch (this.swOpt) {
case "disabled":
logger.debug("Service Workers: always disabled", {}, "browser");

View file

@ -14,11 +14,8 @@ import {
import { fetch, Response } from "undici";
import {
baseRules as baseDSRules,
htmlRules as htmlDSRules,
// @ts-expect-error TODO fill in why error is expected
} from "@webrecorder/wabac/src/rewrite/index.js";
// @ts-expect-error TODO fill in why error is expected
import { getCustomRewriter } from "@webrecorder/wabac/src/rewrite/index.js";
import {
rewriteDASH,
rewriteHLS,
@ -1003,10 +1000,9 @@ export class Recorder {
case "text/javascript":
case "application/javascript":
case "application/x-javascript": {
const rules = contentType === "text/html" ? htmlDSRules : baseDSRules;
const rw = rules.getRewriter(url);
const rw = getCustomRewriter(url, isHTMLMime(contentType));
if (rw !== rules.defaultRewriter) {
if (rw) {
string = payload.toString();
newString = rw.rewrite(string, { live: true, save: extraOpts });
}

View file

@ -1,5 +1,7 @@
// @ts-expect-error TODO fill in why error is expected
import { getStatusText } from "@webrecorder/wabac/src/utils.js";
// @ts-expect-error TODO fill in why error is expected
import { getCustomRewriter } from "@webrecorder/wabac/src/rewrite/index.js";
import { Protocol } from "puppeteer-core";
import { postToGetUrl } from "warcio";
@ -372,8 +374,11 @@ export class RequestResponseInfo {
};
if (postToGetUrl(convData)) {
//this.requestBody = convData.requestBody;
// truncate to avoid extra long URLs
// if not custom rewrite, truncate to avoid extra long URLs
if (getCustomRewriter(this.url, isHTMLMime(this.getMimeType() || ""))) {
return convData.url;
}
try {
const url = new URL(convData.url);
for (const [key, value] of url.searchParams.entries()) {

View file

@ -70,7 +70,7 @@ export class Screenshots {
});
}
const options = screenshotTypes[screenshotType];
const screenshotBuffer = await this.page.screenshot(options);
const screenshotBuffer = Buffer.from(await this.page.screenshot(options));
if (state && screenshotType === "view") {
state.screenshotView = screenshotBuffer;
}

View file

@ -1300,21 +1300,21 @@
resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==
"@webrecorder/wabac@^2.19.4":
version "2.19.4"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.4.tgz#6c91a65928413b8394f17b57f57a803dcb111dbe"
integrity sha512-USWUoreSfgyeYYrC2/o2YYr4dCUSwgOSzbpdapqh90VQ4Fb0fjwPAiessBCH4rA5yd9QpOgWdkapDmXvLx6Bww==
"@webrecorder/wabac@^2.19.7":
version "2.19.7"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.7.tgz#3afe48f79752bcd189cffd5d5e6a8dbe4f394053"
integrity sha512-X9UFxWCww1KWDnAaEjg7vpg6SznBov5a88FPxbOvo5yCT/UkJcQHaa0qo1L52l46sIAUnSbsYz1ur9yMd6ygVA==
dependencies:
"@peculiar/asn1-ecc" "^2.3.4"
"@peculiar/asn1-schema" "^2.3.3"
"@peculiar/x509" "^1.9.2"
"@webrecorder/wombat" "^3.7.11"
"@webrecorder/wombat" "^3.7.14"
acorn "^8.10.0"
auto-js-ipfs "^2.1.1"
base64-js "^1.5.1"
brotli "^1.3.3"
buffer "^6.0.3"
fast-xml-parser "^4.4.0"
fast-xml-parser "^4.4.1"
hash-wasm "^4.9.0"
http-link-header "^1.1.3"
http-status-codes "^2.1.4"
@ -1329,10 +1329,10 @@
stream-browserify "^3.0.0"
warcio "^2.2.1"
"@webrecorder/wombat@^3.7.11":
version "3.7.11"
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.11.tgz#27539f52317b2d80af4f28d971d59b53bc0f2b96"
integrity sha512-WlGpKjHUpP2aZo/OrY5aduNX/TVdo+hSkzu9as/63wSQ4ZFWIqZ+pxYXci43hjV5oVjcMP4KALLq+V+Fuo8qSA==
"@webrecorder/wombat@^3.7.14":
version "3.7.14"
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.14.tgz#3779e4cadb256755bbbfd2960805965ec4daacd8"
integrity sha512-sDNH+c8WstQrK91y8kIPJh1XAC2WXLU5rC8wztANzK1mVzA7v6XB5gk3Yp7OIAn4bn1XuGRVjubhKhmxVVZ9kg==
dependencies:
warcio "^2.2.0"
@ -1677,10 +1677,10 @@ browserslist@^4.22.2:
node-releases "^2.0.14"
update-browserslist-db "^1.0.13"
browsertrix-behaviors@^0.6.3:
version "0.6.3"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.6.3.tgz#cdd6457bcc718cc30257fd754a2c12191a6431a2"
integrity sha512-fr9w8ANqmxDid4Ile+dYjwcU5nD4+ZhTBVID2zBYWNoSoFkrEILUtpSAbBmLtr5Ujulxjn71uUQwMOfAFAUqzw==
browsertrix-behaviors@^0.6.4:
version "0.6.4"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.6.4.tgz#33fe9a433108f2faac3a03af91aff940433e5b87"
integrity sha512-xaiO/VqqeSd5FnAkIKQINxC/q3Med33Lqw3LGxD4NBtkcMSh1Anz/+830QHVlQbp08nIPUXYV96hDrx1Uv0PmQ==
dependencies:
query-selector-shadow-dom "^1.0.1"
@ -1801,10 +1801,10 @@ chownr@^1.1.1:
resolved "https://registry.yarnpkg.com/chownr/-/chownr-1.1.4.tgz#6fc9d7b42d32a583596337666e7d08084da2cc6b"
integrity sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==
chromium-bidi@0.6.2:
version "0.6.2"
resolved "https://registry.yarnpkg.com/chromium-bidi/-/chromium-bidi-0.6.2.tgz#91f9daa20984833b52221084480fbe0465b29c67"
integrity sha512-4WVBa6ijmUTVr9cZD4eicQD8Mdy/HCX3bzEIYYpmk0glqYLoWH+LqQEvV9RpDRzoQSbY1KJHloYXbDMXMbDPhg==
chromium-bidi@0.6.4:
version "0.6.4"
resolved "https://registry.yarnpkg.com/chromium-bidi/-/chromium-bidi-0.6.4.tgz#627d76bae2819d59b61a413babe9664e0a16b71d"
integrity sha512-8zoq6ogmhQQkAKZVKO2ObFTl4uOkqoX1PlKQX3hZQ5E9cbUotcAb7h4pTNVAGGv8Z36PF3CtdOriEp/Rz82JqQ==
dependencies:
mitt "3.0.1"
urlpattern-polyfill "10.0.0"
@ -1973,6 +1973,13 @@ debug@^4.3.5:
dependencies:
ms "2.1.2"
debug@^4.3.6:
version "4.3.6"
resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.6.tgz#2ab2c38fbaffebf8aa95fdfe6d88438c7a13c52b"
integrity sha512-O/09Bd4Z1fBrU4VzkhFqVgpPzaGbw6Sm9FEkBT1A/YBXQFGuuSxa1dN2nxgxS34JmKXqYx8CZAwEVoJFImUXIg==
dependencies:
ms "2.1.2"
decode-uri-component@^0.2.2:
version "0.2.2"
resolved "https://registry.yarnpkg.com/decode-uri-component/-/decode-uri-component-0.2.2.tgz#e69dbe25d37941171dd540e024c444cd5188e1e9"
@ -2428,10 +2435,10 @@ fast-xml-parser@^4.2.2:
dependencies:
strnum "^1.0.5"
fast-xml-parser@^4.4.0:
version "4.4.0"
resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.4.0.tgz#341cc98de71e9ba9e651a67f41f1752d1441a501"
integrity sha512-kLY3jFlwIYwBNDojclKsNAC12sfD6NwW74QB2CoNGPvtVxjliYehVunB3HYyNi+n4Tt1dAcgwYvmKF/Z18flqg==
fast-xml-parser@^4.4.1:
version "4.4.1"
resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.4.1.tgz#86dbf3f18edf8739326447bcaac31b4ae7f6514f"
integrity sha512-xkjOecfnKGkSsOwtZ5Pz7Us/T6mrbPQrq0nh+aCO5V9nk5NLWmasAHumTKjiPJPWANe+kAZ84Jc8ooJkzZ88Sw==
dependencies:
strnum "^1.0.5"
@ -4345,14 +4352,14 @@ punycode@^2.1.0:
resolved "https://registry.yarnpkg.com/punycode/-/punycode-2.1.1.tgz#b58b010ac40c22c5657616c8d2c2c02c7bf479ec"
integrity sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==
puppeteer-core@^22.14.0:
version "22.14.0"
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-22.14.0.tgz#5bb466adba725c966b0a86f0337a476d4c68ebec"
integrity sha512-rl4tOY5LcA3e374GAlsGGHc05HL3eGNf5rZ+uxkl6id9zVZKcwcp1Z+Nd6byb6WPiPeecT/dwz8f/iUm+AZQSw==
puppeteer-core@^23.0.2:
version "23.0.2"
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-23.0.2.tgz#343c8d003e609620febfe35f76847a0014cdc97c"
integrity sha512-MvOHn+g1TYkAR2oVd/bf/YWXKqFTJmkhyyurYgxkrjh8rBOL1ZH5VyOsLJi0bLO7/yoipAmk1gFZEx9HUJnaoA==
dependencies:
"@puppeteer/browsers" "2.3.0"
chromium-bidi "0.6.2"
debug "^4.3.5"
chromium-bidi "0.6.4"
debug "^4.3.6"
devtools-protocol "0.0.1312386"
ws "^8.18.0"