mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Logging and browser improvements: (#158)
* logging: add 'jserrors' option to --logging to print JS errors
* browser config: use flags from playwright
* browser: use socat to allow connecting to devtools on port 9222 while crawling
This commit is contained in:
parent
6cc38bf511
commit
e22d95e2f0
4 changed files with 54 additions and 7 deletions
14
crawler.js
14
crawler.js
|
@ -104,6 +104,8 @@ class Crawler {
|
|||
this.sizeExceeded = false;
|
||||
this.finalExit = false;
|
||||
this.behaviorLastLine = null;
|
||||
|
||||
this.logConsole = false;
|
||||
}
|
||||
|
||||
statusLog(...args) {
|
||||
|
@ -228,6 +230,8 @@ class Crawler {
|
|||
redisStdio = "ignore";
|
||||
}
|
||||
|
||||
this.logConsole = this.params.logging.includes("jserrors");
|
||||
|
||||
this.browserExe = getBrowserExe();
|
||||
|
||||
this.configureUA();
|
||||
|
@ -257,6 +261,8 @@ class Crawler {
|
|||
}
|
||||
});
|
||||
|
||||
child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);
|
||||
|
||||
if (!this.params.headless && !process.env.NO_XVFB) {
|
||||
child_process.spawn("Xvfb", [
|
||||
process.env.DISPLAY,
|
||||
|
@ -708,6 +714,14 @@ class Crawler {
|
|||
// more serious page error, mark page session as invalid
|
||||
page.on("error", () => this.markPageFailed(page));
|
||||
|
||||
if (this.logConsole) {
|
||||
page.on("console", (msg) => {
|
||||
if (msg.type() === "error") {
|
||||
console.log(msg.text(), msg.location());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
const gotoOpts = isHTMLPage ? this.gotoOpts : "domcontentloaded";
|
||||
|
||||
try {
|
||||
|
|
|
@ -119,7 +119,6 @@ async function main() {
|
|||
}
|
||||
|
||||
const browserArgs = chromeArgs(useProxy, null, [
|
||||
"--remote-debugging-port=9221",
|
||||
`--window-size=${params.windowSize}`,
|
||||
]);
|
||||
|
||||
|
|
|
@ -158,7 +158,7 @@ class ArgParser {
|
|||
},
|
||||
|
||||
"logging": {
|
||||
describe: "Logging options for crawler, can include: stats, pywb, behaviors, behaviors-debug",
|
||||
describe: "Logging options for crawler, can include: stats, pywb, behaviors, behaviors-debug, jserrors",
|
||||
type: "string",
|
||||
default: "stats",
|
||||
},
|
||||
|
|
|
@ -84,19 +84,53 @@ function getDefaultUA() {
|
|||
module.exports.getDefaultUA = getDefaultUA;
|
||||
|
||||
|
||||
// from https://github.com/microsoft/playwright/blob/main/packages/playwright-core/src/server/chromium/chromium.ts#L327
|
||||
// Default Chromium command-line flags, taken from Playwright's Chromium launcher (see the source link above).
// chromeArgs() spreads this list first, ahead of any CHROME_FLAGS entries and the crawler's own flags.
// NOTE(review): chromeArgs() appears to pass its own --enable-features and --disable-features values
// after this list; confirm how Chromium resolves the repeated switches (the later value may replace these).
const DEFAULT_PLAYWRIGHT_FLAGS = [
|
||||
"--disable-field-trial-config", // https://source.chromium.org/chromium/chromium/src/+/main:testing/variations/README.md
|
||||
"--disable-background-networking",
|
||||
"--enable-features=NetworkService,NetworkServiceInProcess",
|
||||
"--disable-background-timer-throttling",
|
||||
"--disable-backgrounding-occluded-windows",
|
||||
"--disable-back-forward-cache", // Avoids surprises like main request not being intercepted during page.goBack().
|
||||
"--disable-breakpad",
|
||||
"--disable-client-side-phishing-detection",
|
||||
"--disable-component-extensions-with-background-pages",
|
||||
"--disable-default-apps",
|
||||
"--disable-dev-shm-usage",
|
||||
"--disable-extensions",
|
||||
// AvoidUnnecessaryBeforeUnloadCheckSync - https://github.com/microsoft/playwright/issues/14047
|
||||
// Translate - https://github.com/microsoft/playwright/issues/16126
|
||||
"--disable-features=ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,DialMediaRouteProvider,AcceptCHFrame,AutoExpandDetailsElement,CertificateTransparencyComponentUpdater,AvoidUnnecessaryBeforeUnloadCheckSync,Translate",
|
||||
"--allow-pre-commit-input",
|
||||
"--disable-hang-monitor",
|
||||
"--disable-ipc-flooding-protection",
|
||||
"--disable-popup-blocking",
|
||||
"--disable-prompt-on-repost",
|
||||
"--disable-renderer-backgrounding",
|
||||
"--disable-sync",
|
||||
"--force-color-profile=srgb",
|
||||
"--metrics-recording-only",
|
||||
"--no-first-run",
|
||||
"--enable-automation",
|
||||
"--password-store=basic",
|
||||
"--use-mock-keychain",
|
||||
// See https://chromium-review.googlesource.com/c/chromium/src/+/2436773
|
||||
"--no-service-autorun",
|
||||
"--export-tagged-pdf"
|
||||
];
|
||||
|
||||
|
||||
module.exports.chromeArgs = (proxy, userAgent=null, extraArgs=[]) => {
|
||||
// Chrome Flags, including proxy server
|
||||
const args = [
|
||||
...DEFAULT_PLAYWRIGHT_FLAGS,
|
||||
...(process.env.CHROME_FLAGS ?? "").split(" ").filter(Boolean),
|
||||
"--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
|
||||
//"--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
|
||||
"--no-sandbox",
|
||||
"--disable-background-media-suspend",
|
||||
"--enable-features=NetworkService,NetworkServiceInProcess",
|
||||
"--remote-debugging-port=9221",
|
||||
"--autoplay-policy=no-user-gesture-required",
|
||||
"--disable-features=IsolateOrigins,site-per-process,ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,AcceptCHFrame,AutoExpandDetailsElement",
|
||||
"--disable-site-isolation-trials",
|
||||
"--disable-popup-blocking",
|
||||
"--disable-backgrounding-occluded-windows",
|
||||
`--user-agent=${userAgent || getDefaultUA()}`,
|
||||
...extraArgs,
|
||||
];
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue