2022-10-24 15:30:10 +02:00
|
|
|
import child_process from "child_process";
|
|
|
|
import fs from "fs";
|
|
|
|
import path from "path";
|
|
|
|
import os from "os";
|
|
|
|
import request from "request";
|
|
|
|
import { initStorage } from "./storage.js";
|
2022-12-15 12:38:41 -05:00
|
|
|
import { Logger } from "./logger.js";
|
|
|
|
|
|
|
|
// Logger instance shared by the helpers in this module.
const logger = new Logger();
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
|
|
|
|
// Temporary directory (prefix "profile-" under os.tmpdir()) created once when this module loads;
// loadProfile() extracts archives into it and saveProfile() archives its contents.
const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
|
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
/**
 * Prepare the profile directory, optionally seeding it from a profile archive.
 *
 * `profileFilename` may be:
 *  - an `http:` / `https:` URL, which is first downloaded to /tmp/profile.tar.gz;
 *  - `@<path>`, which is first fetched to /tmp/profile.tar.gz via the storage
 *    returned by `initStorage("")`;
 *  - a local path to a tar.gz archive;
 *  - empty / undefined, in which case nothing is extracted.
 *
 * A failed extraction is logged and otherwise ignored (best effort).
 *
 * @param {string} [profileFilename] - location of the profile archive.
 * @returns {Promise<string>} path of the module-level profile directory.
 * @throws {Error} if the download fails, or if an `@` profile is requested but
 *   no storage is configured and `logger.fatal` returns.
 */
export async function loadProfile(profileFilename) {
  const targetFilename = "/tmp/profile.tar.gz";
  let archivePath = profileFilename;

  if (archivePath && (archivePath.startsWith("http:") || archivePath.startsWith("https:"))) {
    logger.info(`Downloading ${archivePath} to ${targetFilename}`, {}, "browserProfile");

    await new Promise((resolve, reject) => {
      const out = fs.createWriteStream(targetFilename);
      // A failed write (unwritable /tmp, disk full, ...) must reject as well,
      // otherwise "finish" never fires and the await below would hang forever.
      out.on("error", (err) => reject(err));
      out.on("finish", () => resolve());

      request.get(archivePath).
        on("error", (err) => {
          // release the partially written file handle before failing
          out.destroy();
          reject(err);
        }).
        pipe(out);
    });

    archivePath = targetFilename;
  } else if (archivePath && archivePath.startsWith("@")) {
    const storage = initStorage("");

    if (!storage) {
      const message = "Profile specified relative to s3 storage, but no S3 storage defined";
      logger.fatal(message);
      // NOTE(review): logger.fatal presumably terminates the process; if it ever
      // returns, fail with a clear error instead of dereferencing a missing storage.
      throw new Error(message);
    }

    await storage.downloadFile(archivePath.slice(1), targetFilename);

    archivePath = targetFilename;
  }

  if (archivePath) {
    try {
      // execFileSync passes the path as a single argv entry (no shell), so spaces or
      // shell metacharacters in the name cannot break or inject into the command.
      // The path is made absolute because tar runs with cwd set to profileDir:
      // a relative name would otherwise be looked up inside the empty temp dir.
      child_process.execFileSync("tar", ["xvfz", path.resolve(archivePath)], {cwd: profileDir});
    } catch (e) {
      logger.error(`Profile filename ${archivePath} not a valid tar.gz`, e);
    }
  }

  return profileDir;
}
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
/**
 * Archive the contents of the module-level profile directory into a gzipped tarball.
 *
 * tar is run with its working directory set to the profile directory and
 * archives "./" into `profileFilename`.
 *
 * @param {string} profileFilename - path of the tar.gz archive to write.
 */
export function saveProfile(profileFilename) {
  const tarArgs = ["cvfz", profileFilename, "./"];
  const execOptions = {cwd: profileDir};

  child_process.execFileSync("tar", tarArgs, execOptions);
}
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
/**
 * Locate the browser executable to use.
 *
 * Candidates are checked in order: the BROWSER_BIN environment variable,
 * /usr/bin/google-chrome, then /usr/bin/chromium-browser.
 *
 * @returns {string|null} the first candidate path that exists on disk, or null if none does.
 */
export function getBrowserExe() {
  const candidates = [
    process.env.BROWSER_BIN,
    "/usr/bin/google-chrome",
    "/usr/bin/chromium-browser",
  ];

  // unset / empty candidates are skipped before touching the filesystem
  const found = candidates.find((candidate) => candidate && fs.existsSync(candidate));

  return found ?? null;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
/**
 * Build the default Chrome-on-Linux User-Agent string.
 *
 * The version is taken from the output of `<browser> --version`. If the browser
 * cannot be run, or its output contains no version number, the error is logged
 * and the BROWSER_VERSION environment variable is used instead.
 *
 * @returns {string} the User-Agent string.
 */
export function getDefaultUA() {
  // fallback, only replaced once a version has actually been parsed
  let version = process.env.BROWSER_VERSION;

  try {
    const output = child_process.execFileSync(getBrowserExe(), ["--version"], {encoding: "utf8"});
    const match = output.match(/[\d.]+/);
    if (!match) {
      // keep the fallback rather than leaking the raw (newline-terminated) output into the UA
      throw new Error(`No version number found in browser output: ${output.trim()}`);
    }
    version = match[0];
  } catch(e) {
    logger.error("Error getting default UserAgent", e);
  }

  return `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2022-08-21 00:30:25 -07:00
|
|
|
// Chromium features switched off via --disable-features, in the order they are passed.
const PLAYWRIGHT_DISABLED_FEATURES = [
  "ImprovedCookieControls",
  "LazyFrameLoading",
  "GlobalMediaControls",
  "DestroyProfileOnBrowserClose",
  "MediaRouter",
  "DialMediaRouteProvider",
  "AcceptCHFrame",
  "AutoExpandDetailsElement",
  "CertificateTransparencyComponentUpdater",
  // https://github.com/microsoft/playwright/issues/14047
  "AvoidUnnecessaryBeforeUnloadCheckSync",
  // https://github.com/microsoft/playwright/issues/16126
  "Translate",
];

// Default launch flags, taken from
// https://github.com/microsoft/playwright/blob/main/packages/playwright-core/src/server/chromium/chromium.ts#L327
const DEFAULT_PLAYWRIGHT_FLAGS = [
  // https://source.chromium.org/chromium/chromium/src/+/main:testing/variations/README.md
  "--disable-field-trial-config",
  "--disable-background-networking",
  "--enable-features=NetworkService,NetworkServiceInProcess",
  "--disable-background-timer-throttling",
  "--disable-backgrounding-occluded-windows",
  // Avoids surprises like main request not being intercepted during page.goBack().
  "--disable-back-forward-cache",
  "--disable-breakpad",
  "--disable-client-side-phishing-detection",
  "--disable-component-extensions-with-background-pages",
  "--disable-default-apps",
  "--disable-dev-shm-usage",
  "--disable-extensions",
  `--disable-features=${PLAYWRIGHT_DISABLED_FEATURES.join(",")}`,
  "--allow-pre-commit-input",
  "--disable-hang-monitor",
  "--disable-ipc-flooding-protection",
  "--disable-popup-blocking",
  "--disable-prompt-on-repost",
  "--disable-renderer-backgrounding",
  "--disable-sync",
  "--force-color-profile=srgb",
  "--metrics-recording-only",
  "--no-first-run",
  "--no-startup-window",
  "--password-store=basic",
  "--use-mock-keychain",
  // See https://chromium-review.googlesource.com/c/chromium/src/+/2436773
  "--no-service-autorun",
  "--export-tagged-pdf"
];
|
|
|
|
|
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
/**
 * Assemble the command-line flags used to launch Chrome.
 *
 * Order of the result: the default Playwright flags, any space-separated flags
 * from the CHROME_FLAGS environment variable, the fixed crawler flags, the
 * user-agent flag, `extraArgs`, and finally (if `proxy` is truthy) a
 * --proxy-server flag built from the PROXY_HOST / PROXY_PORT environment variables.
 *
 * @param {*} proxy - when truthy, the proxy-server flag is appended.
 * @param {string|null} [userAgent=null] - User-Agent to use; getDefaultUA() is used when falsy.
 * @param {string[]} [extraArgs=[]] - additional flags appended after the user-agent flag.
 * @returns {string[]} the list of Chrome flags.
 */
export function chromeArgs (proxy, userAgent=null, extraArgs=[]) {
  // Chrome Flags, including proxy server
  const flags = [...DEFAULT_PLAYWRIGHT_FLAGS];

  const envFlags = (process.env.CHROME_FLAGS ?? "").split(" ").filter(Boolean);
  flags.push(...envFlags);

  flags.push(
    //"--no-xshm", // needed for Chrome >80 (check if puppeteer adds automatically)
    "--no-sandbox",
    "--disable-background-media-suspend",
    "--remote-debugging-port=9221",
    "--autoplay-policy=no-user-gesture-required",
    "--disable-site-isolation-trials",
  );

  // the default UA is only computed when the caller did not supply one
  flags.push(`--user-agent=${userAgent || getDefaultUA()}`);
  flags.push(...extraArgs);

  if (proxy) {
    const { PROXY_HOST, PROXY_PORT } = process.env;
    flags.push(`--proxy-server=http://${PROXY_HOST}:${PROXY_PORT}`);
  }

  return flags;
}
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
|
|
|
|
|
2023-01-23 16:47:33 -08:00
|
|
|
/**
 * Evaluate a script in a frame's execution context with the DevTools
 * command-line API enabled.
 *
 * Sends `Runtime.evaluate` through the context's client (mirroring puppeteer's
 * _evaluateInternal(), but with includeCommandLineAPI: true), logs the start and
 * the outcome of the run, and returns the evaluated value.
 *
 * @param {object} frame - frame exposing executionContext() and url().
 * @param {string} funcString - source text of the expression to evaluate.
 * @param {string} [name="behaviors"] - label used in the log messages.
 * @returns {Promise<*>} the `value` of the evaluation result.
 */
export async function evaluateWithCLI(frame, funcString, name = "behaviors") {
  const context = await frame.executionContext();
  const url = frame.url();

  logger.info(`Running ${name}...`, {url});

  // from puppeteer _evaluateInternal() but with includeCommandLineAPI: true
  const evalParams = {
    expression: funcString + "\n//# sourceURL=__puppeteer_evaluation_script__",
    contextId: context._contextId,
    returnByValue: true,
    awaitPromise: true,
    userGesture: true,
    includeCommandLineAPI: true,
  };

  const response = await context._client.send("Runtime.evaluate", evalParams);
  const { exceptionDetails, result } = response;

  if (!exceptionDetails) {
    logger.info(`Run ${name} finished`, {url});
    return result.value;
  }

  // log the stack trace (when present) together with the page URL
  const details = exceptionDetails.stackTrace || {};
  details.url = url;
  logger.error(`Run ${name} failed: ${exceptionDetails.text}`, details);

  return result.value;
}
|
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules set
* page load: set waitUntil to use networkidle2 instead of networkidle0 as reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set. log if profile, text-extraction enabled, and post-processing stages automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in custom property to get link, and also loading as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
2021-07-22 14:24:51 -07:00
|
|
|
|
2022-03-18 10:32:59 -07:00
|
|
|
|
2022-10-24 15:30:10 +02:00
|
|
|
/**
 * Pause for the given duration.
 *
 * @param {number} time - delay passed to setTimeout (milliseconds).
 * @returns {Promise<void>} promise that resolves once the timer fires.
 */
export async function sleep(time) {
  await new Promise((wake) => {
    setTimeout(wake, time);
  });
}
|
2022-03-18 10:32:59 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|