mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
browser: just pass profileUrl and track if custom profile is used
browser: don't always disable service workers (this was accidentally added as part of the playwright migration); only disable them if using a profile, same as the 0.8.x behavior. Potential fix for #288. Bump version to 0.9.1.
This commit is contained in:
parent
ebdf0ac8f8
commit
6531d52e2c
4 changed files with 29 additions and 15 deletions
|
@ -347,7 +347,7 @@ export class Crawler {
|
||||||
}
|
}
|
||||||
|
|
||||||
async setupPage({page, cdp, workerid}) {
|
async setupPage({page, cdp, workerid}) {
|
||||||
await page.addInitScript("Object.defineProperty(navigator, \"webdriver\", {value: false});");
|
await this.browser.setupPage({page, cdp});
|
||||||
|
|
||||||
if (this.params.logging.includes("jserrors")) {
|
if (this.params.logging.includes("jserrors")) {
|
||||||
page.on("console", (msg) => {
|
page.on("console", (msg) => {
|
||||||
|
@ -644,8 +644,6 @@ export class Crawler {
|
||||||
}
|
}
|
||||||
|
|
||||||
async crawl() {
|
async crawl() {
|
||||||
this.profileDir = await this.browser.loadProfile(this.params.profile);
|
|
||||||
|
|
||||||
if (this.params.healthCheckPort) {
|
if (this.params.healthCheckPort) {
|
||||||
this.healthChecker = new HealthChecker(this.params.healthCheckPort, this.params.workers);
|
this.healthChecker = new HealthChecker(this.params.healthCheckPort, this.params.workers);
|
||||||
}
|
}
|
||||||
|
@ -723,7 +721,7 @@ export class Crawler {
|
||||||
}
|
}
|
||||||
|
|
||||||
await this.browser.launch({
|
await this.browser.launch({
|
||||||
dataDir: this.profileDir,
|
profileUrl: this.params.profile,
|
||||||
headless: this.params.headless,
|
headless: this.params.headless,
|
||||||
emulateDevice: this.emulateDevice,
|
emulateDevice: this.emulateDevice,
|
||||||
chromeOptions: {
|
chromeOptions: {
|
||||||
|
|
|
@ -199,7 +199,8 @@ async function main() {
|
||||||
await cdp.send("Network.setCacheDisabled", {cacheDisabled: true});
|
await cdp.send("Network.setCacheDisabled", {cacheDisabled: true});
|
||||||
|
|
||||||
if (!params.automated) {
|
if (!params.automated) {
|
||||||
await page.addInitScript("Object.defineProperty(navigator, \"webdriver\", {value: false});");
|
await browser.setupPage({page, cdp});
|
||||||
|
|
||||||
// for testing, inject browsertrix-behaviors
|
// for testing, inject browsertrix-behaviors
|
||||||
await page.addInitScript(behaviors + ";\nself.__bx_behaviors.init();");
|
await page.addInitScript(behaviors + ";\nself.__bx_behaviors.init();");
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "browsertrix-crawler",
|
"name": "browsertrix-crawler",
|
||||||
"version": "0.9.0",
|
"version": "0.9.1",
|
||||||
"main": "browsertrix-crawler",
|
"main": "browsertrix-crawler",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
"repository": "https://github.com/webrecorder/browsertrix-crawler",
|
||||||
|
|
|
@ -11,8 +11,6 @@ import { initStorage } from "./storage.js";
|
||||||
|
|
||||||
import { chromium } from "playwright-core";
|
import { chromium } from "playwright-core";
|
||||||
|
|
||||||
const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
|
|
||||||
|
|
||||||
|
|
||||||
// ==================================================================
|
// ==================================================================
|
||||||
export class Browser
|
export class Browser
|
||||||
|
@ -22,16 +20,22 @@ export class Browser
|
||||||
|
|
||||||
this.firstPage = null;
|
this.firstPage = null;
|
||||||
this.firstCDP = null;
|
this.firstCDP = null;
|
||||||
|
|
||||||
|
this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
|
||||||
|
this.customProfile = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
async launch({dataDir, chromeOptions, signals = false, headless = false, emulateDevice = {viewport: null}} = {}) {
|
async launch({profileUrl, chromeOptions, signals = false, headless = false, emulateDevice = {viewport: null}} = {}) {
|
||||||
if (this.context) {
|
if (this.context) {
|
||||||
logger.warn("Context already inited", {}, "context");
|
logger.warn("Context already inited", {}, "context");
|
||||||
return this.context;
|
return this.context;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (profileUrl) {
|
||||||
|
this.customProfile = await this.loadProfile(profileUrl);
|
||||||
|
}
|
||||||
|
|
||||||
const args = this.chromeArgs(chromeOptions);
|
const args = this.chromeArgs(chromeOptions);
|
||||||
const userDataDir = dataDir || profileDir;
|
|
||||||
|
|
||||||
const launchOpts = {
|
const launchOpts = {
|
||||||
...emulateDevice,
|
...emulateDevice,
|
||||||
|
@ -43,10 +47,10 @@ export class Browser
|
||||||
handleSIGHUP: signals,
|
handleSIGHUP: signals,
|
||||||
handleSIGINT: signals,
|
handleSIGINT: signals,
|
||||||
handleSIGTERM: signals,
|
handleSIGTERM: signals,
|
||||||
serviceWorkers: dataDir ? "block" : "allow",
|
serviceWorkers: "allow"
|
||||||
};
|
};
|
||||||
|
|
||||||
this.context = await chromium.launchPersistentContext(userDataDir, launchOpts);
|
this.context = await chromium.launchPersistentContext(this.profileDir, launchOpts);
|
||||||
|
|
||||||
if (this.context.pages()) {
|
if (this.context.pages()) {
|
||||||
this.firstPage = this.context.pages()[0];
|
this.firstPage = this.context.pages()[0];
|
||||||
|
@ -65,6 +69,16 @@ export class Browser
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async setupPage({page, cdp}) {
|
||||||
|
await page.addInitScript("Object.defineProperty(navigator, \"webdriver\", {value: false});");
|
||||||
|
|
||||||
|
if (this.customProfile) {
|
||||||
|
logger.info("Disabling Service Workers for profile", {}, "browser");
|
||||||
|
|
||||||
|
await cdp.send("Network.setBypassServiceWorker", {bypass: true});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async getFirstPageWithCDP() {
|
async getFirstPageWithCDP() {
|
||||||
return {page: this.firstPage, cdp: this.firstCDP};
|
return {page: this.firstPage, cdp: this.firstCDP};
|
||||||
}
|
}
|
||||||
|
@ -126,17 +140,18 @@ export class Browser
|
||||||
|
|
||||||
if (profileFilename) {
|
if (profileFilename) {
|
||||||
try {
|
try {
|
||||||
child_process.execSync("tar xvfz " + profileFilename, {cwd: profileDir});
|
child_process.execSync("tar xvfz " + profileFilename, {cwd: this.profileDir});
|
||||||
|
return true;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.error(`Profile filename ${profileFilename} not a valid tar.gz`);
|
logger.error(`Profile filename ${profileFilename} not a valid tar.gz`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return profileDir;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
saveProfile(profileFilename) {
|
saveProfile(profileFilename) {
|
||||||
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: profileDir});
|
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: this.profileDir});
|
||||||
}
|
}
|
||||||
|
|
||||||
chromeArgs({proxy=true, userAgent=null, extraArgs=[]} = {}) {
|
chromeArgs({proxy=true, userAgent=null, extraArgs=[]} = {}) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue