0.4.1 Release! (#70)

* optimization: don't intercept requests if no blockRules set

* page load: set waitUntil to use networkidle2 instead of networkidle0 as a reasonable default for most pages

* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)

* refactor profile loadProfile/saveProfile to util/browser.js
- support augmenting existing profile when creating a new profile

* screencasting: when screencasting, switch newContext from 'page' to 'window' automatically instead of just warning about it

* shared multiplatform image support:
- determine browser exe from list of options, getBrowserExe() returns current exe
- supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
- update to multiplatform oldwebtoday/chrome:91 as browser image
- enable multiplatform build with latest build-push-action@v2

* seeds: add trim() to seed URLs

* logging: reduce initial debug logging, enabling it only if '--logging debug' is set; always log whether a browser profile is in use, whether text extraction is enabled, and the post-processing stages

* profile creation: add --windowSize flag, set default to 1600x900, default to loading the Application tab, tweak UI styles

* extractLinks: support passing in a custom property to read the link from, and also loading it as an attribute via getAttribute (see the hypothetical driver sketch below this list). Fixes #25

* update CHANGES and README with new features

* bump version to 0.4.1
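As a rough illustration of the extractLinks change above: a hypothetical custom driver (the `a[data-link]` selector and `data-link` attribute are made up for this example, and the driver shape is assumed to follow the crawler's default driver) could queue links read from a custom attribute:

```
// Hypothetical custom driver, passed via --driver. The extractLinks signature
// matches this commit: extractLinks(page, seedId, depth, selector, prop, isAttribute).
// The "a[data-link]" selector and "data-link" attribute are illustrative only.
module.exports = async ({data, page, crawler}) => {
  // load the page (default a[href] link extraction assumed)
  await crawler.loadPage(page, data);

  // additionally queue links stored in a custom attribute, read via getAttribute()
  await crawler.extractLinks(page, data.seedId, data.depth, "a[data-link]", "data-link", true);
};
```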
Ilya Kreymer 2021-07-22 14:24:51 -07:00 committed by GitHub
parent 6a65ea7a58
commit f4c6b6a99f
16 changed files with 268 additions and 104 deletions


@ -2,17 +2,53 @@ name: Publish Docker image
on:
release:
types: [published]
jobs:
push_to_registries:
name: Push Docker image to Dockerhub
name: Build x86 and ARM Images and push to Dockerhub
runs-on: ubuntu-latest
steps:
- name: Check out the repo
- name: Check out the repo
uses: actions/checkout@v2
- name: Push to Docker Hub
uses: docker/build-push-action@v1
- name: Prepare
id: prep
run: |
DOCKER_IMAGE=webrecorder/browsertrix-crawler
VERSION=edge
if [[ $GITHUB_REF == refs/tags/* ]]; then
VERSION=${GITHUB_REF#refs/tags/}
elif [[ $GITHUB_REF == refs/heads/* ]]; then
VERSION=$(echo ${GITHUB_REF#refs/heads/} | sed -r 's#/+#-#g')
elif [[ $GITHUB_REF == refs/pull/* ]]; then
VERSION=pr-${{ github.event.number }}
fi
TAGS="${DOCKER_IMAGE}:${VERSION}"
echo ::set-output name=tags::${TAGS}
- name: Set up QEMU
uses: docker/setup-qemu-action@v1
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
repository: webrecorder/browsertrix-crawler
tag_with_ref: true
- name: Build and push
id: docker_build
uses: docker/build-push-action@v2
with:
context: .
push: true
tags: ${{ steps.prep.outputs.tags }}
platforms: "linux/amd64,linux/arm64"
- name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}


@ -1,5 +1,14 @@
## CHANGES
v0.4.1
- BlockRules Optimizations: don't intercept requests if no blockRules are set
- Profile Creation: Support extending an existing profile by passing a --profile param to load on startup
- Profile Creation: Set default window size to 1600x900, add --windowSize param for setting a custom size
- Behavior Timeouts: Add --behaviorTimeout to specify a custom timeout for behaviors, in seconds (defaults to 90 seconds)
- Load Wait Default: Switch to 'load,networkidle2' to speed up waiting for the initial load
- Multi-platform build: Support building for amd64 and ARM using oldwebtoday/chrome:91 images (checks for google-chrome and chromium-browser automatically)
- CI: Build a multi-platform (amd64 and arm64) image on each release
v0.4.0
- YAML-based config, specifiable via --config property or via stdin (with '--config stdin')
- Support for different scope types ('page', 'prefix', 'host', 'any', 'none') + crawl depth at crawl level


@ -1,10 +1,10 @@
ARG BROWSER_VERSION=90
ARG BROWSER_VERSION=91
ARG BROWSER_IMAGE_BASE=oldwebtoday/chrome
ARG BROWSER_BIN=google-chrome
FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION} as chrome
FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION} AS browser
FROM ubuntu:bionic
@ -36,8 +36,8 @@ ENV PROXY_HOST=localhost \
BROWSER_VERSION=${BROWSER_VERSION} \
BROWSER_BIN=${BROWSER_BIN}
COPY --from=chrome /tmp/*.deb /deb/
COPY --from=chrome /app/libpepflashplayer.so /app/libpepflashplayer.so
COPY --from=browser /tmp/*.deb /deb/
COPY --from=browser /app/libpepflashplayer.so /app/libpepflashplayer.so
RUN dpkg -i /deb/*.deb; apt-get update; apt-get install -fqqy && \
rm -rf /var/lib/apt/lists/*


@ -63,7 +63,7 @@ Browsertrix Crawler includes a number of additional command-line options, explai
--waitUntil Puppeteer page.goto() condition to
wait for before continuing, can be
multiple separate by ','
[default: "load,networkidle0"]
[default: "load,networkidle2"]
--depth The depth of the crawl for all seeds
[number] [default: -1]
--limit Limit crawl to this number of pages
@ -138,6 +138,10 @@ Browsertrix Crawler includes a number of additional command-line options, explai
--behaviors Which background behaviors to enable
on each page
[string] [default: "autoplay,autofetch,siteSpecific"]
--behaviorTimeout If >0, timeout (in seconds) that an
in-page behavior can run on each
page. If 0, a behavior can run until
it finishes. [number] [default: 90]
--profile Path to tar.gz file which will be
extracted and used as the browser
profile [string]
@ -152,10 +156,14 @@ Browsertrix Crawler includes a number of additional command-line options, explai
</details>
For the `--waitUntil` flag, see [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options).
### Waiting for Page Load
One of the key nuances of browser-based crawling is determining when a page is finished loading. This can be configured with the `--waitUntil` flag.
The default is `load,networkidle2`, which waits until the page has loaded and no more than 2 network requests remain, but for static sites, `--waitUntil domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load, for example). `--waitUntil networkidle0` may make sense for sites where absolutely all requests must finish before proceeding (see the sketch below).
See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options) in the Puppeteer docs for more info on the values that can be used with this flag.
The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example),
while `--waitUntil networkidle0` may make sense for dynamic sites.
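As a rough sketch of how the `--waitUntil` value reaches Puppeteer (illustrative only, not the crawler's exact code; the function name and timeout value here are assumptions), the comma-separated flag maps onto `page.goto()`'s `waitUntil` array:

```
// Illustrative sketch: a comma-separated --waitUntil value such as
// "load,networkidle2" can be passed to Puppeteer's page.goto() as an array
// of conditions, all of which must be met before navigation is considered done.
async function loadWithWaitUntil(page, url, waitUntil = "load,networkidle2") {
  const gotoOpts = {
    waitUntil: waitUntil.split(","),  // e.g. ["load", "networkidle2"]
    timeout: 90 * 1000,               // navigation timeout in ms (illustrative value)
  };
  try {
    await page.goto(url, gotoOpts);
  } catch (e) {
    console.warn(`Load timeout for ${url}`, e);
  }
}
```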
### YAML Crawl Config
@ -288,6 +296,8 @@ and auto-fetch content that is not loaded by default, and also run custom behavi
Behaviors to run can be specified via a comma-separated list passed to the `--behaviors` option. The auto-scroll behavior is not enabled by default, as it may slow down crawling. To enable this behavior, you can add
`--behaviors autoscroll` or to enable all behaviors, add `--behaviors autoscroll,autoplay,autofetch,siteSpecific`.
The site-specific behavior (or autoscroll) will start running after the page has finished its initial load (as defined by the `--waitUntil` settings). The behavior will then run until finished or until the behavior timeout is exceeded. This timeout can be set (in seconds) via the `--behaviorTimeout` flag (90 seconds by default). Setting the timeout to 0 will allow the behavior to run until it is finished.
See [Browsertrix Behaviors](https://github.com/webrecorder/browsertrix-behaviors) for more info on all of the currently available behaviors.
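The `--behaviorTimeout` value is given in seconds and converted to milliseconds before being handed to the behaviors. A minimal sketch of that conversion, mirroring the argument parsing shown later in this diff (the function name is illustrative):

```
// Minimal sketch mirroring util/argParser.js in this commit: --behaviors is a
// comma-separated list, and --behaviorTimeout (seconds) becomes a millisecond
// timeout on the behavior options.
function buildBehaviorOpts(argv) {
  const behaviorOpts = {};
  for (const behavior of argv.behaviors.split(",")) {
    behaviorOpts[behavior] = true;  // e.g. autoplay, autofetch, siteSpecific
  }
  if (argv.behaviorTimeout) {
    behaviorOpts.timeout = argv.behaviorTimeout * 1000;  // 90 -> 90000 ms
  }
  return behaviorOpts;
}

// buildBehaviorOpts({behaviors: "autoscroll,autoplay", behaviorTimeout: 90})
// -> { autoscroll: true, autoplay: true, timeout: 90000 }
```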
@ -336,6 +346,12 @@ The script will then prompt you for login credentials, attempt to login and crea
- To specify headless mode, add the `--headless` flag. Note that for crawls run with `--headless` flag, it is recommended to also create the profile with `--headless` to ensure the profile is compatible.
- To specify the window size for the profile creation embedded browser, specify `--windowSize WIDTH,HEIGHT`. (The default is 1600x900; see the launch sketch below for how this maps to the browser arguments.)
The current profile creation script is still experimental and the script attempts to detect the username and password fields on a site as generically as possible, but may not work for all sites. Additional profile functionality, such as support for custom profile creation scripts, may be added in the future.
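A minimal sketch (assuming puppeteer-core; the function name, executable path, and option set are illustrative) of how the `--windowSize` value translates into the browser launch arguments used for profile creation:

```
// Illustrative sketch, assuming puppeteer-core: --windowSize ("width,height")
// is passed through as a Chrome/Chromium --window-size launch argument.
const puppeteer = require("puppeteer-core");

async function launchProfileBrowser({windowSize = "1600,900", headless = false} = {}) {
  return await puppeteer.launch({
    headless,
    executablePath: "/usr/bin/google-chrome",  // or /usr/bin/chromium-browser on arm64
    ignoreHTTPSErrors: true,
    args: [`--window-size=${windowSize}`],
    defaultViewport: null,
  });
}
```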
### Interactive Profile Creation
For creating profiles of more complex sites, or logging in to multiple sites at once, the interactive profile creation mode can be used.
@ -351,14 +367,19 @@ Browsertrix Crawler will then create a profile as before using the current state
For example, to start in interactive profile creation mode, run:
```
docker run -p 9222:9222 -p 9223:9223 -v $PWD/crawls/profiles:/output/ -it webrecorder/browsertrix-crawler:0.4.0-beta.3 create-login-profile --interactive --url "https://example.com/"
docker run -p 9222:9222 -p 9223:9223 -v $PWD/profiles:/output/ -it webrecorder/browsertrix-crawler:[VERSION] create-login-profile --interactive --url "https://example.com/"
```
Then, open a browser pointing to `http://localhost:9223/` and use the embedded browser to log in to any sites or configure any settings as needed.
Click 'Create Profile' at the top when done. The profile will then be created in `./profiles/profile.tar.gz` containing the settings of this browsing session.
It is also possible to extend an existing profile by passing one in via the `--profile` flag. In this way, it is possible to build new profiles by extending previous browsing sessions as needed.
### Using Browser Profile
```
docker run -p 9222:9222 -p 9223:9223 -v $PWD/profiles:/profiles -it webrecorder/browsertrix-crawler:[VERSION] create-login-profile --interactive --url "https://example.com/" --filename /profiles/newProfile.tar.gz --profile /profiles/oldProfile.tar.gz
```
### Using Browser Profile with a Crawl
To use a previously created profile with a crawl, use the `--profile` flag or `profile` option. The `--profile` flag can be used to specify any Chrome profile stored as a tarball. Using profiles created with the same or an older version of Browsertrix Crawler is recommended to ensure compatibility. This option allows running a crawl with the browser already pre-configured: logged in to certain sites, language settings configured, etc.
@ -369,9 +390,6 @@ After running the above command, you can now run a crawl with the profile, as fo
docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl --profile /crawls/profiles/profile.tar.gz --url https://twitter.com/ --generateWACZ --collection test-with-profile
```
The current profile creation script is still experimental and the script attempts to detect the username and password fields on a site as generically as possible, but may not work for all sites. Additional profile functionality, such as support for custom profile creation scripts, may be added in the future.
## Architecture
The Docker container provided here packages up several components used in Browsertrix.
@ -386,20 +404,6 @@ The crawl produces a single pywb collection, at `/crawls/collections/<collection
To access the contents of the crawl, the `/crawls` directory in the container should be mounted to a volume (default in the Docker Compose setup).
### Building with Custom Browser Image / Building on Apple M1
Browsertrix Crawler can be built on the new ARM M1 chip (for development). However, since there is no Linux build of Chrome for ARM, Chromium can be used instead. Currently, Webrecorder provides the `oldwebtoday/chromium:91-arm` for running Browsertrix Crawler on ARM-based systems.
For example, to build with this Chromium image on an Apple M1 machine, run:
```
docker-compose build --build-arg BROWSER_IMAGE_BASE=oldwebtoday/chromium --build-arg "BROWSER_VERSION=91-arm" --build-arg BROWSER_BIN=chromium-browser
```
You should then be able to run Browsertrix Crawler natively on M1.
The build arguments specify the base image, version and browser binary. This approach can also be used to install a different browser in general from any Debian-based Docker image.
### Usage with Docker Compose
@ -427,6 +431,29 @@ In this example, the crawl data is written to `./crawls/collections/wr-net` by d
While the crawl is running, the status of the crawl (provided by puppeteer-cluster monitoring) prints the progress to the Docker log.
### Multi-Platform Build / Support for Apple M1
Browsertrix Crawler uses a browser image which supports amd64 and arm64 (currently `oldwebtoday/chrome:91`).
This means Browsertrix Crawler can be built natively on Apple M1 systems using the default settings. Simply running `docker-compose build` on an Apple M1 should produce a native build suitable for development.
On M1 systems, the browser used will be Chromium instead of Chrome, since there is no Linux build of Chrome for ARM; this is now handled automatically as part of the build.
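The automatic handling comes down to picking whichever browser binary is present in the image. A condensed sketch of the detection added in `util/browser.js` in this commit:

```
// Condensed from util/browser.js in this commit: prefer an explicitly set
// BROWSER_BIN, then google-chrome (amd64), then chromium-browser (arm64).
const fs = require("fs");

function getBrowserExe() {
  const candidates = [
    process.env.BROWSER_BIN,
    "/usr/bin/google-chrome",
    "/usr/bin/chromium-browser",
  ];
  for (const file of candidates) {
    if (file && fs.existsSync(file)) {
      return file;
    }
  }
  return null;
}
```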
### Custom Browser Image
It is also possible to build Browsertrix Crawler with a different browser image. Currently, browser images from `oldwebtoday/chrome` and `oldwebtoday/chromium` are supported.
For example, Webrecorder provides the `oldwebtoday/chromium:91-arm` image for running Browsertrix Crawler on ARM-based systems.
To build with this specific Chromium image on an Apple M1 machine, run:
```
docker-compose build --build-arg BROWSER_IMAGE_BASE=oldwebtoday/chromium --build-arg "BROWSER_VERSION=91-arm" --build-arg BROWSER_BIN=chromium-browser
```
The build arguments specify the base image, version and browser binary. This approach can also be used to install a different browser in general from any Debian-based Docker image. Additional browser images may be added in the future.
### Viewing crawled data with pywb
When a crawl is done, another browsertrix-crawler image can be started with a local [pywb](https://github.com/webrecorder/pywb) instance to view the crawl:


@ -2,7 +2,6 @@ const child_process = require("child_process");
const path = require("path");
const fs = require("fs");
const fsp = require("fs/promises");
const os = require("os");
// to ignore HTTPS error for HEAD check
const HTTPS_AGENT = require("https").Agent({
@ -27,7 +26,9 @@ const TextExtract = require("./util/textextract");
const { ScreenCaster } = require("./util/screencaster");
const { parseArgs } = require("./util/argParser");
const { BROWSER_BIN, BEHAVIOR_LOG_FUNC, HTML_TYPES } = require("./util/constants");
const { getBrowserExe, loadProfile } = require("./util/browser");
const { BEHAVIOR_LOG_FUNC, HTML_TYPES } = require("./util/constants");
const { BlockRules } = require("./util/blockrules");
@ -50,13 +51,20 @@ class Crawler {
this.limitHit = false;
this.userAgent = "";
this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
this.params = parseArgs(this.profileDir);
this.params = parseArgs();
this.debugLogging = this.params.logging.includes("debug");
this.profileDir = loadProfile(this.params.profile);
if (this.params.profile) {
this.statusLog("With Browser Profile: " + this.params.profile);
}
this.emulateDevice = this.params.emulateDevice;
console.log("Seeds", this.params.scopedSeeds);
this.debugLog("Seeds", this.params.scopedSeeds);
this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
this.capturePrefix = this.captureBasePrefix + "/id_/";
@ -75,10 +83,19 @@ class Crawler {
// pages file
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
this.blockRules = null;
}
statusLog(...args) {
console.log(...args);
}
debugLog(...args) {
if (this.debugLogging) {
console.log(...args);
}
}
configureUA() {
// override userAgent
if (this.params.userAgent) {
@ -91,6 +108,8 @@ class Crawler {
return;
}
this.browserExe = getBrowserExe();
// if device set, it overrides the default Chrome UA
if (this.emulateDevice) {
this.userAgent = this.emulateDevice.userAgent;
@ -98,9 +117,9 @@ class Crawler {
let version = process.env.BROWSER_VERSION;
try {
version = child_process.execFileSync(BROWSER_BIN, ["--product-version"], {encoding: "utf8"}).trim();
version = child_process.execFileSync(this.browserExe, ["--product-version"], {encoding: "utf8"}).trim();
} catch(e) {
console.log(e);
console.error(e);
}
this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;
@ -178,7 +197,7 @@ class Crawler {
// Puppeteer Options
return {
headless: this.params.headless,
executablePath: BROWSER_BIN,
executablePath: this.browserExe,
ignoreHTTPSErrors: true,
args: this.chromeArgs,
userDataDir: this.profileDir,
@ -223,7 +242,7 @@ class Crawler {
await page.emulate(this.emulateDevice);
}
if (this.profileDir) {
if (this.params.profile) {
await page._client.send("Network.setBypassServiceWorker", {bypass: true});
}
@ -290,7 +309,7 @@ class Crawler {
try {
this.driver = require(this.params.driver);
} catch(e) {
console.log(e);
console.warn(e);
return;
}
@ -309,12 +328,13 @@ class Crawler {
await this.initPages();
if (this.params.blockRules) {
if (this.params.blockRules && this.params.blockRules.length) {
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage);
}
if (this.params.screencastPort) {
this.screencaster = new ScreenCaster(this.cluster, this.params.screencastPort);
this.debugLog(`Screencast Server started on: ${this.params.screencastPort}`);
}
for (let i = 0; i < this.params.scopedSeeds.length; i++) {
@ -344,13 +364,13 @@ class Crawler {
}
if (this.params.generateCDX) {
console.log("Generate CDX");
this.statusLog("Generating CDX");
child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd});
}
if (this.params.generateWACZ) {
console.log("Generating WACZ");
this.statusLog("Generating WACZ");
const archiveDir = path.join(this.collDir, "archive");
@ -364,8 +384,8 @@ class Crawler {
warcFileList.forEach((val, index) => argument_list.push(path.join(archiveDir, val))); // eslint-disable-line no-unused-vars
// Run the wacz create command
child_process.spawnSync("wacz" , argument_list);
console.log(`WACZ successfully generated and saved to: ${waczPath}`);
child_process.spawnSync("wacz" , argument_list, {stdio: "inherit"});
this.debugLog(`WACZ successfully generated and saved to: ${waczPath}`);
}
}
@ -400,7 +420,7 @@ class Crawler {
try {
await page.goto(url, this.gotoOpts);
} catch (e) {
console.log(`Load timeout for ${url}`, e);
console.warn(`Load timeout for ${url}`, e);
}
if (selector) {
@ -408,7 +428,7 @@ class Crawler {
}
}
async extractLinks(page, seedId, depth, selector = "a[href]") {
async extractLinks(page, seedId, depth, selector = "a[href]", prop = "href", isAttribute = false) {
const results = [];
const seed = this.params.scopedSeeds[seedId];
@ -418,11 +438,18 @@ class Crawler {
return;
}
const loadProp = (selector, prop) => {
return [...document.querySelectorAll(selector)].map(elem => elem[prop]);
};
const loadAttr = (selector, attr) => {
return [...document.querySelectorAll(selector)].map(elem => elem.getAttribute(attr));
};
const loadFunc = isAttribute ? loadAttr : loadProp;
try {
const linkResults = await Promise.allSettled(page.frames().map(frame => frame.evaluate((selector) => {
/* eslint-disable-next-line no-undef */
return [...document.querySelectorAll(selector)].map(elem => elem.href);
}, selector)));
const linkResults = await Promise.allSettled(page.frames().map(frame => frame.evaluate(loadFunc, selector, prop)));
if (linkResults) {
for (const linkResult of linkResults) {
@ -452,7 +479,7 @@ class Crawler {
}
}
} catch (e) {
console.log("Queuing Error: ", e);
console.error("Queuing Error: ", e);
}
}
@ -486,19 +513,18 @@ class Crawler {
if (createNew) {
const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
if (this.params.text) {
console.log("creating pages with full text");
header["hasText"] = true;
}
else{
console.log("creating pages without full text");
this.statusLog("Text Extraction: Enabled");
} else {
header["hasText"] = false;
this.statusLog("Text Extraction: Disabled");
}
const header_formatted = JSON.stringify(header).concat("\n");
await this.pagesFH.writeFile(header_formatted);
}
} catch(err) {
console.log("pages/pages.jsonl creation failed", err);
console.error("pages/pages.jsonl creation failed", err);
}
}
@ -531,7 +557,7 @@ class Crawler {
agent: this.resolveAgent
});
if (resp.status >= 400) {
console.log(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
this.debugLog(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
return true;
}
@ -564,7 +590,7 @@ class Crawler {
}
async awaitPendingClear() {
console.log("Waiting to ensure pending data is written to WARC...");
this.statusLog("Waiting to ensure pending data is written to WARCs...");
const redis = new Redis("redis://localhost/0");
@ -574,7 +600,7 @@ class Crawler {
break;
}
console.log(`Still waiting for ${res} pending requests to finish...`);
this.debugLog(`Still waiting for ${res} pending requests to finish...`);
await this.sleep(1000);
}
@ -595,17 +621,17 @@ class Crawler {
const { sites } = await sitemapper.fetch();
this.queueUrls(seedId, sites, 0);
} catch(e) {
console.log(e);
console.warn(e);
}
}
async combineWARC() {
console.log("Combining the WARCs");
this.statusLog("Generating Combined WARCs");
// Get the list of created Warcs
const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));
console.log(`Combining ${warcLists.length} WARCs...`);
this.debugLog(`Combining ${warcLists.length} WARCs...`);
const fileSizeObjects = []; // Used to sort the created warc by fileSize
@ -674,7 +700,7 @@ class Crawler {
fh.write(warcBuffer);
}
console.log(`Appending WARC ${fileSizeObjects[j].fileName}`);
this.debugLog(`Appending WARC ${fileSizeObjects[j].fileName}`);
const reader = fs.createReadStream(fileSizeObjects[j].fileName);
@ -691,7 +717,7 @@ class Crawler {
await fh.end();
}
console.log(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
this.debugLog(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
}
}


@ -6,12 +6,11 @@ const child_process = require("child_process");
const puppeteer = require("puppeteer-core");
const yargs = require("yargs");
const { BROWSER_BIN } = require("./util/constants");
const { getBrowserExe, loadProfile, saveProfile } = require("./util/browser");
const fs = require("fs");
const path = require("path");
const http = require("http");
const url = require("url");
const profileHTML = fs.readFileSync(path.join(__dirname, "screencast", "createProfile.html"), {encoding: "utf8"});
function cliOpts() {
@ -49,6 +48,17 @@ function cliOpts() {
describe: "Start in interactive mode!",
type: "boolean",
default: false,
},
"profile": {
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
type: "string",
},
"windowSize": {
type: "string",
describe: "Browser window dimensions, specified as: width,height",
default: "1600,900"
}
};
}
@ -77,10 +87,11 @@ async function main() {
}
//await new Promise(resolve => setTimeout(resolve, 2000));
const profileDir = loadProfile(params.profile);
const args = {
headless: !!params.headless,
executablePath: BROWSER_BIN,
executablePath: getBrowserExe(),
ignoreHTTPSErrors: true,
args: [
"--no-xshm",
@ -88,9 +99,11 @@ async function main() {
"--disable-background-media-suspend",
"--autoplay-policy=no-user-gesture-required",
"--disable-features=IsolateOrigins,site-per-process",
"--user-data-dir=/tmp/profile",
"--remote-debugging-port=9221",
]
`--window-size=${params.windowSize}`
],
userDataDir: profileDir,
defaultViewport: null,
};
if (!params.user && !params.interactive) {
@ -163,7 +176,8 @@ async function createProfile(params, browser, page) {
const profileFilename = params.filename || "/output/profile.tar.gz";
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: "/tmp/profile"});
saveProfile(profileFilename);
console.log("done");
}
@ -199,16 +213,17 @@ function promptInput(msg, hidden = false) {
async function handleInteractive(params, browser, page) {
const target = page.target();
const targetUrl = `http://localhost:9222/devtools/inspector.html?ws=localhost:9222/devtools/page/${target._targetId}`;
const targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=localhost:9222/devtools/page/${target._targetId}&panel=resources`;
console.log("Creating Profile Interactively...");
child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);
const httpServer = http.createServer(async (req, res) => {
const pathname = url.parse(req.url).pathname;
const parsedUrl = new URL(req.url, `http://${req.headers.host}`);
const pathname = parsedUrl.pathname;
if (pathname === "/") {
res.writeHead(200, {"Content-Type": "text/html"});
res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl));
res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replace("$HOST", parsedUrl.hostname)));
} else if (pathname === "/createProfile" && req.method === "POST") {
@ -234,7 +249,7 @@ async function handleInteractive(params, browser, page) {
const port = 9223;
httpServer.listen(port);
console.log(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with the browser, click 'Create Profile' when done.`);
console.log(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`);
}
main();


@ -2,7 +2,7 @@ version: '3.5'
services:
crawler:
image: webrecorder/browsertrix-crawler:0.4.0
image: webrecorder/browsertrix-crawler:0.4.1
build:
context: ./


@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.4.0",
"version": "0.4.1",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",


@ -7,10 +7,16 @@ html, body, iframe {
height: 100%;
margin: 0;
padding: 0;
border: 0;
overflow: hidden;
font-family: sans-serif;
}
body {
display: flex;
flex-direction: column;
}
iframe#main {
height: calc(100% - 36px);
height: calc(100% - 36px);
}
div#info {
margin: 8px;
@ -18,11 +24,15 @@ div#info {
form {
display: inline;
}
button {
font-weight: bold;
font-size: 15px;
}
</style>
</head>
<body>
<div id="info">
<b>Create Profile Interactively</b> -- Load any pages that you want to be part of the profile. When Done, Click <form action="/createProfile" method="post"><button type="submit">Create Profile</button></form>
Log in to any site(s) that you want to be part of the crawl profile using the embedded browser below. When done, click <form action="/createProfile" method="post"><button type="submit">Create Profile</button></form>
</div>
<iframe id="main" src="$DEVTOOLS_SRC"></iframe>
</body>


@ -12,7 +12,7 @@ function getSeeds(config) {
return orig(name, ...args);
};
return parseArgs(null, ["node", "crawler", "--config", "configtest"]).scopedSeeds;
return parseArgs(["node", "crawler", "--config", "configtest"]).scopedSeeds;
}
test("default scope", async () => {


@ -1,6 +1,5 @@
const path = require("path");
const fs = require("fs");
const child_process = require("child_process");
const yaml = require("js-yaml");
const puppeteer = require("puppeteer-core");
@ -16,10 +15,6 @@ const { ScopedSeed } = require("./seeds");
// ============================================================================
class ArgParser {
constructor(profileDir) {
this.profileDir = profileDir;
}
get cliOpts() {
return {
"seeds": {
@ -50,7 +45,7 @@ class ArgParser {
"waitUntil": {
describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separate by ','",
default: "load,networkidle0",
default: "load,networkidle2",
},
"depth": {
@ -195,6 +190,12 @@ class ArgParser {
type: "string",
},
"behaviorTimeout": {
describe: "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish.",
default: 90,
type: "number",
},
"profile": {
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
type: "string",
@ -261,6 +262,9 @@ class ArgParser {
argv.behaviors = argv.behaviors.split(",");
}
argv.behaviors.forEach((x) => behaviorOpts[x] = true);
if (argv.behaviorTimeout) {
behaviorOpts.timeout = argv.behaviorTimeout *= 1000;
}
if (argv.logging.includes("behaviors")) {
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
} else if (argv.logging.includes("behaviors-debug")) {
@ -277,7 +281,8 @@ class ArgParser {
case "page":
argv.newContext = Cluster.CONCURRENCY_PAGE;
if (argv.screencastPort && argv.workers > 1) {
console.warn("Note: Screencast with >1 workers and default page context may only show one page at a time. To fix, add '--newContext window' to open each page in a new window");
console.log("Note: to support screencasting with >1 workers, newContext set to 'window' instead of 'page'");
argv.newContext = NewWindowPage;
}
break;
@ -348,15 +353,10 @@ class ArgParser {
argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
}
if (argv.profile) {
child_process.execSync("tar xvfz " + argv.profile, {cwd: this.profileDir});
}
return true;
}
}
module.exports.parseArgs = function(profileDir, argv) {
return new ArgParser(profileDir).parseArgs(argv);
module.exports.parseArgs = function(argv) {
return new ArgParser().parseArgs(argv);
};


@ -56,9 +56,19 @@ class BlockRules
}
async initPage(page) {
if (!this.rules.length) {
return;
}
await page.setRequestInterception(true);
page.on("request", (request) => this.handleRequest(request));
page.on("request", async (request) => {
try {
await this.handleRequest(request);
} catch (e) {
console.warn(e);
}
});
}
async handleRequest(request) {

util/browser.js (new file)

@ -0,0 +1,33 @@
const child_process = require("child_process");
const fs = require("fs");
const path = require("path");
const os = require("os");
const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
module.exports.loadProfile = function(profileFilename) {
if (profileFilename) {
child_process.execSync("tar xvfz " + profileFilename, {cwd: profileDir});
}
return profileDir;
};
module.exports.saveProfile = function(profileFilename) {
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: profileDir});
};
module.exports.getBrowserExe = function() {
const files = [process.env.BROWSER_BIN, "/usr/bin/google-chrome", "/usr/bin/chromium-browser"];
for (const file of files) {
if (file && fs.existsSync(file)) {
return file;
}
}
return null;
};


@ -2,5 +2,4 @@
module.exports.HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
module.exports.WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
module.exports.BEHAVIOR_LOG_FUNC = "__bx_log";
module.exports.BROWSER_BIN = process.env.BROWSER_BIN || "google-chrome";


@ -47,7 +47,6 @@ class ScreenCaster
});
this.httpServer.listen(port);
console.log(`Screencast Server started on: ${port}`);
}
initWebSocket(ws) {
@ -65,7 +64,7 @@ class ScreenCaster
}
ws.on("close", () => {
console.log("Screencast WebSocket Disconnected");
//console.log("Screencast WebSocket Disconnected");
this.allWS.delete(ws);
if (this.allWS.size === 0) {
@ -100,7 +99,7 @@ class ScreenCaster
try {
await cdp.send("Page.screencastFrameAck", {sessionId});
} catch(e) {
console.log("Ack Failed, probably window/tab already closed", e);
//console.log("Ack Failed, probably window/tab already closed", e);
}
});


@ -33,7 +33,7 @@ class ScopedSeed
parseUrl(url) {
let parsedUrl = null;
try {
parsedUrl = new URL(url);
parsedUrl = new URL(url.trim());
} catch (e) {
throw new Error(`Invalid Seed "${url}" - not a valid URL`);
}