Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 14:33:17 +00:00)
0.4.1 Release! (#70)
* optimization: don't intercept requests if no blockRules are set
* page load: set waitUntil to networkidle2 instead of networkidle0 as a reasonable default for most pages
* add --behaviorTimeout to set max running time for behaviors (defaults to 90 seconds)
* refactor profile loadProfile/saveProfile to util/browser.js
  - support augmenting an existing profile when creating a new profile
* screencasting: convert newContext to window instead of page by default, instead of just warning about it
* shared multiplatform image support:
  - determine browser exe from list of options, getBrowserExe() returns current exe
  - supports running with 'google-chrome' under amd64, and 'chromium-browser' under arm64
  - update to multiplatform oldwebtoday/chrome:91 as browser image
  - enable multiplatform build with latest build-push-action@v2
* seeds: add trim() to seed URLs
* logging: reduce initial debug logging, enable only if '--logging debug' is set; log profile and text-extraction status, and post-processing stages, automatically
* profile creation: add --windowSize flag, set default to 1600x900, default to loading Application tab, tweak UI styles
* extractLinks: support passing in a custom property to get the link, and also loading it as an attribute via getAttribute. Fixes #25
* update CHANGES and README with new features
* bump version to 0.4.1
parent 6a65ea7a58
commit f4c6b6a99f
16 changed files with 268 additions and 104 deletions
.github/workflows/release.yaml (vendored) — 48 lines changed

@@ -2,17 +2,53 @@ name: Publish Docker image
 on:
   release:
     types: [published]

 jobs:
   push_to_registries:
-    name: Push Docker image to Dockerhub
+    name: Build x86 and ARM Images and push to Dockerhub
     runs-on: ubuntu-latest
     steps:
-      - name: Check out the repo
+      -
+        name: Check out the repo
         uses: actions/checkout@v2
-      - name: Push to Docker Hub
-        uses: docker/build-push-action@v1
+
+      -
+        name: Prepare
+        id: prep
+        run: |
+          DOCKER_IMAGE=webrecorder/browsertrix-crawler
+          VERSION=edge
+          if [[ $GITHUB_REF == refs/tags/* ]]; then
+            VERSION=${GITHUB_REF#refs/tags/}
+          elif [[ $GITHUB_REF == refs/heads/* ]]; then
+            VERSION=$(echo ${GITHUB_REF#refs/heads/} | sed -r 's#/+#-#g')
+          elif [[ $GITHUB_REF == refs/pull/* ]]; then
+            VERSION=pr-${{ github.event.number }}
+          fi
+          TAGS="${DOCKER_IMAGE}:${VERSION}"
+          echo ::set-output name=tags::${TAGS}
+      -
+        name: Set up QEMU
+        uses: docker/setup-qemu-action@v1
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v1
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
-          repository: webrecorder/browsertrix-crawler
-          tag_with_ref: true
+      -
+        name: Build and push
+        id: docker_build
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          push: true
+          tags: ${{ steps.prep.outputs.tags }}
+          platforms: "linux/amd64,linux/arm64"
+      -
+        name: Image digest
+        run: echo ${{ steps.docker_build.outputs.digest }}
CHANGES.md

@@ -1,5 +1,14 @@
 ## CHANGES

+v0.4.1
+- BlockRules Optimizations: don't intercept requests if no blockRules
+- Profile Creation: Support extending an existing profile by passing a --profile param to load on startup
+- Profile Creation: Set default window size to 1600x900, add --windowSize param for setting a custom size
+- Behavior Timeouts: Add --behaviorTimeout to specify a custom timeout for behaviors, in seconds (defaulting to 90 seconds)
+- Load Wait Default: Switch to 'load,networkidle2' to speed up waiting for initial load
+- Multi-platform build: Support building for amd64 and ARM using oldwebtoday/chrome:91 images (check for google-chrome and chromium-browser automatically)
+- CI: Build a multi-platform (amd64 and arm64) image on each release
+
 v0.4.0
 - YAML based config, specifiable via --config property or via stdin (with '--config stdin')
 - Support for different scope types ('page', 'prefix', 'host', 'any', 'none') + crawl depth at crawl level
Dockerfile

@@ -1,10 +1,10 @@
-ARG BROWSER_VERSION=90
+ARG BROWSER_VERSION=91

 ARG BROWSER_IMAGE_BASE=oldwebtoday/chrome

 ARG BROWSER_BIN=google-chrome

-FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION} as chrome
+FROM ${BROWSER_IMAGE_BASE}:${BROWSER_VERSION} AS browser

 FROM ubuntu:bionic

@@ -36,8 +36,8 @@ ENV PROXY_HOST=localhost \
     BROWSER_VERSION=${BROWSER_VERSION} \
     BROWSER_BIN=${BROWSER_BIN}

-COPY --from=chrome /tmp/*.deb /deb/
-COPY --from=chrome /app/libpepflashplayer.so /app/libpepflashplayer.so
+COPY --from=browser /tmp/*.deb /deb/
+COPY --from=browser /app/libpepflashplayer.so /app/libpepflashplayer.so
 RUN dpkg -i /deb/*.deb; apt-get update; apt-get install -fqqy && \
     rm -rf /var/lib/opts/lists/*
README.md — 73 lines changed

@@ -63,7 +63,7 @@ Browsertrix Crawler includes a number of additional command-line options, explained below:
   --waitUntil          Puppeteer page.goto() condition to
                        wait for before continuing, can be
                        multiple separate by ','
-                       [default: "load,networkidle0"]
+                       [default: "load,networkidle2"]
   --depth              The depth of the crawl for all seeds
                        [number] [default: -1]
   --limit              Limit crawl to this number of pages

@@ -138,6 +138,10 @@ Browsertrix Crawler includes a number of additional command-line options, explained below:
   --behaviors          Which background behaviors to enable
                        on each page
                        [string] [default: "autoplay,autofetch,siteSpecific"]
+  --behaviorTimeout    If >0, timeout (in seconds) for
+                       in-page behavior will run on each
+                       page. If 0, a behavior can run until
+                       finish.        [number] [default: 90]
   --profile            Path to tar.gz file which will be
                        extracted and used as the browser
                        profile                     [string]
@@ -152,10 +156,14 @@ Browsertrix Crawler includes a number of additional command-line options, explained below:
 </details>

-For the `--waitUntil` flag, see [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options).
+### Waiting for Page Load
+
+One of the key nuances of browser-based crawling is determining when a page has finished loading. This can be configured with the `--waitUntil` flag.
+
+The default is `load,networkidle2`, which waits until the page load event has fired and no more than 2 network requests remain. For static sites, `--waitUntil domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load, for example), while `--waitUntil networkidle0` may make sense for sites where absolutely all requests must finish before proceeding.
+
+See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options) in the Puppeteer docs for more info on the values that can be used with this flag.

-The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example),
-while `--waitUntil networkidle0` may make sense for dynamic sites.

 ### YAML Crawl Config
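(A minimal sketch of how the comma-separated `--waitUntil` value maps onto Puppeteer's `page.goto()`. The lifecycle event names and the `waitUntil`/`timeout` options are real Puppeteer API; the surrounding function and variable names are illustrative, not the exact crawler.js code.)

```
async function loadPage(page, url, waitUntilParam = "load,networkidle2", timeoutSecs = 90) {
  // Puppeteer accepts a single lifecycle event or an array of them;
  // navigation only resolves once *all* listed conditions are met.
  const waitUntil = waitUntilParam.split(",");

  // e.g. ["load", "networkidle2"]: load event fired AND <= 2 network connections still open
  await page.goto(url, {waitUntil, timeout: timeoutSecs * 1000});
}
```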
@@ -288,6 +296,8 @@ and auto-fetch content that is not loaded by default, and also run custom behaviors on each page.
 Behaviors to run can be specified via a comma-separated list passed to the `--behaviors` option. The auto-scroll behavior is not enabled by default, as it may slow down crawling. To enable it, you can add
 `--behaviors autoscroll`, or to enable all behaviors, add `--behaviors autoscroll,autoplay,autofetch,siteSpecific`.

+The site-specific behavior (or autoscroll) will start running after the page has finished its initial load (as defined by the `--waitUntil` settings). The behavior then runs until it finishes or until the behavior timeout is exceeded. This timeout can be set (in seconds) via the `--behaviorTimeout` flag (90 seconds by default). Setting the timeout to 0 will allow the behavior to run until it is finished.
+
 See [Browsertrix Behaviors](https://github.com/webrecorder/browsertrix-behaviors) for more info on all of the currently available behaviors.
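(A small sketch of the seconds-to-milliseconds conversion that `--behaviorTimeout` goes through before being handed to the in-page behaviors, mirroring the util/argParser.js change later in this commit; the wrapper function is illustrative only.)

```
// behaviorTimeout is taken in seconds on the command line (default 90);
// 0 (or unset) means "let the behavior run to completion"
function buildBehaviorOpts(argv) {
  const behaviorOpts = {};

  for (const name of argv.behaviors.split(",")) {
    behaviorOpts[name] = true;
  }

  if (argv.behaviorTimeout) {
    // behaviors expect a timeout in milliseconds
    behaviorOpts.timeout = argv.behaviorTimeout * 1000;
  }

  return behaviorOpts;
}
```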
@@ -336,6 +346,12 @@ The script will then prompt you for login credentials, attempt to login and create a tar.gz file in `./crawls/profiles/profile.tar.gz`.

 - To specify headless mode, add the `--headless` flag. Note that for crawls run with the `--headless` flag, it is recommended to also create the profile with `--headless` to ensure the profile is compatible.

+- To specify the window size for the profile creation embedded browser, specify `--windowSize WIDTH,HEIGHT`. (The default is 1600x900.)
+
+The current profile creation script is still experimental and the script attempts to detect the username and password fields on a site as generically as possible, but may not work for all sites. Additional profile functionality, such as support for custom profile creation scripts, may be added in the future.
+
 ### Interactive Profile Creation

 For creating profiles of more complex sites, or logging in to multiple sites at once, the interactive profile creation mode can be used.

@@ -351,14 +367,19 @@ Browsertrix Crawler will then create a profile as before using the current state of the browsing session.
 For example, to start in interactive profile creation mode, run:

 ```
-docker run -p 9222:9222 -p 9223:9223 -v $PWD/crawls/profiles:/output/ -it webrecorder/browsertrix-crawler:0.4.0-beta.3 create-login-profile --interactive --url "https://example.com/"
+docker run -p 9222:9222 -p 9223:9223 -v $PWD/profiles:/output/ -it webrecorder/browsertrix-crawler:[VERSION] create-login-profile --interactive --url "https://example.com/"
 ```

 Then, open a browser pointing to `http://localhost:9223/` and use the embedded browser to log in to any sites or configure any settings as needed.
 Click 'Create Profile' at the top when done. The profile will then be created in `./crawls/profiles/profile.tar.gz` containing the settings of this browsing session.

+It is also possible to extend an existing profile by passing it in via the `--profile` flag. In this way, it is possible to build new profiles by extending previous browsing sessions as needed.
+
+```
+docker run -p 9222:9222 -p 9223:9223 -v $PWD/profiles:/profiles --filename /profiles/newProfile.tar.gz -it webrecorder/browsertrix-crawler:[VERSION] create-login-profile --interactive --url "https://example.com/" --profile /profiles/oldProfile.tar.gz
+```
+
-### Using Browser Profile
+### Using Browser Profile with a Crawl

 To use a previously created profile with a crawl, use the `--profile` flag or `profile` option. The `--profile` flag can then be used to specify any Chrome profile stored as a tarball. Using profiles created with the same or an older version of Browsertrix Crawler is recommended to ensure compatibility. This option allows running a crawl with the browser already pre-configured, logged in to certain sites, language settings configured, etc...
@@ -369,9 +390,6 @@ After running the above command, you can now run a crawl with the profile, as follows:
 docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl --profile /crawls/profiles/profile.tar.gz --url https://twitter.com/ --generateWACZ --collection test-with-profile
 ```

-The current profile creation script is still experimental and the script attempts to detect the usename and password fields on a site as generically as possible, but may not work for all sites. Additional profile functionality, such as support for custom profile creation scripts, may be added in the future.
-
-
 ## Architecture

 The Docker container provided here packages up several components used in Browsertrix.

@@ -386,20 +404,6 @@ The crawl produces a single pywb collection, at `/crawls/collections/<collection name>` in the Docker container.

 To access the contents of the crawl, the `/crawls` directory in the container should be mounted to a volume (default in the Docker Compose setup).

-### Building with Custom Browser Image / Building on Apple M1
-
-Browsertrix Crawler can be built on the new ARM M1 chip (for development). However, since there is no Linux build of Chrome for ARM, Chromium can be used instead. Currently, Webrecorder provides the `oldwebtoday/chromium:91-arm` for running Browsertrix Crawler on ARM-based systems.
-
-For example, to build with this Chromium image on an Apple M1 machine, run:
-
-```
-docker-compose build --build-arg BROWSER_IMAGE_BASE=oldwebtoday/chromium --build-arg "BROWSER_VERSION=91-arm" --build-arg BROWSER_BIN=chromium-browser
-```
-
-You should then be able to run Browsertrix Crawler natively on M1.
-
-The build arguments specify the base image, version and browser binary. This approach can also be used to install a different browser in general from any Debian-based Docker image.
-
 ### Usage with Docker Compose

@@ -427,6 +431,29 @@ In this example, the crawl data is written to `./crawls/collections/wr-net` by default.
 While the crawl is running, the status of the crawl (provided by puppeteer-cluster monitoring) prints the progress to the Docker log.

+
+### Multi-Platform Build / Support for Apple M1
+
+Browsertrix Crawler uses a browser image which supports amd64 and arm64 (currently `oldwebtoday/chrome:91`).
+
+This means Browsertrix Crawler can be built natively on Apple M1 systems using the default settings. Simply running `docker-compose build` on an Apple M1 should build a native version that works for development.
+
+On an M1 system, the browser used will be Chromium instead of Chrome, since there is no Linux build of Chrome for ARM; this is now handled automatically as part of the build.
+
+
+### Custom Browser Image
+
+It is also possible to build Browsertrix Crawler with a different browser image. Currently, browser images from `oldwebtoday/chrome` and `oldwebtoday/chromium` are supported.
+
+For example, Webrecorder provides the `oldwebtoday/chromium:91-arm` image for running Browsertrix Crawler on ARM-based systems.
+
+To build with this specific Chromium image on an Apple M1 machine, run:
+
+```
+docker-compose build --build-arg BROWSER_IMAGE_BASE=oldwebtoday/chromium --build-arg "BROWSER_VERSION=91-arm" --build-arg BROWSER_BIN=chromium-browser
+```
+
+The build arguments specify the base image, version and browser binary. This approach can also be used to install a different browser in general from any Debian-based Docker image. Additional browser images may be added in the future.
+
 ### Viewing crawled data with pywb

 When the crawler is done, another browsertrix-crawler image can be started with a local [pywb](https://github.com/webrecorder/pywb) instance to view the crawl:
crawler.js — 98 lines changed

@@ -2,7 +2,6 @@ const child_process = require("child_process");
 const path = require("path");
 const fs = require("fs");
 const fsp = require("fs/promises");
-const os = require("os");

 // to ignore HTTPS error for HEAD check
 const HTTPS_AGENT = require("https").Agent({

@@ -27,7 +26,9 @@ const TextExtract = require("./util/textextract");
 const { ScreenCaster } = require("./util/screencaster");
 const { parseArgs } = require("./util/argParser");

-const { BROWSER_BIN, BEHAVIOR_LOG_FUNC, HTML_TYPES } = require("./util/constants");
+const { getBrowserExe, loadProfile } = require("./util/browser");
+
+const { BEHAVIOR_LOG_FUNC, HTML_TYPES } = require("./util/constants");

 const { BlockRules } = require("./util/blockrules");

@@ -50,13 +51,20 @@ class Crawler {
     this.limitHit = false;

     this.userAgent = "";
-    this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));

-    this.params = parseArgs(this.profileDir);
+    this.params = parseArgs();
+
+    this.debugLogging = this.params.logging.includes("debug");
+
+    this.profileDir = loadProfile(this.params.profile);
+
+    if (this.params.profile) {
+      this.statusLog("With Browser Profile: " + this.params.profile);
+    }

     this.emulateDevice = this.params.emulateDevice;

-    console.log("Seeds", this.params.scopedSeeds);
+    this.debugLog("Seeds", this.params.scopedSeeds);

     this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
     this.capturePrefix = this.captureBasePrerix + "/id_/";

@@ -75,10 +83,19 @@ class Crawler {
     // pages file
     this.pagesFile = path.join(this.pagesDir, "pages.jsonl");

     this.blockRules = null;
   }

+  statusLog(...args) {
+    console.log(...args);
+  }
+
+  debugLog(...args) {
+    if (this.debugLogging) {
+      console.log(...args);
+    }
+  }
+
   configureUA() {
     // override userAgent
     if (this.params.userAgent) {

@@ -91,6 +108,8 @@ class Crawler {
       return;
     }

+    this.browserExe = getBrowserExe();
+
     // if device set, it overrides the default Chrome UA
     if (this.emulateDevice) {
       this.userAgent = this.emulateDevice.userAgent;

@@ -98,9 +117,9 @@ class Crawler {
     let version = process.env.BROWSER_VERSION;

     try {
-      version = child_process.execFileSync(BROWSER_BIN, ["--product-version"], {encoding: "utf8"}).trim();
+      version = child_process.execFileSync(this.browserExe, ["--product-version"], {encoding: "utf8"}).trim();
     } catch(e) {
-      console.log(e);
+      console.error(e);
     }

     this.userAgent = `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${version} Safari/537.36`;

@@ -178,7 +197,7 @@ class Crawler {
     // Puppeter Options
     return {
       headless: this.params.headless,
-      executablePath: BROWSER_BIN,
+      executablePath: this.browserExe,
       ignoreHTTPSErrors: true,
       args: this.chromeArgs,
       userDataDir: this.profileDir,

@@ -223,7 +242,7 @@ class Crawler {
       await page.emulate(this.emulateDevice);
     }

-    if (this.profileDir) {
+    if (this.params.profile) {
       await page._client.send("Network.setBypassServiceWorker", {bypass: true});
     }

@@ -290,7 +309,7 @@ class Crawler {
     try {
       this.driver = require(this.params.driver);
     } catch(e) {
-      console.log(e);
+      console.warn(e);
       return;
     }

@@ -309,12 +328,13 @@ class Crawler {

     await this.initPages();

-    if (this.params.blockRules) {
+    if (this.params.blockRules && this.params.blockRules.length) {
       this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage);
     }

     if (this.params.screencastPort) {
       this.screencaster = new ScreenCaster(this.cluster, this.params.screencastPort);
+      this.debugLog(`Screencast Server started on: ${this.params.screencastPort}`);
     }

     for (let i = 0; i < this.params.scopedSeeds.length; i++) {

@@ -344,13 +364,13 @@ class Crawler {
     }

     if (this.params.generateCDX) {
-      console.log("Generate CDX");
+      this.statusLog("Generating CDX");

       child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd});
     }

     if (this.params.generateWACZ) {
-      console.log("Generating WACZ");
+      this.statusLog("Generating WACZ");

       const archiveDir = path.join(this.collDir, "archive");

@@ -364,8 +384,8 @@ class Crawler {
       warcFileList.forEach((val, index) => argument_list.push(path.join(archiveDir, val))); // eslint-disable-line no-unused-vars

       // Run the wacz create command
-      child_process.spawnSync("wacz" , argument_list);
-      console.log(`WACZ successfully generated and saved to: ${waczPath}`);
+      child_process.spawnSync("wacz" , argument_list, {stdio: "inherit"});
+      this.debugLog(`WACZ successfully generated and saved to: ${waczPath}`);
     }
   }

@@ -400,7 +420,7 @@ class Crawler {
     try {
       await page.goto(url, this.gotoOpts);
     } catch (e) {
-      console.log(`Load timeout for ${url}`, e);
+      console.warn(`Load timeout for ${url}`, e);
     }

     if (selector) {

@@ -408,7 +428,7 @@ class Crawler {
     }
   }

-  async extractLinks(page, seedId, depth, selector = "a[href]") {
+  async extractLinks(page, seedId, depth, selector = "a[href]", prop = "href", isAttribute = false) {
     const results = [];

     const seed = this.params.scopedSeeds[seedId];

@@ -418,11 +438,18 @@ class Crawler {
       return;
     }

+    const loadProp = (selector, prop) => {
+      return [...document.querySelectorAll(selector)].map(elem => elem[prop]);
+    };
+
+    const loadAttr = (selector, attr) => {
+      return [...document.querySelectorAll(selector)].map(elem => elem.getAttribute(attr));
+    };
+
+    const loadFunc = isAttribute ? loadAttr : loadProp;
+
     try {
-      const linkResults = await Promise.allSettled(page.frames().map(frame => frame.evaluate((selector) => {
-        /* eslint-disable-next-line no-undef */
-        return [...document.querySelectorAll(selector)].map(elem => elem.href);
-      }, selector)));
+      const linkResults = await Promise.allSettled(page.frames().map(frame => frame.evaluate(loadFunc, selector, prop)));

       if (linkResults) {
         for (const linkResult of linkResults) {
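(A hypothetical usage sketch for the extended extractLinks() signature above, e.g. from a custom --driver script; the `data-href` selector/attribute and the driver wiring are illustrative assumptions, not part of this commit.)

```
// hypothetical custom driver: also queue links stored in a data attribute
module.exports = async ({data, page, crawler}) => {
  await crawler.loadPage(page, data);

  // new optional params: selector, property name, and whether to read it via getAttribute()
  await crawler.extractLinks(page, data.seedId, data.depth, "[data-href]", "data-href", true);
};
```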
@@ -452,7 +479,7 @@ class Crawler {
         }
       }
     } catch (e) {
-      console.log("Queuing Error: ", e);
+      console.error("Queuing Error: ", e);
     }
   }

@@ -486,19 +513,18 @@ class Crawler {
       if (createNew) {
         const header = {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"};
         if (this.params.text) {
-          console.log("creating pages with full text");
           header["hasText"] = true;
-        }
-        else{
-          console.log("creating pages without full text");
+          this.statusLog("Text Extraction: Enabled");
+        } else {
           header["hasText"] = false;
+          this.statusLog("Text Extraction: Disabled");
         }
         const header_formatted = JSON.stringify(header).concat("\n");
         await this.pagesFH.writeFile(header_formatted);
       }

     } catch(err) {
-      console.log("pages/pages.jsonl creation failed", err);
+      console.error("pages/pages.jsonl creation failed", err);
     }
   }

@@ -531,7 +557,7 @@ class Crawler {
         agent: this.resolveAgent
       });
       if (resp.status >= 400) {
-        console.log(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
+        this.debugLog(`Skipping HEAD check ${url}, invalid status ${resp.status}`);
         return true;
       }

@@ -564,7 +590,7 @@ class Crawler {
   }

   async awaitPendingClear() {
-    console.log("Waiting to ensure pending data is written to WARC...");
+    this.statusLog("Waiting to ensure pending data is written to WARCs...");

     const redis = new Redis("redis://localhost/0");

@@ -574,7 +600,7 @@ class Crawler {
         break;
       }

-      console.log(`Still waiting for ${res} pending requests to finish...`);
+      this.debugLog(`Still waiting for ${res} pending requests to finish...`);

       await this.sleep(1000);
     }

@@ -595,17 +621,17 @@ class Crawler {
       const { sites } = await sitemapper.fetch();
       this.queueUrls(seedId, sites, 0);
     } catch(e) {
-      console.log(e);
+      console.warn(e);
     }
   }

   async combineWARC() {
-    console.log("Combining the WARCs");
+    this.statusLog("Generating Combined WARCs");

     // Get the list of created Warcs
     const warcLists = await fsp.readdir(path.join(this.collDir, "archive"));

-    console.log(`Combining ${warcLists.length} WARCs...`);
+    this.debugLog(`Combining ${warcLists.length} WARCs...`);

     const fileSizeObjects = []; // Used to sort the created warc by fileSize

@@ -674,7 +700,7 @@ class Crawler {
         fh.write(warcBuffer);
       }

-      console.log(`Appending WARC ${fileSizeObjects[j].fileName}`);
+      this.debugLog(`Appending WARC ${fileSizeObjects[j].fileName}`);

       const reader = fs.createReadStream(fileSizeObjects[j].fileName);

@@ -691,7 +717,7 @@ class Crawler {
       await fh.end();
     }

-    console.log(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
+    this.debugLog(`Combined WARCs saved as: ${generatedCombinedWarcs}`);
   }
 }
create-login-profile.js

@@ -6,12 +6,11 @@ const child_process = require("child_process");
 const puppeteer = require("puppeteer-core");
 const yargs = require("yargs");

-const { BROWSER_BIN } = require("./util/constants");
+const { getBrowserExe, loadProfile, saveProfile } = require("./util/browser");

 const fs = require("fs");
 const path = require("path");
 const http = require("http");
-const url = require("url");
 const profileHTML = fs.readFileSync(path.join(__dirname, "screencast", "createProfile.html"), {encoding: "utf8"});

 function cliOpts() {

@@ -49,6 +48,17 @@ function cliOpts() {
       describe: "Start in interactive mode!",
       type: "boolean",
       default: false,
     },
+
+    "profile": {
+      describe: "Path to tar.gz file which will be extracted and used as the browser profile",
+      type: "string",
+    },
+
+    "windowSize": {
+      type: "string",
+      describe: "Browser window dimensions, specified as: width,height",
+      default: "1600,900"
+    }
   };
 }

@@ -77,10 +87,11 @@ async function main() {
   }

   //await new Promise(resolve => setTimeout(resolve, 2000));
+  const profileDir = loadProfile(params.profile);

   const args = {
     headless: !!params.headless,
-    executablePath: BROWSER_BIN,
+    executablePath: getBrowserExe(),
     ignoreHTTPSErrors: true,
     args: [
       "--no-xshm",

@@ -88,9 +99,11 @@ async function main() {
       "--disable-background-media-suspend",
       "--autoplay-policy=no-user-gesture-required",
       "--disable-features=IsolateOrigins,site-per-process",
-      "--user-data-dir=/tmp/profile",
       "--remote-debugging-port=9221",
-    ]
+      `--window-size=${params.windowSize}`
+    ],
+    userDataDir: profileDir,
     defaultViewport: null,
   };

   if (!params.user && !params.interactive) {

@@ -163,7 +176,8 @@ async function createProfile(params, browser, page) {

   const profileFilename = params.filename || "/output/profile.tar.gz";

-  child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: "/tmp/profile"});
+  saveProfile(profileFilename);
+
   console.log("done");
 }

@@ -199,16 +213,17 @@ function promptInput(msg, hidden = false) {

 async function handleInteractive(params, browser, page) {
   const target = page.target();
-  const targetUrl = `http://localhost:9222/devtools/inspector.html?ws=localhost:9222/devtools/page/${target._targetId}`;
+  const targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=localhost:9222/devtools/page/${target._targetId}&panel=resources`;

   console.log("Creating Profile Interactively...");
   child_process.spawn("socat", ["tcp-listen:9222,fork", "tcp:localhost:9221"]);

   const httpServer = http.createServer(async (req, res) => {
-    const pathname = url.parse(req.url).pathname;
+    const parsedUrl = new URL(req.url, `http://${req.headers.host}`);
+    const pathname = parsedUrl.pathname;
     if (pathname === "/") {
       res.writeHead(200, {"Content-Type": "text/html"});
-      res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl));
+      res.end(profileHTML.replace("$DEVTOOLS_SRC", targetUrl.replace("$HOST", parsedUrl.hostname)));

     } else if (pathname === "/createProfile" && req.method === "POST") {

@@ -234,7 +249,7 @@ async function handleInteractive(params, browser, page) {

   const port = 9223;
   httpServer.listen(port);
-  console.log(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with the browser, click 'Create Profile' when done.`);
+  console.log(`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`);
 }

 main();
docker-compose.yml

@@ -2,7 +2,7 @@ version: '3.5'

 services:
   crawler:
-    image: webrecorder/browsertrix-crawler:0.4.0
+    image: webrecorder/browsertrix-crawler:0.4.1
     build:
       context: ./
package.json

@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.4.0",
+  "version": "0.4.1",
   "main": "browsertrix-crawler",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
   "author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
screencast/createProfile.html

@@ -7,10 +7,16 @@ html, body, iframe {
   height: 100%;
   margin: 0;
   padding: 0;
+  border: 0;
   overflow: hidden;
+  font-family: sans-serif;
 }
+body {
+  display: flex;
+  flex-direction: column;
+}
 iframe#main {
   height: calc(100% - 36px);
 }
 div#info {
   margin: 8px;

@@ -18,11 +24,15 @@ div#info {
 form {
   display: inline;
 }
+button {
+  font-weight: bold;
+  font-size: 15px;
+}
 </style>
 </head>
 <body>
 <div id="info">
-  <b>Create Profile Interactively</b> -- Load any pages that you want to be part of the profile. When Done, Click <form action="/createProfile" method="post"><button type="submit">Create Profile</button></form>
+  Log in to any site(s) that you want to be part of the crawl profile using the embedded browser below. When done, click <form action="/createProfile" method="post"><button type="submit">Create Profile</button></form>
 </div>
 <iframe id="main" src="$DEVTOOLS_SRC"></iframe>
 </body>
(test: seed scoping via parseArgs)

@@ -12,7 +12,7 @@ function getSeeds(config) {
     return orig(name, ...args);
   };

-  return parseArgs(null, ["node", "crawler", "--config", "configtest"]).scopedSeeds;
+  return parseArgs(["node", "crawler", "--config", "configtest"]).scopedSeeds;
 }

 test("default scope", async () => {
util/argParser.js

@@ -1,6 +1,5 @@
 const path = require("path");
 const fs = require("fs");
-const child_process = require("child_process");

 const yaml = require("js-yaml");
 const puppeteer = require("puppeteer-core");

@@ -16,10 +15,6 @@ const { ScopedSeed } = require("./seeds");

 // ============================================================================
 class ArgParser {
-  constructor(profileDir) {
-    this.profileDir = profileDir;
-  }
-
   get cliOpts() {
     return {
       "seeds": {

@@ -50,7 +45,7 @@ class ArgParser {

       "waitUntil": {
         describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separate by ','",
-        default: "load,networkidle0",
+        default: "load,networkidle2",
       },

       "depth": {

@@ -195,6 +190,12 @@ class ArgParser {
         type: "string",
       },

+      "behaviorTimeout": {
+        describe: "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish.",
+        default: 90,
+        type: "number",
+      },
+
       "profile": {
         describe: "Path to tar.gz file which will be extracted and used as the browser profile",
         type: "string",

@@ -261,6 +262,9 @@ class ArgParser {
       argv.behaviors = argv.behaviors.split(",");
     }
     argv.behaviors.forEach((x) => behaviorOpts[x] = true);
+    if (argv.behaviorTimeout) {
+      behaviorOpts.timeout = argv.behaviorTimeout *= 1000;
+    }
     if (argv.logging.includes("behaviors")) {
       behaviorOpts.log = BEHAVIOR_LOG_FUNC;
     } else if (argv.logging.includes("behaviors-debug")) {

@@ -277,7 +281,8 @@ class ArgParser {
     case "page":
       argv.newContext = Cluster.CONCURRENCY_PAGE;
       if (argv.screencastPort && argv.workers > 1) {
-        console.warn("Note: Screencast with >1 workers and default page context may only show one page at a time. To fix, add '--newContext window' to open each page in a new window");
+        console.log("Note: to support screencasting with >1 workers, newContext set to 'window' instead of 'page'");
+        argv.newContext = NewWindowPage;
       }
       break;

@@ -348,15 +353,10 @@ class ArgParser {
       argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
     }

-    if (argv.profile) {
-      child_process.execSync("tar xvfz " + argv.profile, {cwd: this.profileDir});
-    }
-
     return true;
   }
 }


-module.exports.parseArgs = function(profileDir, argv) {
-  return new ArgParser(profileDir).parseArgs(argv);
+module.exports.parseArgs = function(argv) {
+  return new ArgParser().parseArgs(argv);
 };
util/blockrules.js

@@ -56,9 +56,19 @@ class BlockRules
   }

   async initPage(page) {
+    if (!this.rules.length) {
+      return;
+    }
+
     await page.setRequestInterception(true);

-    page.on("request", (request) => this.handleRequest(request));
+    page.on("request", async (request) => {
+      try {
+        await this.handleRequest(request);
+      } catch (e) {
+        console.warn(e);
+      }
+    });
   }

   async handleRequest(request) {
util/browser.js (new file) — 33 lines

@@ -0,0 +1,33 @@
+const child_process = require("child_process");
+const fs = require("fs");
+const path = require("path");
+const os = require("os");
+
+const profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
+
+module.exports.loadProfile = function(profileFilename) {
+  if (profileFilename) {
+    child_process.execSync("tar xvfz " + profileFilename, {cwd: profileDir});
+  }
+
+  return profileDir;
+};
+
+module.exports.saveProfile = function(profileFilename) {
+  child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: profileDir});
+};
+
+module.exports.getBrowserExe = function() {
+  const files = [process.env.BROWSER_BIN, "/usr/bin/google-chrome", "/usr/bin/chromium-browser"];
+  for (const file of files) {
+    if (file && fs.existsSync(file)) {
+      return file;
+    }
+  }
+
+  return null;
+};
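(A short usage sketch of the new util/browser.js helpers, mirroring how crawler.js and the profile-creation script wire them together in this commit; the tarball paths are placeholder values.)

```
const puppeteer = require("puppeteer-core");
const { getBrowserExe, loadProfile, saveProfile } = require("./util/browser");

async function example() {
  // extract an existing profile (if any) into the shared temp profile dir
  const userDataDir = loadProfile("/profiles/oldProfile.tar.gz");

  const browser = await puppeteer.launch({
    executablePath: getBrowserExe(),  // google-chrome on amd64, chromium-browser on arm64
    userDataDir,
    ignoreHTTPSErrors: true,
  });

  // ... browse, log in, adjust settings ...

  await browser.close();

  // re-pack the (possibly augmented) profile dir as a new tarball
  saveProfile("/output/newProfile.tar.gz");
}
```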
util/constants.js

@@ -2,5 +2,4 @@
 module.exports.HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
 module.exports.WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
 module.exports.BEHAVIOR_LOG_FUNC = "__bx_log";
-module.exports.BROWSER_BIN = process.env.BROWSER_BIN || "google-chrome";
util/screencaster.js

@@ -47,7 +47,6 @@ class ScreenCaster
     });

     this.httpServer.listen(port);
-    console.log(`Screencast Server started on: ${port}`);
   }

   initWebSocket(ws) {

@@ -65,7 +64,7 @@ class ScreenCaster
     }

     ws.on("close", () => {
-      console.log("Screencast WebSocket Disconnected");
+      //console.log("Screencast WebSocket Disconnected");
       this.allWS.delete(ws);

       if (this.allWS.size === 0) {

@@ -100,7 +99,7 @@ class ScreenCaster
       try {
         await cdp.send("Page.screencastFrameAck", {sessionId});
       } catch(e) {
-        console.log("Ack Failed, probably window/tab already closed", e);
+        //console.log("Ack Failed, probably window/tab already closed", e);
       }
     });
|
@ -33,7 +33,7 @@ class ScopedSeed
|
|||
parseUrl(url) {
|
||||
let parsedUrl = null;
|
||||
try {
|
||||
parsedUrl = new URL(url);
|
||||
parsedUrl = new URL(url.trim());
|
||||
} catch (e) {
|
||||
throw new Error(`Invalid Seed "${url}" - not a valid URL`);
|
||||
}
|
||||
|
|