Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00
Add Prettier to the repo, and format all the files! (#428)

This adds Prettier to the repo and sets up the pre-commit hook to auto-format as well as lint. It also updates the ignore files to exclude crawls, test-crawls, scratch, and dist as needed.
This commit is contained in:
parent af1e0860e4
commit 2a49406df7

70 changed files with 3192 additions and 2026 deletions
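For local development, the workflow this commit sets up looks roughly like the following (a sketch based on the scripts and hooks added in the hunks below; `yarn format` and `yarn lint:fix` are defined in the package.json changes further down):

```sh
# One-time setup: installing dependencies also runs the "prepare" script
# ("husky install"), which wires up the git hooks in .husky/.
yarn install

# Check formatting without modifying files (what CI runs as `yarn format`).
yarn format

# Rewrite files with Prettier, then auto-fix ESLint issues; this is the
# same `yarn lint:fix` command the pre-commit hook now runs.
yarn lint:fix
```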
ESLint config:

@@ -5,7 +5,11 @@ module.exports = {
     node: true,
     jest: true,
   },
-  extends: ["eslint:recommended", "plugin:@typescript-eslint/recommended"],
+  extends: [
+    "eslint:recommended",
+    "plugin:@typescript-eslint/recommended",
+    "prettier",
+  ],
   parser: "@typescript-eslint/parser",
   plugins: ["@typescript-eslint"],
   parserOptions: {
@@ -13,10 +17,6 @@ module.exports = {
     sourceType: "module",
   },
   rules: {
-    indent: ["error", 2],
-    "linebreak-style": ["error", "unix"],
-    quotes: ["error", "double"],
-    semi: ["error", "always"],
     "no-constant-condition": ["error", { checkLoops: false }],
     "no-use-before-define": [
       "error",
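The `"prettier"` entry appended to `extends` is `eslint-config-prettier`, added to devDependencies later in this commit. It disables ESLint's stylistic rules so they cannot conflict with Prettier, which is why `indent`, `linebreak-style`, `quotes`, and `semi` are deleted here: Prettier now owns formatting. As a sketch, recent versions of eslint-config-prettier also ship a small CLI to confirm no conflicting rules remain enabled:

```sh
# Prints any ESLint rules in the resolved config that fight with Prettier.
npx eslint-config-prettier src/crawler.ts
```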
.github/workflows/ci.yaml (vendored, 51 lines changed)
@@ -6,7 +6,6 @@ on:
 
 jobs:
   lint:
-
     runs-on: ubuntu-latest
 
     strategy:
@@ -14,18 +13,17 @@ jobs:
         node-version: [18.x]
 
     steps:
       - uses: actions/checkout@v3
       - name: Use Node.js ${{ matrix.node-version }}
         uses: actions/setup-node@v3
         with:
           node-version: ${{ matrix.node-version }}
       - name: install requirements
         run: yarn install
       - name: run linter
-        run: yarn lint
+        run: yarn lint && yarn format
 
   build:
-
     runs-on: ubuntu-latest
 
     strategy:
@@ -33,21 +31,16 @@ jobs:
         node-version: [18.x]
 
     steps:
       - uses: actions/checkout@v3
       - name: Use Node.js ${{ matrix.node-version }}
         uses: actions/setup-node@v3
         with:
           node-version: ${{ matrix.node-version }}
       - name: install requirements
         run: yarn install
       - name: build js
         run: yarn run tsc
       - name: build docker
         run: docker-compose build
       - name: run jest
         run: sudo yarn test
-
-
-
-
-
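The lint job now fails on unformatted files as well as lint errors: `yarn format` runs `prettier . --check`, which exits non-zero when any file differs from Prettier's output. Reproducing the CI check locally is just:

```sh
yarn install
yarn lint && yarn format
```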
.github/workflows/release.yaml (vendored, 22 lines changed)
@@ -8,12 +8,10 @@ jobs:
     name: Build x86 and ARM Images and push to Dockerhub
     runs-on: ubuntu-22.04
     steps:
-      -
-        name: Check out the repo
+      - name: Check out the repo
         uses: actions/checkout@v4
 
-      -
-        name: Docker image metadata
+      - name: Docker image metadata
         id: meta
         uses: docker/metadata-action@v5
         with:
@@ -21,23 +19,19 @@ jobs:
           tags: |
             type=semver,pattern={{version}}
 
-      -
-        name: Set up QEMU
+      - name: Set up QEMU
         uses: docker/setup-qemu-action@v3
         with:
           platforms: arm64
 
-      -
-        name: Set up Docker Buildx
+      - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v1
-      -
-        name: Login to DockerHub
+      - name: Login to DockerHub
         uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
-      -
-        name: Build and push
+      - name: Build and push
         id: docker_build
         uses: docker/build-push-action@v3
         with:
@@ -45,7 +39,5 @@ jobs:
           push: true
           tags: ${{ steps.meta.outputs.tags }}
           platforms: "linux/amd64,linux/arm64"
-      -
-        name: Image digest
+      - name: Image digest
         run: echo ${{ steps.docker_build.outputs.digest }}
-
.gitignore (vendored, 1 line changed)
@@ -6,3 +6,4 @@ node_modules/
 crawls/
 test-crawls/
 .DS_Store
+dist
.husky/pre-commit:

@@ -1,4 +1,4 @@
 #!/usr/bin/env sh
 . "$(dirname -- "$0")/_/husky.sh"
 
-yarn lint
+yarn lint:fix
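The hook switches from merely checking (`yarn lint`) to auto-fixing. Given the scripts defined in package.json below, `yarn lint:fix` expands to roughly:

```sh
# what `yarn lint:fix` runs under the hood
prettier . --write   # format:fix
eslint . --fix       # then auto-fix remaining lint issues
```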
.pre-commit-config.yaml:

@@ -1,8 +1,8 @@
 repos:
   - repo: local
     hooks:
       - id: husky-run-pre-commit
         name: husky
         language: system
         entry: .husky/pre-commit
         pass_filenames: false
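For contributors using the [pre-commit](https://pre-commit.com/) framework rather than raw git hooks, this local hook just delegates to the same husky script, so both entry points stay in sync; `pass_filenames: false` means the script checks the whole tree itself instead of receiving a staged-file list. Assuming pre-commit is installed, the hook can be exercised directly:

```sh
pre-commit run --all-files
```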
.prettierignore (new file, 1 line)

@@ -0,0 +1 @@
+dist
CHANGES.md (18 lines changed)
@@ -1,11 +1,13 @@
 ## CHANGES
 
 v0.8.1
 
 - Logging and Behavior Tweaks by @ikreymer in https://github.com/webrecorder/browsertrix-crawler/pull/229
 - Fix typos by @stavares843 in https://github.com/webrecorder/browsertrix-crawler/pull/232
 - Add crawl log to WACZ by @ikreymer in https://github.com/webrecorder/browsertrix-crawler/pull/231
 
 v0.8.0
 
 - Switch to Chrome/Chromium 109
 - Convert to ESM module
 - Add ad blocking via request interception (#173)
@@ -25,11 +27,13 @@ v0.8.0
 - update behaviors to 0.4.1, rename 'Behavior line' -> 'Behavior log' by @ikreymer in https://github.com/webrecorder/browsertrix-crawler/pull/223
 
 v0.7.1
 
 - Fix for warcio.js by @ikreymer in #178
 - Guard against pre-existing user/group by @edsu in #176
 - Fix incorrect combineWARCs property in README.md by @Georift in #180
 
 v0.7.0
 
 - Update to Chrome/Chromium 101 - (0.7.0 Beta 0) by @ikreymer in #144
 - Add --netIdleWait, bump dependencies (0.7.0-beta.2) by @ikreymer in #145
 - Update README.md by @atomotic in #147
@@ -41,7 +45,6 @@ v0.7.0
 - Interrupt Handling Fixes by @ikreymer in #167
 - Run in Docker as User by @edsu in #171
 
-
 v0.6.0
 
 - Add a --waitOnDone option, which has browsertrix crawler wait when finished (for use with Browsertrix Cloud)
@@ -56,8 +59,8 @@ v0.6.0
 - Fixes to interrupting a single instance in a shared state crawl
 - force all cookies, including session cookies, to fixed duration in days, configurable via --cookieDays
 
 
 v0.5.0
 
 - Scope: support for `scopeType: domain` to include all subdomains and ignoring 'www.' if specified in the seed.
 - Profiles: support loading remote profile from URL as well as local file
 - Non-HTML Pages: Load non-200 responses in browser, even if non-html, fix waiting issues with non-HTML pages (eg. PDFs)
@@ -75,8 +78,8 @@ v0.5.0
 - Signing: Support for optional signing of WACZ
 - Dependencies: update to latest pywb, wacz and browsertrix-behaviors packages
 
 
 v0.4.4
 
 - Page Block Rules Fix: 'request already handled' errors by avoiding adding duplicate handlers to same page.
 - Page Block Rules Fix: await all continue/abort() calls and catch errors.
 - Page Block Rules: Don't apply to top-level page, print warning and recommend scope rules instead.
@@ -86,11 +89,13 @@ v0.4.4
 - README: Update old type -> scopeType, list new scope types.
 
 v0.4.3
 
 - BlockRules Fixes: When considering the 'inFrameUrl' for a navigation request for an iframe, use URL of parent frame.
 - BlockRules Fixes: Always allow pywb proxy scripts.
 - Logging: Improved debug logging for block rules (log blocked requests and conditional iframe requests) when 'debug' set in 'logging'
 
 v0.4.2
 
 - Compose/docs: Build latest image by default, update README to refer to latest image
 - Fix typo in `crawler.capturePrefix` that resulted in `directFetchCapture()` always failing
 - Tests: Update all tests to use `test-crawls` directory
@@ -98,6 +103,7 @@ v0.4.2
 - loadPage() accepts a list of selector options with selector, extract, and isAttribute settings for further customization of link extraction
 
 v0.4.1
 
 - BlockRules Optimizations: don't intercept requests if no blockRules
 - Profile Creation: Support extending existing profile by passing a --profile param to load on startup
 - Profile Creation: Set default window size to 1600x900, add --windowSize param for setting custom size
@@ -107,6 +113,7 @@ v0.4.1
 - CI: Build a multi-platform (amd64 and arm64) image on each release
 
 v0.4.0
 
 - YAML based config, specifyable via --config property or via stdin (with '--config stdin')
 - Support for different scope types ('page', 'prefix', 'host', 'any', 'none') + crawl depth at crawl level
 - Per-Seed scoping, including different scope types, or depth and include/exclude rules configurable per seed in 'seeds' list via YAML config
@@ -120,16 +127,17 @@ v0.4.0
 - Update to latest pywb (2.5.0b4), browsertrix-behaviors (0.2.3), py-wacz (0.3.1)
 
 v0.3.2
 
 - Added a `--urlFile` option: Allows users to specify a .txt file list of exact URLs to crawl (one URL per line).
 
 
 v0.3.1
 
 - Improved shutdown wait: Instead of waiting for 5 secs, wait until all pending requests are written to WARCs
 - Bug fix: Use async APIs for combine WARC to avoid spurious issues with multiple crawls
 - Behaviors Update to Behaviors to 0.2.1, with support for facebook pages
 
 
 v0.3.0
 
 - WARC Combining: `--combineWARC` and `--rolloverSize` flags for generating combined WARC at end of crawl, each WARC upto specified rolloverSize
 - Profiles: Support for creating reusable browser profiles, stored as tarballs, and running crawl with a login profile (see README for more info)
 - Behaviors: Switch to Browsertrix Behaviors v0.1.1 for in-page behaviors
README.md (34 lines changed)
@@ -51,7 +51,6 @@ Browsertrix Crawler includes a number of additional command-line options, explai
 
 ## Crawling Configuration Options
 
-
 <details>
 <summary><b>The Browsertrix Crawler docker image currently accepts the following parameters:</b></summary>
 
@@ -269,8 +268,8 @@ Options:
                                  ess (for debugging)       [boolean]
   --config                       Path to YAML config file
 ```
-</details>
 
+</details>
 
 ### Waiting for Page Load
 
@@ -282,14 +281,12 @@ See [page.goto waitUntil options](https://pptr.dev/api/puppeteer.page.goto#remar
 
 The `--pageLoadTimeout`/`--timeout` option sets the timeout in seconds for page load, defaulting to 90 seconds. Behaviors will run on the page once either the page load condition or the page load timeout is met, whichever happens first.
 
-
 ### YAML Crawl Config
 
 Browsertix Crawler supports the use of a yaml file to set parameters for a crawl. This can be used by passing a valid yaml file to the `--config` option.
 
 The YAML file can contain the same parameters as the command-line arguments. If a parameter is set on the command-line and in the yaml file, the value from the command-line will be used. For example, the following should start a crawl with config in `crawl-config.yaml`.
 
-
 ```
 docker run -v $PWD/crawl-config.yaml:/app/crawl-config.yaml -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --config /app/crawl-config.yaml
 ```
@@ -300,7 +297,6 @@ The config can also be passed via stdin, which can simplify the command. Note th
 cat ./crawl-config.yaml | docker run -i -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --config stdin
 ```
 
-
 An example config file (eg. crawl-config.yaml) might contain:
 
 ```
@@ -361,7 +357,6 @@ To make this configuration as simple as possible, there are several predefined s
 The scope settings for multi-page crawls (page-spa, prefix, host, domain) also include http/https versions, eg. given a prefix of `http://example.com/path/`,
 `https://example.com/path/` is also included.
 
-
 #### Custom Scope Inclusion Rules
 
 Instead of setting a scope type, it is possible to instead configure custom scope regex by setting `--include` config to one or more regular expressions.
@@ -375,7 +370,6 @@ In addition to the inclusion rules, Browsertrix Crawler supports a separate list
 
 The exclusion regexes are often used with a custom scope, but could be used with a predefined scopeType as well.
 
-
 #### Extra 'Hops' Beyond Current Scope
 
 Occasionally, it may be useful to augment the scope by allowing extra links N 'hops' beyond the current scope.
@@ -385,7 +379,6 @@ For example, this is most useful when crawling with a `host` or `prefix` scope,
 The `--extraHops` setting can be set globally or per seed to allow expanding the current inclusion scope N 'hops' beyond the configured scope. Note that this mechanism only expands the inclusion scope, and any exclusion rules are still applied. If a URL is to be excluded via the exclusion rules,
 that will take precedence over the `--extraHops`.
 
-
 #### Scope Rule Examples
 
 For example, the following seed will start on `https://example.com/startpage.html` and crawl all pages on the `https://example.com/` domain, except pages that match the regexes `example.com/skip.*` or `example.com/search.*`
@@ -456,27 +449,24 @@ If the `--blockMessage` is also specified, a blocked URL is replaced with the sp
 
 If it seems confusing which rules should be used, here is a quick way to determine:
 
-- If you'd like to restrict *the pages that are being crawled*, use the crawl scope rules (defined above).
+- If you'd like to restrict _the pages that are being crawled_, use the crawl scope rules (defined above).
 
-- If you'd like to restrict *parts of a page* that are being loaded, use the page resource block rules described in this section.
+- If you'd like to restrict _parts of a page_ that are being loaded, use the page resource block rules described in this section.
 
 The blockRules add a filter to each URL loaded on a page and incur an extra overhead. They should only be used in advance uses cases where part of a page needs to be blocked.
 
 These rules can not be used to prevent entire pages for loading -- use the scope exclusion rules for that. (A warning will be printed if a page resource block rule matches a top-level page).
 
-
 ### Ad blocking
 
 With version 0.8.0, Browsertrix Crawler supports blocking ads from being loaded during capture based on [Stephen Black's list of known ad hosts](https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts). To enable ad blocking, use the `--blockAds` option. If `--adBlockMessage` is set, a record with the specified error message will be added in the ad's place.
 
-
 ### Custom Warcinfo Fields
 
 Custom fields can be added to the `warcinfo` WARC record, generated for each combined WARCs. The fields can be specified in the YAML config under `warcinfo` section or specifying individually via the command-line.
 
 For example, the following are equivalent ways to add additional warcinfo fields:
 
-
 via yaml config:
 
 ```yaml
@@ -622,7 +612,6 @@ docker run -e CHROME_FLAGS="--disable-extensions-except=/ext/ublock --load-exten
 
 You can also directly use extensions from an existing chrome-profile by using e.g. `~/.config/chromium/Default/Extensions/cjpalhdlnbpafiamejdnhcphjbkeiagm/1.41.8_0/` as the path.
 
-
 ## Saving Crawl State: Interrupting and Restarting the Crawl
 
 With version 0.5.0, a crawl can be gracefully interrupted with Ctrl-C (SIGINT) or a SIGTERM.
@@ -642,13 +631,11 @@ or `never` respectively, to control when the crawl state file should be written.
 When the `--saveState` is set to always, Browsertrix Crawler will also save the state automatically during the crawl, as set by the `--saveStateInterval` setting.
 When The crawler will keep the last `--saveStateHistory` save states and delete older ones. This provides extra backup, in case the crawl fails unexpectedly, or is not terminated via Ctrl-C, several previous crawl states are still available.
 
-
 ## Creating and Using Browser Profiles
 
 Browsertrix Crawler also includes a way to use existing browser profiles when running a crawl. This allows pre-configuring the browser, such as by logging in
 to certain sites or setting other settings, and running a crawl exactly with those settings. By creating a logged in profile, the actual login credentials are not included in the crawl, only (temporary) session cookies.
 
-
 ### Interactive Profile Creation
 
 For creating profiles of more complex sites, or logging in to multiple sites at once, the interactive profile creation mode can be used.
@@ -719,7 +706,6 @@ The script will then prompt you for login credentials, attempt to login and crea
 
 - To specify the window size for the profile creation embedded browser, specify `--windowSize WIDTH,HEIGHT`. (The default is 1600x900)
 
-
 The current profile creation script is still experimental and the script attempts to detect the username and password fields on a site as generically as possible, but may not work for all sites. Additional automated profile creation functionality, such as support for custom profile creation scripts, may be added in the future.
 
 ### Using Browser Profile with a Crawl
@@ -743,7 +729,6 @@ All released Docker Images are available from Docker Hub, listed by release tag
 
 Details for each corresponding release tag are also available on GitHub at: https://github.com/webrecorder/browsertrix-crawler/releases
 
-
 ## Architecture
 
 The Docker container provided here packages up several components used in Browsertrix.
@@ -752,7 +737,6 @@ The system uses `pywb` in recording mode for capturing the content. The crawl pr
 
 To access the contents of the crawl, the `/crawls` directory in the container should be mounted to a volume (default in the Docker Compose setup).
 
-
 ### Usage with Docker Compose
 
 Many examples in this README demonstrate running Browsertrix Crawler with `docker run`.
@@ -775,10 +759,8 @@ docker-compose run crawler crawl --url https://webrecorder.net/ --generateCDX --
 
 In this example, the crawl data is written to `./crawls/collections/wr-net` by default.
 
-
 While the crawl is running, the status of the crawl prints the progress to the JSON log output. This can be disabled by using the `--logging` option and not including `stats`.
 
-
 ### Multi-Platform Build / Support for Apple Silicon (M1/M2)
 
 Browsertrix Crawler uses a browser image which supports amd64 and arm64.
@@ -787,7 +769,6 @@ This means Browsertrix Crawler can be built natively on Apple Silicon systems us
 
 On an Apple Silicon system, the browser used will be Chromium instead of Chrome since there is no Linux build of Chrome for ARM, and this now is handled automatically as part of the build. Note that Chromium is different than Chrome, and for example, some video codecs may not be supported in the ARM / Chromium-based version that would be in the amd64 / Chrome version. For production crawling, it is recommended to run on an amd64 Linux environment.
 
-
 ### Modifying Browser Image
 
 It is also possible to build Browsertrix Crawler with a different browser image. Currently, browser images using Chrome/Chromium (depending on host system chip architecture) and Brave Browser are supported via [browsertrix-browser-base](https://github.com/webrecorder/browsertrix-browser-base).
@@ -796,7 +777,6 @@ The browser base image used is specified and can be changed at the top of the Do
 
 Custom browser images can be used by forking [browsertrix-browser-base](https://github.com/webrecorder/browsertrix-browser-base), locally building or publishing an image, and then modifying the Dockerfile in this repo to build from that image.
 
-
 ### Viewing crawled data with pywb
 
 When a crawler is done, another browsertrix-crawler image can be started with a local [pywb](https://github.com/webrecorder/pywb) instance to view crawl:
@@ -809,17 +789,13 @@ Then, loading the `http://localhost:8080/wr-net/https://webrecorder.net/` should
 
 (Previewing crawl results while a crawl its still running should also be possible soon!)
 
-Support
--------
+## Support
 
 Initial support for development of Browsertrix Crawler, was provided by [Kiwix](https://kiwix.org/). The initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between. Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder.
 
 Additional support for Browsertrix Crawler, including for the development of the 0.4.x version has been provided by [Portico](https://www.portico.org/).
 
-
-License
--------
+## License
 
 [AGPLv3](https://www.gnu.org/licenses/agpl-3.0) or later, see
 [LICENSE](LICENSE) for more details.
docker-compose.yml:

@@ -1,17 +1,16 @@
-version: '3.5'
+version: "3.5"
 
 services:
   crawler:
     image: ${REGISTRY}webrecorder/browsertrix-crawler:latest
     build:
       context: ./
 
     volumes:
       - ./crawls:/crawls
 
     cap_add:
       - NET_ADMIN
       - SYS_ADMIN
 
-    shm_size: 1gb
-
+    shm_size: 1gb
Profile-creation HTML page:

@@ -1,39 +1,45 @@
 <!doctype html>
 <html>
   <head>
     <style>
-      html, body, iframe {
+      html,
+      body,
+      iframe {
         width: 100%;
         height: 100%;
         margin: 0;
         padding: 0;
         border: 0;
         overflow: hidden;
         font-family: sans-serif;
       }
       body {
         display: flex;
         flex-direction: column;
       }
       iframe#main {
         height: calc(100% - 36px);
       }
       div#info {
         margin: 8px;
       }
       form {
         display: inline;
       }
       button {
         font-weight: bold;
         font-size: 15px;
       }
     </style>
   </head>
   <body>
     <div id="info">
-      Log in to any site(s) that you want to be part of the crawl profile using the embedded browser below. When done, click <form action="/createProfile" method="post"><button type="submit">Create Profile</button></form>
+      Log in to any site(s) that you want to be part of the crawl profile using
+      the embedded browser below. When done, click
+      <form action="/createProfile" method="post">
+        <button type="submit">Create Profile</button>
+      </form>
     </div>
     <iframe id="main" src="$DEVTOOLS_SRC"></iframe>
   </body>
 </html>
Screencast viewer HTML page:

@@ -1,75 +1,79 @@
 <!doctype html>
 <html>
   <head>
     <style>
       #content {
         display: flex;
         flex-direction: row;
         flex-wrap: wrap;
       }
       #content img {
         width: 640px;
         height: 480px;
         margin: 2rem;
       }
     </style>
     <script>
-      const ws = new WebSocket(window.location.href.replace("http", "ws") + "ws");
+      const ws = new WebSocket(
+        window.location.href.replace("http", "ws") + "ws",
+      );
       ws.addEventListener("message", (event) => handleMessage(event.data));
 
       const unusedElems = [];
 
       function handleMessage(resp) {
         resp = JSON.parse(resp);
 
         switch (resp.msg) {
           case "screencast":
             img = createImage(resp.id);
             if (resp.data) {
               setImageData(img, resp.data);
             }
             break;
 
           case "close":
             img = unuseImage(resp.id);
             break;
         }
       }
 
       function setImageData(img, data) {
         //img.style.display = "";
         img.src = "data:image/png;base64," + data;
       }
 
       function createImage(id) {
         let elem = document.getElementById(id);
         if (elem) {
           return elem;
         }
 
         if (unusedElems.length) {
           elem = unusedElems.shift();
           elem.setAttribute("id", id);
           return elem;
         }
 
         elem = document.createElement("img");
         elem.setAttribute("id", id);
         document.getElementById("content").appendChild(elem);
         return elem;
       }
 
       function unuseImage(id) {
         const elem = document.getElementById(id);
         if (!elem) {
           return;
         }
         //elem.style.display = "none";
         unusedElems.push(elem);
       }
     </script>
   <head>
   <body>
-    <div id="content">
-    </div>
+    <div id="content"></div>
   </body>
 </head>
 </html>
noVNC lite client HTML page:

@@ -1,7 +1,6 @@
-<!DOCTYPE html>
+<!doctype html>
 <html lang="en">
   <head>
-
     <!--
     noVNC example: lightweight example using minimal UI and features
 
@@ -16,180 +15,180 @@
     -->
     <title>noVNC</title>
 
-    <meta charset="utf-8">
+    <meta charset="utf-8" />
 
     <style>
       body {
         margin: 0;
         background-color: dimgrey;
         height: 100%;
         display: flex;
         flex-direction: column;
       }
       html {
         height: 100%;
       }
 
       #top_bar {
         display: none;
         background-color: #6e84a3;
         color: white;
         font: bold 12px Helvetica;
         padding: 6px 5px 4px 5px;
         border-bottom: 1px outset;
       }
       #status {
         text-align: center;
       }
       #sendCtrlAltDelButton {
         display: none;
         position: fixed;
         top: 0px;
         right: 0px;
         border: 1px outset;
         padding: 5px 5px 4px 5px;
         cursor: pointer;
       }
 
       #screen {
         flex: 1; /* fill remaining space */
         overflow: hidden;
       }
     </style>
 
     <script type="module" crossorigin="anonymous">
       // RFB holds the API to connect and communicate with a VNC server
-      import RFB from './core/rfb.js';
+      import RFB from "./core/rfb.js";
 
       let rfb;
       let desktopName;
 
       // When this function is called we have
       // successfully connected to a server
       function connectedToServer(e) {
         status("Connected to " + desktopName);
       }
 
       // This function is called when we are disconnected
       function disconnectedFromServer(e) {
         if (e.detail.clean) {
           status("Disconnected, retrying...");
           setTimeout(connect, 2000);
         } else {
           status("Something went wrong, connection is closed");
         }
       }
 
       // When this function is called, the server requires
       // credentials to authenticate
       function credentialsAreRequired(e) {
         const password = prompt("Password Required:");
         rfb.sendCredentials({ password: password });
       }
 
       // When this function is called we have received
       // a desktop name from the server
       function updateDesktopName(e) {
         desktopName = e.detail.name;
       }
 
       // Since most operating systems will catch Ctrl+Alt+Del
       // before they get a chance to be intercepted by the browser,
       // we provide a way to emulate this key sequence.
       function sendCtrlAltDel() {
         rfb.sendCtrlAltDel();
         return false;
       }
 
       // Show a status text in the top bar
       function status(text) {
-        document.getElementById('status').textContent = text;
+        document.getElementById("status").textContent = text;
       }
 
       // This function extracts the value of one variable from the
       // query string. If the variable isn't defined in the URL
       // it returns the default value instead.
       function readQueryVariable(name, defaultValue) {
         // A URL with a query parameter can look like this (But will most probably get logged on the http server):
         // https://www.example.com?myqueryparam=myvalue
         //
         // For privacy (Using a hastag #, the parameters will not be sent to the server)
         // the url can be requested in the following way:
         // https://www.example.com#myqueryparam=myvalue&password=secreatvalue
         //
         // Even Mixing public and non public parameters will work:
         // https://www.example.com?nonsecretparam=example.com#password=secreatvalue
         //
         // Note that we use location.href instead of location.search
         // because Firefox < 53 has a bug w.r.t location.search
-        const re = new RegExp('.*[?&]' + name + '=([^&#]*)'),
-              match = ''.concat(document.location.href, window.location.hash).match(re);
+        const re = new RegExp(".*[?&]" + name + "=([^&#]*)"),
+          match = ""
+            .concat(document.location.href, window.location.hash)
+            .match(re);
 
         if (match) {
           // We have to decode the URL since want the cleartext value
           return decodeURIComponent(match[1]);
         }
 
         return defaultValue;
       }
 
-      document.getElementById('sendCtrlAltDelButton')
-        .onclick = sendCtrlAltDel;
+      document.getElementById("sendCtrlAltDelButton").onclick = sendCtrlAltDel;
 
       // Read parameters specified in the URL query string
       // By default, use the host and port of server that served this file
-      const host = readQueryVariable('host', window.location.hostname);
-      let port = readQueryVariable('port', window.location.port);
-      const password = readQueryVariable('password');
-      const path = readQueryVariable('path', 'websockify');
+      const host = readQueryVariable("host", window.location.hostname);
+      let port = readQueryVariable("port", window.location.port);
+      const password = readQueryVariable("password");
+      const path = readQueryVariable("path", "websockify");
 
       // | | |         | | |
       // | | | Connect | | |
       // v v v         v v v
       function connect() {
         status("Connecting");
 
         // Build the websocket URL used to connect
         let url;
         if (window.location.protocol === "https:") {
-          url = 'wss';
+          url = "wss";
         } else {
-          url = 'ws';
+          url = "ws";
         }
-        url += '://' + host;
-        if(port) {
-          url += ':' + port;
+        url += "://" + host;
+        if (port) {
+          url += ":" + port;
         }
-        url += '/' + path;
+        url += "/" + path;
 
         // Creating a new RFB object will start a new connection
-        rfb = new RFB(document.getElementById('screen'), url,
-          { credentials: { password: password } });
+        rfb = new RFB(document.getElementById("screen"), url, {
+          credentials: { password: password },
+        });
 
         // Add listeners to important events from the RFB module
         rfb.addEventListener("connect", connectedToServer);
         rfb.addEventListener("disconnect", disconnectedFromServer);
         rfb.addEventListener("credentialsrequired", credentialsAreRequired);
         rfb.addEventListener("desktopname", updateDesktopName);
 
         // Set parameters that can be changed on an active connection
-        rfb.viewOnly = readQueryVariable('view_only', false);
-        rfb.scaleViewport = readQueryVariable('scale', false);
+        rfb.viewOnly = readQueryVariable("view_only", false);
+        rfb.scaleViewport = readQueryVariable("scale", false);
       }
 
       connect();
     </script>
   </head>
 
   <body>
     <div id="top_bar">
       <div id="status">Loading</div>
       <div id="sendCtrlAltDelButton">Send CtrlAltDel</div>
     </div>
     <div id="screen">
       <!-- This is where the remote screen will appear -->
     </div>
   </body>
 </html>
package.json:

@@ -8,7 +8,10 @@
   "license": "AGPL-3.0-or-later",
   "scripts": {
     "tsc": "tsc",
-    "lint": "eslint *.js tests/*.test.js",
+    "format": "prettier . --check",
+    "format:fix": "prettier . --write",
+    "lint": "eslint .",
+    "lint:fix": "yarn format:fix && eslint . --fix",
     "test": "yarn node --experimental-vm-modules $(yarn bin jest --bail 1)",
     "prepare": "husky install"
   },
@@ -40,9 +43,11 @@
     "@typescript-eslint/eslint-plugin": "^6.10.0",
     "@typescript-eslint/parser": "^6.10.0",
     "eslint": "^8.53.0",
+    "eslint-config-prettier": "^9.0.0",
     "eslint-plugin-react": "^7.22.0",
     "jest": "^29.2.1",
     "md5": "^2.3.0",
+    "prettier": "3.0.3",
     "typescript": "^5.2.2"
   },
   "jest": {
324 src/crawler.ts
@ -4,7 +4,13 @@ import fs, { WriteStream } from "fs";
import os from "os";
import fsp, { FileHandle } from "fs/promises";

import { RedisCrawlState, LoadState, QueueState, PageState, WorkerId } from "./util/state.js";
import {
  RedisCrawlState,
  LoadState,
  QueueState,
  PageState,
  WorkerId,
} from "./util/state.js";

import Sitemapper from "sitemapper";
import yaml from "js-yaml";

@ -13,7 +19,14 @@ import * as warcio from "warcio";
import { HealthChecker } from "./util/healthcheck.js";
import { TextExtractViaSnapshot } from "./util/textextract.js";
import { initStorage, getFileSize, getDirSize, interpolateFilename, checkDiskUtilization, S3StorageSync } from "./util/storage.js";
import {
  initStorage,
  getFileSize,
  getDirSize,
  interpolateFilename,
  checkDiskUtilization,
  S3StorageSync,
} from "./util/storage.js";
import { ScreenCaster, WSTransport } from "./util/screencaster.js";
import { Screenshots } from "./util/screenshots.js";
import { parseArgs } from "./util/argParser.js";

@ -25,7 +38,12 @@ import { collectAllFileSources } from "./util/file_reader.js";
import { Browser } from "./util/browser.js";

import { ADD_LINK_FUNC, BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } from "./util/constants.js";
import {
  ADD_LINK_FUNC,
  BEHAVIOR_LOG_FUNC,
  HTML_TYPES,
  DEFAULT_SELECTORS,
} from "./util/constants.js";

import { AdBlockRules, BlockRules } from "./util/blockrules.js";
import { OriginOverride } from "./util/originoverride.js";

@ -41,12 +59,23 @@ const HTTPS_AGENT = new HTTPSAgent({
const HTTP_AGENT = new HTTPAgent();

const behaviors = fs.readFileSync(new URL("../node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});
const behaviors = fs.readFileSync(
  new URL(
    "../node_modules/browsertrix-behaviors/dist/behaviors.js",
    import.meta.url,
  ),
  { encoding: "utf8" },
);

const FETCH_TIMEOUT_SECS = 30;
const PAGE_OP_TIMEOUT_SECS = 5;

const POST_CRAWL_STATES = ["generate-wacz", "uploading-wacz", "generate-cdx", "generate-warc"];
const POST_CRAWL_STATES = [
  "generate-wacz",
  "uploading-wacz",
  "generate-cdx",
  "generate-warc",
];

// eslint-disable-next-line @typescript-eslint/no-explicit-any
type LogDetails = Record<string, any>;

@ -62,7 +91,6 @@ type PageEntry = {
  favIconUrl?: string;
};

// ============================================================================
export class Crawler {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any

@ -128,8 +156,12 @@ export class Crawler {
  maxHeapUsed = 0;
  maxHeapTotal = 0;

  // eslint-disable-next-line no-use-before-define
  driver!: (opts: { page: Page; data: PageState; crawler: Crawler }) => NonNullable<unknown>;
  driver!: (opts: {
    page: Page;
    data: PageState;
    // eslint-disable-next-line no-use-before-define
    crawler: Crawler;
  }) => NonNullable<unknown>;

  constructor() {
    const res = parseArgs();
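
The driver field reshaped above is the crawl's pluggable per-page entry point: a driver module's default export receives the page, the page state, and the crawler itself. A minimal custom driver under that signature, modeled on the default driver's single loadPage call (the loose any types stand in for the crawler's own types and are an assumption of this sketch):

import { Page } from "puppeteer-core";

// Sketch of a custom driver module matching the driver!: (opts) => ... type.
export default async ({
  page,
  data,
  crawler,
}: {
  page: Page;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  data: any;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  crawler: any;
}) => {
  // run the standard load-and-extract flow for this page
  await crawler.loadPage(page, data);
};
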
@ -140,12 +172,12 @@ export class Crawler {
    this.collDir = path.join(
      this.params.cwd,
      "collections",
      this.params.collection
      this.params.collection,
    );
    this.logDir = path.join(this.collDir, "logs");
    this.logFilename = path.join(
      this.logDir,
      `crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`
      `crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`,
    );

    const debugLogging = this.params.logging.includes("debug");

@ -252,7 +284,7 @@ export class Crawler {

    if (!redisUrl.startsWith("redis://")) {
      logger.fatal(
        "stateStoreUrl must start with redis:// -- Only redis-based store currently supported"
        "stateStoreUrl must start with redis:// -- Only redis-based store currently supported",
      );
    }

@ -272,7 +304,7 @@ export class Crawler {
    logger.debug(
      `Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`,
      {},
      "state"
      "state",
    );

    logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state");

@ -281,7 +313,7 @@ export class Crawler {
      redis,
      this.params.crawlId,
      this.maxPageTime,
      os.hostname()
      os.hostname(),
    );

    // clear any pending URLs from this instance

@ -291,7 +323,7 @@ export class Crawler {
      logger.debug(
        `Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`,
        {},
        "state"
        "state",
      );
    }

@ -311,7 +343,7 @@ export class Crawler {
      logger.debug(
        `Screencast server started on: ${this.params.screencastPort}`,
        {},
        "screencast"
        "screencast",
      );
    }
    // } else if (this.params.redisStoreUrl && this.params.screencastRedis) {

@ -375,7 +407,7 @@ export class Crawler {
      logger.debug(`Clearing ${this.collDir} before starting`);
      try {
        fs.rmSync(this.collDir, { recursive: true, force: true });
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
      } catch (e: any) {
        logger.error(`Unable to clear ${this.collDir}`, e);
      }

@ -383,7 +415,7 @@ export class Crawler {

    if (this.params.customBehaviors) {
      this.customBehaviors = this.loadCustomBehaviors(
        this.params.customBehaviors
        this.params.customBehaviors,
      );
    }

@ -445,7 +477,7 @@ export class Crawler {
          exitCode = 11;
        }
      }
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    } catch (e: any) {
      logger.error("Crawl failed", e);
      exitCode = 9;

@ -461,7 +493,7 @@ export class Crawler {
  _behaviorLog(
    { data, type }: { data: string; type: string },
    pageUrl: string,
    workerid: WorkerId
    workerid: WorkerId,
  ) {
    let behaviorLine;
    let message;

@ -481,21 +513,21 @@ export class Crawler {
    }

    switch (type) {
      case "info":
        behaviorLine = JSON.stringify(data);
        if (behaviorLine !== this.behaviorLastLine) {
          logger.info(message, details, "behaviorScript");
          this.behaviorLastLine = behaviorLine;
        }
        break;

      case "error":
        logger.error(message, details, "behaviorScript");
        break;

      case "debug":
      default:
        logger.debug(message, details, "behaviorScript");
    }
  }

@ -506,7 +538,7 @@ export class Crawler {
      depth,
      extraHops,
    }: { seedId: number; url: string; depth: number; extraHops: number },
    logDetails = {}
    logDetails = {},
  ) {
    const seed = this.params.scopedSeeds[seedId];

@ -553,7 +585,7 @@ export class Crawler {
        logger.warn(
          msg.text(),
          { location: msg.location(), page: page.url(), workerid },
          "jsError"
          "jsError",
        );
      }
    });

@ -562,7 +594,7 @@ export class Crawler {
      logger.warn(
        "Page Error",
        { ...errJSON(e), page: page.url(), workerid },
        "jsError"
        "jsError",
      );
    });
  }

@ -574,14 +606,14 @@ export class Crawler {

    await page.exposeFunction(
      ADD_LINK_FUNC,
      (url: string) => callbacks.addLink && callbacks.addLink(url)
      (url: string) => callbacks.addLink && callbacks.addLink(url),
    );

    if (this.params.behaviorOpts) {
      await page.exposeFunction(
        BEHAVIOR_LOG_FUNC,
        (logdata: { data: string; type: string }) =>
          this._behaviorLog(logdata, page.url(), workerid)
          this._behaviorLog(logdata, page.url(), workerid),
      );
      await this.browser.addInitScript(page, behaviors);

@ -622,7 +654,7 @@ self.__bx_behaviors.selectMainBehavior();
      }
      logger.warn(
        "Failed to fetch favicon from browser /json endpoint",
        logDetails
        logDetails,
      );
      return "";
    }

@ -645,7 +677,7 @@ self.__bx_behaviors.selectMainBehavior();
        "HEAD request to determine if URL is HTML page timed out",
        logDetails,
        "fetch",
        true
        true,
      );

      if (!data.isHTMLPage && directFetchCapture) {

@ -656,7 +688,7 @@ self.__bx_behaviors.selectMainBehavior();
          "Direct fetch capture attempt timed out",
          logDetails,
          "fetch",
          true
          true,
        );
        if (fetched) {
          data.loadState = LoadState.FULL_PAGE_LOADED;

@ -666,7 +698,7 @@ self.__bx_behaviors.selectMainBehavior();
          logger.info(
            "Direct fetch successful",
            { url, ...logDetails },
            "fetch"
            "fetch",
          );
          return true;
        }
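
Both timed-out branches above guard a fetch behind a timeout plus a log message, with the trailing true asking for a quiet timeout instead of a thrown error. A sketch of such a guard built on Promise.race (the real helper lives in util/timing.js; the signature here is assumed, not quoted):

// Sketch: resolve to undefined (and log) if `promise` outlasts `seconds`.
async function timedRunSketch<T>(
  promise: Promise<T>,
  seconds: number,
  message: string,
): Promise<T | undefined> {
  let timer: NodeJS.Timeout | undefined;
  const timeout = new Promise<undefined>((resolve) => {
    timer = setTimeout(() => {
      console.warn(message); // the crawler routes this through its logger
      resolve(undefined);
    }, seconds * 1000);
  });
  try {
    return await Promise.race([promise, timeout]);
  } finally {
    if (timer !== undefined) {
      clearTimeout(timer);
    }
  }
}
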
@ -714,7 +746,7 @@ self.__bx_behaviors.selectMainBehavior();
      const { changed, text } = await textextract.extractAndStoreText(
        "text",
        false,
        this.params.text.includes("to-warc")
        this.params.text.includes("to-warc"),
      );

      if (changed && text && this.params.text.includes("to-pages")) {

@ -729,7 +761,7 @@ self.__bx_behaviors.selectMainBehavior();
      logger.debug(
        "Skipping behaviors for non-HTML page",
        logDetails,
        "behavior"
        "behavior",
      );
    } else if (data.skipBehaviors) {
      logger.info("Skipping behaviors for slow page", logDetails, "behavior");

@ -739,7 +771,7 @@ self.__bx_behaviors.selectMainBehavior();
        this.params.behaviorTimeout,
        "Behaviors timed out",
        logDetails,
        "behavior"
        "behavior",
      );

      await this.netIdle(page, logDetails);

@ -757,7 +789,7 @@ self.__bx_behaviors.selectMainBehavior();
    if (this.params.pageExtraDelay) {
      logger.info(
        `Waiting ${this.params.pageExtraDelay} seconds before moving on to next page`,
        logDetails
        logDetails,
      );
      await sleep(this.params.pageExtraDelay);
    }

@ -784,7 +816,7 @@ self.__bx_behaviors.selectMainBehavior();
      logger.warn(
        "Page Load Failed",
        { loadState, ...logDetails },
        "pageStatus"
        "pageStatus",
      );

      await this.crawlState.markFailed(data.url);

@ -816,7 +848,7 @@ self.__bx_behaviors.selectMainBehavior();
    page: Page,
    cdp: CDPSession,
    frames: Frame[],
    logDetails: LogDetails
    logDetails: LogDetails,
  ) {
    try {
      frames = frames || page.frames();

@ -828,7 +860,7 @@ self.__bx_behaviors.selectMainBehavior();
          frameUrls: frames.map((frame) => frame.url()),
          ...logDetails,
        },
        "behavior"
        "behavior",
      );

      const results = await Promise.allSettled(

@ -844,9 +876,9 @@ self.__bx_behaviors.selectMainBehavior();
            self.__bx_behaviors.run();
          }`,
            logDetails,
            "behavior"
            "behavior",
          )
          ),
        )
        ),
      );

      for (const res of results) {

@ -855,7 +887,7 @@ self.__bx_behaviors.selectMainBehavior();
          logger.warn(
            "Behavior run partially failed",
            { reason, ...logDetails },
            "behavior"
            "behavior",
          );
        }
      }

@ -863,14 +895,14 @@ self.__bx_behaviors.selectMainBehavior();
      logger.info(
        "Behaviors finished",
        { finished: results.length, ...logDetails },
        "behavior"
        "behavior",
      );
      return true;
    } catch (e) {
      logger.warn(
        "Behavior run failed",
        { ...errJSON(e), ...logDetails },
        "behavior"
        "behavior",
      );
      return false;
    }

@ -886,14 +918,14 @@ self.__bx_behaviors.selectMainBehavior();
      // this is all designed to detect and skip PDFs, and other frames that are actually EMBEDs
      // if there's no tag or an iframe tag, then assume its a regular frame
      const tagName = await frame.evaluate(
        "self && self.frameElement && self.frameElement.tagName"
        "self && self.frameElement && self.frameElement.tagName",
      );

      if (tagName && tagName !== "IFRAME" && tagName !== "FRAME") {
        logger.debug(
          "Skipping processing non-frame object",
          { tagName, frameUrl, ...logDetails },
          "behavior"
          "behavior",
        );
        return null;
      }

@ -910,7 +942,7 @@ self.__bx_behaviors.selectMainBehavior();
      logger.debug(
        "Skipping processing frame",
        { frameUrl, ...logDetails },
        "behavior"
        "behavior",
      );
    }

@ -921,13 +953,13 @@ self.__bx_behaviors.selectMainBehavior();
    const packageFileJSON = JSON.parse(
      await fsp.readFile(new URL("../package.json", import.meta.url), {
        encoding: "utf-8",
      })
      }),
    );
    const warcioPackageJSON = JSON.parse(
      await fsp.readFile(
        new URL("../node_modules/warcio/package.json", import.meta.url),
        { encoding: "utf-8" }
        { encoding: "utf-8" },
      )
      ),
    );

    return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version})`;

@ -945,7 +977,7 @@ self.__bx_behaviors.selectMainBehavior();
    const warcInfo = { ...info, ...this.params.warcInfo };
    const record = await warcio.WARCRecord.createWARCInfo(
      { filename, type, warcVersion },
      warcInfo
      warcInfo,
    );
    const buffer = await warcio.WARCSerializer.serialize(record, {
      gzip: true,

@ -964,7 +996,7 @@ self.__bx_behaviors.selectMainBehavior();
    if (this.params.sizeLimit) {
      if (size >= this.params.sizeLimit) {
        logger.info(
          `Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`
          `Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`,
        );
        interrupt = true;
      }

@ -974,7 +1006,7 @@ self.__bx_behaviors.selectMainBehavior();
      const elapsed = secondsElapsed(this.startTime);
      if (elapsed >= this.params.timeLimit) {
        logger.info(
          `Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`
          `Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`,
        );
        interrupt = true;
      }

@ -992,7 +1024,7 @@ self.__bx_behaviors.selectMainBehavior();
      const numFailed = this.crawlState.numFailed();
      if (numFailed >= this.params.failOnFailedLimit) {
        logger.fatal(
          `Failed threshold reached ${numFailed} >= ${this.params.failedLimit}, failing crawl`
          `Failed threshold reached ${numFailed} >= ${this.params.failedLimit}, failing crawl`,
        );
      }
    }

@ -1060,14 +1092,14 @@ self.__bx_behaviors.selectMainBehavior();
    if (this.params.healthCheckPort) {
      this.healthChecker = new HealthChecker(
        this.params.healthCheckPort,
        this.params.workers
        this.params.workers,
      );
    }

    try {
      const driverUrl = new URL(this.params.driver, import.meta.url);
      this.driver = (await import(driverUrl.href)).default;
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    } catch (e: any) {
      logger.warn(`Error importing driver ${this.params.driver}`, e);
      return;

@ -1125,7 +1157,7 @@ self.__bx_behaviors.selectMainBehavior();
      await this.crawlState.load(
        this.params.state,
        this.params.scopedSeeds,
        true
        true,
      );
    }

@ -1133,14 +1165,14 @@ self.__bx_behaviors.selectMainBehavior();

    this.adBlockRules = new AdBlockRules(
      this.captureBasePrefix,
      this.params.adBlockMessage
      this.params.adBlockMessage,
    );

    if (this.params.blockRules && this.params.blockRules.length) {
      this.blockRules = new BlockRules(
        this.params.blockRules,
        this.captureBasePrefix,
        this.params.blockMessage
        this.params.blockMessage,
      );
    }

@ -1178,10 +1210,10 @@ self.__bx_behaviors.selectMainBehavior();
        logger.error(
          "Browser disconnected (crashed?), interrupting crawl",
          err,
          "browser"
          "browser",
        );
      },
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    } as any);

    // --------------

@ -1220,7 +1252,7 @@ self.__bx_behaviors.selectMainBehavior();

    const warcList = await fsp.readdir(path.join(this.collDir, "archive"));
    const warcListFull = warcList.map((filename) =>
      path.join(this.collDir, "archive", filename)
      path.join(this.collDir, "archive", filename),
    );

    //const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));

@ -1230,7 +1262,7 @@ self.__bx_behaviors.selectMainBehavior();
      ...warcListFull,
    ];
    const indexResult = await this.awaitProcess(
      child_process.spawn("cdxj-indexer", params, { cwd: this.params.cwd })
      child_process.spawn("cdxj-indexer", params, { cwd: this.params.cwd }),
    );
    if (indexResult === 0) {
      logger.debug("Indexing complete, CDX successfully created");

@ -1251,11 +1283,11 @@ self.__bx_behaviors.selectMainBehavior();

    if (uploaded && this.uploadAndDeleteLocal) {
      logger.info(
        `Uploaded WACZ, deleting local data to free up space: ${this.collDir}`
        `Uploaded WACZ, deleting local data to free up space: ${this.collDir}`,
      );
      try {
        fs.rmSync(this.collDir, { recursive: true, force: true });
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
      } catch (e: any) {
        logger.warn(`Unable to clear ${this.collDir} before exit`, e);
      }

@ -1352,13 +1384,11 @@ self.__bx_behaviors.selectMainBehavior();

    createArgs.push("-f");

    warcFileList.forEach((val) =>
      createArgs.push(path.join(archiveDir, val))
    );
    warcFileList.forEach((val) => createArgs.push(path.join(archiveDir, val)));

    // create WACZ
    const waczResult = await this.awaitProcess(
      child_process.spawn("wacz", createArgs)
      child_process.spawn("wacz", createArgs),
    );

    if (waczResult !== 0) {

@ -1430,7 +1460,7 @@ self.__bx_behaviors.selectMainBehavior();
        maxHeapTotal: this.maxHeapTotal,
        ...memUsage,
      },
      "memory"
      "memory",
    );
  }

@ -1461,9 +1491,9 @@ self.__bx_behaviors.selectMainBehavior();
    try {
      await fsp.writeFile(
        this.params.statsFilename,
        JSON.stringify(stats, null, 2)
        JSON.stringify(stats, null, 2),
      );
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    } catch (err: any) {
      logger.warn("Stats output failed", err);
    }

@ -1473,7 +1503,7 @@ self.__bx_behaviors.selectMainBehavior();
  async loadPage(
    page: Page,
    data: PageState,
    selectorOptsList = DEFAULT_SELECTORS
    selectorOptsList = DEFAULT_SELECTORS,
  ) {
    const { url, seedId, depth } = data;

@ -1533,7 +1563,7 @@ self.__bx_behaviors.selectMainBehavior();
      const contentType = resp.headers()["content-type"];

      isHTMLPage = this.isHTMLContentType(contentType);
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    } catch (e: any) {
      const msg = e.message || "";
      if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {

@ -1575,7 +1605,7 @@ self.__bx_behaviors.selectMainBehavior();
    const frames = await page.frames();

    const filteredFrames = await Promise.allSettled(
      frames.map((frame) => this.shouldIncludeFrame(frame, logDetails))
      frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)),
    );

    data.filteredFrames = filteredFrames

@ -1640,7 +1670,7 @@ self.__bx_behaviors.selectMainBehavior();
    page: Page,
    data: PageState,
    selectors = DEFAULT_SELECTORS,
    logDetails: LogDetails
    logDetails: LogDetails,
  ) {
    const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;

@ -1651,7 +1681,7 @@ self.__bx_behaviors.selectMainBehavior();
        links.push(url);
        if (links.length == 500) {
          promiseList.push(
            this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails)
            this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails),
          );
          links = [];
        }

@ -1676,7 +1706,9 @@ self.__bx_behaviors.selectMainBehavior();
        document.querySelectorAll(selector).forEach(getter);

        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const func = (window as any)[addLinkFunc] as (url: string) => NonNullable<unknown>;
        const func = (window as any)[addLinkFunc] as (
          url: string,
        ) => NonNullable<unknown>;
        urls.forEach((url) => func.call(this, url));

        return true;
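
Link extraction here is a two-hop round trip: exposeFunction registers ADD_LINK_FUNC on the page, the in-page evaluate pushes each matching URL through it, and the Node side batches every 500 links into queueInScopeUrls. A condensed sketch of that round trip, assuming puppeteer-core and a stand-in binding name (the real name comes from util/constants.js):

import { Page } from "puppeteer-core";

const ADD_LINK = "__example_addLink"; // stand-in for ADD_LINK_FUNC

async function collectLinks(page: Page): Promise<string[]> {
  const links: string[] = [];

  // Node-side sink, callable in-page as window.__example_addLink(url)
  await page.exposeFunction(ADD_LINK, (url: string) => {
    links.push(url); // the crawler flushes batches of 500 to the queue here
  });

  // In-page walk: push each anchor href through the exposed binding
  await page.evaluate(async (funcName: string) => {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const add = (window as any)[funcName] as (url: string) => Promise<void>;
    const calls: Promise<void>[] = [];
    document.querySelectorAll("a[href]").forEach((a) => {
      calls.push(add((a as HTMLAnchorElement).href));
    });
    // exposed functions are async across the CDP boundary, so wait for them
    await Promise.all(calls);
  }, ADD_LINK);

  return links;
}
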
@ -1701,9 +1733,9 @@ self.__bx_behaviors.selectMainBehavior();
            }),
            PAGE_OP_TIMEOUT_SECS,
            "Link extraction timed out",
            logDetails
            logDetails,
          )
          ),
        )
        ),
      );

      for (let i = 0; i < promiseResults.length; i++) {

@ -1718,14 +1750,14 @@ self.__bx_behaviors.selectMainBehavior();
          }
        }
      }
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    } catch (e: any) {
      logger.warn("Link Extraction failed", e);
    }

    if (links.length) {
      promiseList.push(
        this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails)
        this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails),
      );
    }

@ -1737,7 +1769,7 @@ self.__bx_behaviors.selectMainBehavior();
    urls: string[],
    depth: number,
    extraHops = 0,
    logDetails: LogDetails = {}
    logDetails: LogDetails = {},
  ) {
    try {
      depth += 1;

@ -1748,7 +1780,7 @@ self.__bx_behaviors.selectMainBehavior();
      for (const possibleUrl of urls) {
        const res = this.isInScope(
          { url: possibleUrl, extraHops: newExtraHops, depth, seedId },
          logDetails
          logDetails,
        );

        if (!res) {

@ -1763,11 +1795,11 @@ self.__bx_behaviors.selectMainBehavior();
          url,
          depth,
          isOOS ? newExtraHops : extraHops,
          logDetails
          logDetails,
        );
      }
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    } catch (e: any) {
      logger.error("Queuing Error", e);
    }

@ -1784,12 +1816,12 @@ self.__bx_behaviors.selectMainBehavior();
        "Cloudflare check timed out",
        logDetails,
        "general",
        true
        true,
      )
    ) {
      logger.debug(
        "Cloudflare Check Detected, waiting for reload...",
        logDetails
        logDetails,
      );
      await sleep(5.5);
    }

@ -1803,7 +1835,7 @@ self.__bx_behaviors.selectMainBehavior();
    url: string,
    depth: number,
    extraHops: number,
    logDetails: LogDetails = {}
    logDetails: LogDetails = {},
  ) {
    if (this.limitHit) {
      return false;

@ -1811,30 +1843,30 @@ self.__bx_behaviors.selectMainBehavior();

    const result = await this.crawlState.addToQueue(
      { url, seedId, depth, extraHops },
      this.pageLimit
      this.pageLimit,
    );

    switch (result) {
      case QueueState.ADDED:
        logger.debug("Queued new page url", { url, ...logDetails }, "links");
        return true;

      case QueueState.LIMIT_HIT:
        logger.debug(
          "Not queued page url, at page limit",
          { url, ...logDetails },
          "links"
          "links",
        );
        this.limitHit = true;
        return false;

      case QueueState.DUPE_URL:
        logger.debug(
          "Not queued page url, already seen",
          { url, ...logDetails },
          "links"
          "links",
        );
        return false;
    }

    return false;

@ -1867,7 +1899,7 @@ self.__bx_behaviors.selectMainBehavior();
        const header_formatted = JSON.stringify(header).concat("\n");
        await this.pagesFH.writeFile(header_formatted);
      }
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    } catch (err: any) {
      logger.error("pages/pages.jsonl creation failed", err);
    }

@ -1904,7 +1936,7 @@ self.__bx_behaviors.selectMainBehavior();
    const processedRow = JSON.stringify(row) + "\n";
    try {
      await this.pagesFH!.writeFile(processedRow);
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    } catch (err: any) {
      logger.warn("pages/pages.jsonl append failed", err);
    }

@ -1920,7 +1952,7 @@ self.__bx_behaviors.selectMainBehavior();
        method: "HEAD",
        headers: this.headers,
        agent: this.resolveAgent,
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
      } as any);
      if (resp.status !== 200) {
        logger.debug("HEAD response code != 200, loading in browser", {

@ -1961,14 +1993,14 @@ self.__bx_behaviors.selectMainBehavior();
        logger.info(
          "Fetching full sitemap (fromDate not specified/valid)",
          { url, sitemapFromDate },
          "sitemap"
          "sitemap",
        );
      } else {
        lastmodFromTimestamp = dateObj.getTime();
        logger.info(
          "Fetching and filtering sitemap by date",
          { url, sitemapFromDate },
          "sitemap"
          "sitemap",
        );
      }

@ -1984,7 +2016,7 @@ self.__bx_behaviors.selectMainBehavior();
      const { sites } = await sitemapper.fetch();
      logger.info("Sitemap Urls Found", { urls: sites.length }, "sitemap");
      await this.queueInScopeUrls(seedId, sites, 0);
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    } catch (e: any) {
      logger.warn("Error fetching sites from sitemap", e, "sitemap");
    }

@ -2088,21 +2120,21 @@ self.__bx_behaviors.selectMainBehavior();

  async serializeConfig(done = false) {
    switch (this.params.saveState) {
      case "never":
        return;

      case "partial":
        if (!done) {
          return;
        }
        if (await this.crawlState.isFinished()) {
          return;
        }
        break;

      case "always":
      default:
        break;
    }

    const now = new Date();
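
The saveState switch above encodes a small policy: "never" skips serialization entirely, "partial" writes state only when the crawl is done but did not finish cleanly, and "always" (the default) writes on every save interval. The same gate as a pure predicate, for illustration only (isFinished is passed in as a boolean rather than awaited on crawl state):

type SaveState = "never" | "partial" | "always";

// Sketch: the serializeConfig() gate as a pure function.
function shouldSerialize(
  saveState: SaveState,
  done: boolean,
  crawlFinished: boolean,
): boolean {
  switch (saveState) {
    case "never":
      return false;
    case "partial":
      // only persist final state, and only for an interrupted crawl
      return done && !crawlFinished;
    case "always":
    default:
      return true;
  }
}
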
@ -2137,7 +2169,7 @@ self.__bx_behaviors.selectMainBehavior();
    try {
      logger.info(`Saving crawl state to: ${filename}`);
      await fsp.writeFile(filename, res);
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    } catch (e: any) {
      logger.error(`Failed to write save state file: ${filename}`, e);
      return;

@ -2166,8 +2198,11 @@
function shouldIgnoreAbort(req: HTTPRequest) {
  try {
    const failure = req.failure();
    const failureText = failure && failure.errorText || "";
    const failureText = (failure && failure.errorText) || "";
    if (failureText !== "net::ERR_ABORTED" || req.resourceType() !== "document") {
    if (
      failureText !== "net::ERR_ABORTED" ||
      req.resourceType() !== "document"
    ) {
      return false;
    }

@ -2178,8 +2213,10 @@ function shouldIgnoreAbort(req: HTTPRequest) {
      return false;
    }

    if (headers["content-disposition"] ||
      (headers["content-type"] && !headers["content-type"].startsWith("text/"))) {
    if (
      headers["content-disposition"] ||
      (headers["content-type"] && !headers["content-type"].startsWith("text/"))
    ) {
      return true;
    }
  } catch (e) {

@ -2188,4 +2225,3 @@ function shouldIgnoreAbort(req: HTTPRequest) {

  return false;
}
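
shouldIgnoreAbort exists because a navigation that turns into a download surfaces as net::ERR_ABORTED on the document request even though the response body was captured. A sketch of how such a predicate could hang off a requestfailed handler; the wiring shown is illustrative, not the crawler's actual hookup:

import { HTTPRequest, Page } from "puppeteer-core";

declare function shouldIgnoreAbort(req: HTTPRequest): boolean; // defined above

function watchFailures(page: Page) {
  page.on("requestfailed", (req: HTTPRequest) => {
    if (shouldIgnoreAbort(req)) {
      // aborted document request whose response was a download:
      // the content was still captured, so don't count it as a failure
      return;
    }
    console.warn("request failed", req.url());
  });
}
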
@ -15,81 +15,99 @@ import { Browser } from "./util/browser.js";
|
||||||
import { initStorage } from "./util/storage.js";
|
import { initStorage } from "./util/storage.js";
|
||||||
import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
|
import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
|
||||||
|
|
||||||
const profileHTML = fs.readFileSync(new URL("../html/createProfile.html", import.meta.url), {encoding: "utf8"});
|
const profileHTML = fs.readFileSync(
|
||||||
const vncHTML = fs.readFileSync(new URL("../html/vnc_lite.html", import.meta.url), {encoding: "utf8"});
|
new URL("../html/createProfile.html", import.meta.url),
|
||||||
|
{ encoding: "utf8" },
|
||||||
|
);
|
||||||
|
const vncHTML = fs.readFileSync(
|
||||||
|
new URL("../html/vnc_lite.html", import.meta.url),
|
||||||
|
{ encoding: "utf8" },
|
||||||
|
);
|
||||||
|
|
||||||
const behaviors = fs.readFileSync(new URL("../node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});
|
const behaviors = fs.readFileSync(
|
||||||
|
new URL(
|
||||||
|
"../node_modules/browsertrix-behaviors/dist/behaviors.js",
|
||||||
|
import.meta.url,
|
||||||
|
),
|
||||||
|
{ encoding: "utf8" },
|
||||||
|
);
|
||||||
|
|
||||||
function cliOpts(): { [key: string]: Options } {
|
function cliOpts(): { [key: string]: Options } {
|
||||||
return {
|
return {
|
||||||
"url": {
|
url: {
|
||||||
describe: "The URL of the login page",
|
describe: "The URL of the login page",
|
||||||
type: "string",
|
type: "string",
|
||||||
demandOption: true,
|
demandOption: true,
|
||||||
},
|
},
|
||||||
|
|
||||||
"user": {
|
user: {
|
||||||
describe: "The username for the login. If not specified, will be prompted",
|
describe:
|
||||||
|
"The username for the login. If not specified, will be prompted",
|
||||||
},
|
},
|
||||||
|
|
||||||
"password": {
|
password: {
|
||||||
describe: "The password for the login. If not specified, will be prompted (recommended)",
|
describe:
|
||||||
|
"The password for the login. If not specified, will be prompted (recommended)",
|
||||||
},
|
},
|
||||||
|
|
||||||
"filename": {
|
filename: {
|
||||||
describe: "The filename for the profile tarball",
|
describe: "The filename for the profile tarball",
|
||||||
default: "/crawls/profiles/profile.tar.gz",
|
default: "/crawls/profiles/profile.tar.gz",
|
||||||
},
|
},
|
||||||
|
|
||||||
"debugScreenshot": {
|
debugScreenshot: {
|
||||||
describe: "If specified, take a screenshot after login and save as this filename"
|
describe:
|
||||||
|
"If specified, take a screenshot after login and save as this filename",
|
||||||
},
|
},
|
||||||
|
|
||||||
"headless": {
|
headless: {
|
||||||
describe: "Run in headless mode, otherwise start xvfb",
|
describe: "Run in headless mode, otherwise start xvfb",
|
||||||
type: "boolean",
|
type: "boolean",
|
||||||
default: false,
|
default: false,
|
||||||
},
|
},
|
||||||
|
|
||||||
"automated": {
|
automated: {
|
||||||
describe: "Start in automated mode, no interactive browser",
|
describe: "Start in automated mode, no interactive browser",
|
||||||
type: "boolean",
|
type: "boolean",
|
||||||
default: false,
|
default: false,
|
||||||
},
|
},
|
||||||
|
|
||||||
"interactive": {
|
interactive: {
|
||||||
describe: "Deprecated. Now the default option!",
|
describe: "Deprecated. Now the default option!",
|
||||||
type: "boolean",
|
type: "boolean",
|
||||||
default: false
|
default: false,
|
||||||
},
|
},
|
||||||
|
|
||||||
"shutdownWait": {
|
shutdownWait: {
|
||||||
describe: "Shutdown browser in interactive after this many seconds, if no pings received",
|
describe:
|
||||||
|
"Shutdown browser in interactive after this many seconds, if no pings received",
|
||||||
type: "number",
|
type: "number",
|
||||||
default: 0
|
default: 0,
|
||||||
},
|
},
|
||||||
|
|
||||||
"profile": {
|
profile: {
|
||||||
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
|
describe:
|
||||||
|
"Path to tar.gz file which will be extracted and used as the browser profile",
|
||||||
type: "string",
|
type: "string",
|
||||||
},
|
},
|
||||||
|
|
||||||
"windowSize": {
|
windowSize: {
|
||||||
type: "string",
|
type: "string",
|
||||||
describe: "Browser window dimensions, specified as: width,height",
|
describe: "Browser window dimensions, specified as: width,height",
|
||||||
default: getDefaultWindowSize()
|
default: getDefaultWindowSize(),
|
||||||
},
|
},
|
||||||
|
|
||||||
"proxy": {
|
proxy: {
|
||||||
type: "boolean",
|
type: "boolean",
|
||||||
default: false
|
default: false,
|
||||||
},
|
},
|
||||||
|
|
||||||
"cookieDays": {
|
cookieDays: {
|
||||||
type: "number",
|
type: "number",
|
||||||
describe: "If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
|
describe:
|
||||||
default: 7
|
"If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
|
||||||
}
|
default: 7,
|
||||||
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -100,14 +118,11 @@ function getDefaultWindowSize() {
|
||||||
return `${x},${y}`;
|
return `${x},${y}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
async function main() {
|
async function main() {
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
const params : any = yargs(process.argv)
|
const params: any = yargs(process.argv)
|
||||||
.usage("browsertrix-crawler profile [options]")
|
.usage("browsertrix-crawler profile [options]")
|
||||||
.option(cliOpts())
|
.option(cliOpts()).argv;
|
||||||
.argv;
|
|
||||||
|
|
||||||
logger.setDebugLogging(true);
|
logger.setDebugLogging(true);
|
||||||
|
|
||||||
|
@ -122,7 +137,7 @@ async function main() {
|
||||||
process.env.GEOMETRY || "",
|
process.env.GEOMETRY || "",
|
||||||
"-ac",
|
"-ac",
|
||||||
"+extension",
|
"+extension",
|
||||||
"RANDR"
|
"RANDR",
|
||||||
]);
|
]);
|
||||||
|
|
||||||
//await fsp.mkdir(path.join(homedir(), ".vnc"), {recursive: true});
|
//await fsp.mkdir(path.join(homedir(), ".vnc"), {recursive: true});
|
||||||
|
@ -140,7 +155,7 @@ async function main() {
|
||||||
"-passwd",
|
"-passwd",
|
||||||
process.env.VNC_PASS || "",
|
process.env.VNC_PASS || "",
|
||||||
"-display",
|
"-display",
|
||||||
process.env.DISPLAY || ""
|
process.env.DISPLAY || "",
|
||||||
]);
|
]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -156,13 +171,15 @@ async function main() {
|
||||||
"--window-position=0,0",
|
"--window-position=0,0",
|
||||||
`--window-size=${params.windowSize}`,
|
`--window-size=${params.windowSize}`,
|
||||||
// to disable the 'stability will suffer' infobar
|
// to disable the 'stability will suffer' infobar
|
||||||
"--test-type"
|
"--test-type",
|
||||||
]
|
],
|
||||||
}
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
logger.warn("Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode");
|
logger.warn(
|
||||||
|
"Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode",
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.user || params.password) {
|
if (params.user || params.password) {
|
||||||
|
@ -179,20 +196,23 @@ async function main() {
|
||||||
|
|
||||||
const { page, cdp } = await browser.newWindowPageWithCDP();
|
const { page, cdp } = await browser.newWindowPageWithCDP();
|
||||||
|
|
||||||
const waitUntil : PuppeteerLifeCycleEvent = "load";
|
const waitUntil: PuppeteerLifeCycleEvent = "load";
|
||||||
|
|
||||||
await page.setCacheEnabled(false);
|
await page.setCacheEnabled(false);
|
||||||
|
|
||||||
if (!params.automated) {
|
if (!params.automated) {
|
||||||
await browser.setupPage({page, cdp});
|
await browser.setupPage({ page, cdp });
|
||||||
|
|
||||||
// for testing, inject browsertrix-behaviors
|
// for testing, inject browsertrix-behaviors
|
||||||
await browser.addInitScript(page, behaviors + ";\nself.__bx_behaviors.init();");
|
await browser.addInitScript(
|
||||||
|
page,
|
||||||
|
behaviors + ";\nself.__bx_behaviors.init();",
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info(`Loading page: ${params.url}`);
|
logger.info(`Loading page: ${params.url}`);
|
||||||
|
|
||||||
await page.goto(params.url, {waitUntil});
|
await page.goto(params.url, { waitUntil });
|
||||||
|
|
||||||
if (!params.automated) {
|
if (!params.automated) {
|
||||||
const target = await cdp.send("Target.getTargetInfo");
|
const target = await cdp.send("Target.getTargetInfo");
|
||||||
|
@ -204,20 +224,29 @@ async function main() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
async function automatedProfile(
|
||||||
async function automatedProfile(params: any, browser: Browser, page: Page, cdp: CDPSession,
|
// TODO: Fix this the next time the file is edited.
|
||||||
waitUntil: PuppeteerLifeCycleEvent) {
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||||
|
params: any,
|
||||||
|
browser: Browser,
|
||||||
|
page: Page,
|
||||||
|
cdp: CDPSession,
|
||||||
|
waitUntil: PuppeteerLifeCycleEvent,
|
||||||
|
) {
|
||||||
let u, p;
|
let u, p;
|
||||||
|
|
||||||
logger.debug("Looking for username and password entry fields on page...");
|
logger.debug("Looking for username and password entry fields on page...");
|
||||||
|
|
||||||
try {
|
try {
|
||||||
u = await page.waitForSelector("//input[contains(@name, 'user') or contains(@name, 'email')]");
|
u = await page.waitForSelector(
|
||||||
p = await page.waitForSelector("//input[contains(@name, 'pass') and @type='password']");
|
"//input[contains(@name, 'user') or contains(@name, 'email')]",
|
||||||
|
);
|
||||||
|
p = await page.waitForSelector(
|
||||||
|
"//input[contains(@name, 'pass') and @type='password']",
|
||||||
|
);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
if (params.debugScreenshot) {
|
if (params.debugScreenshot) {
|
||||||
await page.screenshot({path: params.debugScreenshot});
|
await page.screenshot({ path: params.debugScreenshot });
|
||||||
}
|
}
|
||||||
logger.debug("Login form could not be found");
|
logger.debug("Login form could not be found");
|
||||||
await page.close();
|
await page.close();
|
||||||
|
@ -231,11 +260,11 @@ async function automatedProfile(params: any, browser: Browser, page: Page, cdp:
|
||||||
|
|
||||||
await Promise.allSettled([
|
await Promise.allSettled([
|
||||||
p!.press("Enter"),
|
p!.press("Enter"),
|
||||||
page.waitForNavigation({waitUntil})
|
page.waitForNavigation({ waitUntil }),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
if (params.debugScreenshot) {
|
if (params.debugScreenshot) {
|
||||||
await page.screenshot({path: params.debugScreenshot});
|
await page.screenshot({ path: params.debugScreenshot });
|
||||||
}
|
}
|
||||||
|
|
||||||
await createProfile(params, browser, page, cdp);
|
await createProfile(params, browser, page, cdp);
|
||||||
|
@@ -243,8 +272,15 @@ async function automatedProfile(params: any, browser: Browser, page: Page, cdp:
   process.exit(0);
 }
 
-// eslint-disable-next-line @typescript-eslint/no-explicit-any
-async function createProfile(params: any, browser: Browser, page: Page, cdp: CDPSession, targetFilename = "") {
+async function createProfile(
+  // TODO: Fix this the next time the file is edited.
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  params: any,
+  browser: Browser,
+  page: Page,
+  cdp: CDPSession,
+  targetFilename = "",
+) {
   await cdp.send("Network.clearBrowserCache");
 
   await browser.close();
@@ -255,7 +291,7 @@ async function createProfile(params: any, browser: Browser, page: Page, cdp: CDP
 
   const outputDir = path.dirname(profileFilename);
   if (outputDir && !fs.existsSync(outputDir)) {
-    fs.mkdirSync(outputDir, {recursive: true});
+    fs.mkdirSync(outputDir, { recursive: true });
   }
 
   browser.saveProfile(profileFilename);
@@ -274,9 +310,9 @@ async function createProfile(params: any, browser: Browser, page: Page, cdp: CDP
 
 function promptInput(msg: string, hidden = false) {
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  const rl : any = readline.createInterface({
+  const rl: any = readline.createInterface({
     input: process.stdin,
-    output: process.stdout
+    output: process.stdout,
   });
 
   if (hidden) {
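promptInput, reformatted above, resolves with the line the user types; the hidden flag exists so a password prompt does not echo keystrokes. A plausible call sequence (the prompt strings are illustrative; the real call sites are outside this hunk):

    const username = await promptInput("Enter username: ");
    const password = await promptInput("Enter password: ", true);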
@@ -303,7 +339,6 @@ function promptInput(msg: string, hidden = false) {
   });
 }
 
-
 class InteractiveBrowser {
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
   params: any;
@@ -323,7 +358,7 @@ class InteractiveBrowser {
     browser: Browser,
     page: Page,
     cdp: CDPSession,
-    targetId: string
+    targetId: string,
   ) {
     logger.info("Creating Profile Interactively...");
     child_process.spawn("socat", [
@@ -359,19 +394,19 @@ class InteractiveBrowser {
     if (this.shutdownWait) {
       this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
       logger.debug(
-        `Shutting down in ${this.shutdownWait}ms if no ping received`
+        `Shutting down in ${this.shutdownWait}ms if no ping received`,
       );
     } else {
       this.shutdownTimer = null;
     }
 
     const httpServer = http.createServer((req, res) =>
-      this.handleRequest(req, res)
+      this.handleRequest(req, res),
     );
     const port = 9223;
     httpServer.listen(port);
     logger.info(
-      `Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`
+      `Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`,
     );
 
     if (!params.headless) {
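The shutdownWait timer set up above works together with the /ping route in the next hunk: each ping clears and re-arms the timer, so the headless profile server only stays up while something keeps polling it. A client-side sketch (the 30-second interval is an arbitrary choice, not a value from this code):

    // keep the profile UI server alive by polling its /ping endpoint
    setInterval(() => fetch("http://localhost:9223/ping"), 30_000);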
@@ -442,141 +477,141 @@ class InteractiveBrowser {
     let origins;
 
     switch (pathname) {
       case "/":
         res.writeHead(200, { "Content-Type": "text/html" });
         if (this.params.headless) {
           targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${this.targetId}&panel=resources`;
         } else {
           targetUrl = `http://$HOST:9223/vnc/?host=$HOST&port=6080&password=${process.env.VNC_PASS}`;
         }
         res.end(
           profileHTML.replace(
             "$DEVTOOLS_SRC",
-            targetUrl.replaceAll("$HOST", parsedUrl.hostname)
-          )
+            targetUrl.replaceAll("$HOST", parsedUrl.hostname),
+          ),
         );
         return;
 
       case "/vnc/":
       case "/vnc/index.html":
         res.writeHead(200, { "Content-Type": "text/html" });
         res.end(vncHTML);
         return;
 
       case "/ping":
         if (this.shutdownWait) {
           // eslint-disable-next-line @typescript-eslint/no-explicit-any
           clearTimeout(this.shutdownTimer as any);
           this.shutdownTimer = setTimeout(
             () => process.exit(0),
-            this.shutdownWait
+            this.shutdownWait,
           );
           logger.debug(
-            `Ping received, delaying shutdown for ${this.shutdownWait}ms`
+            `Ping received, delaying shutdown for ${this.shutdownWait}ms`,
           );
         }
 
         origins = Array.from(this.originSet.values());
 
         res.writeHead(200, { "Content-Type": "application/json" });
         res.end(JSON.stringify({ pong: true, origins }));
         return;
 
       case "/target":
         res.writeHead(200, { "Content-Type": "application/json" });
         res.end(JSON.stringify({ targetId: this.targetId }));
         return;
 
       case "/vncpass":
         res.writeHead(200, { "Content-Type": "application/json" });
         res.end(JSON.stringify({ password: process.env.VNC_PASS }));
         return;
 
       case "/navigate":
         if (req.method !== "POST") {
           break;
         }
 
         try {
           const postData = await this.readBodyJson(req);
           const url = new URL(postData.url).href;
 
           res.writeHead(200, { "Content-Type": "application/json" });
           res.end(JSON.stringify({ success: true }));
 
           this.page.goto(url);
           // eslint-disable-next-line @typescript-eslint/no-explicit-any
         } catch (e: any) {
           res.writeHead(400, { "Content-Type": "application/json" });
           res.end(JSON.stringify({ error: e.toString() }));
           logger.warn("HTTP Error", e);
         }
         return;
 
       case "/createProfileJS":
         if (req.method !== "POST") {
           break;
         }
 
         try {
           const postData = await this.readBodyJson(req);
           const targetFilename = postData.filename || "";
 
           await this.saveAllCookies();
 
           const resource = await createProfile(
             this.params,
             this.browser,
             this.page,
             this.cdp,
-            targetFilename
+            targetFilename,
           );
           origins = Array.from(this.originSet.values());
 
           res.writeHead(200, { "Content-Type": "application/json" });
           res.end(JSON.stringify({ resource, origins }));
           // eslint-disable-next-line @typescript-eslint/no-explicit-any
         } catch (e: any) {
           res.writeHead(500, { "Content-Type": "application/json" });
           res.end(JSON.stringify({ error: e.toString() }));
           logger.warn("HTTP Error", e);
         }
 
         setTimeout(() => process.exit(0), 200);
         return;
 
       case "/createProfile":
         if (req.method !== "POST") {
           break;
         }
 
         try {
           await this.saveAllCookies();
 
           await createProfile(this.params, this.browser, this.page, this.cdp);
 
           res.writeHead(200, { "Content-Type": "text/html" });
           res.end(
-            "<html><body>Profile Created! You may now close this window.</body></html>"
+            "<html><body>Profile Created! You may now close this window.</body></html>",
           );
           // eslint-disable-next-line @typescript-eslint/no-explicit-any
         } catch (e: any) {
           res.writeHead(500, { "Content-Type": "text/html" });
           res.end(
-            "<html><body>Profile creation failed! See the browsertrix-crawler console for more info"
+            "<html><body>Profile creation failed! See the browsertrix-crawler console for more info",
           );
           logger.warn("HTTP Error", e);
         }
 
         setTimeout(() => process.exit(0), 200);
         return;
     }
 
     if (pathname.startsWith("/vnc/")) {
       const fileUrl = new URL(
         "../node_modules/@novnc/novnc/" + pathname.slice("/vnc/".length),
-        import.meta.url
+        import.meta.url,
       );
       const file = fs.readFileSync(fileUrl, { encoding: "utf-8" });
       res.writeHead(200, { "Content-Type": "application/javascript" });
@@ -607,6 +642,4 @@ class InteractiveBrowser {
   }
 }
 
 main();
-
-
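One pattern worth calling out from the hunk above: noVNC's client files are served straight out of node_modules, resolved relative to the running module with import.meta.url. The same resolution in isolation (the asset path here is an example; real paths come from the /vnc/ request):

    import fs from "fs";

    const fileUrl = new URL(
      "../node_modules/@novnc/novnc/core/rfb.js", // example asset path
      import.meta.url,
    );
    const file = fs.readFileSync(fileUrl, { encoding: "utf-8" });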
@@ -2,6 +2,14 @@ import { Page } from "puppeteer-core";
 import { PageState } from "./util/state.js";
 import { Crawler } from "./crawler.js";
 
-export default async ({data, page, crawler} : {data: PageState, page: Page, crawler: Crawler}) => {
+export default async ({
+  data,
+  page,
+  crawler,
+}: {
+  data: PageState;
+  page: Page;
+  crawler: Crawler;
+}) => {
   await crawler.loadPage(page, data);
 };
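The reformatted default driver also doubles as the contract for custom drivers passed via --driver: a default export receiving { data, page, crawler }. A hypothetical custom driver of the same shape (the comment marks where invented additions would go; none of this is part of the commit):

    import { Page } from "puppeteer-core";
    import { PageState } from "./util/state.js";
    import { Crawler } from "./crawler.js";

    export default async ({
      data,
      page,
      crawler,
    }: {
      data: PageState;
      page: Page;
      crawler: Crawler;
    }) => {
      // any custom pre- or post-processing would go around this call
      await crawler.loadPage(page, data);
    };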
@@ -4,13 +4,11 @@ import { logger } from "./util/logger.js";
 import { setExitOnRedisError } from "./util/redis.js";
 import { Crawler } from "./crawler.js";
 
-
-let crawler : Crawler | null = null;
+let crawler: Crawler | null = null;
 
 let lastSigInt = 0;
 let forceTerm = false;
 
-
 async function handleTerminate(signame: string) {
   logger.info(`${signame} received...`);
   if (!crawler || !crawler.crawlState) {
@@ -53,5 +51,3 @@ process.on("SIGABRT", async () => {
 
 crawler = new Crawler();
 crawler.run();
-
-
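handleTerminate above implements a two-stage shutdown: the first signal requests a graceful finish, and a repeated signal forces termination (tracked via lastSigInt/forceTerm). The core pattern, reduced to a self-contained sketch (the 5-second window is an assumed value; the real threshold lives outside this hunk):

    let lastSigInt = 0;

    process.on("SIGINT", () => {
      const now = Date.now();
      if (now - lastSigInt < 5000) {
        process.exit(1); // second Ctrl+C shortly after the first: hard exit
      }
      lastSigInt = now;
      // first Ctrl+C: begin a graceful shutdown here
    });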
@@ -7,199 +7,225 @@ import { KnownDevices as devices } from "puppeteer-core";
 import yargs, { Options } from "yargs";
 import { hideBin } from "yargs/helpers";
 
-import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES } from "./constants.js";
+import {
+  BEHAVIOR_LOG_FUNC,
+  WAIT_UNTIL_OPTS,
+  EXTRACT_TEXT_TYPES,
+} from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { interpolateFilename } from "./storage.js";
 import { screenshotTypes } from "./screenshots.js";
 import { logger } from "./logger.js";
 
 // ============================================================================
 class ArgParser {
-  get cliOpts() : { [key: string]: Options } {
-    const coerce = (array : string[]) => {
-      return array.flatMap(v => v.split(",")).filter(x => !!x);
+  get cliOpts(): { [key: string]: Options } {
+    const coerce = (array: string[]) => {
+      return array.flatMap((v) => v.split(",")).filter((x) => !!x);
     };
 
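The coerce helper above is what lets repeatable array options also accept comma-separated values while dropping empty entries, so --logging stats,debug and --logging stats --logging debug are equivalent. Its behavior, checked directly:

    const coerce = (array: string[]) =>
      array.flatMap((v) => v.split(",")).filter((x) => !!x);

    console.log(coerce(["stats,debug", "jserrors", ""]));
    // -> [ 'stats', 'debug', 'jserrors' ]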
     return {
-      "seeds": {
+      seeds: {
         alias: "url",
         describe: "The URL to start crawling from",
         type: "array",
         default: [],
       },
 
-      "seedFile": {
+      seedFile: {
         alias: ["urlFile"],
-        describe: "If set, read a list of seed urls, one per line, from the specified",
+        describe:
+          "If set, read a list of seed urls, one per line, from the specified",
         type: "string",
       },
 
-      "workers": {
+      workers: {
         alias: "w",
         describe: "The number of workers to run in parallel",
         default: 1,
         type: "number",
       },
 
-      "crawlId": {
+      crawlId: {
         alias: "id",
-        describe: "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to hostname)",
+        describe:
+          "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to hostname)",
         type: "string",
       },
 
-      "waitUntil": {
-        describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
+      waitUntil: {
+        describe:
+          "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
         type: "array",
         default: ["load", "networkidle2"],
         choices: WAIT_UNTIL_OPTS,
         coerce,
       },
 
-      "depth": {
+      depth: {
         describe: "The depth of the crawl for all seeds",
         default: -1,
         type: "number",
       },
 
-      "extraHops": {
+      extraHops: {
         describe: "Number of extra 'hops' to follow, beyond the current scope",
         default: 0,
-        type: "number"
+        type: "number",
       },
 
-      "pageLimit": {
+      pageLimit: {
         alias: "limit",
         describe: "Limit crawl to this number of pages",
         default: 0,
         type: "number",
       },
 
-      "maxPageLimit": {
-        describe: "Maximum pages to crawl, overriding pageLimit if both are set",
+      maxPageLimit: {
+        describe:
+          "Maximum pages to crawl, overriding pageLimit if both are set",
         default: 0,
         type: "number",
       },
 
-      "pageLoadTimeout": {
+      pageLoadTimeout: {
         alias: "timeout",
         describe: "Timeout for each page to load (in seconds)",
         default: 90,
         type: "number",
       },
 
-      "scopeType": {
-        describe: "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes",
+      scopeType: {
+        describe:
+          "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes",
         type: "string",
-        choices: ["page", "page-spa", "prefix", "host", "domain", "any", "custom"]
+        choices: [
+          "page",
+          "page-spa",
+          "prefix",
+          "host",
+          "domain",
+          "any",
+          "custom",
+        ],
       },
 
-      "scopeIncludeRx": {
+      scopeIncludeRx: {
         alias: "include",
-        describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
+        describe:
+          "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
       },
 
-      "scopeExcludeRx": {
+      scopeExcludeRx: {
         alias: "exclude",
-        describe: "Regex of page URLs that should be excluded from the crawl."
+        describe: "Regex of page URLs that should be excluded from the crawl.",
       },
 
-      "allowHashUrls": {
-        describe: "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
+      allowHashUrls: {
+        describe:
+          "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
       },
 
-      "blockRules": {
-        describe: "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
+      blockRules: {
+        describe:
+          "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
         type: "array",
         default: [],
       },
 
-      "blockMessage": {
-        describe: "If specified, when a URL is blocked, a record with this error message is added instead",
+      blockMessage: {
+        describe:
+          "If specified, when a URL is blocked, a record with this error message is added instead",
         type: "string",
       },
 
-      "blockAds": {
+      blockAds: {
         alias: "blockads",
-        describe: "If set, block advertisements from being loaded (based on Stephen Black's blocklist)",
+        describe:
+          "If set, block advertisements from being loaded (based on Stephen Black's blocklist)",
         type: "boolean",
         default: false,
       },
 
-      "adBlockMessage": {
-        describe: "If specified, when an ad is blocked, a record with this error message is added instead",
+      adBlockMessage: {
+        describe:
+          "If specified, when an ad is blocked, a record with this error message is added instead",
         type: "string",
       },
 
-      "collection": {
+      collection: {
         alias: "c",
-        describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
+        describe:
+          "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
         type: "string",
-        default: "crawl-@ts"
+        default: "crawl-@ts",
       },
 
-      "headless": {
+      headless: {
         describe: "Run in headless mode, otherwise start xvfb",
         type: "boolean",
         default: false,
       },
 
-      "driver": {
+      driver: {
         describe: "JS driver for the crawler",
         type: "string",
         default: "./defaultDriver.js",
       },
 
-      "generateCDX": {
+      generateCDX: {
         alias: ["generatecdx", "generateCdx"],
-        describe: "If set, generate index (CDXJ) for use with pywb after crawl is done",
+        describe:
+          "If set, generate index (CDXJ) for use with pywb after crawl is done",
         type: "boolean",
         default: false,
       },
 
-      "combineWARC": {
+      combineWARC: {
         alias: ["combinewarc", "combineWarc"],
         describe: "If set, combine the warcs",
         type: "boolean",
         default: false,
       },
 
-      "rolloverSize": {
+      rolloverSize: {
         describe: "If set, declare the rollover size",
         default: 1000000000,
         type: "number",
       },
 
-      "generateWACZ": {
+      generateWACZ: {
         alias: ["generatewacz", "generateWacz"],
         describe: "If set, generate wacz",
         type: "boolean",
         default: false,
       },
 
-      "logging": {
-        describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, debug",
+      logging: {
+        describe:
+          "Logging options for crawler, can include: stats (enabled by default), jserrors, debug",
         type: "array",
         default: ["stats"],
         coerce,
       },
 
-      "logLevel": {
+      logLevel: {
         describe: "Comma-separated list of log levels to include in logs",
         type: "array",
         default: [],
         coerce,
       },
 
-      "context": {
+      context: {
         describe: "Comma-separated list of contexts to include in logs",
         type: "array",
         default: [],
         coerce,
       },
 
-      "text": {
-        describe: "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
+      text: {
+        describe:
+          "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
         type: "array",
         choices: EXTRACT_TEXT_TYPES,
         coerce: (array) => {
@@ -211,45 +237,51 @@ class ArgParser {
             return [];
           }
           return coerce(array);
-        }
+        },
       },
 
-      "cwd": {
-        describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
+      cwd: {
+        describe:
+          "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
         type: "string",
         default: process.cwd(),
       },
 
-      "mobileDevice": {
-        describe: "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
+      mobileDevice: {
+        describe:
+          "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
         type: "string",
       },
 
-      "userAgent": {
+      userAgent: {
         describe: "Override user-agent with specified string",
         type: "string",
       },
 
-      "userAgentSuffix": {
-        describe: "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
+      userAgentSuffix: {
+        describe:
+          "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
         type: "string",
       },
 
-      "useSitemap": {
+      useSitemap: {
         alias: "sitemap",
-        describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
+        describe:
+          "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
       },
 
-      "sitemapFromDate": {
+      sitemapFromDate: {
         alias: "sitemapFrom",
-        describe: "If set, filter URLs from sitemaps to those greater than or equal to provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
+        describe:
+          "If set, filter URLs from sitemaps to those greater than or equal to provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
       },
 
-      "statsFilename": {
-        describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
+      statsFilename: {
+        describe:
+          "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)",
       },
 
-      "behaviors": {
+      behaviors: {
         describe: "Which background behaviors to enable on each page",
         type: "array",
         default: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
@@ -257,179 +289,204 @@ class ArgParser {
         coerce,
       },
 
-      "behaviorTimeout": {
-        describe: "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish.",
+      behaviorTimeout: {
+        describe:
+          "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish.",
         default: 90,
         type: "number",
       },
 
-      "pageExtraDelay": {
+      pageExtraDelay: {
         alias: "delay",
-        describe: "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page",
+        describe:
+          "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page",
         default: 0,
         type: "number",
       },
 
-      "dedupPolicy": {
+      dedupPolicy: {
         describe: "Deduplication policy",
         default: "skip",
         type: "string",
         choices: ["skip", "revisit", "keep"],
       },
 
-      "profile": {
-        describe: "Path to tar.gz file which will be extracted and used as the browser profile",
+      profile: {
+        describe:
+          "Path to tar.gz file which will be extracted and used as the browser profile",
         type: "string",
       },
 
-      "screenshot": {
-        describe: "Screenshot options for crawler, can include: view, thumbnail, fullPage",
+      screenshot: {
+        describe:
+          "Screenshot options for crawler, can include: view, thumbnail, fullPage",
         type: "array",
         default: [],
         choices: Array.from(Object.keys(screenshotTypes)),
         coerce,
       },
 
-      "screencastPort": {
-        describe: "If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
+      screencastPort: {
+        describe:
+          "If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
         type: "number",
-        default: 0
+        default: 0,
       },
 
-      "screencastRedis": {
-        describe: "If set, will use the state store redis pubsub for screencasting. Requires --redisStoreUrl to be set",
+      screencastRedis: {
+        describe:
+          "If set, will use the state store redis pubsub for screencasting. Requires --redisStoreUrl to be set",
         type: "boolean",
-        default: false
+        default: false,
       },
 
-      "warcInfo": {
+      warcInfo: {
         alias: ["warcinfo"],
-        describe: "Optional fields added to the warcinfo record in combined WARCs",
+        describe:
+          "Optional fields added to the warcinfo record in combined WARCs",
         //type: "object"
       },
 
-      "redisStoreUrl": {
-        describe: "If set, url for remote redis server to store state. Otherwise, using in-memory store",
+      redisStoreUrl: {
+        describe:
+          "If set, url for remote redis server to store state. Otherwise, using in-memory store",
         type: "string",
-        default: "redis://localhost:6379/0"
+        default: "redis://localhost:6379/0",
       },
 
-      "saveState": {
-        describe: "If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
+      saveState: {
+        describe:
+          "If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
         type: "string",
         default: "partial",
-        choices: ["never", "partial", "always"]
+        choices: ["never", "partial", "always"],
       },
 
-      "saveStateInterval": {
-        describe: "If save state is set to 'always', also save state during the crawl at this interval (in seconds)",
+      saveStateInterval: {
+        describe:
+          "If save state is set to 'always', also save state during the crawl at this interval (in seconds)",
        type: "number",
        default: 300,
      },
 
-      "saveStateHistory": {
-        describe: "Number of save states to keep during the duration of a crawl",
+      saveStateHistory: {
+        describe:
+          "Number of save states to keep during the duration of a crawl",
         type: "number",
         default: 5,
       },
 
-      "sizeLimit": {
-        describe: "If set, save state and exit if size limit exceeds this value",
+      sizeLimit: {
+        describe:
+          "If set, save state and exit if size limit exceeds this value",
         type: "number",
         default: 0,
       },
 
-      "diskUtilization": {
-        describe: "If set, save state and exit if disk utilization exceeds this percentage value",
+      diskUtilization: {
+        describe:
+          "If set, save state and exit if disk utilization exceeds this percentage value",
         type: "number",
         default: 90,
       },
 
-      "timeLimit": {
+      timeLimit: {
         describe: "If set, save state and exit after time limit, in seconds",
         type: "number",
         default: 0,
       },
 
-      "healthCheckPort": {
+      healthCheckPort: {
         describe: "port to run healthcheck on",
         type: "number",
         default: 0,
       },
 
-      "overwrite": {
-        describe: "overwrite current crawl data: if set, existing collection directory will be deleted before crawl is started",
+      overwrite: {
+        describe:
+          "overwrite current crawl data: if set, existing collection directory will be deleted before crawl is started",
         type: "boolean",
-        default: false
+        default: false,
       },
 
-      "waitOnDone": {
-        describe: "if set, wait for interrupt signal when finished instead of exiting",
+      waitOnDone: {
+        describe:
+          "if set, wait for interrupt signal when finished instead of exiting",
         type: "boolean",
-        default: false
+        default: false,
       },
 
-      "restartsOnError": {
-        describe: "if set, assume will be restarted if interrupted, don't run post-crawl processes on interrupt",
+      restartsOnError: {
+        describe:
+          "if set, assume will be restarted if interrupted, don't run post-crawl processes on interrupt",
         type: "boolean",
-        default: false
+        default: false,
       },
 
-      "netIdleWait": {
-        describe: "if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
+      netIdleWait: {
+        describe:
+          "if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
         type: "number",
-        default: -1
+        default: -1,
       },
 
-      "lang": {
-        describe: "if set, sets the language used by the browser, should be ISO 639 language[-country] code",
-        type: "string"
+      lang: {
+        describe:
+          "if set, sets the language used by the browser, should be ISO 639 language[-country] code",
+        type: "string",
       },
 
-      "title": {
-        describe: "If set, write supplied title into WACZ datapackage.json metadata",
-        type: "string"
+      title: {
+        describe:
+          "If set, write supplied title into WACZ datapackage.json metadata",
+        type: "string",
       },
 
-      "description": {
+      description: {
         alias: ["desc"],
-        describe: "If set, write supplied description into WACZ datapackage.json metadata",
-        type: "string"
+        describe:
+          "If set, write supplied description into WACZ datapackage.json metadata",
+        type: "string",
       },
 
-      "originOverride": {
-        describe: "if set, will redirect requests from each origin in key to origin in the value, eg. --originOverride https://host:port=http://alt-host:alt-port",
+      originOverride: {
+        describe:
+          "if set, will redirect requests from each origin in key to origin in the value, eg. --originOverride https://host:port=http://alt-host:alt-port",
         type: "array",
         default: [],
       },
 
-      "logErrorsToRedis": {
+      logErrorsToRedis: {
         describe: "If set, write error messages to redis",
         type: "boolean",
         default: false,
       },
 
-      "failOnFailedSeed": {
-        describe: "If set, crawler will fail with exit code 1 if any seed fails",
+      failOnFailedSeed: {
+        describe:
+          "If set, crawler will fail with exit code 1 if any seed fails",
         type: "boolean",
-        default: false
+        default: false,
       },
 
-      "failOnFailedLimit": {
-        describe: "If set, save state and exit if number of failed pages exceeds this value",
+      failOnFailedLimit: {
+        describe:
+          "If set, save state and exit if number of failed pages exceeds this value",
         type: "number",
         default: 0,
       },
 
-      "customBehaviors": {
-        describe: "injects a custom behavior file or set of behavior files in a directory",
-        type: "string"
+      customBehaviors: {
+        describe:
+          "injects a custom behavior file or set of behavior files in a directory",
        type: "string",
      },
 
-      "debugAccessRedis": {
-        describe: "if set, runs internal redis without protected mode to allow external access (for debugging)",
+      debugAccessRedis: {
+        describe:
+          "if set, runs internal redis without protected mode to allow external access (for debugging)",
         type: "boolean",
-      }
+      },
     };
   }
 
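Every entry in the object above is a plain yargs option descriptor, so one declaration carries the flag name, alias, type, default, and coercion. For the workers option alone, the equivalent direct wiring would look roughly like this (a sketch, not how the crawler actually composes its options):

    import yargs from "yargs";
    import { hideBin } from "yargs/helpers";

    const argv = yargs(hideBin(process.argv)).option("workers", {
      alias: "w",
      describe: "The number of workers to run in parallel",
      default: 1,
      type: "number",
    }).argv;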
@@ -445,25 +502,28 @@ class ArgParser {
     const parsed = yargs(hideBin(argv))
       .usage("crawler [options]")
       .option(this.cliOpts)
-      .config("config", "Path to YAML config file", (configPath : string | number) => {
-        if (configPath === "/crawls/stdin") {
-          configPath = process.stdin.fd;
-        }
-        // eslint-disable-next-line @typescript-eslint/no-explicit-any
-        origConfig = yaml.load(fs.readFileSync(configPath, "utf8")) as any;
-        return origConfig;
-      })
-      .check((argv) => this.validateArgs(argv))
-      .argv;
+      .config(
+        "config",
+        "Path to YAML config file",
+        (configPath: string | number) => {
+          if (configPath === "/crawls/stdin") {
+            configPath = process.stdin.fd;
+          }
+          // eslint-disable-next-line @typescript-eslint/no-explicit-any
+          origConfig = yaml.load(fs.readFileSync(configPath, "utf8")) as any;
+          return origConfig;
+        },
+      )
+      .check((argv) => this.validateArgs(argv)).argv;
 
-    return {parsed, origConfig};
+    return { parsed, origConfig };
   }
 
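The .config() callback reformatted above is what lets a YAML file, or a config piped into the container, populate the same options: the sentinel path /crawls/stdin is swapped for the stdin file descriptor before reading. The loader on its own (a sketch under the same assumptions):

    import fs from "fs";
    import yaml from "js-yaml";

    function loadCrawlConfig(configPath: string | number) {
      if (configPath === "/crawls/stdin") {
        configPath = process.stdin.fd;
      }
      return yaml.load(fs.readFileSync(configPath, "utf8"));
    }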
-  splitCrawlArgsQuoteSafe(crawlArgs: string) : string[] {
+  splitCrawlArgsQuoteSafe(crawlArgs: string): string[] {
     // Split process.env.CRAWL_ARGS on spaces but retaining spaces within double quotes
     const regex = /"[^"]+"|[^\s]+/g;
     const res = crawlArgs.match(regex);
-    return res ? res.map(e => e.replace(/"(.+)"/, "$1")) : [];
+    return res ? res.map((e) => e.replace(/"(.+)"/, "$1")) : [];
   }
 
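splitCrawlArgsQuoteSafe keeps double-quoted phrases in CRAWL_ARGS intact while splitting everything else on whitespace:

    const splitCrawlArgsQuoteSafe = (crawlArgs: string): string[] => {
      const regex = /"[^"]+"|[^\s]+/g;
      const res = crawlArgs.match(regex);
      return res ? res.map((e) => e.replace(/"(.+)"/, "$1")) : [];
    };

    console.log(
      splitCrawlArgsQuoteSafe('--url https://example.com --title "My Crawl"'),
    );
    // -> [ '--url', 'https://example.com', '--title', 'My Crawl' ]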
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -472,13 +532,15 @@ class ArgParser {
     argv.collection = interpolateFilename(argv.collection, argv.crawlId);
 
     // Check that the collection name is valid.
-    if (argv.collection.search(/^[\w][\w-]*$/) === -1){
-      logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
+    if (argv.collection.search(/^[\w][\w-]*$/) === -1) {
+      logger.fatal(
+        `\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`,
+      );
     }
 
     // background behaviors to apply
-    const behaviorOpts : {[key: string]: string | boolean} = {};
-    argv.behaviors.forEach((x: string) => behaviorOpts[x] = true);
+    const behaviorOpts: { [key: string]: string | boolean } = {};
+    argv.behaviors.forEach((x: string) => (behaviorOpts[x] = true));
     behaviorOpts.log = BEHAVIOR_LOG_FUNC;
     argv.behaviorOpts = JSON.stringify(behaviorOpts);
 
@@ -486,19 +548,21 @@ class ArgParser {
 
     if (argv.mobileDevice) {
       // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      argv.emulateDevice = (devices as Record<string, any>)[argv.mobileDevice.replace("-", " ")];
+      argv.emulateDevice = (devices as Record<string, any>)[
+        argv.mobileDevice.replace("-", " ")
+      ];
       if (!argv.emulateDevice) {
         logger.fatal("Unknown device: " + argv.mobileDevice);
       }
     } else {
-      argv.emulateDevice = {viewport: null};
+      argv.emulateDevice = { viewport: null };
     }
 
     if (argv.seedFile) {
       const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");
       const urlSeedFileList = urlSeedFile.split("\n");
 
-      if (typeof(argv.seeds) === "string") {
+      if (typeof argv.seeds === "string") {
         argv.seeds = [argv.seeds];
       }
 
@@ -530,12 +594,12 @@ class ArgParser {
     argv.scopedSeeds = [];
 
     for (let seed of argv.seeds) {
-      if (typeof(seed) === "string") {
-        seed = {url: seed};
+      if (typeof seed === "string") {
+        seed = { url: seed };
       }
 
       try {
-        argv.scopedSeeds.push(new ScopedSeed({...scopeOpts, ...seed}));
+        argv.scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...seed }));
       } catch (e) {
         if (argv.failOnFailedSeed) {
           logger.fatal(`Invalid Seed "${seed.url}" specified, aborting crawl.`);
@@ -552,7 +616,7 @@ class ArgParser {
       argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
     }
 
-    if ((argv.diskUtilization < 0 || argv.diskUtilization > 99)) {
+    if (argv.diskUtilization < 0 || argv.diskUtilization > 99) {
       argv.diskUtilization = 90;
     }
 
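validateArgs, reformatted above, serializes the enabled behaviors into a JSON string that is later injected into each page for the behaviors library. Reduced to a sketch (the log function name below is a placeholder; the real value is BEHAVIOR_LOG_FUNC from constants.js and is not shown in this diff):

    const behaviorOpts: { [key: string]: string | boolean } = {};
    ["autoplay", "autofetch"].forEach((x) => (behaviorOpts[x] = true));
    behaviorOpts.log = "__bx_log"; // placeholder for BEHAVIOR_LOG_FUNC
    console.log(JSON.stringify(behaviorOpts));
    // -> {"autoplay":true,"autofetch":true,"log":"__bx_log"}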
@@ -13,7 +13,7 @@ const BlockState = {
   BLOCK_PAGE_NAV: "page",
   BLOCK_IFRAME_NAV: "iframe",
   BLOCK_OTHER: "resource",
-  BLOCK_AD: "advertisement"
+  BLOCK_AD: "advertisement",
 };
 
 type BlockRuleDecl = {
@@ -21,30 +21,30 @@ type BlockRuleDecl = {
   frameTextMatch?: string;
   inFrameUrl?: string;
   type?: string;
-}
+};
 
 // ===========================================================================
-class BlockRule
-{
+class BlockRule {
   type: string;
   url: RegExp | null;
   frameTextMatch?: RegExp | null;
   inFrameUrl?: RegExp | null;
 
   constructor(data: string | BlockRuleDecl) {
-    if (typeof(data) === "string") {
+    if (typeof data === "string") {
       this.url = new RegExp(data);
       this.type = "block";
     } else {
       this.url = data.url ? new RegExp(data.url) : null;
-      this.frameTextMatch = data.frameTextMatch ? new RegExp(data.frameTextMatch) : null;
+      this.frameTextMatch = data.frameTextMatch
+        ? new RegExp(data.frameTextMatch)
+        : null;
       this.inFrameUrl = data.inFrameUrl ? new RegExp(data.inFrameUrl) : null;
       this.type = data.type || "block";
     }
 
     if (!RULE_TYPES.includes(this.type)) {
-      logger.fatal("Rule \"type\" must be: " + RULE_TYPES.join(", "));
+      logger.fatal('Rule "type" must be: ' + RULE_TYPES.join(", "));
     }
   }
 
@@ -59,16 +59,18 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
   }
 }
 
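Given the BlockRuleDecl shape above, a rule can pair a URL regex with frame conditions. A hypothetical rule as it might be constructed (all field values invented for illustration):

    const rule = new BlockRule({
      url: "example\\.com/ads", // regex for request URLs to block
      inFrameUrl: "example\\.com", // only applies inside frames matching this
      type: "block",
    });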
 // ===========================================================================
-export class BlockRules
-{
+export class BlockRules {
   rules: BlockRule[];
   blockPutUrl: string;
   blockErrMsg: string;
   blockedUrlSet = new Set();
 
-  constructor(blockRules: BlockRuleDecl[], blockPutUrl: string, blockErrMsg: string) {
+  constructor(
+    blockRules: BlockRuleDecl[],
+    blockPutUrl: string,
+    blockErrMsg: string,
+  ) {
     this.rules = [];
     this.blockPutUrl = blockPutUrl;
     this.blockErrMsg = blockErrMsg;
@@ -89,11 +91,15 @@ export class BlockRules
 
   async initPage(browser: Browser, page: Page) {
     const onRequest = async (request: HTTPRequest) => {
-      const logDetails = {page: page.url()};
+      const logDetails = { page: page.url() };
       try {
         await this.handleRequest(request, logDetails);
       } catch (e) {
-        logger.warn("Error handling request", {...errJSON(e), ...logDetails}, "blocking");
+        logger.warn(
+          "Error handling request",
+          { ...errJSON(e), ...logDetails },
+          "blocking",
+        );
       }
     };
     await browser.interceptRequest(page, onRequest);
@@ -113,14 +119,22 @@ export class BlockRules
       } else {
         await request.abort("blockedbyclient", 1);
       }
     } catch (e) {
-      logger.debug(`Block: (${blockState}) Failed On: ${url}`, {...errJSON(e), ...logDetails}, "blocking");
+      logger.debug(
+        `Block: (${blockState}) Failed On: ${url}`,
+        { ...errJSON(e), ...logDetails },
+        "blocking",
+      );
     }
   }
 
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  async shouldBlock(request: HTTPRequest, url: string, logDetails: Record<string, any>) {
+  async shouldBlock(
+    request: HTTPRequest,
+    url: string,
+    // TODO: Fix this the next time the file is edited.
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    logDetails: Record<string, any>,
+  ) {
     if (!url.startsWith("http:") && !url.startsWith("https:")) {
       return BlockState.ALLOW;
     }
@@ -162,14 +176,29 @@ export class BlockRules
     }
 
     for (const rule of this.rules) {
-      const {done, block} = await this.ruleCheck(rule, request, url, frameUrl, isNavReq, logDetails);
+      const { done, block } = await this.ruleCheck(
+        rule,
+        request,
+        url,
+        frameUrl,
+        isNavReq,
+        logDetails,
+      );
 
       if (block) {
         if (blockState === BlockState.BLOCK_PAGE_NAV) {
-          logger.warn("Block rule match for page request ignored, set --exclude to block full pages", {url, ...logDetails}, "blocking");
+          logger.warn(
+            "Block rule match for page request ignored, set --exclude to block full pages",
+            { url, ...logDetails },
+            "blocking",
+          );
           return BlockState.ALLOW;
         }
-        logger.debug("URL Blocked in iframe", {url, frameUrl, ...logDetails}, "blocking");
+        logger.debug(
+          "URL Blocked in iframe",
+          { url, frameUrl, ...logDetails },
+          "blocking",
+        );
         await this.recordBlockMsg(url);
         return blockState;
       }
@ -181,47 +210,75 @@ export class BlockRules
|
||||||
return BlockState.ALLOW;
|
return BlockState.ALLOW;
|
||||||
}
|
}
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
async ruleCheck(rule: BlockRule, request: HTTPRequest, reqUrl: string, frameUrl: string, isNavReq: boolean, logDetails: Record<string, any>) {
const {url, inFrameUrl, frameTextMatch} = rule;
async ruleCheck(
rule: BlockRule,
request: HTTPRequest,
reqUrl: string,
frameUrl: string,
isNavReq: boolean,
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
logDetails: Record<string, any>,
) {
const { url, inFrameUrl, frameTextMatch } = rule;

const type = rule.type || "block";
const allowOnly = (type === "allowOnly");
const allowOnly = type === "allowOnly";

// not a frame match, skip rule
if (inFrameUrl && !frameUrl.match(inFrameUrl)) {
return {block: false, done: false};
return { block: false, done: false };
}

const urlMatched = (url && reqUrl.match(url));
const urlMatched = url && reqUrl.match(url);

// if frame text-based rule: if url matched and a frame request
// frame text-based match: only applies to nav requests, never block otherwise
if (frameTextMatch) {
if (!urlMatched || !isNavReq) {
return {block: false, done: false};
return { block: false, done: false };
}

const block = await this.isTextMatch(request, reqUrl, frameTextMatch, logDetails) ? !allowOnly : allowOnly;
logger.debug("URL Conditional rule in iframe", {...logDetails, url, rule: block ? "BLOCKED" : "ALLOWED", frameUrl}, "blocking");
return {block, done: true};
const block = (await this.isTextMatch(
request,
reqUrl,
frameTextMatch,
logDetails,
))
? !allowOnly
: allowOnly;
logger.debug(
"URL Conditional rule in iframe",
{ ...logDetails, url, rule: block ? "BLOCKED" : "ALLOWED", frameUrl },
"blocking",
);
return { block, done: true };
}

// for non frame text rule, simply match by URL
const block = urlMatched ? !allowOnly : allowOnly;
return {block, done: false};
return { block, done: false };
}

// eslint-disable-next-line @typescript-eslint/no-explicit-any
async isTextMatch(request: HTTPRequest, reqUrl: string, frameTextMatch: RegExp, logDetails: Record<string, any>) {
async isTextMatch(
request: HTTPRequest,
reqUrl: string,
frameTextMatch: RegExp,
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
logDetails: Record<string, any>,
) {
try {
const res = await fetch(reqUrl);
const text = await res.text();

return !!text.match(frameTextMatch);
} catch (e) {
logger.debug("Error determining text match", {...errJSON(e), ...logDetails}, "blocking");
logger.debug(
"Error determining text match",
{ ...errJSON(e), ...logDetails },
"blocking",
);
}
}
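For readers tracing the reformatted ruleCheck above: its URL-only decision reduces to one ternary, sketched below as a standalone TypeScript snippet (the harness type and sample values are illustrative, not part of the crawler):

// Hypothetical mini-harness mirroring ruleCheck's URL-only path:
// a "block" rule blocks on match; an "allowOnly" rule blocks on non-match.
type SketchRule = { url?: RegExp; type?: "block" | "allowOnly" };

function decide(rule: SketchRule, reqUrl: string): boolean {
  const allowOnly = (rule.type || "block") === "allowOnly";
  const urlMatched = !!(rule.url && reqUrl.match(rule.url));
  return urlMatched ? !allowOnly : allowOnly; // true => block the request
}

decide({ url: /ads\./ }, "https://ads.example.com/a.js"); // true (blocked)
decide({ url: /example\.com/, type: "allowOnly" }, "https://other.net/"); // true (blocked)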
@@ -239,19 +296,29 @@ export class BlockRules
const body = this.blockErrMsg;
const putUrl = new URL(this.blockPutUrl);
putUrl.searchParams.set("url", url);
await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
await fetch(putUrl.href, {
method: "PUT",
headers: { "Content-Type": "text/html" },
body,
});
}
}

// ===========================================================================
export class AdBlockRules extends BlockRules
{
export class AdBlockRules extends BlockRules {
adhosts: string[];

constructor(blockPutUrl: string, blockErrMsg: string, adhostsFilePath = "../../ad-hosts.json") {
constructor(
blockPutUrl: string,
blockErrMsg: string,
adhostsFilePath = "../../ad-hosts.json",
) {
super([], blockPutUrl, blockErrMsg);
this.adhosts = JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url), {"encoding": "utf-8"}));
this.adhosts = JSON.parse(
fs.readFileSync(new URL(adhostsFilePath, import.meta.url), {
encoding: "utf-8",
}),
);
}

isAdUrl(url: string) {
@@ -260,10 +327,19 @@ export class AdBlockRules extends BlockRules
return domain && this.adhosts.includes(domain);
}

// eslint-disable-next-line @typescript-eslint/no-explicit-any
async shouldBlock(request: HTTPRequest, url: string, logDetails: Record<string, any>) {
async shouldBlock(
request: HTTPRequest,
url: string,
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
logDetails: Record<string, any>,
) {
if (this.isAdUrl(url)) {
logger.debug("URL blocked for being an ad", {url, ...logDetails}, "blocking");
logger.debug(
"URL blocked for being an ad",
{ url, ...logDetails },
"blocking",
);
await this.recordBlockMsg(url);
return BlockState.BLOCK_AD;
}
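A minimal usage sketch for the AdBlockRules class shown above; the endpoint and message strings are assumed values for illustration, not crawler defaults:

// Assumed config values; the crawler wires these from its own arguments.
const adBlock = new AdBlockRules(
  "http://localhost:8080/blocked", // blockPutUrl (illustrative)
  "Blocked by browsertrix-crawler", // blockErrMsg (illustrative)
);

// isAdUrl checks the request's domain against the bundled ad-hosts.json;
// shouldBlock additionally logs, records the block message via recordBlockMsg,
// and returns BlockState.BLOCK_AD for matching requests.
adBlock.isAdUrl("https://tracker.example.com/pixel.gif");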
@@ -9,28 +9,32 @@ import path from "path";
import { logger } from "./logger.js";
import { initStorage } from "./storage.js";

import puppeteer, { Frame, HTTPRequest, Page, PuppeteerLaunchOptions, Viewport } from "puppeteer-core";
import puppeteer, {
Frame,
HTTPRequest,
Page,
PuppeteerLaunchOptions,
Viewport,
} from "puppeteer-core";
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";

type LaunchOpts = {
profileUrl: string;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
chromeOptions: Record<string, any>
chromeOptions: Record<string, any>;
signals: boolean;
headless: boolean;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
emulateDevice?: Record<string, any>
emulateDevice?: Record<string, any>;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
ondisconnect?: ((err: any) => NonNullable<unknown>) | null
ondisconnect?: ((err: any) => NonNullable<unknown>) | null;
};

// ==================================================================
export class Browser
{
export class Browser {
profileDir: string;
customProfile = false;
// TODO: Fix this the next time the file is edited.
@@ -48,47 +52,58 @@ export class Browser
this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
}

async launch({profileUrl, chromeOptions, signals = false, headless = false, emulateDevice = {}, ondisconnect = null} : LaunchOpts) {
if (this.isLaunched()) {
return;
}

if (profileUrl) {
this.customProfile = await this.loadProfile(profileUrl);
}

this.emulateDevice = emulateDevice;

const args = this.chromeArgs(chromeOptions);

let defaultViewport = null;

if (process.env.GEOMETRY) {
const geom = process.env.GEOMETRY.split("x");

defaultViewport = {width: Number(geom[0]), height: Number(geom[1])};
}

const launchOpts : PuppeteerLaunchOptions = {
args,
headless: headless ? "new" : false,
executablePath: this.getBrowserExe(),
ignoreDefaultArgs: ["--enable-automation", "--hide-scrollbars"],
ignoreHTTPSErrors: true,
handleSIGHUP: signals,
handleSIGINT: signals,
handleSIGTERM: signals,
protocolTimeout: 0,

defaultViewport,
waitForInitialPage: false,
userDataDir: this.profileDir
};

await this._init(launchOpts, ondisconnect);
}

async setupPage({page} : {page: Page, cdp: CDPSession}) {
await this.addInitScript(page, "Object.defineProperty(navigator, \"webdriver\", {value: false});");
async launch({
profileUrl,
chromeOptions,
signals = false,
headless = false,
emulateDevice = {},
ondisconnect = null,
}: LaunchOpts) {
if (this.isLaunched()) {
return;
}

if (profileUrl) {
this.customProfile = await this.loadProfile(profileUrl);
}

this.emulateDevice = emulateDevice;

const args = this.chromeArgs(chromeOptions);

let defaultViewport = null;

if (process.env.GEOMETRY) {
const geom = process.env.GEOMETRY.split("x");

defaultViewport = { width: Number(geom[0]), height: Number(geom[1]) };
}

const launchOpts: PuppeteerLaunchOptions = {
args,
headless: headless ? "new" : false,
executablePath: this.getBrowserExe(),
ignoreDefaultArgs: ["--enable-automation", "--hide-scrollbars"],
ignoreHTTPSErrors: true,
handleSIGHUP: signals,
handleSIGINT: signals,
handleSIGTERM: signals,
protocolTimeout: 0,

defaultViewport,
waitForInitialPage: false,
userDataDir: this.profileDir,
};

await this._init(launchOpts, ondisconnect);
}

async setupPage({ page }: { page: Page; cdp: CDPSession }) {
await this.addInitScript(
page,
'Object.defineProperty(navigator, "webdriver", {value: false});',
);

if (this.customProfile) {
logger.info("Disabling Service Workers for profile", {}, "browser");
@@ -97,20 +112,26 @@ export class Browser
}
}

async loadProfile(profileFilename: string) : Promise<boolean> {
async loadProfile(profileFilename: string): Promise<boolean> {
const targetFilename = "/tmp/profile.tar.gz";

if (profileFilename &&
(profileFilename.startsWith("http:") || profileFilename.startsWith("https:"))) {
logger.info(`Downloading ${profileFilename} to ${targetFilename}`, {}, "browserProfile");
if (
profileFilename &&
(profileFilename.startsWith("http:") ||
profileFilename.startsWith("https:"))
) {
logger.info(
`Downloading ${profileFilename} to ${targetFilename}`,
{},
"browserProfile",
);

const resp = await fetch(profileFilename);
await pipeline(
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
Readable.fromWeb(resp.body as any),
fs.createWriteStream(targetFilename)
fs.createWriteStream(targetFilename),
);

profileFilename = targetFilename;
@@ -118,7 +139,9 @@ export class Browser
const storage = initStorage();

if (!storage) {
logger.fatal("Profile specified relative to s3 storage, but no S3 storage defined");
logger.fatal(
"Profile specified relative to s3 storage, but no S3 storage defined",
);
return false;
}

@@ -129,7 +152,9 @@ export class Browser

if (profileFilename) {
try {
child_process.execSync("tar xvfz " + profileFilename, {cwd: this.profileDir});
child_process.execSync("tar xvfz " + profileFilename, {
cwd: this.profileDir,
});
return true;
} catch (e) {
logger.error(`Profile filename ${profileFilename} not a valid tar.gz`);
@@ -140,10 +165,12 @@ export class Browser
}

saveProfile(profileFilename: string) {
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: this.profileDir});
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {
cwd: this.profileDir,
});
}

chromeArgs({proxy=true, userAgent=null, extraArgs=[]} = {}) {
chromeArgs({ proxy = true, userAgent = null, extraArgs = [] } = {}) {
// Chrome Flags, including proxy server
const args = [
// eslint-disable-next-line no-use-before-define
@@ -162,25 +189,29 @@ export class Browser

if (proxy) {
args.push("--ignore-certificate-errors");
args.push(`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`);
args.push(
`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
);
}

return args;
}

getDefaultUA() {
let version : string | undefined = process.env.BROWSER_VERSION;
let version: string | undefined = process.env.BROWSER_VERSION;

try {
const browser = this.getBrowserExe();
if (browser) {
version = child_process.execFileSync(browser, ["--version"], {encoding: "utf8"});
version = child_process.execFileSync(browser, ["--version"], {
encoding: "utf8",
});
const match = version && version.match(/[\d.]+/);
if (match) {
version = match[0];
}
}
} catch(e) {
} catch (e) {
console.error(e);
}

@@ -188,7 +219,11 @@ export class Browser
}

getBrowserExe() {
const files = [process.env.BROWSER_BIN, "/usr/bin/google-chrome", "/usr/bin/chromium-browser"];
const files = [
process.env.BROWSER_BIN,
"/usr/bin/google-chrome",
"/usr/bin/chromium-browser",
];
for (const file of files) {
if (file && fs.existsSync(file)) {
return file;
@@ -196,14 +231,25 @@ export class Browser
}
}

async evaluateWithCLI_(cdp: CDPSession, frame: Frame, cdpContextId: number, funcString: string, logData: Record<string, string>, contextName: string) {
async evaluateWithCLI_(
cdp: CDPSession,
frame: Frame,
cdpContextId: number,
funcString: string,
logData: Record<string, string>,
contextName: string,
) {
const frameUrl = frame.url();
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let details : Record<string, any> = {frameUrl, ...logData};
let details: Record<string, any> = { frameUrl, ...logData };

if (!frameUrl || frame.isDetached()) {
logger.info("Run Script Skipped, frame no longer attached or has no URL", details, contextName);
logger.info(
"Run Script Skipped, frame no longer attached or has no URL",
details,
contextName,
);
return false;
}

@@ -213,19 +259,22 @@ export class Browser
//const contextId = context._contextId;
const expression = funcString + "\n//# sourceURL=__evaluation_script__";

const { exceptionDetails, result } = await cdp
.send("Runtime.evaluate", {
expression,
contextId: cdpContextId,
returnByValue: true,
awaitPromise: true,
userGesture: true,
includeCommandLineAPI: true,
});
const { exceptionDetails, result } = await cdp.send("Runtime.evaluate", {
expression,
contextId: cdpContextId,
returnByValue: true,
awaitPromise: true,
userGesture: true,
includeCommandLineAPI: true,
});

if (exceptionDetails) {
if (exceptionDetails.stackTrace) {
details = {...exceptionDetails.stackTrace, text: exceptionDetails.text, ...details};
details = {
...exceptionDetails.stackTrace,
text: exceptionDetails.text,
...details,
};
}
logger.error("Run Script Failed", details, contextName);
} else {
@@ -256,8 +305,11 @@ export class Browser
return page.evaluateOnNewDocument(script);
}

// eslint-disable-next-line @typescript-eslint/ban-types
async _init(launchOpts: PuppeteerLaunchOptions, ondisconnect : Function | null = null) {
async _init(
launchOpts: PuppeteerLaunchOptions,
// eslint-disable-next-line @typescript-eslint/ban-types
ondisconnect: Function | null = null,
) {
this.browser = await puppeteer.launch(launchOpts);

const target = this.browser.target();
@@ -274,9 +326,10 @@ export class Browser
});
}

async newWindowPageWithCDP() : Promise<{cdp: CDPSession, page: Page}> {
async newWindowPageWithCDP(): Promise<{ cdp: CDPSession; page: Page }> {
// unique url to detect new pages
const startPage = "about:blank?_browsertrix" + Math.random().toString(36).slice(2);
const startPage =
"about:blank?_browsertrix" + Math.random().toString(36).slice(2);

const p = new Promise<Target>((resolve) => {
const listener = (target: Target) => {
@@ -298,7 +351,10 @@ export class Browser
}

try {
await this.firstCDP.send("Target.createTarget", {url: startPage, newWindow: true});
await this.firstCDP.send("Target.createTarget", {
url: startPage,
newWindow: true,
});
} catch (e) {
if (!this.browser) {
throw e;
@@ -307,7 +363,10 @@ export class Browser

this.firstCDP = await target.createCDPSession();

await this.firstCDP.send("Target.createTarget", {url: startPage, newWindow: true});
await this.firstCDP.send("Target.createTarget", {
url: startPage,
newWindow: true,
});
}

const target = await p;
|
@ -331,7 +390,7 @@ export class Browser
|
||||||
|
|
||||||
const cdp = await target.createCDPSession();
|
const cdp = await target.createCDPSession();
|
||||||
|
|
||||||
return {page, cdp};
|
return { page, cdp };
|
||||||
}
|
}
|
||||||
|
|
||||||
async serviceWorkerFetch() {
|
async serviceWorkerFetch() {
|
||||||
|
@ -348,9 +407,13 @@ export class Browser
|
||||||
|
|
||||||
if (networkId) {
|
if (networkId) {
|
||||||
try {
|
try {
|
||||||
await this.firstCDP.send("Fetch.continueResponse", {requestId});
|
await this.firstCDP.send("Fetch.continueResponse", { requestId });
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.warn("continueResponse failed", {url: request.url}, "recorder");
|
logger.warn(
|
||||||
|
"continueResponse failed",
|
||||||
|
{ url: request.url },
|
||||||
|
"recorder",
|
||||||
|
);
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -369,12 +432,20 @@ export class Browser
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!foundRecorder) {
|
if (!foundRecorder) {
|
||||||
logger.debug("Skipping URL from unknown frame", {url: request.url, frameId}, "recorder");
|
logger.debug(
|
||||||
|
"Skipping URL from unknown frame",
|
||||||
|
{ url: request.url, frameId },
|
||||||
|
"recorder",
|
||||||
|
);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await this.firstCDP.send("Fetch.continueResponse", {requestId});
|
await this.firstCDP.send("Fetch.continueResponse", { requestId });
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.warn("continueResponse failed", {url: request.url}, "recorder");
|
logger.warn(
|
||||||
|
"continueResponse failed",
|
||||||
|
{ url: request.url },
|
||||||
|
"recorder",
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
return;
|
return;
|
||||||
|
@ -383,7 +454,9 @@ export class Browser
|
||||||
await foundRecorder.handleRequestPaused(params, this.firstCDP, true);
|
await foundRecorder.handleRequestPaused(params, this.firstCDP, true);
|
||||||
});
|
});
|
||||||
|
|
||||||
await this.firstCDP.send("Fetch.enable", {patterns: [{urlPattern: "*", requestStage: "Response"}]});
|
await this.firstCDP.send("Fetch.enable", {
|
||||||
|
patterns: [{ urlPattern: "*", requestStage: "Response" }],
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Fix this the next time the file is edited.
|
// TODO: Fix this the next time the file is edited.
|
||||||
|
@@ -395,21 +468,28 @@ export class Browser
funcString: string,
// eslint-disable-next-line @typescript-eslint/no-explicit-any
logData: Record<string, any>,
contextName: string
contextName: string,
) {
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const context = await (frame as any).executionContext();
cdp = context._client;
const cdpContextId = context._contextId;
return await this.evaluateWithCLI_(cdp, frame, cdpContextId, funcString, logData, contextName);
return await this.evaluateWithCLI_(
cdp,
frame,
cdpContextId,
funcString,
logData,
contextName,
);
}

interceptRequest(page: Page, callback: (event: HTTPRequest) => void) {
page.on("request", callback);
}

async waitForNetworkIdle(page: Page, params: {timeout?: number}) {
async waitForNetworkIdle(page: Page, params: { timeout?: number }) {
return await page.waitForNetworkIdle(params);
}

@@ -428,7 +508,6 @@ export class Browser
}
}

// ==================================================================
// Default Chromium args from playwright
export const defaultArgs = [
@@ -470,5 +549,5 @@ export const defaultArgs = [
"--apps-gallery-url=https://invalid.webstore.example.com/",
"--apps-gallery-update-url=https://invalid.webstore.example.com/",
"--component-updater=url-source=http://invalid.dev/",
"--brave-stats-updater-server=url-source=http://invalid.dev/"
"--brave-stats-updater-server=url-source=http://invalid.dev/",
];
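To make the LaunchOpts shape above concrete, a sketch of launching the wrapper (the field values are assumptions for illustration, not crawler defaults):

const browser = new Browser();

await browser.launch({
  profileUrl: "", // may also be an http(s) URL or S3-relative path to a profile tar.gz
  chromeOptions: { proxy: false, extraArgs: [] }, // forwarded to chromeArgs()
  signals: true, // let puppeteer handle SIGHUP/SIGINT/SIGTERM
  headless: true,
  emulateDevice: {},
  ondisconnect: (err) => ({ err }), // must return a non-nullable value
});

const { page, cdp } = await browser.newWindowPageWithCDP();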
@@ -1,15 +1,24 @@
export const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
export const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
export const HTML_TYPES = [
"text/html",
"application/xhtml",
"application/xhtml+xml",
];
export const WAIT_UNTIL_OPTS = [
"load",
"domcontentloaded",
"networkidle0",
"networkidle2",
];
export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];

export const BEHAVIOR_LOG_FUNC = "__bx_log";
export const ADD_LINK_FUNC = "__bx_addLink";
export const MAX_DEPTH = 1000000;

export const DEFAULT_SELECTORS = [{
selector: "a[href]",
extract: "href",
isAttribute: false
}];
export const DEFAULT_SELECTORS = [
{
selector: "a[href]",
extract: "href",
isAttribute: false,
},
];
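A sketch of how a DEFAULT_SELECTORS entry drives link extraction in page context (simplified; the crawler's in-page logic handles the extract/isAttribute fields generically):

// Equivalent of { selector: "a[href]", extract: "href" } applied directly:
const links = Array.from(
  document.querySelectorAll<HTMLAnchorElement>("a[href]"),
).map((el) => el.href);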
@@ -3,11 +3,17 @@ import path from "path";

const MAX_DEPTH = 2;

export function collectAllFileSources(fileOrDir: string, ext?: string, depth = 0) : string[] {
export function collectAllFileSources(
fileOrDir: string,
ext?: string,
depth = 0,
): string[] {
const resolvedPath = path.resolve(fileOrDir);

if (depth >= MAX_DEPTH) {
console.warn(`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`);
console.warn(
`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
);
return [];
}

@@ -27,7 +33,9 @@ export function collectAllFileSources(fileOrDir: string, ext?: string, depth = 0
}

if (depth === 0) {
console.warn(`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`);
console.warn(
`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`,
);
}

return [];
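A usage sketch for collectAllFileSources, with an assumed directory layout:

// Recurses at most MAX_DEPTH (2) levels, warning past that; returns one
// string per matching .js file found under the given path.
const sources = collectAllFileSources("/custom-behaviors", ".js");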
@@ -2,10 +2,8 @@ import http from "http";
import url from "url";
import { logger } from "./logger.js";

// ===========================================================================
export class HealthChecker
{
export class HealthChecker {
port: number;
errorThreshold: number;
healthServer: http.Server;
@@ -16,7 +14,9 @@ export class HealthChecker
this.port = port;
this.errorThreshold = errorThreshold;

this.healthServer = http.createServer((...args) => this.healthCheck(...args));
this.healthServer = http.createServer((...args) =>
this.healthCheck(...args),
);
logger.info(`Healthcheck server started on ${port}`, {}, "healthcheck");
this.healthServer.listen(port);
}
@@ -24,23 +24,35 @@ export class HealthChecker
async healthCheck(req: http.IncomingMessage, res: http.ServerResponse) {
const pathname = req.url ? url.parse(req.url).pathname : "";
switch (pathname) {
case "/healthz":
if (this.errorCount < this.errorThreshold) {
logger.debug(`health check ok, num errors ${this.errorCount} < ${this.errorThreshold}`, {}, "healthcheck");
res.writeHead(200);
res.end();
}
return;
logger.debug(
`health check ok, num errors ${this.errorCount} < ${this.errorThreshold}`,
{},
"healthcheck",
);
res.writeHead(200);
res.end();
}
return;
}

logger.error(`health check failed: ${this.errorCount} >= ${this.errorThreshold}`, {}, "healthcheck");
logger.error(
`health check failed: ${this.errorCount} >= ${this.errorThreshold}`,
{},
"healthcheck",
);
res.writeHead(503);
res.end();
}

resetErrors() {
if (this.errorCount > 0) {
logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`, {}, "healthcheck");
logger.info(
`Page loaded, resetting error count ${this.errorCount} to 0`,
{},
"healthcheck",
);
this.errorCount = 0;
}
}
@@ -49,4 +61,3 @@ export class HealthChecker
this.errorCount++;
}
}
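A probe sketch for the /healthz endpoint above (the port is whatever the checker was constructed with; 6065 here is illustrative):

// 200 while errorCount < errorThreshold, 503 once the threshold is reached.
const res = await fetch("http://localhost:6065/healthz");
console.log(res.status === 200 ? "healthy" : "unhealthy");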
@@ -5,30 +5,29 @@ import { Writable } from "node:stream";
import { RedisCrawlState } from "./state.js";

// RegExp.prototype.toJSON = RegExp.prototype.toString;
Object.defineProperty(RegExp.prototype, "toJSON", { value: RegExp.prototype.toString });
Object.defineProperty(RegExp.prototype, "toJSON", {
value: RegExp.prototype.toString,
});

// ===========================================================================
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export function errJSON(e: any) {
if (e instanceof Error) {
return {"type": "exception", "message": e.message, "stack": e.stack};
return { type: "exception", message: e.message, stack: e.stack };
} else {
return {"message": e.toString()};
return { message: e.toString() };
}
}

// ===========================================================================
class Logger
{
logStream : Writable | null = null;
class Logger {
logStream: Writable | null = null;
debugLogging = false;
logErrorsToRedis = false;
logLevels : string[] = [];
logLevels: string[] = [];
contexts : string[] = [];
contexts: string[] = [];
crawlState? : RedisCrawlState | null = null;
crawlState?: RedisCrawlState | null = null;
fatalExitCode = 17;

setDefaultFatalExitCode(exitCode: number) {
@@ -66,12 +65,12 @@ class Logger
// eslint-disable-next-line @typescript-eslint/no-explicit-any
data: Record<string, string> | Error | any,
context: string,
logLevel="info"
logLevel = "info",
) {
if (data instanceof Error) {
data = errJSON(data);
} else if (typeof data !== "object") {
data = {"message": data.toString()};
data = { message: data.toString() };
}

if (this.logLevels.length) {
@@ -87,11 +86,11 @@ class Logger
}

const dataToLog = {
"timestamp": new Date().toISOString(),
timestamp: new Date().toISOString(),
"logLevel": logLevel,
logLevel: logLevel,
"context": context,
context: context,
"message": message,
message: message,
"details": data ? data : {}
details: data ? data : {},
};
const string = JSON.stringify(dataToLog);
console.log(string);
@@ -100,30 +99,34 @@ class Logger
}

const toLogToRedis = ["error", "fatal"];
if (this.logErrorsToRedis && this.crawlState && toLogToRedis.includes(logLevel)) {
if (
this.logErrorsToRedis &&
this.crawlState &&
toLogToRedis.includes(logLevel)
) {
this.crawlState.logError(string);
}
}

info(message: string, data={}, context="general") {
info(message: string, data = {}, context = "general") {
this.logAsJSON(message, data, context);
}

error(message: string, data={}, context="general") {
error(message: string, data = {}, context = "general") {
this.logAsJSON(message, data, context, "error");
}

warn(message: string, data={}, context="general") {
warn(message: string, data = {}, context = "general") {
this.logAsJSON(message, data, context, "warn");
}

debug(message: string, data={}, context="general") {
debug(message: string, data = {}, context = "general") {
if (this.debugLogging) {
this.logAsJSON(message, data, context, "debug");
}
}

fatal(message: string, data={}, context="general", exitCode=0) {
fatal(message: string, data = {}, context = "general", exitCode = 0) {
exitCode = exitCode || this.fatalExitCode;
this.logAsJSON(`${message}. Quitting`, data, context, "fatal");

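For reference, the single-line JSON that logAsJSON above prints for each call, following the dataToLog shape in this diff (the timestamp is illustrative):

logger.warn("retrying page", { retries: 2 }, "pageStatus");
// {"timestamp":"2023-11-09T12:00:00.000Z","logLevel":"warn","context":"pageStatus","message":"retrying page","details":{"retries":2}}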
@@ -2,9 +2,8 @@ import { HTTPRequest, Page } from "puppeteer-core";
import { errJSON, logger } from "./logger.js";
import { Browser } from "./browser.js";

export class OriginOverride
{
originOverride: {origUrl: URL, destUrl: URL}[];
export class OriginOverride {
originOverride: { origUrl: URL; destUrl: URL }[];

constructor(originOverride: string[]) {
this.originOverride = originOverride.map((override) => {
@@ -12,7 +11,7 @@ export class OriginOverride
const origUrl = new URL(orig);
const destUrl = new URL(dest);

return {origUrl, destUrl};
return { origUrl, destUrl };
});
}

@@ -24,7 +23,7 @@ export class OriginOverride
let newUrl = null;
let orig = null;

for (const {origUrl, destUrl} of this.originOverride) {
for (const { origUrl, destUrl } of this.originOverride) {
if (url.startsWith(origUrl.origin)) {
newUrl = destUrl.origin + url.slice(origUrl.origin.length);
orig = origUrl;
@@ -44,18 +43,25 @@ export class OriginOverride
headers.set("origin", orig.origin);
}

const resp = await fetch(newUrl, {headers});
const resp = await fetch(newUrl, { headers });

const body = Buffer.from(await resp.arrayBuffer());
const respHeaders = Object.fromEntries(resp.headers);
const status = resp.status;

logger.debug("Origin overridden", {orig: url, dest: newUrl, status, body: body.length}, "originoverride");
request.respond({body, headers: respHeaders, status}, -1);
logger.debug(
"Origin overridden",
{ orig: url, dest: newUrl, status, body: body.length },
"originoverride",
);

request.respond({ body, headers: respHeaders, status }, -1);
} catch (e) {
logger.warn("Error overriding origin", {...errJSON(e), url: page.url()}, "originoverride");
logger.warn(
"Error overriding origin",
{ ...errJSON(e), url: page.url() },
"originoverride",
);
request.continue({}, -1);
}
};
File diff suppressed because it is too large
@@ -14,14 +14,13 @@ console.error = function (...args) {
typeof args[0] === "string" &&
args[0].indexOf("[ioredis] Unhandled error event") === 0
) {

const now = Date.now();

if ((now - lastLogTime) > REDIS_ERROR_LOG_INTERVAL_SECS) {
if (now - lastLogTime > REDIS_ERROR_LOG_INTERVAL_SECS) {
if (lastLogTime && exitOnError) {
logger.fatal("Crawl interrupted, redis gone, exiting", {}, "redis");
}
logger.warn("ioredis error", {error: args[0]}, "redis");
logger.warn("ioredis error", { error: args[0] }, "redis");
lastLogTime = now;
}
return;
@@ -30,7 +29,7 @@ console.error = function (...args) {
};

export async function initRedis(url: string) {
const redis = new Redis(url, {lazyConnect: true});
const redis = new Redis(url, { lazyConnect: true });
await redis.connect();
return redis;
}
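A usage sketch for initRedis; lazyConnect means the Redis constructor does not open a connection, so failures surface at the explicit connect() call inside initRedis rather than in the constructor:

const redis = await initRedis("redis://localhost:6379/0"); // URL illustrative
await redis.set("crawl:status", "running");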
@@ -7,10 +7,8 @@ const CONTENT_LENGTH = "content-length";
const CONTENT_TYPE = "content-type";
const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"];

// ===========================================================================
export class RequestResponseInfo
{
export class RequestResponseInfo {
_created: Date = new Date();

requestId: string;
@@ -33,7 +31,7 @@ export class RequestResponseInfo
statusText?: string;

responseHeaders?: Record<string, string>;
responseHeadersList?: {name: string, value: string}[];
responseHeadersList?: { name: string; value: string }[];
responseHeadersText?: string;

payload?: Uint8Array;
@@ -79,7 +77,6 @@ export class RequestResponseInfo
if (params.type) {
this.resourceType = params.type;
}

}

// TODO: Fix this the next time the file is edited.
@@ -100,7 +97,12 @@ export class RequestResponseInfo

fillResponse(response: Protocol.Network.Response) {
// if initial fetch was a 200, but now replacing with 304, don't!
if (response.status == 304 && this.status && this.status != 304 && this.url) {
if (
response.status == 304 &&
this.status &&
this.status != 304 &&
this.url
) {
return;
}

@@ -127,9 +129,13 @@ export class RequestResponseInfo
this.fromServiceWorker = !!response.fromServiceWorker;

if (response.securityDetails) {
const issuer : string = response.securityDetails.issuer || "";
const ctc : string = response.securityDetails.certificateTransparencyCompliance === "compliant" ? "1" : "0";
this.extraOpts.cert = {issuer, ctc};
const issuer: string = response.securityDetails.issuer || "";
const ctc: string =
response.securityDetails.certificateTransparencyCompliance ===
"compliant"
? "1"
: "0";
this.extraOpts.cert = { issuer, ctc };
}
}

@@ -161,7 +167,6 @@ export class RequestResponseInfo
this.responseHeaders = Object.fromEntries(response.headers);
this.status = response.status;
this.statusText = response.statusText || getStatusText(this.status);

}

// TODO: Fix this the next time the file is edited.
@@ -175,7 +180,10 @@ export class RequestResponseInfo

if (this.responseHeaders) {
for (const header of Object.keys(this.responseHeaders)) {
headers += `${header}: ${this.responseHeaders[header].replace(/\n/g, ", ")}\r\n`;
headers += `${header}: ${this.responseHeaders[header].replace(
/\n/g,
", ",
)}\r\n`;
}
}
headers += "\r\n";
@@ -191,10 +199,18 @@ export class RequestResponseInfo
}

getResponseHeadersDict(length = 0) {
return this._getHeadersDict(this.responseHeaders, this.responseHeadersList, length);
return this._getHeadersDict(
this.responseHeaders,
this.responseHeadersList,
length,
);
}

_getHeadersDict(headersDict?: Record<string, string>, headersList?: {name: string, value: string}[], actualContentLength = 0) {
_getHeadersDict(
headersDict?: Record<string, string>,
headersList?: { name: string; value: string }[],
actualContentLength = 0,
) {
if (!headersDict && headersList) {
headersDict = {};

@@ -9,12 +9,13 @@ import { Duplex } from "stream";
import { CDPSession, Page } from "puppeteer-core";
import { WorkerId } from "./state.js";

const indexHTML = fs.readFileSync(new URL("../../html/screencast.html", import.meta.url), {encoding: "utf8"});
const indexHTML = fs.readFileSync(
new URL("../../html/screencast.html", import.meta.url),
{ encoding: "utf8" },
);

// ===========================================================================
class WSTransport
{
class WSTransport {
allWS = new Set<WebSocket>();
// eslint-disable-next-line no-use-before-define
caster!: ScreenCaster;
@@ -23,7 +24,6 @@ class WSTransport
// eslint-disable-next-line @typescript-eslint/no-explicit-any
httpServer: any;

constructor(port: number) {
this.allWS = new Set();

@@ -31,16 +31,21 @@ class WSTransport

this.wss.on("connection", (ws: WebSocket) => this.initWebSocket(ws));

this.httpServer = http.createServer((...args) => this.handleRequest(...args));
this.httpServer.on("upgrade", (request: IncomingMessage, socket: Duplex, head: Buffer) => {
const pathname = url.parse(request.url || "").pathname;
this.httpServer = http.createServer((...args) =>
this.handleRequest(...args),
);
this.httpServer.on(
"upgrade",
(request: IncomingMessage, socket: Duplex, head: Buffer) => {
const pathname = url.parse(request.url || "").pathname;

if (pathname === "/ws") {
this.wss.handleUpgrade(request, socket, head, (ws) => {
this.wss.emit("connection", ws, request);
});
}
});
},
);

this.httpServer.listen(port);
}
@@ -48,13 +53,13 @@ class WSTransport
async handleRequest(req: IncomingMessage, res: ServerResponse) {
const pathname = url.parse(req.url || "").pathname;
switch (pathname) {
case "/":
res.writeHead(200, {"Content-Type": "text/html"});
res.writeHead(200, { "Content-Type": "text/html" });
res.end(indexHTML);
return;
}

res.writeHead(404, {"Content-Type": "text/html"});
res.writeHead(404, { "Content-Type": "text/html" });
res.end("Not Found");
}

@@ -65,7 +70,11 @@ class WSTransport

this.allWS.add(ws);

logger.debug("New Screencast Conn", {total: this.allWS.size}, "screencast");
logger.debug(
"New Screencast Conn",
{ total: this.allWS.size },
"screencast",
);

if (this.allWS.size === 1) {
this.caster.startCastAll();
@@ -95,10 +104,8 @@ class WSTransport
}
}

// ===========================================================================
class RedisPubSubTransport
{
class RedisPubSubTransport {
numConnections: number = 0;
castChannel: string;
// eslint-disable-next-line no-use-before-define
@@ -128,23 +135,23 @@ class RedisPubSubTransport
}

switch (message) {
case "connect":
this.numConnections++;
if (this.numConnections === 1) {
this.caster.startCastAll();
} else {
for (const packet of this.caster.iterCachedData()) {
await this.sendAll(packet);
}
}
break;

case "disconnect":
this.numConnections--;
if (this.numConnections === 0) {
this.caster.stopCastAll();
}
break;
}
});
}
@@ -157,14 +164,12 @@ class RedisPubSubTransport

async isActive() {
const result = await this.redis.pubsub("numsub", this.castChannel);
return (result.length > 1 ? result[1] > 0: false);
return result.length > 1 ? result[1] > 0 : false;
}
}

// ===========================================================================
class ScreenCaster
{
class ScreenCaster {
transport: WSTransport;
caches = new Map<WorkerId, string>();
urls = new Map<WorkerId, string>();
@@ -173,7 +178,7 @@ class ScreenCaster
maxHeight = 480;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
initMsg: {[key: string]: any};
initMsg: { [key: string]: any };

constructor(transport: WSTransport, numWorkers: number) {
this.transport = transport;
@@ -183,7 +188,7 @@ class ScreenCaster
msg: "init",
width: this.maxWidth,
height: this.maxHeight,
browsers: numWorkers
browsers: numWorkers,
};
}

@@ -193,7 +198,7 @@ class ScreenCaster
for (const id of this.caches.keys()) {
const data = this.caches.get(id);
const url = this.urls.get(id);
yield {msg, id, url, data};
yield { msg, id, url, data };
}
}

@@ -202,7 +207,7 @@ class ScreenCaster

// shouldn't happen, getting duplicate cdp
if (this.cdps.get(id) === cdp) {
logger.warn("worker already registered", {workerid: id}, "screencast");
logger.warn("worker already registered", { workerid: id }, "screencast");
return;
}

@@ -215,19 +220,19 @@ class ScreenCaster
const sessionId = resp.sessionId;
const url = page.url();

logger.debug("screencastFrame", {workerid: id, url}, "screencast");
logger.debug("screencastFrame", { workerid: id, url }, "screencast");

// keep previous data cached if just showing about:blank
if (url && !url.startsWith("about:blank")) {
this.caches.set(id, data);
this.urls.set(id, url);

await this.transport.sendAll({msg, id, data, url});
await this.transport.sendAll({ msg, id, data, url });
}

try {
await cdp.send("Page.screencastFrameAck", {sessionId});
await cdp.send("Page.screencastFrameAck", { sessionId });
} catch(e) {
} catch (e) {
//console.log("Ack Failed, probably window/tab already closed", e);
}
});
@@ -243,7 +248,7 @@ class ScreenCaster
}
}

async stopById(id: WorkerId, sendClose=false) {
async stopById(id: WorkerId, sendClose = false) {
this.caches.delete(id);
this.urls.delete(id);

@@ -258,7 +263,7 @@ class ScreenCaster
}

if (sendClose) {
await this.transport.sendAll({msg: "close", id});
await this.transport.sendAll({ msg: "close", id });
}

this.cdps.delete(id);
@@ -275,9 +280,14 @@ class ScreenCaster
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(cdp as any)._startedCast = true;

logger.info("Started Screencast", {workerid: id}, "screencast");
logger.info("Started Screencast", { workerid: id }, "screencast");

await cdp.send("Page.startScreencast", {format: "png", everyNthFrame: 1, maxWidth: this.maxWidth, maxHeight: this.maxHeight});
await cdp.send("Page.startScreencast", {
format: "png",
everyNthFrame: 1,
maxWidth: this.maxWidth,
maxHeight: this.maxHeight,
});
}

async stopCast(cdp: CDPSession, id: WorkerId) {
@@ -291,7 +301,7 @@ class ScreenCaster
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(cdp as any)._startedCast = false;

logger.info("Stopping Screencast", {workerid: id}, "screencast");
logger.info("Stopping Screencast", { workerid: id }, "screencast");

try {
await cdp.send("Page.stopScreencast");
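A viewer-side sketch for the screencast transport above; the port and any packet fields beyond those visible in this diff are assumptions:

const ws = new WebSocket("ws://localhost:9037/ws"); // port illustrative
ws.onmessage = (event) => {
  const packet = JSON.parse(String(event.data));
  // initMsg:      { msg: "init", width, height, browsers }
  // frame packet: { msg, id, url, data } with data holding the encoded frame
  // close packet: { msg: "close", id }
};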
@@ -4,31 +4,30 @@ import { WARCResourceWriter } from "./warcresourcewriter.js";
import { logger, errJSON } from "./logger.js";
import { Browser } from "./browser.js";

// ============================================================================

type ScreenShotType = {
type: string;
omitBackground: boolean;
fullPage: boolean;
}
};

export const screenshotTypes : Record<string, ScreenShotType> = {
export const screenshotTypes: Record<string, ScreenShotType> = {
"view": {
view: {
type: "png",
omitBackground: true,
fullPage: false
fullPage: false,
},
"thumbnail": {
thumbnail: {
type: "jpeg",
omitBackground: true,
fullPage: false
fullPage: false,
},
"fullPage": {
fullPage: {
type: "png",
omitBackground: true,
fullPage: true
fullPage: true,
}
},
};

export class Screenshots extends WARCResourceWriter {
@@ -40,22 +39,35 @@ export class Screenshots extends WARCResourceWriter {
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
constructor(opts: any) {
super({...opts, warcName: "screenshots.warc.gz"});
super({ ...opts, warcName: "screenshots.warc.gz" });
this.browser = opts.browser;
this.page = opts.page;
}

async take(screenshotType="view") {
async take(screenshotType = "view") {
try {
if (screenshotType !== "fullPage") {
await this.browser.setViewport(this.page, {width: 1920, height: 1080});
await this.browser.setViewport(this.page, {
width: 1920,
height: 1080,
});
}
const options = screenshotTypes[screenshotType];
const screenshotBuffer = await this.page.screenshot(options);
await this.writeBufferToWARC(screenshotBuffer, screenshotType, "image/" + options.type);
logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
await this.writeBufferToWARC(
screenshotBuffer,
screenshotType,
"image/" + options.type,
);
logger.info(
`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`,
);
} catch (e) {
logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
logger.error(
"Taking screenshot failed",
{ page: this.url, type: screenshotType, ...errJSON(e) },
"screenshots",
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -66,17 +78,27 @@ export class Screenshots extends WARCResourceWriter {
|
||||||
async takeThumbnail() {
|
async takeThumbnail() {
|
||||||
const screenshotType = "thumbnail";
|
const screenshotType = "thumbnail";
|
||||||
try {
|
try {
|
||||||
await this.browser.setViewport(this.page, {width: 1920, height: 1080});
|
await this.browser.setViewport(this.page, { width: 1920, height: 1080 });
|
||||||
const options = screenshotTypes[screenshotType];
|
const options = screenshotTypes[screenshotType];
|
||||||
const screenshotBuffer = await this.page.screenshot(options);
|
const screenshotBuffer = await this.page.screenshot(options);
|
||||||
const thumbnailBuffer = await sharp(screenshotBuffer)
|
const thumbnailBuffer = await sharp(screenshotBuffer)
|
||||||
// 16:9 thumbnail
|
// 16:9 thumbnail
|
||||||
.resize(640, 360)
|
.resize(640, 360)
|
||||||
.toBuffer();
|
.toBuffer();
|
||||||
await this.writeBufferToWARC(thumbnailBuffer, screenshotType, "image/" + options.type);
|
await this.writeBufferToWARC(
|
||||||
logger.info(`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`);
|
thumbnailBuffer,
|
||||||
|
screenshotType,
|
||||||
|
"image/" + options.type,
|
||||||
|
);
|
||||||
|
logger.info(
|
||||||
|
`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`,
|
||||||
|
);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
|
logger.error(
|
||||||
|
"Taking screenshot failed",
|
||||||
|
{ page: this.url, type: screenshotType, ...errJSON(e) },
|
||||||
|
"screenshots",
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|

@@ -10,8 +10,7 @@ type ScopeType =
   | "any"
   | "custom";

-export class ScopedSeed
-{
+export class ScopedSeed {
   url: string;
   scopeType: ScopeType;
   include: RegExp[];

@@ -24,11 +23,25 @@ export class ScopedSeed
   maxExtraHops = 0;
   maxDepth = 0;

-  constructor(
-    {url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} :
-    {url: string, scopeType: ScopeType, include: string[], exclude?: string[], allowHash?: boolean, depth?: number, sitemap?: string | boolean | null, extraHops?: number}
-  ) {
+  constructor({
+    url,
+    scopeType,
+    include,
+    exclude = [],
+    allowHash = false,
+    depth = -1,
+    sitemap = false,
+    extraHops = 0,
+  }: {
+    url: string;
+    scopeType: ScopeType;
+    include: string[];
+    exclude?: string[];
+    allowHash?: boolean;
+    depth?: number;
+    sitemap?: string | boolean | null;
+    extraHops?: number;
+  }) {
     const parsedUrl = this.parseUrl(url);
     if (!parsedUrl) {
       throw new Error("Invalid URL");

@@ -43,7 +56,10 @@ export class ScopedSeed
     }

     if (this.scopeType !== "custom") {
-      const [includeNew, allowHashNew] = this.scopeFromType(this.scopeType, parsedUrl);
+      const [includeNew, allowHashNew] = this.scopeFromType(
+        this.scopeType,
+        parsedUrl,
+      );
       this.include = [...includeNew, ...this.include];
       allowHash = allowHashNew;
     }

@@ -63,13 +79,13 @@ export class ScopedSeed
   //parseRx(value? : union[string[], string, RegExp[]]) -> RegExp[] {
   // TODO: Fix this the next time the file is edited.
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  parseRx(value : any) {
+  parseRx(value: any) {
     if (value === null || value === undefined || value === "") {
       return [];
     } else if (!(value instanceof Array)) {
       return [new RegExp(value)];
     } else {
-      return value.map(e => (e instanceof RegExp) ? e : new RegExp(e));
+      return value.map((e) => (e instanceof RegExp ? e : new RegExp(e)));
     }
   }

@@ -97,24 +113,27 @@ export class ScopedSeed
     try {
      parsedUrl = new URL(url.trim());
     } catch (e) {
-      logger.warn("Invalid Page - not a valid URL", {url, ...logDetails});
+      logger.warn("Invalid Page - not a valid URL", { url, ...logDetails });
       return null;
     }

     if (parsedUrl.protocol !== "http:" && parsedUrl.protocol != "https:") {
-      logger.warn("Invalid Page - URL must start with http:// or https://", {url, ...logDetails});
+      logger.warn("Invalid Page - URL must start with http:// or https://", {
+        url,
+        ...logDetails,
+      });
       parsedUrl = null;
     }

     return parsedUrl;
   }

-  resolveSiteMap(sitemap: boolean | string | null) : string | null {
+  resolveSiteMap(sitemap: boolean | string | null): string | null {
     if (sitemap === true) {
       const url = new URL(this.url);
       url.pathname = "/sitemap.xml";
       return url.href;
-    } else if (typeof(sitemap) === "string") {
+    } else if (typeof sitemap === "string") {
       const url = new URL(sitemap, this.url);
       return url.href;
     }

@@ -122,42 +141,68 @@ export class ScopedSeed
     return null;
   }

-  scopeFromType(scopeType: ScopeType, parsedUrl: URL) : [RegExp[], boolean] {
-    let include : RegExp[] = [];
+  scopeFromType(scopeType: ScopeType, parsedUrl: URL): [RegExp[], boolean] {
+    let include: RegExp[] = [];
     let allowHash = false;

     switch (scopeType) {
       case "page":
         include = [];
         break;

       case "page-spa":
         // allow scheme-agnostic URLS as likely redirects
-        include = [new RegExp("^" + urlRxEscape(parsedUrl.href, parsedUrl) + "#.+")];
-        allowHash = true;
-        break;
+        include = [
+          new RegExp("^" + urlRxEscape(parsedUrl.href, parsedUrl) + "#.+"),
+        ];
+        allowHash = true;
+        break;

       case "prefix":
-        include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + parsedUrl.pathname.slice(0, parsedUrl.pathname.lastIndexOf("/") + 1), parsedUrl))];
-        break;
+        include = [
+          new RegExp(
+            "^" +
+              urlRxEscape(
+                parsedUrl.origin +
+                  parsedUrl.pathname.slice(
+                    0,
+                    parsedUrl.pathname.lastIndexOf("/") + 1,
+                  ),
+                parsedUrl,
+              ),
+          ),
+        ];
+        break;

       case "host":
-        include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl))];
-        break;
+        include = [
+          new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl)),
+        ];
+        break;

       case "domain":
         if (parsedUrl.hostname.startsWith("www.")) {
           parsedUrl.hostname = parsedUrl.hostname.replace("www.", "");
         }
-        include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl).replace("\\/\\/", "\\/\\/([^/]+\\.)*"))];
-        break;
+        include = [
+          new RegExp(
+            "^" +
+              urlRxEscape(parsedUrl.origin + "/", parsedUrl).replace(
+                "\\/\\/",
+                "\\/\\/([^/]+\\.)*",
+              ),
+          ),
+        ];
+        break;

       case "any":
         include = [/.*/];
         break;

       default:
-        logger.fatal(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`);
+        logger.fatal(
+          `Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`,
+        );
     }

     return [include, allowHash];

@@ -221,7 +266,7 @@ export class ScopedSeed
       }
     }

-    return {url, isOOS};
+    return { url, isOOS };
   }
 }

@@ -232,7 +277,3 @@ export function rxEscape(string: string) {
 export function urlRxEscape(url: string, parsedUrl: URL) {
   return rxEscape(url).replace(parsedUrl.protocol, "https?:");
 }
-
-
-
-

@@ -6,7 +6,6 @@ import { MAX_DEPTH } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";

-
 // ============================================================================
 export enum LoadState {
   FAILED = 0,

@@ -16,7 +15,6 @@ export enum LoadState {
   BEHAVIORS_DONE = 4,
 }

-
 // ============================================================================
 export enum QueueState {
   ADDED = 0,

@@ -24,14 +22,11 @@ export enum QueueState {
   DUPE_URL = 2,
 }

-
 // ============================================================================
 export type WorkerId = number;

-
 // ============================================================================
-export class PageState
-{
+export class PageState {
   url: string;
   seedId: number;
   depth: number;

@@ -53,11 +48,16 @@ export class PageState

   skipBehaviors = false;
   filteredFrames: Frame[] = [];
-  loadState : LoadState = LoadState.FAILED;
+  loadState: LoadState = LoadState.FAILED;

   logDetails = {};

-  constructor(redisData: {url: string, seedId: number, depth: number, extraHops: number}) {
+  constructor(redisData: {
+    url: string;
+    seedId: number;
+    depth: number;
+    extraHops: number;
+  }) {
     this.url = redisData.url;
     this.seedId = redisData.seedId;
     this.depth = redisData.depth;

@@ -78,10 +78,7 @@ declare module "ioredis" {
       limit: number,
     ): Result<number, Context>;

-    getnext(
-      qkey: string,
-      pkey: string,
-    ): Result<string, Context>;
+    getnext(qkey: string, pkey: string): Result<string, Context>;

     markstarted(
       pkey: string,

@@ -103,7 +100,7 @@ declare module "ioredis" {
     unlockpending(
       pkeyUrl: string,
       uid: string,
-      callback?: Callback<string>
+      callback?: Callback<string>,
     ): Result<void, Context>;

     requeue(

@@ -113,13 +110,11 @@ declare module "ioredis" {
       url: string,
       maxRetryPending: number,
     ): Result<number, Context>;
-
   }
 }

 // ============================================================================
-export class RedisCrawlState
-{
+export class RedisCrawlState {
   redis: Redis;
   maxRetryPending = 1;
   _lastSize = 0;

@@ -138,8 +133,6 @@ export class RedisCrawlState
   constructor(redis: Redis, key: string, maxPageTime: number, uid: string) {
     this.redis = redis;
-
-

     this.uid = uid;
     this.key = key;
     this.maxPageTime = maxPageTime;

@@ -172,7 +165,7 @@ end
 redis.call('zadd', KEYS[2], ARGV[2], ARGV[3]);
 redis.call('hdel', KEYS[1], ARGV[1]);
 return 0;
-`
+`,
     });

     redis.defineCommand("getnext", {

@@ -187,7 +180,7 @@ if json then
 end

 return json;
-`
+`,
     });

     redis.defineCommand("markstarted", {

@@ -203,7 +196,7 @@ if json then
   redis.call('setex', KEYS[2], ARGV[3], ARGV[4]);
 end

-`
+`,
     });

     redis.defineCommand("unlockpending", {

@@ -215,7 +208,7 @@ if value == ARGV[1] then
   redis.call('del', KEYS[1])
 end

-`
+`,
     });

     redis.defineCommand("movefailed", {

@@ -232,7 +225,7 @@ if json then
   redis.call('hdel', KEYS[1], ARGV[1]);
 end

-`
+`,
     });

     redis.defineCommand("requeue", {

@@ -255,9 +248,8 @@ if not res then
   end
 end
 return 0;
-`
+`,
     });
-
   }

   async _getNext() {

@@ -271,7 +263,14 @@ return 0;
   async markStarted(url: string) {
     const started = this._timestamp();

-    return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.maxPageTime, this.uid);
+    return await this.redis.markstarted(
+      this.pkey,
+      this.pkey + ":" + url,
+      url,
+      started,
+      this.maxPageTime,
+      this.uid,
+    );
   }

   async markFinished(url: string) {

@@ -292,21 +291,24 @@ return 0;
     await this.redis.srem(this.skey, url);
   }

-  recheckScope(data: {url: string, depth: number, extraHops: number, seedId: number}, seeds: ScopedSeed[]) {
+  recheckScope(
+    data: { url: string; depth: number; extraHops: number; seedId: number },
+    seeds: ScopedSeed[],
+  ) {
     const seed = seeds[data.seedId];

     return seed.isIncluded(data.url, data.depth, data.extraHops);
   }

   async isFinished() {
-    return ((await this.queueSize()) == 0) && ((await this.numDone()) > 0);
+    return (await this.queueSize()) == 0 && (await this.numDone()) > 0;
   }

   async setStatus(status_: string) {
     await this.redis.hset(`${this.key}:status`, this.uid, status_);
   }

-  async getStatus() : Promise<string> {
+  async getStatus(): Promise<string> {
     return (await this.redis.hget(`${this.key}:status`, this.uid)) || "";
   }

@@ -343,35 +345,35 @@ return 0;
       return;
     }
     try {
-      const {type, regex} = JSON.parse(result);
+      const { type, regex } = JSON.parse(result);
       switch (type) {
         case "addExclusion":
-          logger.debug("Add Exclusion", {type, regex}, "exclusion");
+          logger.debug("Add Exclusion", { type, regex }, "exclusion");
           if (!regex) {
+            break;
+          }
+          for (const seed of seeds) {
+            seed.addExclusion(regex);
+          }
+          // can happen async w/o slowing down crawling
+          // each page is still checked if in scope before crawling, even while
+          // queue is being filtered
+          this.filterQueue(regex);
           break;
-        }
-        for (const seed of seeds) {
-          seed.addExclusion(regex);
-        }
-        // can happen async w/o slowing down crawling
-        // each page is still checked if in scope before crawling, even while
-        // queue is being filtered
-        this.filterQueue(regex);
-        break;

         case "removeExclusion":
-          logger.debug("Remove Exclusion", {type, regex}, "exclusion");
+          logger.debug("Remove Exclusion", { type, regex }, "exclusion");
           if (!regex) {
+            break;
+          }
+          for (const seed of seeds) {
+            seed.removeExclusion(regex);
+          }
           break;
-        }
-        for (const seed of seeds) {
-          seed.removeExclusion(regex);
-        }
-        break;
       }
-    } // TODO: Fix this the next time the file is edited.
+      // TODO: Fix this the next time the file is edited.
       // eslint-disable-next-line @typescript-eslint/no-explicit-any
-    catch (e: any) {
+    } catch (e: any) {
       logger.warn("Error processing message", e, "redisMessage");
     }
   }

@@ -389,7 +391,7 @@ return 0;

     // regexStr just a string, optimize by using glob matching
     if (this.isStrMatch(regexStr)) {
-      matcher = {"match": `*${regexStr}*`};
+      matcher = { match: `*${regexStr}*` };
     }

     const stream = this.redis.zscanStream(this.qkey, matcher);

@@ -404,14 +406,18 @@ return 0;
        //if (removed) {
        await this.markExcluded(url);
        //}
-        logger.debug("Removing excluded URL", {url, regex, removed}, "exclusion");
+        logger.debug(
+          "Removing excluded URL",
+          { url, regex, removed },
+          "exclusion",
+        );
      }

      stream.resume();
    });

-    return new Promise<void>(resolve => {
+    return new Promise<void>((resolve) => {
      stream.on("end", () => {
        resolve();
      });

@@ -424,15 +430,23 @@ return 0;

     // consider failed if 3 failed retries in 60 secs
     await this.redis.expire(key, 60);
-    return (res >= 3);
+    return res >= 3;
   }

   //async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) {
-  async addToQueue({url, seedId, depth = 0, extraHops = 0} : {url: string, seedId: number, depth?: number, extraHops?: number}, limit = 0) {
+  async addToQueue(
+    {
+      url,
+      seedId,
+      depth = 0,
+      extraHops = 0,
+    }: { url: string; seedId: number; depth?: number; extraHops?: number },
+    limit = 0,
+  ) {
     const added = this._timestamp();
     // TODO: Fix this the next time the file is edited.
     // eslint-disable-next-line @typescript-eslint/no-explicit-any
-    const data : any = {added, url, seedId, depth};
+    const data: any = { added, url, seedId, depth };
     if (extraHops) {
       data.extraHops = extraHops;
     }

@@ -441,7 +455,15 @@ return 0;
     // 0 - url queued successfully
     // 1 - url queue size limit reached
     // 2 - url is a dupe
-    return await this.redis.addqueue(this.pkey, this.qkey, this.skey, url, this._getScore(data), JSON.stringify(data), limit);
+    return await this.redis.addqueue(
+      this.pkey,
+      this.qkey,
+      this.skey,
+      url,
+      this._getScore(data),
+      JSON.stringify(data),
+      limit,
+    );
   }

   async nextFromQueue() {

@@ -450,7 +472,7 @@ return 0;

     try {
       data = JSON.parse(json);
-    } catch(e) {
+    } catch (e) {
       logger.error("Invalid queued json", json);
       return null;
     }

@@ -476,20 +498,27 @@ return 0;
     const failed = await this._iterListKeys(this.fkey);
     const errors = await this.getErrorList();

-    return {done, queued, pending, failed, errors};
+    return { done, queued, pending, failed, errors };
   }

-  _getScore(data: {depth: number, extraHops: number}) {
+  _getScore(data: { depth: number; extraHops: number }) {
     return (data.depth || 0) + (data.extraHops || 0) * MAX_DEPTH;
   }

   async _iterSortedKey(key: string, inc = 100) {
-    const results : string[] = [];
+    const results: string[] = [];

     const len = await this.redis.zcard(key);

     for (let i = 0; i < len; i += inc) {
-      const someResults = await this.redis.zrangebyscore(key, 0, "inf", "LIMIT", i, inc);
+      const someResults = await this.redis.zrangebyscore(
+        key,
+        0,
+        "inf",
+        "LIMIT",
+        i,
+        inc,
+      );
       results.push(...someResults);
     }

@@ -497,7 +526,7 @@ return 0;
   }

   async _iterListKeys(key: string, inc = 100) {
-    const results : string[] = [];
+    const results: string[] = [];

     const len = await this.redis.llen(key);

@@ -508,10 +537,14 @@ return 0;
     return results;
   }

-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  async load(state: Record<string, any>, seeds: ScopedSeed[], checkScope: boolean) {
-    const seen : string[] = [];
+  async load(
+    // TODO: Fix this the next time the file is edited.
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    state: Record<string, any>,
+    seeds: ScopedSeed[],
+    checkScope: boolean,
+  ) {
+    const seen: string[] = [];

     // need to delete existing keys, if exist to fully reset state
     await this.redis.del(this.qkey);

@@ -545,7 +578,7 @@ return 0;
       seen.push(data.url);
     }

-    if (typeof(state.done) === "number") {
+    if (typeof state.done === "number") {
       // done key is just an int counter
       await this.redis.set(this.dkey, state.done);
     } else if (state.done instanceof Array) {

@@ -601,7 +634,7 @@ return 0;

   async getPendingList() {
     const list = await this.redis.hvals(this.pkey);
-    return list.map(x => JSON.parse(x));
+    return list.map((x) => JSON.parse(x));
   }

   async getErrorList() {

@@ -615,9 +648,9 @@ return 0;
       for (const url of pendingUrls) {
         await this.redis.unlockpending(this.pkey + ":" + url, this.uid);
       }
-    } // TODO: Fix this the next time the file is edited.
+      // TODO: Fix this the next time the file is edited.
       // eslint-disable-next-line @typescript-eslint/no-explicit-any
-    catch (e: any) {
+    } catch (e: any) {
       logger.error("Redis Del Pending Failed", e, "state");
     }
   }

@@ -626,15 +659,21 @@ return 0;
     const pendingUrls = await this.redis.hkeys(this.pkey);

     for (const url of pendingUrls) {
-      const res = await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, url, this.maxRetryPending);
+      const res = await this.redis.requeue(
+        this.pkey,
+        this.qkey,
+        this.pkey + ":" + url,
+        url,
+        this.maxRetryPending,
+      );
       switch (res) {
         case 1:
           logger.info(`Requeued: ${url}`);
           break;

         case 2:
           logger.info(`Not requeuing anymore: ${url}`);
           break;
       }
     }
   }

@@ -656,4 +695,3 @@ return 0;
     return await this.redis.lpush(this.ekey, error);
   }
 }
-

@@ -16,10 +16,8 @@ import { logger } from "./logger.js";
 // @ts-expect-error TODO fill in why error is expected
 import getFolderSize from "get-folder-size";

-
 // ===========================================================================
-export class S3StorageSync
-{
+export class S3StorageSync {
   fullPrefix: string;
   client: Minio.Client;

@@ -36,21 +34,23 @@ export class S3StorageSync
   constructor(
     // eslint-disable-next-line @typescript-eslint/no-explicit-any
     urlOrData: string | any,
-    {webhookUrl, userId, crawlId} :
-    {webhookUrl?: string, userId: string, crawlId: string}
+    {
+      webhookUrl,
+      userId,
+      crawlId,
+    }: { webhookUrl?: string; userId: string; crawlId: string },
   ) {
     let url;
     let accessKey;
     let secretKey;

-    if (typeof(urlOrData) === "string") {
+    if (typeof urlOrData === "string") {
       url = new URL(urlOrData);
       accessKey = url.username;
       secretKey = url.password;
       url.username = "";
       url.password = "";
       this.fullPrefix = url.href;
-
     } else {
       url = new URL(urlOrData.endpointUrl);
       accessKey = urlOrData.accessKey;

@@ -64,7 +64,7 @@ export class S3StorageSync
       useSSL: url.protocol === "https:",
       accessKey,
       secretKey,
-      partSize: 100*1024*1024
+      partSize: 100 * 1024 * 1024,
     });

     this.bucketName = url.pathname.slice(1).split("/")[0];

@@ -80,31 +80,47 @@ export class S3StorageSync

   async uploadFile(srcFilename: string, targetFilename: string) {
     const fileUploadInfo = {
-      "bucket": this.bucketName,
-      "crawlId": this.crawlId,
-      "prefix": this.objectPrefix,
-      targetFilename
+      bucket: this.bucketName,
+      crawlId: this.crawlId,
+      prefix: this.objectPrefix,
+      targetFilename,
     };
     logger.info("S3 file upload information", fileUploadInfo, "s3Upload");

-    await this.client.fPutObject(this.bucketName, this.objectPrefix + targetFilename, srcFilename);
+    await this.client.fPutObject(
+      this.bucketName,
+      this.objectPrefix + targetFilename,
+      srcFilename,
+    );

-    const {hash, crc32} = await checksumFile("sha256", srcFilename);
+    const { hash, crc32 } = await checksumFile("sha256", srcFilename);
     const path = targetFilename;

     const size = await getFileSize(srcFilename);

     // for backwards compatibility, keep 'bytes'
-    return {path, size, hash, crc32, bytes: size};
+    return { path, size, hash, crc32, bytes: size };
   }

   async downloadFile(srcFilename: string, destFilename: string) {
-    await this.client.fGetObject(this.bucketName, this.objectPrefix + srcFilename, destFilename);
+    await this.client.fGetObject(
+      this.bucketName,
+      this.objectPrefix + srcFilename,
+      destFilename,
+    );
   }

-  async uploadCollWACZ(srcFilename: string, targetFilename: string, completed = true) {
+  async uploadCollWACZ(
+    srcFilename: string,
+    targetFilename: string,
+    completed = true,
+  ) {
     const resource = await this.uploadFile(srcFilename, targetFilename);
-    logger.info("WACZ S3 file upload resource", {targetFilename, resource}, "s3Upload");
+    logger.info(
+      "WACZ S3 file upload resource",
+      { targetFilename, resource },
+      "s3Upload",
+    );

     if (this.webhookUrl) {
       const body = {

@@ -115,17 +131,25 @@ export class S3StorageSync
         filename: this.fullPrefix + targetFilename,

         ...resource,
-        completed
+        completed,
       };

       logger.info(`Pinging Webhook: ${this.webhookUrl}`);

-      if (this.webhookUrl.startsWith("http://") || this.webhookUrl.startsWith("https://")) {
-        await fetch(this.webhookUrl, {method: "POST", body: JSON.stringify(body)});
+      if (
+        this.webhookUrl.startsWith("http://") ||
+        this.webhookUrl.startsWith("https://")
+      ) {
+        await fetch(this.webhookUrl, {
+          method: "POST",
+          body: JSON.stringify(body),
+        });
       } else if (this.webhookUrl.startsWith("redis://")) {
         const parts = this.webhookUrl.split("/");
         if (parts.length !== 5) {
-          logger.fatal("redis webhook url must be in format: redis://<host>:<port>/<db>/<key>");
+          logger.fatal(
+            "redis webhook url must be in format: redis://<host>:<port>/<db>/<key>",
+          );
         }
         const redis = await initRedis(parts.slice(0, 4).join("/"));
         await redis.rpush(parts[4], JSON.stringify(body));

@@ -139,7 +163,8 @@ export function initStorage() {
     return null;
   }

-  const endpointUrl = process.env.STORE_ENDPOINT_URL + (process.env.STORE_PATH || "");
+  const endpointUrl =
+    process.env.STORE_ENDPOINT_URL + (process.env.STORE_PATH || "");
   const storeInfo = {
     endpointUrl,
     accessKey: process.env.STORE_ACCESS_KEY,

@@ -156,7 +181,6 @@ export function initStorage() {
   return new S3StorageSync(storeInfo, opts);
 }

-
 export async function getFileSize(filename: string) {
   const stats = await fsp.stat(filename);
   return stats.size;

@@ -165,25 +189,34 @@ export async function getFileSize(filename: string) {
 export async function getDirSize(dir: string) {
   const { size, errors } = await getFolderSize(dir);
   if (errors && errors.length) {
-    logger.warn("Size check errors", {errors}, "sizecheck");
+    logger.warn("Size check errors", { errors }, "sizecheck");
   }
   return size;
 }

-// TODO: Fix this the next time the file is edited.
-// eslint-disable-next-line @typescript-eslint/no-explicit-any
-export async function checkDiskUtilization(params: Record<string, any>, archiveDirSize: number, dfOutput=null) {
-  const diskUsage : Record<string, string> = await getDiskUsage("/crawls", dfOutput);
+export async function checkDiskUtilization(
+  // TODO: Fix this the next time the file is edited.
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  params: Record<string, any>,
+  archiveDirSize: number,
+  dfOutput = null,
+) {
+  const diskUsage: Record<string, string> = await getDiskUsage(
+    "/crawls",
+    dfOutput,
+  );
   const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1));

   // Check that disk usage isn't already above threshold
   if (usedPercentage >= params.diskUtilization) {
-    logger.info(`Disk utilization threshold reached ${usedPercentage}% > ${params.diskUtilization}%, stopping`);
+    logger.info(
+      `Disk utilization threshold reached ${usedPercentage}% > ${params.diskUtilization}%, stopping`,
+    );
     return {
       stop: true,
       used: usedPercentage,
       projected: null,
-      threshold: params.diskUtilization
+      threshold: params.diskUtilization,
     };
   }

@@ -191,7 +224,7 @@ export async function checkDiskUtilization(params: Record<string, any>, archiveD
   const kbUsed = parseInt(diskUsage["Used"]);
   const kbTotal = parseInt(diskUsage["1K-blocks"]);

-  let kbArchiveDirSize = Math.round(archiveDirSize/1024);
+  let kbArchiveDirSize = Math.round(archiveDirSize / 1024);
   if (params.combineWARC && params.generateWACZ) {
     kbArchiveDirSize *= 4;
   } else if (params.combineWARC || params.generateWACZ) {

@@ -199,15 +232,20 @@ export async function checkDiskUtilization(params: Record<string, any>, archiveD
   }

   const projectedTotal = kbUsed + kbArchiveDirSize;
-  const projectedUsedPercentage = calculatePercentageUsed(projectedTotal, kbTotal);
+  const projectedUsedPercentage = calculatePercentageUsed(
+    projectedTotal,
+    kbTotal,
+  );

   if (projectedUsedPercentage >= params.diskUtilization) {
-    logger.info(`Disk utilization projected to reach threshold ${projectedUsedPercentage}% > ${params.diskUtilization}%, stopping`);
+    logger.info(
+      `Disk utilization projected to reach threshold ${projectedUsedPercentage}% > ${params.diskUtilization}%, stopping`,
+    );
     return {
       stop: true,
       used: usedPercentage,
       projected: projectedUsedPercentage,
-      threshold: params.diskUtilization
+      threshold: params.diskUtilization,
     };
   }

@@ -215,7 +253,7 @@ export async function checkDiskUtilization(params: Record<string, any>, archiveD
     stop: false,
     used: usedPercentage,
     projected: projectedUsedPercentage,
-    threshold: params.diskUtilization
+    threshold: params.diskUtilization,
   };
 }

@@ -225,12 +263,12 @@ export async function getDFOutput(path: string) {
   return res.stdout;
 }

-export async function getDiskUsage(path="/crawls", dfOutput = null) {
+export async function getDiskUsage(path = "/crawls", dfOutput = null) {
   const result = dfOutput || (await getDFOutput(path));
   const lines = result.split("\n");
-  const keys = lines[0].split(/\s+/ig);
-  const rows = lines.slice(1).map(line => {
-    const values = line.split(/\s+/ig);
+  const keys = lines[0].split(/\s+/gi);
+  const rows = lines.slice(1).map((line) => {
+    const values = line.split(/\s+/gi);
     // TODO: Fix this the next time the file is edited.
     // eslint-disable-next-line @typescript-eslint/no-explicit-any
     return keys.reduce((o: Record<string, any>, k, index) => {

@@ -242,29 +280,34 @@ export async function getDiskUsage(path="/crawls", dfOutput = null) {
 }

 export function calculatePercentageUsed(used: number, total: number) {
-  return Math.round((used/total) * 100);
+  return Math.round((used / total) * 100);
 }

-function checksumFile(hashName: string, path: string) : Promise<{hash: string, crc32: number}>{
+function checksumFile(
+  hashName: string,
+  path: string,
+): Promise<{ hash: string; crc32: number }> {
   return new Promise((resolve, reject) => {
     const hash = createHash(hashName);
-    let crc : number = 0;
+    let crc: number = 0;

     const stream = fs.createReadStream(path);
-    stream.on("error", err => reject(err));
+    stream.on("error", (err) => reject(err));
     stream.on("data", (chunk) => {
       hash.update(chunk);
       crc = crc32(chunk, crc);
     });
-    stream.on("end", () => resolve({hash: hash.digest("hex"), crc32: crc}));
+    stream.on("end", () => resolve({ hash: hash.digest("hex"), crc32: crc }));
   });
 }

 export function interpolateFilename(filename: string, crawlId: string) {
-  filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.-]/g, ""));
+  filename = filename.replace(
+    "@ts",
+    new Date().toISOString().replace(/[:TZz.-]/g, ""),
+  );
   filename = filename.replace("@hostname", os.hostname());
   filename = filename.replace("@hostsuffix", os.hostname().slice(-14));
   filename = filename.replace("@id", crawlId);
   return filename;
 }

@@ -11,54 +11,79 @@ export abstract class BaseTextExtract extends WARCResourceWriter {
   // TODO: Fix this the next time the file is edited.
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
   constructor(cdp: CDPSession, opts: any) {
-    super({...opts, warcName: "text.warc.gz"});
+    super({ ...opts, warcName: "text.warc.gz" });
     this.cdp = cdp;
   }

-  async extractAndStoreText(resourceType: string, ignoreIfMatchesLast = false, saveToWarc = false) {
+  async extractAndStoreText(
+    resourceType: string,
+    ignoreIfMatchesLast = false,
+    saveToWarc = false,
+  ) {
     try {
       const text = await this.doGetText();

       if (ignoreIfMatchesLast && text === this.lastText) {
         this.lastText = this.text;
-        logger.debug("Skipping, extracted text unchanged from last extraction", {url: this.url}, "text");
-        return {changed: false, text};
+        logger.debug(
+          "Skipping, extracted text unchanged from last extraction",
+          { url: this.url },
+          "text",
+        );
+        return { changed: false, text };
       }
       if (saveToWarc) {
-        await this.writeBufferToWARC(new TextEncoder().encode(text), resourceType, "text/plain");
-        logger.debug(`Text Extracted (type: ${resourceType}) for ${this.url} written to ${this.warcName}`);
+        await this.writeBufferToWARC(
+          new TextEncoder().encode(text),
+          resourceType,
+          "text/plain",
+        );
+        logger.debug(
+          `Text Extracted (type: ${resourceType}) for ${this.url} written to ${this.warcName}`,
+        );
       }

       this.lastText = text;
-      return {changed: true, text};
+      return { changed: true, text };
-    } // TODO: Fix this the next time the file is edited.
+      // TODO: Fix this the next time the file is edited.
       // eslint-disable-next-line @typescript-eslint/no-explicit-any
-    catch (e: any) {
+    } catch (e: any) {
       logger.debug("Error extracting text", e, "text");
-      return {changed: false, text: null};
+      return { changed: false, text: null };
     }
   }

-  abstract doGetText() : Promise<string>;
+  abstract doGetText(): Promise<string>;
 }

 // ============================================================================
 export class TextExtractViaSnapshot extends BaseTextExtract {
-  async doGetText() : Promise<string> {
-    const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {computedStyles: []});
+  async doGetText(): Promise<string> {
+    const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {
+      computedStyles: [],
+    });
     return this.parseTextFromDOMSnapshot(result);
   }

-  parseTextFromDOMSnapshot(result: Protocol.DOMSnapshot.CaptureSnapshotResponse) : string {
+  parseTextFromDOMSnapshot(
+    result: Protocol.DOMSnapshot.CaptureSnapshotResponse,
+  ): string {
     const TEXT_NODE = 3;
     const ELEMENT_NODE = 1;

-    const SKIPPED_NODES = ["SCRIPT", "STYLE", "HEADER", "FOOTER", "BANNER-DIV", "NOSCRIPT", "TITLE"];
+    const SKIPPED_NODES = [
+      "SCRIPT",
+      "STYLE",
+      "HEADER",
+      "FOOTER",
+      "BANNER-DIV",
+      "NOSCRIPT",
+      "TITLE",
+    ];

-    const {strings, documents} = result;
+    const { strings, documents } = result;

-    const accum : string[] = [];
+    const accum: string[] = [];

     for (const doc of documents) {
       const nodeValues = doc.nodes.nodeValue || [];

@@ -91,16 +116,18 @@ export class TextExtractViaSnapshot extends BaseTextExtract {
   }
 }

-
 // ============================================================================
 export class TextExtractViaDocument extends BaseTextExtract {
-  async doGetText() : Promise<string> {
-    const result = await this.cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
+  async doGetText(): Promise<string> {
+    const result = await this.cdp.send("DOM.getDocument", {
+      depth: -1,
+      pierce: true,
+    });
     return this.parseTextFromDOM(result);
   }

-  parseTextFromDOM(dom: Protocol.DOM.GetDocumentResponse) : string {
-    const accum : string[] = [];
+  parseTextFromDOM(dom: Protocol.DOM.GetDocumentResponse): string {
+    const accum: string[] = [];
     const metadata = {};

     this.parseText(dom.root, metadata, accum);

@@ -108,9 +135,21 @@ export class TextExtractViaDocument extends BaseTextExtract {
     return accum.join("\n");
   }

-  parseText(node: Protocol.DOM.Node, metadata: Record<string, string> | null, accum: string[]) {
-    const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];
-    const EMPTY_LIST : Protocol.DOM.Node[] = [];
+  parseText(
+    node: Protocol.DOM.Node,
+    metadata: Record<string, string> | null,
+    accum: string[],
+  ) {
+    const SKIPPED_NODES = [
+      "head",
+      "script",
+      "style",
+      "header",
+      "footer",
+      "banner-div",
+      "noscript",
+    ];
+    const EMPTY_LIST: Protocol.DOM.Node[] = [];
     const TEXT = "#text";
     const TITLE = "title";

@@ -128,7 +167,7 @@ export class TextExtractViaDocument extends BaseTextExtract {
         accum.push(value);
       }
     } else if (name === TITLE) {
-      const title : string[] = [];
+      const title: string[] = [];

       for (const child of children) {
         this.parseText(child, null, title);

@@ -150,4 +189,3 @@ export class TextExtractViaDocument extends BaseTextExtract {
       }
     }
   }
 }
-

@@ -1,7 +1,7 @@
 import { logger } from "./logger.js";

 export function sleep(seconds: number) {
-  return new Promise(resolve => setTimeout(resolve, seconds * 1000));
+  return new Promise((resolve) => setTimeout(resolve, seconds * 1000));
 }

 // TODO: Fix this the next time the file is edited.
@@ -10,30 +10,36 @@ export function timedRun(
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
   promise: Promise<any>,
   seconds: number,
-  message="Promise timed out",
-  logDetails={},
-  context="general",
-  isWarn=false
+  message = "Promise timed out",
+  logDetails = {},
+  context = "general",
+  isWarn = false,
 ) {
   // return Promise return value or log error if timeout is reached first
   const timeout = seconds * 1000;

   const rejectPromiseOnTimeout = (timeout: number) => {
     return new Promise((resolve, reject) => {
-      setTimeout(() => (reject("timeout reached")), timeout);
+      setTimeout(() => reject("timeout reached"), timeout);
     });
   };

-  return Promise.race([promise, rejectPromiseOnTimeout(timeout)])
-    .catch((err) => {
+  return Promise.race([promise, rejectPromiseOnTimeout(timeout)]).catch(
+    (err) => {
       if (err == "timeout reached") {
         const logFunc = isWarn ? logger.warn : logger.error;
-        logFunc.call(logger, message, {"seconds": seconds, ...logDetails}, context);
+        logFunc.call(
+          logger,
+          message,
+          { seconds: seconds, ...logDetails },
+          context,
+        );
       } else {
         //logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
         throw err;
       }
-    });
+    },
+  );
 }

 export function secondsElapsed(startTime: number, nowDate: Date | null = null) {
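timedRun, reformatted above, races a promise against a timeout and logs, rather than rejects, when the timeout wins. A minimal usage sketch, assuming the module resolves as ./timing.js and the caller wants a warning rather than an error on expiry:

import { timedRun, sleep } from "./timing.js";

// Race a deliberately slow task against a 5-second budget. On timeout,
// timedRun logs "Operation timed out" as a warning (isWarn = true) and
// resolves to undefined instead of throwing.
await timedRun(
  sleep(30),
  5,
  "Operation timed out",
  { taskid: 1 }, // extra logDetails merged into the log entry
  "general",
  true,
);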
@@ -2,8 +2,7 @@ import fs from "fs";
 import path from "path";
 import * as warcio from "warcio";

-export class WARCResourceWriter
-{
+export class WARCResourceWriter {
   // TODO: Fix this the next time the file is edited.
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
   page: any;
@@ -12,34 +11,53 @@ export class WARCResourceWriter
   warcName: string;
   date: Date;

-  constructor({url, directory, date, warcName} : {url: string, directory: string, date: Date, warcName: string}) {
+  constructor({
+    url,
+    directory,
+    date,
+    warcName,
+  }: {
+    url: string;
+    directory: string;
+    date: Date;
+    warcName: string;
+  }) {
     this.url = url;
     this.directory = directory;
     this.warcName = path.join(this.directory, warcName);
     this.date = date ? date : new Date();
   }

-  async writeBufferToWARC(contents: Uint8Array, resourceType: string, contentType: string) {
+  async writeBufferToWARC(
+    contents: Uint8Array,
+    resourceType: string,
+    contentType: string,
+  ) {
     const warcRecord = await this.wrap(contents, resourceType, contentType);
-    const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});
+    const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {
+      gzip: true,
+    });
     fs.appendFileSync(this.warcName, warcRecordBuffer);
   }

   async wrap(buffer: Uint8Array, resourceType: string, contentType: string) {
     const warcVersion = "WARC/1.1";
     const warcRecordType = "resource";
-    const warcHeaders = {"Content-Type": contentType};
+    const warcHeaders = { "Content-Type": contentType };
     async function* content() {
       yield buffer;
     }
     const resourceUrl = `urn:${resourceType}:${this.url}`;

-    return warcio.WARCRecord.create({
-      url: resourceUrl,
-      date: this.date.toISOString(),
-      type: warcRecordType,
-      warcVersion,
-      warcHeaders
-    }, content());
+    return warcio.WARCRecord.create(
+      {
+        url: resourceUrl,
+        date: this.date.toISOString(),
+        type: warcRecordType,
+        warcVersion,
+        warcHeaders,
+      },
+      content(),
+    );
   }
 }
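A short usage sketch for WARCResourceWriter as reshaped above: writing extracted page text as a gzipped WARC resource record. The paths and field values here are illustrative, and the target directory is assumed to exist.

import { WARCResourceWriter } from "./warcresourcewriter.js";

const writer = new WARCResourceWriter({
  url: "https://example.com/",
  directory: "./collections/example/archive", // assumed to exist
  date: new Date(),
  warcName: "text.warc.gz",
});

// Appends one gzipped "resource" record whose target URI becomes
// urn:text:https://example.com/, per the wrap() method above.
await writer.writeBufferToWARC(
  new TextEncoder().encode("extracted page text"),
  "text",
  "text/plain",
);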
@@ -7,10 +7,8 @@ import { WARCSerializer } from "warcio/node";
 import { logger, errJSON } from "./logger.js";
 import type { IndexerOffsetLength, WARCRecord } from "warcio";

-
 // =================================================================
-export class WARCWriter implements IndexerOffsetLength
-{
+export class WARCWriter implements IndexerOffsetLength {
   archivesDir: string;
   tempCdxDir: string;
   filename: string;
@@ -25,8 +23,19 @@ export class WARCWriter implements IndexerOffsetLength
   fh?: Writable | null;
   cdxFH?: Writable | null;

-  constructor({archivesDir, tempCdxDir, filename, gzip, logDetails} :
-    {archivesDir: string, tempCdxDir: string, filename: string, gzip: boolean, logDetails: Record<string, string>}) {
+  constructor({
+    archivesDir,
+    tempCdxDir,
+    filename,
+    gzip,
+    logDetails,
+  }: {
+    archivesDir: string;
+    tempCdxDir: string;
+    filename: string;
+    gzip: boolean;
+    logDetails: Record<string, string>;
+  }) {
     this.archivesDir = archivesDir;
     this.tempCdxDir = tempCdxDir;
     this.filename = filename;
@@ -37,21 +46,29 @@ export class WARCWriter implements IndexerOffsetLength
     this.recordLength = 0;

     if (this.tempCdxDir) {
-      this.indexer = new CDXIndexer({format: "cdxj"});
+      this.indexer = new CDXIndexer({ format: "cdxj" });
     }
   }

   async initFH() {
     if (!this.fh) {
-      this.fh = fs.createWriteStream(path.join(this.archivesDir, this.filename));
+      this.fh = fs.createWriteStream(
+        path.join(this.archivesDir, this.filename),
+      );
     }
     if (!this.cdxFH && this.tempCdxDir) {
-      this.cdxFH = fs.createWriteStream(path.join(this.tempCdxDir, this.filename + ".cdx"));
+      this.cdxFH = fs.createWriteStream(
+        path.join(this.tempCdxDir, this.filename + ".cdx"),
+      );
     }
   }

-  async writeRecordPair(responseRecord: WARCRecord, requestRecord: WARCRecord, responseSerializer: WARCSerializer | undefined = undefined) {
-    const opts = {gzip: this.gzip};
+  async writeRecordPair(
+    responseRecord: WARCRecord,
+    requestRecord: WARCRecord,
+    responseSerializer: WARCSerializer | undefined = undefined,
+  ) {
+    const opts = { gzip: this.gzip };

     if (!responseSerializer) {
       responseSerializer = new WARCSerializer(responseRecord, opts);
@@ -59,15 +76,20 @@ export class WARCWriter implements IndexerOffsetLength

     await this.initFH();

-    this.recordLength = await this._writeRecord(responseRecord, responseSerializer);
+    this.recordLength = await this._writeRecord(
+      responseRecord,
+      responseSerializer,
+    );

     this._writeCDX(responseRecord);

     const requestSerializer = new WARCSerializer(requestRecord, opts);
-    this.recordLength = await this._writeRecord(requestRecord, requestSerializer);
+    this.recordLength = await this._writeRecord(
+      requestRecord,
+      requestSerializer,
+    );

     this._writeCDX(requestRecord);
-
   }

   async _writeRecord(record: WARCRecord, serializer: WARCSerializer) {
@@ -83,7 +105,11 @@ export class WARCWriter implements IndexerOffsetLength
       try {
         this.fh.write(chunk);
       } catch (e) {
-        logger.error("Error writing to WARC, corruption possible", {...errJSON(e), url, ...this.logDetails}, "writer");
+        logger.error(
+          "Error writing to WARC, corruption possible",
+          { ...errJSON(e), url, ...this.logDetails },
+          "writer",
+        );
       }
     }

@@ -119,7 +145,7 @@ export class WARCWriter implements IndexerOffsetLength

 // =================================================================
 export function streamFinish(fh: Writable) {
-  const p = new Promise<void>(resolve => {
+  const p = new Promise<void>((resolve) => {
     fh.once("finish", () => resolve());
   });
   fh.end();
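streamFinish, at the end of the hunk above, resolves once the Writable emits "finish" after end() is called; the visible diff cuts off before the return, so this sketch assumes the created promise is what the function returns. The file name is illustrative.

import fs from "fs";
import { streamFinish } from "./warcwriter.js";

const fh = fs.createWriteStream("scratch/example.out");
fh.write("some buffered data");

// streamFinish calls fh.end() itself; awaiting it guarantees the
// buffered bytes are flushed before execution continues.
await streamFinish(fh);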
@@ -16,9 +16,14 @@ const TEARDOWN_TIMEOUT = 10;
 const FINISHED_TIMEOUT = 60;

 // ===========================================================================
-// TODO: Fix this the next time the file is edited.
-// eslint-disable-next-line @typescript-eslint/no-explicit-any
-export function runWorkers(crawler: any, numWorkers: number, maxPageTime: number, collDir: string) {
+export function runWorkers(
+  // TODO: Fix this the next time the file is edited.
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  crawler: any,
+  numWorkers: number,
+  maxPageTime: number,
+  collDir: string,
+) {
   logger.info(`Creating ${numWorkers} workers`, {}, "worker");

   const workers = [];
@@ -39,13 +44,12 @@ export function runWorkers(crawler: any, numWorkers: number, maxPageTime: number
   }

   for (let i = 0; i < numWorkers; i++) {
-    workers.push(new PageWorker((i + offset), crawler, maxPageTime, collDir));
+    workers.push(new PageWorker(i + offset, crawler, maxPageTime, collDir));
   }

   return Promise.allSettled(workers.map((worker) => worker.run()));
 }

-
 // ===========================================================================
 // TODO: Fix this the next time the file is edited.
 // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -55,17 +59,18 @@ export type WorkerOpts = Record<string, any> & {
   workerid: WorkerId;
   // eslint-disable-next-line @typescript-eslint/ban-types
   callbacks: Record<string, Function>;
-  directFetchCapture?: ((url: string) => Promise<{fetched: boolean, mime: string}>) | null;
+  directFetchCapture?:
+    | ((url: string) => Promise<{ fetched: boolean; mime: string }>)
+    | null;
 };

 // ===========================================================================
 export type WorkerState = WorkerOpts & {
-  data: PageState
+  data: PageState;
 };

 // ===========================================================================
-export class PageWorker
-{
+export class PageWorker {
   id: WorkerId;
   // TODO: Fix this the next time the file is edited.
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -91,16 +96,25 @@ export class PageWorker

   recorder: Recorder;

-  // TODO: Fix this the next time the file is edited.
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  constructor(id: WorkerId, crawler: any, maxPageTime: number, collDir: string) {
+  constructor(
+    id: WorkerId,
+    // TODO: Fix this the next time the file is edited.
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    crawler: any,
+    maxPageTime: number,
+    collDir: string,
+  ) {
     this.id = id;
     this.crawler = crawler;
     this.maxPageTime = maxPageTime;

-    this.logDetails = {workerid: this.id};
+    this.logDetails = { workerid: this.id };

-    this.recorder = new Recorder({workerid: id, collDir, crawler: this.crawler});
+    this.recorder = new Recorder({
+      workerid: id,
+      collDir,
+      crawler: this.crawler,
+    });

     this.crawler.browser.recorders.push(this.recorder);
   }
@@ -121,7 +135,7 @@ export class PageWorker
         TEARDOWN_TIMEOUT,
         "Page Teardown Timed Out",
         this.logDetails,
-        "worker"
+        "worker",
       );
     } catch (e) {
       // ignore
@@ -129,13 +143,17 @@ export class PageWorker
     }

     try {
-      logger.debug("Closing page", {crashed: this.crashed, workerid: this.id}, "worker");
+      logger.debug(
+        "Closing page",
+        { crashed: this.crashed, workerid: this.id },
+        "worker",
+      );
       await timedRun(
         this.page.close(),
         TEARDOWN_TIMEOUT,
         "Page Close Timed Out",
         this.logDetails,
-        "worker"
+        "worker",
       );
     } catch (e) {
       // ignore
@@ -155,9 +173,19 @@ export class PageWorker
     }
   }

-  async initPage(url: string) : Promise<WorkerOpts> {
-    if (!this.crashed && this.page && this.opts && ++this.reuseCount <= MAX_REUSE && this.isSameOrigin(url)) {
-      logger.debug("Reusing page", {reuseCount: this.reuseCount, ...this.logDetails}, "worker");
+  async initPage(url: string): Promise<WorkerOpts> {
+    if (
+      !this.crashed &&
+      this.page &&
+      this.opts &&
+      ++this.reuseCount <= MAX_REUSE &&
+      this.isSameOrigin(url)
+    ) {
+      logger.debug(
+        "Reusing page",
+        { reuseCount: this.reuseCount, ...this.logDetails },
+        "worker",
+      );
       return this.opts;
     } else if (this.page) {
       await this.closePage();
@@ -170,13 +198,13 @@ export class PageWorker

     while (await this.crawler.isCrawlRunning()) {
       try {
-        logger.debug("Getting page in new window", {workerid}, "worker");
+        logger.debug("Getting page in new window", { workerid }, "worker");
         const result = await timedRun(
           this.crawler.browser.newWindowPageWithCDP(),
           NEW_WINDOW_TIMEOUT,
           "New Window Timed Out",
-          {workerid},
-          "worker"
+          { workerid },
+          "worker",
         );

         if (!result) {
@@ -188,7 +216,9 @@ export class PageWorker
         this.page = page;
         this.cdp = cdp;
         this.callbacks = {};
-        const directFetchCapture = this.recorder ? (x: string) => this.recorder.directFetchCapture(x) : null;
+        const directFetchCapture = this.recorder
+          ? (x: string) => this.recorder.directFetchCapture(x)
+          : null;
         this.opts = {
           page,
           cdp,
@@ -203,9 +233,11 @@ export class PageWorker

         // updated per page crawl
         this.crashed = false;
-        this.crashBreak = new Promise((resolve, reject) => this.markCrashed = reject);
+        this.crashBreak = new Promise(
+          (resolve, reject) => (this.markCrashed = reject),
+        );

-        this.logDetails = {page: page.url(), workerid};
+        this.logDetails = { page: page.url(), workerid };

         // more serious page crash, mark as failed
         // TODO: Fix this the next time the file is edited.
@@ -213,7 +245,11 @@ export class PageWorker
         page.on("error", (err: any) => {
           // ensure we're still on this page, otherwise ignore!
           if (this.page === page) {
-            logger.error("Page Crashed", {...errJSON(err), ...this.logDetails}, "worker");
+            logger.error(
+              "Page Crashed",
+              { ...errJSON(err), ...this.logDetails },
+              "worker",
+            );
             this.crashed = true;
             if (this.markCrashed) {
               this.markCrashed("crashed");
@@ -224,9 +260,12 @@ export class PageWorker
         await this.crawler.setupPage(this.opts);

         return this.opts;
-
       } catch (err) {
-        logger.warn("Error getting new page", {"workerid": this.id, ...errJSON(err)}, "worker");
+        logger.warn(
+          "Error getting new page",
+          { workerid: this.id, ...errJSON(err) },
+          "worker",
+        );
         retry++;

         if (!this.crawler.browser.browser) {
@@ -234,7 +273,11 @@ export class PageWorker
         }

         if (retry >= MAX_REUSE) {
-          logger.fatal("Unable to get new page, browser likely crashed", this.logDetails, "worker");
+          logger.fatal(
+            "Unable to get new page, browser likely crashed",
+            this.logDetails,
+            "worker",
+          );
         }

         await sleep(0.5);
@@ -262,16 +305,16 @@ export class PageWorker
     const { data } = opts;
     const { url } = data;

-    logger.info("Starting page", {workerid, "page": url}, "worker");
+    logger.info("Starting page", { workerid, page: url }, "worker");

-    this.logDetails = {page: url, workerid};
+    this.logDetails = { page: url, workerid };

     // set new page id
     const pageid = uuidv4();
     data.pageid = pageid;

     if (this.recorder) {
-      this.recorder.startPage({pageid, url});
+      this.recorder.startPage({ pageid, url });
     }

     try {
@@ -281,14 +324,17 @@ export class PageWorker
           this.maxPageTime,
           "Page Worker Timeout",
           this.logDetails,
-          "worker"
+          "worker",
         ),
-        this.crashBreak
+        this.crashBreak,
       ]);
-
     } catch (e) {
       if (e instanceof Error && e.message !== "logged" && !this.crashed) {
-        logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
+        logger.error(
+          "Worker Exception",
+          { ...errJSON(e), ...this.logDetails },
+          "worker",
+        );
       }
     } finally {
       await timedRun(
@@ -296,19 +342,27 @@ export class PageWorker
         FINISHED_TIMEOUT,
         "Page Finished Timed Out",
         this.logDetails,
-        "worker"
+        "worker",
       );
     }
   }

   async run() {
-    logger.info("Worker starting", {workerid: this.id}, "worker");
+    logger.info("Worker starting", { workerid: this.id }, "worker");

     try {
       await this.runLoop();
-      logger.info("Worker done, all tasks complete", {workerid: this.id}, "worker");
+      logger.info(
+        "Worker done, all tasks complete",
+        { workerid: this.id },
+        "worker",
+      );
     } catch (e) {
-      logger.error("Worker error, exiting", {...errJSON(e), workerid: this.id}, "worker");
+      logger.error(
+        "Worker error, exiting",
+        { ...errJSON(e), workerid: this.id },
+        "worker",
+      );
     } finally {
       if (this.recorder) {
         await this.recorder.onDone();
@@ -339,10 +393,9 @@ export class PageWorker
         const opts = await this.initPage(data.url);

         // run timed crawl of page
-        await this.timedCrawlPage({...opts, data});
+        await this.timedCrawlPage({ ...opts, data });

         loggedWaiting = false;
-
       } else {
         // indicate that the worker has no more work (mostly for screencasting, status, etc...)
         // depending on other works, will either get more work or crawl will end
@@ -354,7 +407,11 @@ export class PageWorker
       // if pending, sleep and check again
       if (pending) {
         if (!loggedWaiting) {
-          logger.debug("No crawl tasks, but pending tasks remain, waiting", {pending, workerid: this.id}, "worker");
+          logger.debug(
+            "No crawl tasks, but pending tasks remain, waiting",
+            { pending, workerid: this.id },
+            "worker",
+          );
           loggedWaiting = true;
         }
         await sleep(0.5);
@@ -368,5 +425,3 @@ export class PageWorker
     }
   }
 }
-
-
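The crashBreak/markCrashed pair in the worker above is a deferred-reject promise: the reject handle escapes the executor so a page "error" handler can abort a pending Promise.race from outside. The pattern in isolation, with illustrative names:

// The reject handle escapes the Promise executor so it can be
// triggered later from an event handler.
let markCrashed!: (reason: string) => void;
const crashBreak = new Promise<void>(
  (resolve, reject) => (markCrashed = reject),
);

async function doWork(): Promise<void> {
  // stand-in for the real timed page crawl
}

// Elsewhere, e.g. inside page.on("error", ...), calling
// markCrashed("crashed") makes the race below reject immediately.
await Promise.race([doWork(), crashBreak]).catch((e) =>
  console.log("aborted:", e),
);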
@@ -10,17 +10,21 @@ function runCrawl(name, config, commandExtra = "") {
   const configYaml = yaml.dump(config);

   try {
-    const proc = child_process.execSync(`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`, {input: configYaml, stdin: "inherit", encoding: "utf8"});
+    const proc = child_process.execSync(
+      `docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`,
+      { input: configYaml, stdin: "inherit", encoding: "utf8" },
+    );

     console.log(proc);
-  }
-  catch (error) {
+  } catch (error) {
     console.log(error);
   }
 }

 function doesCDXContain(coll, value) {
-  const data = fs.readFileSync(`test-crawls/collections/${coll}/indexes/index.cdxj`);
+  const data = fs.readFileSync(
+    `test-crawls/collections/${coll}/indexes/index.cdxj`,
+  );
   return data.indexOf(value) >= 0;
 }

@@ -41,11 +45,13 @@ test("test crawl without ad block for specific URL", () => {

 test("testcrawl with ad block for specific URL", () => {
   const config = {
-    "url": "https://www.mozilla.org/en-US/firefox/",
-    "blockAds": true,
+    url: "https://www.mozilla.org/en-US/firefox/",
+    blockAds: true,
   };

   runCrawl("adblock-block", config);

-  expect(doesCDXContain("adblock-block", "www.googletagmanager.com")).toBe(false);
+  expect(doesCDXContain("adblock-block", "www.googletagmanager.com")).toBe(
+    false,
+  );
 });
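doesCDXContain above is a raw substring search over the collection's CDXJ index, so tests can match on a URL or a quoted MIME value without parsing. A toy check against a hand-written CDXJ line (the line itself is illustrative):

// One CDXJ index line: SURT-ordered key, 14-digit timestamp, JSON blob.
const line =
  'com,example)/ 20231101000000 {"url":"https://example.com/","mime":"text/html","status":"200"}';

// The test helper's check reduces to indexOf over the raw file contents.
console.log(line.indexOf('"text/html"') >= 0); // true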
@@ -6,21 +6,25 @@ test("dynamically add exclusion while crawl is running", async () => {

   const p = new Promise((resolve) => {
     callback = (error, stdout, stderr) => {
-      resolve({error, stdout, stderr});
+      resolve({ error, stdout, stderr });
     };
   });

   try {
-    exec("docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis", {"shell": "/bin/bash"}, callback);
+    exec(
+      "docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis",
+      { shell: "/bin/bash" },
+      callback,
+    );
   } catch (error) {
     console.log(error);
   }

   await new Promise((resolve) => setTimeout(resolve, 3000));

-  const redis = new Redis("redis://127.0.0.1:36379/0", {lazyConnect: true});
+  const redis = new Redis("redis://127.0.0.1:36379/0", { lazyConnect: true });

-  await redis.connect({maxRetriesPerRequest: 50});
+  await redis.connect({ maxRetriesPerRequest: 50 });

   while (true) {
     if (Number(await redis.zcard("test:q")) > 1) {
@@ -33,7 +37,10 @@ test("dynamically add exclusion while crawl is running", async () => {
   const uids = await redis.hkeys("test:status");

   // exclude all pages containing 'webrecorder', should clear out the queue and end the crawl
-  await redis.rpush(`${uids[0]}:msg`, JSON.stringify({type: "addExclusion", regex: "webrecorder"}));
+  await redis.rpush(
+    `${uids[0]}:msg`,
+    JSON.stringify({ type: "addExclusion", regex: "webrecorder" }),
+  );

   // ensure 'Add Exclusion is contained in the debug logs
   const { stdout } = await p;
@@ -44,4 +51,3 @@ test("dynamically add exclusion while crawl is running", async () => {

   await redis.disconnect();
 });
-
@@ -3,16 +3,18 @@ import fs from "fs";
 import path from "path";
 import md5 from "md5";

 test("ensure basic crawl run with docker run passes", async () => {
-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title \"test title\" --description \"test description\"");
+  child_process.execSync(
+    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description"',
+  );

-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz");
-  child_process.execSync("unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz");
+  child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz",
+  );
+
+  child_process.execSync(
+    "unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz",
+  );
 });

 test("check that a combined warc file exists in the archive folder", () => {
@@ -20,16 +22,17 @@ test("check that a combined warc file exists in the archive folder", () => {
   var captureFound = 0;

   for (var i = 0; i < warcLists.length; i++) {
-    if (warcLists[i].endsWith("_0.warc.gz")){
+    if (warcLists[i].endsWith("_0.warc.gz")) {
       captureFound = 1;
     }
   }
   expect(captureFound).toEqual(1);
 });

 test("check that a combined warc file is under the rolloverSize", () => {
-  const warcLists = fs.readdirSync(path.join("test-crawls/collections/wr-net/wacz", "archive"));
+  const warcLists = fs.readdirSync(
+    path.join("test-crawls/collections/wr-net/wacz", "archive"),
+  );
   let rolloverSize = 0;

   function getFileSize(filename) {
@@ -37,8 +40,10 @@ test("check that a combined warc file is under the rolloverSize", () => {
   }

   for (let i = 0; i < warcLists.length; i++) {
-    const size = getFileSize(path.join("test-crawls/collections/wr-net/wacz/archive/", warcLists[i]));
-    if (size < 10000){
+    const size = getFileSize(
+      path.join("test-crawls/collections/wr-net/wacz/archive/", warcLists[i]),
+    );
+    if (size < 10000) {
       rolloverSize = 1;
     }
   }
@@ -46,27 +51,57 @@ test("check that a combined warc file is under the rolloverSize", () => {
 });

 test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
-  expect(fs.existsSync("test-crawls/collections/wr-net/pages/pages.jsonl")).toBe(true);
+  expect(
+    fs.existsSync("test-crawls/collections/wr-net/pages/pages.jsonl"),
+  ).toBe(true);
 });

 test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
-  expect(fs.existsSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl")).toBe(true);
+  expect(
+    fs.existsSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl"),
+  ).toBe(true);
 });

 test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
-  const crawl_hash = md5(JSON.parse(fs.readFileSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
-  const wacz_hash = md5(JSON.parse(fs.readFileSync("test-crawls/collections/wr-net/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
-  const fixture_hash = md5(JSON.parse(fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1])["text"]);
+  const crawl_hash = md5(
+    JSON.parse(
+      fs
+        .readFileSync(
+          "test-crawls/collections/wr-net/wacz/pages/pages.jsonl",
+          "utf8",
+        )
+        .split("\n")[1],
+    )["text"],
+  );
+  const wacz_hash = md5(
+    JSON.parse(
+      fs
+        .readFileSync(
+          "test-crawls/collections/wr-net/pages/pages.jsonl",
+          "utf8",
+        )
+        .split("\n")[1],
+    )["text"],
+  );
+  const fixture_hash = md5(
+    JSON.parse(
+      fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1],
+    )["text"],
+  );

   expect(wacz_hash).toEqual(fixture_hash);
   expect(wacz_hash).toEqual(crawl_hash);
 });

 test("check that the supplied title and description made it into datapackage.json", () => {
-  expect(fs.existsSync("test-crawls/collections/wr-net/wacz/datapackage.json")).toBe(true);
+  expect(
+    fs.existsSync("test-crawls/collections/wr-net/wacz/datapackage.json"),
+  ).toBe(true);

-  const data = fs.readFileSync("test-crawls/collections/wr-net/wacz/datapackage.json", "utf8");
+  const data = fs.readFileSync(
+    "test-crawls/collections/wr-net/wacz/datapackage.json",
+    "utf8",
+  );
   const dataPackageJSON = JSON.parse(data);
   expect(dataPackageJSON.title).toEqual("test title");
   expect(dataPackageJSON.description).toEqual("test description");
@@ -10,17 +10,21 @@ function runCrawl(name, config, commandExtra = "") {
   const configYaml = yaml.dump(config);

   try {
-    const proc = child_process.execSync(`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`, {input: configYaml, stdin: "inherit", encoding: "utf8"});
+    const proc = child_process.execSync(
+      `docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`,
+      { input: configYaml, stdin: "inherit", encoding: "utf8" },
+    );

     console.log(proc);
-  }
-  catch (error) {
+  } catch (error) {
     console.log(error);
   }
 }

 function doesCDXContain(coll, value) {
-  const data = fs.readFileSync(`test-crawls/collections/${coll}/indexes/index.cdxj`);
+  const data = fs.readFileSync(
+    `test-crawls/collections/${coll}/indexes/index.cdxj`,
+  );
   return data.indexOf(value) >= 0;
 }

@@ -39,131 +43,154 @@ test("test crawl without block for specific URL", () => {
 });
 */

 test("test block rule on specific URL", () => {
   const config = {
-    "url": "https://www.iana.org/",
-    "blockRules": [
-      {"url": "adsense"}
-    ]
+    url: "https://www.iana.org/",
+    blockRules: [{ url: "adsense" }],
   };

   runCrawl("block-1", config);

-  expect(doesCDXContain("block-1", "https://cse.google.com/adsense/search/async-ads.js")).toBe(false);
+  expect(
+    doesCDXContain(
+      "block-1",
+      "https://cse.google.com/adsense/search/async-ads.js",
+    ),
+  ).toBe(false);
 });

 test("test block rule based on iframe text, content included due to match", () => {
   const config = {
-    "url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
-    "blockRules": [{
-      "url": "https://www.youtube.com/embed/",
-      "frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"",
-      "type": "allowOnly"
-    }]
+    url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
+    blockRules: [
+      {
+        url: "https://www.youtube.com/embed/",
+        frameTextMatch:
+          '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
+        type: "allowOnly",
+      },
+    ],
   };

   runCrawl("block-2", config);

-  expect(doesCDXContain("block-2", "\"video/mp4\"")).toBe(true);
+  expect(doesCDXContain("block-2", '"video/mp4"')).toBe(true);
 });

 test("test block rule based on iframe text, wrong text, content should be excluded", () => {
   const config = {
-    "url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
-    "blockRules": [{
-      "url": "https://www.youtube.com/embed/",
-      "frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_R\\\\\"",
-      "type": "allowOnly"
-    }]
+    url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
+    blockRules: [
+      {
+        url: "https://www.youtube.com/embed/",
+        frameTextMatch:
+          '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_R\\\\"',
+        type: "allowOnly",
+      },
+    ],
   };

   runCrawl("block-3", config);

-  expect(doesCDXContain("block-3", "\"video/mp4\"")).toBe(false);
+  expect(doesCDXContain("block-3", '"video/mp4"')).toBe(false);
 });

 test("test block rule based on iframe text, block matched", () => {
   const config = {
-    "url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
-    "blockRules": [{
-      "url": "https://www.youtube.com/embed/",
-      "frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"",
-    }]
+    url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
+    blockRules: [
+      {
+        url: "https://www.youtube.com/embed/",
+        frameTextMatch:
+          '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
+      },
+    ],
   };

   runCrawl("block-4", config);

-  expect(doesCDXContain("block-4", "\"video/mp4\"")).toBe(false);
+  expect(doesCDXContain("block-4", '"video/mp4"')).toBe(false);
 });

 test("test rule based on iframe text not matching, plus allowOnly iframe", () => {
   const config = {
-    "url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
-    "blockRules": [{
-      "url": "example.com/embed/",
-      "frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"",
-      "type": "block"
-    }, {
-      "url": "(youtube.com|example.com)/embed/",
-      "type": "allowOnly",
-      "inFrameUrl": "oembed.link/",
-    }]
+    url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
+    blockRules: [
+      {
+        url: "example.com/embed/",
+        frameTextMatch:
+          '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
+        type: "block",
+      },
+      {
+        url: "(youtube.com|example.com)/embed/",
+        type: "allowOnly",
+        inFrameUrl: "oembed.link/",
+      },
+    ],
   };

   runCrawl("non-block-5", config);

-  expect(doesCDXContain("non-block-5", "\"video/mp4\"")).toBe(true);
+  expect(doesCDXContain("non-block-5", '"video/mp4"')).toBe(true);
 });

 test("test block url in frame url", () => {
   const config = {
-    "url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
-    "blockRules": [{
-      "url": "maxresdefault.jpg",
-      "type": "block",
-      "inFrameUrl": "youtube.com/embed",
-    }]
+    url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
+    blockRules: [
+      {
+        url: "maxresdefault.jpg",
+        type: "block",
+        inFrameUrl: "youtube.com/embed",
+      },
+    ],
   };

   runCrawl("block-6", config);

-  expect(doesCDXContain("block-6", "\"https://i.ytimg.com/vi/aT-Up5Y4uRI/maxresdefault.jpg\"")).toBe(false);
+  expect(
+    doesCDXContain(
+      "block-6",
+      '"https://i.ytimg.com/vi/aT-Up5Y4uRI/maxresdefault.jpg"',
+    ),
+  ).toBe(false);
 });

 test("test block rules complex example, block external urls on main frame, but not on youtube", () => {
   const config = {
-    "seeds": [
-      "https://archiveweb.page/en/troubleshooting/errors/",
-    ],
-    "depth": "0",
-    "blockRules": [{
-      "url": "(archiveweb.page|www.youtube.com)",
-      "type": "allowOnly",
-      "inFrameUrl": "archiveweb.page"
-    }, {
-      "url": "https://archiveweb.page/assets/js/vendor/lunr.min.js",
-      "inFrameUrl": "archiveweb.page"
-    }, {
-      "url": "https://www.youtube.com/embed/",
-      "type": "allowOnly",
-      "frameTextMatch": "(\\\\\"channelId\\\\\":\\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\\")"
-    }],
-
-    "combineWARC": true,
-
-    "logging": "stats,debug"
+    seeds: ["https://archiveweb.page/en/troubleshooting/errors/"],
+    depth: "0",
+    blockRules: [
+      {
+        url: "(archiveweb.page|www.youtube.com)",
+        type: "allowOnly",
+        inFrameUrl: "archiveweb.page",
+      },
+      {
+        url: "https://archiveweb.page/assets/js/vendor/lunr.min.js",
+        inFrameUrl: "archiveweb.page",
+      },
+      {
+        url: "https://www.youtube.com/embed/",
+        type: "allowOnly",
+        frameTextMatch:
+          '(\\\\"channelId\\\\":\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\")',
+      },
+    ],

+    combineWARC: true,

+    logging: "stats,debug",
   };

   runCrawl("block-7", config);

-  expect(doesCDXContain("block-7", "\"https://archiveweb.page/assets/js/vendor/lunr.min.js\"")).toBe(false);
-  expect(doesCDXContain("block-7", "\"video/mp4\"")).toBe(true);
+  expect(
+    doesCDXContain(
+      "block-7",
+      '"https://archiveweb.page/assets/js/vendor/lunr.min.js"',
+    ),
+  ).toBe(false);
+  expect(doesCDXContain("block-7", '"video/mp4"')).toBe(true);
 });
@@ -1,33 +1,32 @@
 import util from "util";
-import {exec as execCallback } from "child_process";
+import { exec as execCallback } from "child_process";

 const exec = util.promisify(execCallback);

 test("check that the collection name is properly validated", async () => {
   let passed = "";

-  try{
-    await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid");
+  try {
+    await exec(
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid",
+    );
     passed = true;
-  }
-  catch (error) {
+  } catch (error) {
     passed = false;
   }
   expect(passed).toBe(true);
 });

 test("check that the collection name is not accepted if it doesn't meets our standards", async () => {
   let passed = "";

-  try{
-    await exec("docker run webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid");
+  try {
+    await exec(
+      "docker run webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid",
+    );
     passed = true;
-  }
-  catch(e){
+  } catch (e) {
     passed = false;
   }
   expect(passed).toBe(false);
-
 });
@@ -2,21 +2,23 @@ import fs from "fs";
 import yaml from "js-yaml";

 import util from "util";
-import {exec as execCallback } from "child_process";
+import { exec as execCallback } from "child_process";

 const exec = util.promisify(execCallback);

 test("check yaml config file with seed list is used", async () => {
-  try{
-    await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --config /tests/fixtures/crawl-1.yaml --depth 0");
-  }
-  catch (error) {
+  try {
+    await exec(
+      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --config /tests/fixtures/crawl-1.yaml --depth 0",
+    );
+  } catch (error) {
     console.log(error);
   }

-  const crawledPages = fs.readFileSync("test-crawls/collections/configtest/pages/pages.jsonl", "utf8");
+  const crawledPages = fs.readFileSync(
+    "test-crawls/collections/configtest/pages/pages.jsonl",
+    "utf8",
+  );
   const pages = new Set();

   for (const line of crawledPages.trim().split("\n")) {
@@ -26,7 +28,9 @@ test("check yaml config file with seed list is used", async () => {
     }
   }

-  const config = yaml.load(fs.readFileSync("tests/fixtures/crawl-1.yaml", "utf8"));
+  const config = yaml.load(
+    fs.readFileSync("tests/fixtures/crawl-1.yaml", "utf8"),
+  );

   let foundAllSeeds = true;

@@ -38,20 +42,24 @@ test("check yaml config file with seed list is used", async () => {
   }
   expect(foundAllSeeds).toBe(true);

-  expect(fs.existsSync("test-crawls/collections/configtest/configtest.wacz")).toBe(true);
+  expect(
+    fs.existsSync("test-crawls/collections/configtest/configtest.wacz"),
+  ).toBe(true);
 });

 test("check yaml config file will be overwritten by command line", async () => {
-  try{
-    await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://specs.webrecorder.net/ --scopeType page --timeout 20000");
-  }
-  catch (error) {
+  try {
+    await exec(
+      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://specs.webrecorder.net/ --scopeType page --timeout 20000",
+    );
+  } catch (error) {
     console.log(error);
   }

-  const crawledPages = fs.readFileSync("test-crawls/collections/configtest-2/pages/pages.jsonl", "utf8");
+  const crawledPages = fs.readFileSync(
+    "test-crawls/collections/configtest-2/pages/pages.jsonl",
+    "utf8",
+  );
   const pages = new Set();

   for (const line of crawledPages.trim().split("\n")) {
@@ -63,5 +71,4 @@ test("check yaml config file will be overwritten by command line", async () => {

   expect(pages.has("https://specs.webrecorder.net/")).toBe(true);
   expect(pages.size).toBe(1);
-
 });
@@ -7,15 +7,20 @@ test("pass config file via stdin", async () => {
   const config = yaml.load(configYaml);

   try {
-    const proc = child_process.execSync("docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202", {input: configYaml, stdin: "inherit", encoding: "utf8"});
+    const proc = child_process.execSync(
+      "docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202",
+      { input: configYaml, stdin: "inherit", encoding: "utf8" },
+    );

     console.log(proc);
-  }
-  catch (error) {
+  } catch (error) {
     console.log(error);
   }

-  const crawledPages = fs.readFileSync("test-crawls/collections/config-stdin/pages/pages.jsonl", "utf8");
+  const crawledPages = fs.readFileSync(
+    "test-crawls/collections/config-stdin/pages/pages.jsonl",
+    "utf8",
+  );
   const pages = new Set();

   for (const line of crawledPages.trim().split("\n")) {
@@ -37,6 +42,7 @@ test("pass config file via stdin", async () => {
   }
   expect(foundAllSeeds).toBe(true);

-  expect(fs.existsSync("test-crawls/collections/config-stdin/config-stdin.wacz")).toBe(true);
+  expect(
+    fs.existsSync("test-crawls/collections/config-stdin/config-stdin.wacz"),
+  ).toBe(true);
 });
@@ -1,31 +1,48 @@
 import child_process from "child_process";
 import fs from "fs";

 test("ensure --overwrite with existing collection results in a successful crawl", async () => {
-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite");
+  child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite",
+  );

-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite --overwrite");
+  child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite --overwrite",
+  );
 });

 test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
-  expect(fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl")).toBe(true);
+  expect(
+    fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl"),
+  ).toBe(true);
 });

 test("check that the WACZ file exists in the collection", () => {
-  expect(fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl")).toBe(true);
+  expect(
+    fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl"),
+  ).toBe(true);
 });

 //-----------

 test("ensure --overwrite results in a successful crawl even if collection didn't exist", async () => {
-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite-nothing --overwrite");
+  child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite-nothing --overwrite",
+  );
 });

 test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
-  expect(fs.existsSync("test-crawls/collections/overwrite-nothing/pages/pages.jsonl")).toBe(true);
+  expect(
+    fs.existsSync(
+      "test-crawls/collections/overwrite-nothing/pages/pages.jsonl",
+    ),
+  ).toBe(true);
 });

 test("check that the WACZ file exists in the collection", () => {
-  expect(fs.existsSync("test-crawls/collections/overwrite-nothing/pages/pages.jsonl")).toBe(true);
+  expect(
+    fs.existsSync(
+      "test-crawls/collections/overwrite-nothing/pages/pages.jsonl",
+    ),
+  ).toBe(true);
 });
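Note: both "check that the WACZ file exists in the collection" tests in this file actually assert on pages/pages.jsonl rather than on the .wacz itself, apparently a copy-paste slip that this formatting-only change preserves. A check against the archive itself would presumably read:

    expect(fs.existsSync("test-crawls/collections/overwrite/overwrite.wacz")).toBe(true);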
@@ -1,23 +1,36 @@
 import child_process from "child_process";

 test("test custom behaviors", async () => {
-  const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page");
+  const res = child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
+  );

   const log = res.toString();

   // custom behavior ran for example.com
-  expect(log.indexOf("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.com/\",\"workerid\":0}}") > 0).toBe(true);
+  expect(
+    log.indexOf(
+      '{"state":{},"msg":"test-stat","page":"https://example.com/","workerid":0}}',
+    ) > 0,
+  ).toBe(true);

   // but not for example.org
-  expect(log.indexOf("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.org/\",\"workerid\":0}}") > 0).toBe(false);
+  expect(
+    log.indexOf(
+      '{"state":{},"msg":"test-stat","page":"https://example.org/","workerid":0}}',
+    ) > 0,
+  ).toBe(false);

-  expect(log.indexOf("{\"state\":{\"segments\":1},\"msg\":\"Skipping autoscroll, page seems to not be responsive to scrolling events\",\"page\":\"https://example.org/\",\"workerid\":0}}") > 0).toBe(true);
+  expect(
+    log.indexOf(
+      '{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events","page":"https://example.org/","workerid":0}}',
+    ) > 0,
+  ).toBe(true);

   // another custom behavior ran for webrecorder.net
-  expect(log.indexOf("{\"state\":{},\"msg\":\"test-stat-2\",\"page\":\"https://webrecorder.net/\",\"workerid\":0}}") > 0).toBe(true);
+  expect(
+    log.indexOf(
+      '{"state":{},"msg":"test-stat-2","page":"https://webrecorder.net/","workerid":0}}',
+    ) > 0,
+  ).toBe(true);
 });
@@ -1,9 +1,8 @@
 /* eslint-disable @typescript-eslint/no-unused-vars */
-class TestBehavior2
-{
+class TestBehavior2 {
   static init() {
     return {
-      state: {}
+      state: {},
     };
   }

@@ -15,8 +14,7 @@ class TestBehavior2
     return window.location.origin === "https://webrecorder.net";
   }

-  async* run(ctx) {
+  async *run(ctx) {
     ctx.log("In Test Behavior 2!");
     yield ctx.Lib.getState(ctx, "test-stat-2");
   }
@@ -1,9 +1,8 @@
 /* eslint-disable @typescript-eslint/no-unused-vars */
-class TestBehavior
-{
+class TestBehavior {
   static init() {
     return {
-      state: {}
+      state: {},
     };
   }

@@ -15,8 +14,7 @@ class TestBehavior
     return window.location.origin === "https://example.com";
   }

-  async* run(ctx) {
+  async *run(ctx) {
     ctx.log("In Test Behavior!");
     yield ctx.Lib.getState(ctx, "test-stat");
   }
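Note: the two fixtures above share one shape: a static init() that seeds the state object, a static origin check that gates where the behavior runs, and an async generator run(ctx) that yields stats via ctx.Lib.getState. A minimal sketch in the same style (the isMatch() method name is inferred from the elided lines of these hunks, and example.net is illustrative):

    /* eslint-disable @typescript-eslint/no-unused-vars */
    class ExampleBehavior {
      static init() {
        return {
          state: {},
        };
      }

      static isMatch() {
        // gate on the page origin, as the fixtures do
        return window.location.origin === "https://example.net";
      }

      async *run(ctx) {
        ctx.log("In Example Behavior!");
        yield ctx.Lib.getState(ctx, "example-stat");
      }
    }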
@@ -1,16 +1,19 @@
 import child_process from "child_process";
 import fs from "fs";

 test("ensure custom driver with custom selector crawls JS files as pages", async () => {
   try {
-    child_process.execSync("docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs");
-  }
-  catch (error) {
+    child_process.execSync(
+      "docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs",
+    );
+  } catch (error) {
     console.log(error);
   }

-  const crawledPages = fs.readFileSync("test-crawls/collections/custom-driver-1/pages/pages.jsonl", "utf8");
+  const crawledPages = fs.readFileSync(
+    "test-crawls/collections/custom-driver-1/pages/pages.jsonl",
+    "utf8",
+  );
   const pages = new Set();

   for (const line of crawledPages.trim().split("\n")) {

@@ -26,9 +29,8 @@ test("ensure custom driver with custom selector crawls JS files as pages", async
   const expectedPages = new Set([
     "https://www.iana.org/",
     "https://www.iana.org/_js/jquery.js",
-    "https://www.iana.org/_js/iana.js"
+    "https://www.iana.org/_js/iana.js",
   ]);

   expect(pages).toEqual(expectedPages);
 });
@@ -1,42 +1,49 @@
 import fs from "fs";

 import util from "util";
-import {exec as execCallback } from "child_process";
+import { exec as execCallback } from "child_process";

 const exec = util.promisify(execCallback);

 const extraHopsTimeout = 180000;

-test("check that URLs are crawled 2 extra hops beyond depth", async () => {
-  try {
-    await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://webrecorder.net/ --limit 7");
-  }
-  catch (error) {
-    console.log(error);
-  }
-
-  const crawledPages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8");
-  const crawledPagesArray = crawledPages.trim().split("\n");
-
-  const expectedPages = [
-    "https://webrecorder.net/",
-    "https://webrecorder.net/blog",
-    "https://webrecorder.net/tools",
-    "https://webrecorder.net/community",
-    "https://webrecorder.net/about",
-    "https://webrecorder.net/contact",
-    "https://webrecorder.net/faq",
-  ];
-
-  // first line is the header, not page, so adding -1
-  expect(crawledPagesArray.length - 1).toEqual(expectedPages.length);
-
-  for (const page of crawledPagesArray) {
-    const url = JSON.parse(page).url;
-    if (!url) {
-      continue;
-    }
-    expect(expectedPages.indexOf(url) >= 0).toBe(true);
-  }
-}, extraHopsTimeout);
+test(
+  "check that URLs are crawled 2 extra hops beyond depth",
+  async () => {
+    try {
+      await exec(
+        "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://webrecorder.net/ --limit 7",
+      );
+    } catch (error) {
+      console.log(error);
+    }
+
+    const crawledPages = fs.readFileSync(
+      "test-crawls/collections/extra-hops-beyond/pages/pages.jsonl",
+      "utf8",
+    );
+    const crawledPagesArray = crawledPages.trim().split("\n");
+
+    const expectedPages = [
+      "https://webrecorder.net/",
+      "https://webrecorder.net/blog",
+      "https://webrecorder.net/tools",
+      "https://webrecorder.net/community",
+      "https://webrecorder.net/about",
+      "https://webrecorder.net/contact",
+      "https://webrecorder.net/faq",
+    ];
+
+    // first line is the header, not page, so adding -1
+    expect(crawledPagesArray.length - 1).toEqual(expectedPages.length);
+
+    for (const page of crawledPagesArray) {
+      const url = JSON.parse(page).url;
+      if (!url) {
+        continue;
+      }
+      expect(expectedPages.indexOf(url) >= 0).toBe(true);
+    }
+  },
+  extraHopsTimeout,
+);
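Note: the reformat above surfaces Jest's optional third argument, test(name, fn, timeout); that is why extraHopsTimeout now sits on its own line instead of trailing the callback. In isolation:

    // Jest: a per-test timeout in ms can be passed as the third argument,
    // overriding the default timeout for slow end-to-end tests
    test(
      "a long-running end-to-end check",
      async () => {
        // ...crawl and assert...
      },
      180000,
    );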
@@ -2,17 +2,18 @@ import child_process from "child_process";
 import fs from "fs";

 test("ensure that stats file is modified", async () => {
-  const child = child_process.exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --limit 3 --collection file-stats --statsFilename progress.json");
+  const child = child_process.exec(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --limit 3 --collection file-stats --statsFilename progress.json",
+  );

   // detect crawler exit
   let crawler_exited = false;
-  child.on("exit", function() {
+  child.on("exit", function () {
     crawler_exited = true;
   });

   // helper function to sleep
-  const sleep = ms => new Promise(res => setTimeout(res, ms));
+  const sleep = (ms) => new Promise((res) => setTimeout(res, ms));

   // wait for stats file creation up to 30 secs (to not wait indefinitely)
   let counter = 0;

@@ -23,7 +24,9 @@ test("ensure that stats file is modified", async () => {
   }

   // get initial modification time
-  const initial_mtime = fs.fstatSync(fs.openSync("test-crawls/progress.json", "r")).mtime;
+  const initial_mtime = fs.fstatSync(
+    fs.openSync("test-crawls/progress.json", "r"),
+  ).mtime;

   // wait for crawler exit
   while (!crawler_exited) {

@@ -31,12 +34,13 @@ test("ensure that stats file is modified", async () => {
   }

   // get final modification time
-  const final_mtime = fs.fstatSync(fs.openSync("test-crawls/progress.json", "r")).mtime;
+  const final_mtime = fs.fstatSync(
+    fs.openSync("test-crawls/progress.json", "r"),
+  ).mtime;

   // compare initial and final modification time
   const diff = Math.abs(final_mtime - initial_mtime);
   expect(diff > 0).toBe(true);
 });

 test("check that stats file format is correct", () => {
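Note: the sleep helper anchors a poll-until-timeout loop whose body is elided from this hunk. A sketch of the likely pattern, assuming the same stats path and an enclosing async test:

    import fs from "fs";

    const sleep = (ms) => new Promise((res) => setTimeout(res, ms));

    // poll for the stats file for up to ~30s rather than waiting forever
    let counter = 0;
    while (!fs.existsSync("test-crawls/progress.json") && counter < 30) {
      await sleep(1000);
      counter++;
    }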
tests/fixtures/crawl-1.yaml (vendored, 1 line changed)

@@ -5,4 +5,3 @@ seeds:
   - https://specs.webrecorder.net/

 generateWACZ: true
-

tests/fixtures/driver-1.mjs (vendored, 7 lines changed)

@@ -1,4 +1,5 @@
-export default async ({data, page, crawler}) => {
-  await crawler.loadPage(page, data, [{selector: "script[src]", extract: "src", isAttribute: false}]);
+export default async ({ data, page, crawler }) => {
+  await crawler.loadPage(page, data, [
+    { selector: "script[src]", extract: "src", isAttribute: false },
+  ]);
 };
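Note: driver-1.mjs is the custom driver used by the custom-driver test above; the third argument to crawler.loadPage is a list of link-extraction specs. A sketch extending it (the a[href] spec is an illustrative guess that mirrors the fixture's shape, not something taken from this commit):

    export default async ({ data, page, crawler }) => {
      await crawler.loadPage(page, data, [
        // queue every script URL as its own page (from the fixture)
        { selector: "script[src]", extract: "src", isAttribute: false },
        // illustrative: ordinary links, extracted from the resolved href property
        { selector: "a[href]", extract: "href", isAttribute: false },
      ]);
    };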
@@ -2,8 +2,9 @@ import child_process from "child_process";
 import fs from "fs";

 test("ensure page limit reached", async () => {
-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors \"\" --url https://webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json");
+  child_process.execSync(
+    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors "" --url https://webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json',
+  );
 });

 test("check limit written to stats file is as expected", () => {
@@ -2,9 +2,9 @@ import child_process from "child_process";
 import fs from "fs";
 import path from "path";

 function jsonLinesToArray(string) {
-  return string.split("\n")
+  return string
+    .split("\n")
     .filter((line) => {
       try {
         JSON.parse(line);

@@ -13,19 +13,19 @@ function jsonLinesToArray(string) {
         return false;
       }
     })
-    .map(line => JSON.parse(line));
+    .map((line) => JSON.parse(line));
 }

 test("ensure crawl run with log options passes", async () => {
-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://specs.webrecorder.net --generateWACZ --collection wr-specs-logs --logging debug,stats --logLevel debug,warn --context general");
+  child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://specs.webrecorder.net --generateWACZ --collection wr-specs-logs --logging debug,stats --logLevel debug,warn --context general",
+  );
 });

 test("check that log files exist and were filtered according to options", () => {
   const logDir = "test-crawls/collections/wr-specs-logs/logs/";
   const logFiles = [];
-  fs.readdirSync(logDir).forEach(file => {
+  fs.readdirSync(logDir).forEach((file) => {
     if (file.startsWith("crawl-") && file.endsWith(".log")) {
       logFiles.push(path.join(logDir, file));
     }

@@ -33,14 +33,16 @@ test("check that log files exist and were filtered according to options", () =>

   expect(logFiles.length).toBeGreaterThan(0);

-  for (let i=0; i < logFiles.length; i++) {
+  for (let i = 0; i < logFiles.length; i++) {
     const logFile = logFiles[i];
     const parsedJSONLines = jsonLinesToArray(fs.readFileSync(logFile, "utf8"));

     expect(parsedJSONLines.length).toBeGreaterThan(0);

     parsedJSONLines.forEach((jsonLine) => {
-      expect(jsonLine.logLevel === "debug" || jsonLine.logLevel === "warn").toBe(true);
+      expect(
+        jsonLine.logLevel === "debug" || jsonLine.logLevel === "warn",
+      ).toBe(true);
       expect(jsonLine.context).toBe("general");
     });
   }
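Note: jsonLinesToArray filters before mapping, so a stray non-JSON line (a truncated write, a blank trailing line) is dropped instead of throwing. For example:

    const parsed = jsonLinesToArray('{"a":1}\nnot json\n{"b":2}');
    // parsed -> [{ a: 1 }, { b: 2 }]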
@@ -2,24 +2,47 @@ import child_process from "child_process";
 import fs from "fs";

 test("ensure multi url crawl run with docker run passes", async () => {
-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title \"test title\" --description \"test description\" --pages 2 --limit 2");
-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz");
+  child_process.execSync(
+    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2',
+  );
+
+  child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz",
+  );
 });

 test("check that the favicon made it into the pages jsonl file", () => {
-  expect(fs.existsSync("test-crawls/collections/advanced/pages/pages.jsonl")).toBe(true);
+  expect(
+    fs.existsSync("test-crawls/collections/advanced/pages/pages.jsonl"),
+  ).toBe(true);

-  const data1 = JSON.parse(fs.readFileSync("test-crawls/collections/advanced/pages/pages.jsonl", "utf8").split("\n")[1]);
-  const data2 = JSON.parse(fs.readFileSync("test-crawls/collections/advanced/pages/pages.jsonl", "utf8").split("\n")[2]);
-  const data = [ data1, data2 ];
+  const data1 = JSON.parse(
+    fs
+      .readFileSync(
+        "test-crawls/collections/advanced/pages/pages.jsonl",
+        "utf8",
+      )
+      .split("\n")[1],
+  );
+  const data2 = JSON.parse(
+    fs
+      .readFileSync(
+        "test-crawls/collections/advanced/pages/pages.jsonl",
+        "utf8",
+      )
+      .split("\n")[2],
+  );
+  const data = [data1, data2];
   for (const d of data) {
     if (d.url === "https://webrecorder.net/") {
-      expect(d.favIconUrl).toEqual("https://webrecorder.net/assets/favicon.ico");
+      expect(d.favIconUrl).toEqual(
+        "https://webrecorder.net/assets/favicon.ico",
+      );
     }
     if (d.url === "https://iana.org/") {
-      expect(d.favIconUrl).toEqual("https://www.iana.org/_img/bookmark_icon.ico");
+      expect(d.favIconUrl).toEqual(
+        "https://www.iana.org/_img/bookmark_icon.ico",
+      );
     }
   }
 });
@@ -1,14 +1,19 @@
 import child_process from "child_process";

 test("ensure crawl run with redis passes", async () => {
-  const redis = child_process.spawn("docker run -d --name test-crawl-redis -p 6379:6379 redis");
+  const redis = child_process.spawn(
+    "docker run -d --name test-crawl-redis -p 6379:6379 redis",
+  );

-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection redis-crawl --redisStoreUrl redis://127.0.0.1:6379 --workers 2");
+  child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection redis-crawl --redisStoreUrl redis://127.0.0.1:6379 --workers 2",
+  );

   redis.kill("SIGINT");
 });

 test("check that wacz created is valid", () => {
-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/redis-crawl/redis-crawl.wacz");
+  child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/redis-crawl/redis-crawl.wacz",
+  );
 });
@@ -13,7 +13,7 @@ function waitForProcess() {
     };
   });

-  return {p, callback};
+  return { p, callback };
 }

 var savedStateFile;

@@ -28,9 +28,12 @@ test("check crawl interrupted + saved state written", async () => {
   const wait = waitForProcess();

   try {
-    proc = exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --limit 20", {"shell": "/bin/bash"}, wait.callback);
-  }
-  catch (error) {
+    proc = exec(
+      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --limit 20",
+      { shell: "/bin/bash" },
+      wait.callback,
+    );
+  } catch (error) {
     console.log(error);
   }

@@ -45,12 +48,15 @@ test("check crawl interrupted + saved state written", async () => {

   while (true) {
     try {
-      const pages = fs.readFileSync(pagesFile, {encoding: "utf-8"}).trim().split("\n");
+      const pages = fs
+        .readFileSync(pagesFile, { encoding: "utf-8" })
+        .trim()
+        .split("\n");

       if (pages.length >= 2) {
         break;
       }
-    } catch(e) {
+    } catch (e) {
       // ignore
     }

@@ -61,17 +67,21 @@ test("check crawl interrupted + saved state written", async () => {

   await wait.p;

-  const savedStates = fs.readdirSync("test-crawls/collections/int-state-test/crawls");
+  const savedStates = fs.readdirSync(
+    "test-crawls/collections/int-state-test/crawls",
+  );
   expect(savedStates.length > 0).toEqual(true);

   savedStateFile = savedStates[savedStates.length - 1];
 });

 test("check parsing saved state + page done + queue present", () => {
   expect(savedStateFile).toBeTruthy();

-  const savedState = fs.readFileSync(path.join("test-crawls/collections/int-state-test/crawls", savedStateFile), "utf-8");
+  const savedState = fs.readFileSync(
+    path.join("test-crawls/collections/int-state-test/crawls", savedStateFile),
+    "utf-8",
+  );

   const saved = yaml.load(savedState);

@@ -82,31 +92,33 @@ test("check parsing saved state + page done + queue present", () => {

   expect(state.done > 0).toEqual(true);
   expect(state.queued.length > 0).toEqual(true);
 });

 test("check crawl restarted with saved state", async () => {
   let proc = null;

   const wait = waitForProcess();

   try {
-    proc = exec(`docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`, {shell: "/bin/bash"}, wait.callback);
+    proc = exec(
+      `docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`,
+      { shell: "/bin/bash" },
+      wait.callback,
+    );
   } catch (error) {
     console.log(error);
   }

   await new Promise((resolve) => setTimeout(resolve, 2000));

-  redis = new Redis("redis://127.0.0.1:36379/0", {lazyConnect: true});
+  redis = new Redis("redis://127.0.0.1:36379/0", { lazyConnect: true });

   try {
     await redis.connect({
       maxRetriesPerRequest: 100,
       retryStrategy(times) {
         return times < 100 ? 1000 : null;
-      }
+      },
     });

     await new Promise((resolve) => setTimeout(resolve, 2000));

@@ -126,5 +138,3 @@ test("interrupt crawl and exit", async () => {

   expect(res[0].value).toBe(0);
 });
-
-
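Note: the retryStrategy callback follows ioredis semantics: return a delay in milliseconds to schedule the next reconnection attempt, or null to give up. The same policy can also be set at construction time; a sketch using the test's local URL:

    import Redis from "ioredis";

    const redis = new Redis("redis://127.0.0.1:36379/0", {
      lazyConnect: true, // defer connecting until .connect() is called
      maxRetriesPerRequest: 100,
      retryStrategy(times) {
        return times < 100 ? 1000 : null; // retry every 1s, give up after 100 tries
      },
    });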
@@ -23,12 +23,10 @@ seeds:
 `);

-
   expect(seeds.length).toEqual(1);
   expect(seeds[0].scopeType).toEqual("prefix");
   expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
   expect(seeds[0].exclude).toEqual([]);
-
 });

 test("default scope + exclude", async () => {

@@ -40,15 +38,12 @@ exclude: https://example.com/pathexclude
 `);

-
   expect(seeds.length).toEqual(1);
   expect(seeds[0].scopeType).toEqual("prefix");
   expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
   expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
-
 });
-
 test("default scope + exclude is numeric", async () => {
   const seeds = getSeeds(`
 seeds:

@@ -58,17 +53,12 @@ exclude: "2022"
 `);

-
   expect(seeds.length).toEqual(1);
   expect(seeds[0].scopeType).toEqual("prefix");
   expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
   expect(seeds[0].exclude).toEqual([/2022/]);
-
 });
-
-
-
 test("prefix scope global + exclude", async () => {
   const seeds = getSeeds(`
 seeds:

@@ -79,15 +69,12 @@ exclude: https://example.com/pathexclude
 `);

-
   expect(seeds.length).toEqual(1);
   expect(seeds[0].scopeType).toEqual("prefix");
   expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
   expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
-
 });
-
 test("prefix scope per seed + exclude", async () => {
   const seeds = getSeeds(`
 seeds:

@@ -98,15 +85,12 @@ exclude: https://example.com/pathexclude
 `);

-
   expect(seeds.length).toEqual(1);
   expect(seeds[0].scopeType).toEqual("prefix");
   expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
   expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
-
 });
-
 test("host scope and domain scope", async () => {
   const seeds = getSeeds(`

@@ -123,20 +107,26 @@ seeds:
   expect(seeds[0].include).toEqual([/^https?:\/\/([^/]+\.)*example\.com\//]);
   expect(!!seeds[0].include[0].exec("https://example.com/")).toEqual(true);
   expect(!!seeds[0].include[0].exec("https://example.com/path")).toEqual(true);
-  expect(!!seeds[0].include[0].exec("https://sub.example.com/path")).toEqual(true);
-  expect(!!seeds[0].include[0].exec("https://sub.domain.example.com/path")).toEqual(true);
-  expect(!!seeds[0].include[0].exec("https://notsub.domainexample.com/path")).toEqual(false);
+  expect(!!seeds[0].include[0].exec("https://sub.example.com/path")).toEqual(
+    true,
+  );
+  expect(
+    !!seeds[0].include[0].exec("https://sub.domain.example.com/path"),
+  ).toEqual(true);
+  expect(
+    !!seeds[0].include[0].exec("https://notsub.domainexample.com/path"),
+  ).toEqual(false);

   expect(seeds[1].scopeType).toEqual("host");
   expect(seeds[1].include).toEqual([/^https?:\/\/example\.org\//]);
   expect(!!seeds[1].include[0].exec("https://example.org/")).toEqual(true);
   expect(!!seeds[1].include[0].exec("https://example.org/path")).toEqual(true);
-  expect(!!seeds[1].include[0].exec("https://sub.example.com/path")).toEqual(false);
+  expect(!!seeds[1].include[0].exec("https://sub.example.com/path")).toEqual(
+    false,
+  );
 });

 test("domain scope drop www.", async () => {
   const seeds = getSeeds(`
 seeds:
   - url: https://www.example.com/

@@ -146,11 +136,8 @@ seeds:
   expect(seeds.length).toEqual(1);
   expect(seeds[0].scopeType).toEqual("domain");
   expect(seeds[0].include).toEqual([/^https?:\/\/([^/]+\.)*example\.com\//]);
-
 });
-
-
 test("custom scope", async () => {
   const seeds = getSeeds(`
 seeds:

@@ -159,14 +146,12 @@ seeds:
 exclude: https?://example.com/pathexclude
 `);

-
   expect(seeds.length).toEqual(1);
   expect(seeds[0].scopeType).toEqual("custom");
   expect(seeds[0].include).toEqual([/https?:\/\/example.com\/(path|other)/]);
   expect(seeds[0].exclude).toEqual([/https?:\/\/example.com\/pathexclude/]);
 });
-
 test("inherit scope", async () => {
   const seeds = getSeeds(`

@@ -178,7 +163,6 @@ include: https?://example.com/(path|other)
 exclude: https://example.com/pathexclude
 `);

-
   expect(seeds.length).toEqual(2);

   expect(seeds[0].scopeType).toEqual("custom");

@@ -190,10 +174,8 @@ exclude: https://example.com/pathexclude
   expect(seeds[1].url).toEqual("https://example.com/2");
   expect(seeds[1].include).toEqual([/https?:\/\/example.com\/(path|other)/]);
   expect(seeds[1].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
-
 });
-
 test("override scope", async () => {
   const seeds = getSeeds(`

@@ -225,7 +207,10 @@ include: https://example.com/onlythispath

   expect(seeds[2].scopeType).toEqual("prefix");
   expect(seeds[2].url).toEqual("https://example.com/subpath/file.html");
-  expect(seeds[2].include).toEqual([/^https?:\/\/example\.com\/subpath\//, /https:\/\/example.com\/onlythispath/]);
+  expect(seeds[2].include).toEqual([
+    /^https?:\/\/example\.com\/subpath\//,
+    /https:\/\/example.com\/onlythispath/,
+  ]);
   expect(seeds[2].exclude).toEqual([]);

   expect(seeds[3].scopeType).toEqual("custom");

@@ -234,7 +219,6 @@ include: https://example.com/onlythispath
   expect(seeds[3].exclude).toEqual([]);
 });
-
 test("override scope with exclude", async () => {
   const seeds = getSeeds(`

@@ -288,10 +272,8 @@ exclude:
   expect(seeds[4].url).toEqual("https://example.com/4");
   expect(seeds[4].include).toEqual([]);
   expect(seeds[4].exclude).toEqual([]);
-
 });
-
 test("with exclude non-string types", async () => {
   const seeds = getSeeds(`
 seeds:

@@ -342,5 +324,4 @@ seeds:
   expect(seeds[7].exclude).toEqual([/null/]);
   expect(seeds[8].exclude).toEqual([/false/]);
   expect(seeds[9].exclude).toEqual([/true/]);
-
 });
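Note: the include patterns asserted above read most easily by example: "prefix" anchors at the seed's host, while "domain" also admits subdomains.

    const prefix = /^https?:\/\/example\.com\//;
    const domain = /^https?:\/\/([^/]+\.)*example\.com\//;

    prefix.test("https://example.com/path"); // true
    prefix.test("https://sub.example.com/path"); // false
    domain.test("https://sub.example.com/path"); // true
    domain.test("https://notsub.domainexample.com/path"); // false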
@@ -4,48 +4,66 @@ import fs from "fs";
 // screenshot

 test("ensure basic crawl run with --screenshot passes", async () => {
-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection test --url http://www.example.com/ --screenshot view --workers 2");
+  child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection test --url http://www.example.com/ --screenshot view --workers 2",
+  );
 });

 test("check that a screenshots warc file exists in the test collection", () => {
-  const screenshotWarcExists = fs.existsSync("test-crawls/collections/test/archive/screenshots.warc.gz");
+  const screenshotWarcExists = fs.existsSync(
+    "test-crawls/collections/test/archive/screenshots.warc.gz",
+  );
   expect(screenshotWarcExists).toBe(true);
 });

 // fullPageScreenshot

 test("ensure basic crawl run with --fullPageScreenshot passes", async () => {
-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection fullpage --url http://www.example.com/ --screenshot fullPage --workers 2");
+  child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection fullpage --url http://www.example.com/ --screenshot fullPage --workers 2",
+  );
 });

 test("check that a screenshots warc file exists in the fullpage collection", () => {
-  const screenshotWarcExists = fs.existsSync("test-crawls/collections/fullpage/archive/screenshots.warc.gz");
+  const screenshotWarcExists = fs.existsSync(
+    "test-crawls/collections/fullpage/archive/screenshots.warc.gz",
+  );
   expect(screenshotWarcExists).toBe(true);
 });

 // thumbnail

 test("ensure basic crawl run with --thumbnail passes", async () => {
-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection thumbnail --url http://www.example.com/ --screenshot thumbnail --workers 2");
+  child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection thumbnail --url http://www.example.com/ --screenshot thumbnail --workers 2",
+  );
 });

 test("check that a screenshots warc file exists in the thumbnail collection", () => {
-  const screenshotWarcExists = fs.existsSync("test-crawls/collections/thumbnail/archive/screenshots.warc.gz");
+  const screenshotWarcExists = fs.existsSync(
+    "test-crawls/collections/thumbnail/archive/screenshots.warc.gz",
+  );
   expect(screenshotWarcExists).toBe(true);
 });

 // combination

 test("ensure basic crawl run with multiple screenshot types and --generateWACZ passes", async () => {
-  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection combined --url http://www.example.com/ --screenshot thumbnail,view,fullPage --generateWACZ --workers 2");
+  child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection combined --url http://www.example.com/ --screenshot thumbnail,view,fullPage --generateWACZ --workers 2",
+  );
 });

 test("check that a screenshots warc file exists in the combined collection", () => {
-  const screenshotWarcExists = fs.existsSync("test-crawls/collections/combined/archive/screenshots.warc.gz");
+  const screenshotWarcExists = fs.existsSync(
+    "test-crawls/collections/combined/archive/screenshots.warc.gz",
+  );
   expect(screenshotWarcExists).toBe(true);
 });

 test("check that a wacz file exists in the combined collection", () => {
-  const waczExists = fs.existsSync("test-crawls/collections/combined/combined.wacz");
+  const waczExists = fs.existsSync(
+    "test-crawls/collections/combined/combined.wacz",
+  );
   expect(waczExists).toBe(true);
 });
@@ -1,13 +1,14 @@
 import util from "util";
-import {exec as execCallback } from "child_process";
+import { exec as execCallback } from "child_process";

 const exec = util.promisify(execCallback);

 test("ensure one invalid seed doesn't end crawl if failOnFailedSeed is not set", async () => {
   let passed = true;
   try {
-    await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection invalidseed");
+    await exec(
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection invalidseed",
+    );
   } catch (error) {
     console.log(error);
     passed = false;

@@ -18,9 +19,10 @@ test("ensure one invalid seed doesn't end crawl if failOnFailedSeed is not set",
 test("ensure one invalid seed fails crawl if failOnFailedSeed is set", async () => {
   let passed = true;
   try {
-    await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --failOnFailedSeed --collection failseed");
-  }
-  catch (error) {
+    await exec(
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --failOnFailedSeed --collection failseed",
+    );
+  } catch (error) {
     passed = false;
   }
   expect(passed).toBe(false);

@@ -29,9 +31,10 @@ test("ensure one invalid seed fails crawl if failOnFailedSeed is set", async ()
 test("ensure crawl fails if no valid seeds are passed", async () => {
   let passed = true;
   try {
-    await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection allinvalidseeds");
-  }
-  catch (error) {
+    await exec(
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection allinvalidseeds",
+    );
+  } catch (error) {
     passed = false;
   }
   expect(passed).toBe(false);
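Note: these assertions lean on the fact that a promisified exec rejects whenever the child process exits non-zero, so "the crawl failed" is observable as a thrown error. The skeleton shared by all three tests:

    let passed = true;
    try {
      await exec("docker run ... webrecorder/browsertrix-crawler crawl ..."); // command elided
    } catch (error) {
      passed = false;
    }
    expect(passed).toBe(false); // or toBe(true) for the non-fatal case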
@@ -1,5 +1,7 @@
-import { calculatePercentageUsed, checkDiskUtilization } from "../dist/util/storage.js";
+import {
+  calculatePercentageUsed,
+  checkDiskUtilization,
+} from "../dist/util/storage.js";

 test("ensure calculatePercentageUsed returns expected values", () => {
   expect(calculatePercentageUsed(30, 100)).toEqual(30);

@@ -13,13 +15,11 @@ test("ensure calculatePercentageUsed returns expected values", () => {
   expect(calculatePercentageUsed(0, 5)).toEqual(0);
 });

-
 test("verify end-to-end disk utilization not exceeded threshold", async () => {
-
   const params = {
     diskUtilization: 90,
     combineWARC: true,
-    generateWACZ: true
+    generateWACZ: true,
   };

   const mockDfOutput = `\

@@ -28,22 +28,24 @@ grpcfuse 1000000 285000 715000 28% /crawls`;

   // with combineWARC + generateWACZ, projected is 285k + 4 * 5k = 310k = 31%
   // does not exceed 90% threshold
-  const returnValue = await checkDiskUtilization(params, 5000 * 1024, mockDfOutput);
+  const returnValue = await checkDiskUtilization(
+    params,
+    5000 * 1024,
+    mockDfOutput,
+  );
   expect(returnValue).toEqual({
     stop: false,
     used: 28,
     projected: 31,
-    threshold: 90
+    threshold: 90,
   });
 });

 test("verify end-to-end disk utilization exceeds threshold", async () => {
-
   const params = {
     diskUtilization: 90,
     combineWARC: false,
-    generateWACZ: true
+    generateWACZ: true,
   };

   const mockDfOutput = `\

@@ -52,11 +54,15 @@ grpcfuse 100000 85000 15000 85% /crawls`;

   // with generateWACZ, projected is 85k + 3k x 2 = 91k = 91%
   // exceeds 90% threshold
-  const returnValue = await checkDiskUtilization(params, 3000 * 1024, mockDfOutput);
+  const returnValue = await checkDiskUtilization(
+    params,
+    3000 * 1024,
+    mockDfOutput,
+  );
   expect(returnValue).toEqual({
     stop: true,
     used: 85,
     projected: 91,
-    threshold: 90
+    threshold: 90,
   });
 });
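Note: the projection arithmetic in those comments is worth spelling out. A sketch of the calculation the expectations encode (not the library's exact code): generateWACZ roughly doubles the space the current archive will need, and combineWARC doubles it again.

    // df numbers from the second mock: 100000 KB total, 85000 KB used (85%)
    const totalKB = 100000;
    const usedKB = 85000;
    const archiveKB = 3000; // current size of the collection's archive

    // generateWACZ alone doubles the extra space: 85k + 3k * 2 = 91k
    // (with combineWARC too the factor is 4, as in the first test)
    const projectedKB = usedKB + archiveKB * 2;
    const projectedPct = Math.round((projectedKB / totalKB) * 100); // 91

    console.log(projectedPct > 90); // true: exceeds the 90 threshold, so stop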
@@ -3,16 +3,20 @@ import child_process from "child_process";

 test("check that urn:text and urn:textfinal records are written to WARC", async () => {
   try {
-    child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc");
+    child_process.execSync(
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc",
+    );
   } catch (error) {
     //console.log(new TextDecoder().decode(error));
     console.log(error.stderr);
   }

-  const data = fs.readFileSync("test-crawls/collections/text-extract/indexes/index.cdxj", {"encoding": "utf-8"});
+  const data = fs.readFileSync(
+    "test-crawls/collections/text-extract/indexes/index.cdxj",
+    { encoding: "utf-8" },
+  );

   expect(data.indexOf("urn:text:https://www.nytimes.com/") > 0).toBe(true);

   expect(data.indexOf("urn:textFinal:https://www.nytimes.com/") > 0).toBe(true);
 });
@@ -1,24 +1,30 @@
 import util from "util";
-import {exec as execCallback } from "child_process";
+import { exec as execCallback } from "child_process";
 import fs from "fs";

 const exec = util.promisify(execCallback);

 test("check that URLs in seed-list are crawled", async () => {
   try {
-    await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000");
-  }
-  catch (error) {
+    await exec(
+      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000",
+    );
+  } catch (error) {
     console.log(error);
   }

-  let crawled_pages = fs.readFileSync("test-crawls/collections/filelisttest/pages/pages.jsonl", "utf8");
-  let seed_file = fs.readFileSync("tests/fixtures/urlSeedFile.txt", "utf8").split("\n").sort();
+  let crawled_pages = fs.readFileSync(
+    "test-crawls/collections/filelisttest/pages/pages.jsonl",
+    "utf8",
+  );
+  let seed_file = fs
+    .readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")
+    .split("\n")
+    .sort();

   let seed_file_list = [];
   for (var j = 0; j < seed_file.length; j++) {
-    if (seed_file[j] != undefined){
+    if (seed_file[j] != undefined) {
       seed_file_list.push(seed_file[j]);
     }
   }

@@ -26,7 +32,7 @@ test("check that URLs in seed-list are crawled", async () => {
   let foundSeedUrl = true;

   for (var i = 1; i < seed_file_list.length; i++) {
-    if (crawled_pages.indexOf(seed_file_list[i]) == -1){
+    if (crawled_pages.indexOf(seed_file_list[i]) == -1) {
       foundSeedUrl = false;
     }
   }
@@ -3,17 +3,21 @@ import zlib from "zlib";
 import child_process from "child_process";
 
 test("check that the warcinfo file works as expected on the command line", async () => {
-  try{
+  try {
     const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
-    const proc = child_process.execSync("docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC", {input: configYaml, stdin: "inherit", encoding: "utf8"});
+    const proc = child_process.execSync(
+      "docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC",
+      { input: configYaml, stdin: "inherit", encoding: "utf8" },
+    );
 
     console.log(proc);
-  }
-  catch (error) {
+  } catch (error) {
     console.log(error);
   }
 
-  const warcData = fs.readFileSync("test-crawls/collections/warcinfo/warcinfo_0.warc.gz");
+  const warcData = fs.readFileSync(
+    "test-crawls/collections/warcinfo/warcinfo_0.warc.gz",
+  );
 
   const data = zlib.gunzipSync(warcData);
 
@@ -21,8 +25,8 @@ test("check that the warcinfo file works as expected on the command line", async
 
   expect(string.indexOf("operator: test")).toBeGreaterThan(-1);
   expect(string.indexOf("host: hostname")).toBeGreaterThan(-1);
-  expect(string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/)).not.toEqual(null);
+  expect(
+    string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/),
+  ).not.toEqual(null);
   expect(string.indexOf("format: WARC File Format 1.0")).toBeGreaterThan(-1);
 
 
 });
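Note that the `string` value asserted on in the second hunk is defined between the two hunks, outside the visible diff context; presumably it is the gunzipped WARC data decoded to text, along these lines (an assumption, not shown in the diff):

// Hypothetical reconstruction of the elided line: decode the
// gunzipped Buffer to UTF-8 text before running the expectations.
const string = data.toString("utf8");
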
@@ -11,8 +11,12 @@
     // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
 
     /* Language and Environment */
-    "target": "es2022", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
-    "lib": ["es2022", "dom", "dom.iterable"], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
+    "target": "es2022" /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */,
+    "lib": [
+      "es2022",
+      "dom",
+      "dom.iterable"
+    ] /* Specify a set of bundled library declaration files that describe the target runtime environment. */,
     // "jsx": "preserve", /* Specify what JSX code is generated. */
     // "experimentalDecorators": true, /* Enable experimental support for TC39 stage 2 draft decorators. */
     // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
@@ -25,9 +29,9 @@
     // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
 
     /* Modules */
-    "module": "NodeNext", /* Specify what module code is generated. */
-    "rootDir": "./src", /* Specify the root folder within your source files. */
-    "moduleResolution": "NodeNext", /* Specify how TypeScript looks up a file from a given module specifier. */
+    "module": "NodeNext" /* Specify what module code is generated. */,
+    "rootDir": "./src" /* Specify the root folder within your source files. */,
+    "moduleResolution": "NodeNext" /* Specify how TypeScript looks up a file from a given module specifier. */,
     //"baseUrl": "./src", /* Specify the base directory to resolve non-relative module names. */
     // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
     // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
@@ -39,8 +43,8 @@
     // "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
 
     /* JavaScript Support */
-    "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
-    "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
+    "allowJs": true /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */,
+    "checkJs": true /* Enable error reporting in type-checked JavaScript files. */,
     // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
 
     /* Emit */
@@ -49,7 +53,7 @@
     // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
     // "sourceMap": true, /* Create source map files for emitted JavaScript files. */
     // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
-    "outDir": "./dist/", /* Specify an output folder for all emitted files. */
+    "outDir": "./dist/" /* Specify an output folder for all emitted files. */,
     // "removeComments": true, /* Disable emitting comments. */
     // "noEmit": true, /* Disable emitting files from a compilation. */
     // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
@@ -73,10 +77,10 @@
     // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
     //"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
     // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
-    "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
+    "forceConsistentCasingInFileNames": true /* Ensure that casing is correct in imports. */,
 
     /* Type Checking */
-    "strict": true, /* Enable all strict type-checking options. */
+    "strict": true /* Enable all strict type-checking options. */,
     // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
     // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
     // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
@@ -98,10 +102,8 @@
 
     /* Completeness */
     // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
     "skipLibCheck": true /* Skip type checking all .d.ts files. */
   },
 
-  "include": [
-    "src/**/*",
-  ]
+  "include": ["src/**/*"]
 }
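Nearly every change in this tsconfig.json hunk set is the same Prettier rule at work: when a trailing comment sits between a value and its comma, Prettier moves the comma after the comment so the comment stays attached to the value. A minimal before/after sketch with a hypothetical object, in JavaScript for illustration:

// Before formatting:
const before = {
  target: "es2022", /* language version */
};

// After Prettier: the comma follows the trailing comment.
const after = {
  target: "es2022" /* language version */,
};
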
10 yarn.lock
@@ -1914,6 +1914,11 @@ escodegen@^2.1.0:
   optionalDependencies:
     source-map "~0.6.1"
 
+eslint-config-prettier@^9.0.0:
+  version "9.0.0"
+  resolved "https://registry.yarnpkg.com/eslint-config-prettier/-/eslint-config-prettier-9.0.0.tgz#eb25485946dd0c66cd216a46232dc05451518d1f"
+  integrity sha512-IcJsTkJae2S35pRsRAwoCE+925rJJStOdkKnLVgtE+tEpqU0EVVM7OqrwxqgptKdX29NUwC82I5pXsGFIgSevw==
+
 eslint-plugin-react@^7.22.0:
   version "7.23.2"
   resolved "https://registry.yarnpkg.com/eslint-plugin-react/-/eslint-plugin-react-7.23.2.tgz#2d2291b0f95c03728b55869f01102290e792d494"
@@ -3829,6 +3834,11 @@ prelude-ls@^1.2.1:
   resolved "https://registry.yarnpkg.com/prelude-ls/-/prelude-ls-1.2.1.tgz#debc6489d7a6e6b0e7611888cec880337d316396"
   integrity sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==
 
+prettier@3.0.3:
+  version "3.0.3"
+  resolved "https://registry.yarnpkg.com/prettier/-/prettier-3.0.3.tgz#432a51f7ba422d1469096c0fdc28e235db8f9643"
+  integrity sha512-L/4pUDMxcNa8R/EthV08Zt42WBO4h1rarVtK0K+QJG0X187OLo7l699jWw0GKuwzkPQ//jMFA/8Xm6Fh3J/DAg==
+
 pretty-format@^29.2.1:
   version "29.2.1"
   resolved "https://registry.yarnpkg.com/pretty-format/-/pretty-format-29.2.1.tgz#86e7748fe8bbc96a6a4e04fa99172630907a9611"
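With `prettier` and `eslint-config-prettier` now pinned in yarn.lock, formatting can be verified locally. A minimal sketch that shells out to the Prettier CLI from a Node script; `--check` is a standard Prettier flag, and the catch-all `.` pattern is illustrative:

import child_process from "child_process";

try {
  // --check lists files that differ from Prettier's output without rewriting them.
  const out = child_process.execSync("npx prettier --check .", {
    encoding: "utf8",
  });
  console.log(out);
} catch (error) {
  // execSync throws when Prettier exits non-zero (i.e. unformatted files were found).
  console.log(error.stdout);
}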