Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 06:23:16 +00:00)

Add Prettier to the repo, and format all the files! (#428)

This adds Prettier to the repo and sets up the pre-commit hook to auto-format as well as lint. It also updates ignore files to exclude crawls, test-crawls, scratch, and dist as needed.

Parent commit: af1e0860e4
This commit: 2a49406df7
70 changed files with 3192 additions and 2026 deletions
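Taken together, the changes below wire Prettier into both linting and the pre-commit flow: `eslint-config-prettier` (the new `"prettier"` entry in the ESLint `extends`) turns off ESLint's formatting rules, new `format`/`format:fix` scripts run Prettier in check or write mode, and the husky hook switches from `yarn lint` to `yarn lint:fix`. A minimal sketch of the resulting hook (the actual file is in the `.husky/pre-commit` hunk below):

```sh
#!/usr/bin/env sh
. "$(dirname -- "$0")/_/husky.sh"

# lint:fix is defined in package.json as "yarn format:fix && eslint . --fix",
# so the tree is auto-formatted by Prettier and then auto-fixed by ESLint
# before each commit.
yarn lint:fix
```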
@@ -5,7 +5,11 @@ module.exports = {
    node: true,
    jest: true,
  },
  extends: ["eslint:recommended", "plugin:@typescript-eslint/recommended"],
  extends: [
    "eslint:recommended",
    "plugin:@typescript-eslint/recommended",
    "prettier",
  ],
  parser: "@typescript-eslint/parser",
  plugins: ["@typescript-eslint"],
  parserOptions: {

@@ -13,10 +17,6 @@ module.exports = {
    sourceType: "module",
  },
  rules: {
    indent: ["error", 2],
    "linebreak-style": ["error", "unix"],
    quotes: ["error", "double"],
    semi: ["error", "always"],
    "no-constant-condition": ["error", { checkLoops: false }],
    "no-use-before-define": [
      "error",
53 .github/workflows/ci.yaml (vendored)
|
@ -6,7 +6,6 @@ on:
|
|||
|
||||
jobs:
|
||||
lint:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
|
@ -14,18 +13,17 @@ jobs:
|
|||
node-version: [18.x]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Use Node.js ${{ matrix.node-version }}
|
||||
uses: actions/setup-node@v3
|
||||
with:
|
||||
node-version: ${{ matrix.node-version }}
|
||||
- name: install requirements
|
||||
run: yarn install
|
||||
- name: run linter
|
||||
run: yarn lint
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
- name: Use Node.js ${{ matrix.node-version }}
|
||||
uses: actions/setup-node@v3
|
||||
with:
|
||||
node-version: ${{ matrix.node-version }}
|
||||
- name: install requirements
|
||||
run: yarn install
|
||||
- name: run linter
|
||||
run: yarn lint && yarn format
|
||||
|
||||
build:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
|
@ -33,21 +31,16 @@ jobs:
|
|||
node-version: [18.x]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Use Node.js ${{ matrix.node-version }}
|
||||
uses: actions/setup-node@v3
|
||||
with:
|
||||
node-version: ${{ matrix.node-version }}
|
||||
- name: install requirements
|
||||
run: yarn install
|
||||
- name: build js
|
||||
run: yarn run tsc
|
||||
- name: build docker
|
||||
run: docker-compose build
|
||||
- name: run jest
|
||||
run: sudo yarn test
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
- name: Use Node.js ${{ matrix.node-version }}
|
||||
uses: actions/setup-node@v3
|
||||
with:
|
||||
node-version: ${{ matrix.node-version }}
|
||||
- name: install requirements
|
||||
run: yarn install
|
||||
- name: build js
|
||||
run: yarn run tsc
|
||||
- name: build docker
|
||||
run: docker-compose build
|
||||
- name: run jest
|
||||
run: sudo yarn test
|
||||
|
|
22 .github/workflows/release.yaml (vendored)
|
@ -8,12 +8,10 @@ jobs:
|
|||
name: Build x86 and ARM Images and push to Dockerhub
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
-
|
||||
name: Check out the repo
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
-
|
||||
name: Docker image metadata
|
||||
- name: Docker image metadata
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
|
@ -21,23 +19,19 @@ jobs:
|
|||
tags: |
|
||||
type=semver,pattern={{version}}
|
||||
|
||||
-
|
||||
name: Set up QEMU
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
with:
|
||||
platforms: arm64
|
||||
|
||||
-
|
||||
name: Set up Docker Buildx
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
-
|
||||
name: Login to DockerHub
|
||||
- name: Login to DockerHub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||
-
|
||||
name: Build and push
|
||||
- name: Build and push
|
||||
id: docker_build
|
||||
uses: docker/build-push-action@v3
|
||||
with:
|
||||
|
@ -45,7 +39,5 @@ jobs:
|
|||
push: true
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
platforms: "linux/amd64,linux/arm64"
|
||||
-
|
||||
name: Image digest
|
||||
- name: Image digest
|
||||
run: echo ${{ steps.docker_build.outputs.digest }}
|
||||
|
||||
|
|
1 .gitignore (vendored)

@@ -6,3 +6,4 @@ node_modules/
crawls/
test-crawls/
.DS_Store
dist

@@ -1,4 +1,4 @@
#!/usr/bin/env sh
. "$(dirname -- "$0")/_/husky.sh"

yarn lint
yarn lint:fix

@@ -1,8 +1,8 @@
repos:
- repo: local
  hooks:
  - id: husky-run-pre-commit
    name: husky
    language: system
    entry: .husky/pre-commit
    pass_filenames: false
- repo: local
  hooks:
  - id: husky-run-pre-commit
    name: husky
    language: system
    entry: .husky/pre-commit
    pass_filenames: false

1 .prettierignore (new file)

@@ -0,0 +1 @@
dist
18 CHANGES.md
|
@ -1,11 +1,13 @@
|
|||
## CHANGES
|
||||
|
||||
v0.8.1
|
||||
|
||||
- Logging and Behavior Tweaks by @ikreymer in https://github.com/webrecorder/browsertrix-crawler/pull/229
|
||||
- Fix typos by @stavares843 in https://github.com/webrecorder/browsertrix-crawler/pull/232
|
||||
- Add crawl log to WACZ by @ikreymer in https://github.com/webrecorder/browsertrix-crawler/pull/231
|
||||
|
||||
v0.8.0
|
||||
|
||||
- Switch to Chrome/Chromium 109
|
||||
- Convert to ESM module
|
||||
- Add ad blocking via request interception (#173)
|
||||
|
@ -25,11 +27,13 @@ v0.8.0
|
|||
- update behaviors to 0.4.1, rename 'Behavior line' -> 'Behavior log' by @ikreymer in https://github.com/webrecorder/browsertrix-crawler/pull/223
|
||||
|
||||
v0.7.1
|
||||
|
||||
- Fix for warcio.js by @ikreymer in #178
|
||||
- Guard against pre-existing user/group by @edsu in #176
|
||||
- Fix incorrect combineWARCs property in README.md by @Georift in #180
|
||||
|
||||
v0.7.0
|
||||
|
||||
- Update to Chrome/Chromium 101 - (0.7.0 Beta 0) by @ikreymer in #144
|
||||
- Add --netIdleWait, bump dependencies (0.7.0-beta.2) by @ikreymer in #145
|
||||
- Update README.md by @atomotic in #147
|
||||
|
@ -41,7 +45,6 @@ v0.7.0
|
|||
- Interrupt Handling Fixes by @ikreymer in #167
|
||||
- Run in Docker as User by @edsu in #171
|
||||
|
||||
|
||||
v0.6.0
|
||||
|
||||
- Add a --waitOnDone option, which has browsertrix crawler wait when finished (for use with Browsertrix Cloud)
|
||||
|
@ -56,8 +59,8 @@ v0.6.0
|
|||
- Fixes to interrupting a single instance in a shared state crawl
|
||||
- force all cookies, including session cookies, to fixed duration in days, configurable via --cookieDays
|
||||
|
||||
|
||||
v0.5.0
|
||||
|
||||
- Scope: support for `scopeType: domain` to include all subdomains and ignoring 'www.' if specified in the seed.
|
||||
- Profiles: support loading remote profile from URL as well as local file
|
||||
- Non-HTML Pages: Load non-200 responses in browser, even if non-html, fix waiting issues with non-HTML pages (eg. PDFs)
|
||||
|
@ -75,8 +78,8 @@ v0.5.0
|
|||
- Signing: Support for optional signing of WACZ
|
||||
- Dependencies: update to latest pywb, wacz and browsertrix-behaviors packages
|
||||
|
||||
|
||||
v0.4.4
|
||||
|
||||
- Page Block Rules Fix: 'request already handled' errors by avoiding adding duplicate handlers to same page.
|
||||
- Page Block Rules Fix: await all continue/abort() calls and catch errors.
|
||||
- Page Block Rules: Don't apply to top-level page, print warning and recommend scope rules instead.
|
||||
|
@ -86,11 +89,13 @@ v0.4.4
|
|||
- README: Update old type -> scopeType, list new scope types.
|
||||
|
||||
v0.4.3
|
||||
|
||||
- BlockRules Fixes: When considering the 'inFrameUrl' for a navigation request for an iframe, use URL of parent frame.
|
||||
- BlockRules Fixes: Always allow pywb proxy scripts.
|
||||
- Logging: Improved debug logging for block rules (log blocked requests and conditional iframe requests) when 'debug' set in 'logging'
|
||||
|
||||
v0.4.2
|
||||
|
||||
- Compose/docs: Build latest image by default, update README to refer to latest image
|
||||
- Fix typo in `crawler.capturePrefix` that resulted in `directFetchCapture()` always failing
|
||||
- Tests: Update all tests to use `test-crawls` directory
|
||||
|
@ -98,6 +103,7 @@ v0.4.2
|
|||
- loadPage() accepts a list of selector options with selector, extract, and isAttribute settings for further customization of link extraction
|
||||
|
||||
v0.4.1
|
||||
|
||||
- BlockRules Optimizations: don't intercept requests if no blockRules
|
||||
- Profile Creation: Support extending existing profile by passing a --profile param to load on startup
|
||||
- Profile Creation: Set default window size to 1600x900, add --windowSize param for setting custom size
|
||||
|
@ -107,6 +113,7 @@ v0.4.1
|
|||
- CI: Build a multi-platform (amd64 and arm64) image on each release
|
||||
|
||||
v0.4.0
|
||||
|
||||
- YAML based config, specifyable via --config property or via stdin (with '--config stdin')
|
||||
- Support for different scope types ('page', 'prefix', 'host', 'any', 'none') + crawl depth at crawl level
|
||||
- Per-Seed scoping, including different scope types, or depth and include/exclude rules configurable per seed in 'seeds' list via YAML config
|
||||
|
@ -120,16 +127,17 @@ v0.4.0
|
|||
- Update to latest pywb (2.5.0b4), browsertrix-behaviors (0.2.3), py-wacz (0.3.1)
|
||||
|
||||
v0.3.2
|
||||
|
||||
- Added a `--urlFile` option: Allows users to specify a .txt file list of exact URLs to crawl (one URL per line).
|
||||
|
||||
|
||||
v0.3.1
|
||||
|
||||
- Improved shutdown wait: Instead of waiting for 5 secs, wait until all pending requests are written to WARCs
|
||||
- Bug fix: Use async APIs for combine WARC to avoid spurious issues with multiple crawls
|
||||
- Behaviors: Update to Behaviors 0.2.1, with support for Facebook pages
|
||||
|
||||
|
||||
v0.3.0
|
||||
|
||||
- WARC Combining: `--combineWARC` and `--rolloverSize` flags for generating combined WARC at end of crawl, each WARC upto specified rolloverSize
|
||||
- Profiles: Support for creating reusable browser profiles, stored as tarballs, and running crawl with a login profile (see README for more info)
|
||||
- Behaviors: Switch to Browsertrix Behaviors v0.1.1 for in-page behaviors
|
||||
|
|
36 README.md
|
@ -51,7 +51,6 @@ Browsertrix Crawler includes a number of additional command-line options, explai
|
|||
|
||||
## Crawling Configuration Options
|
||||
|
||||
|
||||
<details>
|
||||
<summary><b>The Browsertrix Crawler docker image currently accepts the following parameters:</b></summary>
|
||||
|
||||
|
@ -269,8 +268,8 @@ Options:
|
|||
ess (for debugging) [boolean]
|
||||
--config Path to YAML config file
|
||||
```
|
||||
</details>
|
||||
|
||||
</details>
|
||||
|
||||
### Waiting for Page Load
|
||||
|
||||
|
@ -282,13 +281,11 @@ See [page.goto waitUntil options](https://pptr.dev/api/puppeteer.page.goto#remar
|
|||
|
||||
The `--pageLoadTimeout`/`--timeout` option sets the timeout in seconds for page load, defaulting to 90 seconds. Behaviors will run on the page once either the page load condition or the page load timeout is met, whichever happens first.
|
||||
|
||||
|
||||
### YAML Crawl Config
|
||||
|
||||
Browsertrix Crawler supports the use of a YAML file to set parameters for a crawl. This can be used by passing a valid YAML file to the `--config` option.
|
||||
|
||||
The YAML file can contain the same parameters as the command-line arguments. If a parameter is set on the command-line and in the yaml file, the value from the command-line will be used. For example, the following should start a crawl with config in `crawl-config.yaml`.
|
||||
|
||||
```
|
||||
docker run -v $PWD/crawl-config.yaml:/app/crawl-config.yaml -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --config /app/crawl-config.yaml
|
||||
|
@ -300,7 +297,6 @@ The config can also be passed via stdin, which can simplify the command. Note th
|
|||
cat ./crawl-config.yaml | docker run -i -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --config stdin
|
||||
```
|
||||
|
||||
|
||||
An example config file (eg. crawl-config.yaml) might contain:
|
||||
|
||||
```
|
||||
|
@ -361,7 +357,6 @@ To make this configuration as simple as possible, there are several predefined s
|
|||
The scope settings for multi-page crawls (page-spa, prefix, host, domain) also include http/https versions, eg. given a prefix of `http://example.com/path/`,
|
||||
`https://example.com/path/` is also included.
|
||||
|
||||
|
||||
#### Custom Scope Inclusion Rules
|
||||
|
||||
Instead of setting a scope type, it is possible to instead configure custom scope regex by setting `--include` config to one or more regular expressions.
|
||||
|
@ -375,7 +370,6 @@ In addition to the inclusion rules, Browsertrix Crawler supports a separate list
|
|||
|
||||
The exclusion regexes are often used with a custom scope, but could be used with a predefined scopeType as well.
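As a hedged illustration (the flag names come from this README; the URL and regex values are made up), a custom include regex combined with an exclusion might look like:

```
docker run -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl \
  --url https://example.com/docs/ \
  --include "example\.com/docs/.*" \
  --exclude "example\.com/docs/search.*"
```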
|
||||
|
||||
|
||||
#### Extra 'Hops' Beyond Current Scope
|
||||
|
||||
Occasionally, it may be useful to augment the scope by allowing extra links N 'hops' beyond the current scope.
|
||||
|
@ -385,7 +379,6 @@ For example, this is most useful when crawling with a `host` or `prefix` scope,
|
|||
The `--extraHops` setting can be set globally or per seed to allow expanding the current inclusion scope N 'hops' beyond the configured scope. Note that this mechanism only expands the inclusion scope, and any exclusion rules are still applied. If a URL is to be excluded via the exclusion rules,
|
||||
that will take precedence over the `--extraHops`.
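For instance, a per-seed sketch (illustrative values only, using the per-seed form described above):

```yaml
seeds:
  - url: https://example.com/blog/
    scopeType: prefix
    extraHops: 1 # also capture pages linked one hop beyond the prefix scope; exclusions still apply
```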
|
||||
|
||||
|
||||
#### Scope Rule Examples
|
||||
|
||||
For example, the following seed will start on `https://example.com/startpage.html` and crawl all pages on the `https://example.com/` domain, except pages that match the regexes `example.com/skip.*` or `example.com/search.*`
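The YAML for that seed is cut off by the next hunk; a hedged sketch of such a seed entry, based on the description above:

```yaml
seeds:
  - url: https://example.com/startpage.html
    scopeType: host # use "domain" instead to also include subdomains
    exclude:
      - example.com/skip.*
      - example.com/search.*
```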
|
||||
|
@ -456,27 +449,24 @@ If the `--blockMessage` is also specified, a blocked URL is replaced with the sp
|
|||
|
||||
If it seems confusing which rules should be used, here is a quick way to determine:
|
||||
|
||||
- If you'd like to restrict *the pages that are being crawled*, use the crawl scope rules (defined above).
|
||||
- If you'd like to restrict _the pages that are being crawled_, use the crawl scope rules (defined above).
|
||||
|
||||
- If you'd like to restrict *parts of a page* that are being loaded, use the page resource block rules described in this section.
|
||||
- If you'd like to restrict _parts of a page_ that are being loaded, use the page resource block rules described in this section.
|
||||
|
||||
The blockRules add a filter to each URL loaded on a page and incur an extra overhead. They should only be used in advanced use cases where part of a page needs to be blocked.
|
||||
|
||||
These rules cannot be used to prevent entire pages from loading -- use the scope exclusion rules for that. (A warning will be printed if a page resource block rule matches a top-level page.)
|
||||
|
||||
|
||||
### Ad blocking
|
||||
|
||||
With version 0.8.0, Browsertrix Crawler supports blocking ads from being loaded during capture based on [Steven Black's list of known ad hosts](https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts). To enable ad blocking, use the `--blockAds` option. If `--adBlockMessage` is set, a record with the specified error message will be added in the ad's place.
|
||||
|
||||
|
||||
### Custom Warcinfo Fields
|
||||
|
||||
Custom fields can be added to the `warcinfo` WARC record, generated for each combined WARCs. The fields can be specified in the YAML config under `warcinfo` section or specifying individually via the command-line.
|
||||
|
||||
For example, the following are equivalent ways to add additional warcinfo fields:
|
||||
|
||||
|
||||
via yaml config:
|
||||
|
||||
```yaml
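# The README's own example is cut off by this diff hunk; the fields below are
# an illustrative sketch only (values are hypothetical). Anything placed under
# `warcinfo` is written into the warcinfo record of the generated WARCs.
warcinfo:
  operator: crawl-operator@example.com
  description: Example crawl with custom warcinfo fields
```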
|
||||
|
@ -622,7 +612,6 @@ docker run -e CHROME_FLAGS="--disable-extensions-except=/ext/ublock --load-exten
|
|||
|
||||
You can also directly use extensions from an existing chrome-profile by using e.g. `~/.config/chromium/Default/Extensions/cjpalhdlnbpafiamejdnhcphjbkeiagm/1.41.8_0/` as the path.
|
||||
|
||||
|
||||
## Saving Crawl State: Interrupting and Restarting the Crawl
|
||||
|
||||
With version 0.5.0, a crawl can be gracefully interrupted with Ctrl-C (SIGINT) or a SIGTERM.
|
||||
|
@ -642,13 +631,11 @@ or `never` respectively, to control when the crawl state file should be written.
|
|||
When the `--saveState` is set to always, Browsertrix Crawler will also save the state automatically during the crawl, as set by the `--saveStateInterval` setting.
|
||||
The crawler will keep the last `--saveStateHistory` save states and delete older ones. This provides an extra backup: if the crawl fails unexpectedly or is not terminated via Ctrl-C, several previous crawl states are still available.
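For example (a sketch using the options named above; the interval and history values are illustrative):

```
docker run -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl \
  --url https://example.com/ \
  --saveState always --saveStateInterval 300 --saveStateHistory 5
```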
|
||||
|
||||
|
||||
## Creating and Using Browser Profiles
|
||||
|
||||
Browsertrix Crawler also includes a way to use existing browser profiles when running a crawl. This allows pre-configuring the browser, such as by logging in
to certain sites or adjusting other settings, and running a crawl exactly with those settings. By creating a logged-in profile, the actual login credentials are not included in the crawl, only (temporary) session cookies.
|
||||
|
||||
|
||||
### Interactive Profile Creation
|
||||
|
||||
For creating profiles of more complex sites, or logging in to multiple sites at once, the interactive profile creation mode can be used.
|
||||
|
@ -719,7 +706,6 @@ The script will then prompt you for login credentials, attempt to login and crea
|
|||
|
||||
- To specify the window size for the profile creation embedded browser, specify `--windowSize WIDTH,HEIGHT`. (The default is 1600x900)
|
||||
|
||||
|
||||
The current profile creation script is still experimental and the script attempts to detect the username and password fields on a site as generically as possible, but may not work for all sites. Additional automated profile creation functionality, such as support for custom profile creation scripts, may be added in the future.
|
||||
|
||||
### Using Browser Profile with a Crawl
|
||||
|
@ -743,7 +729,6 @@ All released Docker Images are available from Docker Hub, listed by release tag
|
|||
|
||||
Details for each corresponding release tag are also available on GitHub at: https://github.com/webrecorder/browsertrix-crawler/releases
|
||||
|
||||
|
||||
## Architecture
|
||||
|
||||
The Docker container provided here packages up several components used in Browsertrix.
|
||||
|
@ -752,7 +737,6 @@ The system uses `pywb` in recording mode for capturing the content. The crawl pr
|
|||
|
||||
To access the contents of the crawl, the `/crawls` directory in the container should be mounted to a volume (default in the Docker Compose setup).
|
||||
|
||||
|
||||
### Usage with Docker Compose
|
||||
|
||||
Many examples in this README demonstrate running Browsertrix Crawler with `docker run`.
|
||||
|
@ -775,10 +759,8 @@ docker-compose run crawler crawl --url https://webrecorder.net/ --generateCDX --
|
|||
|
||||
In this example, the crawl data is written to `./crawls/collections/wr-net` by default.
|
||||
|
||||
|
||||
While the crawl is running, the status of the crawl prints the progress to the JSON log output. This can be disabled by using the `--logging` option and not including `stats`.
|
||||
|
||||
|
||||
### Multi-Platform Build / Support for Apple Silicon (M1/M2)
|
||||
|
||||
Browsertrix Crawler uses a browser image which supports amd64 and arm64.
|
||||
|
@ -787,7 +769,6 @@ This means Browsertrix Crawler can be built natively on Apple Silicon systems us
|
|||
|
||||
On an Apple Silicon system, the browser used will be Chromium instead of Chrome since there is no Linux build of Chrome for ARM, and this now is handled automatically as part of the build. Note that Chromium is different than Chrome, and for example, some video codecs may not be supported in the ARM / Chromium-based version that would be in the amd64 / Chrome version. For production crawling, it is recommended to run on an amd64 Linux environment.
|
||||
|
||||
|
||||
### Modifying Browser Image
|
||||
|
||||
It is also possible to build Browsertrix Crawler with a different browser image. Currently, browser images using Chrome/Chromium (depending on host system chip architecture) and Brave Browser are supported via [browsertrix-browser-base](https://github.com/webrecorder/browsertrix-browser-base).
|
||||
|
@ -796,7 +777,6 @@ The browser base image used is specified and can be changed at the top of the Do
|
|||
|
||||
Custom browser images can be used by forking [browsertrix-browser-base](https://github.com/webrecorder/browsertrix-browser-base), locally building or publishing an image, and then modifying the Dockerfile in this repo to build from that image.
|
||||
|
||||
|
||||
### Viewing crawled data with pywb
|
||||
|
||||
When a crawler is done, another browsertrix-crawler image can be started with a local [pywb](https://github.com/webrecorder/pywb) instance to view crawl:
|
||||
|
@ -809,17 +789,13 @@ Then, loading the `http://localhost:8080/wr-net/https://webrecorder.net/` should
|
|||
|
||||
(Previewing crawl results while a crawl is still running should also be possible soon!)
|
||||
|
||||
|
||||
Support
|
||||
-------
|
||||
## Support
|
||||
|
||||
Initial support for development of Browsertrix Crawler was provided by [Kiwix](https://kiwix.org/). The initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder.
|
||||
|
||||
Additional support for Browsertrix Crawler, including for the development of the 0.4.x version has been provided by [Portico](https://www.portico.org/).
|
||||
|
||||
|
||||
License
|
||||
-------
|
||||
## License
|
||||
|
||||
[AGPLv3](https://www.gnu.org/licenses/agpl-3.0) or later, see
|
||||
[LICENSE](LICENSE) for more details.
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
debug: true
|
||||
|
||||
|
||||
proxy:
|
||||
coll: ${COLL}
|
||||
recording: true
|
||||
|
|
|
@ -1,17 +1,16 @@
|
|||
version: '3.5'
|
||||
|
||||
version: "3.5"
|
||||
|
||||
services:
|
||||
crawler:
|
||||
image: ${REGISTRY}webrecorder/browsertrix-crawler:latest
|
||||
build:
|
||||
context: ./
|
||||
crawler:
|
||||
image: ${REGISTRY}webrecorder/browsertrix-crawler:latest
|
||||
build:
|
||||
context: ./
|
||||
|
||||
volumes:
|
||||
- ./crawls:/crawls
|
||||
volumes:
|
||||
- ./crawls:/crawls
|
||||
|
||||
cap_add:
|
||||
- NET_ADMIN
|
||||
- SYS_ADMIN
|
||||
|
||||
shm_size: 1gb
|
||||
cap_add:
|
||||
- NET_ADMIN
|
||||
- SYS_ADMIN
|
||||
|
||||
shm_size: 1gb
|
||||
|
|
|
@ -1,39 +1,45 @@
|
|||
<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
html, body, iframe {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
border: 0;
|
||||
overflow: hidden;
|
||||
font-family: sans-serif;
|
||||
}
|
||||
body {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
iframe#main {
|
||||
height: calc(100% - 36px);
|
||||
}
|
||||
div#info {
|
||||
margin: 8px;
|
||||
}
|
||||
form {
|
||||
display: inline;
|
||||
}
|
||||
button {
|
||||
font-weight: bold;
|
||||
font-size: 15px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="info">
|
||||
Log in to any site(s) that you want to be part of the crawl profile using the embedded browser below. When done, click <form action="/createProfile" method="post"><button type="submit">Create Profile</button></form>
|
||||
</div>
|
||||
<iframe id="main" src="$DEVTOOLS_SRC"></iframe>
|
||||
</body>
|
||||
<head>
|
||||
<style>
|
||||
html,
|
||||
body,
|
||||
iframe {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
border: 0;
|
||||
overflow: hidden;
|
||||
font-family: sans-serif;
|
||||
}
|
||||
body {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
iframe#main {
|
||||
height: calc(100% - 36px);
|
||||
}
|
||||
div#info {
|
||||
margin: 8px;
|
||||
}
|
||||
form {
|
||||
display: inline;
|
||||
}
|
||||
button {
|
||||
font-weight: bold;
|
||||
font-size: 15px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="info">
|
||||
Log in to any site(s) that you want to be part of the crawl profile using
|
||||
the embedded browser below. When done, click
|
||||
<form action="/createProfile" method="post">
|
||||
<button type="submit">Create Profile</button>
|
||||
</form>
|
||||
</div>
|
||||
<iframe id="main" src="$DEVTOOLS_SRC"></iframe>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -1,75 +1,79 @@
|
|||
<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<style>
|
||||
#content {
|
||||
display: flex;
|
||||
flex-direction: row;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
#content img {
|
||||
width: 640px;
|
||||
height: 480px;
|
||||
margin: 2rem;
|
||||
}
|
||||
</style>
|
||||
<script>
|
||||
const ws = new WebSocket(window.location.href.replace("http", "ws") + "ws");
|
||||
ws.addEventListener("message", (event) => handleMessage(event.data));
|
||||
<head>
|
||||
<style>
|
||||
#content {
|
||||
display: flex;
|
||||
flex-direction: row;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
#content img {
|
||||
width: 640px;
|
||||
height: 480px;
|
||||
margin: 2rem;
|
||||
}
|
||||
</style>
|
||||
<script>
|
||||
const ws = new WebSocket(
|
||||
window.location.href.replace("http", "ws") + "ws",
|
||||
);
|
||||
ws.addEventListener("message", (event) => handleMessage(event.data));
|
||||
|
||||
const unusedElems = [];
|
||||
const unusedElems = [];
|
||||
|
||||
function handleMessage(resp) {
|
||||
resp = JSON.parse(resp);
|
||||
function handleMessage(resp) {
|
||||
resp = JSON.parse(resp);
|
||||
|
||||
switch (resp.msg) {
|
||||
case "screencast":
|
||||
img = createImage(resp.id);
|
||||
if (resp.data) {
|
||||
setImageData(img, resp.data);
|
||||
switch (resp.msg) {
|
||||
case "screencast":
|
||||
img = createImage(resp.id);
|
||||
if (resp.data) {
|
||||
setImageData(img, resp.data);
|
||||
}
|
||||
break;
|
||||
|
||||
case "close":
|
||||
img = unuseImage(resp.id);
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case "close":
|
||||
img = unuseImage(resp.id);
|
||||
break;
|
||||
}
|
||||
}
|
||||
function setImageData(img, data) {
|
||||
//img.style.display = "";
|
||||
img.src = "data:image/png;base64," + data;
|
||||
}
|
||||
|
||||
function setImageData(img, data) {
|
||||
//img.style.display = "";
|
||||
img.src = "data:image/png;base64," + data;
|
||||
}
|
||||
function createImage(id) {
|
||||
let elem = document.getElementById(id);
|
||||
if (elem) {
|
||||
return elem;
|
||||
}
|
||||
|
||||
function createImage(id) {
|
||||
let elem = document.getElementById(id);
|
||||
if (elem) {
|
||||
return elem;
|
||||
}
|
||||
if (unusedElems.length) {
|
||||
elem = unusedElems.shift();
|
||||
elem.setAttribute("id", id);
|
||||
return elem;
|
||||
}
|
||||
|
||||
if (unusedElems.length) {
|
||||
elem = unusedElems.shift();
|
||||
elem.setAttribute("id", id);
|
||||
return elem;
|
||||
}
|
||||
elem = document.createElement("img");
|
||||
elem.setAttribute("id", id);
|
||||
document.getElementById("content").appendChild(elem);
|
||||
return elem;
|
||||
}
|
||||
|
||||
elem = document.createElement("img");
|
||||
elem.setAttribute("id", id);
|
||||
document.getElementById("content").appendChild(elem);
|
||||
return elem;
|
||||
}
|
||||
|
||||
function unuseImage(id) {
|
||||
const elem = document.getElementById(id);
|
||||
if (!elem) {
|
||||
return;
|
||||
}
|
||||
//elem.style.display = "none";
|
||||
unusedElems.push(elem);
|
||||
}
|
||||
</script>
|
||||
<head>
|
||||
<body>
|
||||
<div id="content">
|
||||
</div>
|
||||
</body>
|
||||
function unuseImage(id) {
|
||||
const elem = document.getElementById(id);
|
||||
if (!elem) {
|
||||
return;
|
||||
}
|
||||
//elem.style.display = "none";
|
||||
unusedElems.push(elem);
|
||||
}
|
||||
</script>
|
||||
<head>
|
||||
<body>
|
||||
<div id="content"></div>
|
||||
</body>
|
||||
</head>
|
||||
</head>
|
||||
</html>
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
<!DOCTYPE html>
|
||||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
|
||||
<head>
|
||||
<!--
|
||||
noVNC example: lightweight example using minimal UI and features
|
||||
|
||||
|
@ -16,180 +15,180 @@
|
|||
-->
|
||||
<title>noVNC</title>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<meta charset="utf-8" />
|
||||
|
||||
<style>
|
||||
body {
|
||||
margin: 0;
|
||||
background-color: dimgrey;
|
||||
height: 100%;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
html {
|
||||
height: 100%;
|
||||
}
|
||||
|
||||
body {
|
||||
margin: 0;
|
||||
background-color: dimgrey;
|
||||
height: 100%;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
html {
|
||||
height: 100%;
|
||||
}
|
||||
|
||||
#top_bar {
|
||||
display: none;
|
||||
background-color: #6e84a3;
|
||||
color: white;
|
||||
font: bold 12px Helvetica;
|
||||
padding: 6px 5px 4px 5px;
|
||||
border-bottom: 1px outset;
|
||||
}
|
||||
#status {
|
||||
text-align: center;
|
||||
}
|
||||
#sendCtrlAltDelButton {
|
||||
display: none;
|
||||
position: fixed;
|
||||
top: 0px;
|
||||
right: 0px;
|
||||
border: 1px outset;
|
||||
padding: 5px 5px 4px 5px;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
#screen {
|
||||
flex: 1; /* fill remaining space */
|
||||
overflow: hidden;
|
||||
}
|
||||
#top_bar {
|
||||
display: none;
|
||||
background-color: #6e84a3;
|
||||
color: white;
|
||||
font: bold 12px Helvetica;
|
||||
padding: 6px 5px 4px 5px;
|
||||
border-bottom: 1px outset;
|
||||
}
|
||||
#status {
|
||||
text-align: center;
|
||||
}
|
||||
#sendCtrlAltDelButton {
|
||||
display: none;
|
||||
position: fixed;
|
||||
top: 0px;
|
||||
right: 0px;
|
||||
border: 1px outset;
|
||||
padding: 5px 5px 4px 5px;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
#screen {
|
||||
flex: 1; /* fill remaining space */
|
||||
overflow: hidden;
|
||||
}
|
||||
</style>
|
||||
|
||||
<script type="module" crossorigin="anonymous">
|
||||
// RFB holds the API to connect and communicate with a VNC server
|
||||
import RFB from './core/rfb.js';
|
||||
// RFB holds the API to connect and communicate with a VNC server
|
||||
import RFB from "./core/rfb.js";
|
||||
|
||||
let rfb;
|
||||
let desktopName;
|
||||
let rfb;
|
||||
let desktopName;
|
||||
|
||||
// When this function is called we have
|
||||
// successfully connected to a server
|
||||
function connectedToServer(e) {
|
||||
status("Connected to " + desktopName);
|
||||
// When this function is called we have
|
||||
// successfully connected to a server
|
||||
function connectedToServer(e) {
|
||||
status("Connected to " + desktopName);
|
||||
}
|
||||
|
||||
// This function is called when we are disconnected
|
||||
function disconnectedFromServer(e) {
|
||||
if (e.detail.clean) {
|
||||
status("Disconnected, retrying...");
|
||||
setTimeout(connect, 2000);
|
||||
} else {
|
||||
status("Something went wrong, connection is closed");
|
||||
}
|
||||
}
|
||||
|
||||
// When this function is called, the server requires
|
||||
// credentials to authenticate
|
||||
function credentialsAreRequired(e) {
|
||||
const password = prompt("Password Required:");
|
||||
rfb.sendCredentials({ password: password });
|
||||
}
|
||||
|
||||
// When this function is called we have received
|
||||
// a desktop name from the server
|
||||
function updateDesktopName(e) {
|
||||
desktopName = e.detail.name;
|
||||
}
|
||||
|
||||
// Since most operating systems will catch Ctrl+Alt+Del
|
||||
// before they get a chance to be intercepted by the browser,
|
||||
// we provide a way to emulate this key sequence.
|
||||
function sendCtrlAltDel() {
|
||||
rfb.sendCtrlAltDel();
|
||||
return false;
|
||||
}
|
||||
|
||||
// Show a status text in the top bar
|
||||
function status(text) {
|
||||
document.getElementById("status").textContent = text;
|
||||
}
|
||||
|
||||
// This function extracts the value of one variable from the
|
||||
// query string. If the variable isn't defined in the URL
|
||||
// it returns the default value instead.
|
||||
function readQueryVariable(name, defaultValue) {
|
||||
// A URL with a query parameter can look like this (But will most probably get logged on the http server):
|
||||
// https://www.example.com?myqueryparam=myvalue
|
||||
//
|
||||
// For privacy (Using a hastag #, the parameters will not be sent to the server)
|
||||
// the url can be requested in the following way:
|
||||
// https://www.example.com#myqueryparam=myvalue&password=secreatvalue
|
||||
//
|
||||
// Even Mixing public and non public parameters will work:
|
||||
// https://www.example.com?nonsecretparam=example.com#password=secreatvalue
|
||||
//
|
||||
// Note that we use location.href instead of location.search
|
||||
// because Firefox < 53 has a bug w.r.t location.search
|
||||
const re = new RegExp(".*[?&]" + name + "=([^&#]*)"),
|
||||
match = ""
|
||||
.concat(document.location.href, window.location.hash)
|
||||
.match(re);
|
||||
|
||||
if (match) {
|
||||
// We have to decode the URL since want the cleartext value
|
||||
return decodeURIComponent(match[1]);
|
||||
}
|
||||
|
||||
// This function is called when we are disconnected
|
||||
function disconnectedFromServer(e) {
|
||||
if (e.detail.clean) {
|
||||
status("Disconnected, retrying...");
|
||||
setTimeout(connect, 2000);
|
||||
} else {
|
||||
status("Something went wrong, connection is closed");
|
||||
}
|
||||
return defaultValue;
|
||||
}
|
||||
|
||||
document.getElementById("sendCtrlAltDelButton").onclick = sendCtrlAltDel;
|
||||
|
||||
// Read parameters specified in the URL query string
|
||||
// By default, use the host and port of server that served this file
|
||||
const host = readQueryVariable("host", window.location.hostname);
|
||||
let port = readQueryVariable("port", window.location.port);
|
||||
const password = readQueryVariable("password");
|
||||
const path = readQueryVariable("path", "websockify");
|
||||
|
||||
// | | | | | |
|
||||
// | | | Connect | | |
|
||||
// v v v v v v
|
||||
function connect() {
|
||||
status("Connecting");
|
||||
|
||||
// Build the websocket URL used to connect
|
||||
let url;
|
||||
if (window.location.protocol === "https:") {
|
||||
url = "wss";
|
||||
} else {
|
||||
url = "ws";
|
||||
}
|
||||
|
||||
// When this function is called, the server requires
|
||||
// credentials to authenticate
|
||||
function credentialsAreRequired(e) {
|
||||
const password = prompt("Password Required:");
|
||||
rfb.sendCredentials({ password: password });
|
||||
url += "://" + host;
|
||||
if (port) {
|
||||
url += ":" + port;
|
||||
}
|
||||
url += "/" + path;
|
||||
|
||||
// When this function is called we have received
|
||||
// a desktop name from the server
|
||||
function updateDesktopName(e) {
|
||||
desktopName = e.detail.name;
|
||||
}
|
||||
// Creating a new RFB object will start a new connection
|
||||
rfb = new RFB(document.getElementById("screen"), url, {
|
||||
credentials: { password: password },
|
||||
});
|
||||
|
||||
// Since most operating systems will catch Ctrl+Alt+Del
|
||||
// before they get a chance to be intercepted by the browser,
|
||||
// we provide a way to emulate this key sequence.
|
||||
function sendCtrlAltDel() {
|
||||
rfb.sendCtrlAltDel();
|
||||
return false;
|
||||
}
|
||||
// Add listeners to important events from the RFB module
|
||||
rfb.addEventListener("connect", connectedToServer);
|
||||
rfb.addEventListener("disconnect", disconnectedFromServer);
|
||||
rfb.addEventListener("credentialsrequired", credentialsAreRequired);
|
||||
rfb.addEventListener("desktopname", updateDesktopName);
|
||||
|
||||
// Show a status text in the top bar
|
||||
function status(text) {
|
||||
document.getElementById('status').textContent = text;
|
||||
}
|
||||
// Set parameters that can be changed on an active connection
|
||||
rfb.viewOnly = readQueryVariable("view_only", false);
|
||||
rfb.scaleViewport = readQueryVariable("scale", false);
|
||||
}
|
||||
|
||||
// This function extracts the value of one variable from the
|
||||
// query string. If the variable isn't defined in the URL
|
||||
// it returns the default value instead.
|
||||
function readQueryVariable(name, defaultValue) {
|
||||
// A URL with a query parameter can look like this (But will most probably get logged on the http server):
|
||||
// https://www.example.com?myqueryparam=myvalue
|
||||
//
|
||||
// For privacy (Using a hastag #, the parameters will not be sent to the server)
|
||||
// the url can be requested in the following way:
|
||||
// https://www.example.com#myqueryparam=myvalue&password=secreatvalue
|
||||
//
|
||||
// Even Mixing public and non public parameters will work:
|
||||
// https://www.example.com?nonsecretparam=example.com#password=secreatvalue
|
||||
//
|
||||
// Note that we use location.href instead of location.search
|
||||
// because Firefox < 53 has a bug w.r.t location.search
|
||||
const re = new RegExp('.*[?&]' + name + '=([^&#]*)'),
|
||||
match = ''.concat(document.location.href, window.location.hash).match(re);
|
||||
|
||||
if (match) {
|
||||
// We have to decode the URL since want the cleartext value
|
||||
return decodeURIComponent(match[1]);
|
||||
}
|
||||
|
||||
return defaultValue;
|
||||
}
|
||||
|
||||
document.getElementById('sendCtrlAltDelButton')
|
||||
.onclick = sendCtrlAltDel;
|
||||
|
||||
// Read parameters specified in the URL query string
|
||||
// By default, use the host and port of server that served this file
|
||||
const host = readQueryVariable('host', window.location.hostname);
|
||||
let port = readQueryVariable('port', window.location.port);
|
||||
const password = readQueryVariable('password');
|
||||
const path = readQueryVariable('path', 'websockify');
|
||||
|
||||
// | | | | | |
|
||||
// | | | Connect | | |
|
||||
// v v v v v v
|
||||
function connect() {
|
||||
status("Connecting");
|
||||
|
||||
// Build the websocket URL used to connect
|
||||
let url;
|
||||
if (window.location.protocol === "https:") {
|
||||
url = 'wss';
|
||||
} else {
|
||||
url = 'ws';
|
||||
}
|
||||
url += '://' + host;
|
||||
if(port) {
|
||||
url += ':' + port;
|
||||
}
|
||||
url += '/' + path;
|
||||
|
||||
// Creating a new RFB object will start a new connection
|
||||
rfb = new RFB(document.getElementById('screen'), url,
|
||||
{ credentials: { password: password } });
|
||||
|
||||
// Add listeners to important events from the RFB module
|
||||
rfb.addEventListener("connect", connectedToServer);
|
||||
rfb.addEventListener("disconnect", disconnectedFromServer);
|
||||
rfb.addEventListener("credentialsrequired", credentialsAreRequired);
|
||||
rfb.addEventListener("desktopname", updateDesktopName);
|
||||
|
||||
// Set parameters that can be changed on an active connection
|
||||
rfb.viewOnly = readQueryVariable('view_only', false);
|
||||
rfb.scaleViewport = readQueryVariable('scale', false);
|
||||
}
|
||||
|
||||
connect();
|
||||
connect();
|
||||
</script>
|
||||
</head>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<body>
|
||||
<div id="top_bar">
|
||||
<div id="status">Loading</div>
|
||||
<div id="sendCtrlAltDelButton">Send CtrlAltDel</div>
|
||||
<div id="status">Loading</div>
|
||||
<div id="sendCtrlAltDelButton">Send CtrlAltDel</div>
|
||||
</div>
|
||||
<div id="screen">
|
||||
<!-- This is where the remote screen will appear -->
|
||||
<!-- This is where the remote screen will appear -->
|
||||
</div>
|
||||
</body>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@@ -8,7 +8,10 @@
  "license": "AGPL-3.0-or-later",
  "scripts": {
    "tsc": "tsc",
    "lint": "eslint *.js tests/*.test.js",
    "format": "prettier . --check",
    "format:fix": "prettier . --write",
    "lint": "eslint .",
    "lint:fix": "yarn format:fix && eslint . --fix",
    "test": "yarn node --experimental-vm-modules $(yarn bin jest --bail 1)",
    "prepare": "husky install"
  },

@@ -40,9 +43,11 @@
  "@typescript-eslint/eslint-plugin": "^6.10.0",
  "@typescript-eslint/parser": "^6.10.0",
  "eslint": "^8.53.0",
  "eslint-config-prettier": "^9.0.0",
  "eslint-plugin-react": "^7.22.0",
  "jest": "^29.2.1",
  "md5": "^2.3.0",
  "prettier": "3.0.3",
  "typescript": "^5.2.2"
  },
  "jest": {
324 src/crawler.ts
|
@ -4,7 +4,13 @@ import fs, { WriteStream } from "fs";
|
|||
import os from "os";
|
||||
import fsp, { FileHandle } from "fs/promises";
|
||||
|
||||
import { RedisCrawlState, LoadState, QueueState, PageState, WorkerId } from "./util/state.js";
|
||||
import {
|
||||
RedisCrawlState,
|
||||
LoadState,
|
||||
QueueState,
|
||||
PageState,
|
||||
WorkerId,
|
||||
} from "./util/state.js";
|
||||
|
||||
import Sitemapper from "sitemapper";
|
||||
import yaml from "js-yaml";
|
||||
|
@ -13,7 +19,14 @@ import * as warcio from "warcio";
|
|||
|
||||
import { HealthChecker } from "./util/healthcheck.js";
|
||||
import { TextExtractViaSnapshot } from "./util/textextract.js";
|
||||
import { initStorage, getFileSize, getDirSize, interpolateFilename, checkDiskUtilization, S3StorageSync } from "./util/storage.js";
|
||||
import {
|
||||
initStorage,
|
||||
getFileSize,
|
||||
getDirSize,
|
||||
interpolateFilename,
|
||||
checkDiskUtilization,
|
||||
S3StorageSync,
|
||||
} from "./util/storage.js";
|
||||
import { ScreenCaster, WSTransport } from "./util/screencaster.js";
|
||||
import { Screenshots } from "./util/screenshots.js";
|
||||
import { parseArgs } from "./util/argParser.js";
|
||||
|
@ -25,7 +38,12 @@ import { collectAllFileSources } from "./util/file_reader.js";
|
|||
|
||||
import { Browser } from "./util/browser.js";
|
||||
|
||||
import { ADD_LINK_FUNC, BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } from "./util/constants.js";
|
||||
import {
|
||||
ADD_LINK_FUNC,
|
||||
BEHAVIOR_LOG_FUNC,
|
||||
HTML_TYPES,
|
||||
DEFAULT_SELECTORS,
|
||||
} from "./util/constants.js";
|
||||
|
||||
import { AdBlockRules, BlockRules } from "./util/blockrules.js";
|
||||
import { OriginOverride } from "./util/originoverride.js";
|
||||
|
@ -41,12 +59,23 @@ const HTTPS_AGENT = new HTTPSAgent({
|
|||
|
||||
const HTTP_AGENT = new HTTPAgent();
|
||||
|
||||
const behaviors = fs.readFileSync(new URL("../node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});
|
||||
const behaviors = fs.readFileSync(
|
||||
new URL(
|
||||
"../node_modules/browsertrix-behaviors/dist/behaviors.js",
|
||||
import.meta.url,
|
||||
),
|
||||
{ encoding: "utf8" },
|
||||
);
|
||||
|
||||
const FETCH_TIMEOUT_SECS = 30;
|
||||
const PAGE_OP_TIMEOUT_SECS = 5;
|
||||
|
||||
const POST_CRAWL_STATES = ["generate-wacz", "uploading-wacz", "generate-cdx", "generate-warc"];
|
||||
const POST_CRAWL_STATES = [
|
||||
"generate-wacz",
|
||||
"uploading-wacz",
|
||||
"generate-cdx",
|
||||
"generate-warc",
|
||||
];
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
type LogDetails = Record<string, any>;
|
||||
|
@ -62,7 +91,6 @@ type PageEntry = {
|
|||
favIconUrl?: string;
|
||||
};
|
||||
|
||||
|
||||
// ============================================================================
|
||||
export class Crawler {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
|
@ -128,8 +156,12 @@ export class Crawler {
|
|||
maxHeapUsed = 0;
|
||||
maxHeapTotal = 0;
|
||||
|
||||
// eslint-disable-next-line no-use-before-define
|
||||
driver!: (opts: { page: Page; data: PageState; crawler: Crawler }) => NonNullable<unknown>;
|
||||
driver!: (opts: {
|
||||
page: Page;
|
||||
data: PageState;
|
||||
// eslint-disable-next-line no-use-before-define
|
||||
crawler: Crawler;
|
||||
}) => NonNullable<unknown>;
|
||||
|
||||
constructor() {
|
||||
const res = parseArgs();
|
||||
|
@ -140,12 +172,12 @@ export class Crawler {
|
|||
this.collDir = path.join(
|
||||
this.params.cwd,
|
||||
"collections",
|
||||
this.params.collection
|
||||
this.params.collection,
|
||||
);
|
||||
this.logDir = path.join(this.collDir, "logs");
|
||||
this.logFilename = path.join(
|
||||
this.logDir,
|
||||
`crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`
|
||||
`crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`,
|
||||
);
|
||||
|
||||
const debugLogging = this.params.logging.includes("debug");
|
||||
|
@ -252,7 +284,7 @@ export class Crawler {
|
|||
|
||||
if (!redisUrl.startsWith("redis://")) {
|
||||
logger.fatal(
|
||||
"stateStoreUrl must start with redis:// -- Only redis-based store currently supported"
|
||||
"stateStoreUrl must start with redis:// -- Only redis-based store currently supported",
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -272,7 +304,7 @@ export class Crawler {
|
|||
logger.debug(
|
||||
`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`,
|
||||
{},
|
||||
"state"
|
||||
"state",
|
||||
);
|
||||
|
||||
logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state");
|
||||
|
@ -281,7 +313,7 @@ export class Crawler {
|
|||
redis,
|
||||
this.params.crawlId,
|
||||
this.maxPageTime,
|
||||
os.hostname()
|
||||
os.hostname(),
|
||||
);
|
||||
|
||||
// clear any pending URLs from this instance
|
||||
|
@ -291,7 +323,7 @@ export class Crawler {
|
|||
logger.debug(
|
||||
`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`,
|
||||
{},
|
||||
"state"
|
||||
"state",
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -311,7 +343,7 @@ export class Crawler {
|
|||
logger.debug(
|
||||
`Screencast server started on: ${this.params.screencastPort}`,
|
||||
{},
|
||||
"screencast"
|
||||
"screencast",
|
||||
);
|
||||
}
|
||||
// } else if (this.params.redisStoreUrl && this.params.screencastRedis) {
|
||||
|
@ -375,7 +407,7 @@ export class Crawler {
|
|||
logger.debug(`Clearing ${this.collDir} before starting`);
|
||||
try {
|
||||
fs.rmSync(this.collDir, { recursive: true, force: true });
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
logger.error(`Unable to clear ${this.collDir}`, e);
|
||||
}
|
||||
|
@ -383,7 +415,7 @@ export class Crawler {
|
|||
|
||||
if (this.params.customBehaviors) {
|
||||
this.customBehaviors = this.loadCustomBehaviors(
|
||||
this.params.customBehaviors
|
||||
this.params.customBehaviors,
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -445,7 +477,7 @@ export class Crawler {
|
|||
exitCode = 11;
|
||||
}
|
||||
}
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
logger.error("Crawl failed", e);
|
||||
exitCode = 9;
|
||||
|
@ -461,7 +493,7 @@ export class Crawler {
|
|||
_behaviorLog(
|
||||
{ data, type }: { data: string; type: string },
|
||||
pageUrl: string,
|
||||
workerid: WorkerId
|
||||
workerid: WorkerId,
|
||||
) {
|
||||
let behaviorLine;
|
||||
let message;
|
||||
|
@ -481,21 +513,21 @@ export class Crawler {
|
|||
}
|
||||
|
||||
switch (type) {
|
||||
case "info":
|
||||
behaviorLine = JSON.stringify(data);
|
||||
if (behaviorLine !== this.behaviorLastLine) {
|
||||
logger.info(message, details, "behaviorScript");
|
||||
this.behaviorLastLine = behaviorLine;
|
||||
}
|
||||
break;
|
||||
case "info":
|
||||
behaviorLine = JSON.stringify(data);
|
||||
if (behaviorLine !== this.behaviorLastLine) {
|
||||
logger.info(message, details, "behaviorScript");
|
||||
this.behaviorLastLine = behaviorLine;
|
||||
}
|
||||
break;
|
||||
|
||||
case "error":
|
||||
logger.error(message, details, "behaviorScript");
|
||||
break;
|
||||
case "error":
|
||||
logger.error(message, details, "behaviorScript");
|
||||
break;
|
||||
|
||||
case "debug":
|
||||
default:
|
||||
logger.debug(message, details, "behaviorScript");
|
||||
case "debug":
|
||||
default:
|
||||
logger.debug(message, details, "behaviorScript");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -506,7 +538,7 @@ export class Crawler {
|
|||
depth,
|
||||
extraHops,
|
||||
}: { seedId: number; url: string; depth: number; extraHops: number },
|
||||
logDetails = {}
|
||||
logDetails = {},
|
||||
) {
|
||||
const seed = this.params.scopedSeeds[seedId];
|
||||
|
||||
|
@ -553,7 +585,7 @@ export class Crawler {
|
|||
logger.warn(
|
||||
msg.text(),
|
||||
{ location: msg.location(), page: page.url(), workerid },
|
||||
"jsError"
|
||||
"jsError",
|
||||
);
|
||||
}
|
||||
});
|
||||
|
@ -562,7 +594,7 @@ export class Crawler {
|
|||
logger.warn(
|
||||
"Page Error",
|
||||
{ ...errJSON(e), page: page.url(), workerid },
|
||||
"jsError"
|
||||
"jsError",
|
||||
);
|
||||
});
|
||||
}
|
||||
|
@ -574,14 +606,14 @@ export class Crawler {
|
|||
|
||||
await page.exposeFunction(
|
||||
ADD_LINK_FUNC,
|
||||
(url: string) => callbacks.addLink && callbacks.addLink(url)
|
||||
(url: string) => callbacks.addLink && callbacks.addLink(url),
|
||||
);
|
||||
|
||||
if (this.params.behaviorOpts) {
|
||||
await page.exposeFunction(
|
||||
BEHAVIOR_LOG_FUNC,
|
||||
(logdata: { data: string; type: string }) =>
|
||||
this._behaviorLog(logdata, page.url(), workerid)
|
||||
this._behaviorLog(logdata, page.url(), workerid),
|
||||
);
|
||||
await this.browser.addInitScript(page, behaviors);
|
||||
|
||||
|
@ -622,7 +654,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}
|
||||
logger.warn(
|
||||
"Failed to fetch favicon from browser /json endpoint",
|
||||
logDetails
|
||||
logDetails,
|
||||
);
|
||||
return "";
|
||||
}
|
||||
|
@ -645,7 +677,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
"HEAD request to determine if URL is HTML page timed out",
|
||||
logDetails,
|
||||
"fetch",
|
||||
true
|
||||
true,
|
||||
);
|
||||
|
||||
if (!data.isHTMLPage && directFetchCapture) {
|
||||
|
@ -656,7 +688,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
"Direct fetch capture attempt timed out",
|
||||
logDetails,
|
||||
"fetch",
|
||||
true
|
||||
true,
|
||||
);
|
||||
if (fetched) {
|
||||
data.loadState = LoadState.FULL_PAGE_LOADED;
|
||||
|
@ -666,7 +698,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.info(
|
||||
"Direct fetch successful",
|
||||
{ url, ...logDetails },
|
||||
"fetch"
|
||||
"fetch",
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
@ -714,7 +746,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const { changed, text } = await textextract.extractAndStoreText(
|
||||
"text",
|
||||
false,
|
||||
this.params.text.includes("to-warc")
|
||||
this.params.text.includes("to-warc"),
|
||||
);
|
||||
|
||||
if (changed && text && this.params.text.includes("to-pages")) {
|
||||
|
@ -729,7 +761,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.debug(
|
||||
"Skipping behaviors for non-HTML page",
|
||||
logDetails,
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
} else if (data.skipBehaviors) {
|
||||
logger.info("Skipping behaviors for slow page", logDetails, "behavior");
|
||||
|
@ -739,7 +771,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
this.params.behaviorTimeout,
|
||||
"Behaviors timed out",
|
||||
logDetails,
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
|
||||
await this.netIdle(page, logDetails);
|
||||
|
@ -757,7 +789,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
if (this.params.pageExtraDelay) {
|
||||
logger.info(
|
||||
`Waiting ${this.params.pageExtraDelay} seconds before moving on to next page`,
|
||||
logDetails
|
||||
logDetails,
|
||||
);
|
||||
await sleep(this.params.pageExtraDelay);
|
||||
}
|
||||
|
@ -784,7 +816,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.warn(
|
||||
"Page Load Failed",
|
||||
{ loadState, ...logDetails },
|
||||
"pageStatus"
|
||||
"pageStatus",
|
||||
);
|
||||
|
||||
await this.crawlState.markFailed(data.url);
|
||||
|
@ -816,7 +848,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
page: Page,
|
||||
cdp: CDPSession,
|
||||
frames: Frame[],
|
||||
logDetails: LogDetails
|
||||
logDetails: LogDetails,
|
||||
) {
|
||||
try {
|
||||
frames = frames || page.frames();
|
||||
|
@ -828,7 +860,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
frameUrls: frames.map((frame) => frame.url()),
|
||||
...logDetails,
|
||||
},
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
|
||||
const results = await Promise.allSettled(
|
||||
|
@ -844,9 +876,9 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
self.__bx_behaviors.run();
|
||||
}`,
|
||||
logDetails,
|
||||
"behavior"
|
||||
)
|
||||
)
|
||||
"behavior",
|
||||
),
|
||||
),
|
||||
);
|
||||
|
||||
for (const res of results) {
|
||||
|
@ -855,7 +887,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.warn(
|
||||
"Behavior run partially failed",
|
||||
{ reason, ...logDetails },
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -863,14 +895,14 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.info(
|
||||
"Behaviors finished",
|
||||
{ finished: results.length, ...logDetails },
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
return true;
|
||||
} catch (e) {
|
||||
logger.warn(
|
||||
"Behavior run failed",
|
||||
{ ...errJSON(e), ...logDetails },
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
@ -886,14 +918,14 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
// this is all designed to detect and skip PDFs, and other frames that are actually EMBEDs
|
||||
// if there's no tag or an iframe tag, then assume its a regular frame
|
||||
const tagName = await frame.evaluate(
|
||||
"self && self.frameElement && self.frameElement.tagName"
|
||||
"self && self.frameElement && self.frameElement.tagName",
|
||||
);
|
||||
|
||||
if (tagName && tagName !== "IFRAME" && tagName !== "FRAME") {
|
||||
logger.debug(
|
||||
"Skipping processing non-frame object",
|
||||
{ tagName, frameUrl, ...logDetails },
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
return null;
|
||||
}
|
||||
|
@ -910,7 +942,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.debug(
|
||||
"Skipping processing frame",
|
||||
{ frameUrl, ...logDetails },
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -921,13 +953,13 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const packageFileJSON = JSON.parse(
|
||||
await fsp.readFile(new URL("../package.json", import.meta.url), {
|
||||
encoding: "utf-8",
|
||||
})
|
||||
}),
|
||||
);
|
||||
const warcioPackageJSON = JSON.parse(
|
||||
await fsp.readFile(
|
||||
new URL("../node_modules/warcio/package.json", import.meta.url),
|
||||
{ encoding: "utf-8" }
|
||||
)
|
||||
{ encoding: "utf-8" },
|
||||
),
|
||||
);
|
||||
|
||||
return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version})`;
|
||||
|
@ -945,7 +977,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const warcInfo = { ...info, ...this.params.warcInfo };
|
||||
const record = await warcio.WARCRecord.createWARCInfo(
|
||||
{ filename, type, warcVersion },
|
||||
warcInfo
|
||||
warcInfo,
|
||||
);
|
||||
const buffer = await warcio.WARCSerializer.serialize(record, {
|
||||
gzip: true,
|
||||
|
@ -964,7 +996,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
if (this.params.sizeLimit) {
|
||||
if (size >= this.params.sizeLimit) {
|
||||
logger.info(
|
||||
`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`
|
||||
`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`,
|
||||
);
|
||||
interrupt = true;
|
||||
}
|
||||
|
@ -974,7 +1006,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const elapsed = secondsElapsed(this.startTime);
|
||||
if (elapsed >= this.params.timeLimit) {
|
||||
logger.info(
|
||||
`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`
|
||||
`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`,
|
||||
);
|
||||
interrupt = true;
|
||||
}
|
||||
|
@ -992,7 +1024,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const numFailed = this.crawlState.numFailed();
|
||||
if (numFailed >= this.params.failOnFailedLimit) {
|
||||
logger.fatal(
|
||||
`Failed threshold reached ${numFailed} >= ${this.params.failedLimit}, failing crawl`
|
||||
`Failed threshold reached ${numFailed} >= ${this.params.failedLimit}, failing crawl`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -1060,14 +1092,14 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
if (this.params.healthCheckPort) {
|
||||
this.healthChecker = new HealthChecker(
|
||||
this.params.healthCheckPort,
|
||||
this.params.workers
|
||||
this.params.workers,
|
||||
);
|
||||
}
|
||||
|
||||
try {
|
||||
const driverUrl = new URL(this.params.driver, import.meta.url);
|
||||
this.driver = (await import(driverUrl.href)).default;
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
logger.warn(`Error importing driver ${this.params.driver}`, e);
|
||||
return;
|
||||
|
@ -1125,7 +1157,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
await this.crawlState.load(
|
||||
this.params.state,
|
||||
this.params.scopedSeeds,
|
||||
true
|
||||
true,
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -1133,14 +1165,14 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
this.adBlockRules = new AdBlockRules(
|
||||
this.captureBasePrefix,
|
||||
this.params.adBlockMessage
|
||||
this.params.adBlockMessage,
|
||||
);
|
||||
|
||||
if (this.params.blockRules && this.params.blockRules.length) {
|
||||
this.blockRules = new BlockRules(
|
||||
this.params.blockRules,
|
||||
this.captureBasePrefix,
|
||||
this.params.blockMessage
|
||||
this.params.blockMessage,
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -1178,10 +1210,10 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.error(
|
||||
"Browser disconnected (crashed?), interrupting crawl",
|
||||
err,
|
||||
"browser"
|
||||
"browser",
|
||||
);
|
||||
},
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} as any);
|
||||
|
||||
// --------------
|
||||
|
@ -1220,7 +1252,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
const warcList = await fsp.readdir(path.join(this.collDir, "archive"));
|
||||
const warcListFull = warcList.map((filename) =>
|
||||
path.join(this.collDir, "archive", filename)
|
||||
path.join(this.collDir, "archive", filename),
|
||||
);
|
||||
|
||||
//const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
|
||||
|
@ -1230,7 +1262,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
...warcListFull,
|
||||
];
|
||||
const indexResult = await this.awaitProcess(
|
||||
child_process.spawn("cdxj-indexer", params, { cwd: this.params.cwd })
|
||||
child_process.spawn("cdxj-indexer", params, { cwd: this.params.cwd }),
|
||||
);
|
||||
if (indexResult === 0) {
|
||||
logger.debug("Indexing complete, CDX successfully created");
|
||||
|
@ -1251,11 +1283,11 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
if (uploaded && this.uploadAndDeleteLocal) {
|
||||
logger.info(
|
||||
`Uploaded WACZ, deleting local data to free up space: ${this.collDir}`
|
||||
`Uploaded WACZ, deleting local data to free up space: ${this.collDir}`,
|
||||
);
|
||||
try {
|
||||
fs.rmSync(this.collDir, { recursive: true, force: true });
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
logger.warn(`Unable to clear ${this.collDir} before exit`, e);
|
||||
}
|
||||
|
@ -1352,13 +1384,11 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
createArgs.push("-f");
|
||||
|
||||
warcFileList.forEach((val) =>
|
||||
createArgs.push(path.join(archiveDir, val))
|
||||
);
|
||||
warcFileList.forEach((val) => createArgs.push(path.join(archiveDir, val)));
|
||||
|
||||
// create WACZ
|
||||
const waczResult = await this.awaitProcess(
|
||||
child_process.spawn("wacz", createArgs)
|
||||
child_process.spawn("wacz", createArgs),
|
||||
);
|
||||
|
||||
if (waczResult !== 0) {
|
||||
|
@ -1430,7 +1460,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
maxHeapTotal: this.maxHeapTotal,
|
||||
...memUsage,
|
||||
},
|
||||
"memory"
|
||||
"memory",
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -1461,9 +1491,9 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
try {
|
||||
await fsp.writeFile(
|
||||
this.params.statsFilename,
|
||||
JSON.stringify(stats, null, 2)
|
||||
JSON.stringify(stats, null, 2),
|
||||
);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (err: any) {
|
||||
logger.warn("Stats output failed", err);
|
||||
}
|
||||
|
@ -1473,7 +1503,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
async loadPage(
|
||||
page: Page,
|
||||
data: PageState,
|
||||
selectorOptsList = DEFAULT_SELECTORS
|
||||
selectorOptsList = DEFAULT_SELECTORS,
|
||||
) {
|
||||
const { url, seedId, depth } = data;
|
||||
|
||||
|
@ -1533,7 +1563,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const contentType = resp.headers()["content-type"];
|
||||
|
||||
isHTMLPage = this.isHTMLContentType(contentType);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
const msg = e.message || "";
|
||||
if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
|
||||
|
@ -1575,7 +1605,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const frames = await page.frames();
|
||||
|
||||
const filteredFrames = await Promise.allSettled(
|
||||
frames.map((frame) => this.shouldIncludeFrame(frame, logDetails))
|
||||
frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)),
|
||||
);
|
||||
|
||||
data.filteredFrames = filteredFrames
|
||||
|
@ -1640,7 +1670,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
page: Page,
|
||||
data: PageState,
|
||||
selectors = DEFAULT_SELECTORS,
|
||||
logDetails: LogDetails
|
||||
logDetails: LogDetails,
|
||||
) {
|
||||
const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;
|
||||
|
||||
|
@ -1651,7 +1681,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
links.push(url);
|
||||
if (links.length == 500) {
|
||||
promiseList.push(
|
||||
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails)
|
||||
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails),
|
||||
);
|
||||
links = [];
|
||||
}
|
||||
|
@ -1676,7 +1706,9 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
document.querySelectorAll(selector).forEach(getter);
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const func = (window as any)[addLinkFunc] as (url: string) => NonNullable<unknown>;
|
||||
const func = (window as any)[addLinkFunc] as (
|
||||
url: string,
|
||||
) => NonNullable<unknown>;
|
||||
urls.forEach((url) => func.call(this, url));
|
||||
|
||||
return true;
|
||||
|
@ -1701,9 +1733,9 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}),
|
||||
PAGE_OP_TIMEOUT_SECS,
|
||||
"Link extraction timed out",
|
||||
logDetails
|
||||
)
|
||||
)
|
||||
logDetails,
|
||||
),
|
||||
),
|
||||
);
|
||||
|
||||
for (let i = 0; i < promiseResults.length; i++) {
|
||||
|
@ -1718,14 +1750,14 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}
|
||||
}
|
||||
}
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
logger.warn("Link Extraction failed", e);
|
||||
}
|
||||
|
||||
if (links.length) {
|
||||
promiseList.push(
|
||||
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails)
|
||||
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails),
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -1737,7 +1769,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
urls: string[],
|
||||
depth: number,
|
||||
extraHops = 0,
|
||||
logDetails: LogDetails = {}
|
||||
logDetails: LogDetails = {},
|
||||
) {
|
||||
try {
|
||||
depth += 1;
|
||||
|
@ -1748,7 +1780,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
for (const possibleUrl of urls) {
|
||||
const res = this.isInScope(
|
||||
{ url: possibleUrl, extraHops: newExtraHops, depth, seedId },
|
||||
logDetails
|
||||
logDetails,
|
||||
);
|
||||
|
||||
if (!res) {
|
||||
|
@ -1763,11 +1795,11 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
url,
|
||||
depth,
|
||||
isOOS ? newExtraHops : extraHops,
|
||||
logDetails
|
||||
logDetails,
|
||||
);
|
||||
}
|
||||
}
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
logger.error("Queuing Error", e);
|
||||
}
|
||||
|
@ -1784,12 +1816,12 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
"Cloudflare check timed out",
|
||||
logDetails,
|
||||
"general",
|
||||
true
|
||||
true,
|
||||
)
|
||||
) {
|
||||
logger.debug(
|
||||
"Cloudflare Check Detected, waiting for reload...",
|
||||
logDetails
|
||||
logDetails,
|
||||
);
|
||||
await sleep(5.5);
|
||||
}
|
||||
|
@ -1803,7 +1835,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
url: string,
|
||||
depth: number,
|
||||
extraHops: number,
|
||||
logDetails: LogDetails = {}
|
||||
logDetails: LogDetails = {},
|
||||
) {
|
||||
if (this.limitHit) {
|
||||
return false;
|
||||
|
@ -1811,30 +1843,30 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
const result = await this.crawlState.addToQueue(
|
||||
{ url, seedId, depth, extraHops },
|
||||
this.pageLimit
|
||||
this.pageLimit,
|
||||
);
|
||||
|
||||
switch (result) {
|
||||
case QueueState.ADDED:
|
||||
logger.debug("Queued new page url", { url, ...logDetails }, "links");
|
||||
return true;
|
||||
case QueueState.ADDED:
|
||||
logger.debug("Queued new page url", { url, ...logDetails }, "links");
|
||||
return true;
|
||||
|
||||
case QueueState.LIMIT_HIT:
|
||||
logger.debug(
|
||||
"Not queued page url, at page limit",
|
||||
{ url, ...logDetails },
|
||||
"links"
|
||||
);
|
||||
this.limitHit = true;
|
||||
return false;
|
||||
case QueueState.LIMIT_HIT:
|
||||
logger.debug(
|
||||
"Not queued page url, at page limit",
|
||||
{ url, ...logDetails },
|
||||
"links",
|
||||
);
|
||||
this.limitHit = true;
|
||||
return false;
|
||||
|
||||
case QueueState.DUPE_URL:
|
||||
logger.debug(
|
||||
"Not queued page url, already seen",
|
||||
{ url, ...logDetails },
|
||||
"links"
|
||||
);
|
||||
return false;
|
||||
case QueueState.DUPE_URL:
|
||||
logger.debug(
|
||||
"Not queued page url, already seen",
|
||||
{ url, ...logDetails },
|
||||
"links",
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
||||
return false;
|
||||
|
@ -1867,7 +1899,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const header_formatted = JSON.stringify(header).concat("\n");
|
||||
await this.pagesFH.writeFile(header_formatted);
|
||||
}
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (err: any) {
|
||||
logger.error("pages/pages.jsonl creation failed", err);
|
||||
}
|
||||
|
@ -1904,7 +1936,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const processedRow = JSON.stringify(row) + "\n";
|
||||
try {
|
||||
await this.pagesFH!.writeFile(processedRow);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (err: any) {
|
||||
logger.warn("pages/pages.jsonl append failed", err);
|
||||
}
|
||||
|
@ -1920,7 +1952,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
method: "HEAD",
|
||||
headers: this.headers,
|
||||
agent: this.resolveAgent,
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} as any);
|
||||
if (resp.status !== 200) {
|
||||
logger.debug("HEAD response code != 200, loading in browser", {
|
||||
|
@ -1961,14 +1993,14 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.info(
|
||||
"Fetching full sitemap (fromDate not specified/valid)",
|
||||
{ url, sitemapFromDate },
|
||||
"sitemap"
|
||||
"sitemap",
|
||||
);
|
||||
} else {
|
||||
lastmodFromTimestamp = dateObj.getTime();
|
||||
logger.info(
|
||||
"Fetching and filtering sitemap by date",
|
||||
{ url, sitemapFromDate },
|
||||
"sitemap"
|
||||
"sitemap",
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -1984,7 +2016,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const { sites } = await sitemapper.fetch();
|
||||
logger.info("Sitemap Urls Found", { urls: sites.length }, "sitemap");
|
||||
await this.queueInScopeUrls(seedId, sites, 0);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
logger.warn("Error fetching sites from sitemap", e, "sitemap");
|
||||
}
|
||||
|
@ -2088,21 +2120,21 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
async serializeConfig(done = false) {
|
||||
switch (this.params.saveState) {
|
||||
case "never":
|
||||
return;
|
||||
|
||||
case "partial":
|
||||
if (!done) {
|
||||
case "never":
|
||||
return;
|
||||
}
|
||||
if (await this.crawlState.isFinished()) {
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
||||
case "always":
|
||||
default:
|
||||
break;
|
||||
case "partial":
|
||||
if (!done) {
|
||||
return;
|
||||
}
|
||||
if (await this.crawlState.isFinished()) {
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
||||
case "always":
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
const now = new Date();
|
||||
|
@ -2137,7 +2169,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
try {
|
||||
logger.info(`Saving crawl state to: ${filename}`);
|
||||
await fsp.writeFile(filename, res);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
logger.error(`Failed to write save state file: ${filename}`, e);
|
||||
return;
|
||||
|
@ -2166,8 +2198,11 @@ self.__bx_behaviors.selectMainBehavior();
function shouldIgnoreAbort(req: HTTPRequest) {
try {
const failure = req.failure();
const failureText = failure && failure.errorText || "";
if (failureText !== "net::ERR_ABORTED" || req.resourceType() !== "document") {
const failureText = (failure && failure.errorText) || "";
if (
failureText !== "net::ERR_ABORTED" ||
req.resourceType() !== "document"
) {
return false;
}

@ -2178,8 +2213,10 @@ function shouldIgnoreAbort(req: HTTPRequest) {
return false;
}

if (headers["content-disposition"] ||
(headers["content-type"] && !headers["content-type"].startsWith("text/"))) {
if (
headers["content-disposition"] ||
(headers["content-type"] && !headers["content-type"].startsWith("text/"))
) {
return true;
}
} catch (e) {
@ -2188,4 +2225,3 @@ function shouldIgnoreAbort(req: HTTPRequest) {

return false;
}
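Note on the hunk above: shouldIgnoreAbort() is what lets a direct download (a document request that ends in net::ERR_ABORTED and carries a Content-Disposition header or a non-text Content-Type) pass without being counted as a failed page. As a hedged illustration only, and not part of this commit, the helper's result might be consulted around page.goto() roughly like the catch block shown earlier in this diff; the wrapper name and goto options below are assumptions for illustration.

import { Page } from "puppeteer-core";

// Sketch only (assumed helper, not in this commit): navigate, but swallow the
// net::ERR_ABORTED that Chromium raises when a navigation turns into a
// download that shouldIgnoreAbort() has already flagged.
async function gotoTolerantOfDownloads(
  page: Page,
  url: string,
  ignoreAbort: boolean,
) {
  try {
    await page.goto(url, { waitUntil: "load" });
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
  } catch (e: any) {
    const msg = e.message || "";
    if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
      throw e;
    }
  }
}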
@ -15,81 +15,99 @@ import { Browser } from "./util/browser.js";
import { initStorage } from "./util/storage.js";
import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";

const profileHTML = fs.readFileSync(new URL("../html/createProfile.html", import.meta.url), {encoding: "utf8"});
const vncHTML = fs.readFileSync(new URL("../html/vnc_lite.html", import.meta.url), {encoding: "utf8"});
const profileHTML = fs.readFileSync(
new URL("../html/createProfile.html", import.meta.url),
{ encoding: "utf8" },
);
const vncHTML = fs.readFileSync(
new URL("../html/vnc_lite.html", import.meta.url),
{ encoding: "utf8" },
);

const behaviors = fs.readFileSync(new URL("../node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});
const behaviors = fs.readFileSync(
new URL(
"../node_modules/browsertrix-behaviors/dist/behaviors.js",
import.meta.url,
),
{ encoding: "utf8" },
);
function cliOpts(): { [key: string]: Options } {
|
||||
function cliOpts(): { [key: string]: Options } {
|
||||
return {
|
||||
"url": {
|
||||
url: {
|
||||
describe: "The URL of the login page",
|
||||
type: "string",
|
||||
demandOption: true,
|
||||
},
|
||||
|
||||
"user": {
|
||||
describe: "The username for the login. If not specified, will be prompted",
|
||||
user: {
|
||||
describe:
|
||||
"The username for the login. If not specified, will be prompted",
|
||||
},
|
||||
|
||||
"password": {
|
||||
describe: "The password for the login. If not specified, will be prompted (recommended)",
|
||||
password: {
|
||||
describe:
|
||||
"The password for the login. If not specified, will be prompted (recommended)",
|
||||
},
|
||||
|
||||
"filename": {
|
||||
filename: {
|
||||
describe: "The filename for the profile tarball",
|
||||
default: "/crawls/profiles/profile.tar.gz",
|
||||
},
|
||||
|
||||
"debugScreenshot": {
|
||||
describe: "If specified, take a screenshot after login and save as this filename"
|
||||
debugScreenshot: {
|
||||
describe:
|
||||
"If specified, take a screenshot after login and save as this filename",
|
||||
},
|
||||
|
||||
"headless": {
|
||||
headless: {
|
||||
describe: "Run in headless mode, otherwise start xvfb",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"automated": {
|
||||
automated: {
|
||||
describe: "Start in automated mode, no interactive browser",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"interactive": {
|
||||
interactive: {
|
||||
describe: "Deprecated. Now the default option!",
|
||||
type: "boolean",
|
||||
default: false
|
||||
default: false,
|
||||
},
|
||||
|
||||
"shutdownWait": {
|
||||
describe: "Shutdown browser in interactive after this many seconds, if no pings received",
|
||||
shutdownWait: {
|
||||
describe:
|
||||
"Shutdown browser in interactive after this many seconds, if no pings received",
|
||||
type: "number",
|
||||
default: 0
|
||||
default: 0,
|
||||
},
|
||||
|
||||
"profile": {
|
||||
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
|
||||
profile: {
|
||||
describe:
|
||||
"Path to tar.gz file which will be extracted and used as the browser profile",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"windowSize": {
|
||||
windowSize: {
|
||||
type: "string",
|
||||
describe: "Browser window dimensions, specified as: width,height",
|
||||
default: getDefaultWindowSize()
|
||||
default: getDefaultWindowSize(),
|
||||
},
|
||||
|
||||
"proxy": {
|
||||
proxy: {
|
||||
type: "boolean",
|
||||
default: false
|
||||
default: false,
|
||||
},
|
||||
|
||||
"cookieDays": {
|
||||
cookieDays: {
|
||||
type: "number",
|
||||
describe: "If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
|
||||
default: 7
|
||||
}
|
||||
describe:
|
||||
"If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
|
||||
default: 7,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -100,14 +118,11 @@ function getDefaultWindowSize() {
|
|||
return `${x},${y}`;
|
||||
}
|
||||
|
||||
|
||||
|
||||
async function main() {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const params : any = yargs(process.argv)
|
||||
const params: any = yargs(process.argv)
|
||||
.usage("browsertrix-crawler profile [options]")
|
||||
.option(cliOpts())
|
||||
.argv;
|
||||
.option(cliOpts()).argv;
|
||||
|
||||
logger.setDebugLogging(true);
|
||||
|
||||
|
@ -122,7 +137,7 @@ async function main() {
|
|||
process.env.GEOMETRY || "",
|
||||
"-ac",
|
||||
"+extension",
|
||||
"RANDR"
|
||||
"RANDR",
|
||||
]);
|
||||
|
||||
//await fsp.mkdir(path.join(homedir(), ".vnc"), {recursive: true});
|
||||
|
@ -140,7 +155,7 @@ async function main() {
|
|||
"-passwd",
|
||||
process.env.VNC_PASS || "",
|
||||
"-display",
|
||||
process.env.DISPLAY || ""
|
||||
process.env.DISPLAY || "",
|
||||
]);
|
||||
}
|
||||
|
||||
|
@ -156,13 +171,15 @@ async function main() {
|
|||
"--window-position=0,0",
|
||||
`--window-size=${params.windowSize}`,
|
||||
// to disable the 'stability will suffer' infobar
|
||||
"--test-type"
|
||||
]
|
||||
}
|
||||
"--test-type",
|
||||
],
|
||||
},
|
||||
});
|
||||
|
||||
if (params.interactive) {
|
||||
logger.warn("Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode");
|
||||
logger.warn(
|
||||
"Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode",
|
||||
);
|
||||
}
|
||||
|
||||
if (params.user || params.password) {
|
||||
|
@ -179,20 +196,23 @@ async function main() {
|
|||
|
||||
const { page, cdp } = await browser.newWindowPageWithCDP();
|
||||
|
||||
const waitUntil : PuppeteerLifeCycleEvent = "load";
|
||||
const waitUntil: PuppeteerLifeCycleEvent = "load";
|
||||
|
||||
await page.setCacheEnabled(false);
|
||||
|
||||
if (!params.automated) {
|
||||
await browser.setupPage({page, cdp});
|
||||
await browser.setupPage({ page, cdp });
|
||||
|
||||
// for testing, inject browsertrix-behaviors
|
||||
await browser.addInitScript(page, behaviors + ";\nself.__bx_behaviors.init();");
|
||||
await browser.addInitScript(
|
||||
page,
|
||||
behaviors + ";\nself.__bx_behaviors.init();",
|
||||
);
|
||||
}
|
||||
|
||||
logger.info(`Loading page: ${params.url}`);
|
||||
|
||||
await page.goto(params.url, {waitUntil});
|
||||
await page.goto(params.url, { waitUntil });
|
||||
|
||||
if (!params.automated) {
|
||||
const target = await cdp.send("Target.getTargetInfo");
|
||||
|
@ -204,20 +224,29 @@ async function main() {
|
|||
}
|
||||
}
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
async function automatedProfile(params: any, browser: Browser, page: Page, cdp: CDPSession,
|
||||
waitUntil: PuppeteerLifeCycleEvent) {
|
||||
async function automatedProfile(
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
params: any,
|
||||
browser: Browser,
|
||||
page: Page,
|
||||
cdp: CDPSession,
|
||||
waitUntil: PuppeteerLifeCycleEvent,
|
||||
) {
|
||||
let u, p;
|
||||
|
||||
logger.debug("Looking for username and password entry fields on page...");
|
||||
|
||||
try {
|
||||
u = await page.waitForSelector("//input[contains(@name, 'user') or contains(@name, 'email')]");
|
||||
p = await page.waitForSelector("//input[contains(@name, 'pass') and @type='password']");
|
||||
|
||||
u = await page.waitForSelector(
|
||||
"//input[contains(@name, 'user') or contains(@name, 'email')]",
|
||||
);
|
||||
p = await page.waitForSelector(
|
||||
"//input[contains(@name, 'pass') and @type='password']",
|
||||
);
|
||||
} catch (e) {
|
||||
if (params.debugScreenshot) {
|
||||
await page.screenshot({path: params.debugScreenshot});
|
||||
await page.screenshot({ path: params.debugScreenshot });
|
||||
}
|
||||
logger.debug("Login form could not be found");
|
||||
await page.close();
|
||||
|
@ -231,11 +260,11 @@ async function automatedProfile(params: any, browser: Browser, page: Page, cdp:
|
|||
|
||||
await Promise.allSettled([
|
||||
p!.press("Enter"),
|
||||
page.waitForNavigation({waitUntil})
|
||||
page.waitForNavigation({ waitUntil }),
|
||||
]);
|
||||
|
||||
if (params.debugScreenshot) {
|
||||
await page.screenshot({path: params.debugScreenshot});
|
||||
await page.screenshot({ path: params.debugScreenshot });
|
||||
}
|
||||
|
||||
await createProfile(params, browser, page, cdp);
|
||||
|
@ -243,8 +272,15 @@ async function automatedProfile(params: any, browser: Browser, page: Page, cdp:
|
|||
process.exit(0);
|
||||
}
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
async function createProfile(params: any, browser: Browser, page: Page, cdp: CDPSession, targetFilename = "") {
|
||||
async function createProfile(
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
params: any,
|
||||
browser: Browser,
|
||||
page: Page,
|
||||
cdp: CDPSession,
|
||||
targetFilename = "",
|
||||
) {
|
||||
await cdp.send("Network.clearBrowserCache");
|
||||
|
||||
await browser.close();
|
||||
|
@ -252,10 +288,10 @@ async function createProfile(params: any, browser: Browser, page: Page, cdp: CDP
|
|||
logger.info("Creating profile");
|
||||
|
||||
const profileFilename = params.filename || "/crawls/profiles/profile.tar.gz";
|
||||
|
||||
|
||||
const outputDir = path.dirname(profileFilename);
|
||||
if (outputDir && !fs.existsSync(outputDir)) {
|
||||
fs.mkdirSync(outputDir, {recursive: true});
|
||||
fs.mkdirSync(outputDir, { recursive: true });
|
||||
}
|
||||
|
||||
browser.saveProfile(profileFilename);
|
||||
|
@ -274,9 +310,9 @@ async function createProfile(params: any, browser: Browser, page: Page, cdp: CDP
|
|||
|
||||
function promptInput(msg: string, hidden = false) {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const rl : any = readline.createInterface({
|
||||
const rl: any = readline.createInterface({
|
||||
input: process.stdin,
|
||||
output: process.stdout
|
||||
output: process.stdout,
|
||||
});
|
||||
|
||||
if (hidden) {
|
||||
|
@ -303,7 +339,6 @@ function promptInput(msg: string, hidden = false) {
|
|||
});
|
||||
}
|
||||
|
||||
|
||||
class InteractiveBrowser {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
params: any;
|
||||
|
@ -323,7 +358,7 @@ class InteractiveBrowser {
|
|||
browser: Browser,
|
||||
page: Page,
|
||||
cdp: CDPSession,
|
||||
targetId: string
|
||||
targetId: string,
|
||||
) {
|
||||
logger.info("Creating Profile Interactively...");
|
||||
child_process.spawn("socat", [
|
||||
|
@ -359,19 +394,19 @@ class InteractiveBrowser {
|
|||
if (this.shutdownWait) {
|
||||
this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
|
||||
logger.debug(
|
||||
`Shutting down in ${this.shutdownWait}ms if no ping received`
|
||||
`Shutting down in ${this.shutdownWait}ms if no ping received`,
|
||||
);
|
||||
} else {
|
||||
this.shutdownTimer = null;
|
||||
}
|
||||
|
||||
const httpServer = http.createServer((req, res) =>
|
||||
this.handleRequest(req, res)
|
||||
this.handleRequest(req, res),
|
||||
);
|
||||
const port = 9223;
|
||||
httpServer.listen(port);
|
||||
logger.info(
|
||||
`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`
|
||||
`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`,
|
||||
);
|
||||
|
||||
if (!params.headless) {
|
||||
|
@ -442,141 +477,141 @@ class InteractiveBrowser {
|
|||
let origins;
|
||||
|
||||
switch (pathname) {
|
||||
case "/":
|
||||
res.writeHead(200, { "Content-Type": "text/html" });
|
||||
if (this.params.headless) {
|
||||
targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${this.targetId}&panel=resources`;
|
||||
} else {
|
||||
targetUrl = `http://$HOST:9223/vnc/?host=$HOST&port=6080&password=${process.env.VNC_PASS}`;
|
||||
}
|
||||
res.end(
|
||||
profileHTML.replace(
|
||||
"$DEVTOOLS_SRC",
|
||||
targetUrl.replaceAll("$HOST", parsedUrl.hostname)
|
||||
)
|
||||
);
|
||||
return;
|
||||
|
||||
case "/vnc/":
|
||||
case "/vnc/index.html":
|
||||
res.writeHead(200, { "Content-Type": "text/html" });
|
||||
res.end(vncHTML);
|
||||
return;
|
||||
|
||||
case "/ping":
|
||||
if (this.shutdownWait) {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
clearTimeout(this.shutdownTimer as any);
|
||||
this.shutdownTimer = setTimeout(
|
||||
() => process.exit(0),
|
||||
this.shutdownWait
|
||||
case "/":
|
||||
res.writeHead(200, { "Content-Type": "text/html" });
|
||||
if (this.params.headless) {
|
||||
targetUrl = `http://$HOST:9222/devtools/inspector.html?ws=$HOST:9222/devtools/page/${this.targetId}&panel=resources`;
|
||||
} else {
|
||||
targetUrl = `http://$HOST:9223/vnc/?host=$HOST&port=6080&password=${process.env.VNC_PASS}`;
|
||||
}
|
||||
res.end(
|
||||
profileHTML.replace(
|
||||
"$DEVTOOLS_SRC",
|
||||
targetUrl.replaceAll("$HOST", parsedUrl.hostname),
|
||||
),
|
||||
);
|
||||
logger.debug(
|
||||
`Ping received, delaying shutdown for ${this.shutdownWait}ms`
|
||||
);
|
||||
}
|
||||
return;
|
||||
|
||||
origins = Array.from(this.originSet.values());
|
||||
case "/vnc/":
|
||||
case "/vnc/index.html":
|
||||
res.writeHead(200, { "Content-Type": "text/html" });
|
||||
res.end(vncHTML);
|
||||
return;
|
||||
|
||||
res.writeHead(200, { "Content-Type": "application/json" });
|
||||
case "/ping":
|
||||
if (this.shutdownWait) {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
clearTimeout(this.shutdownTimer as any);
|
||||
this.shutdownTimer = setTimeout(
|
||||
() => process.exit(0),
|
||||
this.shutdownWait,
|
||||
);
|
||||
logger.debug(
|
||||
`Ping received, delaying shutdown for ${this.shutdownWait}ms`,
|
||||
);
|
||||
}
|
||||
|
||||
res.end(JSON.stringify({ pong: true, origins }));
|
||||
return;
|
||||
|
||||
case "/target":
|
||||
res.writeHead(200, { "Content-Type": "application/json" });
|
||||
res.end(JSON.stringify({ targetId: this.targetId }));
|
||||
return;
|
||||
|
||||
case "/vncpass":
|
||||
res.writeHead(200, { "Content-Type": "application/json" });
|
||||
res.end(JSON.stringify({ password: process.env.VNC_PASS }));
|
||||
return;
|
||||
|
||||
case "/navigate":
|
||||
if (req.method !== "POST") {
|
||||
break;
|
||||
}
|
||||
|
||||
try {
|
||||
const postData = await this.readBodyJson(req);
|
||||
const url = new URL(postData.url).href;
|
||||
|
||||
res.writeHead(200, { "Content-Type": "application/json" });
|
||||
res.end(JSON.stringify({ success: true }));
|
||||
|
||||
this.page.goto(url);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
res.writeHead(400, { "Content-Type": "application/json" });
|
||||
res.end(JSON.stringify({ error: e.toString() }));
|
||||
logger.warn("HTTP Error", e);
|
||||
}
|
||||
return;
|
||||
|
||||
case "/createProfileJS":
|
||||
if (req.method !== "POST") {
|
||||
break;
|
||||
}
|
||||
|
||||
try {
|
||||
const postData = await this.readBodyJson(req);
|
||||
const targetFilename = postData.filename || "";
|
||||
|
||||
await this.saveAllCookies();
|
||||
|
||||
const resource = await createProfile(
|
||||
this.params,
|
||||
this.browser,
|
||||
this.page,
|
||||
this.cdp,
|
||||
targetFilename
|
||||
);
|
||||
origins = Array.from(this.originSet.values());
|
||||
|
||||
res.writeHead(200, { "Content-Type": "application/json" });
|
||||
res.end(JSON.stringify({ resource, origins }));
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
res.writeHead(500, { "Content-Type": "application/json" });
|
||||
res.end(JSON.stringify({ error: e.toString() }));
|
||||
logger.warn("HTTP Error", e);
|
||||
}
|
||||
|
||||
setTimeout(() => process.exit(0), 200);
|
||||
return;
|
||||
res.end(JSON.stringify({ pong: true, origins }));
|
||||
return;
|
||||
|
||||
case "/createProfile":
|
||||
if (req.method !== "POST") {
|
||||
break;
|
||||
}
|
||||
case "/target":
|
||||
res.writeHead(200, { "Content-Type": "application/json" });
|
||||
res.end(JSON.stringify({ targetId: this.targetId }));
|
||||
return;
|
||||
|
||||
try {
|
||||
await this.saveAllCookies();
|
||||
case "/vncpass":
|
||||
res.writeHead(200, { "Content-Type": "application/json" });
|
||||
res.end(JSON.stringify({ password: process.env.VNC_PASS }));
|
||||
return;
|
||||
|
||||
await createProfile(this.params, this.browser, this.page, this.cdp);
|
||||
case "/navigate":
|
||||
if (req.method !== "POST") {
|
||||
break;
|
||||
}
|
||||
|
||||
res.writeHead(200, { "Content-Type": "text/html" });
|
||||
res.end(
|
||||
"<html><body>Profile Created! You may now close this window.</body></html>"
|
||||
);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
res.writeHead(500, { "Content-Type": "text/html" });
|
||||
res.end(
|
||||
"<html><body>Profile creation failed! See the browsertrix-crawler console for more info"
|
||||
);
|
||||
logger.warn("HTTP Error", e);
|
||||
}
|
||||
try {
|
||||
const postData = await this.readBodyJson(req);
|
||||
const url = new URL(postData.url).href;
|
||||
|
||||
setTimeout(() => process.exit(0), 200);
|
||||
return;
|
||||
res.writeHead(200, { "Content-Type": "application/json" });
|
||||
res.end(JSON.stringify({ success: true }));
|
||||
|
||||
this.page.goto(url);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
res.writeHead(400, { "Content-Type": "application/json" });
|
||||
res.end(JSON.stringify({ error: e.toString() }));
|
||||
logger.warn("HTTP Error", e);
|
||||
}
|
||||
return;
|
||||
|
||||
case "/createProfileJS":
|
||||
if (req.method !== "POST") {
|
||||
break;
|
||||
}
|
||||
|
||||
try {
|
||||
const postData = await this.readBodyJson(req);
|
||||
const targetFilename = postData.filename || "";
|
||||
|
||||
await this.saveAllCookies();
|
||||
|
||||
const resource = await createProfile(
|
||||
this.params,
|
||||
this.browser,
|
||||
this.page,
|
||||
this.cdp,
|
||||
targetFilename,
|
||||
);
|
||||
origins = Array.from(this.originSet.values());
|
||||
|
||||
res.writeHead(200, { "Content-Type": "application/json" });
|
||||
res.end(JSON.stringify({ resource, origins }));
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
res.writeHead(500, { "Content-Type": "application/json" });
|
||||
res.end(JSON.stringify({ error: e.toString() }));
|
||||
logger.warn("HTTP Error", e);
|
||||
}
|
||||
|
||||
setTimeout(() => process.exit(0), 200);
|
||||
return;
|
||||
|
||||
case "/createProfile":
|
||||
if (req.method !== "POST") {
|
||||
break;
|
||||
}
|
||||
|
||||
try {
|
||||
await this.saveAllCookies();
|
||||
|
||||
await createProfile(this.params, this.browser, this.page, this.cdp);
|
||||
|
||||
res.writeHead(200, { "Content-Type": "text/html" });
|
||||
res.end(
|
||||
"<html><body>Profile Created! You may now close this window.</body></html>",
|
||||
);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
res.writeHead(500, { "Content-Type": "text/html" });
|
||||
res.end(
|
||||
"<html><body>Profile creation failed! See the browsertrix-crawler console for more info",
|
||||
);
|
||||
logger.warn("HTTP Error", e);
|
||||
}
|
||||
|
||||
setTimeout(() => process.exit(0), 200);
|
||||
return;
|
||||
}
|
||||
|
||||
if (pathname.startsWith("/vnc/")) {
|
||||
const fileUrl = new URL(
|
||||
"../node_modules/@novnc/novnc/" + pathname.slice("/vnc/".length),
|
||||
import.meta.url
|
||||
import.meta.url,
|
||||
);
|
||||
const file = fs.readFileSync(fileUrl, { encoding: "utf-8" });
|
||||
res.writeHead(200, { "Content-Type": "application/javascript" });
|
||||
|
@ -607,6 +642,4 @@ class InteractiveBrowser {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
main();
|
||||
|
||||
|
|
|
@ -2,6 +2,14 @@ import { Page } from "puppeteer-core";
import { PageState } from "./util/state.js";
import { Crawler } from "./crawler.js";

export default async ({data, page, crawler} : {data: PageState, page: Page, crawler: Crawler}) => {
export default async ({
data,
page,
crawler,
}: {
data: PageState;
page: Page;
crawler: Crawler;
}) => {
await crawler.loadPage(page, data);
};
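The file above (defaultDriver.ts) is the module loaded through the --driver option (default "./defaultDriver.js", see the argParser hunks later in this diff). As an illustrative sketch only, and not part of this commit, a custom driver could keep the same default export shape and add per-page logic around crawler.loadPage():

// Illustrative custom driver sketch (assumption, not in this commit); it
// mirrors the signature shown in defaultDriver.ts and assumes the same
// relative module layout when placed alongside it.
import { Page } from "puppeteer-core";
import { PageState } from "./util/state.js";
import { Crawler } from "./crawler.js";

export default async ({
  data,
  page,
  crawler,
}: {
  data: PageState;
  page: Page;
  crawler: Crawler;
}) => {
  // Custom pre-load logic could go here (e.g. extra logging or headers).
  await crawler.loadPage(page, data);
  // Custom post-load logic could go here.
};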
@ -4,13 +4,11 @@ import { logger } from "./util/logger.js";
import { setExitOnRedisError } from "./util/redis.js";
import { Crawler } from "./crawler.js";

let crawler : Crawler | null = null;
let crawler: Crawler | null = null;

let lastSigInt = 0;
let forceTerm = false;

async function handleTerminate(signame: string) {
logger.info(`${signame} received...`);
if (!crawler || !crawler.crawlState) {

@ -53,5 +51,3 @@ process.on("SIGABRT", async () => {
crawler = new Crawler();
crawler.run();
@ -7,199 +7,225 @@ import { KnownDevices as devices } from "puppeteer-core";
import yargs, { Options } from "yargs";
import { hideBin } from "yargs/helpers";

import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES } from "./constants.js";
import {
BEHAVIOR_LOG_FUNC,
WAIT_UNTIL_OPTS,
EXTRACT_TEXT_TYPES,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
import { screenshotTypes } from "./screenshots.js";
import { logger } from "./logger.js";

// ============================================================================
class ArgParser {
get cliOpts() : { [key: string]: Options } {
const coerce = (array : string[]) => {
return array.flatMap(v => v.split(",")).filter(x => !!x);
get cliOpts(): { [key: string]: Options } {
const coerce = (array: string[]) => {
return array.flatMap((v) => v.split(",")).filter((x) => !!x);
};
return {
|
||||
"seeds": {
|
||||
seeds: {
|
||||
alias: "url",
|
||||
describe: "The URL to start crawling from",
|
||||
type: "array",
|
||||
default: [],
|
||||
},
|
||||
|
||||
"seedFile": {
|
||||
seedFile: {
|
||||
alias: ["urlFile"],
|
||||
describe: "If set, read a list of seed urls, one per line, from the specified",
|
||||
describe:
|
||||
"If set, read a list of seed urls, one per line, from the specified",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"workers": {
|
||||
workers: {
|
||||
alias: "w",
|
||||
describe: "The number of workers to run in parallel",
|
||||
default: 1,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"crawlId": {
|
||||
crawlId: {
|
||||
alias: "id",
|
||||
describe: "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to hostname)",
|
||||
describe:
|
||||
"A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to hostname)",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"waitUntil": {
|
||||
describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
|
||||
waitUntil: {
|
||||
describe:
|
||||
"Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
|
||||
type: "array",
|
||||
default: ["load", "networkidle2"],
|
||||
choices: WAIT_UNTIL_OPTS,
|
||||
coerce,
|
||||
},
|
||||
|
||||
"depth": {
|
||||
depth: {
|
||||
describe: "The depth of the crawl for all seeds",
|
||||
default: -1,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"extraHops": {
|
||||
extraHops: {
|
||||
describe: "Number of extra 'hops' to follow, beyond the current scope",
|
||||
default: 0,
|
||||
type: "number"
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"pageLimit": {
|
||||
pageLimit: {
|
||||
alias: "limit",
|
||||
describe: "Limit crawl to this number of pages",
|
||||
default: 0,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"maxPageLimit": {
|
||||
describe: "Maximum pages to crawl, overriding pageLimit if both are set",
|
||||
maxPageLimit: {
|
||||
describe:
|
||||
"Maximum pages to crawl, overriding pageLimit if both are set",
|
||||
default: 0,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"pageLoadTimeout": {
|
||||
pageLoadTimeout: {
|
||||
alias: "timeout",
|
||||
describe: "Timeout for each page to load (in seconds)",
|
||||
default: 90,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"scopeType": {
|
||||
describe: "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes",
|
||||
scopeType: {
|
||||
describe:
|
||||
"A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes",
|
||||
type: "string",
|
||||
choices: ["page", "page-spa", "prefix", "host", "domain", "any", "custom"]
|
||||
choices: [
|
||||
"page",
|
||||
"page-spa",
|
||||
"prefix",
|
||||
"host",
|
||||
"domain",
|
||||
"any",
|
||||
"custom",
|
||||
],
|
||||
},
|
||||
|
||||
"scopeIncludeRx": {
|
||||
scopeIncludeRx: {
|
||||
alias: "include",
|
||||
describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
|
||||
describe:
|
||||
"Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
|
||||
},
|
||||
|
||||
"scopeExcludeRx": {
|
||||
scopeExcludeRx: {
|
||||
alias: "exclude",
|
||||
describe: "Regex of page URLs that should be excluded from the crawl."
|
||||
describe: "Regex of page URLs that should be excluded from the crawl.",
|
||||
},
|
||||
|
||||
"allowHashUrls": {
|
||||
describe: "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
|
||||
allowHashUrls: {
|
||||
describe:
|
||||
"Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
|
||||
},
|
||||
|
||||
"blockRules": {
|
||||
describe: "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
|
||||
blockRules: {
|
||||
describe:
|
||||
"Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
|
||||
type: "array",
|
||||
default: [],
|
||||
},
|
||||
|
||||
"blockMessage": {
|
||||
describe: "If specified, when a URL is blocked, a record with this error message is added instead",
|
||||
blockMessage: {
|
||||
describe:
|
||||
"If specified, when a URL is blocked, a record with this error message is added instead",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"blockAds": {
|
||||
blockAds: {
|
||||
alias: "blockads",
|
||||
describe: "If set, block advertisements from being loaded (based on Stephen Black's blocklist)",
|
||||
describe:
|
||||
"If set, block advertisements from being loaded (based on Stephen Black's blocklist)",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"adBlockMessage": {
|
||||
describe: "If specified, when an ad is blocked, a record with this error message is added instead",
|
||||
adBlockMessage: {
|
||||
describe:
|
||||
"If specified, when an ad is blocked, a record with this error message is added instead",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"collection": {
|
||||
collection: {
|
||||
alias: "c",
|
||||
describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
|
||||
describe:
|
||||
"Collection name to crawl to (replay will be accessible under this name in pywb preview)",
|
||||
type: "string",
|
||||
default: "crawl-@ts"
|
||||
default: "crawl-@ts",
|
||||
},
|
||||
|
||||
"headless": {
|
||||
headless: {
|
||||
describe: "Run in headless mode, otherwise start xvfb",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"driver": {
|
||||
driver: {
|
||||
describe: "JS driver for the crawler",
|
||||
type: "string",
|
||||
default: "./defaultDriver.js",
|
||||
},
|
||||
|
||||
"generateCDX": {
|
||||
generateCDX: {
|
||||
alias: ["generatecdx", "generateCdx"],
|
||||
describe: "If set, generate index (CDXJ) for use with pywb after crawl is done",
|
||||
describe:
|
||||
"If set, generate index (CDXJ) for use with pywb after crawl is done",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"combineWARC": {
|
||||
combineWARC: {
|
||||
alias: ["combinewarc", "combineWarc"],
|
||||
describe: "If set, combine the warcs",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"rolloverSize": {
|
||||
rolloverSize: {
|
||||
describe: "If set, declare the rollover size",
|
||||
default: 1000000000,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"generateWACZ": {
|
||||
generateWACZ: {
|
||||
alias: ["generatewacz", "generateWacz"],
|
||||
describe: "If set, generate wacz",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"logging": {
|
||||
describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, debug",
|
||||
logging: {
|
||||
describe:
|
||||
"Logging options for crawler, can include: stats (enabled by default), jserrors, debug",
|
||||
type: "array",
|
||||
default: ["stats"],
|
||||
coerce,
|
||||
},
|
||||
|
||||
"logLevel": {
|
||||
logLevel: {
|
||||
describe: "Comma-separated list of log levels to include in logs",
|
||||
type: "array",
|
||||
default: [],
|
||||
coerce,
|
||||
},
|
||||
|
||||
"context": {
|
||||
context: {
|
||||
describe: "Comma-separated list of contexts to include in logs",
|
||||
type: "array",
|
||||
default: [],
|
||||
coerce,
|
||||
},
|
||||
|
||||
"text": {
|
||||
describe: "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
|
||||
text: {
|
||||
describe:
|
||||
"Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
|
||||
type: "array",
|
||||
choices: EXTRACT_TEXT_TYPES,
|
||||
coerce: (array) => {
|
||||
|
@ -211,45 +237,51 @@ class ArgParser {
|
|||
return [];
|
||||
}
|
||||
return coerce(array);
|
||||
}
|
||||
},
|
||||
},
|
||||
|
||||
"cwd": {
|
||||
describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
|
||||
cwd: {
|
||||
describe:
|
||||
"Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
|
||||
type: "string",
|
||||
default: process.cwd(),
|
||||
},
|
||||
|
||||
"mobileDevice": {
|
||||
describe: "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
|
||||
mobileDevice: {
|
||||
describe:
|
||||
"Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"userAgent": {
|
||||
userAgent: {
|
||||
describe: "Override user-agent with specified string",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"userAgentSuffix": {
|
||||
describe: "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
|
||||
userAgentSuffix: {
|
||||
describe:
|
||||
"Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"useSitemap": {
|
||||
useSitemap: {
|
||||
alias: "sitemap",
|
||||
describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
|
||||
describe:
|
||||
"If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
|
||||
},
|
||||
|
||||
"sitemapFromDate": {
|
||||
sitemapFromDate: {
|
||||
alias: "sitemapFrom",
|
||||
describe: "If set, filter URLs from sitemaps to those greater than or equal to provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
|
||||
describe:
|
||||
"If set, filter URLs from sitemaps to those greater than or equal to provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
|
||||
},
|
||||
|
||||
"statsFilename": {
|
||||
describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
|
||||
statsFilename: {
|
||||
describe:
|
||||
"If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)",
|
||||
},
|
||||
|
||||
"behaviors": {
|
||||
behaviors: {
|
||||
describe: "Which background behaviors to enable on each page",
|
||||
type: "array",
|
||||
default: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
|
||||
|
@ -257,179 +289,204 @@ class ArgParser {
|
|||
coerce,
|
||||
},
|
||||
|
||||
"behaviorTimeout": {
|
||||
describe: "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish.",
|
||||
behaviorTimeout: {
|
||||
describe:
|
||||
"If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish.",
|
||||
default: 90,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"pageExtraDelay": {
|
||||
pageExtraDelay: {
|
||||
alias: "delay",
|
||||
describe: "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page",
|
||||
describe:
|
||||
"If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page",
|
||||
default: 0,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"dedupPolicy": {
|
||||
dedupPolicy: {
|
||||
describe: "Deduplication policy",
|
||||
default: "skip",
|
||||
type: "string",
|
||||
choices: ["skip", "revisit", "keep"],
|
||||
},
|
||||
|
||||
"profile": {
|
||||
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
|
||||
profile: {
|
||||
describe:
|
||||
"Path to tar.gz file which will be extracted and used as the browser profile",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"screenshot": {
|
||||
describe: "Screenshot options for crawler, can include: view, thumbnail, fullPage",
|
||||
screenshot: {
|
||||
describe:
|
||||
"Screenshot options for crawler, can include: view, thumbnail, fullPage",
|
||||
type: "array",
|
||||
default: [],
|
||||
choices: Array.from(Object.keys(screenshotTypes)),
|
||||
coerce,
|
||||
},
|
||||
|
||||
"screencastPort": {
|
||||
describe: "If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
|
||||
type: "number",
|
||||
default: 0
|
||||
},
|
||||
|
||||
"screencastRedis": {
|
||||
describe: "If set, will use the state store redis pubsub for screencasting. Requires --redisStoreUrl to be set",
|
||||
type: "boolean",
|
||||
default: false
|
||||
},
|
||||
|
||||
"warcInfo": {
|
||||
alias: ["warcinfo"],
|
||||
describe: "Optional fields added to the warcinfo record in combined WARCs",
|
||||
//type: "object"
|
||||
},
|
||||
|
||||
"redisStoreUrl": {
|
||||
describe: "If set, url for remote redis server to store state. Otherwise, using in-memory store",
|
||||
type: "string",
|
||||
default: "redis://localhost:6379/0"
|
||||
},
|
||||
|
||||
"saveState": {
|
||||
describe: "If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
|
||||
type: "string",
|
||||
default: "partial",
|
||||
choices: ["never", "partial", "always"]
|
||||
},
|
||||
|
||||
"saveStateInterval": {
|
||||
describe: "If save state is set to 'always', also save state during the crawl at this interval (in seconds)",
|
||||
type: "number",
|
||||
default: 300,
|
||||
},
|
||||
|
||||
"saveStateHistory": {
|
||||
describe: "Number of save states to keep during the duration of a crawl",
|
||||
type: "number",
|
||||
default: 5,
|
||||
},
|
||||
|
||||
"sizeLimit": {
|
||||
describe: "If set, save state and exit if size limit exceeds this value",
|
||||
screencastPort: {
|
||||
describe:
|
||||
"If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
|
||||
type: "number",
|
||||
default: 0,
|
||||
},
|
||||
|
||||
"diskUtilization": {
|
||||
describe: "If set, save state and exit if disk utilization exceeds this percentage value",
|
||||
screencastRedis: {
|
||||
describe:
|
||||
"If set, will use the state store redis pubsub for screencasting. Requires --redisStoreUrl to be set",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
warcInfo: {
|
||||
alias: ["warcinfo"],
|
||||
describe:
|
||||
"Optional fields added to the warcinfo record in combined WARCs",
|
||||
//type: "object"
|
||||
},
|
||||
|
||||
redisStoreUrl: {
|
||||
describe:
|
||||
"If set, url for remote redis server to store state. Otherwise, using in-memory store",
|
||||
type: "string",
|
||||
default: "redis://localhost:6379/0",
|
||||
},
|
||||
|
||||
saveState: {
|
||||
describe:
|
||||
"If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
|
||||
type: "string",
|
||||
default: "partial",
|
||||
choices: ["never", "partial", "always"],
|
||||
},
|
||||
|
||||
saveStateInterval: {
|
||||
describe:
|
||||
"If save state is set to 'always', also save state during the crawl at this interval (in seconds)",
|
||||
type: "number",
|
||||
default: 300,
|
||||
},
|
||||
|
||||
saveStateHistory: {
|
||||
describe:
|
||||
"Number of save states to keep during the duration of a crawl",
|
||||
type: "number",
|
||||
default: 5,
|
||||
},
|
||||
|
||||
sizeLimit: {
|
||||
describe:
|
||||
"If set, save state and exit if size limit exceeds this value",
|
||||
type: "number",
|
||||
default: 0,
|
||||
},
|
||||
|
||||
diskUtilization: {
|
||||
describe:
|
||||
"If set, save state and exit if disk utilization exceeds this percentage value",
|
||||
type: "number",
|
||||
default: 90,
|
||||
},
|
||||
|
||||
"timeLimit": {
|
||||
timeLimit: {
|
||||
describe: "If set, save state and exit after time limit, in seconds",
|
||||
type: "number",
|
||||
default: 0,
|
||||
},
|
||||
|
||||
"healthCheckPort": {
|
||||
healthCheckPort: {
|
||||
describe: "port to run healthcheck on",
|
||||
type: "number",
|
||||
default: 0,
|
||||
},
|
||||
|
||||
"overwrite": {
|
||||
describe: "overwrite current crawl data: if set, existing collection directory will be deleted before crawl is started",
|
||||
overwrite: {
|
||||
describe:
|
||||
"overwrite current crawl data: if set, existing collection directory will be deleted before crawl is started",
|
||||
type: "boolean",
|
||||
default: false
|
||||
default: false,
|
||||
},
|
||||
|
||||
"waitOnDone": {
|
||||
describe: "if set, wait for interrupt signal when finished instead of exiting",
|
||||
waitOnDone: {
|
||||
describe:
|
||||
"if set, wait for interrupt signal when finished instead of exiting",
|
||||
type: "boolean",
|
||||
default: false
|
||||
default: false,
|
||||
},
|
||||
|
||||
"restartsOnError": {
|
||||
describe: "if set, assume will be restarted if interrupted, don't run post-crawl processes on interrupt",
|
||||
restartsOnError: {
|
||||
describe:
|
||||
"if set, assume will be restarted if interrupted, don't run post-crawl processes on interrupt",
|
||||
type: "boolean",
|
||||
default: false
|
||||
default: false,
|
||||
},
|
||||
|
||||
"netIdleWait": {
|
||||
describe: "if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
|
||||
netIdleWait: {
|
||||
describe:
|
||||
"if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
|
||||
type: "number",
|
||||
default: -1
|
||||
default: -1,
|
||||
},
|
||||
|
||||
"lang": {
|
||||
describe: "if set, sets the language used by the browser, should be ISO 639 language[-country] code",
|
||||
type: "string"
|
||||
lang: {
|
||||
describe:
|
||||
"if set, sets the language used by the browser, should be ISO 639 language[-country] code",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"title": {
|
||||
describe: "If set, write supplied title into WACZ datapackage.json metadata",
|
||||
type: "string"
|
||||
title: {
|
||||
describe:
|
||||
"If set, write supplied title into WACZ datapackage.json metadata",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"description": {
|
||||
description: {
|
||||
alias: ["desc"],
|
||||
describe: "If set, write supplied description into WACZ datapackage.json metadata",
|
||||
type: "string"
|
||||
describe:
|
||||
"If set, write supplied description into WACZ datapackage.json metadata",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"originOverride": {
|
||||
describe: "if set, will redirect requests from each origin in key to origin in the value, eg. --originOverride https://host:port=http://alt-host:alt-port",
|
||||
originOverride: {
|
||||
describe:
|
||||
"if set, will redirect requests from each origin in key to origin in the value, eg. --originOverride https://host:port=http://alt-host:alt-port",
|
||||
type: "array",
|
||||
default: [],
|
||||
},
|
||||
|
||||
"logErrorsToRedis": {
|
||||
logErrorsToRedis: {
|
||||
describe: "If set, write error messages to redis",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"failOnFailedSeed": {
|
||||
describe: "If set, crawler will fail with exit code 1 if any seed fails",
|
||||
failOnFailedSeed: {
|
||||
describe:
|
||||
"If set, crawler will fail with exit code 1 if any seed fails",
|
||||
type: "boolean",
|
||||
default: false
|
||||
default: false,
|
||||
},
|
||||
|
||||
"failOnFailedLimit": {
|
||||
describe: "If set, save state and exit if number of failed pages exceeds this value",
|
||||
failOnFailedLimit: {
|
||||
describe:
|
||||
"If set, save state and exit if number of failed pages exceeds this value",
|
||||
type: "number",
|
||||
default: 0,
|
||||
},
|
||||
|
||||
"customBehaviors": {
|
||||
describe: "injects a custom behavior file or set of behavior files in a directory",
|
||||
type: "string"
|
||||
customBehaviors: {
|
||||
describe:
|
||||
"injects a custom behavior file or set of behavior files in a directory",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"debugAccessRedis": {
|
||||
describe: "if set, runs internal redis without protected mode to allow external access (for debugging)",
|
||||
debugAccessRedis: {
|
||||
describe:
|
||||
"if set, runs internal redis without protected mode to allow external access (for debugging)",
|
||||
type: "boolean",
|
||||
}
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
|
@@ -445,25 +502,28 @@ class ArgParser
const parsed = yargs(hideBin(argv))
.usage("crawler [options]")
.option(this.cliOpts)
.config("config", "Path to YAML config file", (configPath : string | number) => {
if (configPath === "/crawls/stdin") {
configPath = process.stdin.fd;
}
// eslint-disable-next-line @typescript-eslint/no-explicit-any
origConfig = yaml.load(fs.readFileSync(configPath, "utf8")) as any;
return origConfig;
})
.check((argv) => this.validateArgs(argv))
.argv;
.config(
"config",
"Path to YAML config file",
(configPath: string | number) => {
if (configPath === "/crawls/stdin") {
configPath = process.stdin.fd;
}
// eslint-disable-next-line @typescript-eslint/no-explicit-any
origConfig = yaml.load(fs.readFileSync(configPath, "utf8")) as any;
return origConfig;
},
)
.check((argv) => this.validateArgs(argv)).argv;

return {parsed, origConfig};
return { parsed, origConfig };
}

splitCrawlArgsQuoteSafe(crawlArgs: string) : string[] {
splitCrawlArgsQuoteSafe(crawlArgs: string): string[] {
// Split process.env.CRAWL_ARGS on spaces but retaining spaces within double quotes
const regex = /"[^"]+"|[^\s]+/g;
const res = crawlArgs.match(regex);
return res ? res.map(e => e.replace(/"(.+)"/, "$1")) : [];
return res ? res.map((e) => e.replace(/"(.+)"/, "$1")) : [];
}

// eslint-disable-next-line @typescript-eslint/no-explicit-any

@@ -472,13 +532,15 @@ class ArgParser
argv.collection = interpolateFilename(argv.collection, argv.crawlId);

// Check that the collection name is valid.
if (argv.collection.search(/^[\w][\w-]*$/) === -1){
logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
if (argv.collection.search(/^[\w][\w-]*$/) === -1) {
logger.fatal(
`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`,
);
}

// background behaviors to apply
const behaviorOpts : {[key: string]: string | boolean} = {};
argv.behaviors.forEach((x: string) => behaviorOpts[x] = true);
const behaviorOpts: { [key: string]: string | boolean } = {};
argv.behaviors.forEach((x: string) => (behaviorOpts[x] = true));
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
argv.behaviorOpts = JSON.stringify(behaviorOpts);

@@ -486,19 +548,21 @@ class ArgParser

if (argv.mobileDevice) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
argv.emulateDevice = (devices as Record<string, any>)[argv.mobileDevice.replace("-", " ")];
argv.emulateDevice = (devices as Record<string, any>)[
argv.mobileDevice.replace("-", " ")
];
if (!argv.emulateDevice) {
logger.fatal("Unknown device: " + argv.mobileDevice);
}
} else {
argv.emulateDevice = {viewport: null};
argv.emulateDevice = { viewport: null };
}

if (argv.seedFile) {
const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");
const urlSeedFileList = urlSeedFile.split("\n");

if (typeof(argv.seeds) === "string") {
if (typeof argv.seeds === "string") {
argv.seeds = [argv.seeds];
}

@@ -530,12 +594,12 @@ class ArgParser
argv.scopedSeeds = [];

for (let seed of argv.seeds) {
if (typeof(seed) === "string") {
seed = {url: seed};
if (typeof seed === "string") {
seed = { url: seed };
}

try {
argv.scopedSeeds.push(new ScopedSeed({...scopeOpts, ...seed}));
argv.scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...seed }));
} catch (e) {
if (argv.failOnFailedSeed) {
logger.fatal(`Invalid Seed "${seed.url}" specified, aborting crawl.`);

@@ -552,7 +616,7 @@ class ArgParser
argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
}

if ((argv.diskUtilization < 0 || argv.diskUtilization > 99)) {
if (argv.diskUtilization < 0 || argv.diskUtilization > 99) {
argv.diskUtilization = 90;
}

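For reference, the quote-safe splitting reformatted above can be exercised in isolation. This is a minimal sketch; the sample CRAWL_ARGS string is an invented example, not taken from the diff:

// Quoted chunks are kept together by match(), then the replace strips the surrounding quotes.
const crawlArgs = '--url "https://example.com/a page" --workers 2';
const regex = /"[^"]+"|[^\s]+/g;
const parts = crawlArgs.match(regex) || [];
const args = parts.map((e) => e.replace(/"(.+)"/, "$1"));
// args: ["--url", "https://example.com/a page", "--workers", "2"]
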
@ -13,7 +13,7 @@ const BlockState = {
|
|||
BLOCK_PAGE_NAV: "page",
|
||||
BLOCK_IFRAME_NAV: "iframe",
|
||||
BLOCK_OTHER: "resource",
|
||||
BLOCK_AD: "advertisement"
|
||||
BLOCK_AD: "advertisement",
|
||||
};
|
||||
|
||||
type BlockRuleDecl = {
|
||||
|
@ -21,30 +21,30 @@ type BlockRuleDecl = {
|
|||
frameTextMatch?: string;
|
||||
inFrameUrl?: string;
|
||||
type?: string;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
// ===========================================================================
|
||||
class BlockRule
|
||||
{
|
||||
class BlockRule {
|
||||
type: string;
|
||||
url: RegExp | null;
|
||||
frameTextMatch?: RegExp | null;
|
||||
inFrameUrl?: RegExp | null;
|
||||
|
||||
constructor(data: string | BlockRuleDecl) {
|
||||
if (typeof(data) === "string") {
|
||||
if (typeof data === "string") {
|
||||
this.url = new RegExp(data);
|
||||
this.type = "block";
|
||||
} else {
|
||||
this.url = data.url ? new RegExp(data.url) : null;
|
||||
this.frameTextMatch = data.frameTextMatch ? new RegExp(data.frameTextMatch) : null;
|
||||
this.frameTextMatch = data.frameTextMatch
|
||||
? new RegExp(data.frameTextMatch)
|
||||
: null;
|
||||
this.inFrameUrl = data.inFrameUrl ? new RegExp(data.inFrameUrl) : null;
|
||||
this.type = data.type || "block";
|
||||
}
|
||||
|
||||
if (!RULE_TYPES.includes(this.type)) {
|
||||
logger.fatal("Rule \"type\" must be: " + RULE_TYPES.join(", "));
|
||||
logger.fatal('Rule "type" must be: ' + RULE_TYPES.join(", "));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -59,16 +59,18 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
export class BlockRules
|
||||
{
|
||||
export class BlockRules {
|
||||
rules: BlockRule[];
|
||||
blockPutUrl: string;
|
||||
blockErrMsg: string;
|
||||
blockedUrlSet = new Set();
|
||||
|
||||
constructor(blockRules: BlockRuleDecl[], blockPutUrl: string, blockErrMsg: string) {
|
||||
constructor(
|
||||
blockRules: BlockRuleDecl[],
|
||||
blockPutUrl: string,
|
||||
blockErrMsg: string,
|
||||
) {
|
||||
this.rules = [];
|
||||
this.blockPutUrl = blockPutUrl;
|
||||
this.blockErrMsg = blockErrMsg;
|
||||
|
@ -89,11 +91,15 @@ export class BlockRules
|
|||
|
||||
async initPage(browser: Browser, page: Page) {
|
||||
const onRequest = async (request: HTTPRequest) => {
|
||||
const logDetails = {page: page.url()};
|
||||
const logDetails = { page: page.url() };
|
||||
try {
|
||||
await this.handleRequest(request, logDetails);
|
||||
} catch (e) {
|
||||
logger.warn("Error handling request", {...errJSON(e), ...logDetails}, "blocking");
|
||||
logger.warn(
|
||||
"Error handling request",
|
||||
{ ...errJSON(e), ...logDetails },
|
||||
"blocking",
|
||||
);
|
||||
}
|
||||
};
|
||||
await browser.interceptRequest(page, onRequest);
|
||||
|
@ -113,14 +119,22 @@ export class BlockRules
|
|||
} else {
|
||||
await request.abort("blockedbyclient", 1);
|
||||
}
|
||||
|
||||
} catch (e) {
|
||||
logger.debug(`Block: (${blockState}) Failed On: ${url}`, {...errJSON(e), ...logDetails}, "blocking");
|
||||
logger.debug(
|
||||
`Block: (${blockState}) Failed On: ${url}`,
|
||||
{ ...errJSON(e), ...logDetails },
|
||||
"blocking",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
async shouldBlock(request: HTTPRequest, url: string, logDetails: Record<string, any>) {
|
||||
async shouldBlock(
|
||||
request: HTTPRequest,
|
||||
url: string,
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
logDetails: Record<string, any>,
|
||||
) {
|
||||
if (!url.startsWith("http:") && !url.startsWith("https:")) {
|
||||
return BlockState.ALLOW;
|
||||
}
|
||||
|
@ -162,14 +176,29 @@ export class BlockRules
|
|||
}
|
||||
|
||||
for (const rule of this.rules) {
|
||||
const {done, block} = await this.ruleCheck(rule, request, url, frameUrl, isNavReq, logDetails);
|
||||
const { done, block } = await this.ruleCheck(
|
||||
rule,
|
||||
request,
|
||||
url,
|
||||
frameUrl,
|
||||
isNavReq,
|
||||
logDetails,
|
||||
);
|
||||
|
||||
if (block) {
|
||||
if (blockState === BlockState.BLOCK_PAGE_NAV) {
|
||||
logger.warn("Block rule match for page request ignored, set --exclude to block full pages", {url, ...logDetails}, "blocking");
|
||||
logger.warn(
|
||||
"Block rule match for page request ignored, set --exclude to block full pages",
|
||||
{ url, ...logDetails },
|
||||
"blocking",
|
||||
);
|
||||
return BlockState.ALLOW;
|
||||
}
|
||||
logger.debug("URL Blocked in iframe", {url, frameUrl, ...logDetails}, "blocking");
|
||||
logger.debug(
|
||||
"URL Blocked in iframe",
|
||||
{ url, frameUrl, ...logDetails },
|
||||
"blocking",
|
||||
);
|
||||
await this.recordBlockMsg(url);
|
||||
return blockState;
|
||||
}
|
||||
|
@ -181,47 +210,75 @@ export class BlockRules
|
|||
return BlockState.ALLOW;
|
||||
}
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
async ruleCheck(rule: BlockRule, request: HTTPRequest, reqUrl: string, frameUrl: string, isNavReq: boolean, logDetails: Record<string, any>) {
|
||||
const {url, inFrameUrl, frameTextMatch} = rule;
|
||||
async ruleCheck(
|
||||
rule: BlockRule,
|
||||
request: HTTPRequest,
|
||||
reqUrl: string,
|
||||
frameUrl: string,
|
||||
isNavReq: boolean,
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
logDetails: Record<string, any>,
|
||||
) {
|
||||
const { url, inFrameUrl, frameTextMatch } = rule;
|
||||
|
||||
const type = rule.type || "block";
|
||||
const allowOnly = (type === "allowOnly");
|
||||
const allowOnly = type === "allowOnly";
|
||||
|
||||
// not a frame match, skip rule
|
||||
if (inFrameUrl && !frameUrl.match(inFrameUrl)) {
|
||||
return {block: false, done: false};
|
||||
return { block: false, done: false };
|
||||
}
|
||||
|
||||
const urlMatched = (url && reqUrl.match(url));
|
||||
const urlMatched = url && reqUrl.match(url);
|
||||
|
||||
// if frame text-based rule: if url matched and a frame request
|
||||
// frame text-based match: only applies to nav requests, never block otherwise
|
||||
if (frameTextMatch) {
|
||||
if (!urlMatched || !isNavReq) {
|
||||
return {block: false, done: false};
|
||||
return { block: false, done: false };
|
||||
}
|
||||
|
||||
const block = await this.isTextMatch(request, reqUrl, frameTextMatch, logDetails) ? !allowOnly : allowOnly;
|
||||
logger.debug("URL Conditional rule in iframe", {...logDetails, url, rule: block ? "BLOCKED" : "ALLOWED", frameUrl}, "blocking");
|
||||
return {block, done: true};
|
||||
const block = (await this.isTextMatch(
|
||||
request,
|
||||
reqUrl,
|
||||
frameTextMatch,
|
||||
logDetails,
|
||||
))
|
||||
? !allowOnly
|
||||
: allowOnly;
|
||||
logger.debug(
|
||||
"URL Conditional rule in iframe",
|
||||
{ ...logDetails, url, rule: block ? "BLOCKED" : "ALLOWED", frameUrl },
|
||||
"blocking",
|
||||
);
|
||||
return { block, done: true };
|
||||
}
|
||||
|
||||
// for non frame text rule, simply match by URL
|
||||
const block = urlMatched ? !allowOnly : allowOnly;
|
||||
return {block, done: false};
|
||||
return { block, done: false };
|
||||
}
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
async isTextMatch(request: HTTPRequest, reqUrl: string, frameTextMatch: RegExp, logDetails: Record<string, any>) {
|
||||
async isTextMatch(
|
||||
request: HTTPRequest,
|
||||
reqUrl: string,
|
||||
frameTextMatch: RegExp,
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
logDetails: Record<string, any>,
|
||||
) {
|
||||
try {
|
||||
const res = await fetch(reqUrl);
|
||||
const text = await res.text();
|
||||
|
||||
return !!text.match(frameTextMatch);
|
||||
|
||||
} catch (e) {
|
||||
logger.debug("Error determining text match", {...errJSON(e), ...logDetails}, "blocking");
|
||||
logger.debug(
|
||||
"Error determining text match",
|
||||
{ ...errJSON(e), ...logDetails },
|
||||
"blocking",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -239,19 +296,29 @@ export class BlockRules
|
|||
const body = this.blockErrMsg;
|
||||
const putUrl = new URL(this.blockPutUrl);
|
||||
putUrl.searchParams.set("url", url);
|
||||
await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
|
||||
await fetch(putUrl.href, {
|
||||
method: "PUT",
|
||||
headers: { "Content-Type": "text/html" },
|
||||
body,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
export class AdBlockRules extends BlockRules
|
||||
{
|
||||
export class AdBlockRules extends BlockRules {
|
||||
adhosts: string[];
|
||||
|
||||
constructor(blockPutUrl: string, blockErrMsg: string, adhostsFilePath = "../../ad-hosts.json") {
|
||||
constructor(
|
||||
blockPutUrl: string,
|
||||
blockErrMsg: string,
|
||||
adhostsFilePath = "../../ad-hosts.json",
|
||||
) {
|
||||
super([], blockPutUrl, blockErrMsg);
|
||||
this.adhosts = JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url), {"encoding": "utf-8"}));
|
||||
this.adhosts = JSON.parse(
|
||||
fs.readFileSync(new URL(adhostsFilePath, import.meta.url), {
|
||||
encoding: "utf-8",
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
isAdUrl(url: string) {
|
||||
|
@ -260,10 +327,19 @@ export class AdBlockRules extends BlockRules
|
|||
return domain && this.adhosts.includes(domain);
|
||||
}
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
async shouldBlock(request: HTTPRequest, url: string, logDetails: Record<string, any>) {
|
||||
async shouldBlock(
|
||||
request: HTTPRequest,
|
||||
url: string,
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
logDetails: Record<string, any>,
|
||||
) {
|
||||
if (this.isAdUrl(url)) {
|
||||
logger.debug("URL blocked for being an ad", {url, ...logDetails}, "blocking");
|
||||
logger.debug(
|
||||
"URL blocked for being an ad",
|
||||
{ url, ...logDetails },
|
||||
"blocking",
|
||||
);
|
||||
await this.recordBlockMsg(url);
|
||||
return BlockState.BLOCK_AD;
|
||||
}
|
||||
|
|
|
@ -9,28 +9,32 @@ import path from "path";
|
|||
import { logger } from "./logger.js";
|
||||
import { initStorage } from "./storage.js";
|
||||
|
||||
import puppeteer, { Frame, HTTPRequest, Page, PuppeteerLaunchOptions, Viewport } from "puppeteer-core";
|
||||
import puppeteer, {
|
||||
Frame,
|
||||
HTTPRequest,
|
||||
Page,
|
||||
PuppeteerLaunchOptions,
|
||||
Viewport,
|
||||
} from "puppeteer-core";
|
||||
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
|
||||
|
||||
type LaunchOpts = {
|
||||
profileUrl: string;
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
chromeOptions: Record<string, any>
|
||||
chromeOptions: Record<string, any>;
|
||||
signals: boolean;
|
||||
headless: boolean;
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
emulateDevice?: Record<string, any>
|
||||
emulateDevice?: Record<string, any>;
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
ondisconnect?: ((err: any) => NonNullable<unknown>) | null
|
||||
ondisconnect?: ((err: any) => NonNullable<unknown>) | null;
|
||||
};
|
||||
|
||||
|
||||
// ==================================================================
|
||||
export class Browser
|
||||
{
|
||||
export class Browser {
|
||||
profileDir: string;
|
||||
customProfile = false;
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
|
@ -48,47 +52,58 @@ export class Browser
|
|||
this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
|
||||
}
|
||||
|
||||
async launch({profileUrl, chromeOptions, signals = false, headless = false, emulateDevice = {}, ondisconnect = null} : LaunchOpts) { if (this.isLaunched()) {
|
||||
return;
|
||||
async launch({
|
||||
profileUrl,
|
||||
chromeOptions,
|
||||
signals = false,
|
||||
headless = false,
|
||||
emulateDevice = {},
|
||||
ondisconnect = null,
|
||||
}: LaunchOpts) {
|
||||
if (this.isLaunched()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (profileUrl) {
|
||||
this.customProfile = await this.loadProfile(profileUrl);
|
||||
}
|
||||
|
||||
this.emulateDevice = emulateDevice;
|
||||
|
||||
const args = this.chromeArgs(chromeOptions);
|
||||
|
||||
let defaultViewport = null;
|
||||
|
||||
if (process.env.GEOMETRY) {
|
||||
const geom = process.env.GEOMETRY.split("x");
|
||||
|
||||
defaultViewport = { width: Number(geom[0]), height: Number(geom[1]) };
|
||||
}
|
||||
|
||||
const launchOpts: PuppeteerLaunchOptions = {
|
||||
args,
|
||||
headless: headless ? "new" : false,
|
||||
executablePath: this.getBrowserExe(),
|
||||
ignoreDefaultArgs: ["--enable-automation", "--hide-scrollbars"],
|
||||
ignoreHTTPSErrors: true,
|
||||
handleSIGHUP: signals,
|
||||
handleSIGINT: signals,
|
||||
handleSIGTERM: signals,
|
||||
protocolTimeout: 0,
|
||||
|
||||
defaultViewport,
|
||||
waitForInitialPage: false,
|
||||
userDataDir: this.profileDir,
|
||||
};
|
||||
|
||||
await this._init(launchOpts, ondisconnect);
|
||||
}
|
||||
|
||||
if (profileUrl) {
|
||||
this.customProfile = await this.loadProfile(profileUrl);
|
||||
}
|
||||
|
||||
this.emulateDevice = emulateDevice;
|
||||
|
||||
const args = this.chromeArgs(chromeOptions);
|
||||
|
||||
let defaultViewport = null;
|
||||
|
||||
if (process.env.GEOMETRY) {
|
||||
const geom = process.env.GEOMETRY.split("x");
|
||||
|
||||
defaultViewport = {width: Number(geom[0]), height: Number(geom[1])};
|
||||
}
|
||||
|
||||
const launchOpts : PuppeteerLaunchOptions = {
|
||||
args,
|
||||
headless: headless ? "new" : false,
|
||||
executablePath: this.getBrowserExe(),
|
||||
ignoreDefaultArgs: ["--enable-automation", "--hide-scrollbars"],
|
||||
ignoreHTTPSErrors: true,
|
||||
handleSIGHUP: signals,
|
||||
handleSIGINT: signals,
|
||||
handleSIGTERM: signals,
|
||||
protocolTimeout: 0,
|
||||
|
||||
defaultViewport,
|
||||
waitForInitialPage: false,
|
||||
userDataDir: this.profileDir
|
||||
};
|
||||
|
||||
await this._init(launchOpts, ondisconnect);
|
||||
}
|
||||
|
||||
async setupPage({page} : {page: Page, cdp: CDPSession}) {
|
||||
await this.addInitScript(page, "Object.defineProperty(navigator, \"webdriver\", {value: false});");
|
||||
async setupPage({ page }: { page: Page; cdp: CDPSession }) {
|
||||
await this.addInitScript(
|
||||
page,
|
||||
'Object.defineProperty(navigator, "webdriver", {value: false});',
|
||||
);
|
||||
|
||||
if (this.customProfile) {
|
||||
logger.info("Disabling Service Workers for profile", {}, "browser");
|
||||
|
@ -97,20 +112,26 @@ export class Browser
|
|||
}
|
||||
}
|
||||
|
||||
async loadProfile(profileFilename: string) : Promise<boolean> {
|
||||
async loadProfile(profileFilename: string): Promise<boolean> {
|
||||
const targetFilename = "/tmp/profile.tar.gz";
|
||||
|
||||
if (profileFilename &&
|
||||
(profileFilename.startsWith("http:") || profileFilename.startsWith("https:"))) {
|
||||
|
||||
logger.info(`Downloading ${profileFilename} to ${targetFilename}`, {}, "browserProfile");
|
||||
if (
|
||||
profileFilename &&
|
||||
(profileFilename.startsWith("http:") ||
|
||||
profileFilename.startsWith("https:"))
|
||||
) {
|
||||
logger.info(
|
||||
`Downloading ${profileFilename} to ${targetFilename}`,
|
||||
{},
|
||||
"browserProfile",
|
||||
);
|
||||
|
||||
const resp = await fetch(profileFilename);
|
||||
await pipeline(
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
Readable.fromWeb(resp.body as any),
|
||||
fs.createWriteStream(targetFilename)
|
||||
fs.createWriteStream(targetFilename),
|
||||
);
|
||||
|
||||
profileFilename = targetFilename;
|
||||
|
@ -118,7 +139,9 @@ export class Browser
|
|||
const storage = initStorage();
|
||||
|
||||
if (!storage) {
|
||||
logger.fatal("Profile specified relative to s3 storage, but no S3 storage defined");
|
||||
logger.fatal(
|
||||
"Profile specified relative to s3 storage, but no S3 storage defined",
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -129,7 +152,9 @@ export class Browser
|
|||
|
||||
if (profileFilename) {
|
||||
try {
|
||||
child_process.execSync("tar xvfz " + profileFilename, {cwd: this.profileDir});
|
||||
child_process.execSync("tar xvfz " + profileFilename, {
|
||||
cwd: this.profileDir,
|
||||
});
|
||||
return true;
|
||||
} catch (e) {
|
||||
logger.error(`Profile filename ${profileFilename} not a valid tar.gz`);
|
||||
|
@ -140,10 +165,12 @@ export class Browser
|
|||
}
|
||||
|
||||
saveProfile(profileFilename: string) {
|
||||
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: this.profileDir});
|
||||
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {
|
||||
cwd: this.profileDir,
|
||||
});
|
||||
}
|
||||
|
||||
chromeArgs({proxy=true, userAgent=null, extraArgs=[]} = {}) {
|
||||
chromeArgs({ proxy = true, userAgent = null, extraArgs = [] } = {}) {
|
||||
// Chrome Flags, including proxy server
|
||||
const args = [
|
||||
// eslint-disable-next-line no-use-before-define
|
||||
|
@ -162,25 +189,29 @@ export class Browser
|
|||
|
||||
if (proxy) {
|
||||
args.push("--ignore-certificate-errors");
|
||||
args.push(`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`);
|
||||
args.push(
|
||||
`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
|
||||
);
|
||||
}
|
||||
|
||||
return args;
|
||||
}
|
||||
|
||||
getDefaultUA() {
|
||||
let version : string | undefined = process.env.BROWSER_VERSION;
|
||||
let version: string | undefined = process.env.BROWSER_VERSION;
|
||||
|
||||
try {
|
||||
const browser = this.getBrowserExe();
|
||||
if (browser) {
|
||||
version = child_process.execFileSync(browser, ["--version"], {encoding: "utf8"});
|
||||
version = child_process.execFileSync(browser, ["--version"], {
|
||||
encoding: "utf8",
|
||||
});
|
||||
const match = version && version.match(/[\d.]+/);
|
||||
if (match) {
|
||||
version = match[0];
|
||||
}
|
||||
}
|
||||
} catch(e) {
|
||||
} catch (e) {
|
||||
console.error(e);
|
||||
}
|
||||
|
||||
|
@ -188,7 +219,11 @@ export class Browser
|
|||
}
|
||||
|
||||
getBrowserExe() {
|
||||
const files = [process.env.BROWSER_BIN, "/usr/bin/google-chrome", "/usr/bin/chromium-browser"];
|
||||
const files = [
|
||||
process.env.BROWSER_BIN,
|
||||
"/usr/bin/google-chrome",
|
||||
"/usr/bin/chromium-browser",
|
||||
];
|
||||
for (const file of files) {
|
||||
if (file && fs.existsSync(file)) {
|
||||
return file;
|
||||
|
@ -196,14 +231,25 @@ export class Browser
|
|||
}
|
||||
}
|
||||
|
||||
async evaluateWithCLI_(cdp: CDPSession, frame: Frame, cdpContextId: number, funcString: string, logData: Record<string, string>, contextName: string) {
|
||||
async evaluateWithCLI_(
|
||||
cdp: CDPSession,
|
||||
frame: Frame,
|
||||
cdpContextId: number,
|
||||
funcString: string,
|
||||
logData: Record<string, string>,
|
||||
contextName: string,
|
||||
) {
|
||||
const frameUrl = frame.url();
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
let details : Record<string, any> = {frameUrl, ...logData};
|
||||
let details: Record<string, any> = { frameUrl, ...logData };
|
||||
|
||||
if (!frameUrl || frame.isDetached()) {
|
||||
logger.info("Run Script Skipped, frame no longer attached or has no URL", details, contextName);
|
||||
logger.info(
|
||||
"Run Script Skipped, frame no longer attached or has no URL",
|
||||
details,
|
||||
contextName,
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -213,19 +259,22 @@ export class Browser
|
|||
//const contextId = context._contextId;
|
||||
const expression = funcString + "\n//# sourceURL=__evaluation_script__";
|
||||
|
||||
const { exceptionDetails, result } = await cdp
|
||||
.send("Runtime.evaluate", {
|
||||
expression,
|
||||
contextId: cdpContextId,
|
||||
returnByValue: true,
|
||||
awaitPromise: true,
|
||||
userGesture: true,
|
||||
includeCommandLineAPI: true,
|
||||
});
|
||||
const { exceptionDetails, result } = await cdp.send("Runtime.evaluate", {
|
||||
expression,
|
||||
contextId: cdpContextId,
|
||||
returnByValue: true,
|
||||
awaitPromise: true,
|
||||
userGesture: true,
|
||||
includeCommandLineAPI: true,
|
||||
});
|
||||
|
||||
if (exceptionDetails) {
|
||||
if (exceptionDetails.stackTrace) {
|
||||
details = {...exceptionDetails.stackTrace, text: exceptionDetails.text, ...details};
|
||||
details = {
|
||||
...exceptionDetails.stackTrace,
|
||||
text: exceptionDetails.text,
|
||||
...details,
|
||||
};
|
||||
}
|
||||
logger.error("Run Script Failed", details, contextName);
|
||||
} else {
|
||||
|
@ -256,8 +305,11 @@ export class Browser
|
|||
return page.evaluateOnNewDocument(script);
|
||||
}
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/ban-types
|
||||
async _init(launchOpts: PuppeteerLaunchOptions, ondisconnect : Function | null = null) {
|
||||
async _init(
|
||||
launchOpts: PuppeteerLaunchOptions,
|
||||
// eslint-disable-next-line @typescript-eslint/ban-types
|
||||
ondisconnect: Function | null = null,
|
||||
) {
|
||||
this.browser = await puppeteer.launch(launchOpts);
|
||||
|
||||
const target = this.browser.target();
|
||||
|
@ -274,9 +326,10 @@ export class Browser
|
|||
});
|
||||
}
|
||||
|
||||
async newWindowPageWithCDP() : Promise<{cdp: CDPSession, page: Page}> {
|
||||
async newWindowPageWithCDP(): Promise<{ cdp: CDPSession; page: Page }> {
|
||||
// unique url to detect new pages
|
||||
const startPage = "about:blank?_browsertrix" + Math.random().toString(36).slice(2);
|
||||
const startPage =
|
||||
"about:blank?_browsertrix" + Math.random().toString(36).slice(2);
|
||||
|
||||
const p = new Promise<Target>((resolve) => {
|
||||
const listener = (target: Target) => {
|
||||
|
@ -298,7 +351,10 @@ export class Browser
|
|||
}
|
||||
|
||||
try {
|
||||
await this.firstCDP.send("Target.createTarget", {url: startPage, newWindow: true});
|
||||
await this.firstCDP.send("Target.createTarget", {
|
||||
url: startPage,
|
||||
newWindow: true,
|
||||
});
|
||||
} catch (e) {
|
||||
if (!this.browser) {
|
||||
throw e;
|
||||
|
@ -307,7 +363,10 @@ export class Browser
|
|||
|
||||
this.firstCDP = await target.createCDPSession();
|
||||
|
||||
await this.firstCDP.send("Target.createTarget", {url: startPage, newWindow: true});
|
||||
await this.firstCDP.send("Target.createTarget", {
|
||||
url: startPage,
|
||||
newWindow: true,
|
||||
});
|
||||
}
|
||||
|
||||
const target = await p;
|
||||
|
@ -331,7 +390,7 @@ export class Browser
|
|||
|
||||
const cdp = await target.createCDPSession();
|
||||
|
||||
return {page, cdp};
|
||||
return { page, cdp };
|
||||
}
|
||||
|
||||
async serviceWorkerFetch() {
|
||||
|
@ -348,9 +407,13 @@ export class Browser
|
|||
|
||||
if (networkId) {
|
||||
try {
|
||||
await this.firstCDP.send("Fetch.continueResponse", {requestId});
|
||||
await this.firstCDP.send("Fetch.continueResponse", { requestId });
|
||||
} catch (e) {
|
||||
logger.warn("continueResponse failed", {url: request.url}, "recorder");
|
||||
logger.warn(
|
||||
"continueResponse failed",
|
||||
{ url: request.url },
|
||||
"recorder",
|
||||
);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -369,12 +432,20 @@ export class Browser
|
|||
}
|
||||
|
||||
if (!foundRecorder) {
|
||||
logger.debug("Skipping URL from unknown frame", {url: request.url, frameId}, "recorder");
|
||||
logger.debug(
|
||||
"Skipping URL from unknown frame",
|
||||
{ url: request.url, frameId },
|
||||
"recorder",
|
||||
);
|
||||
|
||||
try {
|
||||
await this.firstCDP.send("Fetch.continueResponse", {requestId});
|
||||
await this.firstCDP.send("Fetch.continueResponse", { requestId });
|
||||
} catch (e) {
|
||||
logger.warn("continueResponse failed", {url: request.url}, "recorder");
|
||||
logger.warn(
|
||||
"continueResponse failed",
|
||||
{ url: request.url },
|
||||
"recorder",
|
||||
);
|
||||
}
|
||||
|
||||
return;
|
||||
|
@ -383,11 +454,13 @@ export class Browser
|
|||
await foundRecorder.handleRequestPaused(params, this.firstCDP, true);
|
||||
});
|
||||
|
||||
await this.firstCDP.send("Fetch.enable", {patterns: [{urlPattern: "*", requestStage: "Response"}]});
|
||||
await this.firstCDP.send("Fetch.enable", {
|
||||
patterns: [{ urlPattern: "*", requestStage: "Response" }],
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
|
||||
|
||||
async evaluateWithCLI(
|
||||
_: unknown,
|
||||
frame: Frame,
|
||||
|
@ -395,21 +468,28 @@ export class Browser
|
|||
funcString: string,
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
logData: Record<string, any>,
|
||||
contextName: string
|
||||
contextName: string,
|
||||
) {
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const context = await (frame as any).executionContext();
|
||||
cdp = context._client;
|
||||
const cdpContextId = context._contextId;
|
||||
return await this.evaluateWithCLI_(cdp, frame, cdpContextId, funcString, logData, contextName);
|
||||
return await this.evaluateWithCLI_(
|
||||
cdp,
|
||||
frame,
|
||||
cdpContextId,
|
||||
funcString,
|
||||
logData,
|
||||
contextName,
|
||||
);
|
||||
}
|
||||
|
||||
interceptRequest(page: Page, callback: (event: HTTPRequest) => void) {
|
||||
page.on("request", callback);
|
||||
}
|
||||
|
||||
async waitForNetworkIdle(page: Page, params: {timeout?: number}) {
File diff suppressed because it is too large
||||
return await page.waitForNetworkIdle(params);
|
||||
}
|
||||
|
||||
|
@ -428,7 +508,6 @@ export class Browser
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// ==================================================================
|
||||
// Default Chromium args from playwright
|
||||
export const defaultArgs = [
|
||||
|
@ -470,5 +549,5 @@ export const defaultArgs = [
|
|||
"--apps-gallery-url=https://invalid.webstore.example.com/",
|
||||
"--apps-gallery-update-url=https://invalid.webstore.example.com/",
|
||||
"--component-updater=url-source=http://invalid.dev/",
|
||||
"--brave-stats-updater-server=url-source=http://invalid.dev/"
|
||||
"--brave-stats-updater-server=url-source=http://invalid.dev/",
|
||||
];
|
||||
|
|
|
@@ -1,15 +1,24 @@

export const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
export const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
export const HTML_TYPES = [
"text/html",
"application/xhtml",
"application/xhtml+xml",
];
export const WAIT_UNTIL_OPTS = [
"load",
"domcontentloaded",
"networkidle0",
"networkidle2",
];
export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];

export const BEHAVIOR_LOG_FUNC = "__bx_log";
export const ADD_LINK_FUNC = "__bx_addLink";
export const MAX_DEPTH = 1000000;

export const DEFAULT_SELECTORS = [{
selector: "a[href]",
extract: "href",
isAttribute: false
}];

export const DEFAULT_SELECTORS = [
{
selector: "a[href]",
extract: "href",
isAttribute: false,
},
];

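DEFAULT_SELECTORS above uses the { selector, extract, isAttribute } shape for link extraction. A hedged sketch of what an additional entry of the same shape could look like (the img[src] entry is hypothetical and not part of this change):

const selectors = [
  // the default entry, as in constants.ts above
  { selector: "a[href]", extract: "href", isAttribute: false },
  // hypothetical extra entry: pull the src attribute directly from images
  { selector: "img[src]", extract: "src", isAttribute: true },
];
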
@@ -3,11 +3,17 @@ import path from "path";

const MAX_DEPTH = 2;

export function collectAllFileSources(fileOrDir: string, ext?: string, depth = 0) : string[] {
export function collectAllFileSources(
fileOrDir: string,
ext?: string,
depth = 0,
): string[] {
const resolvedPath = path.resolve(fileOrDir);

if (depth >= MAX_DEPTH) {
console.warn(`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`);
console.warn(
`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
);
return [];
}

@@ -27,7 +33,9 @@ export function collectAllFileSources(fileOrDir: string, ext?: string, depth = 0
}

if (depth === 0) {
console.warn(`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`);
console.warn(
`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`,
);
}

return [];

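A brief usage sketch for collectAllFileSources above; the directory path and extension are assumptions for illustration, and this is presumably the helper behind options such as --customBehaviors that accept a file or a directory of behavior files:

// Hypothetical call; traversal warns and returns [] once MAX_DEPTH (2) nested levels are reached.
const sources: string[] = collectAllFileSources("/custom-behaviors", ".js");
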
@ -2,10 +2,8 @@ import http from "http";
|
|||
import url from "url";
|
||||
import { logger } from "./logger.js";
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
export class HealthChecker
|
||||
{
|
||||
export class HealthChecker {
|
||||
port: number;
|
||||
errorThreshold: number;
|
||||
healthServer: http.Server;
|
||||
|
@ -16,7 +14,9 @@ export class HealthChecker
|
|||
this.port = port;
|
||||
this.errorThreshold = errorThreshold;
|
||||
|
||||
this.healthServer = http.createServer((...args) => this.healthCheck(...args));
|
||||
this.healthServer = http.createServer((...args) =>
|
||||
this.healthCheck(...args),
|
||||
);
|
||||
logger.info(`Healthcheck server started on ${port}`, {}, "healthcheck");
|
||||
this.healthServer.listen(port);
|
||||
}
|
||||
|
@ -24,23 +24,35 @@ export class HealthChecker
|
|||
async healthCheck(req: http.IncomingMessage, res: http.ServerResponse) {
|
||||
const pathname = req.url ? url.parse(req.url).pathname : "";
|
||||
switch (pathname) {
|
||||
case "/healthz":
|
||||
if (this.errorCount < this.errorThreshold) {
|
||||
logger.debug(`health check ok, num errors ${this.errorCount} < ${this.errorThreshold}`, {}, "healthcheck");
|
||||
res.writeHead(200);
|
||||
res.end();
|
||||
}
|
||||
return;
|
||||
case "/healthz":
|
||||
if (this.errorCount < this.errorThreshold) {
|
||||
logger.debug(
|
||||
`health check ok, num errors ${this.errorCount} < ${this.errorThreshold}`,
|
||||
{},
|
||||
"healthcheck",
|
||||
);
|
||||
res.writeHead(200);
|
||||
res.end();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
logger.error(`health check failed: ${this.errorCount} >= ${this.errorThreshold}`, {}, "healthcheck");
|
||||
logger.error(
|
||||
`health check failed: ${this.errorCount} >= ${this.errorThreshold}`,
|
||||
{},
|
||||
"healthcheck",
|
||||
);
|
||||
res.writeHead(503);
|
||||
res.end();
|
||||
}
|
||||
|
||||
resetErrors() {
|
||||
if (this.errorCount > 0) {
|
||||
logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`, {}, "healthcheck");
|
||||
logger.info(
|
||||
`Page loaded, resetting error count ${this.errorCount} to 0`,
|
||||
{},
|
||||
"healthcheck",
|
||||
);
|
||||
this.errorCount = 0;
|
||||
}
|
||||
}
|
||||
|
@ -49,4 +61,3 @@ export class HealthChecker
|
|||
this.errorCount++;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -5,30 +5,29 @@ import { Writable } from "node:stream";
|
|||
import { RedisCrawlState } from "./state.js";
|
||||
|
||||
// RegExp.prototype.toJSON = RegExp.prototype.toString;
|
||||
Object.defineProperty(RegExp.prototype, "toJSON", { value: RegExp.prototype.toString });
|
||||
|
||||
Object.defineProperty(RegExp.prototype, "toJSON", {
|
||||
value: RegExp.prototype.toString,
|
||||
});
|
||||
|
||||
// ===========================================================================
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
export function errJSON(e: any) {
|
||||
if (e instanceof Error) {
|
||||
return {"type": "exception", "message": e.message, "stack": e.stack};
|
||||
return { type: "exception", message: e.message, stack: e.stack };
|
||||
} else {
|
||||
return {"message": e.toString()};
|
||||
return { message: e.toString() };
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
class Logger
|
||||
{
|
||||
logStream : Writable | null = null;
|
||||
class Logger {
|
||||
logStream: Writable | null = null;
|
||||
debugLogging = false;
|
||||
logErrorsToRedis = false;
|
||||
logLevels : string[] = [];
|
||||
contexts : string[] = [];
|
||||
crawlState? : RedisCrawlState | null = null;
|
||||
logLevels: string[] = [];
|
||||
contexts: string[] = [];
|
||||
crawlState?: RedisCrawlState | null = null;
|
||||
fatalExitCode = 17;
|
||||
|
||||
setDefaultFatalExitCode(exitCode: number) {
|
||||
|
@ -60,18 +59,18 @@ class Logger
|
|||
}
|
||||
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
|
||||
|
||||
logAsJSON(
|
||||
message: string,
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
data: Record<string, string> | Error | any,
|
||||
context: string,
|
||||
logLevel="info"
|
||||
logLevel = "info",
|
||||
) {
|
||||
if (data instanceof Error) {
|
||||
data = errJSON(data);
|
||||
} else if (typeof data !== "object") {
|
||||
data = {"message": data.toString()};
|
||||
data = { message: data.toString() };
|
||||
}
|
||||
|
||||
if (this.logLevels.length) {
|
||||
|
@ -87,11 +86,11 @@ class Logger
|
|||
}
|
||||
|
||||
const dataToLog = {
|
||||
"timestamp": new Date().toISOString(),
|
||||
"logLevel": logLevel,
|
||||
"context": context,
|
||||
"message": message,
|
||||
"details": data ? data : {}
|
||||
timestamp: new Date().toISOString(),
|
||||
logLevel: logLevel,
|
||||
context: context,
|
||||
message: message,
|
||||
details: data ? data : {},
|
||||
};
|
||||
const string = JSON.stringify(dataToLog);
|
||||
console.log(string);
|
||||
|
@ -100,30 +99,34 @@ class Logger
|
|||
}
|
||||
|
||||
const toLogToRedis = ["error", "fatal"];
|
||||
if (this.logErrorsToRedis && this.crawlState && toLogToRedis.includes(logLevel)) {
|
||||
if (
|
||||
this.logErrorsToRedis &&
|
||||
this.crawlState &&
|
||||
toLogToRedis.includes(logLevel)
|
||||
) {
|
||||
this.crawlState.logError(string);
|
||||
}
|
||||
}
|
||||
|
||||
info(message: string, data={}, context="general") {
|
||||
info(message: string, data = {}, context = "general") {
|
||||
this.logAsJSON(message, data, context);
|
||||
}
|
||||
|
||||
error(message: string, data={}, context="general") {
|
||||
error(message: string, data = {}, context = "general") {
|
||||
this.logAsJSON(message, data, context, "error");
|
||||
}
|
||||
|
||||
warn(message: string, data={}, context="general") {
|
||||
warn(message: string, data = {}, context = "general") {
|
||||
this.logAsJSON(message, data, context, "warn");
|
||||
}
|
||||
|
||||
debug(message: string, data={}, context="general") {
|
||||
debug(message: string, data = {}, context = "general") {
|
||||
if (this.debugLogging) {
|
||||
this.logAsJSON(message, data, context, "debug");
|
||||
}
|
||||
}
|
||||
|
||||
fatal(message: string, data={}, context="general", exitCode=0) {
|
||||
fatal(message: string, data = {}, context = "general", exitCode = 0) {
|
||||
exitCode = exitCode || this.fatalExitCode;
|
||||
this.logAsJSON(`${message}. Quitting`, data, context, "fatal");
|
||||
|
||||
|
|
|
@ -2,9 +2,8 @@ import { HTTPRequest, Page } from "puppeteer-core";
|
|||
import { errJSON, logger } from "./logger.js";
|
||||
import { Browser } from "./browser.js";
|
||||
|
||||
export class OriginOverride
|
||||
{
|
||||
originOverride: {origUrl: URL, destUrl: URL}[];
|
||||
export class OriginOverride {
|
||||
originOverride: { origUrl: URL; destUrl: URL }[];
|
||||
|
||||
constructor(originOverride: string[]) {
|
||||
this.originOverride = originOverride.map((override) => {
|
||||
|
@ -12,7 +11,7 @@ export class OriginOverride
|
|||
const origUrl = new URL(orig);
|
||||
const destUrl = new URL(dest);
|
||||
|
||||
return {origUrl, destUrl};
|
||||
return { origUrl, destUrl };
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -24,7 +23,7 @@ export class OriginOverride
|
|||
let newUrl = null;
|
||||
let orig = null;
|
||||
|
||||
for (const {origUrl, destUrl} of this.originOverride) {
|
||||
for (const { origUrl, destUrl } of this.originOverride) {
|
||||
if (url.startsWith(origUrl.origin)) {
|
||||
newUrl = destUrl.origin + url.slice(origUrl.origin.length);
|
||||
orig = origUrl;
|
||||
|
@ -44,18 +43,25 @@ export class OriginOverride
|
|||
headers.set("origin", orig.origin);
|
||||
}
|
||||
|
||||
const resp = await fetch(newUrl, {headers});
|
||||
const resp = await fetch(newUrl, { headers });
|
||||
|
||||
const body = Buffer.from(await resp.arrayBuffer());
|
||||
const respHeaders = Object.fromEntries(resp.headers);
|
||||
const status = resp.status;
|
||||
|
||||
logger.debug("Origin overridden", {orig: url, dest: newUrl, status, body: body.length}, "originoverride");
|
||||
|
||||
request.respond({body, headers: respHeaders, status}, -1);
|
||||
logger.debug(
|
||||
"Origin overridden",
|
||||
{ orig: url, dest: newUrl, status, body: body.length },
|
||||
"originoverride",
|
||||
);
|
||||
|
||||
request.respond({ body, headers: respHeaders, status }, -1);
|
||||
} catch (e) {
|
||||
logger.warn("Error overriding origin", {...errJSON(e), url: page.url()}, "originoverride");
|
||||
logger.warn(
|
||||
"Error overriding origin",
|
||||
{ ...errJSON(e), url: page.url() },
|
||||
"originoverride",
|
||||
);
|
||||
request.continue({}, -1);
|
||||
}
|
||||
};
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@@ -14,14 +14,13 @@ console.error = function (...args) {
typeof args[0] === "string" &&
args[0].indexOf("[ioredis] Unhandled error event") === 0
) {

const now = Date.now();

if ((now - lastLogTime) > REDIS_ERROR_LOG_INTERVAL_SECS) {
if (now - lastLogTime > REDIS_ERROR_LOG_INTERVAL_SECS) {
if (lastLogTime && exitOnError) {
logger.fatal("Crawl interrupted, redis gone, exiting", {}, "redis");
}
logger.warn("ioredis error", {error: args[0]}, "redis");
logger.warn("ioredis error", { error: args[0] }, "redis");
lastLogTime = now;
}
return;

@@ -30,7 +29,7 @@ console.error = function (...args) {
};

export async function initRedis(url: string) {
const redis = new Redis(url, {lazyConnect: true});
const redis = new Redis(url, { lazyConnect: true });
await redis.connect();
return redis;
}

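A minimal usage sketch of initRedis as reformatted above; the connection URL and key are examples, not from the diff:

// lazyConnect means the client only opens the connection when connect() is awaited, as above.
const redis = await initRedis("redis://localhost:6379/0");
await redis.set("example-key", "example-value"); // assumes the standard ioredis set() API
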
|
@ -7,10 +7,8 @@ const CONTENT_LENGTH = "content-length";
|
|||
const CONTENT_TYPE = "content-type";
|
||||
const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"];
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
export class RequestResponseInfo
|
||||
{
|
||||
export class RequestResponseInfo {
|
||||
_created: Date = new Date();
|
||||
|
||||
requestId: string;
|
||||
|
@ -33,7 +31,7 @@ export class RequestResponseInfo
|
|||
statusText?: string;
|
||||
|
||||
responseHeaders?: Record<string, string>;
|
||||
responseHeadersList?: {name: string, value: string}[];
|
||||
responseHeadersList?: { name: string; value: string }[];
|
||||
responseHeadersText?: string;
|
||||
|
||||
payload?: Uint8Array;
|
||||
|
@ -79,7 +77,6 @@ export class RequestResponseInfo
|
|||
if (params.type) {
|
||||
this.resourceType = params.type;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
|
@ -100,12 +97,17 @@ export class RequestResponseInfo
|
|||
|
||||
fillResponse(response: Protocol.Network.Response) {
|
||||
// if initial fetch was a 200, but now replacing with 304, don't!
|
||||
if (response.status == 304 && this.status && this.status != 304 && this.url) {
|
||||
if (
|
||||
response.status == 304 &&
|
||||
this.status &&
|
||||
this.status != 304 &&
|
||||
this.url
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
this.url = response.url.split("#")[0];
|
||||
|
||||
|
||||
this.status = response.status;
|
||||
this.statusText = response.statusText || getStatusText(this.status);
|
||||
|
||||
|
@ -124,12 +126,16 @@ export class RequestResponseInfo
|
|||
this.responseHeadersText = response.headersText;
|
||||
}
|
||||
|
||||
this.fromServiceWorker = !!response.fromServiceWorker;
|
||||
this.fromServiceWorker = !!response.fromServiceWorker;
|
||||
|
||||
if (response.securityDetails) {
|
||||
const issuer : string = response.securityDetails.issuer || "";
|
||||
const ctc : string = response.securityDetails.certificateTransparencyCompliance === "compliant" ? "1" : "0";
|
||||
this.extraOpts.cert = {issuer, ctc};
|
||||
const issuer: string = response.securityDetails.issuer || "";
|
||||
const ctc: string =
|
||||
response.securityDetails.certificateTransparencyCompliance ===
|
||||
"compliant"
|
||||
? "1"
|
||||
: "0";
|
||||
this.extraOpts.cert = { issuer, ctc };
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -161,7 +167,6 @@ export class RequestResponseInfo
|
|||
this.responseHeaders = Object.fromEntries(response.headers);
|
||||
this.status = response.status;
|
||||
this.statusText = response.statusText || getStatusText(this.status);
|
||||
|
||||
}
|
||||
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
|
@ -175,7 +180,10 @@ export class RequestResponseInfo
|
|||
|
||||
if (this.responseHeaders) {
|
||||
for (const header of Object.keys(this.responseHeaders)) {
|
||||
headers += `${header}: ${this.responseHeaders[header].replace(/\n/g, ", ")}\r\n`;
|
||||
headers += `${header}: ${this.responseHeaders[header].replace(
|
||||
/\n/g,
|
||||
", ",
|
||||
)}\r\n`;
|
||||
}
|
||||
}
|
||||
headers += "\r\n";
|
||||
|
@ -191,10 +199,18 @@ export class RequestResponseInfo
|
|||
}
|
||||
|
||||
getResponseHeadersDict(length = 0) {
|
||||
return this._getHeadersDict(this.responseHeaders, this.responseHeadersList, length);
|
||||
return this._getHeadersDict(
|
||||
this.responseHeaders,
|
||||
this.responseHeadersList,
|
||||
length,
|
||||
);
|
||||
}
|
||||
|
||||
_getHeadersDict(headersDict?: Record<string, string>, headersList?: {name: string, value: string}[], actualContentLength = 0) {
|
||||
_getHeadersDict(
|
||||
headersDict?: Record<string, string>,
|
||||
headersList?: { name: string; value: string }[],
|
||||
actualContentLength = 0,
|
||||
) {
|
||||
if (!headersDict && headersList) {
|
||||
headersDict = {};
|
||||
|
||||
|
|
|
@ -9,12 +9,13 @@ import { Duplex } from "stream";
|
|||
import { CDPSession, Page } from "puppeteer-core";
|
||||
import { WorkerId } from "./state.js";
|
||||
|
||||
const indexHTML = fs.readFileSync(new URL("../../html/screencast.html", import.meta.url), {encoding: "utf8"});
|
||||
|
||||
const indexHTML = fs.readFileSync(
|
||||
new URL("../../html/screencast.html", import.meta.url),
|
||||
{ encoding: "utf8" },
|
||||
);
|
||||
|
||||
// ===========================================================================
|
||||
class WSTransport
|
||||
{
|
||||
class WSTransport {
|
||||
allWS = new Set<WebSocket>();
|
||||
// eslint-disable-next-line no-use-before-define
|
||||
caster!: ScreenCaster;
|
||||
|
@ -22,7 +23,6 @@ class WSTransport
|
|||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
httpServer: any;
|
||||
|
||||
|
||||
constructor(port: number) {
|
||||
this.allWS = new Set();
|
||||
|
@ -31,16 +31,21 @@ class WSTransport
|
|||
|
||||
this.wss.on("connection", (ws: WebSocket) => this.initWebSocket(ws));
|
||||
|
||||
this.httpServer = http.createServer((...args) => this.handleRequest(...args));
|
||||
this.httpServer.on("upgrade", (request: IncomingMessage, socket: Duplex, head: Buffer) => {
|
||||
const pathname = url.parse(request.url || "").pathname;
|
||||
this.httpServer = http.createServer((...args) =>
|
||||
this.handleRequest(...args),
|
||||
);
|
||||
this.httpServer.on(
|
||||
"upgrade",
|
||||
(request: IncomingMessage, socket: Duplex, head: Buffer) => {
|
||||
const pathname = url.parse(request.url || "").pathname;
|
||||
|
||||
if (pathname === "/ws") {
|
||||
this.wss.handleUpgrade(request, socket, head, (ws) => {
|
||||
this.wss.emit("connection", ws, request);
|
||||
});
|
||||
}
|
||||
});
|
||||
if (pathname === "/ws") {
|
||||
this.wss.handleUpgrade(request, socket, head, (ws) => {
|
||||
this.wss.emit("connection", ws, request);
|
||||
});
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
this.httpServer.listen(port);
|
||||
}
|
||||
|
@ -48,13 +53,13 @@ class WSTransport
|
|||
async handleRequest(req: IncomingMessage, res: ServerResponse) {
|
||||
const pathname = url.parse(req.url || "").pathname;
|
||||
switch (pathname) {
|
||||
case "/":
|
||||
res.writeHead(200, {"Content-Type": "text/html"});
|
||||
res.end(indexHTML);
|
||||
return;
|
||||
case "/":
|
||||
res.writeHead(200, { "Content-Type": "text/html" });
|
||||
res.end(indexHTML);
|
||||
return;
|
||||
}
|
||||
|
||||
res.writeHead(404, {"Content-Type": "text/html"});
|
||||
res.writeHead(404, { "Content-Type": "text/html" });
|
||||
res.end("Not Found");
|
||||
}
|
||||
|
||||
|
@ -65,7 +70,11 @@ class WSTransport
|
|||
|
||||
this.allWS.add(ws);
|
||||
|
||||
logger.debug("New Screencast Conn", {total: this.allWS.size}, "screencast");
|
||||
logger.debug(
|
||||
"New Screencast Conn",
|
||||
{ total: this.allWS.size },
|
||||
"screencast",
|
||||
);
|
||||
|
||||
if (this.allWS.size === 1) {
|
||||
this.caster.startCastAll();
|
||||
|
@ -95,10 +104,8 @@ class WSTransport
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
class RedisPubSubTransport
|
||||
{
|
||||
class RedisPubSubTransport {
|
||||
numConnections: number = 0;
|
||||
castChannel: string;
|
||||
// eslint-disable-next-line no-use-before-define
|
||||
|
@ -128,23 +135,23 @@ class RedisPubSubTransport
|
|||
}
|
||||
|
||||
switch (message) {
|
||||
case "connect":
|
||||
this.numConnections++;
|
||||
if (this.numConnections === 1) {
|
||||
this.caster.startCastAll();
|
||||
} else {
|
||||
for (const packet of this.caster.iterCachedData()) {
|
||||
await this.sendAll(packet);
|
||||
case "connect":
|
||||
this.numConnections++;
|
||||
if (this.numConnections === 1) {
|
||||
this.caster.startCastAll();
|
||||
} else {
|
||||
for (const packet of this.caster.iterCachedData()) {
|
||||
await this.sendAll(packet);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
break;
|
||||
|
||||
case "disconnect":
|
||||
this.numConnections--;
|
||||
if (this.numConnections === 0) {
|
||||
this.caster.stopCastAll();
|
||||
}
|
||||
break;
|
||||
case "disconnect":
|
||||
this.numConnections--;
|
||||
if (this.numConnections === 0) {
|
||||
this.caster.stopCastAll();
|
||||
}
|
||||
break;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@@ -157,14 +164,12 @@ class RedisPubSubTransport
async isActive() {
const result = await this.redis.pubsub("numsub", this.castChannel);
return (result.length > 1 ? result[1] > 0: false);
return result.length > 1 ? result[1] > 0 : false;
}
}
// ===========================================================================
class ScreenCaster
{
class ScreenCaster {
transport: WSTransport;
caches = new Map<WorkerId, string>();
urls = new Map<WorkerId, string>();

@@ -173,7 +178,7 @@ class ScreenCaster
maxHeight = 480;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
initMsg: {[key: string]: any};
initMsg: { [key: string]: any };
constructor(transport: WSTransport, numWorkers: number) {
this.transport = transport;

@@ -183,7 +188,7 @@ class ScreenCaster
msg: "init",
width: this.maxWidth,
height: this.maxHeight,
browsers: numWorkers
browsers: numWorkers,
};
}

@@ -193,7 +198,7 @@ class ScreenCaster
for (const id of this.caches.keys()) {
const data = this.caches.get(id);
const url = this.urls.get(id);
yield {msg, id, url, data};
yield { msg, id, url, data };
}
}

@@ -202,7 +207,7 @@ class ScreenCaster
// shouldn't happen, getting duplicate cdp
if (this.cdps.get(id) === cdp) {
logger.warn("worker already registered", {workerid: id}, "screencast");
logger.warn("worker already registered", { workerid: id }, "screencast");
return;
}

@@ -215,19 +220,19 @@ class ScreenCaster
const sessionId = resp.sessionId;
const url = page.url();
logger.debug("screencastFrame", {workerid: id, url}, "screencast");
logger.debug("screencastFrame", { workerid: id, url }, "screencast");
// keep previous data cached if just showing about:blank
if (url && !url.startsWith("about:blank")) {
this.caches.set(id, data);
this.urls.set(id, url);
await this.transport.sendAll({msg, id, data, url});
await this.transport.sendAll({ msg, id, data, url });
}
try {
await cdp.send("Page.screencastFrameAck", {sessionId});
} catch(e) {
await cdp.send("Page.screencastFrameAck", { sessionId });
} catch (e) {
//console.log("Ack Failed, probably window/tab already closed", e);
}
});

@@ -243,7 +248,7 @@ class ScreenCaster
}
}
async stopById(id: WorkerId, sendClose=false) {
async stopById(id: WorkerId, sendClose = false) {
this.caches.delete(id);
this.urls.delete(id);

@@ -258,7 +263,7 @@ class ScreenCaster
}
if (sendClose) {
await this.transport.sendAll({msg: "close", id});
await this.transport.sendAll({ msg: "close", id });
}
this.cdps.delete(id);

@@ -275,9 +280,14 @@ class ScreenCaster
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(cdp as any)._startedCast = true;
logger.info("Started Screencast", {workerid: id}, "screencast");
logger.info("Started Screencast", { workerid: id }, "screencast");
await cdp.send("Page.startScreencast", {format: "png", everyNthFrame: 1, maxWidth: this.maxWidth, maxHeight: this.maxHeight});
await cdp.send("Page.startScreencast", {
format: "png",
everyNthFrame: 1,
maxWidth: this.maxWidth,
maxHeight: this.maxHeight,
});
}
async stopCast(cdp: CDPSession, id: WorkerId) {

@@ -291,7 +301,7 @@ class ScreenCaster
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(cdp as any)._startedCast = false;
logger.info("Stopping Screencast", {workerid: id}, "screencast");
logger.info("Stopping Screencast", { workerid: id }, "screencast");
try {
await cdp.send("Page.stopScreencast");
@@ -4,31 +4,30 @@ import { WARCResourceWriter } from "./warcresourcewriter.js";
import { logger, errJSON } from "./logger.js";
import { Browser } from "./browser.js";
// ============================================================================
type ScreenShotType = {
type: string;
omitBackground: boolean;
fullPage: boolean;
}
};
export const screenshotTypes : Record<string, ScreenShotType> = {
"view": {
export const screenshotTypes: Record<string, ScreenShotType> = {
view: {
type: "png",
omitBackground: true,
fullPage: false
fullPage: false,
},
"thumbnail": {
thumbnail: {
type: "jpeg",
omitBackground: true,
fullPage: false
fullPage: false,
},
"fullPage": {
fullPage: {
type: "png",
omitBackground: true,
fullPage: true
}
fullPage: true,
},
};
export class Screenshots extends WARCResourceWriter {

@@ -40,22 +39,35 @@ export class Screenshots extends WARCResourceWriter {
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
constructor(opts: any) {
super({...opts, warcName: "screenshots.warc.gz"});
super({ ...opts, warcName: "screenshots.warc.gz" });
this.browser = opts.browser;
this.page = opts.page;
}
async take(screenshotType="view") {
async take(screenshotType = "view") {
try {
if (screenshotType !== "fullPage") {
await this.browser.setViewport(this.page, {width: 1920, height: 1080});
await this.browser.setViewport(this.page, {
width: 1920,
height: 1080,
});
}
const options = screenshotTypes[screenshotType];
const screenshotBuffer = await this.page.screenshot(options);
await this.writeBufferToWARC(screenshotBuffer, screenshotType, "image/" + options.type);
logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
await this.writeBufferToWARC(
screenshotBuffer,
screenshotType,
"image/" + options.type,
);
logger.info(
`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`,
);
} catch (e) {
logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
logger.error(
"Taking screenshot failed",
{ page: this.url, type: screenshotType, ...errJSON(e) },
"screenshots",
);
}
}

@@ -66,17 +78,27 @@ export class Screenshots extends WARCResourceWriter {
async takeThumbnail() {
const screenshotType = "thumbnail";
try {
await this.browser.setViewport(this.page, {width: 1920, height: 1080});
await this.browser.setViewport(this.page, { width: 1920, height: 1080 });
const options = screenshotTypes[screenshotType];
const screenshotBuffer = await this.page.screenshot(options);
const thumbnailBuffer = await sharp(screenshotBuffer)
// 16:9 thumbnail
.resize(640, 360)
.toBuffer();
await this.writeBufferToWARC(thumbnailBuffer, screenshotType, "image/" + options.type);
logger.info(`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`);
await this.writeBufferToWARC(
thumbnailBuffer,
screenshotType,
"image/" + options.type,
);
logger.info(
`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`,
);
} catch (e) {
logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
logger.error(
"Taking screenshot failed",
{ page: this.url, type: screenshotType, ...errJSON(e) },
"screenshots",
);
}
}
}
@@ -8,10 +8,9 @@ type ScopeType =
| "page"
| "page-spa"
| "any"
| "custom";
| "custom";
export class ScopedSeed
{
export class ScopedSeed {
url: string;
scopeType: ScopeType;
include: RegExp[];

@@ -24,11 +23,25 @@ export class ScopedSeed
maxExtraHops = 0;
maxDepth = 0;
constructor(
{url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} :
{url: string, scopeType: ScopeType, include: string[], exclude?: string[], allowHash?: boolean, depth?: number, sitemap?: string | boolean | null, extraHops?: number}
) {
constructor({
url,
scopeType,
include,
exclude = [],
allowHash = false,
depth = -1,
sitemap = false,
extraHops = 0,
}: {
url: string;
scopeType: ScopeType;
include: string[];
exclude?: string[];
allowHash?: boolean;
depth?: number;
sitemap?: string | boolean | null;
extraHops?: number;
}) {
const parsedUrl = this.parseUrl(url);
if (!parsedUrl) {
throw new Error("Invalid URL");

@@ -43,7 +56,10 @@ export class ScopedSeed
}
if (this.scopeType !== "custom") {
const [includeNew, allowHashNew] = this.scopeFromType(this.scopeType, parsedUrl);
const [includeNew, allowHashNew] = this.scopeFromType(
this.scopeType,
parsedUrl,
);
this.include = [...includeNew, ...this.include];
allowHash = allowHashNew;
}

@@ -63,13 +79,13 @@ export class ScopedSeed
//parseRx(value? : union[string[], string, RegExp[]]) -> RegExp[] {
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
parseRx(value : any) {
parseRx(value: any) {
if (value === null || value === undefined || value === "") {
return [];
} else if (!(value instanceof Array)) {
return [new RegExp(value)];
} else {
return value.map(e => (e instanceof RegExp) ? e : new RegExp(e));
return value.map((e) => (e instanceof RegExp ? e : new RegExp(e)));
}
}

@@ -97,24 +113,27 @@ export class ScopedSeed
try {
parsedUrl = new URL(url.trim());
} catch (e) {
logger.warn("Invalid Page - not a valid URL", {url, ...logDetails});
logger.warn("Invalid Page - not a valid URL", { url, ...logDetails });
return null;
}
if (parsedUrl.protocol !== "http:" && parsedUrl.protocol != "https:") {
logger.warn("Invalid Page - URL must start with http:// or https://", {url, ...logDetails});
logger.warn("Invalid Page - URL must start with http:// or https://", {
url,
...logDetails,
});
parsedUrl = null;
}
return parsedUrl;
}
resolveSiteMap(sitemap: boolean | string | null) : string | null {
resolveSiteMap(sitemap: boolean | string | null): string | null {
if (sitemap === true) {
const url = new URL(this.url);
url.pathname = "/sitemap.xml";
return url.href;
} else if (typeof(sitemap) === "string") {
} else if (typeof sitemap === "string") {
const url = new URL(sitemap, this.url);
return url.href;
}

@@ -122,42 +141,68 @@ export class ScopedSeed
return null;
}
scopeFromType(scopeType: ScopeType, parsedUrl: URL) : [RegExp[], boolean] {
let include : RegExp[] = [];
scopeFromType(scopeType: ScopeType, parsedUrl: URL): [RegExp[], boolean] {
let include: RegExp[] = [];
let allowHash = false;
switch (scopeType) {
case "page":
include = [];
break;
case "page":
include = [];
break;
case "page-spa":
// allow scheme-agnostic URLS as likely redirects
include = [new RegExp("^" + urlRxEscape(parsedUrl.href, parsedUrl) + "#.+")];
allowHash = true;
break;
case "page-spa":
// allow scheme-agnostic URLS as likely redirects
include = [
new RegExp("^" + urlRxEscape(parsedUrl.href, parsedUrl) + "#.+"),
];
allowHash = true;
break;
case "prefix":
include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + parsedUrl.pathname.slice(0, parsedUrl.pathname.lastIndexOf("/") + 1), parsedUrl))];
break;
case "prefix":
include = [
new RegExp(
"^" +
urlRxEscape(
parsedUrl.origin +
parsedUrl.pathname.slice(
0,
parsedUrl.pathname.lastIndexOf("/") + 1,
),
parsedUrl,
),
),
];
break;
case "host":
include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl))];
break;
case "host":
include = [
new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl)),
];
break;
case "domain":
if (parsedUrl.hostname.startsWith("www.")) {
parsedUrl.hostname = parsedUrl.hostname.replace("www.", "");
}
include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl).replace("\\/\\/", "\\/\\/([^/]+\\.)*"))];
break;
case "domain":
if (parsedUrl.hostname.startsWith("www.")) {
parsedUrl.hostname = parsedUrl.hostname.replace("www.", "");
}
include = [
new RegExp(
"^" +
urlRxEscape(parsedUrl.origin + "/", parsedUrl).replace(
"\\/\\/",
"\\/\\/([^/]+\\.)*",
),
),
];
break;
case "any":
include = [/.*/];
break;
case "any":
include = [/.*/];
break;
default:
logger.fatal(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`);
default:
logger.fatal(
`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`,
);
}
return [include, allowHash];

@@ -221,7 +266,7 @@ export class ScopedSeed
}
}
return {url, isOOS};
return { url, isOOS };
}
}

@@ -232,7 +277,3 @@ export function rxEscape(string: string) {
export function urlRxEscape(url: string, parsedUrl: URL) {
return rxEscape(url).replace(parsedUrl.protocol, "https?:");
}
@@ -6,7 +6,6 @@ import { MAX_DEPTH } from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { Frame } from "puppeteer-core";
// ============================================================================
export enum LoadState {
FAILED = 0,

@@ -16,7 +15,6 @@ export enum LoadState {
BEHAVIORS_DONE = 4,
}
// ============================================================================
export enum QueueState {
ADDED = 0,

@@ -24,14 +22,11 @@ export enum QueueState {
DUPE_URL = 2,
}
// ============================================================================
export type WorkerId = number;
// ============================================================================
export class PageState
{
export class PageState {
url: string;
seedId: number;
depth: number;

@@ -53,11 +48,16 @@ export class PageState
skipBehaviors = false;
filteredFrames: Frame[] = [];
loadState : LoadState = LoadState.FAILED;
loadState: LoadState = LoadState.FAILED;
logDetails = {};
constructor(redisData: {url: string, seedId: number, depth: number, extraHops: number}) {
constructor(redisData: {
url: string;
seedId: number;
depth: number;
extraHops: number;
}) {
this.url = redisData.url;
this.seedId = redisData.seedId;
this.depth = redisData.depth;

@@ -78,10 +78,7 @@ declare module "ioredis" {
limit: number,
): Result<number, Context>;
getnext(
qkey: string,
pkey: string,
): Result<string, Context>;
getnext(qkey: string, pkey: string): Result<string, Context>;
markstarted(
pkey: string,

@@ -103,7 +100,7 @@ declare module "ioredis" {
unlockpending(
pkeyUrl: string,
uid: string,
callback?: Callback<string>
callback?: Callback<string>,
): Result<void, Context>;
requeue(

@@ -113,13 +110,11 @@ declare module "ioredis" {
url: string,
maxRetryPending: number,
): Result<number, Context>;
}
}
// ============================================================================
export class RedisCrawlState
{
export class RedisCrawlState {
redis: Redis;
maxRetryPending = 1;
_lastSize = 0;

@@ -134,12 +129,10 @@ export class RedisCrawlState
dkey: string;
fkey: string;
ekey: string;
constructor(redis: Redis, key: string, maxPageTime: number, uid: string) {
this.redis = redis;
this.uid = uid;
this.key = key;
this.maxPageTime = maxPageTime;

@@ -172,7 +165,7 @@ end
redis.call('zadd', KEYS[2], ARGV[2], ARGV[3]);
redis.call('hdel', KEYS[1], ARGV[1]);
return 0;
`
`,
});
redis.defineCommand("getnext", {

@@ -187,7 +180,7 @@ if json then
end
return json;
`
`,
});
redis.defineCommand("markstarted", {

@@ -203,7 +196,7 @@ if json then
redis.call('setex', KEYS[2], ARGV[3], ARGV[4]);
end
`
`,
});
redis.defineCommand("unlockpending", {

@@ -215,7 +208,7 @@ if value == ARGV[1] then
redis.call('del', KEYS[1])
end
`
`,
});
redis.defineCommand("movefailed", {

@@ -232,7 +225,7 @@ if json then
redis.call('hdel', KEYS[1], ARGV[1]);
end
`
`,
});
redis.defineCommand("requeue", {

@@ -255,9 +248,8 @@ if not res then
end
end
return 0;
`
`,
});
}
async _getNext() {

@@ -271,7 +263,14 @@ return 0;
async markStarted(url: string) {
const started = this._timestamp();
return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.maxPageTime, this.uid);
return await this.redis.markstarted(
this.pkey,
this.pkey + ":" + url,
url,
started,
this.maxPageTime,
this.uid,
);
}
async markFinished(url: string) {

@@ -292,21 +291,24 @@ return 0;
await this.redis.srem(this.skey, url);
}
recheckScope(data: {url: string, depth: number, extraHops: number, seedId: number}, seeds: ScopedSeed[]) {
recheckScope(
data: { url: string; depth: number; extraHops: number; seedId: number },
seeds: ScopedSeed[],
) {
const seed = seeds[data.seedId];
return seed.isIncluded(data.url, data.depth, data.extraHops);
}
async isFinished() {
return ((await this.queueSize()) == 0) && ((await this.numDone()) > 0);
return (await this.queueSize()) == 0 && (await this.numDone()) > 0;
}
async setStatus(status_: string) {
await this.redis.hset(`${this.key}:status`, this.uid, status_);
}
async getStatus() : Promise<string> {
async getStatus(): Promise<string> {
return (await this.redis.hget(`${this.key}:status`, this.uid)) || "";
}

@@ -343,35 +345,35 @@ return 0;
return;
}
try {
const {type, regex} = JSON.parse(result);
const { type, regex } = JSON.parse(result);
switch (type) {
case "addExclusion":
logger.debug("Add Exclusion", {type, regex}, "exclusion");
if (!regex) {
case "addExclusion":
logger.debug("Add Exclusion", { type, regex }, "exclusion");
if (!regex) {
break;
}
for (const seed of seeds) {
seed.addExclusion(regex);
}
// can happen async w/o slowing down crawling
// each page is still checked if in scope before crawling, even while
// queue is being filtered
this.filterQueue(regex);
break;
}
for (const seed of seeds) {
seed.addExclusion(regex);
}
// can happen async w/o slowing down crawling
// each page is still checked if in scope before crawling, even while
// queue is being filtered
this.filterQueue(regex);
break;
case "removeExclusion":
logger.debug("Remove Exclusion", {type, regex}, "exclusion");
if (!regex) {
case "removeExclusion":
logger.debug("Remove Exclusion", { type, regex }, "exclusion");
if (!regex) {
break;
}
for (const seed of seeds) {
seed.removeExclusion(regex);
}
break;
}
for (const seed of seeds) {
seed.removeExclusion(regex);
}
break;
}
} // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
catch (e: any) {
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) {
logger.warn("Error processing message", e, "redisMessage");
}
}

@@ -389,7 +391,7 @@ return 0;
// regexStr just a string, optimize by using glob matching
if (this.isStrMatch(regexStr)) {
matcher = {"match": `*${regexStr}*`};
matcher = { match: `*${regexStr}*` };
}
const stream = this.redis.zscanStream(this.qkey, matcher);

@@ -404,14 +406,18 @@ return 0;
//if (removed) {
await this.markExcluded(url);
//}
logger.debug("Removing excluded URL", {url, regex, removed}, "exclusion");
logger.debug(
"Removing excluded URL",
{ url, regex, removed },
"exclusion",
);
}
}
stream.resume();
});
return new Promise<void>(resolve => {
return new Promise<void>((resolve) => {
stream.on("end", () => {
resolve();
});

@@ -424,15 +430,23 @@ return 0;
// consider failed if 3 failed retries in 60 secs
await this.redis.expire(key, 60);
return (res >= 3);
return res >= 3;
}
//async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) {
async addToQueue({url, seedId, depth = 0, extraHops = 0} : {url: string, seedId: number, depth?: number, extraHops?: number}, limit = 0) {
async addToQueue(
{
url,
seedId,
depth = 0,
extraHops = 0,
}: { url: string; seedId: number; depth?: number; extraHops?: number },
limit = 0,
) {
const added = this._timestamp();
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const data : any = {added, url, seedId, depth};
const data: any = { added, url, seedId, depth };
if (extraHops) {
data.extraHops = extraHops;
}

@@ -441,7 +455,15 @@ return 0;
// 0 - url queued successfully
// 1 - url queue size limit reached
// 2 - url is a dupe
return await this.redis.addqueue(this.pkey, this.qkey, this.skey, url, this._getScore(data), JSON.stringify(data), limit);
return await this.redis.addqueue(
this.pkey,
this.qkey,
this.skey,
url,
this._getScore(data),
JSON.stringify(data),
limit,
);
}
async nextFromQueue() {

@@ -450,7 +472,7 @@ return 0;
try {
data = JSON.parse(json);
} catch(e) {
} catch (e) {
logger.error("Invalid queued json", json);
return null;
}

@@ -476,20 +498,27 @@ return 0;
const failed = await this._iterListKeys(this.fkey);
const errors = await this.getErrorList();
return {done, queued, pending, failed, errors};
return { done, queued, pending, failed, errors };
}
_getScore(data: {depth: number, extraHops: number}) {
_getScore(data: { depth: number; extraHops: number }) {
return (data.depth || 0) + (data.extraHops || 0) * MAX_DEPTH;
}
async _iterSortedKey(key: string, inc = 100) {
const results : string[] = [];
const results: string[] = [];
const len = await this.redis.zcard(key);
for (let i = 0; i < len; i += inc) {
const someResults = await this.redis.zrangebyscore(key, 0, "inf", "LIMIT", i, inc);
const someResults = await this.redis.zrangebyscore(
key,
0,
"inf",
"LIMIT",
i,
inc,
);
results.push(...someResults);
}

@@ -497,7 +526,7 @@ return 0;
}
async _iterListKeys(key: string, inc = 100) {
const results : string[] = [];
const results: string[] = [];
const len = await this.redis.llen(key);

@@ -508,10 +537,14 @@ return 0;
return results;
}
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
async load(state: Record<string, any>, seeds: ScopedSeed[], checkScope: boolean) {
const seen : string[] = [];
async load(
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
state: Record<string, any>,
seeds: ScopedSeed[],
checkScope: boolean,
) {
const seen: string[] = [];
// need to delete existing keys, if exist to fully reset state
await this.redis.del(this.qkey);

@@ -528,7 +561,7 @@ return 0;
continue;
}
}
await this.redis.zadd(this.qkey, this._getScore(data), json);
seen.push(data.url);
}

@@ -545,7 +578,7 @@ return 0;
seen.push(data.url);
}
if (typeof(state.done) === "number") {
if (typeof state.done === "number") {
// done key is just an int counter
await this.redis.set(this.dkey, state.done);
} else if (state.done instanceof Array) {

@@ -601,7 +634,7 @@ return 0;
async getPendingList() {
const list = await this.redis.hvals(this.pkey);
return list.map(x => JSON.parse(x));
return list.map((x) => JSON.parse(x));
}
async getErrorList() {

@@ -615,9 +648,9 @@ return 0;
for (const url of pendingUrls) {
await this.redis.unlockpending(this.pkey + ":" + url, this.uid);
}
} // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
catch (e: any) {
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) {
logger.error("Redis Del Pending Failed", e, "state");
}
}

@@ -626,15 +659,21 @@ return 0;
const pendingUrls = await this.redis.hkeys(this.pkey);
for (const url of pendingUrls) {
const res = await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, url, this.maxRetryPending);
const res = await this.redis.requeue(
this.pkey,
this.qkey,
this.pkey + ":" + url,
url,
this.maxRetryPending,
);
switch (res) {
case 1:
logger.info(`Requeued: ${url}`);
break;
case 1:
logger.info(`Requeued: ${url}`);
break;
case 2:
logger.info(`Not requeuing anymore: ${url}`);
break;
case 2:
logger.info(`Not requeuing anymore: ${url}`);
break;
}
}
}

@@ -656,4 +695,3 @@ return 0;
return await this.redis.lpush(this.ekey, error);
}
}
@@ -16,10 +16,8 @@ import { logger } from "./logger.js";
// @ts-expect-error TODO fill in why error is expected
import getFolderSize from "get-folder-size";
// ===========================================================================
export class S3StorageSync
{
export class S3StorageSync {
fullPrefix: string;
client: Minio.Client;

@@ -32,25 +30,27 @@ export class S3StorageSync
webhookUrl?: string;
// TODO: Fix this the next time the file is edited.
constructor(
// eslint-disable-next-line @typescript-eslint/no-explicit-any
urlOrData: string | any,
{webhookUrl, userId, crawlId} :
{webhookUrl?: string, userId: string, crawlId: string}
{
webhookUrl,
userId,
crawlId,
}: { webhookUrl?: string; userId: string; crawlId: string },
) {
let url;
let accessKey;
let secretKey;
if (typeof(urlOrData) === "string") {
if (typeof urlOrData === "string") {
url = new URL(urlOrData);
accessKey = url.username;
secretKey = url.password;
url.username = "";
url.password = "";
this.fullPrefix = url.href;
} else {
url = new URL(urlOrData.endpointUrl);
accessKey = urlOrData.accessKey;

@@ -64,7 +64,7 @@ export class S3StorageSync
useSSL: url.protocol === "https:",
accessKey,
secretKey,
partSize: 100*1024*1024
partSize: 100 * 1024 * 1024,
});
this.bucketName = url.pathname.slice(1).split("/")[0];

@@ -80,31 +80,47 @@ export class S3StorageSync
async uploadFile(srcFilename: string, targetFilename: string) {
const fileUploadInfo = {
"bucket": this.bucketName,
"crawlId": this.crawlId,
"prefix": this.objectPrefix,
targetFilename
bucket: this.bucketName,
crawlId: this.crawlId,
prefix: this.objectPrefix,
targetFilename,
};
logger.info("S3 file upload information", fileUploadInfo, "s3Upload");
await this.client.fPutObject(this.bucketName, this.objectPrefix + targetFilename, srcFilename);
await this.client.fPutObject(
this.bucketName,
this.objectPrefix + targetFilename,
srcFilename,
);
const {hash, crc32} = await checksumFile("sha256", srcFilename);
const { hash, crc32 } = await checksumFile("sha256", srcFilename);
const path = targetFilename;
const size = await getFileSize(srcFilename);
// for backwards compatibility, keep 'bytes'
return {path, size, hash, crc32, bytes: size};
return { path, size, hash, crc32, bytes: size };
}
async downloadFile(srcFilename: string, destFilename: string) {
await this.client.fGetObject(this.bucketName, this.objectPrefix + srcFilename, destFilename);
await this.client.fGetObject(
this.bucketName,
this.objectPrefix + srcFilename,
destFilename,
);
}
async uploadCollWACZ(srcFilename: string, targetFilename: string, completed = true) {
async uploadCollWACZ(
srcFilename: string,
targetFilename: string,
completed = true,
) {
const resource = await this.uploadFile(srcFilename, targetFilename);
logger.info("WACZ S3 file upload resource", {targetFilename, resource}, "s3Upload");
logger.info(
"WACZ S3 file upload resource",
{ targetFilename, resource },
"s3Upload",
);
if (this.webhookUrl) {
const body = {

@@ -115,17 +131,25 @@ export class S3StorageSync
filename: this.fullPrefix + targetFilename,
...resource,
completed
completed,
};
logger.info(`Pinging Webhook: ${this.webhookUrl}`);
if (this.webhookUrl.startsWith("http://") || this.webhookUrl.startsWith("https://")) {
await fetch(this.webhookUrl, {method: "POST", body: JSON.stringify(body)});
if (
this.webhookUrl.startsWith("http://") ||
this.webhookUrl.startsWith("https://")
) {
await fetch(this.webhookUrl, {
method: "POST",
body: JSON.stringify(body),
});
} else if (this.webhookUrl.startsWith("redis://")) {
const parts = this.webhookUrl.split("/");
if (parts.length !== 5) {
logger.fatal("redis webhook url must be in format: redis://<host>:<port>/<db>/<key>");
logger.fatal(
"redis webhook url must be in format: redis://<host>:<port>/<db>/<key>",
);
}
const redis = await initRedis(parts.slice(0, 4).join("/"));
await redis.rpush(parts[4], JSON.stringify(body));

@@ -139,7 +163,8 @@ export function initStorage() {
return null;
}
const endpointUrl = process.env.STORE_ENDPOINT_URL + (process.env.STORE_PATH || "");
const endpointUrl =
process.env.STORE_ENDPOINT_URL + (process.env.STORE_PATH || "");
const storeInfo = {
endpointUrl,
accessKey: process.env.STORE_ACCESS_KEY,

@@ -156,7 +181,6 @@ export function initStorage() {
return new S3StorageSync(storeInfo, opts);
}
export async function getFileSize(filename: string) {
const stats = await fsp.stat(filename);
return stats.size;

@@ -165,25 +189,34 @@ export async function getFileSize(filename: string) {
export async function getDirSize(dir: string) {
const { size, errors } = await getFolderSize(dir);
if (errors && errors.length) {
logger.warn("Size check errors", {errors}, "sizecheck");
logger.warn("Size check errors", { errors }, "sizecheck");
}
return size;
}
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export async function checkDiskUtilization(params: Record<string, any>, archiveDirSize: number, dfOutput=null) {
const diskUsage : Record<string, string> = await getDiskUsage("/crawls", dfOutput);
export async function checkDiskUtilization(
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
params: Record<string, any>,
archiveDirSize: number,
dfOutput = null,
) {
const diskUsage: Record<string, string> = await getDiskUsage(
"/crawls",
dfOutput,
);
const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1));
// Check that disk usage isn't already above threshold
if (usedPercentage >= params.diskUtilization) {
logger.info(`Disk utilization threshold reached ${usedPercentage}% > ${params.diskUtilization}%, stopping`);
logger.info(
`Disk utilization threshold reached ${usedPercentage}% > ${params.diskUtilization}%, stopping`,
);
return {
stop: true,
used: usedPercentage,
projected: null,
threshold: params.diskUtilization
threshold: params.diskUtilization,
};
}

@@ -191,7 +224,7 @@ export async function checkDiskUtilization(params: Record<string, any>, archiveD
const kbUsed = parseInt(diskUsage["Used"]);
const kbTotal = parseInt(diskUsage["1K-blocks"]);
let kbArchiveDirSize = Math.round(archiveDirSize/1024);
let kbArchiveDirSize = Math.round(archiveDirSize / 1024);
if (params.combineWARC && params.generateWACZ) {
kbArchiveDirSize *= 4;
} else if (params.combineWARC || params.generateWACZ) {

@@ -199,15 +232,20 @@ export async function checkDiskUtilization(params: Record<string, any>, archiveD
}
const projectedTotal = kbUsed + kbArchiveDirSize;
const projectedUsedPercentage = calculatePercentageUsed(projectedTotal, kbTotal);
const projectedUsedPercentage = calculatePercentageUsed(
projectedTotal,
kbTotal,
);
if (projectedUsedPercentage >= params.diskUtilization) {
logger.info(`Disk utilization projected to reach threshold ${projectedUsedPercentage}% > ${params.diskUtilization}%, stopping`);
logger.info(
`Disk utilization projected to reach threshold ${projectedUsedPercentage}% > ${params.diskUtilization}%, stopping`,
);
return {
stop: true,
used: usedPercentage,
projected: projectedUsedPercentage,
threshold: params.diskUtilization
threshold: params.diskUtilization,
};
}

@@ -215,7 +253,7 @@ export async function checkDiskUtilization(params: Record<string, any>, archiveD
stop: false,
used: usedPercentage,
projected: projectedUsedPercentage,
threshold: params.diskUtilization
threshold: params.diskUtilization,
};
}

@@ -225,12 +263,12 @@ export async function getDFOutput(path: string) {
return res.stdout;
}
export async function getDiskUsage(path="/crawls", dfOutput = null) {
export async function getDiskUsage(path = "/crawls", dfOutput = null) {
const result = dfOutput || (await getDFOutput(path));
const lines = result.split("\n");
const keys = lines[0].split(/\s+/ig);
const rows = lines.slice(1).map(line => {
const values = line.split(/\s+/ig);
const keys = lines[0].split(/\s+/gi);
const rows = lines.slice(1).map((line) => {
const values = line.split(/\s+/gi);
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
return keys.reduce((o: Record<string, any>, k, index) => {

@@ -242,29 +280,34 @@ export async function getDiskUsage(path="/crawls", dfOutput = null) {
}
export function calculatePercentageUsed(used: number, total: number) {
return Math.round((used/total) * 100);
return Math.round((used / total) * 100);
}
function checksumFile(hashName: string, path: string) : Promise<{hash: string, crc32: number}>{
function checksumFile(
hashName: string,
path: string,
): Promise<{ hash: string; crc32: number }> {
return new Promise((resolve, reject) => {
const hash = createHash(hashName);
let crc : number = 0;
let crc: number = 0;
const stream = fs.createReadStream(path);
stream.on("error", err => reject(err));
stream.on("error", (err) => reject(err));
stream.on("data", (chunk) => {
hash.update(chunk);
crc = crc32(chunk, crc);
});
stream.on("end", () => resolve({hash: hash.digest("hex"), crc32: crc}));
stream.on("end", () => resolve({ hash: hash.digest("hex"), crc32: crc }));
});
}
export function interpolateFilename(filename: string, crawlId: string) {
filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.-]/g, ""));
filename = filename.replace(
"@ts",
new Date().toISOString().replace(/[:TZz.-]/g, ""),
);
filename = filename.replace("@hostname", os.hostname());
filename = filename.replace("@hostsuffix", os.hostname().slice(-14));
filename = filename.replace("@id", crawlId);
return filename;
}
@@ -11,54 +11,79 @@ export abstract class BaseTextExtract extends WARCResourceWriter {
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
constructor(cdp: CDPSession, opts: any) {
super({...opts, warcName: "text.warc.gz"});
super({ ...opts, warcName: "text.warc.gz" });
this.cdp = cdp;
}
async extractAndStoreText(resourceType: string, ignoreIfMatchesLast = false, saveToWarc = false) {
async extractAndStoreText(
resourceType: string,
ignoreIfMatchesLast = false,
saveToWarc = false,
) {
try {
const text = await this.doGetText();
if (ignoreIfMatchesLast && text === this.lastText) {
this.lastText = this.text;
logger.debug("Skipping, extracted text unchanged from last extraction", {url: this.url}, "text");
return {changed: false, text};
logger.debug(
"Skipping, extracted text unchanged from last extraction",
{ url: this.url },
"text",
);
return { changed: false, text };
}
if (saveToWarc) {
await this.writeBufferToWARC(new TextEncoder().encode(text), resourceType, "text/plain");
logger.debug(`Text Extracted (type: ${resourceType}) for ${this.url} written to ${this.warcName}`);
await this.writeBufferToWARC(
new TextEncoder().encode(text),
resourceType,
"text/plain",
);
logger.debug(
`Text Extracted (type: ${resourceType}) for ${this.url} written to ${this.warcName}`,
);
}
this.lastText = text;
return {changed: true, text};
} // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
catch (e: any) {
return { changed: true, text };
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) {
logger.debug("Error extracting text", e, "text");
return {changed: false, text: null};
return { changed: false, text: null };
}
}
abstract doGetText() : Promise<string>;
abstract doGetText(): Promise<string>;
}
// ============================================================================
export class TextExtractViaSnapshot extends BaseTextExtract {
async doGetText() : Promise<string> {
const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {computedStyles: []});
async doGetText(): Promise<string> {
const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {
computedStyles: [],
});
return this.parseTextFromDOMSnapshot(result);
}
parseTextFromDOMSnapshot(result: Protocol.DOMSnapshot.CaptureSnapshotResponse) : string {
parseTextFromDOMSnapshot(
result: Protocol.DOMSnapshot.CaptureSnapshotResponse,
): string {
const TEXT_NODE = 3;
const ELEMENT_NODE = 1;
const SKIPPED_NODES = ["SCRIPT", "STYLE", "HEADER", "FOOTER", "BANNER-DIV", "NOSCRIPT", "TITLE"];
const SKIPPED_NODES = [
"SCRIPT",
"STYLE",
"HEADER",
"FOOTER",
"BANNER-DIV",
"NOSCRIPT",
"TITLE",
];
const {strings, documents} = result;
const { strings, documents } = result;
const accum : string[] = [];
const accum: string[] = [];
for (const doc of documents) {
const nodeValues = doc.nodes.nodeValue || [];

@@ -91,16 +116,18 @@ export class TextExtractViaSnapshot extends BaseTextExtract {
}
}
// ============================================================================
export class TextExtractViaDocument extends BaseTextExtract {
async doGetText() : Promise<string> {
const result = await this.cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
async doGetText(): Promise<string> {
const result = await this.cdp.send("DOM.getDocument", {
depth: -1,
pierce: true,
});
return this.parseTextFromDOM(result);
}
parseTextFromDOM(dom: Protocol.DOM.GetDocumentResponse) : string {
const accum : string[] = [];
parseTextFromDOM(dom: Protocol.DOM.GetDocumentResponse): string {
const accum: string[] = [];
const metadata = {};
this.parseText(dom.root, metadata, accum);

@@ -108,14 +135,26 @@ export class TextExtractViaDocument extends BaseTextExtract {
return accum.join("\n");
}
parseText(node: Protocol.DOM.Node, metadata: Record<string, string> | null, accum: string[]) {
const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];
const EMPTY_LIST : Protocol.DOM.Node[] = [];
parseText(
node: Protocol.DOM.Node,
metadata: Record<string, string> | null,
accum: string[],
) {
const SKIPPED_NODES = [
"head",
"script",
"style",
"header",
"footer",
"banner-div",
"noscript",
];
const EMPTY_LIST: Protocol.DOM.Node[] = [];
const TEXT = "#text";
const TITLE = "title";
const name = node.nodeName.toLowerCase();
if (SKIPPED_NODES.includes(name)) {
return;
}

@@ -128,7 +167,7 @@ export class TextExtractViaDocument extends BaseTextExtract {
accum.push(value);
}
} else if (name === TITLE) {
const title : string[] = [];
const title: string[] = [];
for (const child of children) {
this.parseText(child, null, title);

@@ -144,10 +183,9 @@ export class TextExtractViaDocument extends BaseTextExtract {
this.parseText(child, metadata, accum);
}
if (node.contentDocument) {
if (node.contentDocument) {
this.parseText(node.contentDocument, null, accum);
}
}
}
}
}
@@ -1,39 +1,45 @@
import { logger } from "./logger.js";
export function sleep(seconds: number) {
return new Promise(resolve => setTimeout(resolve, seconds * 1000));
return new Promise((resolve) => setTimeout(resolve, seconds * 1000));
}
// TODO: Fix this the next time the file is edited.
export function timedRun(
// eslint-disable-next-line @typescript-eslint/no-explicit-any
promise: Promise<any>,
seconds: number,
message="Promise timed out",
logDetails={},
context="general",
isWarn=false
message = "Promise timed out",
logDetails = {},
context = "general",
isWarn = false,
) {
// return Promise return value or log error if timeout is reached first
const timeout = seconds * 1000;
const rejectPromiseOnTimeout = (timeout: number) => {
return new Promise((resolve, reject) => {
setTimeout(() => (reject("timeout reached")), timeout);
setTimeout(() => reject("timeout reached"), timeout);
});
};
return Promise.race([promise, rejectPromiseOnTimeout(timeout)])
.catch((err) => {
return Promise.race([promise, rejectPromiseOnTimeout(timeout)]).catch(
(err) => {
if (err == "timeout reached") {
const logFunc = isWarn ? logger.warn : logger.error;
logFunc.call(logger, message, {"seconds": seconds, ...logDetails}, context);
logFunc.call(
logger,
message,
{ seconds: seconds, ...logDetails },
context,
);
} else {
//logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
throw err;
}
});
},
);
}
export function secondsElapsed(startTime: number, nowDate: Date | null = null) {
@@ -2,8 +2,7 @@ import fs from "fs";
import path from "path";
import * as warcio from "warcio";
export class WARCResourceWriter
{
export class WARCResourceWriter {
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
page: any;

@@ -12,34 +11,53 @@ export class WARCResourceWriter
warcName: string;
date: Date;
constructor({url, directory, date, warcName} : {url: string, directory: string, date: Date, warcName: string}) {
constructor({
url,
directory,
date,
warcName,
}: {
url: string;
directory: string;
date: Date;
warcName: string;
}) {
this.url = url;
this.directory = directory;
this.warcName = path.join(this.directory, warcName);
this.date = date ? date : new Date();
}
async writeBufferToWARC(contents: Uint8Array, resourceType: string, contentType: string) {
async writeBufferToWARC(
contents: Uint8Array,
resourceType: string,
contentType: string,
) {
const warcRecord = await this.wrap(contents, resourceType, contentType);
const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});
const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {
gzip: true,
});
fs.appendFileSync(this.warcName, warcRecordBuffer);
}
async wrap(buffer: Uint8Array, resourceType: string, contentType: string) {
const warcVersion = "WARC/1.1";
const warcRecordType = "resource";
const warcHeaders = {"Content-Type": contentType};
const warcHeaders = { "Content-Type": contentType };
async function* content() {
yield buffer;
}
const resourceUrl = `urn:${resourceType}:${this.url}`;
return warcio.WARCRecord.create({
url: resourceUrl,
date: this.date.toISOString(),
type: warcRecordType,
warcVersion,
warcHeaders
}, content());
return warcio.WARCRecord.create(
{
url: resourceUrl,
date: this.date.toISOString(),
type: warcRecordType,
warcVersion,
warcHeaders,
},
content(),
);
}
}
@@ -7,10 +7,8 @@ import { WARCSerializer } from "warcio/node";
import { logger, errJSON } from "./logger.js";
import type { IndexerOffsetLength, WARCRecord } from "warcio";
// =================================================================
export class WARCWriter implements IndexerOffsetLength
{
export class WARCWriter implements IndexerOffsetLength {
archivesDir: string;
tempCdxDir: string;
filename: string;

@@ -25,8 +23,19 @@ export class WARCWriter implements IndexerOffsetLength
fh?: Writable | null;
cdxFH?: Writable | null;
constructor({archivesDir, tempCdxDir, filename, gzip, logDetails} :
{archivesDir: string, tempCdxDir: string, filename: string, gzip: boolean, logDetails: Record<string, string>}) {
constructor({
archivesDir,
tempCdxDir,
filename,
gzip,
logDetails,
}: {
archivesDir: string;
tempCdxDir: string;
filename: string;
gzip: boolean;
logDetails: Record<string, string>;
}) {
this.archivesDir = archivesDir;
this.tempCdxDir = tempCdxDir;
this.filename = filename;

@@ -37,21 +46,29 @@ export class WARCWriter implements IndexerOffsetLength
this.recordLength = 0;
if (this.tempCdxDir) {
this.indexer = new CDXIndexer({format: "cdxj"});
this.indexer = new CDXIndexer({ format: "cdxj" });
}
}
async initFH() {
if (!this.fh) {
this.fh = fs.createWriteStream(path.join(this.archivesDir, this.filename));
this.fh = fs.createWriteStream(
path.join(this.archivesDir, this.filename),
);
}
if (!this.cdxFH && this.tempCdxDir) {
this.cdxFH = fs.createWriteStream(path.join(this.tempCdxDir, this.filename + ".cdx"));
this.cdxFH = fs.createWriteStream(
path.join(this.tempCdxDir, this.filename + ".cdx"),
);
}
}
async writeRecordPair(responseRecord: WARCRecord, requestRecord: WARCRecord, responseSerializer: WARCSerializer | undefined = undefined) {
const opts = {gzip: this.gzip};
async writeRecordPair(
responseRecord: WARCRecord,
requestRecord: WARCRecord,
responseSerializer: WARCSerializer | undefined = undefined,
) {
const opts = { gzip: this.gzip };
if (!responseSerializer) {
responseSerializer = new WARCSerializer(responseRecord, opts);

@@ -59,15 +76,20 @@ export class WARCWriter implements IndexerOffsetLength
await this.initFH();
this.recordLength = await this._writeRecord(responseRecord, responseSerializer);
this.recordLength = await this._writeRecord(
responseRecord,
responseSerializer,
);
this._writeCDX(responseRecord);
const requestSerializer = new WARCSerializer(requestRecord, opts);
this.recordLength = await this._writeRecord(requestRecord, requestSerializer);
this.recordLength = await this._writeRecord(
requestRecord,
requestSerializer,
);
this._writeCDX(requestRecord);
}
async _writeRecord(record: WARCRecord, serializer: WARCSerializer) {

@@ -83,7 +105,11 @@ export class WARCWriter implements IndexerOffsetLength
try {
this.fh.write(chunk);
} catch (e) {
logger.error("Error writing to WARC, corruption possible", {...errJSON(e), url, ...this.logDetails}, "writer");
logger.error(
"Error writing to WARC, corruption possible",
{ ...errJSON(e), url, ...this.logDetails },
"writer",
);
}
}

@@ -119,7 +145,7 @@ export class WARCWriter implements IndexerOffsetLength
// =================================================================
export function streamFinish(fh: Writable) {
const p = new Promise<void>(resolve => {
const p = new Promise<void>((resolve) => {
fh.once("finish", () => resolve());
});
fh.end();
@@ -16,9 +16,14 @@ const TEARDOWN_TIMEOUT = 10;
const FINISHED_TIMEOUT = 60;
// ===========================================================================
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export function runWorkers(crawler: any, numWorkers: number, maxPageTime: number, collDir: string) {
export function runWorkers(
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
crawler: any,
numWorkers: number,
maxPageTime: number,
collDir: string,
) {
logger.info(`Creating ${numWorkers} workers`, {}, "worker");
const workers = [];

@@ -39,13 +44,12 @@ export function runWorkers(crawler: any, numWorkers: number, maxPageTime: number
}
for (let i = 0; i < numWorkers; i++) {
workers.push(new PageWorker((i + offset), crawler, maxPageTime, collDir));
workers.push(new PageWorker(i + offset, crawler, maxPageTime, collDir));
}
return Promise.allSettled(workers.map((worker) => worker.run()));
}
// ===========================================================================
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any

@@ -55,17 +59,18 @@ export type WorkerOpts = Record<string, any> & {
workerid: WorkerId;
// eslint-disable-next-line @typescript-eslint/ban-types
callbacks: Record<string, Function>;
directFetchCapture?: ((url: string) => Promise<{fetched: boolean, mime: string}>) | null;
directFetchCapture?:
| ((url: string) => Promise<{ fetched: boolean; mime: string }>)
| null;
};
// ===========================================================================
export type WorkerState = WorkerOpts & {
data: PageState
data: PageState;
};
// ===========================================================================
export class PageWorker
{
export class PageWorker {
id: WorkerId;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any

@@ -91,16 +96,25 @@ export class PageWorker
recorder: Recorder;
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
constructor(id: WorkerId, crawler: any, maxPageTime: number, collDir: string) {
constructor(
id: WorkerId,
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
crawler: any,
maxPageTime: number,
collDir: string,
) {
this.id = id;
this.crawler = crawler;
this.maxPageTime = maxPageTime;
this.logDetails = {workerid: this.id};
this.logDetails = { workerid: this.id };
this.recorder = new Recorder({workerid: id, collDir, crawler: this.crawler});
this.recorder = new Recorder({
workerid: id,
collDir,
crawler: this.crawler,
});
this.crawler.browser.recorders.push(this.recorder);
}

@@ -121,7 +135,7 @@ export class PageWorker
TEARDOWN_TIMEOUT,
"Page Teardown Timed Out",
this.logDetails,
"worker"
"worker",
);
} catch (e) {
// ignore

@@ -129,13 +143,17 @@ export class PageWorker
}
try {
logger.debug("Closing page", {crashed: this.crashed, workerid: this.id}, "worker");
logger.debug(
"Closing page",
{ crashed: this.crashed, workerid: this.id },
"worker",
);
await timedRun(
this.page.close(),
TEARDOWN_TIMEOUT,
"Page Close Timed Out",
this.logDetails,
"worker"
"worker",
);
} catch (e) {
// ignore

@@ -155,14 +173,24 @@ export class PageWorker
}
}
async initPage(url: string) : Promise<WorkerOpts> {
if (!this.crashed && this.page && this.opts && ++this.reuseCount <= MAX_REUSE && this.isSameOrigin(url)) {
logger.debug("Reusing page", {reuseCount: this.reuseCount, ...this.logDetails}, "worker");
async initPage(url: string): Promise<WorkerOpts> {
if (
!this.crashed &&
this.page &&
this.opts &&
++this.reuseCount <= MAX_REUSE &&
this.isSameOrigin(url)
) {
logger.debug(
"Reusing page",
{ reuseCount: this.reuseCount, ...this.logDetails },
"worker",
);
return this.opts;
} else if (this.page) {
await this.closePage();
}
this.reuseCount = 1;
const workerid = this.id;

@@ -170,13 +198,13 @@ export class PageWorker
while (await this.crawler.isCrawlRunning()) {
try {
logger.debug("Getting page in new window", {workerid}, "worker");
logger.debug("Getting page in new window", { workerid }, "worker");
const result = await timedRun(
this.crawler.browser.newWindowPageWithCDP(),
NEW_WINDOW_TIMEOUT,
"New Window Timed Out",
{workerid},
"worker"
{ workerid },
"worker",
);
if (!result) {

@@ -188,7 +216,9 @@ export class PageWorker
this.page = page;
this.cdp = cdp;
this.callbacks = {};
const directFetchCapture = this.recorder ? (x: string) => this.recorder.directFetchCapture(x) : null;
const directFetchCapture = this.recorder
? (x: string) => this.recorder.directFetchCapture(x)
: null;
this.opts = {
page,
cdp,

@@ -203,9 +233,11 @@ export class PageWorker
// updated per page crawl
this.crashed = false;
this.crashBreak = new Promise((resolve, reject) => this.markCrashed = reject);
this.crashBreak = new Promise(
(resolve, reject) => (this.markCrashed = reject),
);
this.logDetails = {page: page.url(), workerid};
this.logDetails = { page: page.url(), workerid };
// more serious page crash, mark as failed
// TODO: Fix this the next time the file is edited.

@@ -213,7 +245,11 @@ export class PageWorker
page.on("error", (err: any) => {
// ensure we're still on this page, otherwise ignore!
if (this.page === page) {
logger.error("Page Crashed", {...errJSON(err), ...this.logDetails}, "worker");
logger.error(
"Page Crashed",
{ ...errJSON(err), ...this.logDetails },
"worker",
);
this.crashed = true;
if (this.markCrashed) {
this.markCrashed("crashed");

@@ -224,17 +260,24 @@ export class PageWorker
await this.crawler.setupPage(this.opts);
return this.opts;
} catch (err) {
logger.warn("Error getting new page", {"workerid": this.id, ...errJSON(err)}, "worker");
logger.warn(
"Error getting new page",
{ workerid: this.id, ...errJSON(err) },
"worker",
);
retry++;
if (!this.crawler.browser.browser) {
break;
}
}
if (retry >= MAX_REUSE) {
logger.fatal("Unable to get new page, browser likely crashed", this.logDetails, "worker");
logger.fatal(
"Unable to get new page, browser likely crashed",
this.logDetails,
"worker",
);
}
await sleep(0.5);

@@ -262,16 +305,16 @@ export class PageWorker
const { data } = opts;
const { url } = data;
logger.info("Starting page", {workerid, "page": url}, "worker");
logger.info("Starting page", { workerid, page: url }, "worker");
this.logDetails = {page: url, workerid};
this.logDetails = { page: url, workerid };
// set new page id
const pageid = uuidv4();
data.pageid = pageid;
if (this.recorder) {
this.recorder.startPage({pageid, url});
this.recorder.startPage({ pageid, url });
}
try {

@@ -281,14 +324,17 @@ export class PageWorker
this.maxPageTime,
"Page Worker Timeout",
|
||||
this.logDetails,
|
||||
"worker"
|
||||
"worker",
|
||||
),
|
||||
this.crashBreak
|
||||
this.crashBreak,
|
||||
]);
|
||||
|
||||
} catch (e) {
|
||||
if (e instanceof Error && e.message !== "logged" && !this.crashed) {
|
||||
logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
|
||||
logger.error(
|
||||
"Worker Exception",
|
||||
{ ...errJSON(e), ...this.logDetails },
|
||||
"worker",
|
||||
);
|
||||
}
|
||||
} finally {
|
||||
await timedRun(
|
||||
|
@ -296,19 +342,27 @@ export class PageWorker
|
|||
FINISHED_TIMEOUT,
|
||||
"Page Finished Timed Out",
|
||||
this.logDetails,
|
||||
"worker"
|
||||
"worker",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
async run() {
|
||||
logger.info("Worker starting", {workerid: this.id}, "worker");
|
||||
logger.info("Worker starting", { workerid: this.id }, "worker");
|
||||
|
||||
try {
|
||||
await this.runLoop();
|
||||
logger.info("Worker done, all tasks complete", {workerid: this.id}, "worker");
|
||||
logger.info(
|
||||
"Worker done, all tasks complete",
|
||||
{ workerid: this.id },
|
||||
"worker",
|
||||
);
|
||||
} catch (e) {
|
||||
logger.error("Worker error, exiting", {...errJSON(e), workerid: this.id}, "worker");
|
||||
logger.error(
|
||||
"Worker error, exiting",
|
||||
{ ...errJSON(e), workerid: this.id },
|
||||
"worker",
|
||||
);
|
||||
} finally {
|
||||
if (this.recorder) {
|
||||
await this.recorder.onDone();
|
||||
|
@ -339,10 +393,9 @@ export class PageWorker
|
|||
const opts = await this.initPage(data.url);
|
||||
|
||||
// run timed crawl of page
|
||||
await this.timedCrawlPage({...opts, data});
|
||||
await this.timedCrawlPage({ ...opts, data });
|
||||
|
||||
loggedWaiting = false;
|
||||
|
||||
} else {
|
||||
// indicate that the worker has no more work (mostly for screencasting, status, etc...)
|
||||
// depending on other works, will either get more work or crawl will end
|
||||
|
@ -354,7 +407,11 @@ export class PageWorker
|
|||
// if pending, sleep and check again
|
||||
if (pending) {
|
||||
if (!loggedWaiting) {
|
||||
logger.debug("No crawl tasks, but pending tasks remain, waiting", {pending, workerid: this.id}, "worker");
|
||||
logger.debug(
|
||||
"No crawl tasks, but pending tasks remain, waiting",
|
||||
{ pending, workerid: this.id },
|
||||
"worker",
|
||||
);
|
||||
loggedWaiting = true;
|
||||
}
|
||||
await sleep(0.5);
|
||||
|
@ -368,5 +425,3 @@ export class PageWorker
|
|||
}
|
||||
}
|
||||
}
@ -6,21 +6,25 @@ function runCrawl(name, config, commandExtra = "") {
  config.generateCDX = true;
  config.depth = 0;
  config.collection = name;

  const configYaml = yaml.dump(config);

  try {
    const proc = child_process.execSync(`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`, {input: configYaml, stdin: "inherit", encoding: "utf8"});
    const proc = child_process.execSync(
      `docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`,
      { input: configYaml, stdin: "inherit", encoding: "utf8" },
    );

    console.log(proc);
  }
  catch (error) {
  } catch (error) {
    console.log(error);
  }
}

function doesCDXContain(coll, value) {
  const data = fs.readFileSync(`test-crawls/collections/${coll}/indexes/index.cdxj`);
  const data = fs.readFileSync(
    `test-crawls/collections/${coll}/indexes/index.cdxj`,
  );
  return data.indexOf(value) >= 0;
}

@ -41,11 +45,13 @@ test("test crawl without ad block for specific URL", () => {

test("testcrawl with ad block for specific URL", () => {
  const config = {
    "url": "https://www.mozilla.org/en-US/firefox/",
    "blockAds": true,
    url: "https://www.mozilla.org/en-US/firefox/",
    blockAds: true,
  };

  runCrawl("adblock-block", config);

  expect(doesCDXContain("adblock-block", "www.googletagmanager.com")).toBe(false);
  expect(doesCDXContain("adblock-block", "www.googletagmanager.com")).toBe(
    false,
  );
});
@ -6,21 +6,25 @@ test("dynamically add exclusion while crawl is running", async () => {

  const p = new Promise((resolve) => {
    callback = (error, stdout, stderr) => {
      resolve({error, stdout, stderr});
      resolve({ error, stdout, stderr });
    };
  });

  try {
    exec("docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis", {"shell": "/bin/bash"}, callback);
    exec(
      "docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis",
      { shell: "/bin/bash" },
      callback,
    );
  } catch (error) {
    console.log(error);
  }

  await new Promise((resolve) => setTimeout(resolve, 3000));

  const redis = new Redis("redis://127.0.0.1:36379/0", {lazyConnect: true});

  await redis.connect({maxRetriesPerRequest: 50});
  const redis = new Redis("redis://127.0.0.1:36379/0", { lazyConnect: true });

  await redis.connect({ maxRetriesPerRequest: 50 });

  while (true) {
    if (Number(await redis.zcard("test:q")) > 1) {
@ -33,7 +37,10 @@ test("dynamically add exclusion while crawl is running", async () => {
  const uids = await redis.hkeys("test:status");

  // exclude all pages containing 'webrecorder', should clear out the queue and end the crawl
  await redis.rpush(`${uids[0]}:msg`, JSON.stringify({type: "addExclusion", regex: "webrecorder"}));
  await redis.rpush(
    `${uids[0]}:msg`,
    JSON.stringify({ type: "addExclusion", regex: "webrecorder" }),
  );

  // ensure 'Add Exclusion is contained in the debug logs
  const { stdout } = await p;

@ -44,4 +51,3 @@ test("dynamically add exclusion while crawl is running", async () => {

  await redis.disconnect();
});
@ -3,33 +3,36 @@ import fs from "fs";
|
|||
import path from "path";
|
||||
import md5 from "md5";
|
||||
|
||||
|
||||
|
||||
|
||||
test("ensure basic crawl run with docker run passes", async () => {
|
||||
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title \"test title\" --description \"test description\"");
|
||||
child_process.execSync(
|
||||
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description"',
|
||||
);
|
||||
|
||||
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz");
|
||||
|
||||
child_process.execSync("unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz");
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz",
|
||||
);
|
||||
|
||||
child_process.execSync(
|
||||
"unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz",
|
||||
);
|
||||
});
|
||||
|
||||
test("check that a combined warc file exists in the archive folder", () => {
|
||||
const warcLists = fs.readdirSync("test-crawls/collections/wr-net");
|
||||
var captureFound = 0;
|
||||
|
||||
|
||||
for (var i = 0; i < warcLists.length; i++) {
|
||||
if (warcLists[i].endsWith("_0.warc.gz")){
|
||||
if (warcLists[i].endsWith("_0.warc.gz")) {
|
||||
captureFound = 1;
|
||||
}
|
||||
}
|
||||
expect(captureFound).toEqual(1);
|
||||
});
|
||||
|
||||
|
||||
test("check that a combined warc file is under the rolloverSize", () => {
|
||||
const warcLists = fs.readdirSync(path.join("test-crawls/collections/wr-net/wacz", "archive"));
|
||||
const warcLists = fs.readdirSync(
|
||||
path.join("test-crawls/collections/wr-net/wacz", "archive"),
|
||||
);
|
||||
let rolloverSize = 0;
|
||||
|
||||
function getFileSize(filename) {
|
||||
|
@ -37,8 +40,10 @@ test("check that a combined warc file is under the rolloverSize", () => {
|
|||
}
|
||||
|
||||
for (let i = 0; i < warcLists.length; i++) {
|
||||
const size = getFileSize(path.join("test-crawls/collections/wr-net/wacz/archive/", warcLists[i]));
|
||||
if (size < 10000){
|
||||
const size = getFileSize(
|
||||
path.join("test-crawls/collections/wr-net/wacz/archive/", warcLists[i]),
|
||||
);
|
||||
if (size < 10000) {
|
||||
rolloverSize = 1;
|
||||
}
|
||||
}
|
||||
|
@ -46,27 +51,57 @@ test("check that a combined warc file is under the rolloverSize", () => {
|
|||
});
|
||||
|
||||
test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
|
||||
expect(fs.existsSync("test-crawls/collections/wr-net/pages/pages.jsonl")).toBe(true);
|
||||
expect(
|
||||
fs.existsSync("test-crawls/collections/wr-net/pages/pages.jsonl"),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
|
||||
expect(fs.existsSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl")).toBe(true);
|
||||
expect(
|
||||
fs.existsSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl"),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
|
||||
const crawl_hash = md5(JSON.parse(fs.readFileSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
|
||||
const wacz_hash = md5(JSON.parse(fs.readFileSync("test-crawls/collections/wr-net/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
|
||||
const fixture_hash = md5(JSON.parse(fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1])["text"]);
|
||||
|
||||
const crawl_hash = md5(
|
||||
JSON.parse(
|
||||
fs
|
||||
.readFileSync(
|
||||
"test-crawls/collections/wr-net/wacz/pages/pages.jsonl",
|
||||
"utf8",
|
||||
)
|
||||
.split("\n")[1],
|
||||
)["text"],
|
||||
);
|
||||
const wacz_hash = md5(
|
||||
JSON.parse(
|
||||
fs
|
||||
.readFileSync(
|
||||
"test-crawls/collections/wr-net/pages/pages.jsonl",
|
||||
"utf8",
|
||||
)
|
||||
.split("\n")[1],
|
||||
)["text"],
|
||||
);
|
||||
const fixture_hash = md5(
|
||||
JSON.parse(
|
||||
fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1],
|
||||
)["text"],
|
||||
);
|
||||
|
||||
expect(wacz_hash).toEqual(fixture_hash);
|
||||
expect(wacz_hash).toEqual(crawl_hash);
|
||||
|
||||
});
|
||||
|
||||
test("check that the supplied title and description made it into datapackage.json", () => {
|
||||
expect(fs.existsSync("test-crawls/collections/wr-net/wacz/datapackage.json")).toBe(true);
|
||||
expect(
|
||||
fs.existsSync("test-crawls/collections/wr-net/wacz/datapackage.json"),
|
||||
).toBe(true);
|
||||
|
||||
const data = fs.readFileSync("test-crawls/collections/wr-net/wacz/datapackage.json", "utf8");
|
||||
const data = fs.readFileSync(
|
||||
"test-crawls/collections/wr-net/wacz/datapackage.json",
|
||||
"utf8",
|
||||
);
|
||||
const dataPackageJSON = JSON.parse(data);
|
||||
expect(dataPackageJSON.title).toEqual("test title");
|
||||
expect(dataPackageJSON.description).toEqual("test description");
|
||||
|
|
|
@ -6,21 +6,25 @@ function runCrawl(name, config, commandExtra = "") {
|
|||
config.generateCDX = true;
|
||||
config.depth = 0;
|
||||
config.collection = name;
|
||||
|
||||
|
||||
const configYaml = yaml.dump(config);
|
||||
|
||||
try {
|
||||
const proc = child_process.execSync(`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`, {input: configYaml, stdin: "inherit", encoding: "utf8"});
|
||||
const proc = child_process.execSync(
|
||||
`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`,
|
||||
{ input: configYaml, stdin: "inherit", encoding: "utf8" },
|
||||
);
|
||||
|
||||
console.log(proc);
|
||||
}
|
||||
catch (error) {
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
}
|
||||
|
||||
function doesCDXContain(coll, value) {
|
||||
const data = fs.readFileSync(`test-crawls/collections/${coll}/indexes/index.cdxj`);
|
||||
const data = fs.readFileSync(
|
||||
`test-crawls/collections/${coll}/indexes/index.cdxj`,
|
||||
);
|
||||
return data.indexOf(value) >= 0;
|
||||
}
|
||||
|
||||
|
@ -39,131 +43,154 @@ test("test crawl without block for specific URL", () => {
|
|||
});
|
||||
*/
|
||||
|
||||
|
||||
test("test block rule on specific URL", () => {
|
||||
const config = {
|
||||
"url": "https://www.iana.org/",
|
||||
"blockRules": [
|
||||
{"url": "adsense"}
|
||||
]
|
||||
url: "https://www.iana.org/",
|
||||
blockRules: [{ url: "adsense" }],
|
||||
};
|
||||
|
||||
runCrawl("block-1", config);
|
||||
|
||||
expect(doesCDXContain("block-1", "https://cse.google.com/adsense/search/async-ads.js")).toBe(false);
|
||||
expect(
|
||||
doesCDXContain(
|
||||
"block-1",
|
||||
"https://cse.google.com/adsense/search/async-ads.js",
|
||||
),
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
test("test block rule based on iframe text, content included due to match", () => {
|
||||
const config = {
|
||||
"url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
|
||||
"blockRules": [{
|
||||
"url": "https://www.youtube.com/embed/",
|
||||
"frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"",
|
||||
"type": "allowOnly"
|
||||
}]
|
||||
url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
|
||||
blockRules: [
|
||||
{
|
||||
url: "https://www.youtube.com/embed/",
|
||||
frameTextMatch:
|
||||
'\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
|
||||
type: "allowOnly",
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
runCrawl("block-2", config);
|
||||
|
||||
expect(doesCDXContain("block-2", "\"video/mp4\"")).toBe(true);
|
||||
expect(doesCDXContain("block-2", '"video/mp4"')).toBe(true);
|
||||
});
|
||||
|
||||
|
||||
test("test block rule based on iframe text, wrong text, content should be excluded", () => {
|
||||
const config = {
|
||||
"url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
|
||||
"blockRules": [{
|
||||
"url": "https://www.youtube.com/embed/",
|
||||
"frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_R\\\\\"",
|
||||
"type": "allowOnly"
|
||||
}]
|
||||
url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
|
||||
blockRules: [
|
||||
{
|
||||
url: "https://www.youtube.com/embed/",
|
||||
frameTextMatch:
|
||||
'\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_R\\\\"',
|
||||
type: "allowOnly",
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
runCrawl("block-3", config);
|
||||
|
||||
expect(doesCDXContain("block-3", "\"video/mp4\"")).toBe(false);
|
||||
expect(doesCDXContain("block-3", '"video/mp4"')).toBe(false);
|
||||
});
|
||||
|
||||
|
||||
test("test block rule based on iframe text, block matched", () => {
|
||||
const config = {
|
||||
"url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
|
||||
"blockRules": [{
|
||||
"url": "https://www.youtube.com/embed/",
|
||||
"frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"",
|
||||
}]
|
||||
url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
|
||||
blockRules: [
|
||||
{
|
||||
url: "https://www.youtube.com/embed/",
|
||||
frameTextMatch:
|
||||
'\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
runCrawl("block-4", config);
|
||||
|
||||
expect(doesCDXContain("block-4", "\"video/mp4\"")).toBe(false);
|
||||
expect(doesCDXContain("block-4", '"video/mp4"')).toBe(false);
|
||||
});
|
||||
|
||||
test("test rule based on iframe text not matching, plus allowOnly iframe", () => {
|
||||
const config = {
|
||||
"url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
|
||||
"blockRules": [{
|
||||
"url": "example.com/embed/",
|
||||
"frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"",
|
||||
"type": "block"
|
||||
}, {
|
||||
"url": "(youtube.com|example.com)/embed/",
|
||||
"type": "allowOnly",
|
||||
"inFrameUrl": "oembed.link/",
|
||||
}]
|
||||
url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
|
||||
blockRules: [
|
||||
{
|
||||
url: "example.com/embed/",
|
||||
frameTextMatch:
|
||||
'\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
|
||||
type: "block",
|
||||
},
|
||||
{
|
||||
url: "(youtube.com|example.com)/embed/",
|
||||
type: "allowOnly",
|
||||
inFrameUrl: "oembed.link/",
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
runCrawl("non-block-5", config);
|
||||
|
||||
expect(doesCDXContain("non-block-5", "\"video/mp4\"")).toBe(true);
|
||||
expect(doesCDXContain("non-block-5", '"video/mp4"')).toBe(true);
|
||||
});
|
||||
|
||||
test("test block url in frame url", () => {
|
||||
const config = {
|
||||
"url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
|
||||
"blockRules": [{
|
||||
"url": "maxresdefault.jpg",
|
||||
"type": "block",
|
||||
"inFrameUrl": "youtube.com/embed",
|
||||
}]
|
||||
url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
|
||||
blockRules: [
|
||||
{
|
||||
url: "maxresdefault.jpg",
|
||||
type: "block",
|
||||
inFrameUrl: "youtube.com/embed",
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
runCrawl("block-6", config);
|
||||
|
||||
expect(doesCDXContain("block-6", "\"https://i.ytimg.com/vi/aT-Up5Y4uRI/maxresdefault.jpg\"")).toBe(false);
|
||||
expect(
|
||||
doesCDXContain(
|
||||
"block-6",
|
||||
'"https://i.ytimg.com/vi/aT-Up5Y4uRI/maxresdefault.jpg"',
|
||||
),
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
|
||||
test("test block rules complex example, block external urls on main frame, but not on youtube", () => {
|
||||
const config = {
|
||||
"seeds": [
|
||||
"https://archiveweb.page/en/troubleshooting/errors/",
|
||||
seeds: ["https://archiveweb.page/en/troubleshooting/errors/"],
|
||||
depth: "0",
|
||||
blockRules: [
|
||||
{
|
||||
url: "(archiveweb.page|www.youtube.com)",
|
||||
type: "allowOnly",
|
||||
inFrameUrl: "archiveweb.page",
|
||||
},
|
||||
{
|
||||
url: "https://archiveweb.page/assets/js/vendor/lunr.min.js",
|
||||
inFrameUrl: "archiveweb.page",
|
||||
},
|
||||
{
|
||||
url: "https://www.youtube.com/embed/",
|
||||
type: "allowOnly",
|
||||
frameTextMatch:
|
||||
'(\\\\"channelId\\\\":\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\")',
|
||||
},
|
||||
],
|
||||
"depth": "0",
|
||||
"blockRules": [{
|
||||
"url": "(archiveweb.page|www.youtube.com)",
|
||||
"type": "allowOnly",
|
||||
"inFrameUrl": "archiveweb.page"
|
||||
}, {
|
||||
"url": "https://archiveweb.page/assets/js/vendor/lunr.min.js",
|
||||
"inFrameUrl": "archiveweb.page"
|
||||
}, {
|
||||
"url": "https://www.youtube.com/embed/",
|
||||
"type": "allowOnly",
|
||||
"frameTextMatch": "(\\\\\"channelId\\\\\":\\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\\")"
|
||||
}],
|
||||
|
||||
"combineWARC": true,
|
||||
combineWARC: true,
|
||||
|
||||
"logging": "stats,debug"
|
||||
logging: "stats,debug",
|
||||
};
|
||||
|
||||
|
||||
runCrawl("block-7", config);
|
||||
|
||||
expect(doesCDXContain("block-7", "\"https://archiveweb.page/assets/js/vendor/lunr.min.js\"")).toBe(false);
|
||||
expect(doesCDXContain("block-7", "\"video/mp4\"")).toBe(true);
|
||||
expect(
|
||||
doesCDXContain(
|
||||
"block-7",
|
||||
'"https://archiveweb.page/assets/js/vendor/lunr.min.js"',
|
||||
),
|
||||
).toBe(false);
|
||||
expect(doesCDXContain("block-7", '"video/mp4"')).toBe(true);
|
||||
});
@ -1,33 +1,32 @@
import util from "util";
import {exec as execCallback } from "child_process";
import { exec as execCallback } from "child_process";

const exec = util.promisify(execCallback);

test("check that the collection name is properly validated", async () => {
  let passed = "";

  try{
    await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid");
  try {
    await exec(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid",
    );
    passed = true;
  }
  catch (error) {
  } catch (error) {
    passed = false;
  }
  expect(passed).toBe(true);
});

test("check that the collection name is not accepted if it doesn't meets our standards", async () => {
  let passed = "";

  try{
    await exec("docker run webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid");
  try {
    await exec(
      "docker run webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid",
    );
    passed = true;
  }
  catch(e){
  } catch (e) {
    passed = false;
  }
  expect(passed).toBe(false);

});
@ -2,21 +2,23 @@ import fs from "fs";
|
|||
import yaml from "js-yaml";
|
||||
|
||||
import util from "util";
|
||||
import {exec as execCallback } from "child_process";
|
||||
import { exec as execCallback } from "child_process";
|
||||
|
||||
const exec = util.promisify(execCallback);
|
||||
|
||||
|
||||
test("check yaml config file with seed list is used", async () => {
|
||||
try{
|
||||
|
||||
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --config /tests/fixtures/crawl-1.yaml --depth 0");
|
||||
}
|
||||
catch (error) {
|
||||
try {
|
||||
await exec(
|
||||
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --config /tests/fixtures/crawl-1.yaml --depth 0",
|
||||
);
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
|
||||
const crawledPages = fs.readFileSync("test-crawls/collections/configtest/pages/pages.jsonl", "utf8");
|
||||
const crawledPages = fs.readFileSync(
|
||||
"test-crawls/collections/configtest/pages/pages.jsonl",
|
||||
"utf8",
|
||||
);
|
||||
const pages = new Set();
|
||||
|
||||
for (const line of crawledPages.trim().split("\n")) {
|
||||
|
@ -26,9 +28,11 @@ test("check yaml config file with seed list is used", async () => {
|
|||
}
|
||||
}
|
||||
|
||||
const config = yaml.load(fs.readFileSync("tests/fixtures/crawl-1.yaml", "utf8"));
|
||||
const config = yaml.load(
|
||||
fs.readFileSync("tests/fixtures/crawl-1.yaml", "utf8"),
|
||||
);
|
||||
|
||||
let foundAllSeeds = true;
|
||||
let foundAllSeeds = true;
|
||||
|
||||
for (const seed of config.seeds) {
|
||||
const url = new URL(seed).href;
|
||||
|
@ -38,20 +42,24 @@ test("check yaml config file with seed list is used", async () => {
|
|||
}
|
||||
expect(foundAllSeeds).toBe(true);
|
||||
|
||||
expect(fs.existsSync("test-crawls/collections/configtest/configtest.wacz")).toBe(true);
|
||||
|
||||
expect(
|
||||
fs.existsSync("test-crawls/collections/configtest/configtest.wacz"),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("check yaml config file will be overwritten by command line", async () => {
|
||||
try{
|
||||
|
||||
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://specs.webrecorder.net/ --scopeType page --timeout 20000");
|
||||
}
|
||||
catch (error) {
|
||||
try {
|
||||
await exec(
|
||||
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://specs.webrecorder.net/ --scopeType page --timeout 20000",
|
||||
);
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
|
||||
const crawledPages = fs.readFileSync("test-crawls/collections/configtest-2/pages/pages.jsonl", "utf8");
|
||||
const crawledPages = fs.readFileSync(
|
||||
"test-crawls/collections/configtest-2/pages/pages.jsonl",
|
||||
"utf8",
|
||||
);
|
||||
const pages = new Set();
|
||||
|
||||
for (const line of crawledPages.trim().split("\n")) {
|
||||
|
@ -63,5 +71,4 @@ test("check yaml config file will be overwritten by command line", async () => {
|
|||
|
||||
expect(pages.has("https://specs.webrecorder.net/")).toBe(true);
|
||||
expect(pages.size).toBe(1);
|
||||
|
||||
});
|
||||
|
|
|
@ -7,15 +7,20 @@ test("pass config file via stdin", async () => {
  const config = yaml.load(configYaml);

  try {
    const proc = child_process.execSync("docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202", {input: configYaml, stdin: "inherit", encoding: "utf8"});
    const proc = child_process.execSync(
      "docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202",
      { input: configYaml, stdin: "inherit", encoding: "utf8" },
    );

    console.log(proc);
  }
  catch (error) {
  } catch (error) {
    console.log(error);
  }

  const crawledPages = fs.readFileSync("test-crawls/collections/config-stdin/pages/pages.jsonl", "utf8");
  const crawledPages = fs.readFileSync(
    "test-crawls/collections/config-stdin/pages/pages.jsonl",
    "utf8",
  );
  const pages = new Set();

  for (const line of crawledPages.trim().split("\n")) {

@ -37,6 +42,7 @@ test("pass config file via stdin", async () => {
  }
  expect(foundAllSeeds).toBe(true);

  expect(fs.existsSync("test-crawls/collections/config-stdin/config-stdin.wacz")).toBe(true);

  expect(
    fs.existsSync("test-crawls/collections/config-stdin/config-stdin.wacz"),
  ).toBe(true);
});
@ -1,31 +1,48 @@
|
|||
import child_process from "child_process";
|
||||
import fs from "fs";
|
||||
|
||||
|
||||
test("ensure --overwrite with existing collection results in a successful crawl", async () => {
|
||||
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite");
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite",
|
||||
);
|
||||
|
||||
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite --overwrite");
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite --overwrite",
|
||||
);
|
||||
});
|
||||
|
||||
test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
|
||||
expect(fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl")).toBe(true);
|
||||
expect(
|
||||
fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl"),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("check that the WACZ file exists in the collection", () => {
|
||||
expect(fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl")).toBe(true);
|
||||
expect(
|
||||
fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl"),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
//-----------
|
||||
|
||||
test("ensure --overwrite results in a successful crawl even if collection didn't exist", async () => {
|
||||
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite-nothing --overwrite");
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite-nothing --overwrite",
|
||||
);
|
||||
});
|
||||
|
||||
test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
|
||||
expect(fs.existsSync("test-crawls/collections/overwrite-nothing/pages/pages.jsonl")).toBe(true);
|
||||
expect(
|
||||
fs.existsSync(
|
||||
"test-crawls/collections/overwrite-nothing/pages/pages.jsonl",
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("check that the WACZ file exists in the collection", () => {
|
||||
expect(fs.existsSync("test-crawls/collections/overwrite-nothing/pages/pages.jsonl")).toBe(true);
|
||||
expect(
|
||||
fs.existsSync(
|
||||
"test-crawls/collections/overwrite-nothing/pages/pages.jsonl",
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
|
|
|
@ -1,23 +1,36 @@
|
|||
import child_process from "child_process";
|
||||
|
||||
test("test custom behaviors", async () => {
|
||||
const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page");
|
||||
const res = child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
|
||||
);
|
||||
|
||||
const log = res.toString();
|
||||
|
||||
// custom behavior ran for example.com
|
||||
expect(log.indexOf("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.com/\",\"workerid\":0}}") > 0).toBe(true);
|
||||
expect(
|
||||
log.indexOf(
|
||||
'{"state":{},"msg":"test-stat","page":"https://example.com/","workerid":0}}',
|
||||
) > 0,
|
||||
).toBe(true);
|
||||
|
||||
// but not for example.org
|
||||
expect(log.indexOf("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.org/\",\"workerid\":0}}") > 0).toBe(false);
|
||||
expect(
|
||||
log.indexOf(
|
||||
'{"state":{},"msg":"test-stat","page":"https://example.org/","workerid":0}}',
|
||||
) > 0,
|
||||
).toBe(false);
|
||||
|
||||
expect(log.indexOf("{\"state\":{\"segments\":1},\"msg\":\"Skipping autoscroll, page seems to not be responsive to scrolling events\",\"page\":\"https://example.org/\",\"workerid\":0}}") > 0).toBe(true);
|
||||
expect(
|
||||
log.indexOf(
|
||||
'{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events","page":"https://example.org/","workerid":0}}',
|
||||
) > 0,
|
||||
).toBe(true);
|
||||
|
||||
// another custom behavior ran for webrecorder.net
|
||||
expect(log.indexOf("{\"state\":{},\"msg\":\"test-stat-2\",\"page\":\"https://webrecorder.net/\",\"workerid\":0}}") > 0).toBe(true);
|
||||
|
||||
|
||||
|
||||
expect(
|
||||
log.indexOf(
|
||||
'{"state":{},"msg":"test-stat-2","page":"https://webrecorder.net/","workerid":0}}',
|
||||
) > 0,
|
||||
).toBe(true);
|
||||
});
@ -1,9 +1,8 @@
/* eslint-disable @typescript-eslint/no-unused-vars */
class TestBehavior2
{
class TestBehavior2 {
  static init() {
    return {
      state: {}
      state: {},
    };
  }

@ -15,8 +14,7 @@ class TestBehavior2
    return window.location.origin === "https://webrecorder.net";
  }

  async* run(ctx) {
  async *run(ctx) {
    ctx.log("In Test Behavior 2!");
    yield ctx.Lib.getState(ctx, "test-stat-2");
  }
@ -1,9 +1,8 @@
/* eslint-disable @typescript-eslint/no-unused-vars */
class TestBehavior
{
class TestBehavior {
  static init() {
    return {
      state: {}
      state: {},
    };
  }

@ -15,8 +14,7 @@ class TestBehavior
    return window.location.origin === "https://example.com";
  }

  async* run(ctx) {
  async *run(ctx) {
    ctx.log("In Test Behavior!");
    yield ctx.Lib.getState(ctx, "test-stat");
  }
@ -1,16 +1,19 @@
import child_process from "child_process";
import fs from "fs";

test("ensure custom driver with custom selector crawls JS files as pages", async () => {
  try {
    child_process.execSync("docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs");
  }
  catch (error) {
    child_process.execSync(
      "docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs",
    );
  } catch (error) {
    console.log(error);
  }

  const crawledPages = fs.readFileSync("test-crawls/collections/custom-driver-1/pages/pages.jsonl", "utf8");
  const crawledPages = fs.readFileSync(
    "test-crawls/collections/custom-driver-1/pages/pages.jsonl",
    "utf8",
  );
  const pages = new Set();

  for (const line of crawledPages.trim().split("\n")) {

@ -26,9 +29,8 @@ test("ensure custom driver with custom selector crawls JS files as pages", async
  const expectedPages = new Set([
    "https://www.iana.org/",
    "https://www.iana.org/_js/jquery.js",
    "https://www.iana.org/_js/iana.js"
    "https://www.iana.org/_js/iana.js",
  ]);

  expect(pages).toEqual(expectedPages);

});
@ -1,42 +1,49 @@
|
|||
import fs from "fs";
|
||||
|
||||
import util from "util";
|
||||
import {exec as execCallback } from "child_process";
|
||||
import { exec as execCallback } from "child_process";
|
||||
|
||||
const exec = util.promisify(execCallback);
|
||||
|
||||
const extraHopsTimeout = 180000;
|
||||
|
||||
|
||||
test("check that URLs are crawled 2 extra hops beyond depth", async () => {
|
||||
try {
|
||||
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://webrecorder.net/ --limit 7");
|
||||
}
|
||||
catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
|
||||
const crawledPages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8");
|
||||
const crawledPagesArray = crawledPages.trim().split("\n");
|
||||
|
||||
const expectedPages = [
|
||||
"https://webrecorder.net/",
|
||||
"https://webrecorder.net/blog",
|
||||
"https://webrecorder.net/tools",
|
||||
"https://webrecorder.net/community",
|
||||
"https://webrecorder.net/about",
|
||||
"https://webrecorder.net/contact",
|
||||
"https://webrecorder.net/faq",
|
||||
];
|
||||
|
||||
// first line is the header, not page, so adding -1
|
||||
expect(crawledPagesArray.length - 1).toEqual(expectedPages.length);
|
||||
|
||||
for (const page of crawledPagesArray) {
|
||||
const url = JSON.parse(page).url;
|
||||
if (!url) {
|
||||
continue;
|
||||
test(
|
||||
"check that URLs are crawled 2 extra hops beyond depth",
|
||||
async () => {
|
||||
try {
|
||||
await exec(
|
||||
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://webrecorder.net/ --limit 7",
|
||||
);
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
expect(expectedPages.indexOf(url) >= 0).toBe(true);
|
||||
}
|
||||
}, extraHopsTimeout);
|
||||
|
||||
const crawledPages = fs.readFileSync(
|
||||
"test-crawls/collections/extra-hops-beyond/pages/pages.jsonl",
|
||||
"utf8",
|
||||
);
|
||||
const crawledPagesArray = crawledPages.trim().split("\n");
|
||||
|
||||
const expectedPages = [
|
||||
"https://webrecorder.net/",
|
||||
"https://webrecorder.net/blog",
|
||||
"https://webrecorder.net/tools",
|
||||
"https://webrecorder.net/community",
|
||||
"https://webrecorder.net/about",
|
||||
"https://webrecorder.net/contact",
|
||||
"https://webrecorder.net/faq",
|
||||
];
|
||||
|
||||
// first line is the header, not page, so adding -1
|
||||
expect(crawledPagesArray.length - 1).toEqual(expectedPages.length);
|
||||
|
||||
for (const page of crawledPagesArray) {
|
||||
const url = JSON.parse(page).url;
|
||||
if (!url) {
|
||||
continue;
|
||||
}
|
||||
expect(expectedPages.indexOf(url) >= 0).toBe(true);
|
||||
}
|
||||
},
|
||||
extraHopsTimeout,
|
||||
);
|
||||
|
|
|
@ -2,17 +2,18 @@ import child_process from "child_process";
|
|||
import fs from "fs";
|
||||
|
||||
test("ensure that stats file is modified", async () => {
|
||||
|
||||
const child = child_process.exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --limit 3 --collection file-stats --statsFilename progress.json");
|
||||
const child = child_process.exec(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --limit 3 --collection file-stats --statsFilename progress.json",
|
||||
);
|
||||
|
||||
// detect crawler exit
|
||||
let crawler_exited = false;
|
||||
child.on("exit", function() {
|
||||
child.on("exit", function () {
|
||||
crawler_exited = true;
|
||||
});
|
||||
|
||||
// helper function to sleep
|
||||
const sleep = ms => new Promise(res => setTimeout(res, ms));
|
||||
const sleep = (ms) => new Promise((res) => setTimeout(res, ms));
|
||||
|
||||
// wait for stats file creation up to 30 secs (to not wait indefinitely)
|
||||
let counter = 0;
|
||||
|
@ -23,7 +24,9 @@ test("ensure that stats file is modified", async () => {
|
|||
}
|
||||
|
||||
// get initial modification time
|
||||
const initial_mtime = fs.fstatSync(fs.openSync("test-crawls/progress.json", "r")).mtime;
|
||||
const initial_mtime = fs.fstatSync(
|
||||
fs.openSync("test-crawls/progress.json", "r"),
|
||||
).mtime;
|
||||
|
||||
// wait for crawler exit
|
||||
while (!crawler_exited) {
|
||||
|
@ -31,12 +34,13 @@ test("ensure that stats file is modified", async () => {
|
|||
}
|
||||
|
||||
// get final modification time
|
||||
const final_mtime = fs.fstatSync(fs.openSync("test-crawls/progress.json", "r")).mtime;
|
||||
const final_mtime = fs.fstatSync(
|
||||
fs.openSync("test-crawls/progress.json", "r"),
|
||||
).mtime;
|
||||
|
||||
// compare initial and final modification time
|
||||
const diff = Math.abs(final_mtime - initial_mtime);
|
||||
expect(diff > 0).toBe(true);
|
||||
|
||||
});
|
||||
|
||||
test("check that stats file format is correct", () => {
|
||||
|
|
1
tests/fixtures/crawl-1.yaml
vendored
@ -5,4 +5,3 @@ seeds:
  - https://specs.webrecorder.net/

generateWACZ: true
7
tests/fixtures/driver-1.mjs
vendored
@ -1,4 +1,5 @@
export default async ({data, page, crawler}) => {
  await crawler.loadPage(page, data, [{selector: "script[src]", extract: "src", isAttribute: false}]);
export default async ({ data, page, crawler }) => {
  await crawler.loadPage(page, data, [
    { selector: "script[src]", extract: "src", isAttribute: false },
  ]);
};
@ -2,8 +2,9 @@ import child_process from "child_process";
import fs from "fs";

test("ensure page limit reached", async () => {
  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors \"\" --url https://webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json");

  child_process.execSync(
    'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors "" --url https://webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json',
  );
});

test("check limit written to stats file is as expected", () => {
@ -2,9 +2,9 @@ import child_process from "child_process";
|
|||
import fs from "fs";
|
||||
import path from "path";
|
||||
|
||||
|
||||
function jsonLinesToArray(string) {
|
||||
return string.split("\n")
|
||||
return string
|
||||
.split("\n")
|
||||
.filter((line) => {
|
||||
try {
|
||||
JSON.parse(line);
|
||||
|
@ -13,19 +13,19 @@ function jsonLinesToArray(string) {
|
|||
return false;
|
||||
}
|
||||
})
|
||||
.map(line => JSON.parse(line));
|
||||
.map((line) => JSON.parse(line));
|
||||
}
|
||||
|
||||
|
||||
test("ensure crawl run with log options passes", async () => {
|
||||
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://specs.webrecorder.net --generateWACZ --collection wr-specs-logs --logging debug,stats --logLevel debug,warn --context general");
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://specs.webrecorder.net --generateWACZ --collection wr-specs-logs --logging debug,stats --logLevel debug,warn --context general",
|
||||
);
|
||||
});
|
||||
|
||||
|
||||
test("check that log files exist and were filtered according to options", () => {
|
||||
const logDir = "test-crawls/collections/wr-specs-logs/logs/";
|
||||
const logFiles = [];
|
||||
fs.readdirSync(logDir).forEach(file => {
|
||||
fs.readdirSync(logDir).forEach((file) => {
|
||||
if (file.startsWith("crawl-") && file.endsWith(".log")) {
|
||||
logFiles.push(path.join(logDir, file));
|
||||
}
|
||||
|
@ -33,14 +33,16 @@ test("check that log files exist and were filtered according to options", () =>
|
|||
|
||||
expect(logFiles.length).toBeGreaterThan(0);
|
||||
|
||||
for (let i=0; i < logFiles.length; i++) {
|
||||
for (let i = 0; i < logFiles.length; i++) {
|
||||
const logFile = logFiles[i];
|
||||
const parsedJSONLines = jsonLinesToArray(fs.readFileSync(logFile, "utf8"));
|
||||
|
||||
expect(parsedJSONLines.length).toBeGreaterThan(0);
|
||||
|
||||
parsedJSONLines.forEach((jsonLine) => {
|
||||
expect(jsonLine.logLevel === "debug" || jsonLine.logLevel === "warn").toBe(true);
|
||||
expect(
|
||||
jsonLine.logLevel === "debug" || jsonLine.logLevel === "warn",
|
||||
).toBe(true);
|
||||
expect(jsonLine.context).toBe("general");
|
||||
});
|
||||
}
|
||||
|
|
|
@ -2,24 +2,47 @@ import child_process from "child_process";
|
|||
import fs from "fs";
|
||||
|
||||
test("ensure multi url crawl run with docker run passes", async () => {
|
||||
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title \"test title\" --description \"test description\" --pages 2 --limit 2");
|
||||
|
||||
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz");
|
||||
child_process.execSync(
|
||||
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2',
|
||||
);
|
||||
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz",
|
||||
);
|
||||
});
|
||||
|
||||
test("check that the favicon made it into the pages jsonl file", () => {
|
||||
expect(fs.existsSync("test-crawls/collections/advanced/pages/pages.jsonl")).toBe(true);
|
||||
expect(
|
||||
fs.existsSync("test-crawls/collections/advanced/pages/pages.jsonl"),
|
||||
).toBe(true);
|
||||
|
||||
const data1 = JSON.parse(fs.readFileSync("test-crawls/collections/advanced/pages/pages.jsonl", "utf8").split("\n")[1]);
|
||||
const data2 = JSON.parse(fs.readFileSync("test-crawls/collections/advanced/pages/pages.jsonl", "utf8").split("\n")[2]);
|
||||
const data = [ data1, data2 ];
|
||||
const data1 = JSON.parse(
|
||||
fs
|
||||
.readFileSync(
|
||||
"test-crawls/collections/advanced/pages/pages.jsonl",
|
||||
"utf8",
|
||||
)
|
||||
.split("\n")[1],
|
||||
);
|
||||
const data2 = JSON.parse(
|
||||
fs
|
||||
.readFileSync(
|
||||
"test-crawls/collections/advanced/pages/pages.jsonl",
|
||||
"utf8",
|
||||
)
|
||||
.split("\n")[2],
|
||||
);
|
||||
const data = [data1, data2];
|
||||
for (const d of data) {
|
||||
if (d.url === "https://webrecorder.net/") {
|
||||
expect(d.favIconUrl).toEqual("https://webrecorder.net/assets/favicon.ico");
|
||||
expect(d.favIconUrl).toEqual(
|
||||
"https://webrecorder.net/assets/favicon.ico",
|
||||
);
|
||||
}
|
||||
if (d.url === "https://iana.org/") {
|
||||
expect(d.favIconUrl).toEqual("https://www.iana.org/_img/bookmark_icon.ico");
|
||||
expect(d.favIconUrl).toEqual(
|
||||
"https://www.iana.org/_img/bookmark_icon.ico",
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
|
|
@ -1,14 +1,19 @@
import child_process from "child_process";

test("ensure crawl run with redis passes", async () => {
  const redis = child_process.spawn("docker run -d --name test-crawl-redis -p 6379:6379 redis");
  const redis = child_process.spawn(
    "docker run -d --name test-crawl-redis -p 6379:6379 redis",
  );

  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection redis-crawl --redisStoreUrl redis://127.0.0.1:6379 --workers 2");
  child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection redis-crawl --redisStoreUrl redis://127.0.0.1:6379 --workers 2",
  );

  redis.kill("SIGINT");
});

test("check that wacz created is valid", () => {
  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/redis-crawl/redis-crawl.wacz");
  child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/redis-crawl/redis-crawl.wacz",
  );
});
@ -13,7 +13,7 @@ function waitForProcess() {
|
|||
};
|
||||
});
|
||||
|
||||
return {p, callback};
|
||||
return { p, callback };
|
||||
}
|
||||
|
||||
var savedStateFile;
|
||||
|
@ -28,9 +28,12 @@ test("check crawl interrupted + saved state written", async () => {
|
|||
const wait = waitForProcess();
|
||||
|
||||
try {
|
||||
proc = exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --limit 20", {"shell": "/bin/bash"}, wait.callback);
|
||||
}
|
||||
catch (error) {
|
||||
proc = exec(
|
||||
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --limit 20",
|
||||
{ shell: "/bin/bash" },
|
||||
wait.callback,
|
||||
);
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
|
||||
|
@ -45,12 +48,15 @@ test("check crawl interrupted + saved state written", async () => {
|
|||
|
||||
while (true) {
|
||||
try {
|
||||
const pages = fs.readFileSync(pagesFile, {encoding: "utf-8"}).trim().split("\n");
|
||||
const pages = fs
|
||||
.readFileSync(pagesFile, { encoding: "utf-8" })
|
||||
.trim()
|
||||
.split("\n");
|
||||
|
||||
if (pages.length >= 2) {
|
||||
break;
|
||||
}
|
||||
} catch(e) {
|
||||
} catch (e) {
|
||||
// ignore
|
||||
}
|
||||
|
||||
|
@ -61,18 +67,22 @@ test("check crawl interrupted + saved state written", async () => {
|
|||
|
||||
await wait.p;
|
||||
|
||||
const savedStates = fs.readdirSync("test-crawls/collections/int-state-test/crawls");
|
||||
const savedStates = fs.readdirSync(
|
||||
"test-crawls/collections/int-state-test/crawls",
|
||||
);
|
||||
expect(savedStates.length > 0).toEqual(true);
|
||||
|
||||
savedStateFile = savedStates[savedStates.length - 1];
|
||||
});
|
||||
|
||||
|
||||
test("check parsing saved state + page done + queue present", () => {
|
||||
expect(savedStateFile).toBeTruthy();
|
||||
|
||||
const savedState = fs.readFileSync(path.join("test-crawls/collections/int-state-test/crawls", savedStateFile), "utf-8");
|
||||
|
||||
const savedState = fs.readFileSync(
|
||||
path.join("test-crawls/collections/int-state-test/crawls", savedStateFile),
|
||||
"utf-8",
|
||||
);
|
||||
|
||||
const saved = yaml.load(savedState);
|
||||
|
||||
expect(!!saved.state).toBe(true);
|
||||
|
@ -82,31 +92,33 @@ test("check parsing saved state + page done + queue present", () => {
|
|||
|
||||
expect(state.done > 0).toEqual(true);
|
||||
expect(state.queued.length > 0).toEqual(true);
|
||||
|
||||
});
|
||||
|
||||
|
||||
test("check crawl restarted with saved state", async () => {
|
||||
let proc = null;
|
||||
|
||||
const wait = waitForProcess();
|
||||
|
||||
try {
|
||||
proc = exec(`docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`, {shell: "/bin/bash"}, wait.callback);
|
||||
proc = exec(
|
||||
`docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`,
|
||||
{ shell: "/bin/bash" },
|
||||
wait.callback,
|
||||
);
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 2000));
|
||||
|
||||
redis = new Redis("redis://127.0.0.1:36379/0", {lazyConnect: true});
|
||||
redis = new Redis("redis://127.0.0.1:36379/0", { lazyConnect: true });
|
||||
|
||||
try {
|
||||
await redis.connect({
|
||||
maxRetriesPerRequest: 100,
|
||||
retryStrategy(times) {
|
||||
return times < 100 ? 1000 : null;
|
||||
}
|
||||
},
|
||||
});
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 2000));
|
||||
|
@ -126,5 +138,3 @@ test("interrupt crawl and exit", async () => {
|
|||
|
||||
expect(res[0].value).toBe(0);
|
||||
});
|
||||
|
||||
|
||||
|
|
|
@@ -23,12 +23,10 @@ seeds:
`);

  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("prefix");
  expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
  expect(seeds[0].exclude).toEqual([]);
});

test("default scope + exclude", async () => {

@@ -40,15 +38,12 @@ exclude: https://example.com/pathexclude
`);

  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("prefix");
  expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
  expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});

test("default scope + exclude is numeric", async () => {
  const seeds = getSeeds(`
seeds:

@@ -58,17 +53,12 @@ exclude: "2022"
`);

  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("prefix");
  expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
  expect(seeds[0].exclude).toEqual([/2022/]);
});

test("prefix scope global + exclude", async () => {
  const seeds = getSeeds(`
seeds:

@@ -79,15 +69,12 @@ exclude: https://example.com/pathexclude
`);

  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("prefix");
  expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
  expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});

test("prefix scope per seed + exclude", async () => {
  const seeds = getSeeds(`
seeds:

@@ -98,15 +85,12 @@ exclude: https://example.com/pathexclude
`);

  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("prefix");
  expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
  expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});

test("host scope and domain scope", async () => {
  const seeds = getSeeds(`

@@ -123,20 +107,26 @@ seeds:
  expect(seeds[0].include).toEqual([/^https?:\/\/([^/]+\.)*example\.com\//]);
  expect(!!seeds[0].include[0].exec("https://example.com/")).toEqual(true);
  expect(!!seeds[0].include[0].exec("https://example.com/path")).toEqual(true);
  expect(!!seeds[0].include[0].exec("https://sub.example.com/path")).toEqual(true);
  expect(!!seeds[0].include[0].exec("https://sub.domain.example.com/path")).toEqual(true);
  expect(!!seeds[0].include[0].exec("https://notsub.domainexample.com/path")).toEqual(false);
  expect(!!seeds[0].include[0].exec("https://sub.example.com/path")).toEqual(
    true,
  );
  expect(
    !!seeds[0].include[0].exec("https://sub.domain.example.com/path"),
  ).toEqual(true);
  expect(
    !!seeds[0].include[0].exec("https://notsub.domainexample.com/path"),
  ).toEqual(false);

  expect(seeds[1].scopeType).toEqual("host");
  expect(seeds[1].include).toEqual([/^https?:\/\/example\.org\//]);
  expect(!!seeds[1].include[0].exec("https://example.org/")).toEqual(true);
  expect(!!seeds[1].include[0].exec("https://example.org/path")).toEqual(true);
  expect(!!seeds[1].include[0].exec("https://sub.example.com/path")).toEqual(false);
  expect(!!seeds[1].include[0].exec("https://sub.example.com/path")).toEqual(
    false,
  );
});

test("domain scope drop www.", async () => {
  const seeds = getSeeds(`
seeds:
  - url: https://www.example.com/

@@ -146,11 +136,8 @@ seeds:
  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("domain");
  expect(seeds[0].include).toEqual([/^https?:\/\/([^/]+\.)*example\.com\//]);
});

test("custom scope", async () => {
  const seeds = getSeeds(`
seeds:

@@ -159,14 +146,12 @@ seeds:
exclude: https?://example.com/pathexclude
`);

  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("custom");
  expect(seeds[0].include).toEqual([/https?:\/\/example.com\/(path|other)/]);
  expect(seeds[0].exclude).toEqual([/https?:\/\/example.com\/pathexclude/]);
});

test("inherit scope", async () => {
  const seeds = getSeeds(`

@@ -178,7 +163,6 @@ include: https?://example.com/(path|other)
exclude: https://example.com/pathexclude
`);

  expect(seeds.length).toEqual(2);

  expect(seeds[0].scopeType).toEqual("custom");

@@ -190,10 +174,8 @@ exclude: https://example.com/pathexclude
  expect(seeds[1].url).toEqual("https://example.com/2");
  expect(seeds[1].include).toEqual([/https?:\/\/example.com\/(path|other)/]);
  expect(seeds[1].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});

test("override scope", async () => {
  const seeds = getSeeds(`

@@ -225,7 +207,10 @@ include: https://example.com/onlythispath
  expect(seeds[2].scopeType).toEqual("prefix");
  expect(seeds[2].url).toEqual("https://example.com/subpath/file.html");
  expect(seeds[2].include).toEqual([/^https?:\/\/example\.com\/subpath\//, /https:\/\/example.com\/onlythispath/]);
  expect(seeds[2].include).toEqual([
    /^https?:\/\/example\.com\/subpath\//,
    /https:\/\/example.com\/onlythispath/,
  ]);
  expect(seeds[2].exclude).toEqual([]);

  expect(seeds[3].scopeType).toEqual("custom");

@@ -234,7 +219,6 @@ include: https://example.com/onlythispath
  expect(seeds[3].exclude).toEqual([]);
});

test("override scope with exclude", async () => {
  const seeds = getSeeds(`

@@ -288,10 +272,8 @@ exclude:
  expect(seeds[4].url).toEqual("https://example.com/4");
  expect(seeds[4].include).toEqual([]);
  expect(seeds[4].exclude).toEqual([]);
});

test("with exclude non-string types", async () => {
  const seeds = getSeeds(`
seeds:

@@ -342,5 +324,4 @@ seeds:
  expect(seeds[7].exclude).toEqual([/null/]);
  expect(seeds[8].exclude).toEqual([/false/]);
  expect(seeds[9].exclude).toEqual([/true/]);
});
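The assertions above pin down how each scope type maps a seed URL to an include pattern: prefix scope anchors on the seed's path prefix, host scope anchors on the exact hostname, and domain scope also matches subdomains while dropping a leading `www.`. As a rough sketch of that mapping (hypothetical helper names, not the crawler's actual ScopedSeed code), the following derives equivalent patterns from a seed URL:

```javascript
// Illustrative only: builds include patterns equivalent to those asserted above.
function escapeRx(s) {
  // escape regex metacharacters in the literal URL parts
  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

function prefixInclude(seedUrl) {
  const url = new URL(seedUrl);
  // keep everything up to and including the last "/" of the path
  const dir = url.pathname.slice(0, url.pathname.lastIndexOf("/") + 1);
  return new RegExp("^https?://" + escapeRx(url.host + dir));
}

function domainInclude(seedUrl) {
  const url = new URL(seedUrl);
  // domain scope also matches subdomains and drops a leading "www."
  const host = url.host.replace(/^www\./, "");
  return new RegExp("^https?://([^/]+\\.)*" + escapeRx(host) + "/");
}

console.log(prefixInclude("https://example.com/")); // ~ /^https?:\/\/example\.com\//
console.log(domainInclude("https://www.example.com/")); // ~ /^https?:\/\/([^/]+\.)*example\.com\//
```

Host scope is the same construction without the `([^/]+\.)*` subdomain group, which is why `https://sub.example.com/path` matches the domain-scoped seed but not the host-scoped one in the assertions above.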
@ -4,48 +4,66 @@ import fs from "fs";
|
|||
// screenshot
|
||||
|
||||
test("ensure basic crawl run with --screenshot passes", async () => {
|
||||
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection test --url http://www.example.com/ --screenshot view --workers 2");
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection test --url http://www.example.com/ --screenshot view --workers 2",
|
||||
);
|
||||
});
|
||||
|
||||
test("check that a screenshots warc file exists in the test collection", () => {
|
||||
const screenshotWarcExists = fs.existsSync("test-crawls/collections/test/archive/screenshots.warc.gz");
|
||||
const screenshotWarcExists = fs.existsSync(
|
||||
"test-crawls/collections/test/archive/screenshots.warc.gz",
|
||||
);
|
||||
expect(screenshotWarcExists).toBe(true);
|
||||
});
|
||||
|
||||
// fullPageScreenshot
|
||||
|
||||
test("ensure basic crawl run with --fullPageScreenshot passes", async () => {
|
||||
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection fullpage --url http://www.example.com/ --screenshot fullPage --workers 2");
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection fullpage --url http://www.example.com/ --screenshot fullPage --workers 2",
|
||||
);
|
||||
});
|
||||
|
||||
test("check that a screenshots warc file exists in the fullpage collection", () => {
|
||||
const screenshotWarcExists = fs.existsSync("test-crawls/collections/fullpage/archive/screenshots.warc.gz");
|
||||
const screenshotWarcExists = fs.existsSync(
|
||||
"test-crawls/collections/fullpage/archive/screenshots.warc.gz",
|
||||
);
|
||||
expect(screenshotWarcExists).toBe(true);
|
||||
});
|
||||
|
||||
// thumbnail
|
||||
|
||||
test("ensure basic crawl run with --thumbnail passes", async () => {
|
||||
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection thumbnail --url http://www.example.com/ --screenshot thumbnail --workers 2");
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection thumbnail --url http://www.example.com/ --screenshot thumbnail --workers 2",
|
||||
);
|
||||
});
|
||||
|
||||
test("check that a screenshots warc file exists in the thumbnail collection", () => {
|
||||
const screenshotWarcExists = fs.existsSync("test-crawls/collections/thumbnail/archive/screenshots.warc.gz");
|
||||
const screenshotWarcExists = fs.existsSync(
|
||||
"test-crawls/collections/thumbnail/archive/screenshots.warc.gz",
|
||||
);
|
||||
expect(screenshotWarcExists).toBe(true);
|
||||
});
|
||||
|
||||
// combination
|
||||
|
||||
test("ensure basic crawl run with multiple screenshot types and --generateWACZ passes", async () => {
|
||||
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection combined --url http://www.example.com/ --screenshot thumbnail,view,fullPage --generateWACZ --workers 2");
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection combined --url http://www.example.com/ --screenshot thumbnail,view,fullPage --generateWACZ --workers 2",
|
||||
);
|
||||
});
|
||||
|
||||
test("check that a screenshots warc file exists in the combined collection", () => {
|
||||
const screenshotWarcExists = fs.existsSync("test-crawls/collections/combined/archive/screenshots.warc.gz");
|
||||
const screenshotWarcExists = fs.existsSync(
|
||||
"test-crawls/collections/combined/archive/screenshots.warc.gz",
|
||||
);
|
||||
expect(screenshotWarcExists).toBe(true);
|
||||
});
|
||||
|
||||
test("check that a wacz file exists in the combined collection", () => {
|
||||
const waczExists = fs.existsSync("test-crawls/collections/combined/combined.wacz");
|
||||
const waczExists = fs.existsSync(
|
||||
"test-crawls/collections/combined/combined.wacz",
|
||||
);
|
||||
expect(waczExists).toBe(true);
|
||||
});
|
||||
|
|
|
@@ -1,13 +1,14 @@
import util from "util";
import {exec as execCallback } from "child_process";
import { exec as execCallback } from "child_process";

const exec = util.promisify(execCallback);

test("ensure one invalid seed doesn't end crawl if failOnFailedSeed is not set", async () => {
  let passed = true;
  try {
    await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection invalidseed");
    await exec(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection invalidseed",
    );
  } catch (error) {
    console.log(error);
    passed = false;

@@ -18,9 +19,10 @@ test("ensure one invalid seed doesn't end crawl if failOnFailedSeed is not set",
test("ensure one invalid seed fails crawl if failOnFailedSeed is set", async () => {
  let passed = true;
  try {
    await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --failOnFailedSeed --collection failseed");
  }
  catch (error) {
    await exec(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --failOnFailedSeed --collection failseed",
    );
  } catch (error) {
    passed = false;
  }
  expect(passed).toBe(false);

@@ -29,9 +31,10 @@ test("ensure one invalid seed fails crawl if failOnFailedSeed is set", async ()
test("ensure crawl fails if no valid seeds are passed", async () => {
  let passed = true;
  try {
    await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection allinvalidseeds");
  }
  catch (error) {
    await exec(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection allinvalidseeds",
    );
  } catch (error) {
    passed = false;
  }
  expect(passed).toBe(false);
@@ -1,5 +1,7 @@
import { calculatePercentageUsed, checkDiskUtilization } from "../dist/util/storage.js";
import {
  calculatePercentageUsed,
  checkDiskUtilization,
} from "../dist/util/storage.js";

test("ensure calculatePercentageUsed returns expected values", () => {
  expect(calculatePercentageUsed(30, 100)).toEqual(30);

@@ -13,13 +15,11 @@ test("ensure calculatePercentageUsed returns expected values", () => {
  expect(calculatePercentageUsed(0, 5)).toEqual(0);
});

test("verify end-to-end disk utilization not exceeded threshold", async () => {
  const params = {
    diskUtilization: 90,
    combineWARC: true,
    generateWACZ: true
    generateWACZ: true,
  };

  const mockDfOutput = `\

@@ -28,22 +28,24 @@ grpcfuse 1000000 285000 715000 28% /crawls`;

  // with combineWARC + generateWACZ, projected is 285k + 4 * 5k = 310k = 31%
  // does not exceed 90% threshold
  const returnValue = await checkDiskUtilization(params, 5000 * 1024, mockDfOutput);
  const returnValue = await checkDiskUtilization(
    params,
    5000 * 1024,
    mockDfOutput,
  );
  expect(returnValue).toEqual({
    stop: false,
    used: 28,
    projected: 31,
    threshold: 90
    threshold: 90,
  });
});

test("verify end-to-end disk utilization exceeds threshold", async () => {
  const params = {
    diskUtilization: 90,
    combineWARC: false,
    generateWACZ: true
    generateWACZ: true,
  };

  const mockDfOutput = `\

@@ -52,11 +54,15 @@ grpcfuse 100000 85000 15000 85% /crawls`;

  // with generateWACZ, projected is 85k + 3k x 2 = 91k = 91%
  // exceeds 90% threshold
  const returnValue = await checkDiskUtilization(params, 3000 * 1024, mockDfOutput);
  const returnValue = await checkDiskUtilization(
    params,
    3000 * 1024,
    mockDfOutput,
  );
  expect(returnValue).toEqual({
    stop: true,
    used: 85,
    projected: 91,
    threshold: 90
    threshold: 90,
  });
});
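For reference, the percentage helper these tests exercise can be reproduced with a one-liner; the sketch below is a minimal reimplementation consistent with the values asserted above (rounding behavior is assumed), not the actual code in `dist/util/storage.js`.

```javascript
// Minimal sketch, illustrative only: percentage of total blocks in use.
function calculatePercentageUsed(used, total) {
  return Math.round((used / total) * 100);
}

console.log(calculatePercentageUsed(30, 100)); // 30
console.log(calculatePercentageUsed(0, 5)); // 0
// projections from the test comments: 310k of 1000k blocks, 91k of 100k blocks
console.log(calculatePercentageUsed(310000, 1000000)); // 31 -> stays under the 90% threshold
console.log(calculatePercentageUsed(91000, 100000)); // 91 -> exceeds the 90% threshold
```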
@@ -3,16 +3,20 @@ import child_process from "child_process";

test("check that urn:text and urn:textfinal records are written to WARC", async () => {
  try {
    child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc");
    child_process.execSync(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc",
    );
  } catch (error) {
    //console.log(new TextDecoder().decode(error));
    console.log(error.stderr);
  }

  const data = fs.readFileSync("test-crawls/collections/text-extract/indexes/index.cdxj", {"encoding": "utf-8"});
  const data = fs.readFileSync(
    "test-crawls/collections/text-extract/indexes/index.cdxj",
    { encoding: "utf-8" },
  );

  expect(data.indexOf("urn:text:https://www.nytimes.com/") > 0).toBe(true);
  expect(data.indexOf("urn:text:https://www.nytimes.com/") > 0).toBe(true);

  expect(data.indexOf("urn:textFinal:https://www.nytimes.com/") > 0).toBe(true);
  expect(data.indexOf("urn:textFinal:https://www.nytimes.com/") > 0).toBe(true);
});
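The test above only checks that the `urn:text:` and `urn:textFinal:` identifiers appear somewhere in the CDXJ index. Since each CDXJ line pairs a searchable key and timestamp with a JSON block, the matching records can also be pulled out and inspected directly; the sketch below is illustrative only, assuming the same index path used in this test.

```javascript
import fs from "fs";

// Illustrative: print the JSON payload of each urn:text / urn:textFinal CDXJ line.
const lines = fs
  .readFileSync("test-crawls/collections/text-extract/indexes/index.cdxj", "utf-8")
  .split("\n")
  .filter((line) => line.includes("urn:text"));

for (const line of lines) {
  // a CDXJ line is "<searchable key> <timestamp> <json>"
  const json = line.slice(line.indexOf("{"));
  console.log(JSON.parse(json));
}
```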
@@ -1,24 +1,30 @@
import util from "util";
import {exec as execCallback } from "child_process";
import { exec as execCallback } from "child_process";
import fs from "fs";

const exec = util.promisify(execCallback);

test("check that URLs in seed-list are crawled", async () => {
  try {
    await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000");
  }
  catch (error) {
    await exec(
      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000",
    );
  } catch (error) {
    console.log(error);
  }

  let crawled_pages = fs.readFileSync("test-crawls/collections/filelisttest/pages/pages.jsonl", "utf8");
  let seed_file = fs.readFileSync("tests/fixtures/urlSeedFile.txt", "utf8").split("\n").sort();
  let crawled_pages = fs.readFileSync(
    "test-crawls/collections/filelisttest/pages/pages.jsonl",
    "utf8",
  );
  let seed_file = fs
    .readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")
    .split("\n")
    .sort();

  let seed_file_list = [];
  for (var j = 0; j < seed_file.length; j++) {
    if (seed_file[j] != undefined){
    if (seed_file[j] != undefined) {
      seed_file_list.push(seed_file[j]);
    }
  }

@@ -26,7 +32,7 @@ test("check that URLs in seed-list are crawled", async () => {
  let foundSeedUrl = true;

  for (var i = 1; i < seed_file_list.length; i++) {
    if (crawled_pages.indexOf(seed_file_list[i]) == -1){
    if (crawled_pages.indexOf(seed_file_list[i]) == -1) {
      foundSeedUrl = false;
    }
  }
@@ -3,17 +3,21 @@ import zlib from "zlib";
import child_process from "child_process";

test("check that the warcinfo file works as expected on the command line", async () => {
  try{
  try {
    const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
    const proc = child_process.execSync("docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC", {input: configYaml, stdin: "inherit", encoding: "utf8"});
    const proc = child_process.execSync(
      "docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC",
      { input: configYaml, stdin: "inherit", encoding: "utf8" },
    );

    console.log(proc);
  }
  catch (error) {
  } catch (error) {
    console.log(error);
  }

  const warcData = fs.readFileSync("test-crawls/collections/warcinfo/warcinfo_0.warc.gz");
  const warcData = fs.readFileSync(
    "test-crawls/collections/warcinfo/warcinfo_0.warc.gz",
  );

  const data = zlib.gunzipSync(warcData);

@@ -21,8 +25,8 @@ test("check that the warcinfo file works as expected on the command line", async

  expect(string.indexOf("operator: test")).toBeGreaterThan(-1);
  expect(string.indexOf("host: hostname")).toBeGreaterThan(-1);
  expect(string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/)).not.toEqual(null);
  expect(
    string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/),
  ).not.toEqual(null);
  expect(string.indexOf("format: WARC File Format 1.0")).toBeGreaterThan(-1);
});
@@ -11,8 +11,12 @@
    // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */

    /* Language and Environment */
    "target": "es2022", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
    "lib": ["es2022", "dom", "dom.iterable"], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
    "target": "es2022" /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */,
    "lib": [
      "es2022",
      "dom",
      "dom.iterable"
    ] /* Specify a set of bundled library declaration files that describe the target runtime environment. */,
    // "jsx": "preserve", /* Specify what JSX code is generated. */
    // "experimentalDecorators": true, /* Enable experimental support for TC39 stage 2 draft decorators. */
    // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */

@@ -25,9 +29,9 @@
    // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */

    /* Modules */
    "module": "NodeNext", /* Specify what module code is generated. */
    "rootDir": "./src", /* Specify the root folder within your source files. */
    "moduleResolution": "NodeNext", /* Specify how TypeScript looks up a file from a given module specifier. */
    "module": "NodeNext" /* Specify what module code is generated. */,
    "rootDir": "./src" /* Specify the root folder within your source files. */,
    "moduleResolution": "NodeNext" /* Specify how TypeScript looks up a file from a given module specifier. */,
    //"baseUrl": "./src", /* Specify the base directory to resolve non-relative module names. */
    // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
    // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */

@@ -39,8 +43,8 @@
    // "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */

    /* JavaScript Support */
    "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
    "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
    "allowJs": true /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */,
    "checkJs": true /* Enable error reporting in type-checked JavaScript files. */,
    // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */

    /* Emit */

@@ -49,7 +53,7 @@
    // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
    // "sourceMap": true, /* Create source map files for emitted JavaScript files. */
    // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
    "outDir": "./dist/", /* Specify an output folder for all emitted files. */
    "outDir": "./dist/" /* Specify an output folder for all emitted files. */,
    // "removeComments": true, /* Disable emitting comments. */
    // "noEmit": true, /* Disable emitting files from a compilation. */
    // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */

@@ -73,10 +77,10 @@
    // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
    //"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
    // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
    "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
    "forceConsistentCasingInFileNames": true /* Ensure that casing is correct in imports. */,

    /* Type Checking */
    "strict": true, /* Enable all strict type-checking options. */
    "strict": true /* Enable all strict type-checking options. */,
    // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
    // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
    // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */

@@ -98,10 +102,8 @@

    /* Completeness */
    // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
    "skipLibCheck": true /* Skip type checking all .d.ts files. */
    "skipLibCheck": true /* Skip type checking all .d.ts files. */
  },

  "include": [
    "src/**/*",
  ]
  "include": ["src/**/*"]
}
10 yarn.lock

@@ -1914,6 +1914,11 @@ escodegen@^2.1.0:
  optionalDependencies:
    source-map "~0.6.1"

eslint-config-prettier@^9.0.0:
  version "9.0.0"
  resolved "https://registry.yarnpkg.com/eslint-config-prettier/-/eslint-config-prettier-9.0.0.tgz#eb25485946dd0c66cd216a46232dc05451518d1f"
  integrity sha512-IcJsTkJae2S35pRsRAwoCE+925rJJStOdkKnLVgtE+tEpqU0EVVM7OqrwxqgptKdX29NUwC82I5pXsGFIgSevw==

eslint-plugin-react@^7.22.0:
  version "7.23.2"
  resolved "https://registry.yarnpkg.com/eslint-plugin-react/-/eslint-plugin-react-7.23.2.tgz#2d2291b0f95c03728b55869f01102290e792d494"

@@ -3829,6 +3834,11 @@ prelude-ls@^1.2.1:
  resolved "https://registry.yarnpkg.com/prelude-ls/-/prelude-ls-1.2.1.tgz#debc6489d7a6e6b0e7611888cec880337d316396"
  integrity sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==

prettier@3.0.3:
  version "3.0.3"
  resolved "https://registry.yarnpkg.com/prettier/-/prettier-3.0.3.tgz#432a51f7ba422d1469096c0fdc28e235db8f9643"
  integrity sha512-L/4pUDMxcNa8R/EthV08Zt42WBO4h1rarVtK0K+QJG0X187OLo7l699jWw0GKuwzkPQ//jMFA/8Xm6Fh3J/DAg==

pretty-format@^29.2.1:
  version "29.2.1"
  resolved "https://registry.yarnpkg.com/pretty-format/-/pretty-format-29.2.1.tgz#86e7748fe8bbc96a6a4e04fa99172630907a9611"