Add Prettier to the repo, and format all the files! (#428)

This adds prettier to the repo, and sets up the pre-commit hook to
auto-format as well as lint.
Also updates ignore files to exclude crawls, test-crawls, scratch, and dist as needed.
Emma Segal-Grossman 2023-11-09 19:11:11 -05:00 committed by GitHub
parent af1e0860e4
commit 2a49406df7
70 changed files with 3192 additions and 2026 deletions
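For orientation, the setup introduced here works roughly as follows (script names taken from the package.json and .husky/pre-commit diffs below):

```sh
yarn format      # prettier . --check — report unformatted files (used by CI)
yarn format:fix  # prettier . --write — rewrite files in place
yarn lint:fix    # yarn format:fix && eslint . --fix — what the pre-commit hook now runs
```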


@@ -5,7 +5,11 @@ module.exports = {
    node: true,
    jest: true,
  },
-  extends: ["eslint:recommended", "plugin:@typescript-eslint/recommended"],
+  extends: [
+    "eslint:recommended",
+    "plugin:@typescript-eslint/recommended",
+    "prettier",
+  ],
  parser: "@typescript-eslint/parser",
  plugins: ["@typescript-eslint"],
  parserOptions: {
@@ -13,10 +17,6 @@ module.exports = {
    sourceType: "module",
  },
  rules: {
-    indent: ["error", 2],
-    "linebreak-style": ["error", "unix"],
-    quotes: ["error", "double"],
-    semi: ["error", "always"],
    "no-constant-condition": ["error", { checkLoops: false }],
    "no-use-before-define": [
      "error",


@@ -6,7 +6,6 @@ on:
jobs:
  lint:
    runs-on: ubuntu-latest
    strategy:
@@ -22,10 +21,9 @@ jobs:
      - name: install requirements
        run: yarn install
      - name: run linter
-        run: yarn lint
+        run: yarn lint && yarn format
  build:
    runs-on: ubuntu-latest
    strategy:
@@ -46,8 +44,3 @@ jobs:
        run: docker-compose build
      - name: run jest
        run: sudo yarn test


@@ -8,12 +8,10 @@ jobs:
    name: Build x86 and ARM Images and push to Dockerhub
    runs-on: ubuntu-22.04
    steps:
-      -
-        name: Check out the repo
+      - name: Check out the repo
        uses: actions/checkout@v4
-      -
-        name: Docker image metadata
+      - name: Docker image metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
@@ -21,23 +19,19 @@ jobs:
          tags: |
            type=semver,pattern={{version}}
-      -
-        name: Set up QEMU
+      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
        with:
          platforms: arm64
-      -
-        name: Set up Docker Buildx
+      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
-      -
-        name: Login to DockerHub
+      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
-      -
-        name: Build and push
+      - name: Build and push
        id: docker_build
        uses: docker/build-push-action@v3
        with:
@@ -45,7 +39,5 @@ jobs:
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          platforms: "linux/amd64,linux/arm64"
-      -
-        name: Image digest
+      - name: Image digest
        run: echo ${{ steps.docker_build.outputs.digest }}
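The workflow edits above are behavior-neutral: YAML accepts a block-sequence entry's first key either on the line after the dash or on the dash line itself, and Prettier normalizes to the compact form. An illustrative fragment showing the two equivalent spellings:

```yaml
steps:
  - # expanded entry style (before)
    name: Set up QEMU
  - name: Set up QEMU # compact entry style (after); both parse identically
```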

.gitignore

@@ -6,3 +6,4 @@ node_modules/
crawls/
test-crawls/
.DS_Store
+dist


@@ -1,4 +1,4 @@
#!/usr/bin/env sh
. "$(dirname -- "$0")/_/husky.sh"
-yarn lint
+yarn lint:fix
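The hook now fixes rather than just checks: per the package.json diff below, `lint:fix` chains formatting and lint autofixes, roughly equivalent to running by hand:

```sh
prettier . --write  # format:fix — rewrite files to Prettier style
eslint . --fix      # then apply ESLint autofixes
```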

.prettierignore

@@ -0,0 +1 @@
+dist


@@ -1,11 +1,13 @@
## CHANGES
v0.8.1
- Logging and Behavior Tweaks by @ikreymer in https://github.com/webrecorder/browsertrix-crawler/pull/229
- Fix typos by @stavares843 in https://github.com/webrecorder/browsertrix-crawler/pull/232
- Add crawl log to WACZ by @ikreymer in https://github.com/webrecorder/browsertrix-crawler/pull/231
v0.8.0
- Switch to Chrome/Chromium 109
- Convert to ESM module
- Add ad blocking via request interception (#173)
@@ -25,11 +27,13 @@ v0.8.0
- update behaviors to 0.4.1, rename 'Behavior line' -> 'Behavior log' by @ikreymer in https://github.com/webrecorder/browsertrix-crawler/pull/223
v0.7.1
- Fix for warcio.js by @ikreymer in #178
- Guard against pre-existing user/group by @edsu in #176
- Fix incorrect combineWARCs property in README.md by @Georift in #180
v0.7.0
- Update to Chrome/Chromium 101 - (0.7.0 Beta 0) by @ikreymer in #144
- Add --netIdleWait, bump dependencies (0.7.0-beta.2) by @ikreymer in #145
- Update README.md by @atomotic in #147
@@ -41,7 +45,6 @@ v0.7.0
- Interrupt Handling Fixes by @ikreymer in #167
- Run in Docker as User by @edsu in #171
v0.6.0
- Add a --waitOnDone option, which has browsertrix crawler wait when finished (for use with Browsertrix Cloud)
@@ -56,8 +59,8 @@ v0.6.0
- Fixes to interrupting a single instance in a shared state crawl
- force all cookies, including session cookies, to fixed duration in days, configurable via --cookieDays
v0.5.0
- Scope: support for `scopeType: domain` to include all subdomains and ignoring 'www.' if specified in the seed.
- Profiles: support loading remote profile from URL as well as local file
- Non-HTML Pages: Load non-200 responses in browser, even if non-html, fix waiting issues with non-HTML pages (eg. PDFs)
@@ -75,8 +78,8 @@ v0.5.0
- Signing: Support for optional signing of WACZ
- Dependencies: update to latest pywb, wacz and browsertrix-behaviors packages
v0.4.4
- Page Block Rules Fix: 'request already handled' errors by avoiding adding duplicate handlers to same page.
- Page Block Rules Fix: await all continue/abort() calls and catch errors.
- Page Block Rules: Don't apply to top-level page, print warning and recommend scope rules instead.
@@ -86,11 +89,13 @@ v0.4.4
- README: Update old type -> scopeType, list new scope types.
v0.4.3
- BlockRules Fixes: When considering the 'inFrameUrl' for a navigation request for an iframe, use URL of parent frame.
- BlockRules Fixes: Always allow pywb proxy scripts.
- Logging: Improved debug logging for block rules (log blocked requests and conditional iframe requests) when 'debug' set in 'logging'
v0.4.2
- Compose/docs: Build latest image by default, update README to refer to latest image
- Fix typo in `crawler.capturePrefix` that resulted in `directFetchCapture()` always failing
- Tests: Update all tests to use `test-crawls` directory
@@ -98,6 +103,7 @@ v0.4.2
- loadPage() accepts a list of selector options with selector, extract, and isAttribute settings for further customization of link extraction
v0.4.1
- BlockRules Optimizations: don't intercept requests if no blockRules
- Profile Creation: Support extending existing profile by passing a --profile param to load on startup
- Profile Creation: Set default window size to 1600x900, add --windowSize param for setting custom size
@@ -107,6 +113,7 @@ v0.4.1
- CI: Build a multi-platform (amd64 and arm64) image on each release
v0.4.0
- YAML based config, specifyable via --config property or via stdin (with '--config stdin')
- Support for different scope types ('page', 'prefix', 'host', 'any', 'none') + crawl depth at crawl level
- Per-Seed scoping, including different scope types, or depth and include/exclude rules configurable per seed in 'seeds' list via YAML config
@@ -120,16 +127,17 @@ v0.4.0
- Update to latest pywb (2.5.0b4), browsertrix-behaviors (0.2.3), py-wacz (0.3.1)
v0.3.2
- Added a `--urlFile` option: Allows users to specify a .txt file list of exact URLs to crawl (one URL per line).
v0.3.1
- Improved shutdown wait: Instead of waiting for 5 secs, wait until all pending requests are written to WARCs
- Bug fix: Use async APIs for combine WARC to avoid spurious issues with multiple crawls
- Behaviors Update to Behaviors to 0.2.1, with support for facebook pages
v0.3.0
- WARC Combining: `--combineWARC` and `--rolloverSize` flags for generating combined WARC at end of crawl, each WARC upto specified rolloverSize
- Profiles: Support for creating reusable browser profiles, stored as tarballs, and running crawl with a login profile (see README for more info)
- Behaviors: Switch to Browsertrix Behaviors v0.1.1 for in-page behaviors


@@ -51,7 +51,6 @@ Browsertrix Crawler includes a number of additional command-line options, explai
## Crawling Configuration Options
<details>
<summary><b>The Browsertrix Crawler docker image currently accepts the following parameters:</b></summary>
@@ -269,8 +268,8 @@ Options:
     ess (for debugging) [boolean]
     --config  Path to YAML config file
```
</details>
### Waiting for Page Load
@@ -282,14 +281,12 @@ See [page.goto waitUntil options](https://pptr.dev/api/puppeteer.page.goto#remar
The `--pageLoadTimeout`/`--timeout` option sets the timeout in seconds for page load, defaulting to 90 seconds. Behaviors will run on the page once either the page load condition or the page load timeout is met, whichever happens first.
### YAML Crawl Config
Browsertrix Crawler supports the use of a yaml file to set parameters for a crawl. This can be used by passing a valid yaml file to the `--config` option.
The YAML file can contain the same parameters as the command-line arguments. If a parameter is set on the command-line and in the yaml file, the value from the command-line will be used. For example, the following should start a crawl with the config in `crawl-config.yaml`.
```
docker run -v $PWD/crawl-config.yaml:/app/crawl-config.yaml -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --config /app/crawl-config.yaml
```
@@ -300,7 +297,6 @@ The config can also be passed via stdin, which can simplify the command. Note th
cat ./crawl-config.yaml | docker run -i -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --config stdin
```
An example config file (eg. crawl-config.yaml) might contain:
```
@@ -361,7 +357,6 @@ To make this configuration as simple as possible, there are several predefined s
The scope settings for multi-page crawls (page-spa, prefix, host, domain) also include http/https versions, eg. given a prefix of `http://example.com/path/`,
`https://example.com/path/` is also included.
#### Custom Scope Inclusion Rules
Instead of setting a scope type, it is possible to configure custom scope regexes by setting the `--include` config to one or more regular expressions.
@@ -375,7 +370,6 @@ In addition to the inclusion rules, Browsertrix Crawler supports a separate list
The exclusion regexes are often used with a custom scope, but could be used with a predefined scopeType as well.
#### Extra 'Hops' Beyond Current Scope
Occasionally, it may be useful to augment the scope by allowing extra links N 'hops' beyond the current scope.
@@ -385,7 +379,6 @@ For example, this is most useful when crawling with a `host` or `prefix` scope,
The `--extraHops` setting can be set globally or per seed to allow expanding the current inclusion scope N 'hops' beyond the configured scope. Note that this mechanism only expands the inclusion scope, and any exclusion rules are still applied. If a URL is to be excluded via the exclusion rules,
that will take precedence over the `--extraHops`.
#### Scope Rule Examples
For example, the following seed will start on `https://example.com/startpage.html` and crawl all pages on the `https://example.com/` domain, except pages that match the regexes `example.com/skip.*` or `example.com/search.*`
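A sketch of that seed as YAML (a hypothetical reconstruction — the README's actual example block falls outside this hunk; field names per the sections above):

```yaml
seeds:
  - url: https://example.com/startpage.html
    scopeType: host
    exclude:
      - example.com/skip.*
      - example.com/search.*
```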
@@ -456,27 +449,24 @@ If the `--blockMessage` is also specified, a blocked URL is replaced with the sp
If it seems confusing which rules should be used, here is a quick way to determine:
-- If you'd like to restrict *the pages that are being crawled*, use the crawl scope rules (defined above).
-- If you'd like to restrict *parts of a page* that are being loaded, use the page resource block rules described in this section.
+- If you'd like to restrict _the pages that are being crawled_, use the crawl scope rules (defined above).
+- If you'd like to restrict _parts of a page_ that are being loaded, use the page resource block rules described in this section.
The blockRules add a filter to each URL loaded on a page and incur an extra overhead. They should only be used in advanced use cases where part of a page needs to be blocked.
These rules can not be used to prevent entire pages from loading -- use the scope exclusion rules for that. (A warning will be printed if a page resource block rule matches a top-level page.)
### Ad blocking
With version 0.8.0, Browsertrix Crawler supports blocking ads from being loaded during capture based on [Steven Black's list of known ad hosts](https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts). To enable ad blocking, use the `--blockAds` option. If `--adBlockMessage` is set, a record with the specified error message will be added in the ad's place.
### Custom Warcinfo Fields
Custom fields can be added to the `warcinfo` WARC record, generated for each combined WARC. The fields can be specified in the YAML config under the `warcinfo` section or individually via the command-line.
For example, the following are equivalent ways to add additional warcinfo fields:
via yaml config:
```yaml
@@ -622,7 +612,6 @@ docker run -e CHROME_FLAGS="--disable-extensions-except=/ext/ublock --load-exten
You can also directly use extensions from an existing chrome-profile by using e.g. `~/.config/chromium/Default/Extensions/cjpalhdlnbpafiamejdnhcphjbkeiagm/1.41.8_0/` as the path.
## Saving Crawl State: Interrupting and Restarting the Crawl
With version 0.5.0, a crawl can be gracefully interrupted with Ctrl-C (SIGINT) or a SIGTERM.
@@ -642,13 +631,11 @@ or `never` respectively, to control when the crawl state file should be written.
When the `--saveState` is set to always, Browsertrix Crawler will also save the state automatically during the crawl, as set by the `--saveStateInterval` setting.
The crawler will keep the last `--saveStateHistory` save states and delete older ones. This provides an extra backup: if the crawl fails unexpectedly, or is not terminated via Ctrl-C, several previous crawl states are still available.
## Creating and Using Browser Profiles
Browsertrix Crawler also includes a way to use existing browser profiles when running a crawl. This allows pre-configuring the browser, such as by logging in
to certain sites or setting other settings, and running a crawl exactly with those settings. By creating a logged in profile, the actual login credentials are not included in the crawl, only (temporary) session cookies.
### Interactive Profile Creation
For creating profiles of more complex sites, or logging in to multiple sites at once, the interactive profile creation mode can be used.
@@ -719,7 +706,6 @@ The script will then prompt you for login credentials, attempt to login and crea
- To specify the window size for the profile creation embedded browser, specify `--windowSize WIDTH,HEIGHT`. (The default is 1600x900)
The current profile creation script is still experimental and the script attempts to detect the username and password fields on a site as generically as possible, but may not work for all sites. Additional automated profile creation functionality, such as support for custom profile creation scripts, may be added in the future.
### Using Browser Profile with a Crawl
@@ -743,7 +729,5 @@ All released Docker Images are available from Docker Hub, listed by release tag
Details for each corresponding release tag are also available on GitHub at: https://github.com/webrecorder/browsertrix-crawler/releases
## Architecture
The Docker container provided here packages up several components used in Browsertrix.
@@ -752,7 +737,6 @@ The system uses `pywb` in recording mode for capturing the content. The crawl pr
To access the contents of the crawl, the `/crawls` directory in the container should be mounted to a volume (default in the Docker Compose setup).
### Usage with Docker Compose
Many examples in this README demonstrate running Browsertrix Crawler with `docker run`.
@@ -775,10 +759,8 @@ docker-compose run crawler crawl --url https://webrecorder.net/ --generateCDX --
In this example, the crawl data is written to `./crawls/collections/wr-net` by default.
While the crawl is running, the status of the crawl prints the progress to the JSON log output. This can be disabled by using the `--logging` option and not including `stats`.
### Multi-Platform Build / Support for Apple Silicon (M1/M2)
Browsertrix Crawler uses a browser image which supports amd64 and arm64.
@@ -787,7 +769,6 @@ This means Browsertrix Crawler can be built natively on Apple Silicon systems us
On an Apple Silicon system, the browser used will be Chromium instead of Chrome since there is no Linux build of Chrome for ARM, and this now is handled automatically as part of the build. Note that Chromium is different from Chrome, and for example, some video codecs may not be supported in the ARM / Chromium-based version that would be in the amd64 / Chrome version. For production crawling, it is recommended to run on an amd64 Linux environment.
### Modifying Browser Image
It is also possible to build Browsertrix Crawler with a different browser image. Currently, browser images using Chrome/Chromium (depending on host system chip architecture) and Brave Browser are supported via [browsertrix-browser-base](https://github.com/webrecorder/browsertrix-browser-base).
@@ -796,7 +777,6 @@ The browser base image used is specified and can be changed at the top of the Do
Custom browser images can be used by forking [browsertrix-browser-base](https://github.com/webrecorder/browsertrix-browser-base), locally building or publishing an image, and then modifying the Dockerfile in this repo to build from that image.
### Viewing crawled data with pywb
When a crawler is done, another browsertrix-crawler image can be started with a local [pywb](https://github.com/webrecorder/pywb) instance to view the crawl:
@@ -809,17 +789,13 @@ Then, loading the `http://localhost:8080/wr-net/https://webrecorder.net/` should
(Previewing crawl results while a crawl is still running should also be possible soon!)
-Support
--------
+## Support
Initial support for development of Browsertrix Crawler was provided by [Kiwix](https://kiwix.org/). The initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder.
Additional support for Browsertrix Crawler, including for the development of the 0.4.x version, has been provided by [Portico](https://www.portico.org/).
-License
--------
+## License
[AGPLv3](https://www.gnu.org/licenses/agpl-3.0) or later, see
[LICENSE](LICENSE) for more details.


@@ -1,4 +1,4 @@
-version: '3.5'
+version: "3.5"
services:
  crawler:
@@ -14,4 +14,3 @@ services:
      - SYS_ADMIN
    shm_size: 1gb


@@ -2,7 +2,9 @@
<html>
  <head>
    <style>
-      html, body, iframe {
+      html,
+      body,
+      iframe {
        width: 100%;
        height: 100%;
        margin: 0;
@@ -32,7 +34,11 @@ button {
  </head>
  <body>
    <div id="info">
-      Log in to any site(s) that you want to be part of the crawl profile using the embedded browser below. When done, click <form action="/createProfile" method="post"><button type="submit">Create Profile</button></form>
+      Log in to any site(s) that you want to be part of the crawl profile using
+      the embedded browser below. When done, click
+      <form action="/createProfile" method="post">
+        <button type="submit">Create Profile</button>
+      </form>
    </div>
    <iframe id="main" src="$DEVTOOLS_SRC"></iframe>
  </body>


@@ -14,7 +14,9 @@
      }
    </style>
    <script>
-      const ws = new WebSocket(window.location.href.replace("http", "ws") + "ws");
+      const ws = new WebSocket(
+        window.location.href.replace("http", "ws") + "ws",
+      );
      ws.addEventListener("message", (event) => handleMessage(event.data));
      const unusedElems = [];
@@ -70,6 +72,8 @@
    </script>
  <head>
  <body>
-    <div id="content">
-    </div>
+    <div id="content"></div>
  </body>
</head>
</html>


@@ -1,7 +1,6 @@
-<!DOCTYPE html>
+<!doctype html>
<html lang="en">
<head>
    <!--
    noVNC example: lightweight example using minimal UI and features
@@ -16,10 +15,9 @@
    -->
    <title>noVNC</title>
-    <meta charset="utf-8">
+    <meta charset="utf-8" />
    <style>
        body {
            margin: 0;
            background-color: dimgrey;
@@ -56,12 +54,11 @@
            flex: 1; /* fill remaining space */
            overflow: hidden;
        }
    </style>
    <script type="module" crossorigin="anonymous">
        // RFB holds the API to connect and communicate with a VNC server
-        import RFB from './core/rfb.js';
+        import RFB from "./core/rfb.js";
        let rfb;
        let desktopName;
@@ -105,7 +102,7 @@
        // Show a status text in the top bar
        function status(text) {
-            document.getElementById('status').textContent = text;
+            document.getElementById("status").textContent = text;
        }
        // This function extracts the value of one variable from the
@@ -124,8 +121,10 @@
        //
        // Note that we use location.href instead of location.search
        // because Firefox < 53 has a bug w.r.t location.search
-            const re = new RegExp('.*[?&]' + name + '=([^&#]*)'),
-                  match = ''.concat(document.location.href, window.location.hash).match(re);
+            const re = new RegExp(".*[?&]" + name + "=([^&#]*)"),
+                match = ""
+                    .concat(document.location.href, window.location.hash)
+                    .match(re);
            if (match) {
                // We have to decode the URL since want the cleartext value
@@ -135,15 +134,14 @@
            return defaultValue;
        }
-        document.getElementById('sendCtrlAltDelButton')
-            .onclick = sendCtrlAltDel;
+        document.getElementById("sendCtrlAltDelButton").onclick = sendCtrlAltDel;
        // Read parameters specified in the URL query string
        // By default, use the host and port of server that served this file
-        const host = readQueryVariable('host', window.location.hostname);
-        let port = readQueryVariable('port', window.location.port);
-        const password = readQueryVariable('password');
-        const path = readQueryVariable('path', 'websockify');
+        const host = readQueryVariable("host", window.location.hostname);
+        let port = readQueryVariable("port", window.location.port);
+        const password = readQueryVariable("password");
+        const path = readQueryVariable("path", "websockify");
        // | | | | | |
        // | | | Connect | | |
@@ -154,19 +152,20 @@
        // Build the websocket URL used to connect
        let url;
        if (window.location.protocol === "https:") {
-            url = 'wss';
+            url = "wss";
        } else {
-            url = 'ws';
+            url = "ws";
        }
-        url += '://' + host;
+        url += "://" + host;
        if (port) {
-            url += ':' + port;
+            url += ":" + port;
        }
-        url += '/' + path;
+        url += "/" + path;
        // Creating a new RFB object will start a new connection
-        rfb = new RFB(document.getElementById('screen'), url,
-                      { credentials: { password: password } });
+        rfb = new RFB(document.getElementById("screen"), url, {
+            credentials: { password: password },
+        });
        // Add listeners to important events from the RFB module
        rfb.addEventListener("connect", connectedToServer);
@@ -175,8 +174,8 @@
        rfb.addEventListener("desktopname", updateDesktopName);
        // Set parameters that can be changed on an active connection
-        rfb.viewOnly = readQueryVariable('view_only', false);
-        rfb.scaleViewport = readQueryVariable('scale', false);
+        rfb.viewOnly = readQueryVariable("view_only", false);
+        rfb.scaleViewport = readQueryVariable("scale", false);
    }
    connect();
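The noVNC page above is mostly a quote-style sweep: Prettier's `singleQuote: false` default rewrites single-quoted strings to double quotes (and `<!DOCTYPE html>` is lowercased to `<!doctype html>`). On any one line the rule is simply:

```js
const before = 'websockify'; // original, single-quoted
const after = "websockify"; // Prettier output — double quotes by default
```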


@@ -8,7 +8,10 @@
  "license": "AGPL-3.0-or-later",
  "scripts": {
    "tsc": "tsc",
-    "lint": "eslint *.js tests/*.test.js",
+    "format": "prettier . --check",
+    "format:fix": "prettier . --write",
+    "lint": "eslint .",
+    "lint:fix": "yarn format:fix && eslint . --fix",
    "test": "yarn node --experimental-vm-modules $(yarn bin jest --bail 1)",
    "prepare": "husky install"
  },
@@ -40,9 +43,11 @@
    "@typescript-eslint/eslint-plugin": "^6.10.0",
    "@typescript-eslint/parser": "^6.10.0",
    "eslint": "^8.53.0",
+    "eslint-config-prettier": "^9.0.0",
    "eslint-plugin-react": "^7.22.0",
    "jest": "^29.2.1",
    "md5": "^2.3.0",
+    "prettier": "3.0.3",
    "typescript": "^5.2.2"
  },
  "jest": {


@@ -4,7 +4,13 @@ import fs, { WriteStream } from "fs";
import os from "os";
import fsp, { FileHandle } from "fs/promises";
-import { RedisCrawlState, LoadState, QueueState, PageState, WorkerId } from "./util/state.js";
+import {
+  RedisCrawlState,
+  LoadState,
+  QueueState,
+  PageState,
+  WorkerId,
+} from "./util/state.js";
import Sitemapper from "sitemapper";
import yaml from "js-yaml";
@@ -13,7 +19,14 @@ import * as warcio from "warcio";
import { HealthChecker } from "./util/healthcheck.js";
import { TextExtractViaSnapshot } from "./util/textextract.js";
-import { initStorage, getFileSize, getDirSize, interpolateFilename, checkDiskUtilization, S3StorageSync } from "./util/storage.js";
+import {
+  initStorage,
+  getFileSize,
+  getDirSize,
+  interpolateFilename,
+  checkDiskUtilization,
+  S3StorageSync,
+} from "./util/storage.js";
import { ScreenCaster, WSTransport } from "./util/screencaster.js";
import { Screenshots } from "./util/screenshots.js";
import { parseArgs } from "./util/argParser.js";
@@ -25,7 +38,12 @@ import { collectAllFileSources } from "./util/file_reader.js";
import { Browser } from "./util/browser.js";
-import { ADD_LINK_FUNC, BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } from "./util/constants.js";
+import {
+  ADD_LINK_FUNC,
+  BEHAVIOR_LOG_FUNC,
+  HTML_TYPES,
+  DEFAULT_SELECTORS,
+} from "./util/constants.js";
import { AdBlockRules, BlockRules } from "./util/blockrules.js";
import { OriginOverride } from "./util/originoverride.js";
@@ -41,12 +59,23 @@ const HTTPS_AGENT = new HTTPSAgent({
const HTTP_AGENT = new HTTPAgent();
-const behaviors = fs.readFileSync(new URL("../node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});
+const behaviors = fs.readFileSync(
+  new URL(
+    "../node_modules/browsertrix-behaviors/dist/behaviors.js",
+    import.meta.url,
+  ),
+  { encoding: "utf8" },
+);
const FETCH_TIMEOUT_SECS = 30;
const PAGE_OP_TIMEOUT_SECS = 5;
-const POST_CRAWL_STATES = ["generate-wacz", "uploading-wacz", "generate-cdx", "generate-warc"];
+const POST_CRAWL_STATES = [
+  "generate-wacz",
+  "uploading-wacz",
+  "generate-cdx",
+  "generate-warc",
+];
// eslint-disable-next-line @typescript-eslint/no-explicit-any
type LogDetails = Record<string, any>;
@@ -62,7 +91,6 @@ type PageEntry = {
  favIconUrl?: string;
};
// ============================================================================
export class Crawler {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -128,8 +156,12 @@ export class Crawler {
  maxHeapUsed = 0;
  maxHeapTotal = 0;
+  driver!: (opts: {
+    page: Page;
+    data: PageState;
  // eslint-disable-next-line no-use-before-define
-  driver!: (opts: { page: Page; data: PageState; crawler: Crawler }) => NonNullable<unknown>;
+    crawler: Crawler;
+  }) => NonNullable<unknown>;
  constructor() {
    const res = parseArgs();
@@ -140,12 +172,12 @@ export class Crawler {
    this.collDir = path.join(
      this.params.cwd,
      "collections",
-      this.params.collection
+      this.params.collection,
    );
    this.logDir = path.join(this.collDir, "logs");
    this.logFilename = path.join(
      this.logDir,
-      `crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`
+      `crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`,
    );
    const debugLogging = this.params.logging.includes("debug");
@@ -252,7 +284,7 @@ export class Crawler {
    if (!redisUrl.startsWith("redis://")) {
      logger.fatal(
-        "stateStoreUrl must start with redis:// -- Only redis-based store currently supported"
+        "stateStoreUrl must start with redis:// -- Only redis-based store currently supported",
      );
    }
@@ -272,7 +304,7 @@ export class Crawler {
    logger.debug(
      `Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`,
      {},
-      "state"
+      "state",
    );
    logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state");
@@ -281,7 +313,7 @@ export class Crawler {
      redis,
      this.params.crawlId,
      this.maxPageTime,
-      os.hostname()
+      os.hostname(),
    );
    // clear any pending URLs from this instance
@@ -291,7 +323,7 @@ export class Crawler {
      logger.debug(
        `Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`,
        {},
-        "state"
+        "state",
      );
    }
@@ -311,7 +343,7 @@ export class Crawler {
      logger.debug(
        `Screencast server started on: ${this.params.screencastPort}`,
        {},
-        "screencast"
+        "screencast",
      );
    }
    // } else if (this.params.redisStoreUrl && this.params.screencastRedis) {
@@ -383,7 +415,7 @@ export class Crawler {
    if (this.params.customBehaviors) {
      this.customBehaviors = this.loadCustomBehaviors(
-        this.params.customBehaviors
+        this.params.customBehaviors,
      );
    }
@@ -461,7 +493,7 @@ export class Crawler {
  _behaviorLog(
    { data, type }: { data: string; type: string },
    pageUrl: string,
-    workerid: WorkerId
+    workerid: WorkerId,
  ) {
    let behaviorLine;
    let message;
@@ -506,7 +538,7 @@ export class Crawler {
      depth,
      extraHops,
    }: { seedId: number; url: string; depth: number; extraHops: number },
-    logDetails = {}
+    logDetails = {},
  ) {
    const seed = this.params.scopedSeeds[seedId];
@@ -553,7 +585,7 @@ export class Crawler {
        logger.warn(
          msg.text(),
          { location: msg.location(), page: page.url(), workerid },
-          "jsError"
+          "jsError",
        );
      }
    });
@@ -562,7 +594,7 @@ export class Crawler {
      logger.warn(
        "Page Error",
        { ...errJSON(e), page: page.url(), workerid },
-        "jsError"
+        "jsError",
      );
    });
  }
@@ -574,14 +606,14 @@ export class Crawler {
    await page.exposeFunction(
      ADD_LINK_FUNC,
-      (url: string) => callbacks.addLink && callbacks.addLink(url)
+      (url: string) => callbacks.addLink && callbacks.addLink(url),
    );
    if (this.params.behaviorOpts) {
      await page.exposeFunction(
        BEHAVIOR_LOG_FUNC,
        (logdata: { data: string; type: string }) =>
-          this._behaviorLog(logdata, page.url(), workerid)
+          this._behaviorLog(logdata, page.url(), workerid),
      );
      await this.browser.addInitScript(page, behaviors);
@@ -622,7 +654,7 @@ self.__bx_behaviors.selectMainBehavior();
      }
      logger.warn(
        "Failed to fetch favicon from browser /json endpoint",
-        logDetails
+        logDetails,
      );
      return "";
    }
@@ -645,7 +677,7 @@ self.__bx_behaviors.selectMainBehavior();
        "HEAD request to determine if URL is HTML page timed out",
        logDetails,
        "fetch",
-        true
+        true,
      );
      if (!data.isHTMLPage && directFetchCapture) {
@@ -656,7 +688,7 @@ self.__bx_behaviors.selectMainBehavior();
          "Direct fetch capture attempt timed out",
          logDetails,
          "fetch",
-          true
+          true,
        );
        if (fetched) {
          data.loadState = LoadState.FULL_PAGE_LOADED;
@@ -666,7 +698,7 @@ self.__bx_behaviors.selectMainBehavior();
          logger.info(
            "Direct fetch successful",
            { url, ...logDetails },
-            "fetch"
+            "fetch",
          );
          return true;
        }
@@ -714,7 +746,7 @@ self.__bx_behaviors.selectMainBehavior();
      const { changed, text } = await textextract.extractAndStoreText(
        "text",
        false,
-        this.params.text.includes("to-warc")
+        this.params.text.includes("to-warc"),
      );
      if (changed && text && this.params.text.includes("to-pages")) {
@@ -729,7 +761,7 @@ self.__bx_behaviors.selectMainBehavior();
        logger.debug(
          "Skipping behaviors for non-HTML page",
          logDetails,
-          "behavior"
+          "behavior",
        );
      } else if (data.skipBehaviors) {
        logger.info("Skipping behaviors for slow page", logDetails, "behavior");
@@ -739,7 +771,7 @@ self.__bx_behaviors.selectMainBehavior();
          this.params.behaviorTimeout,
          "Behaviors timed out",
          logDetails,
-          "behavior"
+          "behavior",
        );
        await this.netIdle(page, logDetails);
@@ -757,7 +789,7 @@ self.__bx_behaviors.selectMainBehavior();
    if (this.params.pageExtraDelay) {
      logger.info(
        `Waiting ${this.params.pageExtraDelay} seconds before moving on to next page`,
-        logDetails
+        logDetails,
      );
      await sleep(this.params.pageExtraDelay);
    }
@@ -784,7 +816,7 @@ self.__bx_behaviors.selectMainBehavior();
      logger.warn(
        "Page Load Failed",
        { loadState, ...logDetails },
-        "pageStatus"
+        "pageStatus",
      );
      await this.crawlState.markFailed(data.url);
@@ -816,7 +848,7 @@ self.__bx_behaviors.selectMainBehavior();
    page: Page,
    cdp: CDPSession,
    frames: Frame[],
-    logDetails: LogDetails
+    logDetails: LogDetails,
  ) {
    try {
      frames = frames || page.frames();
@@ -828,7 +860,7 @@ self.__bx_behaviors.selectMainBehavior();
          frameUrls: frames.map((frame) => frame.url()),
          ...logDetails,
        },
-        "behavior"
+        "behavior",
      );
      const results = await Promise.allSettled(
@@ -844,9 +876,9 @@ self.__bx_behaviors.selectMainBehavior();
          self.__bx_behaviors.run();
        }`,
            logDetails,
-            "behavior"
-          )
-        )
+            "behavior",
+          ),
+        ),
      );
      for (const res of results) {
@@ -855,7 +887,7 @@ self.__bx_behaviors.selectMainBehavior();
          logger.warn(
            "Behavior run partially failed",
            { reason, ...logDetails },
-            "behavior"
+            "behavior",
          );
        }
      }
@@ -863,14 +895,14 @@ self.__bx_behaviors.selectMainBehavior();
      logger.info(
        "Behaviors finished",
        { finished: results.length, ...logDetails },
-        "behavior"
+        "behavior",
      );
      return true;
    } catch (e) {
      logger.warn(
        "Behavior run failed",
        { ...errJSON(e), ...logDetails },
-        "behavior"
+        "behavior",
      );
      return false;
    }
@@ -886,14 +918,14 @@ self.__bx_behaviors.selectMainBehavior();
      // this is all designed to detect and skip PDFs, and other frames that are actually EMBEDs
      // if there's no tag or an iframe tag, then assume its a regular frame
      const tagName = await frame.evaluate(
-        "self && self.frameElement && self.frameElement.tagName"
+        "self && self.frameElement && self.frameElement.tagName",
      );
      if (tagName && tagName !== "IFRAME" && tagName !== "FRAME") {
        logger.debug(
          "Skipping processing non-frame object",
          { tagName, frameUrl, ...logDetails },
-          "behavior"
+          "behavior",
        );
        return null;
      }
@@ -910,7 +942,7 @@ self.__bx_behaviors.selectMainBehavior();
      logger.debug(
        "Skipping processing frame",
        { frameUrl, ...logDetails },
-        "behavior"
+        "behavior",
      );
    }
@@ -921,13 +953,13 @@ self.__bx_behaviors.selectMainBehavior();
    const packageFileJSON = JSON.parse(
      await fsp.readFile(new URL("../package.json", import.meta.url), {
        encoding: "utf-8",
-      })
+      }),
    );
    const warcioPackageJSON = JSON.parse(
      await fsp.readFile(
        new URL("../node_modules/warcio/package.json", import.meta.url),
-        { encoding: "utf-8" }
-      )
+        { encoding: "utf-8" },
+      ),
    );
    return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version})`;
@@ -945,7 +977,7 @@ self.__bx_behaviors.selectMainBehavior();
    const warcInfo = { ...info, ...this.params.warcInfo };
    const record = await warcio.WARCRecord.createWARCInfo(
      { filename, type, warcVersion },
-      warcInfo
+      warcInfo,
    );
    const buffer = await warcio.WARCSerializer.serialize(record, {
      gzip: true,
@@ -964,7 +996,7 @@ self.__bx_behaviors.selectMainBehavior();
    if (this.params.sizeLimit) {
      if (size >= this.params.sizeLimit) {
        logger.info(
-          `Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`
+          `Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`,
        );
        interrupt = true;
      }
@@ -974,7 +1006,7 @@ self.__bx_behaviors.selectMainBehavior();
      const elapsed = secondsElapsed(this.startTime);
      if (elapsed >= this.params.timeLimit) {
        logger.info(
-          `Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`
+          `Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`,
        );
        interrupt = true;
      }
@@ -992,7 +1024,7 @@ self.__bx_behaviors.selectMainBehavior();
      const numFailed = this.crawlState.numFailed();
      if (numFailed >= this.params.failOnFailedLimit) {
        logger.fatal(
-          `Failed threshold reached ${numFailed} >= ${this.params.failedLimit}, failing crawl`
+          `Failed threshold reached ${numFailed} >= ${this.params.failedLimit}, failing crawl`,
        );
      }
    }
@@ -1060,7 +1092,7 @@ self.__bx_behaviors.selectMainBehavior();
    if (this.params.healthCheckPort) {
      this.healthChecker = new HealthChecker(
        this.params.healthCheckPort,
-        this.params.workers
+        this.params.workers,
      );
    }
@@ -1125,7 +1157,7 @@ self.__bx_behaviors.selectMainBehavior();
        await this.crawlState.load(
          this.params.state,
          this.params.scopedSeeds,
-          true
+          true,
        );
      }
@@ -1133,14 +1165,14 @@ self.__bx_behaviors.selectMainBehavior();
    this.adBlockRules = new AdBlockRules(
      this.captureBasePrefix,
-      this.params.adBlockMessage
+      this.params.adBlockMessage,
    );
    if (this.params.blockRules && this.params.blockRules.length) {
      this.blockRules = new BlockRules(
        this.params.blockRules,
        this.captureBasePrefix,
-        this.params.blockMessage
+        this.params.blockMessage,
      );
    }
@@ -1178,7 +1210,7 @@ self.__bx_behaviors.selectMainBehavior();
        logger.error(
          "Browser disconnected (crashed?), interrupting crawl",
          err,
-          "browser"
+          "browser",
        );
      },
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -1220,7 +1252,7 @@ self.__bx_behaviors.selectMainBehavior();
    const warcList = await fsp.readdir(path.join(this.collDir, "archive"));
    const warcListFull = warcList.map((filename) =>
-      path.join(this.collDir, "archive", filename)
+      path.join(this.collDir, "archive", filename),
    );
    //const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
@@ -1230,7 +1262,7 @@ self.__bx_behaviors.selectMainBehavior();
      ...warcListFull,
    ];
    const indexResult = await this.awaitProcess(
-      child_process.spawn("cdxj-indexer", params, { cwd: this.params.cwd })
+      child_process.spawn("cdxj-indexer", params, { cwd: this.params.cwd }),
    );
    if (indexResult === 0) {
      logger.debug("Indexing complete, CDX successfully created");
@@ -1251,7 +1283,7 @@ self.__bx_behaviors.selectMainBehavior();
      if (uploaded && this.uploadAndDeleteLocal) {
        logger.info(
-          `Uploaded WACZ, deleting local data to free up space: ${this.collDir}`
+          `Uploaded WACZ, deleting local data to free up space: ${this.collDir}`,
        );
        try {
          fs.rmSync(this.collDir, { recursive: true, force: true });
@@ -1352,13 +1384,11 @@ self.__bx_behaviors.selectMainBehavior();
    createArgs.push("-f");
-    warcFileList.forEach((val) =>
-      createArgs.push(path.join(archiveDir, val))
-    );
+    warcFileList.forEach((val) => createArgs.push(path.join(archiveDir, val)));
    // create WACZ
    const waczResult = await this.awaitProcess(
-      child_process.spawn("wacz", createArgs)
+      child_process.spawn("wacz", createArgs),
    );
    if (waczResult !== 0) {
@@ -1430,7 +1460,7 @@ self.__bx_behaviors.selectMainBehavior();
        maxHeapTotal: this.maxHeapTotal,
        ...memUsage,
      },
-      "memory"
+      "memory",
    );
  }
@@ -1461,7 +1491,7 @@ self.__bx_behaviors.selectMainBehavior();
    try {
      await fsp.writeFile(
        this.params.statsFilename,
-        JSON.stringify(stats, null, 2)
+        JSON.stringify(stats, null, 2),
      );
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
    } catch (err: any) {
@@ -1473,7 +1503,7 @@ self.__bx_behaviors.selectMainBehavior();
  async loadPage(
    page: Page,
    data: PageState,
-    selectorOptsList = DEFAULT_SELECTORS
+    selectorOptsList = DEFAULT_SELECTORS,
  ) {
    const { url, seedId, depth } = data;
@@ -1575,7 +1605,7 @@ self.__bx_behaviors.selectMainBehavior();
    const frames = await page.frames();
    const filteredFrames = await Promise.allSettled(
frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)) frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)),
); );
data.filteredFrames = filteredFrames data.filteredFrames = filteredFrames
@ -1640,7 +1670,7 @@ self.__bx_behaviors.selectMainBehavior();
page: Page, page: Page,
data: PageState, data: PageState,
selectors = DEFAULT_SELECTORS, selectors = DEFAULT_SELECTORS,
logDetails: LogDetails logDetails: LogDetails,
) { ) {
const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data; const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;
@ -1651,7 +1681,7 @@ self.__bx_behaviors.selectMainBehavior();
links.push(url); links.push(url);
if (links.length == 500) { if (links.length == 500) {
promiseList.push( promiseList.push(
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails) this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails),
); );
links = []; links = [];
} }
@ -1676,7 +1706,9 @@ self.__bx_behaviors.selectMainBehavior();
document.querySelectorAll(selector).forEach(getter); document.querySelectorAll(selector).forEach(getter);
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
const func = (window as any)[addLinkFunc] as (url: string) => NonNullable<unknown>; const func = (window as any)[addLinkFunc] as (
url: string,
) => NonNullable<unknown>;
urls.forEach((url) => func.call(this, url)); urls.forEach((url) => func.call(this, url));
return true; return true;
@ -1701,9 +1733,9 @@ self.__bx_behaviors.selectMainBehavior();
}), }),
PAGE_OP_TIMEOUT_SECS, PAGE_OP_TIMEOUT_SECS,
"Link extraction timed out", "Link extraction timed out",
logDetails logDetails,
) ),
) ),
); );
for (let i = 0; i < promiseResults.length; i++) { for (let i = 0; i < promiseResults.length; i++) {
@ -1725,7 +1757,7 @@ self.__bx_behaviors.selectMainBehavior();
if (links.length) { if (links.length) {
promiseList.push( promiseList.push(
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails) this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails),
); );
} }
@ -1737,7 +1769,7 @@ self.__bx_behaviors.selectMainBehavior();
urls: string[], urls: string[],
depth: number, depth: number,
extraHops = 0, extraHops = 0,
logDetails: LogDetails = {} logDetails: LogDetails = {},
) { ) {
try { try {
depth += 1; depth += 1;
@ -1748,7 +1780,7 @@ self.__bx_behaviors.selectMainBehavior();
for (const possibleUrl of urls) { for (const possibleUrl of urls) {
const res = this.isInScope( const res = this.isInScope(
{ url: possibleUrl, extraHops: newExtraHops, depth, seedId }, { url: possibleUrl, extraHops: newExtraHops, depth, seedId },
logDetails logDetails,
); );
if (!res) { if (!res) {
@ -1763,7 +1795,7 @@ self.__bx_behaviors.selectMainBehavior();
url, url,
depth, depth,
isOOS ? newExtraHops : extraHops, isOOS ? newExtraHops : extraHops,
logDetails logDetails,
); );
} }
} }
@ -1784,12 +1816,12 @@ self.__bx_behaviors.selectMainBehavior();
"Cloudflare check timed out", "Cloudflare check timed out",
logDetails, logDetails,
"general", "general",
true true,
) )
) { ) {
logger.debug( logger.debug(
"Cloudflare Check Detected, waiting for reload...", "Cloudflare Check Detected, waiting for reload...",
logDetails logDetails,
); );
await sleep(5.5); await sleep(5.5);
} }
@ -1803,7 +1835,7 @@ self.__bx_behaviors.selectMainBehavior();
url: string, url: string,
depth: number, depth: number,
extraHops: number, extraHops: number,
logDetails: LogDetails = {} logDetails: LogDetails = {},
) { ) {
if (this.limitHit) { if (this.limitHit) {
return false; return false;
@ -1811,7 +1843,7 @@ self.__bx_behaviors.selectMainBehavior();
const result = await this.crawlState.addToQueue( const result = await this.crawlState.addToQueue(
{ url, seedId, depth, extraHops }, { url, seedId, depth, extraHops },
this.pageLimit this.pageLimit,
); );
switch (result) { switch (result) {
@ -1823,7 +1855,7 @@ self.__bx_behaviors.selectMainBehavior();
logger.debug( logger.debug(
"Not queued page url, at page limit", "Not queued page url, at page limit",
{ url, ...logDetails }, { url, ...logDetails },
"links" "links",
); );
this.limitHit = true; this.limitHit = true;
return false; return false;
@ -1832,7 +1864,7 @@ self.__bx_behaviors.selectMainBehavior();
logger.debug( logger.debug(
"Not queued page url, already seen", "Not queued page url, already seen",
{ url, ...logDetails }, { url, ...logDetails },
"links" "links",
); );
return false; return false;
} }
@ -1961,14 +1993,14 @@ self.__bx_behaviors.selectMainBehavior();
logger.info( logger.info(
"Fetching full sitemap (fromDate not specified/valid)", "Fetching full sitemap (fromDate not specified/valid)",
{ url, sitemapFromDate }, { url, sitemapFromDate },
"sitemap" "sitemap",
); );
} else { } else {
lastmodFromTimestamp = dateObj.getTime(); lastmodFromTimestamp = dateObj.getTime();
logger.info( logger.info(
"Fetching and filtering sitemap by date", "Fetching and filtering sitemap by date",
{ url, sitemapFromDate }, { url, sitemapFromDate },
"sitemap" "sitemap",
); );
} }
@ -2166,8 +2198,11 @@ self.__bx_behaviors.selectMainBehavior();
function shouldIgnoreAbort(req: HTTPRequest) { function shouldIgnoreAbort(req: HTTPRequest) {
try { try {
const failure = req.failure(); const failure = req.failure();
const failureText = failure && failure.errorText || ""; const failureText = (failure && failure.errorText) || "";
if (failureText !== "net::ERR_ABORTED" || req.resourceType() !== "document") { if (
failureText !== "net::ERR_ABORTED" ||
req.resourceType() !== "document"
) {
return false; return false;
} }
@ -2178,8 +2213,10 @@ function shouldIgnoreAbort(req: HTTPRequest) {
return false; return false;
} }
if (headers["content-disposition"] || if (
(headers["content-type"] && !headers["content-type"].startsWith("text/"))) { headers["content-disposition"] ||
(headers["content-type"] && !headers["content-type"].startsWith("text/"))
) {
return true; return true;
} }
} catch (e) { } catch (e) {
@ -2188,4 +2225,3 @@ function shouldIgnoreAbort(req: HTTPRequest) {
return false; return false;
} }


@ -15,81 +15,99 @@ import { Browser } from "./util/browser.js";
import { initStorage } from "./util/storage.js"; import { initStorage } from "./util/storage.js";
import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core"; import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
const profileHTML = fs.readFileSync(new URL("../html/createProfile.html", import.meta.url), {encoding: "utf8"}); const profileHTML = fs.readFileSync(
const vncHTML = fs.readFileSync(new URL("../html/vnc_lite.html", import.meta.url), {encoding: "utf8"}); new URL("../html/createProfile.html", import.meta.url),
{ encoding: "utf8" },
);
const vncHTML = fs.readFileSync(
new URL("../html/vnc_lite.html", import.meta.url),
{ encoding: "utf8" },
);
const behaviors = fs.readFileSync(new URL("../node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"}); const behaviors = fs.readFileSync(
new URL(
"../node_modules/browsertrix-behaviors/dist/behaviors.js",
import.meta.url,
),
{ encoding: "utf8" },
);
function cliOpts(): { [key: string]: Options } { function cliOpts(): { [key: string]: Options } {
return { return {
"url": { url: {
describe: "The URL of the login page", describe: "The URL of the login page",
type: "string", type: "string",
demandOption: true, demandOption: true,
}, },
"user": { user: {
describe: "The username for the login. If not specified, will be prompted", describe:
"The username for the login. If not specified, will be prompted",
}, },
"password": { password: {
describe: "The password for the login. If not specified, will be prompted (recommended)", describe:
"The password for the login. If not specified, will be prompted (recommended)",
}, },
"filename": { filename: {
describe: "The filename for the profile tarball", describe: "The filename for the profile tarball",
default: "/crawls/profiles/profile.tar.gz", default: "/crawls/profiles/profile.tar.gz",
}, },
"debugScreenshot": { debugScreenshot: {
describe: "If specified, take a screenshot after login and save as this filename" describe:
"If specified, take a screenshot after login and save as this filename",
}, },
"headless": { headless: {
describe: "Run in headless mode, otherwise start xvfb", describe: "Run in headless mode, otherwise start xvfb",
type: "boolean", type: "boolean",
default: false, default: false,
}, },
"automated": { automated: {
describe: "Start in automated mode, no interactive browser", describe: "Start in automated mode, no interactive browser",
type: "boolean", type: "boolean",
default: false, default: false,
}, },
"interactive": { interactive: {
describe: "Deprecated. Now the default option!", describe: "Deprecated. Now the default option!",
type: "boolean", type: "boolean",
default: false default: false,
}, },
"shutdownWait": { shutdownWait: {
describe: "Shutdown browser in interactive after this many seconds, if no pings received", describe:
"Shutdown browser in interactive after this many seconds, if no pings received",
type: "number", type: "number",
default: 0 default: 0,
}, },
"profile": { profile: {
describe: "Path to tar.gz file which will be extracted and used as the browser profile", describe:
"Path to tar.gz file which will be extracted and used as the browser profile",
type: "string", type: "string",
}, },
"windowSize": { windowSize: {
type: "string", type: "string",
describe: "Browser window dimensions, specified as: width,height", describe: "Browser window dimensions, specified as: width,height",
default: getDefaultWindowSize() default: getDefaultWindowSize(),
}, },
"proxy": { proxy: {
type: "boolean", type: "boolean",
default: false default: false,
}, },
"cookieDays": { cookieDays: {
type: "number", type: "number",
describe: "If >0, set all cookies, including session cookies, to have this duration in days before saving profile", describe:
default: 7 "If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
} default: 7,
},
}; };
} }
@ -100,14 +118,11 @@ function getDefaultWindowSize() {
return `${x},${y}`; return `${x},${y}`;
} }
async function main() { async function main() {
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
const params: any = yargs(process.argv) const params: any = yargs(process.argv)
.usage("browsertrix-crawler profile [options]") .usage("browsertrix-crawler profile [options]")
.option(cliOpts()) .option(cliOpts()).argv;
.argv;
logger.setDebugLogging(true); logger.setDebugLogging(true);
@ -122,7 +137,7 @@ async function main() {
process.env.GEOMETRY || "", process.env.GEOMETRY || "",
"-ac", "-ac",
"+extension", "+extension",
"RANDR" "RANDR",
]); ]);
//await fsp.mkdir(path.join(homedir(), ".vnc"), {recursive: true}); //await fsp.mkdir(path.join(homedir(), ".vnc"), {recursive: true});
@ -140,7 +155,7 @@ async function main() {
"-passwd", "-passwd",
process.env.VNC_PASS || "", process.env.VNC_PASS || "",
"-display", "-display",
process.env.DISPLAY || "" process.env.DISPLAY || "",
]); ]);
} }
@ -156,13 +171,15 @@ async function main() {
"--window-position=0,0", "--window-position=0,0",
`--window-size=${params.windowSize}`, `--window-size=${params.windowSize}`,
// to disable the 'stability will suffer' infobar // to disable the 'stability will suffer' infobar
"--test-type" "--test-type",
] ],
} },
}); });
if (params.interactive) { if (params.interactive) {
logger.warn("Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode"); logger.warn(
"Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode",
);
} }
if (params.user || params.password) { if (params.user || params.password) {
@ -187,7 +204,10 @@ async function main() {
await browser.setupPage({ page, cdp }); await browser.setupPage({ page, cdp });
// for testing, inject browsertrix-behaviors // for testing, inject browsertrix-behaviors
await browser.addInitScript(page, behaviors + ";\nself.__bx_behaviors.init();"); await browser.addInitScript(
page,
behaviors + ";\nself.__bx_behaviors.init();",
);
} }
logger.info(`Loading page: ${params.url}`); logger.info(`Loading page: ${params.url}`);
@ -204,17 +224,26 @@ async function main() {
} }
} }
async function automatedProfile(
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
async function automatedProfile(params: any, browser: Browser, page: Page, cdp: CDPSession, params: any,
waitUntil: PuppeteerLifeCycleEvent) { browser: Browser,
page: Page,
cdp: CDPSession,
waitUntil: PuppeteerLifeCycleEvent,
) {
let u, p; let u, p;
logger.debug("Looking for username and password entry fields on page..."); logger.debug("Looking for username and password entry fields on page...");
try { try {
u = await page.waitForSelector("//input[contains(@name, 'user') or contains(@name, 'email')]"); u = await page.waitForSelector(
p = await page.waitForSelector("//input[contains(@name, 'pass') and @type='password']"); "//input[contains(@name, 'user') or contains(@name, 'email')]",
);
p = await page.waitForSelector(
"//input[contains(@name, 'pass') and @type='password']",
);
} catch (e) { } catch (e) {
if (params.debugScreenshot) { if (params.debugScreenshot) {
await page.screenshot({ path: params.debugScreenshot }); await page.screenshot({ path: params.debugScreenshot });
@ -231,7 +260,7 @@ async function automatedProfile(params: any, browser: Browser, page: Page, cdp:
await Promise.allSettled([ await Promise.allSettled([
p!.press("Enter"), p!.press("Enter"),
page.waitForNavigation({waitUntil}) page.waitForNavigation({ waitUntil }),
]); ]);
if (params.debugScreenshot) { if (params.debugScreenshot) {
@ -243,8 +272,15 @@ async function automatedProfile(params: any, browser: Browser, page: Page, cdp:
process.exit(0); process.exit(0);
} }
async function createProfile(
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
async function createProfile(params: any, browser: Browser, page: Page, cdp: CDPSession, targetFilename = "") { params: any,
browser: Browser,
page: Page,
cdp: CDPSession,
targetFilename = "",
) {
await cdp.send("Network.clearBrowserCache"); await cdp.send("Network.clearBrowserCache");
await browser.close(); await browser.close();
@ -276,7 +312,7 @@ function promptInput(msg: string, hidden = false) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
const rl: any = readline.createInterface({ const rl: any = readline.createInterface({
input: process.stdin, input: process.stdin,
output: process.stdout output: process.stdout,
}); });
if (hidden) { if (hidden) {
@ -303,7 +339,6 @@ function promptInput(msg: string, hidden = false) {
}); });
} }
class InteractiveBrowser { class InteractiveBrowser {
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
params: any; params: any;
@ -323,7 +358,7 @@ class InteractiveBrowser {
browser: Browser, browser: Browser,
page: Page, page: Page,
cdp: CDPSession, cdp: CDPSession,
targetId: string targetId: string,
) { ) {
logger.info("Creating Profile Interactively..."); logger.info("Creating Profile Interactively...");
child_process.spawn("socat", [ child_process.spawn("socat", [
@ -359,19 +394,19 @@ class InteractiveBrowser {
if (this.shutdownWait) { if (this.shutdownWait) {
this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait); this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
logger.debug( logger.debug(
`Shutting down in ${this.shutdownWait}ms if no ping received` `Shutting down in ${this.shutdownWait}ms if no ping received`,
); );
} else { } else {
this.shutdownTimer = null; this.shutdownTimer = null;
} }
const httpServer = http.createServer((req, res) => const httpServer = http.createServer((req, res) =>
this.handleRequest(req, res) this.handleRequest(req, res),
); );
const port = 9223; const port = 9223;
httpServer.listen(port); httpServer.listen(port);
logger.info( logger.info(
`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.` `Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`,
); );
if (!params.headless) { if (!params.headless) {
@ -452,8 +487,8 @@ class InteractiveBrowser {
res.end( res.end(
profileHTML.replace( profileHTML.replace(
"$DEVTOOLS_SRC", "$DEVTOOLS_SRC",
targetUrl.replaceAll("$HOST", parsedUrl.hostname) targetUrl.replaceAll("$HOST", parsedUrl.hostname),
) ),
); );
return; return;
@ -469,10 +504,10 @@ class InteractiveBrowser {
clearTimeout(this.shutdownTimer as any); clearTimeout(this.shutdownTimer as any);
this.shutdownTimer = setTimeout( this.shutdownTimer = setTimeout(
() => process.exit(0), () => process.exit(0),
this.shutdownWait this.shutdownWait,
); );
logger.debug( logger.debug(
`Ping received, delaying shutdown for ${this.shutdownWait}ms` `Ping received, delaying shutdown for ${this.shutdownWait}ms`,
); );
} }
@ -530,7 +565,7 @@ class InteractiveBrowser {
this.browser, this.browser,
this.page, this.page,
this.cdp, this.cdp,
targetFilename targetFilename,
); );
origins = Array.from(this.originSet.values()); origins = Array.from(this.originSet.values());
@ -558,13 +593,13 @@ class InteractiveBrowser {
res.writeHead(200, { "Content-Type": "text/html" }); res.writeHead(200, { "Content-Type": "text/html" });
res.end( res.end(
"<html><body>Profile Created! You may now close this window.</body></html>" "<html><body>Profile Created! You may now close this window.</body></html>",
); );
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
} catch (e: any) { } catch (e: any) {
res.writeHead(500, { "Content-Type": "text/html" }); res.writeHead(500, { "Content-Type": "text/html" });
res.end( res.end(
"<html><body>Profile creation failed! See the browsertrix-crawler console for more info" "<html><body>Profile creation failed! See the browsertrix-crawler console for more info",
); );
logger.warn("HTTP Error", e); logger.warn("HTTP Error", e);
} }
@ -576,7 +611,7 @@ class InteractiveBrowser {
if (pathname.startsWith("/vnc/")) { if (pathname.startsWith("/vnc/")) {
const fileUrl = new URL( const fileUrl = new URL(
"../node_modules/@novnc/novnc/" + pathname.slice("/vnc/".length), "../node_modules/@novnc/novnc/" + pathname.slice("/vnc/".length),
import.meta.url import.meta.url,
); );
const file = fs.readFileSync(fileUrl, { encoding: "utf-8" }); const file = fs.readFileSync(fileUrl, { encoding: "utf-8" });
res.writeHead(200, { "Content-Type": "application/javascript" }); res.writeHead(200, { "Content-Type": "application/javascript" });
@ -607,6 +642,4 @@ class InteractiveBrowser {
} }
} }
main(); main();


@ -2,6 +2,14 @@ import { Page } from "puppeteer-core";
import { PageState } from "./util/state.js"; import { PageState } from "./util/state.js";
import { Crawler } from "./crawler.js"; import { Crawler } from "./crawler.js";
export default async ({data, page, crawler} : {data: PageState, page: Page, crawler: Crawler}) => { export default async ({
data,
page,
crawler,
}: {
data: PageState;
page: Page;
crawler: Crawler;
}) => {
await crawler.loadPage(page, data); await crawler.loadPage(page, data);
}; };


@ -4,13 +4,11 @@ import { logger } from "./util/logger.js";
import { setExitOnRedisError } from "./util/redis.js"; import { setExitOnRedisError } from "./util/redis.js";
import { Crawler } from "./crawler.js"; import { Crawler } from "./crawler.js";
let crawler: Crawler | null = null; let crawler: Crawler | null = null;
let lastSigInt = 0; let lastSigInt = 0;
let forceTerm = false; let forceTerm = false;
async function handleTerminate(signame: string) { async function handleTerminate(signame: string) {
logger.info(`${signame} received...`); logger.info(`${signame} received...`);
if (!crawler || !crawler.crawlState) { if (!crawler || !crawler.crawlState) {
@ -53,5 +51,3 @@ process.on("SIGABRT", async () => {
crawler = new Crawler(); crawler = new Crawler();
crawler.run(); crawler.run();


@ -7,199 +7,225 @@ import { KnownDevices as devices } from "puppeteer-core";
import yargs, { Options } from "yargs"; import yargs, { Options } from "yargs";
import { hideBin } from "yargs/helpers"; import { hideBin } from "yargs/helpers";
import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES } from "./constants.js"; import {
BEHAVIOR_LOG_FUNC,
WAIT_UNTIL_OPTS,
EXTRACT_TEXT_TYPES,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js"; import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js"; import { interpolateFilename } from "./storage.js";
import { screenshotTypes } from "./screenshots.js"; import { screenshotTypes } from "./screenshots.js";
import { logger } from "./logger.js"; import { logger } from "./logger.js";
// ============================================================================ // ============================================================================
class ArgParser { class ArgParser {
get cliOpts(): { [key: string]: Options } { get cliOpts(): { [key: string]: Options } {
const coerce = (array: string[]) => { const coerce = (array: string[]) => {
return array.flatMap(v => v.split(",")).filter(x => !!x); return array.flatMap((v) => v.split(",")).filter((x) => !!x);
}; };
return { return {
"seeds": { seeds: {
alias: "url", alias: "url",
describe: "The URL to start crawling from", describe: "The URL to start crawling from",
type: "array", type: "array",
default: [], default: [],
}, },
"seedFile": { seedFile: {
alias: ["urlFile"], alias: ["urlFile"],
describe: "If set, read a list of seed urls, one per line, from the specified", describe:
"If set, read a list of seed urls, one per line, from the specified",
type: "string", type: "string",
}, },
"workers": { workers: {
alias: "w", alias: "w",
describe: "The number of workers to run in parallel", describe: "The number of workers to run in parallel",
default: 1, default: 1,
type: "number", type: "number",
}, },
"crawlId": { crawlId: {
alias: "id", alias: "id",
describe: "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to hostname)", describe:
"A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to hostname)",
type: "string", type: "string",
}, },
"waitUntil": { waitUntil: {
describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','", describe:
"Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
type: "array", type: "array",
default: ["load", "networkidle2"], default: ["load", "networkidle2"],
choices: WAIT_UNTIL_OPTS, choices: WAIT_UNTIL_OPTS,
coerce, coerce,
}, },
"depth": { depth: {
describe: "The depth of the crawl for all seeds", describe: "The depth of the crawl for all seeds",
default: -1, default: -1,
type: "number", type: "number",
}, },
"extraHops": { extraHops: {
describe: "Number of extra 'hops' to follow, beyond the current scope", describe: "Number of extra 'hops' to follow, beyond the current scope",
default: 0, default: 0,
type: "number" type: "number",
}, },
"pageLimit": { pageLimit: {
alias: "limit", alias: "limit",
describe: "Limit crawl to this number of pages", describe: "Limit crawl to this number of pages",
default: 0, default: 0,
type: "number", type: "number",
}, },
"maxPageLimit": { maxPageLimit: {
describe: "Maximum pages to crawl, overriding pageLimit if both are set", describe:
"Maximum pages to crawl, overriding pageLimit if both are set",
default: 0, default: 0,
type: "number", type: "number",
}, },
"pageLoadTimeout": { pageLoadTimeout: {
alias: "timeout", alias: "timeout",
describe: "Timeout for each page to load (in seconds)", describe: "Timeout for each page to load (in seconds)",
default: 90, default: 90,
type: "number", type: "number",
}, },
"scopeType": { scopeType: {
describe: "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes", describe:
"A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes",
type: "string", type: "string",
choices: ["page", "page-spa", "prefix", "host", "domain", "any", "custom"] choices: [
"page",
"page-spa",
"prefix",
"host",
"domain",
"any",
"custom",
],
}, },
"scopeIncludeRx": { scopeIncludeRx: {
alias: "include", alias: "include",
describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)", describe:
"Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
}, },
"scopeExcludeRx": { scopeExcludeRx: {
alias: "exclude", alias: "exclude",
describe: "Regex of page URLs that should be excluded from the crawl." describe: "Regex of page URLs that should be excluded from the crawl.",
}, },
"allowHashUrls": { allowHashUrls: {
describe: "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content", describe:
"Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
}, },
"blockRules": { blockRules: {
describe: "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe", describe:
"Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
type: "array", type: "array",
default: [], default: [],
}, },
"blockMessage": { blockMessage: {
describe: "If specified, when a URL is blocked, a record with this error message is added instead", describe:
"If specified, when a URL is blocked, a record with this error message is added instead",
type: "string", type: "string",
}, },
"blockAds": { blockAds: {
alias: "blockads", alias: "blockads",
describe: "If set, block advertisements from being loaded (based on Stephen Black's blocklist)", describe:
"If set, block advertisements from being loaded (based on Stephen Black's blocklist)",
type: "boolean", type: "boolean",
default: false, default: false,
}, },
"adBlockMessage": { adBlockMessage: {
describe: "If specified, when an ad is blocked, a record with this error message is added instead", describe:
"If specified, when an ad is blocked, a record with this error message is added instead",
type: "string", type: "string",
}, },
"collection": { collection: {
alias: "c", alias: "c",
describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)", describe:
"Collection name to crawl to (replay will be accessible under this name in pywb preview)",
type: "string", type: "string",
default: "crawl-@ts" default: "crawl-@ts",
}, },
"headless": { headless: {
describe: "Run in headless mode, otherwise start xvfb", describe: "Run in headless mode, otherwise start xvfb",
type: "boolean", type: "boolean",
default: false, default: false,
}, },
"driver": { driver: {
describe: "JS driver for the crawler", describe: "JS driver for the crawler",
type: "string", type: "string",
default: "./defaultDriver.js", default: "./defaultDriver.js",
}, },
"generateCDX": { generateCDX: {
alias: ["generatecdx", "generateCdx"], alias: ["generatecdx", "generateCdx"],
describe: "If set, generate index (CDXJ) for use with pywb after crawl is done", describe:
"If set, generate index (CDXJ) for use with pywb after crawl is done",
type: "boolean", type: "boolean",
default: false, default: false,
}, },
"combineWARC": { combineWARC: {
alias: ["combinewarc", "combineWarc"], alias: ["combinewarc", "combineWarc"],
describe: "If set, combine the warcs", describe: "If set, combine the warcs",
type: "boolean", type: "boolean",
default: false, default: false,
}, },
"rolloverSize": { rolloverSize: {
describe: "If set, declare the rollover size", describe: "If set, declare the rollover size",
default: 1000000000, default: 1000000000,
type: "number", type: "number",
}, },
"generateWACZ": { generateWACZ: {
alias: ["generatewacz", "generateWacz"], alias: ["generatewacz", "generateWacz"],
describe: "If set, generate wacz", describe: "If set, generate wacz",
type: "boolean", type: "boolean",
default: false, default: false,
}, },
"logging": { logging: {
describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, debug", describe:
"Logging options for crawler, can include: stats (enabled by default), jserrors, debug",
type: "array", type: "array",
default: ["stats"], default: ["stats"],
coerce, coerce,
}, },
"logLevel": { logLevel: {
describe: "Comma-separated list of log levels to include in logs", describe: "Comma-separated list of log levels to include in logs",
type: "array", type: "array",
default: [], default: [],
coerce, coerce,
}, },
"context": { context: {
describe: "Comma-separated list of contexts to include in logs", describe: "Comma-separated list of contexts to include in logs",
type: "array", type: "array",
default: [], default: [],
coerce, coerce,
}, },
"text": { text: {
describe: "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)", describe:
"Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
type: "array", type: "array",
choices: EXTRACT_TEXT_TYPES, choices: EXTRACT_TEXT_TYPES,
coerce: (array) => { coerce: (array) => {
@ -211,45 +237,51 @@ class ArgParser {
return []; return [];
} }
return coerce(array); return coerce(array);
} },
}, },
"cwd": { cwd: {
describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()", describe:
"Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
type: "string", type: "string",
default: process.cwd(), default: process.cwd(),
}, },
"mobileDevice": { mobileDevice: {
describe: "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts", describe:
"Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
type: "string", type: "string",
}, },
"userAgent": { userAgent: {
describe: "Override user-agent with specified string", describe: "Override user-agent with specified string",
type: "string", type: "string",
}, },
"userAgentSuffix": { userAgentSuffix: {
describe: "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)", describe:
"Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
type: "string", type: "string",
}, },
"useSitemap": { useSitemap: {
alias: "sitemap", alias: "sitemap",
describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified", describe:
"If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
}, },
"sitemapFromDate": { sitemapFromDate: {
alias: "sitemapFrom", alias: "sitemapFrom",
describe: "If set, filter URLs from sitemaps to those greater than or equal to provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)", describe:
"If set, filter URLs from sitemaps to those greater than or equal to provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
}, },
"statsFilename": { statsFilename: {
describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)" describe:
"If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)",
}, },
"behaviors": { behaviors: {
describe: "Which background behaviors to enable on each page", describe: "Which background behaviors to enable on each page",
type: "array", type: "array",
default: ["autoplay", "autofetch", "autoscroll", "siteSpecific"], default: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
@ -257,179 +289,204 @@ class ArgParser {
coerce, coerce,
}, },
"behaviorTimeout": { behaviorTimeout: {
describe: "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish.", describe:
"If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish.",
default: 90, default: 90,
type: "number", type: "number",
}, },
"pageExtraDelay": { pageExtraDelay: {
alias: "delay", alias: "delay",
describe: "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page", describe:
"If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page",
default: 0, default: 0,
type: "number", type: "number",
}, },
"dedupPolicy": { dedupPolicy: {
describe: "Deduplication policy", describe: "Deduplication policy",
default: "skip", default: "skip",
type: "string", type: "string",
choices: ["skip", "revisit", "keep"], choices: ["skip", "revisit", "keep"],
}, },
"profile": { profile: {
describe: "Path to tar.gz file which will be extracted and used as the browser profile", describe:
"Path to tar.gz file which will be extracted and used as the browser profile",
type: "string", type: "string",
}, },
"screenshot": { screenshot: {
describe: "Screenshot options for crawler, can include: view, thumbnail, fullPage", describe:
"Screenshot options for crawler, can include: view, thumbnail, fullPage",
type: "array", type: "array",
default: [], default: [],
choices: Array.from(Object.keys(screenshotTypes)), choices: Array.from(Object.keys(screenshotTypes)),
coerce, coerce,
}, },
"screencastPort": { screencastPort: {
describe: "If set to a non-zero value, starts an HTTP server with screencast accessible on this port", describe:
type: "number", "If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
default: 0
},
"screencastRedis": {
describe: "If set, will use the state store redis pubsub for screencasting. Requires --redisStoreUrl to be set",
type: "boolean",
default: false
},
"warcInfo": {
alias: ["warcinfo"],
describe: "Optional fields added to the warcinfo record in combined WARCs",
//type: "object"
},
"redisStoreUrl": {
describe: "If set, url for remote redis server to store state. Otherwise, using in-memory store",
type: "string",
default: "redis://localhost:6379/0"
},
"saveState": {
describe: "If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
type: "string",
default: "partial",
choices: ["never", "partial", "always"]
},
"saveStateInterval": {
describe: "If save state is set to 'always', also save state during the crawl at this interval (in seconds)",
type: "number",
default: 300,
},
"saveStateHistory": {
describe: "Number of save states to keep during the duration of a crawl",
type: "number",
default: 5,
},
"sizeLimit": {
describe: "If set, save state and exit if size limit exceeds this value",
type: "number", type: "number",
default: 0, default: 0,
}, },
"diskUtilization": { screencastRedis: {
describe: "If set, save state and exit if disk utilization exceeds this percentage value", describe:
"If set, will use the state store redis pubsub for screencasting. Requires --redisStoreUrl to be set",
type: "boolean",
default: false,
},
warcInfo: {
alias: ["warcinfo"],
describe:
"Optional fields added to the warcinfo record in combined WARCs",
//type: "object"
},
redisStoreUrl: {
describe:
"If set, url for remote redis server to store state. Otherwise, using in-memory store",
type: "string",
default: "redis://localhost:6379/0",
},
saveState: {
describe:
"If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
type: "string",
default: "partial",
choices: ["never", "partial", "always"],
},
saveStateInterval: {
describe:
"If save state is set to 'always', also save state during the crawl at this interval (in seconds)",
type: "number",
default: 300,
},
saveStateHistory: {
describe:
"Number of save states to keep during the duration of a crawl",
type: "number",
default: 5,
},
sizeLimit: {
describe:
"If set, save state and exit if size limit exceeds this value",
type: "number",
default: 0,
},
diskUtilization: {
describe:
"If set, save state and exit if disk utilization exceeds this percentage value",
type: "number", type: "number",
default: 90, default: 90,
}, },
"timeLimit": { timeLimit: {
describe: "If set, save state and exit after time limit, in seconds", describe: "If set, save state and exit after time limit, in seconds",
type: "number", type: "number",
default: 0, default: 0,
}, },
"healthCheckPort": { healthCheckPort: {
describe: "port to run healthcheck on", describe: "port to run healthcheck on",
type: "number", type: "number",
default: 0, default: 0,
}, },
"overwrite": { overwrite: {
describe: "overwrite current crawl data: if set, existing collection directory will be deleted before crawl is started", describe:
"overwrite current crawl data: if set, existing collection directory will be deleted before crawl is started",
type: "boolean", type: "boolean",
default: false default: false,
}, },
"waitOnDone": { waitOnDone: {
describe: "if set, wait for interrupt signal when finished instead of exiting", describe:
"if set, wait for interrupt signal when finished instead of exiting",
type: "boolean", type: "boolean",
default: false default: false,
}, },
"restartsOnError": { restartsOnError: {
describe: "if set, assume will be restarted if interrupted, don't run post-crawl processes on interrupt", describe:
"if set, assume will be restarted if interrupted, don't run post-crawl processes on interrupt",
type: "boolean", type: "boolean",
default: false default: false,
}, },
"netIdleWait": { netIdleWait: {
describe: "if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope", describe:
"if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
type: "number", type: "number",
default: -1 default: -1,
}, },
"lang": { lang: {
describe: "if set, sets the language used by the browser, should be ISO 639 language[-country] code", describe:
type: "string" "if set, sets the language used by the browser, should be ISO 639 language[-country] code",
type: "string",
}, },
"title": { title: {
describe: "If set, write supplied title into WACZ datapackage.json metadata", describe:
type: "string" "If set, write supplied title into WACZ datapackage.json metadata",
type: "string",
}, },
"description": { description: {
alias: ["desc"], alias: ["desc"],
describe: "If set, write supplied description into WACZ datapackage.json metadata", describe:
type: "string" "If set, write supplied description into WACZ datapackage.json metadata",
type: "string",
}, },
"originOverride": { originOverride: {
describe: "if set, will redirect requests from each origin in key to origin in the value, eg. --originOverride https://host:port=http://alt-host:alt-port", describe:
"if set, will redirect requests from each origin in key to origin in the value, eg. --originOverride https://host:port=http://alt-host:alt-port",
type: "array", type: "array",
default: [], default: [],
}, },
"logErrorsToRedis": { logErrorsToRedis: {
describe: "If set, write error messages to redis", describe: "If set, write error messages to redis",
type: "boolean", type: "boolean",
default: false, default: false,
}, },
"failOnFailedSeed": { failOnFailedSeed: {
describe: "If set, crawler will fail with exit code 1 if any seed fails", describe:
"If set, crawler will fail with exit code 1 if any seed fails",
type: "boolean", type: "boolean",
default: false default: false,
}, },
"failOnFailedLimit": { failOnFailedLimit: {
describe: "If set, save state and exit if number of failed pages exceeds this value", describe:
"If set, save state and exit if number of failed pages exceeds this value",
type: "number", type: "number",
default: 0, default: 0,
}, },
"customBehaviors": { customBehaviors: {
describe: "injects a custom behavior file or set of behavior files in a directory", describe:
type: "string" "injects a custom behavior file or set of behavior files in a directory",
type: "string",
}, },
"debugAccessRedis": { debugAccessRedis: {
describe: "if set, runs internal redis without protected mode to allow external access (for debugging)", describe:
"if set, runs internal redis without protected mode to allow external access (for debugging)",
type: "boolean", type: "boolean",
} },
}; };
} }
@ -445,16 +502,19 @@ class ArgParser {
const parsed = yargs(hideBin(argv)) const parsed = yargs(hideBin(argv))
.usage("crawler [options]") .usage("crawler [options]")
.option(this.cliOpts) .option(this.cliOpts)
.config("config", "Path to YAML config file", (configPath : string | number) => { .config(
"config",
"Path to YAML config file",
(configPath: string | number) => {
if (configPath === "/crawls/stdin") { if (configPath === "/crawls/stdin") {
configPath = process.stdin.fd; configPath = process.stdin.fd;
} }
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
origConfig = yaml.load(fs.readFileSync(configPath, "utf8")) as any; origConfig = yaml.load(fs.readFileSync(configPath, "utf8")) as any;
return origConfig; return origConfig;
}) },
.check((argv) => this.validateArgs(argv)) )
.argv; .check((argv) => this.validateArgs(argv)).argv;
return { parsed, origConfig }; return { parsed, origConfig };
} }
@ -463,7 +523,7 @@ class ArgParser {
// Split process.env.CRAWL_ARGS on spaces but retaining spaces within double quotes // Split process.env.CRAWL_ARGS on spaces but retaining spaces within double quotes
const regex = /"[^"]+"|[^\s]+/g; const regex = /"[^"]+"|[^\s]+/g;
const res = crawlArgs.match(regex); const res = crawlArgs.match(regex);
return res ? res.map(e => e.replace(/"(.+)"/, "$1")) : []; return res ? res.map((e) => e.replace(/"(.+)"/, "$1")) : [];
} }
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
@ -473,12 +533,14 @@ class ArgParser {
// Check that the collection name is valid. // Check that the collection name is valid.
if (argv.collection.search(/^[\w][\w-]*$/) === -1) { if (argv.collection.search(/^[\w][\w-]*$/) === -1) {
logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`); logger.fatal(
`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`,
);
} }
// background behaviors to apply // background behaviors to apply
const behaviorOpts: { [key: string]: string | boolean } = {}; const behaviorOpts: { [key: string]: string | boolean } = {};
argv.behaviors.forEach((x: string) => behaviorOpts[x] = true); argv.behaviors.forEach((x: string) => (behaviorOpts[x] = true));
behaviorOpts.log = BEHAVIOR_LOG_FUNC; behaviorOpts.log = BEHAVIOR_LOG_FUNC;
argv.behaviorOpts = JSON.stringify(behaviorOpts); argv.behaviorOpts = JSON.stringify(behaviorOpts);
@ -486,7 +548,9 @@ class ArgParser {
if (argv.mobileDevice) { if (argv.mobileDevice) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
argv.emulateDevice = (devices as Record<string, any>)[argv.mobileDevice.replace("-", " ")]; argv.emulateDevice = (devices as Record<string, any>)[
argv.mobileDevice.replace("-", " ")
];
if (!argv.emulateDevice) { if (!argv.emulateDevice) {
logger.fatal("Unknown device: " + argv.mobileDevice); logger.fatal("Unknown device: " + argv.mobileDevice);
} }
@ -498,7 +562,7 @@ class ArgParser {
const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8"); const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");
const urlSeedFileList = urlSeedFile.split("\n"); const urlSeedFileList = urlSeedFile.split("\n");
if (typeof(argv.seeds) === "string") { if (typeof argv.seeds === "string") {
argv.seeds = [argv.seeds]; argv.seeds = [argv.seeds];
} }
@ -530,7 +594,7 @@ class ArgParser {
argv.scopedSeeds = []; argv.scopedSeeds = [];
for (let seed of argv.seeds) { for (let seed of argv.seeds) {
if (typeof(seed) === "string") { if (typeof seed === "string") {
seed = { url: seed }; seed = { url: seed };
} }
@ -552,7 +616,7 @@ class ArgParser {
argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename); argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
} }
if ((argv.diskUtilization < 0 || argv.diskUtilization > 99)) { if (argv.diskUtilization < 0 || argv.diskUtilization > 99) {
argv.diskUtilization = 90; argv.diskUtilization = 90;
} }


@ -13,7 +13,7 @@ const BlockState = {
BLOCK_PAGE_NAV: "page", BLOCK_PAGE_NAV: "page",
BLOCK_IFRAME_NAV: "iframe", BLOCK_IFRAME_NAV: "iframe",
BLOCK_OTHER: "resource", BLOCK_OTHER: "resource",
BLOCK_AD: "advertisement" BLOCK_AD: "advertisement",
}; };
type BlockRuleDecl = { type BlockRuleDecl = {
@ -21,30 +21,30 @@ type BlockRuleDecl = {
frameTextMatch?: string; frameTextMatch?: string;
inFrameUrl?: string; inFrameUrl?: string;
type?: string; type?: string;
} };
// =========================================================================== // ===========================================================================
class BlockRule class BlockRule {
{
type: string; type: string;
url: RegExp | null; url: RegExp | null;
frameTextMatch?: RegExp | null; frameTextMatch?: RegExp | null;
inFrameUrl?: RegExp | null; inFrameUrl?: RegExp | null;
constructor(data: string | BlockRuleDecl) { constructor(data: string | BlockRuleDecl) {
if (typeof(data) === "string") { if (typeof data === "string") {
this.url = new RegExp(data); this.url = new RegExp(data);
this.type = "block"; this.type = "block";
} else { } else {
this.url = data.url ? new RegExp(data.url) : null; this.url = data.url ? new RegExp(data.url) : null;
this.frameTextMatch = data.frameTextMatch ? new RegExp(data.frameTextMatch) : null; this.frameTextMatch = data.frameTextMatch
? new RegExp(data.frameTextMatch)
: null;
this.inFrameUrl = data.inFrameUrl ? new RegExp(data.inFrameUrl) : null; this.inFrameUrl = data.inFrameUrl ? new RegExp(data.inFrameUrl) : null;
this.type = data.type || "block"; this.type = data.type || "block";
} }
if (!RULE_TYPES.includes(this.type)) { if (!RULE_TYPES.includes(this.type)) {
logger.fatal("Rule \"type\" must be: " + RULE_TYPES.join(", ")); logger.fatal('Rule "type" must be: ' + RULE_TYPES.join(", "));
} }
} }
@ -59,16 +59,18 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
} }
} }
// =========================================================================== // ===========================================================================
export class BlockRules export class BlockRules {
{
rules: BlockRule[]; rules: BlockRule[];
blockPutUrl: string; blockPutUrl: string;
blockErrMsg: string; blockErrMsg: string;
blockedUrlSet = new Set(); blockedUrlSet = new Set();
constructor(blockRules: BlockRuleDecl[], blockPutUrl: string, blockErrMsg: string) { constructor(
blockRules: BlockRuleDecl[],
blockPutUrl: string,
blockErrMsg: string,
) {
this.rules = []; this.rules = [];
this.blockPutUrl = blockPutUrl; this.blockPutUrl = blockPutUrl;
this.blockErrMsg = blockErrMsg; this.blockErrMsg = blockErrMsg;
@ -93,7 +95,11 @@ export class BlockRules
try { try {
await this.handleRequest(request, logDetails); await this.handleRequest(request, logDetails);
} catch (e) { } catch (e) {
logger.warn("Error handling request", {...errJSON(e), ...logDetails}, "blocking"); logger.warn(
"Error handling request",
{ ...errJSON(e), ...logDetails },
"blocking",
);
} }
}; };
await browser.interceptRequest(page, onRequest); await browser.interceptRequest(page, onRequest);
@ -113,14 +119,22 @@ export class BlockRules
} else { } else {
await request.abort("blockedbyclient", 1); await request.abort("blockedbyclient", 1);
} }
} catch (e) { } catch (e) {
logger.debug(`Block: (${blockState}) Failed On: ${url}`, {...errJSON(e), ...logDetails}, "blocking"); logger.debug(
`Block: (${blockState}) Failed On: ${url}`,
{ ...errJSON(e), ...logDetails },
"blocking",
);
} }
} }
async shouldBlock(
request: HTTPRequest,
url: string,
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
async shouldBlock(request: HTTPRequest, url: string, logDetails: Record<string, any>) { logDetails: Record<string, any>,
) {
if (!url.startsWith("http:") && !url.startsWith("https:")) { if (!url.startsWith("http:") && !url.startsWith("https:")) {
return BlockState.ALLOW; return BlockState.ALLOW;
} }
@ -162,14 +176,29 @@ export class BlockRules
} }
for (const rule of this.rules) { for (const rule of this.rules) {
const {done, block} = await this.ruleCheck(rule, request, url, frameUrl, isNavReq, logDetails); const { done, block } = await this.ruleCheck(
rule,
request,
url,
frameUrl,
isNavReq,
logDetails,
);
if (block) { if (block) {
if (blockState === BlockState.BLOCK_PAGE_NAV) { if (blockState === BlockState.BLOCK_PAGE_NAV) {
logger.warn("Block rule match for page request ignored, set --exclude to block full pages", {url, ...logDetails}, "blocking"); logger.warn(
"Block rule match for page request ignored, set --exclude to block full pages",
{ url, ...logDetails },
"blocking",
);
return BlockState.ALLOW; return BlockState.ALLOW;
} }
logger.debug("URL Blocked in iframe", {url, frameUrl, ...logDetails}, "blocking"); logger.debug(
"URL Blocked in iframe",
{ url, frameUrl, ...logDetails },
"blocking",
);
await this.recordBlockMsg(url); await this.recordBlockMsg(url);
return blockState; return blockState;
} }
@ -181,19 +210,27 @@ export class BlockRules
return BlockState.ALLOW; return BlockState.ALLOW;
} }
async ruleCheck(
rule: BlockRule,
request: HTTPRequest,
reqUrl: string,
frameUrl: string,
isNavReq: boolean,
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
async ruleCheck(rule: BlockRule, request: HTTPRequest, reqUrl: string, frameUrl: string, isNavReq: boolean, logDetails: Record<string, any>) { logDetails: Record<string, any>,
) {
const { url, inFrameUrl, frameTextMatch } = rule; const { url, inFrameUrl, frameTextMatch } = rule;
const type = rule.type || "block"; const type = rule.type || "block";
const allowOnly = (type === "allowOnly"); const allowOnly = type === "allowOnly";
// not a frame match, skip rule // not a frame match, skip rule
if (inFrameUrl && !frameUrl.match(inFrameUrl)) { if (inFrameUrl && !frameUrl.match(inFrameUrl)) {
return { block: false, done: false }; return { block: false, done: false };
} }
const urlMatched = (url && reqUrl.match(url)); const urlMatched = url && reqUrl.match(url);
// if frame text-based rule: if url matched and a frame request // if frame text-based rule: if url matched and a frame request
// frame text-based match: only applies to nav requests, never block otherwise // frame text-based match: only applies to nav requests, never block otherwise
@ -202,8 +239,19 @@ export class BlockRules
return { block: false, done: false }; return { block: false, done: false };
} }
const block = await this.isTextMatch(request, reqUrl, frameTextMatch, logDetails) ? !allowOnly : allowOnly; const block = (await this.isTextMatch(
logger.debug("URL Conditional rule in iframe", {...logDetails, url, rule: block ? "BLOCKED" : "ALLOWED", frameUrl}, "blocking"); request,
reqUrl,
frameTextMatch,
logDetails,
))
? !allowOnly
: allowOnly;
logger.debug(
"URL Conditional rule in iframe",
{ ...logDetails, url, rule: block ? "BLOCKED" : "ALLOWED", frameUrl },
"blocking",
);
return { block, done: true }; return { block, done: true };
} }
@@ -212,16 +260,25 @@ export class BlockRules
    return { block, done: false };
  }

  async isTextMatch(
    request: HTTPRequest,
    reqUrl: string,
    frameTextMatch: RegExp,
    // TODO: Fix this the next time the file is edited.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    logDetails: Record<string, any>,
  ) {
    try {
      const res = await fetch(reqUrl);
      const text = await res.text();

      return !!text.match(frameTextMatch);
    } catch (e) {
      logger.debug(
        "Error determining text match",
        { ...errJSON(e), ...logDetails },
        "blocking",
      );
    }
  }
@@ -239,19 +296,29 @@ export class BlockRules
    const body = this.blockErrMsg;
    const putUrl = new URL(this.blockPutUrl);
    putUrl.searchParams.set("url", url);
    await fetch(putUrl.href, {
      method: "PUT",
      headers: { "Content-Type": "text/html" },
      body,
    });
  }
}
// ===========================================================================
export class AdBlockRules extends BlockRules {
  adhosts: string[];

  constructor(
    blockPutUrl: string,
    blockErrMsg: string,
    adhostsFilePath = "../../ad-hosts.json",
  ) {
    super([], blockPutUrl, blockErrMsg);
    this.adhosts = JSON.parse(
      fs.readFileSync(new URL(adhostsFilePath, import.meta.url), {
        encoding: "utf-8",
      }),
    );
  }

  isAdUrl(url: string) {
@@ -260,10 +327,19 @@ export class AdBlockRules extends BlockRules
    return domain && this.adhosts.includes(domain);
  }

  async shouldBlock(
    request: HTTPRequest,
    url: string,
    // TODO: Fix this the next time the file is edited.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    logDetails: Record<string, any>,
  ) {
    if (this.isAdUrl(url)) {
      logger.debug(
        "URL blocked for being an ad",
        { url, ...logDetails },
        "blocking",
      );
      await this.recordBlockMsg(url);
      return BlockState.BLOCK_AD;
    }
View file
@@ -9,28 +9,32 @@ import path from "path";
import { logger } from "./logger.js";
import { initStorage } from "./storage.js";

import puppeteer, {
  Frame,
  HTTPRequest,
  Page,
  PuppeteerLaunchOptions,
  Viewport,
} from "puppeteer-core";
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
type LaunchOpts = {
  profileUrl: string;
  // TODO: Fix this the next time the file is edited.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  chromeOptions: Record<string, any>;
  signals: boolean;
  headless: boolean;
  // TODO: Fix this the next time the file is edited.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  emulateDevice?: Record<string, any>;
  // TODO: Fix this the next time the file is edited.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  ondisconnect?: ((err: any) => NonNullable<unknown>) | null;
};

// ==================================================================
export class Browser {
  profileDir: string;
  customProfile = false;
  // TODO: Fix this the next time the file is edited.
@@ -48,7 +52,15 @@ export class Browser
    this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
  }

  async launch({
    profileUrl,
    chromeOptions,
    signals = false,
    headless = false,
    emulateDevice = {},
    ondisconnect = null,
  }: LaunchOpts) {
    if (this.isLaunched()) {
      return;
    }
@@ -81,14 +93,17 @@ export class Browser
      defaultViewport,
      waitForInitialPage: false,
      userDataDir: this.profileDir,
    };

    await this._init(launchOpts, ondisconnect);
  }

  async setupPage({ page }: { page: Page; cdp: CDPSession }) {
    await this.addInitScript(
      page,
      'Object.defineProperty(navigator, "webdriver", {value: false});',
    );

    if (this.customProfile) {
      logger.info("Disabling Service Workers for profile", {}, "browser");
@@ -100,17 +115,23 @@ export class Browser
  async loadProfile(profileFilename: string): Promise<boolean> {
    const targetFilename = "/tmp/profile.tar.gz";

    if (
      profileFilename &&
      (profileFilename.startsWith("http:") ||
        profileFilename.startsWith("https:"))
    ) {
      logger.info(
        `Downloading ${profileFilename} to ${targetFilename}`,
        {},
        "browserProfile",
      );
      const resp = await fetch(profileFilename);
      await pipeline(
        // TODO: Fix this the next time the file is edited.
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        Readable.fromWeb(resp.body as any),
        fs.createWriteStream(targetFilename),
      );
      profileFilename = targetFilename;
@@ -118,7 +139,9 @@ export class Browser
      const storage = initStorage();
      if (!storage) {
        logger.fatal(
          "Profile specified relative to s3 storage, but no S3 storage defined",
        );
        return false;
      }
@@ -129,7 +152,9 @@ export class Browser
    if (profileFilename) {
      try {
        child_process.execSync("tar xvfz " + profileFilename, {
          cwd: this.profileDir,
        });
        return true;
      } catch (e) {
        logger.error(`Profile filename ${profileFilename} not a valid tar.gz`);
@@ -140,7 +165,9 @@ export class Browser
  }

  saveProfile(profileFilename: string) {
    child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {
      cwd: this.profileDir,
    });
  }

  chromeArgs({ proxy = true, userAgent = null, extraArgs = [] } = {}) {
@@ -162,7 +189,9 @@ export class Browser
    if (proxy) {
      args.push("--ignore-certificate-errors");
      args.push(
        `--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
      );
    }

    return args;
@@ -174,7 +203,9 @@ export class Browser
    try {
      const browser = this.getBrowserExe();
      if (browser) {
        version = child_process.execFileSync(browser, ["--version"], {
          encoding: "utf8",
        });
        const match = version && version.match(/[\d.]+/);
        if (match) {
          version = match[0];
@@ -188,7 +219,11 @@ export class Browser
  }

  getBrowserExe() {
    const files = [
      process.env.BROWSER_BIN,
      "/usr/bin/google-chrome",
      "/usr/bin/chromium-browser",
    ];
    for (const file of files) {
      if (file && fs.existsSync(file)) {
        return file;
@@ -196,14 +231,25 @@ export class Browser
    }
  }
  async evaluateWithCLI_(
    cdp: CDPSession,
    frame: Frame,
    cdpContextId: number,
    funcString: string,
    logData: Record<string, string>,
    contextName: string,
  ) {
    const frameUrl = frame.url();
    // TODO: Fix this the next time the file is edited.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    let details: Record<string, any> = { frameUrl, ...logData };

    if (!frameUrl || frame.isDetached()) {
      logger.info(
        "Run Script Skipped, frame no longer attached or has no URL",
        details,
        contextName,
      );
      return false;
    }
@@ -213,8 +259,7 @@ export class Browser
    //const contextId = context._contextId;
    const expression = funcString + "\n//# sourceURL=__evaluation_script__";

    const { exceptionDetails, result } = await cdp.send("Runtime.evaluate", {
      expression,
      contextId: cdpContextId,
      returnByValue: true,
@@ -225,7 +270,11 @@ export class Browser
    if (exceptionDetails) {
      if (exceptionDetails.stackTrace) {
        details = {
          ...exceptionDetails.stackTrace,
          text: exceptionDetails.text,
          ...details,
        };
      }
      logger.error("Run Script Failed", details, contextName);
    } else {
@@ -256,8 +305,11 @@ export class Browser
    return page.evaluateOnNewDocument(script);
  }

  async _init(
    launchOpts: PuppeteerLaunchOptions,
    // eslint-disable-next-line @typescript-eslint/ban-types
    ondisconnect: Function | null = null,
  ) {
    this.browser = await puppeteer.launch(launchOpts);

    const target = this.browser.target();
@@ -274,9 +326,10 @@ export class Browser
    });
  }
  async newWindowPageWithCDP(): Promise<{ cdp: CDPSession; page: Page }> {
    // unique url to detect new pages
    const startPage =
      "about:blank?_browsertrix" + Math.random().toString(36).slice(2);

    const p = new Promise<Target>((resolve) => {
      const listener = (target: Target) => {
@@ -298,7 +351,10 @@ export class Browser
    }

    try {
      await this.firstCDP.send("Target.createTarget", {
        url: startPage,
        newWindow: true,
      });
    } catch (e) {
      if (!this.browser) {
        throw e;
@@ -307,7 +363,10 @@ export class Browser
      this.firstCDP = await target.createCDPSession();

      await this.firstCDP.send("Target.createTarget", {
        url: startPage,
        newWindow: true,
      });
    }
    const target = await p;
@@ -350,7 +409,11 @@ export class Browser
      try {
        await this.firstCDP.send("Fetch.continueResponse", { requestId });
      } catch (e) {
        logger.warn(
          "continueResponse failed",
          { url: request.url },
          "recorder",
        );
      }
      return;
    }
@@ -369,12 +432,20 @@ export class Browser
      }

      if (!foundRecorder) {
        logger.debug(
          "Skipping URL from unknown frame",
          { url: request.url, frameId },
          "recorder",
        );

        try {
          await this.firstCDP.send("Fetch.continueResponse", { requestId });
        } catch (e) {
          logger.warn(
            "continueResponse failed",
            { url: request.url },
            "recorder",
          );
        }

        return;
@@ -383,7 +454,9 @@ export class Browser
      await foundRecorder.handleRequestPaused(params, this.firstCDP, true);
    });

    await this.firstCDP.send("Fetch.enable", {
      patterns: [{ urlPattern: "*", requestStage: "Response" }],
    });
  }
  // TODO: Fix this the next time the file is edited.
@@ -395,14 +468,21 @@ export class Browser
    funcString: string,
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    logData: Record<string, any>,
    contextName: string,
  ) {
    // TODO: Fix this the next time the file is edited.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const context = await (frame as any).executionContext();
    cdp = context._client;
    const cdpContextId = context._contextId;
    return await this.evaluateWithCLI_(
      cdp,
      frame,
      cdpContextId,
      funcString,
      logData,
      contextName,
    );
  }
  interceptRequest(page: Page, callback: (event: HTTPRequest) => void) {
@@ -428,7 +508,6 @@ export class Browser
  }
}

// ==================================================================
// Default Chromium args from playwright
export const defaultArgs = [
@@ -470,5 +549,5 @@ export const defaultArgs = [
  "--apps-gallery-url=https://invalid.webstore.example.com/",
  "--apps-gallery-update-url=https://invalid.webstore.example.com/",
  "--component-updater=url-source=http://invalid.dev/",
  "--brave-stats-updater-server=url-source=http://invalid.dev/",
];
View file
@@ -1,15 +1,24 @@
export const HTML_TYPES = [
  "text/html",
  "application/xhtml",
  "application/xhtml+xml",
];
export const WAIT_UNTIL_OPTS = [
  "load",
  "domcontentloaded",
  "networkidle0",
  "networkidle2",
];
export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];

export const BEHAVIOR_LOG_FUNC = "__bx_log";
export const ADD_LINK_FUNC = "__bx_addLink";
export const MAX_DEPTH = 1000000;

export const DEFAULT_SELECTORS = [
  {
    selector: "a[href]",
    extract: "href",
    isAttribute: false,
  },
];
View file
@@ -3,11 +3,17 @@ import path from "path";

const MAX_DEPTH = 2;

export function collectAllFileSources(
  fileOrDir: string,
  ext?: string,
  depth = 0,
): string[] {
  const resolvedPath = path.resolve(fileOrDir);

  if (depth >= MAX_DEPTH) {
    console.warn(
      `WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
    );
    return [];
  }
@@ -27,7 +33,9 @@ export function collectAllFileSources(fileOrDir: string, ext?: string, depth = 0
  }

  if (depth === 0) {
    console.warn(
      `WARN: The provided path "${resolvedPath}" is not a .js file or directory.`,
    );
  }

  return [];
View file
@@ -2,10 +2,8 @@ import http from "http";
import url from "url";
import { logger } from "./logger.js";

// ===========================================================================
export class HealthChecker {
  port: number;
  errorThreshold: number;
  healthServer: http.Server;
@@ -16,7 +14,9 @@ export class HealthChecker
    this.port = port;
    this.errorThreshold = errorThreshold;

    this.healthServer = http.createServer((...args) =>
      this.healthCheck(...args),
    );
    logger.info(`Healthcheck server started on ${port}`, {}, "healthcheck");
    this.healthServer.listen(port);
  }
@@ -26,21 +26,33 @@ export class HealthChecker
    switch (pathname) {
      case "/healthz":
        if (this.errorCount < this.errorThreshold) {
          logger.debug(
            `health check ok, num errors ${this.errorCount} < ${this.errorThreshold}`,
            {},
            "healthcheck",
          );
          res.writeHead(200);
          res.end();
        }
        return;
    }

    logger.error(
      `health check failed: ${this.errorCount} >= ${this.errorThreshold}`,
      {},
      "healthcheck",
    );
    res.writeHead(503);
    res.end();
  }

  resetErrors() {
    if (this.errorCount > 0) {
      logger.info(
        `Page loaded, resetting error count ${this.errorCount} to 0`,
        {},
        "healthcheck",
      );
      this.errorCount = 0;
    }
  }
@@ -49,4 +61,3 @@ export class HealthChecker
    this.errorCount++;
  }
}
View file
@@ -5,24 +5,23 @@ import { Writable } from "node:stream";
import { RedisCrawlState } from "./state.js";

// RegExp.prototype.toJSON = RegExp.prototype.toString;
Object.defineProperty(RegExp.prototype, "toJSON", {
  value: RegExp.prototype.toString,
});

// ===========================================================================
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export function errJSON(e: any) {
  if (e instanceof Error) {
    return { type: "exception", message: e.message, stack: e.stack };
  } else {
    return { message: e.toString() };
  }
}

// ===========================================================================
class Logger {
  logStream: Writable | null = null;
  debugLogging = false;
  logErrorsToRedis = false;
@@ -66,12 +65,12 @@ class Logger
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    data: Record<string, string> | Error | any,
    context: string,
    logLevel = "info",
  ) {
    if (data instanceof Error) {
      data = errJSON(data);
    } else if (typeof data !== "object") {
      data = { message: data.toString() };
    }

    if (this.logLevels.length) {
@@ -87,11 +86,11 @@ class Logger
    }

    const dataToLog = {
      timestamp: new Date().toISOString(),
      logLevel: logLevel,
      context: context,
      message: message,
      details: data ? data : {},
    };
    const string = JSON.stringify(dataToLog);
    console.log(string);
@@ -100,7 +99,11 @@ class Logger
    }

    const toLogToRedis = ["error", "fatal"];
    if (
      this.logErrorsToRedis &&
      this.crawlState &&
      toLogToRedis.includes(logLevel)
    ) {
      this.crawlState.logError(string);
    }
  }
View file
@@ -2,9 +2,8 @@ import { HTTPRequest, Page } from "puppeteer-core";
import { errJSON, logger } from "./logger.js";
import { Browser } from "./browser.js";

export class OriginOverride {
  originOverride: { origUrl: URL; destUrl: URL }[];

  constructor(originOverride: string[]) {
    this.originOverride = originOverride.map((override) => {
@@ -50,12 +49,19 @@ export class OriginOverride
        const respHeaders = Object.fromEntries(resp.headers);
        const status = resp.status;

        logger.debug(
          "Origin overridden",
          { orig: url, dest: newUrl, status, body: body.length },
          "originoverride",
        );

        request.respond({ body, headers: respHeaders, status }, -1);
      } catch (e) {
        logger.warn(
          "Error overriding origin",
          { ...errJSON(e), url: page.url() },
          "originoverride",
        );
        request.continue({}, -1);
      }
    };
View file
@@ -12,8 +12,11 @@ import { RequestResponseInfo } from "./reqresp.js";
// @ts-expect-error TODO fill in why error is expected
import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js";
import {
  rewriteDASH,
  rewriteHLS,
  // @ts-expect-error TODO fill in why error is expected
} from "@webrecorder/wabac/src/rewrite/rewriteVideo.js";

import { WARCRecord } from "warcio";
import { TempFileBuffer, WARCSerializer } from "warcio/node";
@@ -30,7 +33,6 @@ const WRITE_DUPE_KEY = "s:writedupe";
const encoder = new TextEncoder();

// =================================================================
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unused-vars
@@ -39,8 +41,7 @@ function logNetwork(msg: string, data: any) {
}

// =================================================================
export class Recorder {
  workerid: WorkerId;
  collDir: string;
  // TODO: Fix this the next time the file is edited.
@@ -76,12 +77,17 @@ export class Recorder
  pageid!: string;
  constructor({
    workerid,
    collDir,
    crawler,
  }: {
    workerid: WorkerId;
    collDir: string;
    // TODO: Fix this the next time the file is edited.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    crawler: any;
  }) {
    this.workerid = workerid;
    this.crawler = crawler;
    this.crawlState = crawler.crawlState;
@@ -108,7 +114,7 @@ export class Recorder
      tempCdxDir: this.tempCdxDir,
      filename,
      gzip: this.gzip,
      logDetails: this.logDetails,
    });
  }
@@ -119,17 +125,25 @@ export class Recorder
      this.handleRequestPaused(params, cdp);
    });

    await cdp.send("Fetch.enable", {
      patterns: [{ urlPattern: "*", requestStage: "Response" }],
    });

    // Response
    cdp.on("Network.responseReceived", (params) => {
      // handling to fill in security details
      logNetwork("Network.responseReceived", {
        requestId: params.requestId,
        ...this.logDetails,
      });
      this.handleResponseReceived(params);
    });

    cdp.on("Network.responseReceivedExtraInfo", (params) => {
      logNetwork("Network.responseReceivedExtraInfo", {
        requestId: params.requestId,
        ...this.logDetails,
      });
      const reqresp = this.pendingReqResp(params.requestId, true);
      if (reqresp) {
        reqresp.fillResponseReceivedExtraInfo(params);
@@ -142,29 +156,44 @@ export class Recorder
      // only handling redirect here, committing last response in redirect chain
      // request data stored from requestPaused
      if (params.redirectResponse) {
        logNetwork("Network.requestWillBeSent after redirect", {
          requestId: params.requestId,
          ...this.logDetails,
        });
        this.handleRedirectResponse(params);
      }
    });

    cdp.on("Network.requestServedFromCache", (params) => {
      logNetwork("Network.requestServedFromCache", {
        requestId: params.requestId,
        ...this.logDetails,
      });
      this.removeReqResp(params.requestId);
    });

    cdp.on("Network.requestWillBeSentExtraInfo", (params) => {
      logNetwork("Network.requestWillBeSentExtraInfo", {
        requestId: params.requestId,
        ...this.logDetails,
      });
      this.handleRequestExtraInfo(params);
    });

    // Loading
    cdp.on("Network.loadingFinished", (params) => {
      logNetwork("Network.loadingFinished", {
        requestId: params.requestId,
        ...this.logDetails,
      });
      this.handleLoadingFinished(params);
    });

    cdp.on("Network.loadingFailed", (params) => {
      logNetwork("Network.loadingFailed", {
        requestId: params.requestId,
        ...this.logDetails,
      });
      this.handleLoadingFailed(params);
    });
@@ -189,7 +218,11 @@ export class Recorder
      }
    });

    await cdp.send("Target.setAutoAttach", {
      autoAttach: true,
      waitForDebuggerOnStart: false,
      flatten: true,
    });
  }
  handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) {
@@ -203,7 +236,9 @@ export class Recorder
    reqresp.fillResponse(response);
  }

  handleRequestExtraInfo(
    params: Protocol.Network.RequestWillBeSentExtraInfoEvent,
  ) {
    if (!this.shouldSkip(params.headers)) {
      const reqresp = this.pendingReqResp(params.requestId, true);
      if (reqresp) {
@@ -225,7 +260,11 @@ export class Recorder
    reqresp.fillResponse(redirectResponse);

    if (reqresp.isSelfRedirect()) {
      logger.warn(
        "Skipping self redirect",
        { url: reqresp.url, status: reqresp.status, ...this.logDetails },
        "recorder",
      );
      return;
    }
@@ -253,17 +292,34 @@ export class Recorder
        if (type === "Document" && reqresp.isValidBinary()) {
          this.serializeToWARC(reqresp);
          //} else if (url) {
        } else if (
          url &&
          reqresp.requestHeaders &&
          reqresp.requestHeaders["x-browsertrix-fetch"]
        ) {
          delete reqresp.requestHeaders["x-browsertrix-fetch"];
          logger.warn(
            "Attempt direct fetch of failed request",
            { url, ...this.logDetails },
            "recorder",
          );
          const fetcher = new AsyncFetcher({
            tempdir: this.tempdir,
            reqresp,
            recorder: this,
            networkId: requestId,
          });
          this.fetcherQ.add(() => fetcher.load());
          return;
        }
        break;

      default:
        logger.warn(
          "Request failed",
          { url, errorText, ...this.logDetails },
          "recorder",
        );
    }
    this.removeReqResp(requestId);
  }
@@ -284,40 +340,82 @@ export class Recorder
    this.serializeToWARC(reqresp);
  }

  async handleRequestPaused(
    params: Protocol.Fetch.RequestPausedEvent,
    cdp: CDPSession,
    isSWorker = false,
  ) {
    const {
      requestId,
      request,
      responseStatusCode,
      responseErrorReason,
      resourceType,
      networkId,
    } = params;
    const { method, headers, url } = request;

    logNetwork("Fetch.requestPaused", {
      requestId,
      networkId,
      url,
      ...this.logDetails,
    });

    let continued = false;

    try {
      if (
        responseStatusCode &&
        !responseErrorReason &&
        !this.shouldSkip(headers, url, method, resourceType) &&
        !(isSWorker && networkId)
      ) {
        continued = await this.handleFetchResponse(params, cdp, isSWorker);
      }
    } catch (e) {
      logger.error(
        "Error handling response, probably skipping URL",
        { url, ...errJSON(e), ...this.logDetails },
        "recorder",
      );
    }

    if (!continued) {
      try {
        await cdp.send("Fetch.continueResponse", { requestId });
      } catch (e) {
        logger.debug(
          "continueResponse failed",
          { requestId, networkId, url, ...errJSON(e), ...this.logDetails },
          "recorder",
        );
      }
    }
  }
  async handleFetchResponse(
    params: Protocol.Fetch.RequestPausedEvent,
    cdp: CDPSession,
    isSWorker: boolean,
  ) {
    const { request } = params;
    const { url } = request;
    const {
      requestId,
      responseErrorReason,
      responseStatusCode,
      responseHeaders,
    } = params;
    const networkId = params.networkId || requestId;

    if (responseErrorReason) {
      logger.warn(
        "Skipping failed response",
        { url, reason: responseErrorReason, ...this.logDetails },
        "recorder",
      );
      return false;
    }
@@ -325,10 +423,21 @@ export class Recorder
    if (responseStatusCode === 206) {
      const range = this._getContentRange(responseHeaders);
      if (
        this.allowFull206 &&
        range === `bytes 0-${contentLen - 1}/${contentLen}`
      ) {
        logger.debug(
          "Keep 206 Response, Full Range",
          { range, contentLen, url, networkId, ...this.logDetails },
          "recorder",
        );
      } else {
        logger.debug(
          "Skip 206 Response",
          { range, contentLen, url, ...this.logDetails },
          "recorder",
        );
        this.removeReqResp(networkId);
        return false;
      }
@@ -355,11 +464,22 @@ export class Recorder
    let streamingConsume = false;

    if (contentLen < 0 || contentLen > MAX_BROWSER_FETCH_SIZE) {
      const opts = {
        tempdir: this.tempdir,
        reqresp,
        expectedSize: contentLen,
        recorder: this,
        networkId,
        cdp,
      };

      // fetching using response stream, await here and then either call fulFill, or if not started, return false
      if (contentLen < 0) {
        const fetcher = new ResponseStreamAsyncFetcher({
          ...opts,
          requestId,
          cdp,
        });
        const res = await fetcher.load();
        switch (res) {
          case "dupe":
@@ -384,15 +504,31 @@ export class Recorder
        this.fetcherQ.add(() => fetcher.load());
        return false;
      }
    } else {
      try {
        logNetwork("Fetching response", {
          sizeExpected: this._getContentLen(responseHeaders),
          url,
          networkId,
          ...this.logDetails,
        });
        const { body, base64Encoded } = await cdp.send(
          "Fetch.getResponseBody",
          { requestId },
        );
        reqresp.payload = Buffer.from(body, base64Encoded ? "base64" : "utf-8");
        logNetwork("Fetch done", {
          size: reqresp.payload.length,
          url,
          networkId,
          ...this.logDetails,
        });
      } catch (e) {
        logger.warn(
          "Failed to load response body",
          { url, networkId, ...errJSON(e), ...this.logDetails },
          "recorder",
        );
        return false;
      }
    }
@@ -409,39 +545,58 @@ export class Recorder
    // not rewritten, and not streaming, return false to continue
    if (!rewritten && !streamingConsume) {
      if (!reqresp.payload) {
        logger.error(
          "Unable to get payload skipping recording",
          { url, ...this.logDetails },
          "recorder",
        );
        this.removeReqResp(networkId);
      }
      return false;
    }

    // if has payload, encode it, otherwise return empty string
    const body =
      reqresp.payload && reqresp.payload.length
        ? Buffer.from(reqresp.payload).toString("base64")
        : "";

    try {
      await cdp.send("Fetch.fulfillRequest", {
        requestId,
        responseCode: responseStatusCode || 0,
        responseHeaders,
        body,
      });
    } catch (e) {
      const type = reqresp.resourceType;
      if (type === "Document") {
        logger.debug(
          "document not loaded in browser, possibly other URLs missing",
          { url, type: reqresp.resourceType },
          "recorder",
        );
      } else {
        logger.debug(
          "URL not loaded in browser",
          { url, type: reqresp.resourceType },
          "recorder",
        );
      }
    }

    return true;
  }
  startPage({ pageid, url }: { pageid: string; url: string }) {
    this.pageid = pageid;
    this.logDetails = { page: url, workerid: this.workerid };
    if (this.pendingRequests && this.pendingRequests.size) {
      logger.debug(
        "Interrupting timed out requests, moving to next page",
        this.logDetails,
        "recorder",
      );
    }
    this.pendingRequests = new Map();
    this.skipIds = new Set();
@@ -465,7 +620,12 @@ export class Recorder
      const pending = [];
      for (const [requestId, reqresp] of this.pendingRequests.entries()) {
        const url = reqresp.url || "";
        const entry: {
          requestId: string;
          url: string;
          expectedSize?: number;
          readSize?: number;
        } = { requestId, url };
        if (reqresp.expectedSize) {
          entry.expectedSize = reqresp.expectedSize;
        }
@@ -475,7 +635,11 @@ export class Recorder
        pending.push(entry);
      }

      logger.debug(
        "Finishing pending requests for page",
        { numPending, pending, ...this.logDetails },
        "recorder",
      );
      await sleep(5.0);
      numPending = this.pendingRequests.size;
    }
@@ -497,7 +661,12 @@ export class Recorder
    await this.writer.flush();
  }
  shouldSkip(
    headers: Protocol.Network.Headers,
    url?: string,
    method?: string,
    resourceType?: string,
  ) {
    if (headers && !method) {
      method = headers[":method"];
    }
@@ -520,7 +689,11 @@ export class Recorder
    }

    // skip eventsource, resourceType may not be set correctly
    if (
      headers &&
      (headers["accept"] === "text/event-stream" ||
        headers["Accept"] === "text/event-stream")
    ) {
      return true;
    }
@@ -572,7 +745,11 @@ export class Recorder
    if (newString !== string) {
      extraOpts.rewritten = 1;
      logger.debug(
        "Content Rewritten",
        { url, ...this.logDetails },
        "recorder",
      );
      reqresp.payload = encoder.encode(newString);
      return true;
    } else {
@@ -582,7 +759,9 @@ export class Recorder
    //return Buffer.from(newString).toString("base64");
  }

  _getContentType(
    headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
  ) {
    if (!headers) {
      return null;
    }
@@ -622,7 +801,7 @@ export class Recorder
  }

  noResponseForStatus(status: number | undefined | null) {
    return !status || status === 204 || (status >= 300 && status < 400);
  }

  isValidUrl(url?: string) {
@@ -648,7 +827,11 @@ export class Recorder
    } else {
      const reqresp = this.pendingRequests.get(requestId);
      if (reqresp && requestId !== reqresp.requestId) {
        logger.warn(
          "Invalid request id",
          { requestId, actualRequestId: reqresp.requestId },
          "recorder",
        );
      }
      return reqresp;
    }
@@ -669,7 +852,11 @@ export class Recorder
      return;
    }

    if (
      reqresp.url &&
      reqresp.method === "GET" &&
      !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, reqresp.url))
    ) {
      logNetwork("Skipping dupe", { url: reqresp.url });
      return;
    }
@@ -677,32 +864,52 @@ export class Recorder
    const responseRecord = createResponse(reqresp, this.pageid);
    const requestRecord = createRequest(reqresp, responseRecord, this.pageid);

    this.warcQ.add(() =>
      this.writer.writeRecordPair(responseRecord, requestRecord),
    );
  }

  async directFetchCapture(
    url: string,
  ): Promise<{ fetched: boolean; mime: string }> {
    const reqresp = new RequestResponseInfo("0");
    reqresp.url = url;
    reqresp.method = "GET";

    logger.debug(
      "Directly fetching page URL without browser",
      { url, ...this.logDetails },
      "recorder",
    );

    const filter = (resp: Response) =>
      resp.status === 200 && !resp.headers.get("set-cookie");

    // ignore dupes: if previous URL was not a page, still load as page. if previous was page,
    // should not get here, as dupe pages tracked via seen list
    const fetcher = new AsyncFetcher({
      tempdir: this.tempdir,
      reqresp,
      recorder: this,
      networkId: "0",
      filter,
      ignoreDupe: true,
    });
    const res = await fetcher.load();

    const mime =
      (reqresp &&
        reqresp.responseHeaders &&
        reqresp.responseHeaders["content-type"] &&
        reqresp.responseHeaders["content-type"].split(";")[0]) ||
      "";

    return { fetched: res === "fetched", mime };
  }
}
// =================================================================
class AsyncFetcher {
  reqresp: RequestResponseInfo;
  networkId: string;
@@ -714,9 +921,23 @@ class AsyncFetcher
  tempdir: string;
  filename: string;

  constructor({
    tempdir,
    reqresp,
    expectedSize = -1,
    recorder,
    networkId,
    filter = undefined,
    ignoreDupe = false,
  }: {
    tempdir: string;
    reqresp: RequestResponseInfo;
    expectedSize?: number;
    recorder: Recorder;
    networkId: string;
    filter?: (resp: Response) => boolean;
    ignoreDupe?: boolean;
  }) {
    this.reqresp = reqresp;
    this.reqresp.expectedSize = expectedSize;
    this.reqresp.asyncLoading = true;
@@ -728,7 +949,10 @@ class AsyncFetcher
    this.recorder = recorder;

    this.tempdir = tempdir;
    this.filename = path.join(
      this.tempdir,
      `${timestampNow()}-${uuidv4()}.data`,
    );
  }
  async load() {
@@ -740,7 +964,11 @@ class AsyncFetcher
    let fetched = "notfetched";

    try {
      if (
        reqresp.method === "GET" &&
        url &&
        !(await crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url))
      ) {
        if (!this.ignoreDupe) {
          this.reqresp.asyncLoading = false;
          return "dupe";
@@ -753,7 +981,10 @@ class AsyncFetcher
      const responseRecord = createResponse(reqresp, pageid, body);
      const requestRecord = createRequest(reqresp, responseRecord, pageid);

      const serializer = new WARCSerializer(responseRecord, {
        gzip,
        maxMemSize: MAX_BROWSER_FETCH_SIZE,
      });

      try {
        let readSize = await serializer.digestRecord();
@@ -762,19 +993,45 @@ class AsyncFetcher
        }
        reqresp.readSize = readSize;
      } catch (e) {
        logger.error(
          "Error reading + digesting payload",
          { url, filename, ...errJSON(e), ...logDetails },
          "recorder",
        );
      }

      if (
        reqresp.readSize === reqresp.expectedSize ||
        reqresp.expectedSize < 0
      ) {
        logger.debug(
          "Async fetch: streaming done",
          {
            size: reqresp.readSize,
            expected: reqresp.expectedSize,
            networkId,
            url,
            ...logDetails,
          },
          "recorder",
        );
      } else {
        logger.warn(
          "Async fetch: possible response size mismatch",
          {
            size: reqresp.readSize,
            expected: reqresp.expectedSize,
            url,
            ...logDetails,
          },
          "recorder",
        );
        //await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url);
        //return fetched;
      }

      const externalBuffer: TempFileBuffer =
        serializer.externalBuffer as TempFileBuffer;

      if (externalBuffer) {
        const { currSize, buffers, fh } = externalBuffer;
@@ -786,13 +1043,25 @@ class AsyncFetcher
      }

      if (Object.keys(reqresp.extraOpts).length) {
        responseRecord.warcHeaders.headers.set(
          "WARC-JSON-Metadata",
          JSON.stringify(reqresp.extraOpts),
        );
      }

      recorder.warcQ.add(() =>
        recorder.writer.writeRecordPair(
          responseRecord,
          requestRecord,
          serializer,
        ),
      );
    } catch (e) {
      logger.error(
        "Streaming Fetch Error",
        { url, networkId, filename, ...errJSON(e), ...logDetails },
        "recorder",
      );
      await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url!);
    } finally {
      recorder.removeReqResp(networkId);
@@ -816,21 +1085,29 @@ class AsyncFetcher
      signal = abort.signal;
    }

    const resp = await fetch(url!, {
      method,
      headers,
      body: reqresp.postData || undefined,
      signal,
    });

    if (this.filter && !this.filter(resp) && abort) {
      abort.abort();
      throw new Error("invalid response, ignoring fetch");
    }

    if (
      reqresp.expectedSize < 0 &&
      resp.headers.get("content-length") &&
      !resp.headers.get("content-encoding")
    ) {
      reqresp.expectedSize = Number(resp.headers.get("content-length") || -1);
    }

    if (reqresp.expectedSize === 0) {
      reqresp.payload = new Uint8Array();
      return;
    } else if (!resp.body) {
      logger.error("Empty body, stopping fetch", { url }, "recorder");
      await this.recorder.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url!);
@@ -853,7 +1130,11 @@ class AsyncFetcher
        yield value;
      }
    } catch (e) {
      logger.warn(
        "takeReader interrupted",
        { ...errJSON(e), url: this.reqresp.url, ...this.recorder.logDetails },
        "recorder",
      );
      this.reqresp.truncated = "disconnect";
    }
  }
@ -861,7 +1142,9 @@ class AsyncFetcher
async *takeStreamIter(cdp: CDPSession, stream: Protocol.IO.StreamHandle) { async *takeStreamIter(cdp: CDPSession, stream: Protocol.IO.StreamHandle) {
try { try {
while (true) { while (true) {
const {data, base64Encoded, eof} = await cdp.send("IO.read", {handle: stream}); const { data, base64Encoded, eof } = await cdp.send("IO.read", {
handle: stream,
});
const buff = Buffer.from(data, base64Encoded ? "base64" : "utf-8"); const buff = Buffer.from(data, base64Encoded ? "base64" : "utf-8");
yield buff; yield buff;
@ -871,15 +1154,18 @@ class AsyncFetcher
} }
} }
} catch (e) { } catch (e) {
logger.warn("takeStream interrupted", {...errJSON(e), url: this.reqresp.url, ...this.recorder.logDetails}, "recorder"); logger.warn(
"takeStream interrupted",
{ ...errJSON(e), url: this.reqresp.url, ...this.recorder.logDetails },
"recorder",
);
this.reqresp.truncated = "disconnect"; this.reqresp.truncated = "disconnect";
} }
} }
} }
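For reference, the CDP stream-reading loop that takeStreamIter wraps can be sketched standalone. A minimal sketch, assuming puppeteer-core's CDPSession and the DevTools IO domain; the function name readAllFromStream is illustrative:

import { CDPSession, Protocol } from "puppeteer-core";

async function readAllFromStream(
  cdp: CDPSession,
  stream: Protocol.IO.StreamHandle,
): Promise<Buffer> {
  const chunks: Buffer[] = [];
  while (true) {
    // IO.read returns base64 for binary streams and utf-8 text otherwise.
    const { data, base64Encoded, eof } = await cdp.send("IO.read", {
      handle: stream,
    });
    chunks.push(Buffer.from(data, base64Encoded ? "base64" : "utf-8"));
    if (eof) {
      break;
    }
  }
  await cdp.send("IO.close", { handle: stream });
  return Buffer.concat(chunks);
}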
// ================================================================= // =================================================================
class ResponseStreamAsyncFetcher extends AsyncFetcher class ResponseStreamAsyncFetcher extends AsyncFetcher {
{
cdp: CDPSession; cdp: CDPSession;
requestId: string; requestId: string;
@ -896,15 +1182,16 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher
const { url } = reqresp; const { url } = reqresp;
logger.debug("Async started: takeStream", { url }, "recorder"); logger.debug("Async started: takeStream", { url }, "recorder");
const { stream } = await cdp.send("Fetch.takeResponseBodyAsStream", {requestId}); const { stream } = await cdp.send("Fetch.takeResponseBodyAsStream", {
requestId,
});
return this.takeStreamIter(cdp, stream); return this.takeStreamIter(cdp, stream);
} }
} }
// ================================================================= // =================================================================
class NetworkLoadStreamAsyncFetcher extends AsyncFetcher class NetworkLoadStreamAsyncFetcher extends AsyncFetcher {
{
cdp: CDPSession; cdp: CDPSession;
// TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
@ -924,21 +1211,45 @@ class NetworkLoadStreamAsyncFetcher extends AsyncFetcher
let result = null; let result = null;
try { try {
result = await cdp.send("Network.loadNetworkResource", {frameId: reqresp.frameId, url, options}); result = await cdp.send("Network.loadNetworkResource", {
frameId: reqresp.frameId,
url,
options,
});
} catch (e) { } catch (e) {
logger.debug("Network.loadNetworkResource failed, attempting node fetch", {url, ...errJSON(e), ...this.recorder.logDetails}, "recorder"); logger.debug(
"Network.loadNetworkResource failed, attempting node fetch",
{ url, ...errJSON(e), ...this.recorder.logDetails },
"recorder",
);
return await super._doFetch(); return await super._doFetch();
} }
const { stream, headers, httpStatusCode, success, netError, netErrorName } = result.resource; const { stream, headers, httpStatusCode, success, netError, netErrorName } =
result.resource;
if (!success || !stream) { if (!success || !stream) {
//await this.recorder.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url); //await this.recorder.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url);
logger.debug("Network.loadNetworkResource failed, attempting node fetch", {url, netErrorName, netError, httpStatusCode, ...this.recorder.logDetails}, "recorder"); logger.debug(
"Network.loadNetworkResource failed, attempting node fetch",
{
url,
netErrorName,
netError,
httpStatusCode,
...this.recorder.logDetails,
},
"recorder",
);
return await super._doFetch(); return await super._doFetch();
} }
if (reqresp.expectedSize < 0 && headers && headers["content-length"] && !headers["content-encoding"]) { if (
reqresp.expectedSize < 0 &&
headers &&
headers["content-length"] &&
!headers["content-encoding"]
) {
reqresp.expectedSize = Number(headers["content-length"] || -1); reqresp.expectedSize = Number(headers["content-length"] || -1);
} }
@ -956,13 +1267,19 @@ class NetworkLoadStreamAsyncFetcher extends AsyncFetcher
// ================================================================= // =================================================================
// response // response
function createResponse(reqresp: RequestResponseInfo, pageid: string, contentIter?: AsyncIterable<Uint8Array> | Iterable<Uint8Array>) { function createResponse(
reqresp: RequestResponseInfo,
pageid: string,
contentIter?: AsyncIterable<Uint8Array> | Iterable<Uint8Array>,
) {
const url = reqresp.url; const url = reqresp.url;
const warcVersion = "WARC/1.1"; const warcVersion = "WARC/1.1";
const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`; const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`;
const date = new Date().toISOString(); const date = new Date().toISOString();
const httpHeaders = reqresp.getResponseHeadersDict(reqresp.payload ? reqresp.payload.length : 0); const httpHeaders = reqresp.getResponseHeadersDict(
reqresp.payload ? reqresp.payload.length : 0,
);
const warcHeaders: Record<string, string> = { const warcHeaders: Record<string, string> = {
"WARC-Page-ID": pageid, "WARC-Page-ID": pageid,
@ -980,14 +1297,27 @@ function createResponse(reqresp: RequestResponseInfo, pageid: string, contentIte
warcHeaders["WARC-JSON-Metadata"] = JSON.stringify(reqresp.extraOpts); warcHeaders["WARC-JSON-Metadata"] = JSON.stringify(reqresp.extraOpts);
} }
return WARCRecord.create({ return WARCRecord.create(
url, date, warcVersion, type: "response", warcHeaders, {
httpHeaders, statusline}, contentIter); url,
date,
warcVersion,
type: "response",
warcHeaders,
httpHeaders,
statusline,
},
contentIter,
);
} }
// ================================================================= // =================================================================
// request // request
function createRequest(reqresp: RequestResponseInfo, responseRecord: WARCRecord, pageid: string) { function createRequest(
reqresp: RequestResponseInfo,
responseRecord: WARCRecord,
pageid: string,
) {
const url = reqresp.url; const url = reqresp.url;
const warcVersion = "WARC/1.1"; const warcVersion = "WARC/1.1";
const method = reqresp.method; const method = reqresp.method;
@ -996,7 +1326,9 @@ function createRequest(reqresp: RequestResponseInfo, responseRecord: WARCRecord,
const statusline = `${method} ${url.slice(urlParsed.origin.length)} HTTP/1.1`; const statusline = `${method} ${url.slice(urlParsed.origin.length)} HTTP/1.1`;
const requestBody = reqresp.postData ? [encoder.encode(reqresp.postData)] : []; const requestBody = reqresp.postData
? [encoder.encode(reqresp.postData)]
: [];
const httpHeaders = reqresp.getRequestHeadersDict(); const httpHeaders = reqresp.getRequestHeadersDict();
@ -1007,7 +1339,16 @@ function createRequest(reqresp: RequestResponseInfo, responseRecord: WARCRecord,
const date = responseRecord.warcDate || undefined; const date = responseRecord.warcDate || undefined;
return WARCRecord.create({ return WARCRecord.create(
url, date, warcVersion, type: "request", warcHeaders, {
httpHeaders, statusline}, requestBody); url,
date,
warcVersion,
type: "request",
warcHeaders,
httpHeaders,
statusline,
},
requestBody,
);
} }
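createResponse and createRequest above both funnel into warcio's WARCRecord.create. A minimal standalone sketch of that API with an illustrative payload; the URL and header values here are placeholders:

import { WARCRecord } from "warcio";
import { WARCSerializer } from "warcio/node";

async function buildExampleResponseRecord(): Promise<Uint8Array> {
  const payload = new TextEncoder().encode("hello world");
  const record = WARCRecord.create(
    {
      url: "https://example.com/",
      date: new Date().toISOString(),
      warcVersion: "WARC/1.1",
      type: "response",
      warcHeaders: { "WARC-Page-ID": "example-page-id" },
      httpHeaders: { "content-type": "text/plain" },
      statusline: "HTTP/1.1 200 OK",
    },
    (async function* () {
      yield payload;
    })(),
  );
  // gzip: true matches the writer options used elsewhere in this commit.
  return await WARCSerializer.serialize(record, { gzip: true });
}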

@ -14,10 +14,9 @@ console.error = function (...args) {
typeof args[0] === "string" && typeof args[0] === "string" &&
args[0].indexOf("[ioredis] Unhandled error event") === 0 args[0].indexOf("[ioredis] Unhandled error event") === 0
) { ) {
const now = Date.now(); const now = Date.now();
if ((now - lastLogTime) > REDIS_ERROR_LOG_INTERVAL_SECS) { if (now - lastLogTime > REDIS_ERROR_LOG_INTERVAL_SECS) {
if (lastLogTime && exitOnError) { if (lastLogTime && exitOnError) {
logger.fatal("Crawl interrupted, redis gone, exiting", {}, "redis"); logger.fatal("Crawl interrupted, redis gone, exiting", {}, "redis");
} }
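The hunk above rate-limits repeated ioredis errors; the underlying throttle pattern, sketched in isolation (the interval value and names are illustrative):

let lastLogTime = 0;
const LOG_INTERVAL_MS = 10_000;

function logErrorThrottled(msg: string) {
  const now = Date.now();
  if (now - lastLogTime > LOG_INTERVAL_MS) {
    console.error(msg);
    lastLogTime = now;
  }
}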

@ -7,10 +7,8 @@ const CONTENT_LENGTH = "content-length";
const CONTENT_TYPE = "content-type"; const CONTENT_TYPE = "content-type";
const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"]; const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"];
// =========================================================================== // ===========================================================================
export class RequestResponseInfo export class RequestResponseInfo {
{
_created: Date = new Date(); _created: Date = new Date();
requestId: string; requestId: string;
@ -33,7 +31,7 @@ export class RequestResponseInfo
statusText?: string; statusText?: string;
responseHeaders?: Record<string, string>; responseHeaders?: Record<string, string>;
responseHeadersList?: {name: string, value: string}[]; responseHeadersList?: { name: string; value: string }[];
responseHeadersText?: string; responseHeadersText?: string;
payload?: Uint8Array; payload?: Uint8Array;
@ -79,7 +77,6 @@ export class RequestResponseInfo
if (params.type) { if (params.type) {
this.resourceType = params.type; this.resourceType = params.type;
} }
} }
// TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
@ -100,7 +97,12 @@ export class RequestResponseInfo
fillResponse(response: Protocol.Network.Response) { fillResponse(response: Protocol.Network.Response) {
// if initial fetch was a 200, but now replacing with 304, don't! // if initial fetch was a 200, but now replacing with 304, don't!
if (response.status == 304 && this.status && this.status != 304 && this.url) { if (
response.status == 304 &&
this.status &&
this.status != 304 &&
this.url
) {
return; return;
} }
@ -128,7 +130,11 @@ export class RequestResponseInfo
if (response.securityDetails) { if (response.securityDetails) {
const issuer: string = response.securityDetails.issuer || ""; const issuer: string = response.securityDetails.issuer || "";
const ctc : string = response.securityDetails.certificateTransparencyCompliance === "compliant" ? "1" : "0"; const ctc: string =
response.securityDetails.certificateTransparencyCompliance ===
"compliant"
? "1"
: "0";
this.extraOpts.cert = { issuer, ctc }; this.extraOpts.cert = { issuer, ctc };
} }
} }
@ -161,7 +167,6 @@ export class RequestResponseInfo
this.responseHeaders = Object.fromEntries(response.headers); this.responseHeaders = Object.fromEntries(response.headers);
this.status = response.status; this.status = response.status;
this.statusText = response.statusText || getStatusText(this.status); this.statusText = response.statusText || getStatusText(this.status);
} }
// TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
@ -175,7 +180,10 @@ export class RequestResponseInfo
if (this.responseHeaders) { if (this.responseHeaders) {
for (const header of Object.keys(this.responseHeaders)) { for (const header of Object.keys(this.responseHeaders)) {
headers += `${header}: ${this.responseHeaders[header].replace(/\n/g, ", ")}\r\n`; headers += `${header}: ${this.responseHeaders[header].replace(
/\n/g,
", ",
)}\r\n`;
} }
} }
headers += "\r\n"; headers += "\r\n";
@ -191,10 +199,18 @@ export class RequestResponseInfo
} }
getResponseHeadersDict(length = 0) { getResponseHeadersDict(length = 0) {
return this._getHeadersDict(this.responseHeaders, this.responseHeadersList, length); return this._getHeadersDict(
this.responseHeaders,
this.responseHeadersList,
length,
);
} }
_getHeadersDict(headersDict?: Record<string, string>, headersList?: {name: string, value: string}[], actualContentLength = 0) { _getHeadersDict(
headersDict?: Record<string, string>,
headersList?: { name: string; value: string }[],
actualContentLength = 0,
) {
if (!headersDict && headersList) { if (!headersDict && headersList) {
headersDict = {}; headersDict = {};
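_getHeadersDict accepts headers either as a record or as a { name, value } list; converting the list form is the interesting case. A minimal sketch of that conversion, standalone, with illustrative names:

function headersListToDict(
  headersList: { name: string; value: string }[],
): Record<string, string> {
  const headersDict: Record<string, string> = {};
  for (const { name, value } of headersList) {
    // Later duplicates overwrite earlier ones, matching plain-object semantics.
    headersDict[name] = value;
  }
  return headersDict;
}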

@ -9,12 +9,13 @@ import { Duplex } from "stream";
import { CDPSession, Page } from "puppeteer-core"; import { CDPSession, Page } from "puppeteer-core";
import { WorkerId } from "./state.js"; import { WorkerId } from "./state.js";
const indexHTML = fs.readFileSync(new URL("../../html/screencast.html", import.meta.url), {encoding: "utf8"}); const indexHTML = fs.readFileSync(
new URL("../../html/screencast.html", import.meta.url),
{ encoding: "utf8" },
);
// =========================================================================== // ===========================================================================
class WSTransport class WSTransport {
{
allWS = new Set<WebSocket>(); allWS = new Set<WebSocket>();
// eslint-disable-next-line no-use-before-define // eslint-disable-next-line no-use-before-define
caster!: ScreenCaster; caster!: ScreenCaster;
@ -23,7 +24,6 @@ class WSTransport
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
httpServer: any; httpServer: any;
constructor(port: number) { constructor(port: number) {
this.allWS = new Set(); this.allWS = new Set();
@ -31,8 +31,12 @@ class WSTransport
this.wss.on("connection", (ws: WebSocket) => this.initWebSocket(ws)); this.wss.on("connection", (ws: WebSocket) => this.initWebSocket(ws));
this.httpServer = http.createServer((...args) => this.handleRequest(...args)); this.httpServer = http.createServer((...args) =>
this.httpServer.on("upgrade", (request: IncomingMessage, socket: Duplex, head: Buffer) => { this.handleRequest(...args),
);
this.httpServer.on(
"upgrade",
(request: IncomingMessage, socket: Duplex, head: Buffer) => {
const pathname = url.parse(request.url || "").pathname; const pathname = url.parse(request.url || "").pathname;
if (pathname === "/ws") { if (pathname === "/ws") {
@ -40,7 +44,8 @@ class WSTransport
this.wss.emit("connection", ws, request); this.wss.emit("connection", ws, request);
}); });
} }
}); },
);
this.httpServer.listen(port); this.httpServer.listen(port);
} }
@ -65,7 +70,11 @@ class WSTransport
this.allWS.add(ws); this.allWS.add(ws);
logger.debug("New Screencast Conn", {total: this.allWS.size}, "screencast"); logger.debug(
"New Screencast Conn",
{ total: this.allWS.size },
"screencast",
);
if (this.allWS.size === 1) { if (this.allWS.size === 1) {
this.caster.startCastAll(); this.caster.startCastAll();
@ -95,10 +104,8 @@ class WSTransport
} }
} }
// =========================================================================== // ===========================================================================
class RedisPubSubTransport class RedisPubSubTransport {
{
numConnections: number = 0; numConnections: number = 0;
castChannel: string; castChannel: string;
// eslint-disable-next-line no-use-before-define // eslint-disable-next-line no-use-before-define
@ -157,14 +164,12 @@ class RedisPubSubTransport
async isActive() { async isActive() {
const result = await this.redis.pubsub("numsub", this.castChannel); const result = await this.redis.pubsub("numsub", this.castChannel);
return (result.length > 1 ? result[1] > 0: false); return result.length > 1 ? result[1] > 0 : false;
} }
} }
// =========================================================================== // ===========================================================================
class ScreenCaster class ScreenCaster {
{
transport: WSTransport; transport: WSTransport;
caches = new Map<WorkerId, string>(); caches = new Map<WorkerId, string>();
urls = new Map<WorkerId, string>(); urls = new Map<WorkerId, string>();
@ -183,7 +188,7 @@ class ScreenCaster
msg: "init", msg: "init",
width: this.maxWidth, width: this.maxWidth,
height: this.maxHeight, height: this.maxHeight,
browsers: numWorkers browsers: numWorkers,
}; };
} }
@ -277,7 +282,12 @@ class ScreenCaster
logger.info("Started Screencast", { workerid: id }, "screencast"); logger.info("Started Screencast", { workerid: id }, "screencast");
await cdp.send("Page.startScreencast", {format: "png", everyNthFrame: 1, maxWidth: this.maxWidth, maxHeight: this.maxHeight}); await cdp.send("Page.startScreencast", {
format: "png",
everyNthFrame: 1,
maxWidth: this.maxWidth,
maxHeight: this.maxHeight,
});
} }
async stopCast(cdp: CDPSession, id: WorkerId) { async stopCast(cdp: CDPSession, id: WorkerId) {
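startCast above drives the DevTools screencast API. A minimal end-to-end sketch of that flow, assuming puppeteer-core's CDPSession; the frame handler here just counts frames and acks them, and the dimensions are illustrative:

import { CDPSession, Protocol } from "puppeteer-core";

async function castFrames(cdp: CDPSession): Promise<void> {
  let frames = 0;
  cdp.on(
    "Page.screencastFrame",
    async (resp: Protocol.Page.ScreencastFrameEvent) => {
      frames++;
      // Each frame must be acked or the browser stops sending more.
      await cdp.send("Page.screencastFrameAck", { sessionId: resp.sessionId });
    },
  );
  await cdp.send("Page.startScreencast", {
    format: "png",
    everyNthFrame: 1,
    maxWidth: 1600,
    maxHeight: 900,
  });
}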

@ -4,31 +4,30 @@ import { WARCResourceWriter } from "./warcresourcewriter.js";
import { logger, errJSON } from "./logger.js"; import { logger, errJSON } from "./logger.js";
import { Browser } from "./browser.js"; import { Browser } from "./browser.js";
// ============================================================================ // ============================================================================
type ScreenShotType = { type ScreenShotType = {
type: string; type: string;
omitBackground: boolean; omitBackground: boolean;
fullPage: boolean; fullPage: boolean;
} };
export const screenshotTypes: Record<string, ScreenShotType> = { export const screenshotTypes: Record<string, ScreenShotType> = {
"view": { view: {
type: "png", type: "png",
omitBackground: true, omitBackground: true,
fullPage: false fullPage: false,
}, },
"thumbnail": { thumbnail: {
type: "jpeg", type: "jpeg",
omitBackground: true, omitBackground: true,
fullPage: false fullPage: false,
}, },
"fullPage": { fullPage: {
type: "png", type: "png",
omitBackground: true, omitBackground: true,
fullPage: true fullPage: true,
} },
}; };
export class Screenshots extends WARCResourceWriter { export class Screenshots extends WARCResourceWriter {
@ -48,14 +47,27 @@ export class Screenshots extends WARCResourceWriter {
async take(screenshotType = "view") { async take(screenshotType = "view") {
try { try {
if (screenshotType !== "fullPage") { if (screenshotType !== "fullPage") {
await this.browser.setViewport(this.page, {width: 1920, height: 1080}); await this.browser.setViewport(this.page, {
width: 1920,
height: 1080,
});
} }
const options = screenshotTypes[screenshotType]; const options = screenshotTypes[screenshotType];
const screenshotBuffer = await this.page.screenshot(options); const screenshotBuffer = await this.page.screenshot(options);
await this.writeBufferToWARC(screenshotBuffer, screenshotType, "image/" + options.type); await this.writeBufferToWARC(
logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`); screenshotBuffer,
screenshotType,
"image/" + options.type,
);
logger.info(
`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`,
);
} catch (e) { } catch (e) {
logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots"); logger.error(
"Taking screenshot failed",
{ page: this.url, type: screenshotType, ...errJSON(e) },
"screenshots",
);
} }
} }
@ -73,10 +85,20 @@ export class Screenshots extends WARCResourceWriter {
// 16:9 thumbnail // 16:9 thumbnail
.resize(640, 360) .resize(640, 360)
.toBuffer(); .toBuffer();
await this.writeBufferToWARC(thumbnailBuffer, screenshotType, "image/" + options.type); await this.writeBufferToWARC(
logger.info(`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`); thumbnailBuffer,
screenshotType,
"image/" + options.type,
);
logger.info(
`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`,
);
} catch (e) { } catch (e) {
logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots"); logger.error(
"Taking screenshot failed",
{ page: this.url, type: screenshotType, ...errJSON(e) },
"screenshots",
);
} }
} }
} }
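The thumbnail path above screenshots at full size and then downscales. A minimal sketch of that pipeline, assuming puppeteer-core's Page and the sharp image library (consistent with the resize call in this hunk):

import sharp from "sharp";
import { Page } from "puppeteer-core";

async function takeThumbnail(page: Page): Promise<Buffer> {
  const full = await page.screenshot({
    type: "jpeg",
    omitBackground: true,
    fullPage: false,
  });
  // Downscale to a 16:9 thumbnail, matching the 640x360 target above.
  return await sharp(full).resize(640, 360).toBuffer();
}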

@ -10,8 +10,7 @@ type ScopeType =
| "any" | "any"
| "custom"; | "custom";
export class ScopedSeed export class ScopedSeed {
{
url: string; url: string;
scopeType: ScopeType; scopeType: ScopeType;
include: RegExp[]; include: RegExp[];
@ -24,11 +23,25 @@ export class ScopedSeed
maxExtraHops = 0; maxExtraHops = 0;
maxDepth = 0; maxDepth = 0;
constructor({
constructor( url,
{url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} : scopeType,
{url: string, scopeType: ScopeType, include: string[], exclude?: string[], allowHash?: boolean, depth?: number, sitemap?: string | boolean | null, extraHops?: number} include,
) { exclude = [],
allowHash = false,
depth = -1,
sitemap = false,
extraHops = 0,
}: {
url: string;
scopeType: ScopeType;
include: string[];
exclude?: string[];
allowHash?: boolean;
depth?: number;
sitemap?: string | boolean | null;
extraHops?: number;
}) {
const parsedUrl = this.parseUrl(url); const parsedUrl = this.parseUrl(url);
if (!parsedUrl) { if (!parsedUrl) {
throw new Error("Invalid URL"); throw new Error("Invalid URL");
@ -43,7 +56,10 @@ export class ScopedSeed
} }
if (this.scopeType !== "custom") { if (this.scopeType !== "custom") {
const [includeNew, allowHashNew] = this.scopeFromType(this.scopeType, parsedUrl); const [includeNew, allowHashNew] = this.scopeFromType(
this.scopeType,
parsedUrl,
);
this.include = [...includeNew, ...this.include]; this.include = [...includeNew, ...this.include];
allowHash = allowHashNew; allowHash = allowHashNew;
} }
@ -69,7 +85,7 @@ export class ScopedSeed
} else if (!(value instanceof Array)) { } else if (!(value instanceof Array)) {
return [new RegExp(value)]; return [new RegExp(value)];
} else { } else {
return value.map(e => (e instanceof RegExp) ? e : new RegExp(e)); return value.map((e) => (e instanceof RegExp ? e : new RegExp(e)));
} }
} }
@ -102,7 +118,10 @@ export class ScopedSeed
} }
if (parsedUrl.protocol !== "http:" && parsedUrl.protocol != "https:") { if (parsedUrl.protocol !== "http:" && parsedUrl.protocol != "https:") {
logger.warn("Invalid Page - URL must start with http:// or https://", {url, ...logDetails}); logger.warn("Invalid Page - URL must start with http:// or https://", {
url,
...logDetails,
});
parsedUrl = null; parsedUrl = null;
} }
@ -114,7 +133,7 @@ export class ScopedSeed
const url = new URL(this.url); const url = new URL(this.url);
url.pathname = "/sitemap.xml"; url.pathname = "/sitemap.xml";
return url.href; return url.href;
} else if (typeof(sitemap) === "string") { } else if (typeof sitemap === "string") {
const url = new URL(sitemap, this.url); const url = new URL(sitemap, this.url);
return url.href; return url.href;
} }
@ -133,23 +152,47 @@ export class ScopedSeed
case "page-spa": case "page-spa":
// allow scheme-agnostic URLS as likely redirects // allow scheme-agnostic URLS as likely redirects
include = [new RegExp("^" + urlRxEscape(parsedUrl.href, parsedUrl) + "#.+")]; include = [
new RegExp("^" + urlRxEscape(parsedUrl.href, parsedUrl) + "#.+"),
];
allowHash = true; allowHash = true;
break; break;
case "prefix": case "prefix":
include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + parsedUrl.pathname.slice(0, parsedUrl.pathname.lastIndexOf("/") + 1), parsedUrl))]; include = [
new RegExp(
"^" +
urlRxEscape(
parsedUrl.origin +
parsedUrl.pathname.slice(
0,
parsedUrl.pathname.lastIndexOf("/") + 1,
),
parsedUrl,
),
),
];
break; break;
case "host": case "host":
include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl))]; include = [
new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl)),
];
break; break;
case "domain": case "domain":
if (parsedUrl.hostname.startsWith("www.")) { if (parsedUrl.hostname.startsWith("www.")) {
parsedUrl.hostname = parsedUrl.hostname.replace("www.", ""); parsedUrl.hostname = parsedUrl.hostname.replace("www.", "");
} }
include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl).replace("\\/\\/", "\\/\\/([^/]+\\.)*"))]; include = [
new RegExp(
"^" +
urlRxEscape(parsedUrl.origin + "/", parsedUrl).replace(
"\\/\\/",
"\\/\\/([^/]+\\.)*",
),
),
];
break; break;
case "any": case "any":
@ -157,7 +200,9 @@ export class ScopedSeed
break; break;
default: default:
logger.fatal(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`); logger.fatal(
`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`,
);
} }
return [include, allowHash]; return [include, allowHash];
@ -232,7 +277,3 @@ export function rxEscape(string: string) {
export function urlRxEscape(url: string, parsedUrl: URL) { export function urlRxEscape(url: string, parsedUrl: URL) {
return rxEscape(url).replace(parsedUrl.protocol, "https?:"); return rxEscape(url).replace(parsedUrl.protocol, "https?:");
} }
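The scopeFromType cases above all reduce to building an anchored regex from the parsed seed URL. A small worked example for the "prefix" scope; the rxEscape body here is an illustrative stand-in, since this hunk elides it (it escapes regex metacharacters including slashes, which the domain-scope replace above relies on):

function rxEscapeExample(s: string): string {
  return s.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
}

const seed = new URL("https://example.com/docs/index.html");
const prefix =
  seed.origin + seed.pathname.slice(0, seed.pathname.lastIndexOf("/") + 1);
// Scheme-agnostic, per urlRxEscape: "https:" becomes "https?:"
const include = new RegExp(
  "^" + rxEscapeExample(prefix).replace(seed.protocol, "https?:"),
);

console.log(include.test("https://example.com/docs/page2.html")); // true
console.log(include.test("https://example.com/other/")); // false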

@ -6,7 +6,6 @@ import { MAX_DEPTH } from "./constants.js";
import { ScopedSeed } from "./seeds.js"; import { ScopedSeed } from "./seeds.js";
import { Frame } from "puppeteer-core"; import { Frame } from "puppeteer-core";
// ============================================================================ // ============================================================================
export enum LoadState { export enum LoadState {
FAILED = 0, FAILED = 0,
@ -16,7 +15,6 @@ export enum LoadState {
BEHAVIORS_DONE = 4, BEHAVIORS_DONE = 4,
} }
// ============================================================================ // ============================================================================
export enum QueueState { export enum QueueState {
ADDED = 0, ADDED = 0,
@ -24,14 +22,11 @@ export enum QueueState {
DUPE_URL = 2, DUPE_URL = 2,
} }
// ============================================================================ // ============================================================================
export type WorkerId = number; export type WorkerId = number;
// ============================================================================ // ============================================================================
export class PageState export class PageState {
{
url: string; url: string;
seedId: number; seedId: number;
depth: number; depth: number;
@ -57,7 +52,12 @@ export class PageState
logDetails = {}; logDetails = {};
constructor(redisData: {url: string, seedId: number, depth: number, extraHops: number}) { constructor(redisData: {
url: string;
seedId: number;
depth: number;
extraHops: number;
}) {
this.url = redisData.url; this.url = redisData.url;
this.seedId = redisData.seedId; this.seedId = redisData.seedId;
this.depth = redisData.depth; this.depth = redisData.depth;
@ -78,10 +78,7 @@ declare module "ioredis" {
limit: number, limit: number,
): Result<number, Context>; ): Result<number, Context>;
getnext( getnext(qkey: string, pkey: string): Result<string, Context>;
qkey: string,
pkey: string,
): Result<string, Context>;
markstarted( markstarted(
pkey: string, pkey: string,
@ -103,7 +100,7 @@ declare module "ioredis" {
unlockpending( unlockpending(
pkeyUrl: string, pkeyUrl: string,
uid: string, uid: string,
callback?: Callback<string> callback?: Callback<string>,
): Result<void, Context>; ): Result<void, Context>;
requeue( requeue(
@ -113,13 +110,11 @@ declare module "ioredis" {
url: string, url: string,
maxRetryPending: number, maxRetryPending: number,
): Result<number, Context>; ): Result<number, Context>;
} }
} }
// ============================================================================ // ============================================================================
export class RedisCrawlState export class RedisCrawlState {
{
redis: Redis; redis: Redis;
maxRetryPending = 1; maxRetryPending = 1;
_lastSize = 0; _lastSize = 0;
@ -138,8 +133,6 @@ export class RedisCrawlState
constructor(redis: Redis, key: string, maxPageTime: number, uid: string) { constructor(redis: Redis, key: string, maxPageTime: number, uid: string) {
this.redis = redis; this.redis = redis;
this.uid = uid; this.uid = uid;
this.key = key; this.key = key;
this.maxPageTime = maxPageTime; this.maxPageTime = maxPageTime;
@ -172,7 +165,7 @@ end
redis.call('zadd', KEYS[2], ARGV[2], ARGV[3]); redis.call('zadd', KEYS[2], ARGV[2], ARGV[3]);
redis.call('hdel', KEYS[1], ARGV[1]); redis.call('hdel', KEYS[1], ARGV[1]);
return 0; return 0;
` `,
}); });
redis.defineCommand("getnext", { redis.defineCommand("getnext", {
@ -187,7 +180,7 @@ if json then
end end
return json; return json;
` `,
}); });
redis.defineCommand("markstarted", { redis.defineCommand("markstarted", {
@ -203,7 +196,7 @@ if json then
redis.call('setex', KEYS[2], ARGV[3], ARGV[4]); redis.call('setex', KEYS[2], ARGV[3], ARGV[4]);
end end
` `,
}); });
redis.defineCommand("unlockpending", { redis.defineCommand("unlockpending", {
@ -215,7 +208,7 @@ if value == ARGV[1] then
redis.call('del', KEYS[1]) redis.call('del', KEYS[1])
end end
` `,
}); });
redis.defineCommand("movefailed", { redis.defineCommand("movefailed", {
@ -232,7 +225,7 @@ if json then
redis.call('hdel', KEYS[1], ARGV[1]); redis.call('hdel', KEYS[1], ARGV[1]);
end end
` `,
}); });
redis.defineCommand("requeue", { redis.defineCommand("requeue", {
@ -255,9 +248,8 @@ if not res then
end end
end end
return 0; return 0;
` `,
}); });
} }
async _getNext() { async _getNext() {
@ -271,7 +263,14 @@ return 0;
async markStarted(url: string) { async markStarted(url: string) {
const started = this._timestamp(); const started = this._timestamp();
return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.maxPageTime, this.uid); return await this.redis.markstarted(
this.pkey,
this.pkey + ":" + url,
url,
started,
this.maxPageTime,
this.uid,
);
} }
async markFinished(url: string) { async markFinished(url: string) {
@ -292,14 +291,17 @@ return 0;
await this.redis.srem(this.skey, url); await this.redis.srem(this.skey, url);
} }
recheckScope(data: {url: string, depth: number, extraHops: number, seedId: number}, seeds: ScopedSeed[]) { recheckScope(
data: { url: string; depth: number; extraHops: number; seedId: number },
seeds: ScopedSeed[],
) {
const seed = seeds[data.seedId]; const seed = seeds[data.seedId];
return seed.isIncluded(data.url, data.depth, data.extraHops); return seed.isIncluded(data.url, data.depth, data.extraHops);
} }
async isFinished() { async isFinished() {
return ((await this.queueSize()) == 0) && ((await this.numDone()) > 0); return (await this.queueSize()) == 0 && (await this.numDone()) > 0;
} }
async setStatus(status_: string) { async setStatus(status_: string) {
@ -369,9 +371,9 @@ return 0;
} }
break; break;
} }
} // TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
catch (e: any) { } catch (e: any) {
logger.warn("Error processing message", e, "redisMessage"); logger.warn("Error processing message", e, "redisMessage");
} }
} }
@ -389,7 +391,7 @@ return 0;
// regexStr just a string, optimize by using glob matching // regexStr just a string, optimize by using glob matching
if (this.isStrMatch(regexStr)) { if (this.isStrMatch(regexStr)) {
matcher = {"match": `*${regexStr}*`}; matcher = { match: `*${regexStr}*` };
} }
const stream = this.redis.zscanStream(this.qkey, matcher); const stream = this.redis.zscanStream(this.qkey, matcher);
@ -404,14 +406,18 @@ return 0;
//if (removed) { //if (removed) {
await this.markExcluded(url); await this.markExcluded(url);
//} //}
logger.debug("Removing excluded URL", {url, regex, removed}, "exclusion"); logger.debug(
"Removing excluded URL",
{ url, regex, removed },
"exclusion",
);
} }
} }
stream.resume(); stream.resume();
}); });
return new Promise<void>(resolve => { return new Promise<void>((resolve) => {
stream.on("end", () => { stream.on("end", () => {
resolve(); resolve();
}); });
@ -424,11 +430,19 @@ return 0;
// consider failed if 3 failed retries in 60 secs // consider failed if 3 failed retries in 60 secs
await this.redis.expire(key, 60); await this.redis.expire(key, 60);
return (res >= 3); return res >= 3;
} }
//async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) { //async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) {
async addToQueue({url, seedId, depth = 0, extraHops = 0} : {url: string, seedId: number, depth?: number, extraHops?: number}, limit = 0) { async addToQueue(
{
url,
seedId,
depth = 0,
extraHops = 0,
}: { url: string; seedId: number; depth?: number; extraHops?: number },
limit = 0,
) {
const added = this._timestamp(); const added = this._timestamp();
// TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
@ -441,7 +455,15 @@ return 0;
// 0 - url queued successfully // 0 - url queued successfully
// 1 - url queue size limit reached // 1 - url queue size limit reached
// 2 - url is a dupe // 2 - url is a dupe
return await this.redis.addqueue(this.pkey, this.qkey, this.skey, url, this._getScore(data), JSON.stringify(data), limit); return await this.redis.addqueue(
this.pkey,
this.qkey,
this.skey,
url,
this._getScore(data),
JSON.stringify(data),
limit,
);
} }
async nextFromQueue() { async nextFromQueue() {
@ -479,7 +501,7 @@ return 0;
return { done, queued, pending, failed, errors }; return { done, queued, pending, failed, errors };
} }
_getScore(data: {depth: number, extraHops: number}) { _getScore(data: { depth: number; extraHops: number }) {
return (data.depth || 0) + (data.extraHops || 0) * MAX_DEPTH; return (data.depth || 0) + (data.extraHops || 0) * MAX_DEPTH;
} }
@ -489,7 +511,14 @@ return 0;
const len = await this.redis.zcard(key); const len = await this.redis.zcard(key);
for (let i = 0; i < len; i += inc) { for (let i = 0; i < len; i += inc) {
const someResults = await this.redis.zrangebyscore(key, 0, "inf", "LIMIT", i, inc); const someResults = await this.redis.zrangebyscore(
key,
0,
"inf",
"LIMIT",
i,
inc,
);
results.push(...someResults); results.push(...someResults);
} }
@ -508,9 +537,13 @@ return 0;
return results; return results;
} }
async load(
// TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
async load(state: Record<string, any>, seeds: ScopedSeed[], checkScope: boolean) { state: Record<string, any>,
seeds: ScopedSeed[],
checkScope: boolean,
) {
const seen: string[] = []; const seen: string[] = [];
// need to delete existing keys, if exist to fully reset state // need to delete existing keys, if exist to fully reset state
@ -545,7 +578,7 @@ return 0;
seen.push(data.url); seen.push(data.url);
} }
if (typeof(state.done) === "number") { if (typeof state.done === "number") {
// done key is just an int counter // done key is just an int counter
await this.redis.set(this.dkey, state.done); await this.redis.set(this.dkey, state.done);
} else if (state.done instanceof Array) { } else if (state.done instanceof Array) {
@ -601,7 +634,7 @@ return 0;
async getPendingList() { async getPendingList() {
const list = await this.redis.hvals(this.pkey); const list = await this.redis.hvals(this.pkey);
return list.map(x => JSON.parse(x)); return list.map((x) => JSON.parse(x));
} }
async getErrorList() { async getErrorList() {
@ -615,9 +648,9 @@ return 0;
for (const url of pendingUrls) { for (const url of pendingUrls) {
await this.redis.unlockpending(this.pkey + ":" + url, this.uid); await this.redis.unlockpending(this.pkey + ":" + url, this.uid);
} }
} // TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
catch (e: any) { } catch (e: any) {
logger.error("Redis Del Pending Failed", e, "state"); logger.error("Redis Del Pending Failed", e, "state");
} }
} }
@ -626,7 +659,13 @@ return 0;
const pendingUrls = await this.redis.hkeys(this.pkey); const pendingUrls = await this.redis.hkeys(this.pkey);
for (const url of pendingUrls) { for (const url of pendingUrls) {
const res = await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, url, this.maxRetryPending); const res = await this.redis.requeue(
this.pkey,
this.qkey,
this.pkey + ":" + url,
url,
this.maxRetryPending,
);
switch (res) { switch (res) {
case 1: case 1:
logger.info(`Requeued: ${url}`); logger.info(`Requeued: ${url}`);
@ -656,4 +695,3 @@ return 0;
return await this.redis.lpush(this.ekey, error); return await this.redis.lpush(this.ekey, error);
} }
} }
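The Lua scripts above are registered through ioredis's defineCommand, which is what the declare module "ioredis" block earlier in this diff types. A minimal sketch of defining and calling one such command; the script, key names, and command name are illustrative:

import { Redis } from "ioredis";

async function requeueOne(redis: Redis, url: string): Promise<number> {
  redis.defineCommand("requeueone", {
    numberOfKeys: 2,
    lua: `
local json = redis.call('hget', KEYS[1], ARGV[1]);
if json then
  redis.call('zadd', KEYS[2], 0, json);
  redis.call('hdel', KEYS[1], ARGV[1]);
  return 1;
end
return 0;
`,
  });
  // Without a module augmentation like the one above, the generated
  // method is not on the Redis type, so cast for the call.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  return await (redis as any).requeueone("pending", "queue", url);
}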

@ -16,10 +16,8 @@ import { logger } from "./logger.js";
// @ts-expect-error TODO fill in why error is expected // @ts-expect-error TODO fill in why error is expected
import getFolderSize from "get-folder-size"; import getFolderSize from "get-folder-size";
// =========================================================================== // ===========================================================================
export class S3StorageSync export class S3StorageSync {
{
fullPrefix: string; fullPrefix: string;
client: Minio.Client; client: Minio.Client;
@ -36,21 +34,23 @@ export class S3StorageSync
constructor( constructor(
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
urlOrData: string | any, urlOrData: string | any,
{webhookUrl, userId, crawlId} : {
{webhookUrl?: string, userId: string, crawlId: string} webhookUrl,
userId,
crawlId,
}: { webhookUrl?: string; userId: string; crawlId: string },
) { ) {
let url; let url;
let accessKey; let accessKey;
let secretKey; let secretKey;
if (typeof(urlOrData) === "string") { if (typeof urlOrData === "string") {
url = new URL(urlOrData); url = new URL(urlOrData);
accessKey = url.username; accessKey = url.username;
secretKey = url.password; secretKey = url.password;
url.username = ""; url.username = "";
url.password = ""; url.password = "";
this.fullPrefix = url.href; this.fullPrefix = url.href;
} else { } else {
url = new URL(urlOrData.endpointUrl); url = new URL(urlOrData.endpointUrl);
accessKey = urlOrData.accessKey; accessKey = urlOrData.accessKey;
@ -64,7 +64,7 @@ export class S3StorageSync
useSSL: url.protocol === "https:", useSSL: url.protocol === "https:",
accessKey, accessKey,
secretKey, secretKey,
partSize: 100*1024*1024 partSize: 100 * 1024 * 1024,
}); });
this.bucketName = url.pathname.slice(1).split("/")[0]; this.bucketName = url.pathname.slice(1).split("/")[0];
@ -80,14 +80,18 @@ export class S3StorageSync
async uploadFile(srcFilename: string, targetFilename: string) { async uploadFile(srcFilename: string, targetFilename: string) {
const fileUploadInfo = { const fileUploadInfo = {
"bucket": this.bucketName, bucket: this.bucketName,
"crawlId": this.crawlId, crawlId: this.crawlId,
"prefix": this.objectPrefix, prefix: this.objectPrefix,
targetFilename targetFilename,
}; };
logger.info("S3 file upload information", fileUploadInfo, "s3Upload"); logger.info("S3 file upload information", fileUploadInfo, "s3Upload");
await this.client.fPutObject(this.bucketName, this.objectPrefix + targetFilename, srcFilename); await this.client.fPutObject(
this.bucketName,
this.objectPrefix + targetFilename,
srcFilename,
);
const { hash, crc32 } = await checksumFile("sha256", srcFilename); const { hash, crc32 } = await checksumFile("sha256", srcFilename);
const path = targetFilename; const path = targetFilename;
@ -99,12 +103,24 @@ export class S3StorageSync
} }
async downloadFile(srcFilename: string, destFilename: string) { async downloadFile(srcFilename: string, destFilename: string) {
await this.client.fGetObject(this.bucketName, this.objectPrefix + srcFilename, destFilename); await this.client.fGetObject(
this.bucketName,
this.objectPrefix + srcFilename,
destFilename,
);
} }
async uploadCollWACZ(srcFilename: string, targetFilename: string, completed = true) { async uploadCollWACZ(
srcFilename: string,
targetFilename: string,
completed = true,
) {
const resource = await this.uploadFile(srcFilename, targetFilename); const resource = await this.uploadFile(srcFilename, targetFilename);
logger.info("WACZ S3 file upload resource", {targetFilename, resource}, "s3Upload"); logger.info(
"WACZ S3 file upload resource",
{ targetFilename, resource },
"s3Upload",
);
if (this.webhookUrl) { if (this.webhookUrl) {
const body = { const body = {
@ -115,17 +131,25 @@ export class S3StorageSync
filename: this.fullPrefix + targetFilename, filename: this.fullPrefix + targetFilename,
...resource, ...resource,
completed completed,
}; };
logger.info(`Pinging Webhook: ${this.webhookUrl}`); logger.info(`Pinging Webhook: ${this.webhookUrl}`);
if (this.webhookUrl.startsWith("http://") || this.webhookUrl.startsWith("https://")) { if (
await fetch(this.webhookUrl, {method: "POST", body: JSON.stringify(body)}); this.webhookUrl.startsWith("http://") ||
this.webhookUrl.startsWith("https://")
) {
await fetch(this.webhookUrl, {
method: "POST",
body: JSON.stringify(body),
});
} else if (this.webhookUrl.startsWith("redis://")) { } else if (this.webhookUrl.startsWith("redis://")) {
const parts = this.webhookUrl.split("/"); const parts = this.webhookUrl.split("/");
if (parts.length !== 5) { if (parts.length !== 5) {
logger.fatal("redis webhook url must be in format: redis://<host>:<port>/<db>/<key>"); logger.fatal(
"redis webhook url must be in format: redis://<host>:<port>/<db>/<key>",
);
} }
const redis = await initRedis(parts.slice(0, 4).join("/")); const redis = await initRedis(parts.slice(0, 4).join("/"));
await redis.rpush(parts[4], JSON.stringify(body)); await redis.rpush(parts[4], JSON.stringify(body));
@ -139,7 +163,8 @@ export function initStorage() {
return null; return null;
} }
const endpointUrl = process.env.STORE_ENDPOINT_URL + (process.env.STORE_PATH || ""); const endpointUrl =
process.env.STORE_ENDPOINT_URL + (process.env.STORE_PATH || "");
const storeInfo = { const storeInfo = {
endpointUrl, endpointUrl,
accessKey: process.env.STORE_ACCESS_KEY, accessKey: process.env.STORE_ACCESS_KEY,
@ -156,7 +181,6 @@ export function initStorage() {
return new S3StorageSync(storeInfo, opts); return new S3StorageSync(storeInfo, opts);
} }
export async function getFileSize(filename: string) { export async function getFileSize(filename: string) {
const stats = await fsp.stat(filename); const stats = await fsp.stat(filename);
return stats.size; return stats.size;
@ -170,20 +194,29 @@ export async function getDirSize(dir: string) {
return size; return size;
} }
export async function checkDiskUtilization(
// TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
export async function checkDiskUtilization(params: Record<string, any>, archiveDirSize: number, dfOutput=null) { params: Record<string, any>,
const diskUsage : Record<string, string> = await getDiskUsage("/crawls", dfOutput); archiveDirSize: number,
dfOutput = null,
) {
const diskUsage: Record<string, string> = await getDiskUsage(
"/crawls",
dfOutput,
);
const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1)); const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1));
// Check that disk usage isn't already above threshold // Check that disk usage isn't already above threshold
if (usedPercentage >= params.diskUtilization) { if (usedPercentage >= params.diskUtilization) {
logger.info(`Disk utilization threshold reached ${usedPercentage}% > ${params.diskUtilization}%, stopping`); logger.info(
`Disk utilization threshold reached ${usedPercentage}% > ${params.diskUtilization}%, stopping`,
);
return { return {
stop: true, stop: true,
used: usedPercentage, used: usedPercentage,
projected: null, projected: null,
threshold: params.diskUtilization threshold: params.diskUtilization,
}; };
} }
@ -199,15 +232,20 @@ export async function checkDiskUtilization(params: Record<string, any>, archiveD
} }
const projectedTotal = kbUsed + kbArchiveDirSize; const projectedTotal = kbUsed + kbArchiveDirSize;
const projectedUsedPercentage = calculatePercentageUsed(projectedTotal, kbTotal); const projectedUsedPercentage = calculatePercentageUsed(
projectedTotal,
kbTotal,
);
if (projectedUsedPercentage >= params.diskUtilization) { if (projectedUsedPercentage >= params.diskUtilization) {
logger.info(`Disk utilization projected to reach threshold ${projectedUsedPercentage}% > ${params.diskUtilization}%, stopping`); logger.info(
`Disk utilization projected to reach threshold ${projectedUsedPercentage}% > ${params.diskUtilization}%, stopping`,
);
return { return {
stop: true, stop: true,
used: usedPercentage, used: usedPercentage,
projected: projectedUsedPercentage, projected: projectedUsedPercentage,
threshold: params.diskUtilization threshold: params.diskUtilization,
}; };
} }
@ -215,7 +253,7 @@ export async function checkDiskUtilization(params: Record<string, any>, archiveD
stop: false, stop: false,
used: usedPercentage, used: usedPercentage,
projected: projectedUsedPercentage, projected: projectedUsedPercentage,
threshold: params.diskUtilization threshold: params.diskUtilization,
}; };
} }
@ -228,9 +266,9 @@ export async function getDFOutput(path: string) {
export async function getDiskUsage(path = "/crawls", dfOutput = null) { export async function getDiskUsage(path = "/crawls", dfOutput = null) {
const result = dfOutput || (await getDFOutput(path)); const result = dfOutput || (await getDFOutput(path));
const lines = result.split("\n"); const lines = result.split("\n");
const keys = lines[0].split(/\s+/ig); const keys = lines[0].split(/\s+/gi);
const rows = lines.slice(1).map(line => { const rows = lines.slice(1).map((line) => {
const values = line.split(/\s+/ig); const values = line.split(/\s+/gi);
// TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
return keys.reduce((o: Record<string, any>, k, index) => { return keys.reduce((o: Record<string, any>, k, index) => {
@ -245,13 +283,16 @@ export function calculatePercentageUsed(used: number, total: number) {
return Math.round((used / total) * 100); return Math.round((used / total) * 100);
} }
function checksumFile(hashName: string, path: string) : Promise<{hash: string, crc32: number}>{ function checksumFile(
hashName: string,
path: string,
): Promise<{ hash: string; crc32: number }> {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
const hash = createHash(hashName); const hash = createHash(hashName);
let crc: number = 0; let crc: number = 0;
const stream = fs.createReadStream(path); const stream = fs.createReadStream(path);
stream.on("error", err => reject(err)); stream.on("error", (err) => reject(err));
stream.on("data", (chunk) => { stream.on("data", (chunk) => {
hash.update(chunk); hash.update(chunk);
crc = crc32(chunk, crc); crc = crc32(chunk, crc);
@ -261,10 +302,12 @@ function checksumFile(hashName: string, path: string) : Promise<{hash: string, c
} }
export function interpolateFilename(filename: string, crawlId: string) { export function interpolateFilename(filename: string, crawlId: string) {
filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.-]/g, "")); filename = filename.replace(
"@ts",
new Date().toISOString().replace(/[:TZz.-]/g, ""),
);
filename = filename.replace("@hostname", os.hostname()); filename = filename.replace("@hostname", os.hostname());
filename = filename.replace("@hostsuffix", os.hostname().slice(-14)); filename = filename.replace("@hostsuffix", os.hostname().slice(-14));
filename = filename.replace("@id", crawlId); filename = filename.replace("@id", crawlId);
return filename; return filename;
} }
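checksumFile above hashes and CRCs in one pass over a read stream. The core pattern, sketched with just the sha256 half (crc32 comes from a separate helper not shown in this hunk):

import fs from "fs";
import { createHash } from "crypto";

function sha256File(path: string): Promise<string> {
  return new Promise((resolve, reject) => {
    const hash = createHash("sha256");
    const stream = fs.createReadStream(path);
    stream.on("error", (err) => reject(err));
    // Hash incrementally so large files never load fully into memory.
    stream.on("data", (chunk) => hash.update(chunk));
    stream.on("end", () => resolve(hash.digest("hex")));
  });
}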

@ -15,25 +15,39 @@ export abstract class BaseTextExtract extends WARCResourceWriter {
this.cdp = cdp; this.cdp = cdp;
} }
async extractAndStoreText(resourceType: string, ignoreIfMatchesLast = false, saveToWarc = false) { async extractAndStoreText(
resourceType: string,
ignoreIfMatchesLast = false,
saveToWarc = false,
) {
try { try {
const text = await this.doGetText(); const text = await this.doGetText();
if (ignoreIfMatchesLast && text === this.lastText) { if (ignoreIfMatchesLast && text === this.lastText) {
this.lastText = this.text; this.lastText = this.text;
logger.debug("Skipping, extracted text unchanged from last extraction", {url: this.url}, "text"); logger.debug(
"Skipping, extracted text unchanged from last extraction",
{ url: this.url },
"text",
);
return { changed: false, text }; return { changed: false, text };
} }
if (saveToWarc) { if (saveToWarc) {
await this.writeBufferToWARC(new TextEncoder().encode(text), resourceType, "text/plain"); await this.writeBufferToWARC(
logger.debug(`Text Extracted (type: ${resourceType}) for ${this.url} written to ${this.warcName}`); new TextEncoder().encode(text),
resourceType,
"text/plain",
);
logger.debug(
`Text Extracted (type: ${resourceType}) for ${this.url} written to ${this.warcName}`,
);
} }
this.lastText = text; this.lastText = text;
return { changed: true, text }; return { changed: true, text };
} // TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
catch (e: any) { } catch (e: any) {
logger.debug("Error extracting text", e, "text"); logger.debug("Error extracting text", e, "text");
return { changed: false, text: null }; return { changed: false, text: null };
} }
@ -42,19 +56,30 @@ export abstract class BaseTextExtract extends WARCResourceWriter {
abstract doGetText(): Promise<string>; abstract doGetText(): Promise<string>;
} }
// ============================================================================ // ============================================================================
export class TextExtractViaSnapshot extends BaseTextExtract { export class TextExtractViaSnapshot extends BaseTextExtract {
async doGetText(): Promise<string> { async doGetText(): Promise<string> {
const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {computedStyles: []}); const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {
computedStyles: [],
});
return this.parseTextFromDOMSnapshot(result); return this.parseTextFromDOMSnapshot(result);
} }
parseTextFromDOMSnapshot(result: Protocol.DOMSnapshot.CaptureSnapshotResponse) : string { parseTextFromDOMSnapshot(
result: Protocol.DOMSnapshot.CaptureSnapshotResponse,
): string {
const TEXT_NODE = 3; const TEXT_NODE = 3;
const ELEMENT_NODE = 1; const ELEMENT_NODE = 1;
const SKIPPED_NODES = ["SCRIPT", "STYLE", "HEADER", "FOOTER", "BANNER-DIV", "NOSCRIPT", "TITLE"]; const SKIPPED_NODES = [
"SCRIPT",
"STYLE",
"HEADER",
"FOOTER",
"BANNER-DIV",
"NOSCRIPT",
"TITLE",
];
const { strings, documents } = result; const { strings, documents } = result;
@ -91,11 +116,13 @@ export class TextExtractViaSnapshot extends BaseTextExtract {
} }
} }
// ============================================================================ // ============================================================================
export class TextExtractViaDocument extends BaseTextExtract { export class TextExtractViaDocument extends BaseTextExtract {
async doGetText(): Promise<string> { async doGetText(): Promise<string> {
const result = await this.cdp.send("DOM.getDocument", {"depth": -1, "pierce": true}); const result = await this.cdp.send("DOM.getDocument", {
depth: -1,
pierce: true,
});
return this.parseTextFromDOM(result); return this.parseTextFromDOM(result);
} }
@ -108,8 +135,20 @@ export class TextExtractViaDocument extends BaseTextExtract {
return accum.join("\n"); return accum.join("\n");
} }
parseText(node: Protocol.DOM.Node, metadata: Record<string, string> | null, accum: string[]) { parseText(
const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"]; node: Protocol.DOM.Node,
metadata: Record<string, string> | null,
accum: string[],
) {
const SKIPPED_NODES = [
"head",
"script",
"style",
"header",
"footer",
"banner-div",
"noscript",
];
const EMPTY_LIST: Protocol.DOM.Node[] = []; const EMPTY_LIST: Protocol.DOM.Node[] = [];
const TEXT = "#text"; const TEXT = "#text";
const TITLE = "title"; const TITLE = "title";
@ -150,4 +189,3 @@ export class TextExtractViaDocument extends BaseTextExtract {
} }
} }
} }
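TextExtractViaSnapshot walks the flattened DOMSnapshot structure: node fields are parallel arrays of indexes into a shared string table. A minimal sketch of reading it, assuming puppeteer-core's CDPSession; listNodeNames is an illustrative name:

import { CDPSession, Protocol } from "puppeteer-core";

async function listNodeNames(cdp: CDPSession): Promise<string[]> {
  const result: Protocol.DOMSnapshot.CaptureSnapshotResponse = await cdp.send(
    "DOMSnapshot.captureSnapshot",
    { computedStyles: [] },
  );
  const { strings, documents } = result;
  const nodes = documents[0].nodes;
  // nodeName holds indexes into `strings`; -1 means no value.
  return (nodes.nodeName || []).map((i) => (i >= 0 ? strings[i] : ""));
}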

@ -1,7 +1,7 @@
import { logger } from "./logger.js"; import { logger } from "./logger.js";
export function sleep(seconds: number) { export function sleep(seconds: number) {
return new Promise(resolve => setTimeout(resolve, seconds * 1000)); return new Promise((resolve) => setTimeout(resolve, seconds * 1000));
} }
// TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
@ -13,27 +13,33 @@ export function timedRun(
message = "Promise timed out", message = "Promise timed out",
logDetails = {}, logDetails = {},
context = "general", context = "general",
isWarn=false isWarn = false,
) { ) {
// return Promise return value or log error if timeout is reached first // return Promise return value or log error if timeout is reached first
const timeout = seconds * 1000; const timeout = seconds * 1000;
const rejectPromiseOnTimeout = (timeout: number) => { const rejectPromiseOnTimeout = (timeout: number) => {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
setTimeout(() => (reject("timeout reached")), timeout); setTimeout(() => reject("timeout reached"), timeout);
}); });
}; };
return Promise.race([promise, rejectPromiseOnTimeout(timeout)]) return Promise.race([promise, rejectPromiseOnTimeout(timeout)]).catch(
.catch((err) => { (err) => {
if (err == "timeout reached") { if (err == "timeout reached") {
const logFunc = isWarn ? logger.warn : logger.error; const logFunc = isWarn ? logger.warn : logger.error;
logFunc.call(logger, message, {"seconds": seconds, ...logDetails}, context); logFunc.call(
logger,
message,
{ seconds: seconds, ...logDetails },
context,
);
} else { } else {
//logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context); //logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
throw err; throw err;
} }
}); },
);
} }
export function secondsElapsed(startTime: number, nowDate: Date | null = null) { export function secondsElapsed(startTime: number, nowDate: Date | null = null) {
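timedRun races a promise against a timeout and logs instead of throwing when the timeout wins. An illustrative call, assuming the helper is imported from this module; the URL and 30-second budget are placeholders:

import { timedRun } from "./timing.js";

async function fetchWithBudget(url: string) {
  // isWarn = true downgrades the timeout log from error to warn.
  return await timedRun(
    fetch(url),
    30,
    "Fetch timed out",
    { url },
    "fetch",
    true,
  );
}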

@ -2,8 +2,7 @@ import fs from "fs";
import path from "path"; import path from "path";
import * as warcio from "warcio"; import * as warcio from "warcio";
export class WARCResourceWriter export class WARCResourceWriter {
{
// TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
page: any; page: any;
@ -12,16 +11,32 @@ export class WARCResourceWriter
warcName: string; warcName: string;
date: Date; date: Date;
constructor({url, directory, date, warcName} : {url: string, directory: string, date: Date, warcName: string}) { constructor({
url,
directory,
date,
warcName,
}: {
url: string;
directory: string;
date: Date;
warcName: string;
}) {
this.url = url; this.url = url;
this.directory = directory; this.directory = directory;
this.warcName = path.join(this.directory, warcName); this.warcName = path.join(this.directory, warcName);
this.date = date ? date : new Date(); this.date = date ? date : new Date();
} }
async writeBufferToWARC(contents: Uint8Array, resourceType: string, contentType: string) { async writeBufferToWARC(
contents: Uint8Array,
resourceType: string,
contentType: string,
) {
const warcRecord = await this.wrap(contents, resourceType, contentType); const warcRecord = await this.wrap(contents, resourceType, contentType);
const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true}); const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {
gzip: true,
});
fs.appendFileSync(this.warcName, warcRecordBuffer); fs.appendFileSync(this.warcName, warcRecordBuffer);
} }
@ -34,12 +49,15 @@ export class WARCResourceWriter
} }
const resourceUrl = `urn:${resourceType}:${this.url}`; const resourceUrl = `urn:${resourceType}:${this.url}`;
return warcio.WARCRecord.create({ return warcio.WARCRecord.create(
{
url: resourceUrl, url: resourceUrl,
date: this.date.toISOString(), date: this.date.toISOString(),
type: warcRecordType, type: warcRecordType,
warcVersion, warcVersion,
warcHeaders warcHeaders,
}, content()); },
content(),
);
} }
} }
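Concretely, the constructor and writeBufferToWARC above combine like this — a minimal sketch, assuming the module path; the directory, file name, and payload are illustrative:

import { WARCResourceWriter } from "./warcresourcewriter.js";

async function writeExampleResource(): Promise<void> {
  const writer = new WARCResourceWriter({
    url: "https://example.com/",
    directory: "./archive",
    date: new Date(),
    warcName: "resources.warc.gz",
  });
  // Appends a gzipped resource record under urn:text:https://example.com/
  await writer.writeBufferToWARC(
    new TextEncoder().encode("example text"),
    "text",
    "text/plain",
  );
}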

@ -7,10 +7,8 @@ import { WARCSerializer } from "warcio/node";
import { logger, errJSON } from "./logger.js"; import { logger, errJSON } from "./logger.js";
import type { IndexerOffsetLength, WARCRecord } from "warcio"; import type { IndexerOffsetLength, WARCRecord } from "warcio";
// ================================================================= // =================================================================
export class WARCWriter implements IndexerOffsetLength export class WARCWriter implements IndexerOffsetLength {
{
archivesDir: string; archivesDir: string;
tempCdxDir: string; tempCdxDir: string;
filename: string; filename: string;
@ -25,8 +23,19 @@ export class WARCWriter implements IndexerOffsetLength
fh?: Writable | null; fh?: Writable | null;
cdxFH?: Writable | null; cdxFH?: Writable | null;
constructor({archivesDir, tempCdxDir, filename, gzip, logDetails} : constructor({
{archivesDir: string, tempCdxDir: string, filename: string, gzip: boolean, logDetails: Record<string, string>}) { archivesDir,
tempCdxDir,
filename,
gzip,
logDetails,
}: {
archivesDir: string;
tempCdxDir: string;
filename: string;
gzip: boolean;
logDetails: Record<string, string>;
}) {
this.archivesDir = archivesDir; this.archivesDir = archivesDir;
this.tempCdxDir = tempCdxDir; this.tempCdxDir = tempCdxDir;
this.filename = filename; this.filename = filename;
@@ -43,14 +52,22 @@ export class WARCWriter implements IndexerOffsetLength
async initFH() { async initFH() {
if (!this.fh) { if (!this.fh) {
this.fh = fs.createWriteStream(path.join(this.archivesDir, this.filename)); this.fh = fs.createWriteStream(
path.join(this.archivesDir, this.filename),
);
} }
if (!this.cdxFH && this.tempCdxDir) { if (!this.cdxFH && this.tempCdxDir) {
this.cdxFH = fs.createWriteStream(path.join(this.tempCdxDir, this.filename + ".cdx")); this.cdxFH = fs.createWriteStream(
path.join(this.tempCdxDir, this.filename + ".cdx"),
);
} }
} }
async writeRecordPair(responseRecord: WARCRecord, requestRecord: WARCRecord, responseSerializer: WARCSerializer | undefined = undefined) { async writeRecordPair(
responseRecord: WARCRecord,
requestRecord: WARCRecord,
responseSerializer: WARCSerializer | undefined = undefined,
) {
const opts = { gzip: this.gzip }; const opts = { gzip: this.gzip };
if (!responseSerializer) { if (!responseSerializer) {
@@ -59,15 +76,20 @@ export class WARCWriter implements IndexerOffsetLength
await this.initFH(); await this.initFH();
this.recordLength = await this._writeRecord(responseRecord, responseSerializer); this.recordLength = await this._writeRecord(
responseRecord,
responseSerializer,
);
this._writeCDX(responseRecord); this._writeCDX(responseRecord);
const requestSerializer = new WARCSerializer(requestRecord, opts); const requestSerializer = new WARCSerializer(requestRecord, opts);
this.recordLength = await this._writeRecord(requestRecord, requestSerializer); this.recordLength = await this._writeRecord(
requestRecord,
requestSerializer,
);
this._writeCDX(requestRecord); this._writeCDX(requestRecord);
} }
async _writeRecord(record: WARCRecord, serializer: WARCSerializer) { async _writeRecord(record: WARCRecord, serializer: WARCSerializer) {
@@ -83,7 +105,11 @@ export class WARCWriter implements IndexerOffsetLength
try { try {
this.fh.write(chunk); this.fh.write(chunk);
} catch (e) { } catch (e) {
logger.error("Error writing to WARC, corruption possible", {...errJSON(e), url, ...this.logDetails}, "writer"); logger.error(
"Error writing to WARC, corruption possible",
{ ...errJSON(e), url, ...this.logDetails },
"writer",
);
} }
} }
@@ -119,7 +145,7 @@ export class WARCWriter implements IndexerOffsetLength
// ================================================================= // =================================================================
export function streamFinish(fh: Writable) { export function streamFinish(fh: Writable) {
const p = new Promise<void>(resolve => { const p = new Promise<void>((resolve) => {
fh.once("finish", () => resolve()); fh.once("finish", () => resolve());
}); });
fh.end(); fh.end();
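To illustrate the streamFinish helper above: it ends the stream and resolves once the underlying "finish" event fires. A small sketch follows, with an assumed file path.

import fs from "fs";

async function flushExample() {
  const out = fs.createWriteStream("/tmp/example.cdx"); // hypothetical path
  out.write("com,example)/ 20231109000000 {}\n");
  await streamFinish(out); // calls out.end(), then awaits "finish"
}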


@@ -16,9 +16,14 @@ const TEARDOWN_TIMEOUT = 10;
const FINISHED_TIMEOUT = 60; const FINISHED_TIMEOUT = 60;
// =========================================================================== // ===========================================================================
export function runWorkers(
// TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
export function runWorkers(crawler: any, numWorkers: number, maxPageTime: number, collDir: string) { crawler: any,
numWorkers: number,
maxPageTime: number,
collDir: string,
) {
logger.info(`Creating ${numWorkers} workers`, {}, "worker"); logger.info(`Creating ${numWorkers} workers`, {}, "worker");
const workers = []; const workers = [];
@@ -39,13 +44,12 @@ export function runWorkers(crawler: any, numWorkers: number, maxPageTime: number
} }
for (let i = 0; i < numWorkers; i++) { for (let i = 0; i < numWorkers; i++) {
workers.push(new PageWorker((i + offset), crawler, maxPageTime, collDir)); workers.push(new PageWorker(i + offset, crawler, maxPageTime, collDir));
} }
return Promise.allSettled(workers.map((worker) => worker.run())); return Promise.allSettled(workers.map((worker) => worker.run()));
} }
// =========================================================================== // ===========================================================================
// TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -55,17 +59,18 @@ export type WorkerOpts = Record<string, any> & {
workerid: WorkerId; workerid: WorkerId;
// eslint-disable-next-line @typescript-eslint/ban-types // eslint-disable-next-line @typescript-eslint/ban-types
callbacks: Record<string, Function>; callbacks: Record<string, Function>;
directFetchCapture?: ((url: string) => Promise<{fetched: boolean, mime: string}>) | null; directFetchCapture?:
| ((url: string) => Promise<{ fetched: boolean; mime: string }>)
| null;
}; };
// =========================================================================== // ===========================================================================
export type WorkerState = WorkerOpts & { export type WorkerState = WorkerOpts & {
data: PageState data: PageState;
}; };
// =========================================================================== // ===========================================================================
export class PageWorker export class PageWorker {
{
id: WorkerId; id: WorkerId;
// TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -91,16 +96,25 @@ export class PageWorker
recorder: Recorder; recorder: Recorder;
constructor(
id: WorkerId,
// TODO: Fix this the next time the file is edited. // TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
constructor(id: WorkerId, crawler: any, maxPageTime: number, collDir: string) { crawler: any,
maxPageTime: number,
collDir: string,
) {
this.id = id; this.id = id;
this.crawler = crawler; this.crawler = crawler;
this.maxPageTime = maxPageTime; this.maxPageTime = maxPageTime;
this.logDetails = { workerid: this.id }; this.logDetails = { workerid: this.id };
this.recorder = new Recorder({workerid: id, collDir, crawler: this.crawler}); this.recorder = new Recorder({
workerid: id,
collDir,
crawler: this.crawler,
});
this.crawler.browser.recorders.push(this.recorder); this.crawler.browser.recorders.push(this.recorder);
} }
@@ -121,7 +135,7 @@ export class PageWorker
TEARDOWN_TIMEOUT, TEARDOWN_TIMEOUT,
"Page Teardown Timed Out", "Page Teardown Timed Out",
this.logDetails, this.logDetails,
"worker" "worker",
); );
} catch (e) { } catch (e) {
// ignore // ignore
@@ -129,13 +143,17 @@ export class PageWorker
} }
try { try {
logger.debug("Closing page", {crashed: this.crashed, workerid: this.id}, "worker"); logger.debug(
"Closing page",
{ crashed: this.crashed, workerid: this.id },
"worker",
);
await timedRun( await timedRun(
this.page.close(), this.page.close(),
TEARDOWN_TIMEOUT, TEARDOWN_TIMEOUT,
"Page Close Timed Out", "Page Close Timed Out",
this.logDetails, this.logDetails,
"worker" "worker",
); );
} catch (e) { } catch (e) {
// ignore // ignore
@@ -156,8 +174,18 @@ export class PageWorker
} }
async initPage(url: string): Promise<WorkerOpts> { async initPage(url: string): Promise<WorkerOpts> {
if (!this.crashed && this.page && this.opts && ++this.reuseCount <= MAX_REUSE && this.isSameOrigin(url)) { if (
logger.debug("Reusing page", {reuseCount: this.reuseCount, ...this.logDetails}, "worker"); !this.crashed &&
this.page &&
this.opts &&
++this.reuseCount <= MAX_REUSE &&
this.isSameOrigin(url)
) {
logger.debug(
"Reusing page",
{ reuseCount: this.reuseCount, ...this.logDetails },
"worker",
);
return this.opts; return this.opts;
} else if (this.page) { } else if (this.page) {
await this.closePage(); await this.closePage();
@@ -176,7 +204,7 @@ export class PageWorker
NEW_WINDOW_TIMEOUT, NEW_WINDOW_TIMEOUT,
"New Window Timed Out", "New Window Timed Out",
{ workerid }, { workerid },
"worker" "worker",
); );
if (!result) { if (!result) {
@@ -188,7 +216,9 @@ export class PageWorker
this.page = page; this.page = page;
this.cdp = cdp; this.cdp = cdp;
this.callbacks = {}; this.callbacks = {};
const directFetchCapture = this.recorder ? (x: string) => this.recorder.directFetchCapture(x) : null; const directFetchCapture = this.recorder
? (x: string) => this.recorder.directFetchCapture(x)
: null;
this.opts = { this.opts = {
page, page,
cdp, cdp,
@@ -203,7 +233,9 @@ export class PageWorker
// updated per page crawl // updated per page crawl
this.crashed = false; this.crashed = false;
this.crashBreak = new Promise((resolve, reject) => this.markCrashed = reject); this.crashBreak = new Promise(
(resolve, reject) => (this.markCrashed = reject),
);
this.logDetails = { page: page.url(), workerid }; this.logDetails = { page: page.url(), workerid };
@@ -213,7 +245,11 @@ export class PageWorker
page.on("error", (err: any) => { page.on("error", (err: any) => {
// ensure we're still on this page, otherwise ignore! // ensure we're still on this page, otherwise ignore!
if (this.page === page) { if (this.page === page) {
logger.error("Page Crashed", {...errJSON(err), ...this.logDetails}, "worker"); logger.error(
"Page Crashed",
{ ...errJSON(err), ...this.logDetails },
"worker",
);
this.crashed = true; this.crashed = true;
if (this.markCrashed) { if (this.markCrashed) {
this.markCrashed("crashed"); this.markCrashed("crashed");
@@ -224,9 +260,12 @@ export class PageWorker
await this.crawler.setupPage(this.opts); await this.crawler.setupPage(this.opts);
return this.opts; return this.opts;
} catch (err) { } catch (err) {
logger.warn("Error getting new page", {"workerid": this.id, ...errJSON(err)}, "worker"); logger.warn(
"Error getting new page",
{ workerid: this.id, ...errJSON(err) },
"worker",
);
retry++; retry++;
if (!this.crawler.browser.browser) { if (!this.crawler.browser.browser) {
@@ -234,7 +273,11 @@ export class PageWorker
} }
if (retry >= MAX_REUSE) { if (retry >= MAX_REUSE) {
logger.fatal("Unable to get new page, browser likely crashed", this.logDetails, "worker"); logger.fatal(
"Unable to get new page, browser likely crashed",
this.logDetails,
"worker",
);
} }
await sleep(0.5); await sleep(0.5);
@@ -262,7 +305,7 @@ export class PageWorker
const { data } = opts; const { data } = opts;
const { url } = data; const { url } = data;
logger.info("Starting page", {workerid, "page": url}, "worker"); logger.info("Starting page", { workerid, page: url }, "worker");
this.logDetails = { page: url, workerid }; this.logDetails = { page: url, workerid };
@@ -281,14 +324,17 @@ export class PageWorker
this.maxPageTime, this.maxPageTime,
"Page Worker Timeout", "Page Worker Timeout",
this.logDetails, this.logDetails,
"worker" "worker",
), ),
this.crashBreak this.crashBreak,
]); ]);
} catch (e) { } catch (e) {
if (e instanceof Error && e.message !== "logged" && !this.crashed) { if (e instanceof Error && e.message !== "logged" && !this.crashed) {
logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker"); logger.error(
"Worker Exception",
{ ...errJSON(e), ...this.logDetails },
"worker",
);
} }
} finally { } finally {
await timedRun( await timedRun(
@@ -296,7 +342,7 @@ export class PageWorker
FINISHED_TIMEOUT, FINISHED_TIMEOUT,
"Page Finished Timed Out", "Page Finished Timed Out",
this.logDetails, this.logDetails,
"worker" "worker",
); );
} }
} }
@@ -306,9 +352,17 @@ export class PageWorker
try { try {
await this.runLoop(); await this.runLoop();
logger.info("Worker done, all tasks complete", {workerid: this.id}, "worker"); logger.info(
"Worker done, all tasks complete",
{ workerid: this.id },
"worker",
);
} catch (e) { } catch (e) {
logger.error("Worker error, exiting", {...errJSON(e), workerid: this.id}, "worker"); logger.error(
"Worker error, exiting",
{ ...errJSON(e), workerid: this.id },
"worker",
);
} finally { } finally {
if (this.recorder) { if (this.recorder) {
await this.recorder.onDone(); await this.recorder.onDone();
@@ -342,7 +396,6 @@ export class PageWorker
await this.timedCrawlPage({ ...opts, data }); await this.timedCrawlPage({ ...opts, data });
loggedWaiting = false; loggedWaiting = false;
} else { } else {
// indicate that the worker has no more work (mostly for screencasting, status, etc...) // indicate that the worker has no more work (mostly for screencasting, status, etc...)
// depending on other works, will either get more work or crawl will end // depending on other works, will either get more work or crawl will end
@@ -354,7 +407,11 @@ export class PageWorker
// if pending, sleep and check again // if pending, sleep and check again
if (pending) { if (pending) {
if (!loggedWaiting) { if (!loggedWaiting) {
logger.debug("No crawl tasks, but pending tasks remain, waiting", {pending, workerid: this.id}, "worker"); logger.debug(
"No crawl tasks, but pending tasks remain, waiting",
{ pending, workerid: this.id },
"worker",
);
loggedWaiting = true; loggedWaiting = true;
} }
await sleep(0.5); await sleep(0.5);
@@ -368,5 +425,3 @@ export class PageWorker
} }
} }
} }
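A hedged calling sketch for the reworked runWorkers signature; the crawler instance and the numbers are illustrative, not taken from this commit.

async function startCrawl(crawler: any, maxPageTime: number) {
  // Four workers, pages stored under an assumed collection directory.
  const settled = await runWorkers(crawler, 4, maxPageTime, "/crawls/collections/my-crawl");
  // One PromiseSettledResult per worker, since runWorkers uses Promise.allSettled.
  return settled;
}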


@@ -10,17 +10,21 @@ function runCrawl(name, config, commandExtra = "") {
const configYaml = yaml.dump(config); const configYaml = yaml.dump(config);
try { try {
const proc = child_process.execSync(`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`, {input: configYaml, stdin: "inherit", encoding: "utf8"}); const proc = child_process.execSync(
`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`,
{ input: configYaml, stdin: "inherit", encoding: "utf8" },
);
console.log(proc); console.log(proc);
} } catch (error) {
catch (error) {
console.log(error); console.log(error);
} }
} }
function doesCDXContain(coll, value) { function doesCDXContain(coll, value) {
const data = fs.readFileSync(`test-crawls/collections/${coll}/indexes/index.cdxj`); const data = fs.readFileSync(
`test-crawls/collections/${coll}/indexes/index.cdxj`,
);
return data.indexOf(value) >= 0; return data.indexOf(value) >= 0;
} }
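The doesCDXContain helper above is a plain substring scan over the collection's CDXJ index, so callers only see a boolean; for example:

// True only if the serialized index mentions the value anywhere.
const captured = doesCDXContain("adblock-block", "www.googletagmanager.com");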
@@ -41,11 +45,13 @@ test("test crawl without ad block for specific URL", () => {
test("testcrawl with ad block for specific URL", () => { test("testcrawl with ad block for specific URL", () => {
const config = { const config = {
"url": "https://www.mozilla.org/en-US/firefox/", url: "https://www.mozilla.org/en-US/firefox/",
"blockAds": true, blockAds: true,
}; };
runCrawl("adblock-block", config); runCrawl("adblock-block", config);
expect(doesCDXContain("adblock-block", "www.googletagmanager.com")).toBe(false); expect(doesCDXContain("adblock-block", "www.googletagmanager.com")).toBe(
false,
);
}); });


@@ -11,7 +11,11 @@ test("dynamically add exclusion while crawl is running", async () => {
}); });
try { try {
exec("docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis", {"shell": "/bin/bash"}, callback); exec(
"docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis",
{ shell: "/bin/bash" },
callback,
);
} catch (error) { } catch (error) {
console.log(error); console.log(error);
} }
@@ -33,7 +37,10 @@ test("dynamically add exclusion while crawl is running", async () => {
const uids = await redis.hkeys("test:status"); const uids = await redis.hkeys("test:status");
// exclude all pages containing 'webrecorder', should clear out the queue and end the crawl // exclude all pages containing 'webrecorder', should clear out the queue and end the crawl
await redis.rpush(`${uids[0]}:msg`, JSON.stringify({type: "addExclusion", regex: "webrecorder"})); await redis.rpush(
`${uids[0]}:msg`,
JSON.stringify({ type: "addExclusion", regex: "webrecorder" }),
);
// ensure 'Add Exclusion' is contained in the debug logs // ensure 'Add Exclusion' is contained in the debug logs
const { stdout } = await p; const { stdout } = await p;
@@ -44,4 +51,3 @@ test("dynamically add exclusion while crawl is running", async () => {
await redis.disconnect(); await redis.disconnect();
}); });


@@ -3,16 +3,18 @@ import fs from "fs";
import path from "path"; import path from "path";
import md5 from "md5"; import md5 from "md5";
test("ensure basic crawl run with docker run passes", async () => { test("ensure basic crawl run with docker run passes", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title \"test title\" --description \"test description\""); child_process.execSync(
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description"',
);
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz"); child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz",
child_process.execSync("unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz"); );
child_process.execSync(
"unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz",
);
}); });
test("check that a combined warc file exists in the archive folder", () => { test("check that a combined warc file exists in the archive folder", () => {
@@ -27,9 +29,10 @@ test("check that a combined warc file exists in the archive folder", () => {
expect(captureFound).toEqual(1); expect(captureFound).toEqual(1);
}); });
test("check that a combined warc file is under the rolloverSize", () => { test("check that a combined warc file is under the rolloverSize", () => {
const warcLists = fs.readdirSync(path.join("test-crawls/collections/wr-net/wacz", "archive")); const warcLists = fs.readdirSync(
path.join("test-crawls/collections/wr-net/wacz", "archive"),
);
let rolloverSize = 0; let rolloverSize = 0;
function getFileSize(filename) { function getFileSize(filename) {
@@ -37,7 +40,9 @@ test("check that a combined warc file is under the rolloverSize", () => {
} }
for (let i = 0; i < warcLists.length; i++) { for (let i = 0; i < warcLists.length; i++) {
const size = getFileSize(path.join("test-crawls/collections/wr-net/wacz/archive/", warcLists[i])); const size = getFileSize(
path.join("test-crawls/collections/wr-net/wacz/archive/", warcLists[i]),
);
if (size < 10000) { if (size < 10000) {
rolloverSize = 1; rolloverSize = 1;
} }
@@ -46,27 +51,57 @@ test("check that a combined warc file is under the rolloverSize", () => {
}); });
test("check that the pages.jsonl file exists in the collection under the pages folder", () => { test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
expect(fs.existsSync("test-crawls/collections/wr-net/pages/pages.jsonl")).toBe(true); expect(
fs.existsSync("test-crawls/collections/wr-net/pages/pages.jsonl"),
).toBe(true);
}); });
test("check that the pages.jsonl file exists in the wacz under the pages folder", () => { test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
expect(fs.existsSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl")).toBe(true); expect(
fs.existsSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl"),
).toBe(true);
}); });
test("check that the hash in the pages folder and in the unzipped wacz folders match", () => { test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
const crawl_hash = md5(JSON.parse(fs.readFileSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl", "utf8").split("\n")[1])["text"]); const crawl_hash = md5(
const wacz_hash = md5(JSON.parse(fs.readFileSync("test-crawls/collections/wr-net/pages/pages.jsonl", "utf8").split("\n")[1])["text"]); JSON.parse(
const fixture_hash = md5(JSON.parse(fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1])["text"]); fs
.readFileSync(
"test-crawls/collections/wr-net/wacz/pages/pages.jsonl",
"utf8",
)
.split("\n")[1],
)["text"],
);
const wacz_hash = md5(
JSON.parse(
fs
.readFileSync(
"test-crawls/collections/wr-net/pages/pages.jsonl",
"utf8",
)
.split("\n")[1],
)["text"],
);
const fixture_hash = md5(
JSON.parse(
fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1],
)["text"],
);
expect(wacz_hash).toEqual(fixture_hash); expect(wacz_hash).toEqual(fixture_hash);
expect(wacz_hash).toEqual(crawl_hash); expect(wacz_hash).toEqual(crawl_hash);
}); });
test("check that the supplied title and description made it into datapackage.json", () => { test("check that the supplied title and description made it into datapackage.json", () => {
expect(fs.existsSync("test-crawls/collections/wr-net/wacz/datapackage.json")).toBe(true); expect(
fs.existsSync("test-crawls/collections/wr-net/wacz/datapackage.json"),
).toBe(true);
const data = fs.readFileSync("test-crawls/collections/wr-net/wacz/datapackage.json", "utf8"); const data = fs.readFileSync(
"test-crawls/collections/wr-net/wacz/datapackage.json",
"utf8",
);
const dataPackageJSON = JSON.parse(data); const dataPackageJSON = JSON.parse(data);
expect(dataPackageJSON.title).toEqual("test title"); expect(dataPackageJSON.title).toEqual("test title");
expect(dataPackageJSON.description).toEqual("test description"); expect(dataPackageJSON.description).toEqual("test description");


@@ -10,17 +10,21 @@ function runCrawl(name, config, commandExtra = "") {
const configYaml = yaml.dump(config); const configYaml = yaml.dump(config);
try { try {
const proc = child_process.execSync(`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`, {input: configYaml, stdin: "inherit", encoding: "utf8"}); const proc = child_process.execSync(
`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`,
{ input: configYaml, stdin: "inherit", encoding: "utf8" },
);
console.log(proc); console.log(proc);
} } catch (error) {
catch (error) {
console.log(error); console.log(error);
} }
} }
function doesCDXContain(coll, value) { function doesCDXContain(coll, value) {
const data = fs.readFileSync(`test-crawls/collections/${coll}/indexes/index.cdxj`); const data = fs.readFileSync(
`test-crawls/collections/${coll}/indexes/index.cdxj`,
);
return data.indexOf(value) >= 0; return data.indexOf(value) >= 0;
} }
@@ -39,131 +43,154 @@ test("test crawl without block for specific URL", () => {
}); });
*/ */
test("test block rule on specific URL", () => { test("test block rule on specific URL", () => {
const config = { const config = {
"url": "https://www.iana.org/", url: "https://www.iana.org/",
"blockRules": [ blockRules: [{ url: "adsense" }],
{"url": "adsense"}
]
}; };
runCrawl("block-1", config); runCrawl("block-1", config);
expect(doesCDXContain("block-1", "https://cse.google.com/adsense/search/async-ads.js")).toBe(false); expect(
doesCDXContain(
"block-1",
"https://cse.google.com/adsense/search/async-ads.js",
),
).toBe(false);
}); });
test("test block rule based on iframe text, content included due to match", () => { test("test block rule based on iframe text, content included due to match", () => {
const config = { const config = {
"url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI", url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
"blockRules": [{ blockRules: [
"url": "https://www.youtube.com/embed/", {
"frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"", url: "https://www.youtube.com/embed/",
"type": "allowOnly" frameTextMatch:
}] '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
type: "allowOnly",
},
],
}; };
runCrawl("block-2", config); runCrawl("block-2", config);
expect(doesCDXContain("block-2", "\"video/mp4\"")).toBe(true); expect(doesCDXContain("block-2", '"video/mp4"')).toBe(true);
}); });
test("test block rule based on iframe text, wrong text, content should be excluded", () => { test("test block rule based on iframe text, wrong text, content should be excluded", () => {
const config = { const config = {
"url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI", url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
"blockRules": [{ blockRules: [
"url": "https://www.youtube.com/embed/", {
"frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_R\\\\\"", url: "https://www.youtube.com/embed/",
"type": "allowOnly" frameTextMatch:
}] '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_R\\\\"',
type: "allowOnly",
},
],
}; };
runCrawl("block-3", config); runCrawl("block-3", config);
expect(doesCDXContain("block-3", "\"video/mp4\"")).toBe(false); expect(doesCDXContain("block-3", '"video/mp4"')).toBe(false);
}); });
test("test block rule based on iframe text, block matched", () => { test("test block rule based on iframe text, block matched", () => {
const config = { const config = {
"url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI", url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
"blockRules": [{ blockRules: [
"url": "https://www.youtube.com/embed/", {
"frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"", url: "https://www.youtube.com/embed/",
}] frameTextMatch:
'\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
},
],
}; };
runCrawl("block-4", config); runCrawl("block-4", config);
expect(doesCDXContain("block-4", "\"video/mp4\"")).toBe(false); expect(doesCDXContain("block-4", '"video/mp4"')).toBe(false);
}); });
test("test rule based on iframe text not matching, plus allowOnly iframe", () => { test("test rule based on iframe text not matching, plus allowOnly iframe", () => {
const config = { const config = {
"url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI", url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
"blockRules": [{ blockRules: [
"url": "example.com/embed/", {
"frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"", url: "example.com/embed/",
"type": "block" frameTextMatch:
}, { '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
"url": "(youtube.com|example.com)/embed/", type: "block",
"type": "allowOnly", },
"inFrameUrl": "oembed.link/", {
}] url: "(youtube.com|example.com)/embed/",
type: "allowOnly",
inFrameUrl: "oembed.link/",
},
],
}; };
runCrawl("non-block-5", config); runCrawl("non-block-5", config);
expect(doesCDXContain("non-block-5", "\"video/mp4\"")).toBe(true); expect(doesCDXContain("non-block-5", '"video/mp4"')).toBe(true);
}); });
test("test block url in frame url", () => { test("test block url in frame url", () => {
const config = { const config = {
"url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI", url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
"blockRules": [{ blockRules: [
"url": "maxresdefault.jpg", {
"type": "block", url: "maxresdefault.jpg",
"inFrameUrl": "youtube.com/embed", type: "block",
}] inFrameUrl: "youtube.com/embed",
},
],
}; };
runCrawl("block-6", config); runCrawl("block-6", config);
expect(doesCDXContain("block-6", "\"https://i.ytimg.com/vi/aT-Up5Y4uRI/maxresdefault.jpg\"")).toBe(false); expect(
doesCDXContain(
"block-6",
'"https://i.ytimg.com/vi/aT-Up5Y4uRI/maxresdefault.jpg"',
),
).toBe(false);
}); });
test("test block rules complex example, block external urls on main frame, but not on youtube", () => { test("test block rules complex example, block external urls on main frame, but not on youtube", () => {
const config = { const config = {
"seeds": [ seeds: ["https://archiveweb.page/en/troubleshooting/errors/"],
"https://archiveweb.page/en/troubleshooting/errors/", depth: "0",
blockRules: [
{
url: "(archiveweb.page|www.youtube.com)",
type: "allowOnly",
inFrameUrl: "archiveweb.page",
},
{
url: "https://archiveweb.page/assets/js/vendor/lunr.min.js",
inFrameUrl: "archiveweb.page",
},
{
url: "https://www.youtube.com/embed/",
type: "allowOnly",
frameTextMatch:
'(\\\\"channelId\\\\":\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\")',
},
], ],
"depth": "0",
"blockRules": [{
"url": "(archiveweb.page|www.youtube.com)",
"type": "allowOnly",
"inFrameUrl": "archiveweb.page"
}, {
"url": "https://archiveweb.page/assets/js/vendor/lunr.min.js",
"inFrameUrl": "archiveweb.page"
}, {
"url": "https://www.youtube.com/embed/",
"type": "allowOnly",
"frameTextMatch": "(\\\\\"channelId\\\\\":\\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\\")"
}],
"combineWARC": true, combineWARC: true,
"logging": "stats,debug" logging: "stats,debug",
}; };
runCrawl("block-7", config); runCrawl("block-7", config);
expect(doesCDXContain("block-7", "\"https://archiveweb.page/assets/js/vendor/lunr.min.js\"")).toBe(false); expect(
expect(doesCDXContain("block-7", "\"video/mp4\"")).toBe(true); doesCDXContain(
"block-7",
'"https://archiveweb.page/assets/js/vendor/lunr.min.js"',
),
).toBe(false);
expect(doesCDXContain("block-7", '"video/mp4"')).toBe(true);
}); });
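Taken together, the tests above exercise four fields of a blockRules entry. Below is a minimal annotated sketch, with semantics inferred from the test names; treat it as a reading aid rather than a spec.

const rule = {
  // Pattern matched against the request URL.
  url: "https://www.youtube.com/embed/",
  // "block" (the default) drops matching requests; "allowOnly" drops everything except matches.
  type: "allowOnly",
  // Restricts the rule to requests made from frames whose URL matches this pattern.
  inFrameUrl: "oembed.link/",
  // Regex tested against the frame's text content before the rule applies.
  frameTextMatch: '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
};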


@@ -3,31 +3,30 @@ import {exec as execCallback } from "child_process";
const exec = util.promisify(execCallback); const exec = util.promisify(execCallback);
test("check that the collection name is properly validated", async () => { test("check that the collection name is properly validated", async () => {
let passed = ""; let passed = "";
try { try {
await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid"); await exec(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid",
);
passed = true; passed = true;
} } catch (error) {
catch (error) {
passed = false; passed = false;
} }
expect(passed).toBe(true); expect(passed).toBe(true);
}); });
test("check that the collection name is not accepted if it doesn't meets our standards", async () => { test("check that the collection name is not accepted if it doesn't meets our standards", async () => {
let passed = ""; let passed = "";
try { try {
await exec("docker run webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid"); await exec(
"docker run webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid",
);
passed = true; passed = true;
} } catch (e) {
catch(e){
passed = false; passed = false;
} }
expect(passed).toBe(false); expect(passed).toBe(false);
}); });


@@ -6,17 +6,19 @@ import {exec as execCallback } from "child_process";
const exec = util.promisify(execCallback); const exec = util.promisify(execCallback);
test("check yaml config file with seed list is used", async () => { test("check yaml config file with seed list is used", async () => {
try { try {
await exec(
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --config /tests/fixtures/crawl-1.yaml --depth 0"); "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --config /tests/fixtures/crawl-1.yaml --depth 0",
} );
catch (error) { } catch (error) {
console.log(error); console.log(error);
} }
const crawledPages = fs.readFileSync("test-crawls/collections/configtest/pages/pages.jsonl", "utf8"); const crawledPages = fs.readFileSync(
"test-crawls/collections/configtest/pages/pages.jsonl",
"utf8",
);
const pages = new Set(); const pages = new Set();
for (const line of crawledPages.trim().split("\n")) { for (const line of crawledPages.trim().split("\n")) {
@@ -26,7 +28,9 @@ test("check yaml config file with seed list is used", async () => {
} }
} }
const config = yaml.load(fs.readFileSync("tests/fixtures/crawl-1.yaml", "utf8")); const config = yaml.load(
fs.readFileSync("tests/fixtures/crawl-1.yaml", "utf8"),
);
let foundAllSeeds = true; let foundAllSeeds = true;
@@ -38,20 +42,24 @@ test("check yaml config file with seed list is used", async () => {
} }
expect(foundAllSeeds).toBe(true); expect(foundAllSeeds).toBe(true);
expect(fs.existsSync("test-crawls/collections/configtest/configtest.wacz")).toBe(true); expect(
fs.existsSync("test-crawls/collections/configtest/configtest.wacz"),
).toBe(true);
}); });
test("check yaml config file will be overwritten by command line", async () => { test("check yaml config file will be overwritten by command line", async () => {
try { try {
await exec(
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://specs.webrecorder.net/ --scopeType page --timeout 20000"); "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://specs.webrecorder.net/ --scopeType page --timeout 20000",
} );
catch (error) { } catch (error) {
console.log(error); console.log(error);
} }
const crawledPages = fs.readFileSync("test-crawls/collections/configtest-2/pages/pages.jsonl", "utf8"); const crawledPages = fs.readFileSync(
"test-crawls/collections/configtest-2/pages/pages.jsonl",
"utf8",
);
const pages = new Set(); const pages = new Set();
for (const line of crawledPages.trim().split("\n")) { for (const line of crawledPages.trim().split("\n")) {
@@ -63,5 +71,4 @@ test("check yaml config file will be overwritten by command line", async () => {
expect(pages.has("https://specs.webrecorder.net/")).toBe(true); expect(pages.has("https://specs.webrecorder.net/")).toBe(true);
expect(pages.size).toBe(1); expect(pages.size).toBe(1);
}); });


@@ -7,15 +7,20 @@ test("pass config file via stdin", async () => {
const config = yaml.load(configYaml); const config = yaml.load(configYaml);
try { try {
const proc = child_process.execSync("docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202", {input: configYaml, stdin: "inherit", encoding: "utf8"}); const proc = child_process.execSync(
"docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202",
{ input: configYaml, stdin: "inherit", encoding: "utf8" },
);
console.log(proc); console.log(proc);
} } catch (error) {
catch (error) {
console.log(error); console.log(error);
} }
const crawledPages = fs.readFileSync("test-crawls/collections/config-stdin/pages/pages.jsonl", "utf8"); const crawledPages = fs.readFileSync(
"test-crawls/collections/config-stdin/pages/pages.jsonl",
"utf8",
);
const pages = new Set(); const pages = new Set();
for (const line of crawledPages.trim().split("\n")) { for (const line of crawledPages.trim().split("\n")) {
@@ -37,6 +42,7 @@ test("pass config file via stdin", async () => {
} }
expect(foundAllSeeds).toBe(true); expect(foundAllSeeds).toBe(true);
expect(fs.existsSync("test-crawls/collections/config-stdin/config-stdin.wacz")).toBe(true); expect(
fs.existsSync("test-crawls/collections/config-stdin/config-stdin.wacz"),
).toBe(true);
}); });


@@ -1,31 +1,48 @@
import child_process from "child_process"; import child_process from "child_process";
import fs from "fs"; import fs from "fs";
test("ensure --overwrite with existing collection results in a successful crawl", async () => { test("ensure --overwrite with existing collection results in a successful crawl", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite"); child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite",
);
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite --overwrite"); child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite --overwrite",
);
}); });
test("check that the pages.jsonl file exists in the collection under the pages folder", () => { test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
expect(fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl")).toBe(true); expect(
fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl"),
).toBe(true);
}); });
test("check that the WACZ file exists in the collection", () => { test("check that the WACZ file exists in the collection", () => {
expect(fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl")).toBe(true); expect(
fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl"),
).toBe(true);
}); });
//----------- //-----------
test("ensure --overwrite results in a successful crawl even if collection didn't exist", async () => { test("ensure --overwrite results in a successful crawl even if collection didn't exist", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite-nothing --overwrite"); child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite-nothing --overwrite",
);
}); });
test("check that the pages.jsonl file exists in the collection under the pages folder", () => { test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
expect(fs.existsSync("test-crawls/collections/overwrite-nothing/pages/pages.jsonl")).toBe(true); expect(
fs.existsSync(
"test-crawls/collections/overwrite-nothing/pages/pages.jsonl",
),
).toBe(true);
}); });
test("check that the WACZ file exists in the collection", () => { test("check that the WACZ file exists in the collection", () => {
expect(fs.existsSync("test-crawls/collections/overwrite-nothing/pages/pages.jsonl")).toBe(true); expect(
fs.existsSync(
"test-crawls/collections/overwrite-nothing/pages/pages.jsonl",
),
).toBe(true);
}); });


@@ -1,23 +1,36 @@
import child_process from "child_process"; import child_process from "child_process";
test("test custom behaviors", async () => { test("test custom behaviors", async () => {
const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page"); const res = child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
);
const log = res.toString(); const log = res.toString();
// custom behavior ran for example.com // custom behavior ran for example.com
expect(log.indexOf("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.com/\",\"workerid\":0}}") > 0).toBe(true); expect(
log.indexOf(
'{"state":{},"msg":"test-stat","page":"https://example.com/","workerid":0}}',
) > 0,
).toBe(true);
// but not for example.org // but not for example.org
expect(log.indexOf("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.org/\",\"workerid\":0}}") > 0).toBe(false); expect(
log.indexOf(
'{"state":{},"msg":"test-stat","page":"https://example.org/","workerid":0}}',
) > 0,
).toBe(false);
expect(log.indexOf("{\"state\":{\"segments\":1},\"msg\":\"Skipping autoscroll, page seems to not be responsive to scrolling events\",\"page\":\"https://example.org/\",\"workerid\":0}}") > 0).toBe(true); expect(
log.indexOf(
'{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events","page":"https://example.org/","workerid":0}}',
) > 0,
).toBe(true);
// another custom behavior ran for webrecorder.net // another custom behavior ran for webrecorder.net
expect(log.indexOf("{\"state\":{},\"msg\":\"test-stat-2\",\"page\":\"https://webrecorder.net/\",\"workerid\":0}}") > 0).toBe(true); expect(
log.indexOf(
'{"state":{},"msg":"test-stat-2","page":"https://webrecorder.net/","workerid":0}}',
) > 0,
).toBe(true);
}); });


@@ -1,9 +1,8 @@
/* eslint-disable @typescript-eslint/no-unused-vars */ /* eslint-disable @typescript-eslint/no-unused-vars */
class TestBehavior2 class TestBehavior2 {
{
static init() { static init() {
return { return {
state: {} state: {},
}; };
} }
@@ -15,7 +14,6 @@ class TestBehavior2
return window.location.origin === "https://webrecorder.net"; return window.location.origin === "https://webrecorder.net";
} }
async *run(ctx) { async *run(ctx) {
ctx.log("In Test Behavior 2!"); ctx.log("In Test Behavior 2!");
yield ctx.Lib.getState(ctx, "test-stat-2"); yield ctx.Lib.getState(ctx, "test-stat-2");


@@ -1,9 +1,8 @@
/* eslint-disable @typescript-eslint/no-unused-vars */ /* eslint-disable @typescript-eslint/no-unused-vars */
class TestBehavior class TestBehavior {
{
static init() { static init() {
return { return {
state: {} state: {},
}; };
} }
@@ -15,7 +14,6 @@ class TestBehavior
return window.location.origin === "https://example.com"; return window.location.origin === "https://example.com";
} }
async *run(ctx) { async *run(ctx) {
ctx.log("In Test Behavior!"); ctx.log("In Test Behavior!");
yield ctx.Lib.getState(ctx, "test-stat"); yield ctx.Lib.getState(ctx, "test-stat");


@@ -1,16 +1,19 @@
import child_process from "child_process"; import child_process from "child_process";
import fs from "fs"; import fs from "fs";
test("ensure custom driver with custom selector crawls JS files as pages", async () => { test("ensure custom driver with custom selector crawls JS files as pages", async () => {
try { try {
child_process.execSync("docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs"); child_process.execSync(
} "docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs",
catch (error) { );
} catch (error) {
console.log(error); console.log(error);
} }
const crawledPages = fs.readFileSync("test-crawls/collections/custom-driver-1/pages/pages.jsonl", "utf8"); const crawledPages = fs.readFileSync(
"test-crawls/collections/custom-driver-1/pages/pages.jsonl",
"utf8",
);
const pages = new Set(); const pages = new Set();
for (const line of crawledPages.trim().split("\n")) { for (const line of crawledPages.trim().split("\n")) {
@@ -26,9 +29,8 @@ test("ensure custom driver with custom selector crawls JS files as pages", async
const expectedPages = new Set([ const expectedPages = new Set([
"https://www.iana.org/", "https://www.iana.org/",
"https://www.iana.org/_js/jquery.js", "https://www.iana.org/_js/jquery.js",
"https://www.iana.org/_js/iana.js" "https://www.iana.org/_js/iana.js",
]); ]);
expect(pages).toEqual(expectedPages); expect(pages).toEqual(expectedPages);
}); });


@@ -7,16 +7,21 @@ const exec = util.promisify(execCallback);
const extraHopsTimeout = 180000; const extraHopsTimeout = 180000;
test(
test("check that URLs are crawled 2 extra hops beyond depth", async () => { "check that URLs are crawled 2 extra hops beyond depth",
async () => {
try { try {
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://webrecorder.net/ --limit 7"); await exec(
} "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://webrecorder.net/ --limit 7",
catch (error) { );
} catch (error) {
console.log(error); console.log(error);
} }
const crawledPages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8"); const crawledPages = fs.readFileSync(
"test-crawls/collections/extra-hops-beyond/pages/pages.jsonl",
"utf8",
);
const crawledPagesArray = crawledPages.trim().split("\n"); const crawledPagesArray = crawledPages.trim().split("\n");
const expectedPages = [ const expectedPages = [
@@ -39,4 +44,6 @@ test("check that URLs are crawled 2 extra hops beyond depth", async () => {
} }
expect(expectedPages.indexOf(url) >= 0).toBe(true); expect(expectedPages.indexOf(url) >= 0).toBe(true);
} }
}, extraHopsTimeout); },
extraHopsTimeout,
);


@@ -2,8 +2,9 @@ import child_process from "child_process";
import fs from "fs"; import fs from "fs";
test("ensure that stats file is modified", async () => { test("ensure that stats file is modified", async () => {
const child = child_process.exec(
const child = child_process.exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --limit 3 --collection file-stats --statsFilename progress.json"); "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --limit 3 --collection file-stats --statsFilename progress.json",
);
// detect crawler exit // detect crawler exit
let crawler_exited = false; let crawler_exited = false;
@@ -12,7 +13,7 @@ test("ensure that stats file is modified", async () => {
}); });
// helper function to sleep // helper function to sleep
const sleep = ms => new Promise(res => setTimeout(res, ms)); const sleep = (ms) => new Promise((res) => setTimeout(res, ms));
// wait for stats file creation up to 30 secs (to not wait indefinitely) // wait for stats file creation up to 30 secs (to not wait indefinitely)
let counter = 0; let counter = 0;
@@ -23,7 +24,9 @@ test("ensure that stats file is modified", async () => {
} }
// get initial modification time // get initial modification time
const initial_mtime = fs.fstatSync(fs.openSync("test-crawls/progress.json", "r")).mtime; const initial_mtime = fs.fstatSync(
fs.openSync("test-crawls/progress.json", "r"),
).mtime;
// wait for crawler exit // wait for crawler exit
while (!crawler_exited) { while (!crawler_exited) {
@@ -31,12 +34,13 @@ test("ensure that stats file is modified", async () => {
} }
// get final modification time // get final modification time
const final_mtime = fs.fstatSync(fs.openSync("test-crawls/progress.json", "r")).mtime; const final_mtime = fs.fstatSync(
fs.openSync("test-crawls/progress.json", "r"),
).mtime;
// compare initial and final modification time // compare initial and final modification time
const diff = Math.abs(final_mtime - initial_mtime); const diff = Math.abs(final_mtime - initial_mtime);
expect(diff > 0).toBe(true); expect(diff > 0).toBe(true);
}); });
test("check that stats file format is correct", () => { test("check that stats file format is correct", () => {


@@ -5,4 +5,3 @@ seeds:
- https://specs.webrecorder.net/ - https://specs.webrecorder.net/
generateWACZ: true generateWACZ: true


@@ -1,4 +1,5 @@
export default async ({ data, page, crawler }) => { export default async ({ data, page, crawler }) => {
await crawler.loadPage(page, data, [{selector: "script[src]", extract: "src", isAttribute: false}]); await crawler.loadPage(page, data, [
{ selector: "script[src]", extract: "src", isAttribute: false },
]);
}; };


@@ -2,8 +2,9 @@ import child_process from "child_process";
import fs from "fs"; import fs from "fs";
test("ensure page limit reached", async () => { test("ensure page limit reached", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors \"\" --url https://webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json"); child_process.execSync(
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors "" --url https://webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json',
);
}); });
test("check limit written to stats file is as expected", () => { test("check limit written to stats file is as expected", () => {


@@ -2,9 +2,9 @@ import child_process from "child_process";
import fs from "fs"; import fs from "fs";
import path from "path"; import path from "path";
function jsonLinesToArray(string) { function jsonLinesToArray(string) {
return string.split("\n") return string
.split("\n")
.filter((line) => { .filter((line) => {
try { try {
JSON.parse(line); JSON.parse(line);
@@ -13,19 +13,19 @@ function jsonLinesToArray(string) {
return false; return false;
} }
}) })
.map(line => JSON.parse(line)); .map((line) => JSON.parse(line));
} }
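For reference, the jsonLinesToArray helper above silently drops lines that fail to parse; a small sketch:

const parsed = jsonLinesToArray('{"logLevel":"debug"}\nnot json\n{"logLevel":"warn"}');
// -> [{ logLevel: "debug" }, { logLevel: "warn" }]; the malformed middle line is filtered out.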
test("ensure crawl run with log options passes", async () => { test("ensure crawl run with log options passes", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://specs.webrecorder.net --generateWACZ --collection wr-specs-logs --logging debug,stats --logLevel debug,warn --context general"); child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://specs.webrecorder.net --generateWACZ --collection wr-specs-logs --logging debug,stats --logLevel debug,warn --context general",
);
}); });
test("check that log files exist and were filtered according to options", () => { test("check that log files exist and were filtered according to options", () => {
const logDir = "test-crawls/collections/wr-specs-logs/logs/"; const logDir = "test-crawls/collections/wr-specs-logs/logs/";
const logFiles = []; const logFiles = [];
fs.readdirSync(logDir).forEach(file => { fs.readdirSync(logDir).forEach((file) => {
if (file.startsWith("crawl-") && file.endsWith(".log")) { if (file.startsWith("crawl-") && file.endsWith(".log")) {
logFiles.push(path.join(logDir, file)); logFiles.push(path.join(logDir, file));
} }
@@ -40,7 +40,9 @@ test("check that log files exist and were filtered according to options", () =>
expect(parsedJSONLines.length).toBeGreaterThan(0); expect(parsedJSONLines.length).toBeGreaterThan(0);
parsedJSONLines.forEach((jsonLine) => { parsedJSONLines.forEach((jsonLine) => {
expect(jsonLine.logLevel === "debug" || jsonLine.logLevel === "warn").toBe(true); expect(
jsonLine.logLevel === "debug" || jsonLine.logLevel === "warn",
).toBe(true);
expect(jsonLine.context).toBe("general"); expect(jsonLine.context).toBe("general");
}); });
} }


@@ -2,24 +2,47 @@ import child_process from "child_process";
import fs from "fs"; import fs from "fs";
test("ensure multi url crawl run with docker run passes", async () => { test("ensure multi url crawl run with docker run passes", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title \"test title\" --description \"test description\" --pages 2 --limit 2"); child_process.execSync(
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2',
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz"); );
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz",
);
}); });
test("check that the favicon made it into the pages jsonl file", () => { test("check that the favicon made it into the pages jsonl file", () => {
expect(fs.existsSync("test-crawls/collections/advanced/pages/pages.jsonl")).toBe(true); expect(
fs.existsSync("test-crawls/collections/advanced/pages/pages.jsonl"),
).toBe(true);
const data1 = JSON.parse(fs.readFileSync("test-crawls/collections/advanced/pages/pages.jsonl", "utf8").split("\n")[1]); const data1 = JSON.parse(
const data2 = JSON.parse(fs.readFileSync("test-crawls/collections/advanced/pages/pages.jsonl", "utf8").split("\n")[2]); fs
.readFileSync(
"test-crawls/collections/advanced/pages/pages.jsonl",
"utf8",
)
.split("\n")[1],
);
const data2 = JSON.parse(
fs
.readFileSync(
"test-crawls/collections/advanced/pages/pages.jsonl",
"utf8",
)
.split("\n")[2],
);
const data = [data1, data2]; const data = [data1, data2];
for (const d of data) { for (const d of data) {
if (d.url === "https://webrecorder.net/") { if (d.url === "https://webrecorder.net/") {
expect(d.favIconUrl).toEqual("https://webrecorder.net/assets/favicon.ico"); expect(d.favIconUrl).toEqual(
"https://webrecorder.net/assets/favicon.ico",
);
} }
if (d.url === "https://iana.org/") { if (d.url === "https://iana.org/") {
expect(d.favIconUrl).toEqual("https://www.iana.org/_img/bookmark_icon.ico"); expect(d.favIconUrl).toEqual(
"https://www.iana.org/_img/bookmark_icon.ico",
);
} }
} }
}); });


@@ -1,14 +1,19 @@
import child_process from "child_process"; import child_process from "child_process";
test("ensure crawl run with redis passes", async () => { test("ensure crawl run with redis passes", async () => {
const redis = child_process.spawn("docker run -d --name test-crawl-redis -p 6379:6379 redis"); const redis = child_process.spawn(
"docker run -d --name test-crawl-redis -p 6379:6379 redis",
);
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection redis-crawl --redisStoreUrl redis://127.0.0.1:6379 --workers 2"); child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection redis-crawl --redisStoreUrl redis://127.0.0.1:6379 --workers 2",
);
redis.kill("SIGINT"); redis.kill("SIGINT");
}); });
test("check that wacz created is valid", () => { test("check that wacz created is valid", () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/redis-crawl/redis-crawl.wacz"); child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/redis-crawl/redis-crawl.wacz",
);
}); });


@@ -28,9 +28,12 @@ test("check crawl interrupted + saved state written", async () => {
const wait = waitForProcess(); const wait = waitForProcess();
try { try {
proc = exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --limit 20", {"shell": "/bin/bash"}, wait.callback); proc = exec(
} "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --limit 20",
catch (error) { { shell: "/bin/bash" },
wait.callback,
);
} catch (error) {
console.log(error); console.log(error);
} }
@@ -45,7 +48,10 @@ test("check crawl interrupted + saved state written", async () => {
while (true) { while (true) {
try { try {
const pages = fs.readFileSync(pagesFile, {encoding: "utf-8"}).trim().split("\n"); const pages = fs
.readFileSync(pagesFile, { encoding: "utf-8" })
.trim()
.split("\n");
if (pages.length >= 2) { if (pages.length >= 2) {
break; break;
@ -61,17 +67,21 @@ test("check crawl interrupted + saved state written", async () => {
await wait.p; await wait.p;
const savedStates = fs.readdirSync("test-crawls/collections/int-state-test/crawls"); const savedStates = fs.readdirSync(
"test-crawls/collections/int-state-test/crawls",
);
expect(savedStates.length > 0).toEqual(true); expect(savedStates.length > 0).toEqual(true);
savedStateFile = savedStates[savedStates.length - 1]; savedStateFile = savedStates[savedStates.length - 1];
}); });
test("check parsing saved state + page done + queue present", () => { test("check parsing saved state + page done + queue present", () => {
expect(savedStateFile).toBeTruthy(); expect(savedStateFile).toBeTruthy();
const savedState = fs.readFileSync(path.join("test-crawls/collections/int-state-test/crawls", savedStateFile), "utf-8"); const savedState = fs.readFileSync(
path.join("test-crawls/collections/int-state-test/crawls", savedStateFile),
"utf-8",
);
const saved = yaml.load(savedState); const saved = yaml.load(savedState);
@ -82,17 +92,19 @@ test("check parsing saved state + page done + queue present", () => {
expect(state.done > 0).toEqual(true); expect(state.done > 0).toEqual(true);
expect(state.queued.length > 0).toEqual(true); expect(state.queued.length > 0).toEqual(true);
}); });
test("check crawl restarted with saved state", async () => { test("check crawl restarted with saved state", async () => {
let proc = null; let proc = null;
const wait = waitForProcess(); const wait = waitForProcess();
try { try {
proc = exec(`docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`, {shell: "/bin/bash"}, wait.callback); proc = exec(
`docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`,
{ shell: "/bin/bash" },
wait.callback,
);
} catch (error) { } catch (error) {
console.log(error); console.log(error);
} }
@ -106,7 +118,7 @@ test("check crawl restarted with saved state", async () => {
maxRetriesPerRequest: 100, maxRetriesPerRequest: 100,
retryStrategy(times) { retryStrategy(times) {
return times < 100 ? 1000 : null; return times < 100 ? 1000 : null;
} },
}); });
await new Promise((resolve) => setTimeout(resolve, 2000)); await new Promise((resolve) => setTimeout(resolve, 2000));
@ -126,5 +138,3 @@ test("interrupt crawl and exit", async () => {
expect(res[0].value).toBe(0); expect(res[0].value).toBe(0);
}); });
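The saved state these tests parse is a YAML document whose state section carries the crawl queue. A minimal sketch inferred only from the fields the assertions read (state.done, state.queued); the exact shape may differ:

  state:
    done: 3          # count of pages already crawled
    queued:          # serialized entries still pending
      - '{"url": "https://webrecorder.net/about", "depth": 1}'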
@@ -23,12 +23,10 @@ seeds:
`);
  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("prefix");
  expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
  expect(seeds[0].exclude).toEqual([]);
});

test("default scope + exclude", async () => {

@@ -40,15 +38,12 @@ exclude: https://example.com/pathexclude
`);
  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("prefix");
  expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
  expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});

test("default scope + exclude is numeric", async () => {
  const seeds = getSeeds(`
seeds:

@@ -58,17 +53,12 @@ exclude: "2022"
`);
  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("prefix");
  expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
  expect(seeds[0].exclude).toEqual([/2022/]);
});

test("prefix scope global + exclude", async () => {
  const seeds = getSeeds(`
seeds:

@@ -79,15 +69,12 @@ exclude: https://example.com/pathexclude
`);
  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("prefix");
  expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
  expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});

test("prefix scope per seed + exclude", async () => {
  const seeds = getSeeds(`
seeds:

@@ -98,15 +85,12 @@ exclude: https://example.com/pathexclude
`);
  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("prefix");
  expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
  expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});

test("host scope and domain scope", async () => {
  const seeds = getSeeds(`

@@ -123,20 +107,26 @@ seeds:
  expect(seeds[0].include).toEqual([/^https?:\/\/([^/]+\.)*example\.com\//]);
  expect(!!seeds[0].include[0].exec("https://example.com/")).toEqual(true);
  expect(!!seeds[0].include[0].exec("https://example.com/path")).toEqual(true);
  expect(!!seeds[0].include[0].exec("https://sub.example.com/path")).toEqual(
    true,
  );
  expect(
    !!seeds[0].include[0].exec("https://sub.domain.example.com/path"),
  ).toEqual(true);
  expect(
    !!seeds[0].include[0].exec("https://notsub.domainexample.com/path"),
  ).toEqual(false);

  expect(seeds[1].scopeType).toEqual("host");
  expect(seeds[1].include).toEqual([/^https?:\/\/example\.org\//]);
  expect(!!seeds[1].include[0].exec("https://example.org/")).toEqual(true);
  expect(!!seeds[1].include[0].exec("https://example.org/path")).toEqual(true);
  expect(!!seeds[1].include[0].exec("https://sub.example.com/path")).toEqual(
    false,
  );
});

test("domain scope drop www.", async () => {
  const seeds = getSeeds(`
seeds:
  - url: https://www.example.com/

@@ -146,11 +136,8 @@ seeds:
  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("domain");
  expect(seeds[0].include).toEqual([/^https?:\/\/([^/]+\.)*example\.com\//]);
});

test("custom scope", async () => {
  const seeds = getSeeds(`
seeds:

@@ -159,14 +146,12 @@ seeds:
exclude: https?://example.com/pathexclude
`);
  expect(seeds.length).toEqual(1);
  expect(seeds[0].scopeType).toEqual("custom");
  expect(seeds[0].include).toEqual([/https?:\/\/example.com\/(path|other)/]);
  expect(seeds[0].exclude).toEqual([/https?:\/\/example.com\/pathexclude/]);
});

test("inherit scope", async () => {
  const seeds = getSeeds(`

@@ -178,7 +163,6 @@ include: https?://example.com/(path|other)
exclude: https://example.com/pathexclude
`);
  expect(seeds.length).toEqual(2);

  expect(seeds[0].scopeType).toEqual("custom");

@@ -190,10 +174,8 @@ exclude: https://example.com/pathexclude
  expect(seeds[1].url).toEqual("https://example.com/2");
  expect(seeds[1].include).toEqual([/https?:\/\/example.com\/(path|other)/]);
  expect(seeds[1].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);
});

test("override scope", async () => {
  const seeds = getSeeds(`

@@ -225,7 +207,10 @@ include: https://example.com/onlythispath
  expect(seeds[2].scopeType).toEqual("prefix");
  expect(seeds[2].url).toEqual("https://example.com/subpath/file.html");
  expect(seeds[2].include).toEqual([
    /^https?:\/\/example\.com\/subpath\//,
    /https:\/\/example.com\/onlythispath/,
  ]);
  expect(seeds[2].exclude).toEqual([]);

  expect(seeds[3].scopeType).toEqual("custom");

@@ -234,7 +219,6 @@ include: https://example.com/onlythispath
  expect(seeds[3].exclude).toEqual([]);
});

test("override scope with exclude", async () => {
  const seeds = getSeeds(`

@@ -288,10 +272,8 @@ exclude:
  expect(seeds[4].url).toEqual("https://example.com/4");
  expect(seeds[4].include).toEqual([]);
  expect(seeds[4].exclude).toEqual([]);
});

test("with exclude non-string types", async () => {
  const seeds = getSeeds(`
seeds:

@@ -342,5 +324,4 @@ seeds:
  expect(seeds[7].exclude).toEqual([/null/]);
  expect(seeds[8].exclude).toEqual([/false/]);
  expect(seeds[9].exclude).toEqual([/true/]);
});
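As these assertions illustrate, a "prefix" seed is compiled into a single anchored include regex derived from the seed URL, so scope membership reduces to a regex test. A quick sketch of that check, with the regex copied from the assertions above:

  const include = /^https?:\/\/example\.com\//;
  include.test("https://example.com/path"); // true: same scheme + host prefix
  include.test("https://other.com/");       // false: different host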
@@ -4,48 +4,66 @@ import fs from "fs";
// screenshot

test("ensure basic crawl run with --screenshot passes", async () => {
  child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection test --url http://www.example.com/ --screenshot view --workers 2",
  );
});

test("check that a screenshots warc file exists in the test collection", () => {
  const screenshotWarcExists = fs.existsSync(
    "test-crawls/collections/test/archive/screenshots.warc.gz",
  );
  expect(screenshotWarcExists).toBe(true);
});

// fullPageScreenshot

test("ensure basic crawl run with --fullPageScreenshot passes", async () => {
  child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection fullpage --url http://www.example.com/ --screenshot fullPage --workers 2",
  );
});

test("check that a screenshots warc file exists in the fullpage collection", () => {
  const screenshotWarcExists = fs.existsSync(
    "test-crawls/collections/fullpage/archive/screenshots.warc.gz",
  );
  expect(screenshotWarcExists).toBe(true);
});

// thumbnail

test("ensure basic crawl run with --thumbnail passes", async () => {
  child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection thumbnail --url http://www.example.com/ --screenshot thumbnail --workers 2",
  );
});

test("check that a screenshots warc file exists in the thumbnail collection", () => {
  const screenshotWarcExists = fs.existsSync(
    "test-crawls/collections/thumbnail/archive/screenshots.warc.gz",
  );
  expect(screenshotWarcExists).toBe(true);
});

// combination

test("ensure basic crawl run with multiple screenshot types and --generateWACZ passes", async () => {
  child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection combined --url http://www.example.com/ --screenshot thumbnail,view,fullPage --generateWACZ --workers 2",
  );
});

test("check that a screenshots warc file exists in the combined collection", () => {
  const screenshotWarcExists = fs.existsSync(
    "test-crawls/collections/combined/archive/screenshots.warc.gz",
  );
  expect(screenshotWarcExists).toBe(true);
});

test("check that a wacz file exists in the combined collection", () => {
  const waczExists = fs.existsSync(
    "test-crawls/collections/combined/combined.wacz",
  );
  expect(waczExists).toBe(true);
});
@@ -3,11 +3,12 @@ import {exec as execCallback } from "child_process";
const exec = util.promisify(execCallback);

test("ensure one invalid seed doesn't end crawl if failOnFailedSeed is not set", async () => {
  let passed = true;
  try {
    await exec(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection invalidseed",
    );
  } catch (error) {
    console.log(error);
    passed = false;

@@ -18,9 +19,10 @@ test("ensure one invalid seed doesn't end crawl if failOnFailedSeed is not set",
test("ensure one invalid seed fails crawl if failOnFailedSeed is set", async () => {
  let passed = true;
  try {
    await exec(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --failOnFailedSeed --collection failseed",
    );
  } catch (error) {
    passed = false;
  }
  expect(passed).toBe(false);

@@ -29,9 +31,10 @@ test("ensure one invalid seed fails crawl if failOnFailedSeed is set", async ()
test("ensure crawl fails if no valid seeds are passed", async () => {
  let passed = true;
  try {
    await exec(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection allinvalidseeds",
    );
  } catch (error) {
    passed = false;
  }
  expect(passed).toBe(false);
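These tests lean on a detail of Node's promisified exec: the returned promise rejects whenever the child process exits non-zero, which is what flips passed to false in the catch blocks. A minimal standalone sketch of that behavior:

  import util from "util";
  import { exec as execCallback } from "child_process";
  const exec = util.promisify(execCallback);

  try {
    // "exit 1" makes the child exit non-zero, so the await throws
    await exec("exit 1", { shell: "/bin/bash" });
  } catch (error) {
    console.log("non-zero exit rejects:", error.code); // 1
  }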
@@ -1,5 +1,7 @@
import {
  calculatePercentageUsed,
  checkDiskUtilization,
} from "../dist/util/storage.js";

test("ensure calculatePercentageUsed returns expected values", () => {
  expect(calculatePercentageUsed(30, 100)).toEqual(30);

@@ -13,13 +15,11 @@ test("ensure calculatePercentageUsed returns expected values", () => {
  expect(calculatePercentageUsed(0, 5)).toEqual(0);
});

test("verify end-to-end disk utilization not exceeded threshold", async () => {
  const params = {
    diskUtilization: 90,
    combineWARC: true,
    generateWACZ: true,
  };

  const mockDfOutput = `\

@@ -28,22 +28,24 @@ grpcfuse 1000000 285000 715000 28% /crawls`;
  // with combineWARC + generateWACZ, projected is 285k + 4 * 5k = 310k = 31%
  // does not exceed 90% threshold
  const returnValue = await checkDiskUtilization(
    params,
    5000 * 1024,
    mockDfOutput,
  );
  expect(returnValue).toEqual({
    stop: false,
    used: 28,
    projected: 31,
    threshold: 90,
  });
});

test("verify end-to-end disk utilization exceeds threshold", async () => {
  const params = {
    diskUtilization: 90,
    combineWARC: false,
    generateWACZ: true,
  };

  const mockDfOutput = `\

@@ -52,11 +54,15 @@ grpcfuse 100000 85000 15000 85% /crawls`;
  // with generateWACZ, projected is 85k + 3k x 2 = 91k = 91%
  // exceeds 90% threshold
  const returnValue = await checkDiskUtilization(
    params,
    3000 * 1024,
    mockDfOutput,
  );
  expect(returnValue).toEqual({
    stop: true,
    used: 85,
    projected: 91,
    threshold: 90,
  });
});
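Spelling out the arithmetic behind the second case: the mocked df output reports 85000 of 100000 KB used (85%), and the WARC data to be written is 3000 KB. Assuming df reports KB and the "x 2" in the comment covers the extra WACZ copy, the projection works out as in this sketch:

  const used = 85000;                       // KB, from mockDfOutput
  const projected = used + 2 * 3000;        // WARC + WACZ copy => 91000 KB
  console.log((projected / 100000) * 100);  // 91 (%), above the 90 threshold
  // hence the expected result: { stop: true, used: 85, projected: 91 }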
@@ -3,16 +3,20 @@ import child_process from "child_process";
test("check that urn:text and urn:textfinal records are written to WARC", async () => {
  try {
    child_process.execSync(
      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc",
    );
  } catch (error) {
    //console.log(new TextDecoder().decode(error));
    console.log(error.stderr);
  }

  const data = fs.readFileSync(
    "test-crawls/collections/text-extract/indexes/index.cdxj",
    { encoding: "utf-8" },
  );

  expect(data.indexOf("urn:text:https://www.nytimes.com/") > 0).toBe(true);
  expect(data.indexOf("urn:textFinal:https://www.nytimes.com/") > 0).toBe(true);
});
@@ -6,15 +6,21 @@ const exec = util.promisify(execCallback);
test("check that URLs in seed-list are crawled", async () => {
  try {
    await exec(
      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000",
    );
  } catch (error) {
    console.log(error);
  }

  let crawled_pages = fs.readFileSync(
    "test-crawls/collections/filelisttest/pages/pages.jsonl",
    "utf8",
  );
  let seed_file = fs
    .readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")
    .split("\n")
    .sort();

  let seed_file_list = [];
  for (var j = 0; j < seed_file.length; j++) {
@@ -5,15 +5,19 @@ import child_process from "child_process";
test("check that the warcinfo file works as expected on the command line", async () => {
  try {
    const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
    const proc = child_process.execSync(
      "docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC",
      { input: configYaml, stdin: "inherit", encoding: "utf8" },
    );
    console.log(proc);
  } catch (error) {
    console.log(error);
  }

  const warcData = fs.readFileSync(
    "test-crawls/collections/warcinfo/warcinfo_0.warc.gz",
  );

  const data = zlib.gunzipSync(warcData);

@@ -21,8 +25,8 @@ test("check that the warcinfo file works as expected on the command line", async
  expect(string.indexOf("operator: test")).toBeGreaterThan(-1);
  expect(string.indexOf("host: hostname")).toBeGreaterThan(-1);
  expect(
    string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/),
  ).not.toEqual(null);
  expect(string.indexOf("format: WARC File Format 1.0")).toBeGreaterThan(-1);
});
@@ -11,8 +11,12 @@
    // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */

    /* Language and Environment */
    "target": "es2022" /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */,
    "lib": [
      "es2022",
      "dom",
      "dom.iterable"
    ] /* Specify a set of bundled library declaration files that describe the target runtime environment. */,
    // "jsx": "preserve", /* Specify what JSX code is generated. */
    // "experimentalDecorators": true, /* Enable experimental support for TC39 stage 2 draft decorators. */
    // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */

@@ -25,9 +29,9 @@
    // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */

    /* Modules */
    "module": "NodeNext" /* Specify what module code is generated. */,
    "rootDir": "./src" /* Specify the root folder within your source files. */,
    "moduleResolution": "NodeNext" /* Specify how TypeScript looks up a file from a given module specifier. */,
    //"baseUrl": "./src", /* Specify the base directory to resolve non-relative module names. */
    // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
    // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */

@@ -39,8 +43,8 @@
    // "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */

    /* JavaScript Support */
    "allowJs": true /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */,
    "checkJs": true /* Enable error reporting in type-checked JavaScript files. */,
    // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */

    /* Emit */

@@ -49,7 +53,7 @@
    // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
    // "sourceMap": true, /* Create source map files for emitted JavaScript files. */
    // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
    "outDir": "./dist/" /* Specify an output folder for all emitted files. */,
    // "removeComments": true, /* Disable emitting comments. */
    // "noEmit": true, /* Disable emitting files from a compilation. */
    // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */

@@ -73,10 +77,10 @@
    // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
    //"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
    // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
    "forceConsistentCasingInFileNames": true /* Ensure that casing is correct in imports. */,

    /* Type Checking */
    "strict": true /* Enable all strict type-checking options. */,
    // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
    // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
    // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */

@@ -101,7 +105,5 @@
    "skipLibCheck": true /* Skip type checking all .d.ts files. */
  },
  "include": ["src/**/*"]
}
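One formatting quirk visible throughout this file: when a value is followed by an inline block comment, Prettier moves the separating comma to after the comment, turning the first line below into the second (a small illustrative sketch, not repo code):

  "target": "es2022", /* language version */
  "target": "es2022" /* language version */,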
@@ -1914,6 +1914,11 @@ escodegen@^2.1.0:
  optionalDependencies:
    source-map "~0.6.1"

eslint-config-prettier@^9.0.0:
  version "9.0.0"
  resolved "https://registry.yarnpkg.com/eslint-config-prettier/-/eslint-config-prettier-9.0.0.tgz#eb25485946dd0c66cd216a46232dc05451518d1f"
  integrity sha512-IcJsTkJae2S35pRsRAwoCE+925rJJStOdkKnLVgtE+tEpqU0EVVM7OqrwxqgptKdX29NUwC82I5pXsGFIgSevw==

eslint-plugin-react@^7.22.0:
  version "7.23.2"
  resolved "https://registry.yarnpkg.com/eslint-plugin-react/-/eslint-plugin-react-7.23.2.tgz#2d2291b0f95c03728b55869f01102290e792d494"

@@ -3829,6 +3834,11 @@ prelude-ls@^1.2.1:
  resolved "https://registry.yarnpkg.com/prelude-ls/-/prelude-ls-1.2.1.tgz#debc6489d7a6e6b0e7611888cec880337d316396"
  integrity sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==

prettier@3.0.3:
  version "3.0.3"
  resolved "https://registry.yarnpkg.com/prettier/-/prettier-3.0.3.tgz#432a51f7ba422d1469096c0fdc28e235db8f9643"
  integrity sha512-L/4pUDMxcNa8R/EthV08Zt42WBO4h1rarVtK0K+QJG0X187OLo7l699jWw0GKuwzkPQ//jMFA/8Xm6Fh3J/DAg==

pretty-format@^29.2.1:
  version "29.2.1"
  resolved "https://registry.yarnpkg.com/pretty-format/-/pretty-format-29.2.1.tgz#86e7748fe8bbc96a6a4e04fa99172630907a9611"