Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 14:33:17 +00:00)

Add Prettier to the repo, and format all the files! (#428)

This adds Prettier to the repo and sets up the pre-commit hook to auto-format as well as lint. It also updates the ignore files to exclude crawls, test-crawls, scratch, and dist as needed.

Commit 2a49406df7 (parent af1e0860e4)
70 changed files with 3192 additions and 2026 deletions
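For orientation, the substance of the change is: Prettier 3.0.3 and eslint-config-prettier become devDependencies, new `format`/`format:fix` scripts wrap `prettier . --check` and `prettier . --write`, `lint:fix` chains formatting and ESLint autofix, the Husky pre-commit hook switches from `yarn lint` to `yarn lint:fix`, and CI runs `yarn lint && yarn format`. A rough sketch of the resulting local workflow, with command names taken from the diff below (which files Prettier actually touches depends on `.prettierignore`, only partially captured in this mirror):

```sh
# Check formatting and linting the way CI does
yarn format        # prettier . --check
yarn lint          # eslint .

# Rewrite files in place, as the pre-commit hook now does
yarn lint:fix      # effectively: prettier . --write && eslint . --fix
```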
@@ -5,7 +5,11 @@ module.exports = {
     node: true,
     jest: true,
   },
-  extends: ["eslint:recommended", "plugin:@typescript-eslint/recommended"],
+  extends: [
+    "eslint:recommended",
+    "plugin:@typescript-eslint/recommended",
+    "prettier",
+  ],
   parser: "@typescript-eslint/parser",
   plugins: ["@typescript-eslint"],
   parserOptions: {
@@ -13,10 +17,6 @@ module.exports = {
     sourceType: "module",
   },
   rules: {
-    indent: ["error", 2],
-    "linebreak-style": ["error", "unix"],
-    quotes: ["error", "double"],
-    semi: ["error", "always"],
    "no-constant-condition": ["error", { checkLoops: false }],
    "no-use-before-define": [
      "error",
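The rules dropped from the second hunk of this ESLint config (`indent`, `linebreak-style`, `quotes`, `semi`) are purely stylistic; after this commit Prettier owns formatting, and the new "prettier" entry in `extends` is eslint-config-prettier (added to devDependencies later in this diff), which switches off remaining ESLint rules that would conflict with Prettier. If you want to double-check for leftover conflicts, eslint-config-prettier ships a small CLI; the invocation below is a hedged sketch, and the file argument is just an example source file from this repo:

```sh
# Assumed usage of eslint-config-prettier's conflict checker against one file in the project
npx eslint-config-prettier src/crawler.ts
```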
.github/workflows/ci.yaml (9 changes, vendored)

@@ -6,7 +6,6 @@ on:

 jobs:
   lint:

     runs-on: ubuntu-latest

     strategy:
@@ -22,10 +21,9 @@ jobs:
       - name: install requirements
         run: yarn install
       - name: run linter
-        run: yarn lint
+        run: yarn lint && yarn format

   build:

     runs-on: ubuntu-latest

     strategy:
@@ -46,8 +44,3 @@ jobs:
         run: docker-compose build
       - name: run jest
         run: sudo yarn test
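One practical consequence of the CI change above: the lint job now fails not only on ESLint errors but also whenever any file is not Prettier-formatted, because `yarn format` runs Prettier in check mode and exits non-zero without rewriting anything. The updated step effectively runs:

```sh
# The updated "run linter" CI step; format is check-only here, fixes are applied locally via lint:fix
yarn lint && yarn format
```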
.github/workflows/release.yaml (22 changes, vendored)

@@ -8,12 +8,10 @@ jobs:
     name: Build x86 and ARM Images and push to Dockerhub
     runs-on: ubuntu-22.04
     steps:
-      -
-        name: Check out the repo
+      - name: Check out the repo
         uses: actions/checkout@v4

-      -
-        name: Docker image metadata
+      - name: Docker image metadata
         id: meta
         uses: docker/metadata-action@v5
         with:
@@ -21,23 +19,19 @@ jobs:
           tags: |
             type=semver,pattern={{version}}

-      -
-        name: Set up QEMU
+      - name: Set up QEMU
         uses: docker/setup-qemu-action@v3
         with:
           platforms: arm64

-      -
-        name: Set up Docker Buildx
+      - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v1
-      -
-        name: Login to DockerHub
+      - name: Login to DockerHub
         uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
-      -
-        name: Build and push
+      - name: Build and push
         id: docker_build
         uses: docker/build-push-action@v3
         with:
@@ -45,7 +39,5 @@ jobs:
           push: true
           tags: ${{ steps.meta.outputs.tags }}
           platforms: "linux/amd64,linux/arm64"
-      -
-        name: Image digest
+      - name: Image digest
         run: echo ${{ steps.docker_build.outputs.digest }}
.gitignore (1 change, vendored)

@@ -6,3 +6,4 @@ node_modules/
 crawls/
 test-crawls/
 .DS_Store
+dist
@@ -1,4 +1,4 @@
 #!/usr/bin/env sh
 . "$(dirname -- "$0")/_/husky.sh"

-yarn lint
+yarn lint:fix
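Per the commit description, the hunk above is the Husky pre-commit hook: instead of only failing on lint errors, it now auto-formats and auto-fixes before each commit. Read together with the package.json scripts further down, the resulting hook file would look roughly like this (the `.husky/pre-commit` path is the Husky default and is assumed, since the mirror dropped the filename):

```sh
#!/usr/bin/env sh
# Husky bootstrap, as in the hunk above
. "$(dirname -- "$0")/_/husky.sh"

# yarn lint:fix expands to "yarn format:fix && eslint . --fix", i.e. Prettier write mode, then ESLint autofix
yarn lint:fix
```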
.prettierignore (1 change, new file)

@@ -0,0 +1 @@
+dist
CHANGES.md (18 changes)
|
@ -1,11 +1,13 @@
|
|||
## CHANGES
|
||||
|
||||
v0.8.1
|
||||
|
||||
- Logging and Behavior Tweaks by @ikreymer in https://github.com/webrecorder/browsertrix-crawler/pull/229
|
||||
- Fix typos by @stavares843 in https://github.com/webrecorder/browsertrix-crawler/pull/232
|
||||
- Add crawl log to WACZ by @ikreymer in https://github.com/webrecorder/browsertrix-crawler/pull/231
|
||||
|
||||
v0.8.0
|
||||
|
||||
- Switch to Chrome/Chromium 109
|
||||
- Convert to ESM module
|
||||
- Add ad blocking via request interception (#173)
|
||||
|
@ -25,11 +27,13 @@ v0.8.0
|
|||
- update behaviors to 0.4.1, rename 'Behavior line' -> 'Behavior log' by @ikreymer in https://github.com/webrecorder/browsertrix-crawler/pull/223
|
||||
|
||||
v0.7.1
|
||||
|
||||
- Fix for warcio.js by @ikreymer in #178
|
||||
- Guard against pre-existing user/group by @edsu in #176
|
||||
- Fix incorrect combineWARCs property in README.md by @Georift in #180
|
||||
|
||||
v0.7.0
|
||||
|
||||
- Update to Chrome/Chromium 101 - (0.7.0 Beta 0) by @ikreymer in #144
|
||||
- Add --netIdleWait, bump dependencies (0.7.0-beta.2) by @ikreymer in #145
|
||||
- Update README.md by @atomotic in #147
|
||||
|
@ -41,7 +45,6 @@ v0.7.0
|
|||
- Interrupt Handling Fixes by @ikreymer in #167
|
||||
- Run in Docker as User by @edsu in #171
|
||||
|
||||
|
||||
v0.6.0
|
||||
|
||||
- Add a --waitOnDone option, which has browsertrix crawler wait when finished (for use with Browsertrix Cloud)
|
||||
|
@ -56,8 +59,8 @@ v0.6.0
|
|||
- Fixes to interrupting a single instance in a shared state crawl
|
||||
- force all cookies, including session cookies, to fixed duration in days, configurable via --cookieDays
|
||||
|
||||
|
||||
v0.5.0
|
||||
|
||||
- Scope: support for `scopeType: domain` to include all subdomains and ignoring 'www.' if specified in the seed.
|
||||
- Profiles: support loading remote profile from URL as well as local file
|
||||
- Non-HTML Pages: Load non-200 responses in browser, even if non-html, fix waiting issues with non-HTML pages (eg. PDFs)
|
||||
|
@ -75,8 +78,8 @@ v0.5.0
|
|||
- Signing: Support for optional signing of WACZ
|
||||
- Dependencies: update to latest pywb, wacz and browsertrix-behaviors packages
|
||||
|
||||
|
||||
v0.4.4
|
||||
|
||||
- Page Block Rules Fix: 'request already handled' errors by avoiding adding duplicate handlers to same page.
|
||||
- Page Block Rules Fix: await all continue/abort() calls and catch errors.
|
||||
- Page Block Rules: Don't apply to top-level page, print warning and recommend scope rules instead.
|
||||
|
@ -86,11 +89,13 @@ v0.4.4
|
|||
- README: Update old type -> scopeType, list new scope types.
|
||||
|
||||
v0.4.3
|
||||
|
||||
- BlockRules Fixes: When considering the 'inFrameUrl' for a navigation request for an iframe, use URL of parent frame.
|
||||
- BlockRules Fixes: Always allow pywb proxy scripts.
|
||||
- Logging: Improved debug logging for block rules (log blocked requests and conditional iframe requests) when 'debug' set in 'logging'
|
||||
|
||||
v0.4.2
|
||||
|
||||
- Compose/docs: Build latest image by default, update README to refer to latest image
|
||||
- Fix typo in `crawler.capturePrefix` that resulted in `directFetchCapture()` always failing
|
||||
- Tests: Update all tests to use `test-crawls` directory
|
||||
|
@ -98,6 +103,7 @@ v0.4.2
|
|||
- loadPage() accepts a list of selector options with selector, extract, and isAttribute settings for further customization of link extraction
|
||||
|
||||
v0.4.1
|
||||
|
||||
- BlockRules Optimizations: don't intercept requests if no blockRules
|
||||
- Profile Creation: Support extending existing profile by passing a --profile param to load on startup
|
||||
- Profile Creation: Set default window size to 1600x900, add --windowSize param for setting custom size
|
||||
|
@ -107,6 +113,7 @@ v0.4.1
|
|||
- CI: Build a multi-platform (amd64 and arm64) image on each release
|
||||
|
||||
v0.4.0
|
||||
|
||||
- YAML based config, specifyable via --config property or via stdin (with '--config stdin')
|
||||
- Support for different scope types ('page', 'prefix', 'host', 'any', 'none') + crawl depth at crawl level
|
||||
- Per-Seed scoping, including different scope types, or depth and include/exclude rules configurable per seed in 'seeds' list via YAML config
|
||||
|
@ -120,16 +127,17 @@ v0.4.0
|
|||
- Update to latest pywb (2.5.0b4), browsertrix-behaviors (0.2.3), py-wacz (0.3.1)
|
||||
|
||||
v0.3.2
|
||||
|
||||
- Added a `--urlFile` option: Allows users to specify a .txt file list of exact URLs to crawl (one URL per line).
|
||||
|
||||
|
||||
v0.3.1
|
||||
|
||||
- Improved shutdown wait: Instead of waiting for 5 secs, wait until all pending requests are written to WARCs
|
||||
- Bug fix: Use async APIs for combine WARC to avoid spurious issues with multiple crawls
|
||||
- Behaviors Update to Behaviors to 0.2.1, with support for facebook pages
|
||||
|
||||
|
||||
v0.3.0
|
||||
|
||||
- WARC Combining: `--combineWARC` and `--rolloverSize` flags for generating combined WARC at end of crawl, each WARC upto specified rolloverSize
|
||||
- Profiles: Support for creating reusable browser profiles, stored as tarballs, and running crawl with a login profile (see README for more info)
|
||||
- Behaviors: Switch to Browsertrix Behaviors v0.1.1 for in-page behaviors
|
||||
|
|
README.md (34 changes)
|
@ -51,7 +51,6 @@ Browsertrix Crawler includes a number of additional command-line options, explai
|
|||
|
||||
## Crawling Configuration Options
|
||||
|
||||
|
||||
<details>
|
||||
<summary><b>The Browsertrix Crawler docker image currently accepts the following parameters:</b></summary>
|
||||
|
||||
|
@ -269,8 +268,8 @@ Options:
|
|||
ess (for debugging) [boolean]
|
||||
--config Path to YAML config file
|
||||
```
|
||||
</details>
|
||||
|
||||
</details>
|
||||
|
||||
### Waiting for Page Load
|
||||
|
||||
|
@ -282,14 +281,12 @@ See [page.goto waitUntil options](https://pptr.dev/api/puppeteer.page.goto#remar
|
|||
|
||||
The `--pageLoadTimeout`/`--timeout` option sets the timeout in seconds for page load, defaulting to 90 seconds. Behaviors will run on the page once either the page load condition or the page load timeout is met, whichever happens first.
|
||||
|
||||
|
||||
### YAML Crawl Config
|
||||
|
||||
Browsertrix Crawler supports the use of a YAML file to set parameters for a crawl. This can be used by passing a valid YAML file to the `--config` option.
|
||||
|
||||
The YAML file can contain the same parameters as the command-line arguments. If a parameter is set on the command-line and in the yaml file, the value from the command-line will be used. For example, the following should start a crawl with config in `crawl-config.yaml`.
|
||||
|
||||
|
||||
```
|
||||
docker run -v $PWD/crawl-config.yaml:/app/crawl-config.yaml -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --config /app/crawl-config.yaml
|
||||
```
|
||||
|
@ -300,7 +297,6 @@ The config can also be passed via stdin, which can simplify the command. Note th
|
|||
cat ./crawl-config.yaml | docker run -i -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --config stdin
|
||||
```
|
||||
|
||||
|
||||
An example config file (eg. crawl-config.yaml) might contain:
|
||||
|
||||
```
|
||||
|
@ -361,7 +357,6 @@ To make this configuration as simple as possible, there are several predefined s
|
|||
The scope settings for multi-page crawls (page-spa, prefix, host, domain) also include http/https versions, eg. given a prefix of `http://example.com/path/`,
|
||||
`https://example.com/path/` is also included.
|
||||
|
||||
|
||||
#### Custom Scope Inclusion Rules
|
||||
|
||||
Instead of setting a scope type, it is possible to instead configure custom scope regex by setting `--include` config to one or more regular expressions.
|
||||
|
@ -375,7 +370,6 @@ In addition to the inclusion rules, Browsertrix Crawler supports a separate list
|
|||
|
||||
The exclusion regexes are often used with a custom scope, but could be used with a predefined scopeType as well.
|
||||
|
||||
|
||||
#### Extra 'Hops' Beyond Current Scope
|
||||
|
||||
Occasionally, it may be useful to augment the scope by allowing extra links N 'hops' beyond the current scope.
|
||||
|
@ -385,7 +379,6 @@ For example, this is most useful when crawling with a `host` or `prefix` scope,
|
|||
The `--extraHops` setting can be set globally or per seed to allow expanding the current inclusion scope N 'hops' beyond the configured scope. Note that this mechanism only expands the inclusion scope, and any exclusion rules are still applied. If a URL is to be excluded via the exclusion rules,
|
||||
that will take precedence over the `--extraHops`.
|
||||
|
||||
|
||||
#### Scope Rule Examples
|
||||
|
||||
For example, the following seed will start on `https://example.com/startpage.html` and crawl all pages on the `https://example.com/` domain, except pages that match the regexes `example.com/skip.*` or `example.com/search.*`
|
||||
|
@ -456,27 +449,24 @@ If the `--blockMessage` is also specified, a blocked URL is replaced with the sp
|
|||
|
||||
If it seems confusing which rules should be used, here is a quick way to determine:
|
||||
|
||||
- If you'd like to restrict *the pages that are being crawled*, use the crawl scope rules (defined above).
|
||||
- If you'd like to restrict _the pages that are being crawled_, use the crawl scope rules (defined above).
|
||||
|
||||
- If you'd like to restrict *parts of a page* that are being loaded, use the page resource block rules described in this section.
|
||||
- If you'd like to restrict _parts of a page_ that are being loaded, use the page resource block rules described in this section.
|
||||
|
||||
The blockRules add a filter to each URL loaded on a page and incur extra overhead. They should only be used in advanced use cases where part of a page needs to be blocked.
|
||||
|
||||
These rules cannot be used to prevent entire pages from loading -- use the scope exclusion rules for that. (A warning will be printed if a page resource block rule matches a top-level page).
|
||||
|
||||
|
||||
### Ad blocking
|
||||
|
||||
With version 0.8.0, Browsertrix Crawler supports blocking ads from being loaded during capture based on [Stephen Black's list of known ad hosts](https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts). To enable ad blocking, use the `--blockAds` option. If `--adBlockMessage` is set, a record with the specified error message will be added in the ad's place.
|
||||
|
||||
|
||||
### Custom Warcinfo Fields
|
||||
|
||||
Custom fields can be added to the `warcinfo` WARC record, generated for each combined WARC. The fields can be specified in the YAML config under the `warcinfo` section or individually via the command-line.
|
||||
|
||||
For example, the following are equivalent ways to add additional warcinfo fields:
|
||||
|
||||
|
||||
via yaml config:
|
||||
|
||||
```yaml
|
||||
|
@ -622,7 +612,6 @@ docker run -e CHROME_FLAGS="--disable-extensions-except=/ext/ublock --load-exten
|
|||
|
||||
You can also directly use extensions from an existing chrome-profile by using e.g. `~/.config/chromium/Default/Extensions/cjpalhdlnbpafiamejdnhcphjbkeiagm/1.41.8_0/` as the path.
|
||||
|
||||
|
||||
## Saving Crawl State: Interrupting and Restarting the Crawl
|
||||
|
||||
With version 0.5.0, a crawl can be gracefully interrupted with Ctrl-C (SIGINT) or a SIGTERM.
|
||||
|
@ -642,13 +631,11 @@ or `never` respectively, to control when the crawl state file should be written.
|
|||
When the `--saveState` is set to always, Browsertrix Crawler will also save the state automatically during the crawl, as set by the `--saveStateInterval` setting.
|
||||
The crawler will keep the last `--saveStateHistory` save states and delete older ones. This provides an extra backup: if the crawl fails unexpectedly or is not terminated via Ctrl-C, several previous crawl states are still available.
|
||||
|
||||
|
||||
## Creating and Using Browser Profiles
|
||||
|
||||
Browsertrix Crawler also includes a way to use existing browser profiles when running a crawl. This allows pre-configuring the browser, such as by logging in
|
||||
to certain sites or setting other settings, and running a crawl exactly with those settings. By creating a logged in profile, the actual login credentials are not included in the crawl, only (temporary) session cookies.
|
||||
|
||||
|
||||
### Interactive Profile Creation
|
||||
|
||||
For creating profiles of more complex sites, or logging in to multiple sites at once, the interactive profile creation mode can be used.
|
||||
|
@ -719,7 +706,6 @@ The script will then prompt you for login credentials, attempt to login and crea
|
|||
|
||||
- To specify the window size for the profile creation embedded browser, specify `--windowSize WIDTH,HEIGHT`. (The default is 1600x900)
|
||||
|
||||
|
||||
The current profile creation script is still experimental and the script attempts to detect the username and password fields on a site as generically as possible, but may not work for all sites. Additional automated profile creation functionality, such as support for custom profile creation scripts, may be added in the future.
|
||||
|
||||
### Using Browser Profile with a Crawl
|
||||
|
@ -743,7 +729,6 @@ All released Docker Images are available from Docker Hub, listed by release tag
|
|||
|
||||
Details for each corresponding release tag are also available on GitHub at: https://github.com/webrecorder/browsertrix-crawler/releases
|
||||
|
||||
|
||||
## Architecture
|
||||
|
||||
The Docker container provided here packages up several components used in Browsertrix.
|
||||
|
@ -752,7 +737,6 @@ The system uses `pywb` in recording mode for capturing the content. The crawl pr
|
|||
|
||||
To access the contents of the crawl, the `/crawls` directory in the container should be mounted to a volume (default in the Docker Compose setup).
|
||||
|
||||
|
||||
### Usage with Docker Compose
|
||||
|
||||
Many examples in this README demonstrate running Browsertrix Crawler with `docker run`.
|
||||
|
@ -775,10 +759,8 @@ docker-compose run crawler crawl --url https://webrecorder.net/ --generateCDX --
|
|||
|
||||
In this example, the crawl data is written to `./crawls/collections/wr-net` by default.
|
||||
|
||||
|
||||
While the crawl is running, the status of the crawl prints the progress to the JSON log output. This can be disabled by using the `--logging` option and not including `stats`.
|
||||
|
||||
|
||||
### Multi-Platform Build / Support for Apple Silicon (M1/M2)
|
||||
|
||||
Browsertrix Crawler uses a browser image which supports amd64 and arm64.
|
||||
|
@ -787,7 +769,6 @@ This means Browsertrix Crawler can be built natively on Apple Silicon systems us
|
|||
|
||||
On an Apple Silicon system, the browser used will be Chromium instead of Chrome since there is no Linux build of Chrome for ARM, and this now is handled automatically as part of the build. Note that Chromium is different than Chrome, and for example, some video codecs may not be supported in the ARM / Chromium-based version that would be in the amd64 / Chrome version. For production crawling, it is recommended to run on an amd64 Linux environment.
|
||||
|
||||
|
||||
### Modifying Browser Image
|
||||
|
||||
It is also possible to build Browsertrix Crawler with a different browser image. Currently, browser images using Chrome/Chromium (depending on host system chip architecture) and Brave Browser are supported via [browsertrix-browser-base](https://github.com/webrecorder/browsertrix-browser-base).
|
||||
|
@ -796,7 +777,6 @@ The browser base image used is specified and can be changed at the top of the Do
|
|||
|
||||
Custom browser images can be used by forking [browsertrix-browser-base](https://github.com/webrecorder/browsertrix-browser-base), locally building or publishing an image, and then modifying the Dockerfile in this repo to build from that image.
|
||||
|
||||
|
||||
### Viewing crawled data with pywb
|
||||
|
||||
When a crawler is done, another browsertrix-crawler image can be started with a local [pywb](https://github.com/webrecorder/pywb) instance to view crawl:
|
||||
|
@ -809,17 +789,13 @@ Then, loading the `http://localhost:8080/wr-net/https://webrecorder.net/` should
|
|||
|
||||
(Previewing crawl results while a crawl is still running should also be possible soon!)
|
||||
|
||||
|
||||
Support
|
||||
-------
|
||||
## Support
|
||||
|
||||
Initial support for development of Browsertrix Crawler was provided by [Kiwix](https://kiwix.org/). The initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder.
|
||||
|
||||
Additional support for Browsertrix Crawler, including for the development of the 0.4.x version has been provided by [Portico](https://www.portico.org/).
|
||||
|
||||
|
||||
License
|
||||
-------
|
||||
## License
|
||||
|
||||
[AGPLv3](https://www.gnu.org/licenses/agpl-3.0) or later, see
|
||||
[LICENSE](LICENSE) for more details.
|
||||
|
@@ -1,4 +1,4 @@
-version: '3.5'
+version: "3.5"

 services:
   crawler:
@@ -14,4 +14,3 @@ services:
       - SYS_ADMIN

     shm_size: 1gb
|
@ -2,7 +2,9 @@
|
|||
<html>
|
||||
<head>
|
||||
<style>
|
||||
html, body, iframe {
|
||||
html,
|
||||
body,
|
||||
iframe {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
margin: 0;
|
||||
|
@ -32,7 +34,11 @@ button {
|
|||
</head>
|
||||
<body>
|
||||
<div id="info">
|
||||
Log in to any site(s) that you want to be part of the crawl profile using the embedded browser below. When done, click <form action="/createProfile" method="post"><button type="submit">Create Profile</button></form>
|
||||
Log in to any site(s) that you want to be part of the crawl profile using
|
||||
the embedded browser below. When done, click
|
||||
<form action="/createProfile" method="post">
|
||||
<button type="submit">Create Profile</button>
|
||||
</form>
|
||||
</div>
|
||||
<iframe id="main" src="$DEVTOOLS_SRC"></iframe>
|
||||
</body>
|
||||
|
|
|
@ -14,7 +14,9 @@
|
|||
}
|
||||
</style>
|
||||
<script>
|
||||
const ws = new WebSocket(window.location.href.replace("http", "ws") + "ws");
|
||||
const ws = new WebSocket(
|
||||
window.location.href.replace("http", "ws") + "ws",
|
||||
);
|
||||
ws.addEventListener("message", (event) => handleMessage(event.data));
|
||||
|
||||
const unusedElems = [];
|
||||
|
@ -70,6 +72,8 @@
|
|||
</script>
|
||||
<head>
|
||||
<body>
|
||||
<div id="content">
|
||||
</div>
|
||||
<div id="content"></div>
|
||||
</body>
|
||||
</head>
|
||||
</head>
|
||||
</html>
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
<!DOCTYPE html>
|
||||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
|
||||
<!--
|
||||
noVNC example: lightweight example using minimal UI and features
|
||||
|
||||
|
@ -16,10 +15,9 @@
|
|||
-->
|
||||
<title>noVNC</title>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<meta charset="utf-8" />
|
||||
|
||||
<style>
|
||||
|
||||
body {
|
||||
margin: 0;
|
||||
background-color: dimgrey;
|
||||
|
@ -56,12 +54,11 @@
|
|||
flex: 1; /* fill remaining space */
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
</style>
|
||||
|
||||
<script type="module" crossorigin="anonymous">
|
||||
// RFB holds the API to connect and communicate with a VNC server
|
||||
import RFB from './core/rfb.js';
|
||||
import RFB from "./core/rfb.js";
|
||||
|
||||
let rfb;
|
||||
let desktopName;
|
||||
|
@ -105,7 +102,7 @@
|
|||
|
||||
// Show a status text in the top bar
|
||||
function status(text) {
|
||||
document.getElementById('status').textContent = text;
|
||||
document.getElementById("status").textContent = text;
|
||||
}
|
||||
|
||||
// This function extracts the value of one variable from the
|
||||
|
@ -124,8 +121,10 @@
|
|||
//
|
||||
// Note that we use location.href instead of location.search
|
||||
// because Firefox < 53 has a bug w.r.t location.search
|
||||
const re = new RegExp('.*[?&]' + name + '=([^&#]*)'),
|
||||
match = ''.concat(document.location.href, window.location.hash).match(re);
|
||||
const re = new RegExp(".*[?&]" + name + "=([^&#]*)"),
|
||||
match = ""
|
||||
.concat(document.location.href, window.location.hash)
|
||||
.match(re);
|
||||
|
||||
if (match) {
|
||||
// We have to decode the URL since want the cleartext value
|
||||
|
@ -135,15 +134,14 @@
|
|||
return defaultValue;
|
||||
}
|
||||
|
||||
document.getElementById('sendCtrlAltDelButton')
|
||||
.onclick = sendCtrlAltDel;
|
||||
document.getElementById("sendCtrlAltDelButton").onclick = sendCtrlAltDel;
|
||||
|
||||
// Read parameters specified in the URL query string
|
||||
// By default, use the host and port of server that served this file
|
||||
const host = readQueryVariable('host', window.location.hostname);
|
||||
let port = readQueryVariable('port', window.location.port);
|
||||
const password = readQueryVariable('password');
|
||||
const path = readQueryVariable('path', 'websockify');
|
||||
const host = readQueryVariable("host", window.location.hostname);
|
||||
let port = readQueryVariable("port", window.location.port);
|
||||
const password = readQueryVariable("password");
|
||||
const path = readQueryVariable("path", "websockify");
|
||||
|
||||
// | | | | | |
|
||||
// | | | Connect | | |
|
||||
|
@ -154,19 +152,20 @@
|
|||
// Build the websocket URL used to connect
|
||||
let url;
|
||||
if (window.location.protocol === "https:") {
|
||||
url = 'wss';
|
||||
url = "wss";
|
||||
} else {
|
||||
url = 'ws';
|
||||
url = "ws";
|
||||
}
|
||||
url += '://' + host;
|
||||
url += "://" + host;
|
||||
if (port) {
|
||||
url += ':' + port;
|
||||
url += ":" + port;
|
||||
}
|
||||
url += '/' + path;
|
||||
url += "/" + path;
|
||||
|
||||
// Creating a new RFB object will start a new connection
|
||||
rfb = new RFB(document.getElementById('screen'), url,
|
||||
{ credentials: { password: password } });
|
||||
rfb = new RFB(document.getElementById("screen"), url, {
|
||||
credentials: { password: password },
|
||||
});
|
||||
|
||||
// Add listeners to important events from the RFB module
|
||||
rfb.addEventListener("connect", connectedToServer);
|
||||
|
@ -175,8 +174,8 @@
|
|||
rfb.addEventListener("desktopname", updateDesktopName);
|
||||
|
||||
// Set parameters that can be changed on an active connection
|
||||
rfb.viewOnly = readQueryVariable('view_only', false);
|
||||
rfb.scaleViewport = readQueryVariable('scale', false);
|
||||
rfb.viewOnly = readQueryVariable("view_only", false);
|
||||
rfb.scaleViewport = readQueryVariable("scale", false);
|
||||
}
|
||||
|
||||
connect();
|
||||
|
|
@@ -8,7 +8,10 @@
   "license": "AGPL-3.0-or-later",
   "scripts": {
     "tsc": "tsc",
-    "lint": "eslint *.js tests/*.test.js",
+    "format": "prettier . --check",
+    "format:fix": "prettier . --write",
+    "lint": "eslint .",
+    "lint:fix": "yarn format:fix && eslint . --fix",
     "test": "yarn node --experimental-vm-modules $(yarn bin jest --bail 1)",
     "prepare": "husky install"
   },
@@ -40,9 +43,11 @@
     "@typescript-eslint/eslint-plugin": "^6.10.0",
     "@typescript-eslint/parser": "^6.10.0",
     "eslint": "^8.53.0",
+    "eslint-config-prettier": "^9.0.0",
     "eslint-plugin-react": "^7.22.0",
     "jest": "^29.2.1",
     "md5": "^2.3.0",
+    "prettier": "3.0.3",
     "typescript": "^5.2.2"
   },
   "jest": {
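These two hunks are the package.json changes: the scripts gain `format`, `format:fix`, and `lint:fix`, `lint` is simplified to `eslint .`, and Prettier 3.0.3 plus eslint-config-prettier ^9.0.0 are added to devDependencies. A sketch of the equivalent dependency change (versions copied from the hunk; assuming the repo's Yarn workflow):

```sh
# Assumed equivalent of the devDependency additions shown above
yarn add --dev prettier@3.0.3 "eslint-config-prettier@^9.0.0"
```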
src/crawler.ts (210 changes)
|
@ -4,7 +4,13 @@ import fs, { WriteStream } from "fs";
|
|||
import os from "os";
|
||||
import fsp, { FileHandle } from "fs/promises";
|
||||
|
||||
import { RedisCrawlState, LoadState, QueueState, PageState, WorkerId } from "./util/state.js";
|
||||
import {
|
||||
RedisCrawlState,
|
||||
LoadState,
|
||||
QueueState,
|
||||
PageState,
|
||||
WorkerId,
|
||||
} from "./util/state.js";
|
||||
|
||||
import Sitemapper from "sitemapper";
|
||||
import yaml from "js-yaml";
|
||||
|
@ -13,7 +19,14 @@ import * as warcio from "warcio";
|
|||
|
||||
import { HealthChecker } from "./util/healthcheck.js";
|
||||
import { TextExtractViaSnapshot } from "./util/textextract.js";
|
||||
import { initStorage, getFileSize, getDirSize, interpolateFilename, checkDiskUtilization, S3StorageSync } from "./util/storage.js";
|
||||
import {
|
||||
initStorage,
|
||||
getFileSize,
|
||||
getDirSize,
|
||||
interpolateFilename,
|
||||
checkDiskUtilization,
|
||||
S3StorageSync,
|
||||
} from "./util/storage.js";
|
||||
import { ScreenCaster, WSTransport } from "./util/screencaster.js";
|
||||
import { Screenshots } from "./util/screenshots.js";
|
||||
import { parseArgs } from "./util/argParser.js";
|
||||
|
@ -25,7 +38,12 @@ import { collectAllFileSources } from "./util/file_reader.js";
|
|||
|
||||
import { Browser } from "./util/browser.js";
|
||||
|
||||
import { ADD_LINK_FUNC, BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } from "./util/constants.js";
|
||||
import {
|
||||
ADD_LINK_FUNC,
|
||||
BEHAVIOR_LOG_FUNC,
|
||||
HTML_TYPES,
|
||||
DEFAULT_SELECTORS,
|
||||
} from "./util/constants.js";
|
||||
|
||||
import { AdBlockRules, BlockRules } from "./util/blockrules.js";
|
||||
import { OriginOverride } from "./util/originoverride.js";
|
||||
|
@ -41,12 +59,23 @@ const HTTPS_AGENT = new HTTPSAgent({
|
|||
|
||||
const HTTP_AGENT = new HTTPAgent();
|
||||
|
||||
const behaviors = fs.readFileSync(new URL("../node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});
|
||||
const behaviors = fs.readFileSync(
|
||||
new URL(
|
||||
"../node_modules/browsertrix-behaviors/dist/behaviors.js",
|
||||
import.meta.url,
|
||||
),
|
||||
{ encoding: "utf8" },
|
||||
);
|
||||
|
||||
const FETCH_TIMEOUT_SECS = 30;
|
||||
const PAGE_OP_TIMEOUT_SECS = 5;
|
||||
|
||||
const POST_CRAWL_STATES = ["generate-wacz", "uploading-wacz", "generate-cdx", "generate-warc"];
|
||||
const POST_CRAWL_STATES = [
|
||||
"generate-wacz",
|
||||
"uploading-wacz",
|
||||
"generate-cdx",
|
||||
"generate-warc",
|
||||
];
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
type LogDetails = Record<string, any>;
|
||||
|
@ -62,7 +91,6 @@ type PageEntry = {
|
|||
favIconUrl?: string;
|
||||
};
|
||||
|
||||
|
||||
// ============================================================================
|
||||
export class Crawler {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
|
@ -128,8 +156,12 @@ export class Crawler {
|
|||
maxHeapUsed = 0;
|
||||
maxHeapTotal = 0;
|
||||
|
||||
driver!: (opts: {
|
||||
page: Page;
|
||||
data: PageState;
|
||||
// eslint-disable-next-line no-use-before-define
|
||||
driver!: (opts: { page: Page; data: PageState; crawler: Crawler }) => NonNullable<unknown>;
|
||||
crawler: Crawler;
|
||||
}) => NonNullable<unknown>;
|
||||
|
||||
constructor() {
|
||||
const res = parseArgs();
|
||||
|
@ -140,12 +172,12 @@ export class Crawler {
|
|||
this.collDir = path.join(
|
||||
this.params.cwd,
|
||||
"collections",
|
||||
this.params.collection
|
||||
this.params.collection,
|
||||
);
|
||||
this.logDir = path.join(this.collDir, "logs");
|
||||
this.logFilename = path.join(
|
||||
this.logDir,
|
||||
`crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`
|
||||
`crawl-${new Date().toISOString().replace(/[^\d]/g, "")}.log`,
|
||||
);
|
||||
|
||||
const debugLogging = this.params.logging.includes("debug");
|
||||
|
@ -252,7 +284,7 @@ export class Crawler {
|
|||
|
||||
if (!redisUrl.startsWith("redis://")) {
|
||||
logger.fatal(
|
||||
"stateStoreUrl must start with redis:// -- Only redis-based store currently supported"
|
||||
"stateStoreUrl must start with redis:// -- Only redis-based store currently supported",
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -272,7 +304,7 @@ export class Crawler {
|
|||
logger.debug(
|
||||
`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`,
|
||||
{},
|
||||
"state"
|
||||
"state",
|
||||
);
|
||||
|
||||
logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state");
|
||||
|
@ -281,7 +313,7 @@ export class Crawler {
|
|||
redis,
|
||||
this.params.crawlId,
|
||||
this.maxPageTime,
|
||||
os.hostname()
|
||||
os.hostname(),
|
||||
);
|
||||
|
||||
// clear any pending URLs from this instance
|
||||
|
@ -291,7 +323,7 @@ export class Crawler {
|
|||
logger.debug(
|
||||
`Saving crawl state every ${this.params.saveStateInterval} seconds, keeping last ${this.params.saveStateHistory} states`,
|
||||
{},
|
||||
"state"
|
||||
"state",
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -311,7 +343,7 @@ export class Crawler {
|
|||
logger.debug(
|
||||
`Screencast server started on: ${this.params.screencastPort}`,
|
||||
{},
|
||||
"screencast"
|
||||
"screencast",
|
||||
);
|
||||
}
|
||||
// } else if (this.params.redisStoreUrl && this.params.screencastRedis) {
|
||||
|
@ -383,7 +415,7 @@ export class Crawler {
|
|||
|
||||
if (this.params.customBehaviors) {
|
||||
this.customBehaviors = this.loadCustomBehaviors(
|
||||
this.params.customBehaviors
|
||||
this.params.customBehaviors,
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -461,7 +493,7 @@ export class Crawler {
|
|||
_behaviorLog(
|
||||
{ data, type }: { data: string; type: string },
|
||||
pageUrl: string,
|
||||
workerid: WorkerId
|
||||
workerid: WorkerId,
|
||||
) {
|
||||
let behaviorLine;
|
||||
let message;
|
||||
|
@ -506,7 +538,7 @@ export class Crawler {
|
|||
depth,
|
||||
extraHops,
|
||||
}: { seedId: number; url: string; depth: number; extraHops: number },
|
||||
logDetails = {}
|
||||
logDetails = {},
|
||||
) {
|
||||
const seed = this.params.scopedSeeds[seedId];
|
||||
|
||||
|
@ -553,7 +585,7 @@ export class Crawler {
|
|||
logger.warn(
|
||||
msg.text(),
|
||||
{ location: msg.location(), page: page.url(), workerid },
|
||||
"jsError"
|
||||
"jsError",
|
||||
);
|
||||
}
|
||||
});
|
||||
|
@ -562,7 +594,7 @@ export class Crawler {
|
|||
logger.warn(
|
||||
"Page Error",
|
||||
{ ...errJSON(e), page: page.url(), workerid },
|
||||
"jsError"
|
||||
"jsError",
|
||||
);
|
||||
});
|
||||
}
|
||||
|
@ -574,14 +606,14 @@ export class Crawler {
|
|||
|
||||
await page.exposeFunction(
|
||||
ADD_LINK_FUNC,
|
||||
(url: string) => callbacks.addLink && callbacks.addLink(url)
|
||||
(url: string) => callbacks.addLink && callbacks.addLink(url),
|
||||
);
|
||||
|
||||
if (this.params.behaviorOpts) {
|
||||
await page.exposeFunction(
|
||||
BEHAVIOR_LOG_FUNC,
|
||||
(logdata: { data: string; type: string }) =>
|
||||
this._behaviorLog(logdata, page.url(), workerid)
|
||||
this._behaviorLog(logdata, page.url(), workerid),
|
||||
);
|
||||
await this.browser.addInitScript(page, behaviors);
|
||||
|
||||
|
@ -622,7 +654,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}
|
||||
logger.warn(
|
||||
"Failed to fetch favicon from browser /json endpoint",
|
||||
logDetails
|
||||
logDetails,
|
||||
);
|
||||
return "";
|
||||
}
|
||||
|
@ -645,7 +677,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
"HEAD request to determine if URL is HTML page timed out",
|
||||
logDetails,
|
||||
"fetch",
|
||||
true
|
||||
true,
|
||||
);
|
||||
|
||||
if (!data.isHTMLPage && directFetchCapture) {
|
||||
|
@ -656,7 +688,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
"Direct fetch capture attempt timed out",
|
||||
logDetails,
|
||||
"fetch",
|
||||
true
|
||||
true,
|
||||
);
|
||||
if (fetched) {
|
||||
data.loadState = LoadState.FULL_PAGE_LOADED;
|
||||
|
@ -666,7 +698,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.info(
|
||||
"Direct fetch successful",
|
||||
{ url, ...logDetails },
|
||||
"fetch"
|
||||
"fetch",
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
@ -714,7 +746,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const { changed, text } = await textextract.extractAndStoreText(
|
||||
"text",
|
||||
false,
|
||||
this.params.text.includes("to-warc")
|
||||
this.params.text.includes("to-warc"),
|
||||
);
|
||||
|
||||
if (changed && text && this.params.text.includes("to-pages")) {
|
||||
|
@ -729,7 +761,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.debug(
|
||||
"Skipping behaviors for non-HTML page",
|
||||
logDetails,
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
} else if (data.skipBehaviors) {
|
||||
logger.info("Skipping behaviors for slow page", logDetails, "behavior");
|
||||
|
@ -739,7 +771,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
this.params.behaviorTimeout,
|
||||
"Behaviors timed out",
|
||||
logDetails,
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
|
||||
await this.netIdle(page, logDetails);
|
||||
|
@ -757,7 +789,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
if (this.params.pageExtraDelay) {
|
||||
logger.info(
|
||||
`Waiting ${this.params.pageExtraDelay} seconds before moving on to next page`,
|
||||
logDetails
|
||||
logDetails,
|
||||
);
|
||||
await sleep(this.params.pageExtraDelay);
|
||||
}
|
||||
|
@ -784,7 +816,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.warn(
|
||||
"Page Load Failed",
|
||||
{ loadState, ...logDetails },
|
||||
"pageStatus"
|
||||
"pageStatus",
|
||||
);
|
||||
|
||||
await this.crawlState.markFailed(data.url);
|
||||
|
@ -816,7 +848,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
page: Page,
|
||||
cdp: CDPSession,
|
||||
frames: Frame[],
|
||||
logDetails: LogDetails
|
||||
logDetails: LogDetails,
|
||||
) {
|
||||
try {
|
||||
frames = frames || page.frames();
|
||||
|
@ -828,7 +860,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
frameUrls: frames.map((frame) => frame.url()),
|
||||
...logDetails,
|
||||
},
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
|
||||
const results = await Promise.allSettled(
|
||||
|
@ -844,9 +876,9 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
self.__bx_behaviors.run();
|
||||
}`,
|
||||
logDetails,
|
||||
"behavior"
|
||||
)
|
||||
)
|
||||
"behavior",
|
||||
),
|
||||
),
|
||||
);
|
||||
|
||||
for (const res of results) {
|
||||
|
@ -855,7 +887,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.warn(
|
||||
"Behavior run partially failed",
|
||||
{ reason, ...logDetails },
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -863,14 +895,14 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.info(
|
||||
"Behaviors finished",
|
||||
{ finished: results.length, ...logDetails },
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
return true;
|
||||
} catch (e) {
|
||||
logger.warn(
|
||||
"Behavior run failed",
|
||||
{ ...errJSON(e), ...logDetails },
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
@ -886,14 +918,14 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
// this is all designed to detect and skip PDFs, and other frames that are actually EMBEDs
|
||||
// if there's no tag or an iframe tag, then assume its a regular frame
|
||||
const tagName = await frame.evaluate(
|
||||
"self && self.frameElement && self.frameElement.tagName"
|
||||
"self && self.frameElement && self.frameElement.tagName",
|
||||
);
|
||||
|
||||
if (tagName && tagName !== "IFRAME" && tagName !== "FRAME") {
|
||||
logger.debug(
|
||||
"Skipping processing non-frame object",
|
||||
{ tagName, frameUrl, ...logDetails },
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
return null;
|
||||
}
|
||||
|
@ -910,7 +942,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.debug(
|
||||
"Skipping processing frame",
|
||||
{ frameUrl, ...logDetails },
|
||||
"behavior"
|
||||
"behavior",
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -921,13 +953,13 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const packageFileJSON = JSON.parse(
|
||||
await fsp.readFile(new URL("../package.json", import.meta.url), {
|
||||
encoding: "utf-8",
|
||||
})
|
||||
}),
|
||||
);
|
||||
const warcioPackageJSON = JSON.parse(
|
||||
await fsp.readFile(
|
||||
new URL("../node_modules/warcio/package.json", import.meta.url),
|
||||
{ encoding: "utf-8" }
|
||||
)
|
||||
{ encoding: "utf-8" },
|
||||
),
|
||||
);
|
||||
|
||||
return `Browsertrix-Crawler ${packageFileJSON.version} (with warcio.js ${warcioPackageJSON.version})`;
|
||||
|
@ -945,7 +977,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const warcInfo = { ...info, ...this.params.warcInfo };
|
||||
const record = await warcio.WARCRecord.createWARCInfo(
|
||||
{ filename, type, warcVersion },
|
||||
warcInfo
|
||||
warcInfo,
|
||||
);
|
||||
const buffer = await warcio.WARCSerializer.serialize(record, {
|
||||
gzip: true,
|
||||
|
@ -964,7 +996,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
if (this.params.sizeLimit) {
|
||||
if (size >= this.params.sizeLimit) {
|
||||
logger.info(
|
||||
`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`
|
||||
`Size threshold reached ${size} >= ${this.params.sizeLimit}, stopping`,
|
||||
);
|
||||
interrupt = true;
|
||||
}
|
||||
|
@ -974,7 +1006,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const elapsed = secondsElapsed(this.startTime);
|
||||
if (elapsed >= this.params.timeLimit) {
|
||||
logger.info(
|
||||
`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`
|
||||
`Time threshold reached ${elapsed} > ${this.params.timeLimit}, stopping`,
|
||||
);
|
||||
interrupt = true;
|
||||
}
|
||||
|
@ -992,7 +1024,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const numFailed = this.crawlState.numFailed();
|
||||
if (numFailed >= this.params.failOnFailedLimit) {
|
||||
logger.fatal(
|
||||
`Failed threshold reached ${numFailed} >= ${this.params.failedLimit}, failing crawl`
|
||||
`Failed threshold reached ${numFailed} >= ${this.params.failedLimit}, failing crawl`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -1060,7 +1092,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
if (this.params.healthCheckPort) {
|
||||
this.healthChecker = new HealthChecker(
|
||||
this.params.healthCheckPort,
|
||||
this.params.workers
|
||||
this.params.workers,
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -1125,7 +1157,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
await this.crawlState.load(
|
||||
this.params.state,
|
||||
this.params.scopedSeeds,
|
||||
true
|
||||
true,
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -1133,14 +1165,14 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
this.adBlockRules = new AdBlockRules(
|
||||
this.captureBasePrefix,
|
||||
this.params.adBlockMessage
|
||||
this.params.adBlockMessage,
|
||||
);
|
||||
|
||||
if (this.params.blockRules && this.params.blockRules.length) {
|
||||
this.blockRules = new BlockRules(
|
||||
this.params.blockRules,
|
||||
this.captureBasePrefix,
|
||||
this.params.blockMessage
|
||||
this.params.blockMessage,
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -1178,7 +1210,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.error(
|
||||
"Browser disconnected (crashed?), interrupting crawl",
|
||||
err,
|
||||
"browser"
|
||||
"browser",
|
||||
);
|
||||
},
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
|
@ -1220,7 +1252,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
const warcList = await fsp.readdir(path.join(this.collDir, "archive"));
|
||||
const warcListFull = warcList.map((filename) =>
|
||||
path.join(this.collDir, "archive", filename)
|
||||
path.join(this.collDir, "archive", filename),
|
||||
);
|
||||
|
||||
//const indexResult = await this.awaitProcess(child_process.spawn("wb-manager", ["reindex", this.params.collection], {cwd: this.params.cwd}));
|
||||
|
@ -1230,7 +1262,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
...warcListFull,
|
||||
];
|
||||
const indexResult = await this.awaitProcess(
|
||||
child_process.spawn("cdxj-indexer", params, { cwd: this.params.cwd })
|
||||
child_process.spawn("cdxj-indexer", params, { cwd: this.params.cwd }),
|
||||
);
|
||||
if (indexResult === 0) {
|
||||
logger.debug("Indexing complete, CDX successfully created");
|
||||
|
@ -1251,7 +1283,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
if (uploaded && this.uploadAndDeleteLocal) {
|
||||
logger.info(
|
||||
`Uploaded WACZ, deleting local data to free up space: ${this.collDir}`
|
||||
`Uploaded WACZ, deleting local data to free up space: ${this.collDir}`,
|
||||
);
|
||||
try {
|
||||
fs.rmSync(this.collDir, { recursive: true, force: true });
|
||||
|
@ -1352,13 +1384,11 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
createArgs.push("-f");
|
||||
|
||||
warcFileList.forEach((val) =>
|
||||
createArgs.push(path.join(archiveDir, val))
|
||||
);
|
||||
warcFileList.forEach((val) => createArgs.push(path.join(archiveDir, val)));
|
||||
|
||||
// create WACZ
|
||||
const waczResult = await this.awaitProcess(
|
||||
child_process.spawn("wacz", createArgs)
|
||||
child_process.spawn("wacz", createArgs),
|
||||
);
|
||||
|
||||
if (waczResult !== 0) {
|
||||
|
@ -1430,7 +1460,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
maxHeapTotal: this.maxHeapTotal,
|
||||
...memUsage,
|
||||
},
|
||||
"memory"
|
||||
"memory",
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -1461,7 +1491,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
try {
|
||||
await fsp.writeFile(
|
||||
this.params.statsFilename,
|
||||
JSON.stringify(stats, null, 2)
|
||||
JSON.stringify(stats, null, 2),
|
||||
);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (err: any) {
|
||||
|
@ -1473,7 +1503,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
async loadPage(
|
||||
page: Page,
|
||||
data: PageState,
|
||||
selectorOptsList = DEFAULT_SELECTORS
|
||||
selectorOptsList = DEFAULT_SELECTORS,
|
||||
) {
|
||||
const { url, seedId, depth } = data;
|
||||
|
||||
|
@ -1575,7 +1605,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
const frames = await page.frames();
|
||||
|
||||
const filteredFrames = await Promise.allSettled(
|
||||
frames.map((frame) => this.shouldIncludeFrame(frame, logDetails))
|
||||
frames.map((frame) => this.shouldIncludeFrame(frame, logDetails)),
|
||||
);
|
||||
|
||||
data.filteredFrames = filteredFrames
|
||||
|
@ -1640,7 +1670,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
page: Page,
|
||||
data: PageState,
|
||||
selectors = DEFAULT_SELECTORS,
|
||||
logDetails: LogDetails
|
||||
logDetails: LogDetails,
|
||||
) {
|
||||
const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;
|
||||
|
||||
|
@ -1651,7 +1681,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
links.push(url);
|
||||
if (links.length == 500) {
|
||||
promiseList.push(
|
||||
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails)
|
||||
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails),
|
||||
);
|
||||
links = [];
|
||||
}
|
||||
|
@ -1676,7 +1706,9 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
document.querySelectorAll(selector).forEach(getter);
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const func = (window as any)[addLinkFunc] as (url: string) => NonNullable<unknown>;
|
||||
const func = (window as any)[addLinkFunc] as (
|
||||
url: string,
|
||||
) => NonNullable<unknown>;
|
||||
urls.forEach((url) => func.call(this, url));
|
||||
|
||||
return true;
|
||||
|
@ -1701,9 +1733,9 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
}),
|
||||
PAGE_OP_TIMEOUT_SECS,
|
||||
"Link extraction timed out",
|
||||
logDetails
|
||||
)
|
||||
)
|
||||
logDetails,
|
||||
),
|
||||
),
|
||||
);
|
||||
|
||||
for (let i = 0; i < promiseResults.length; i++) {
|
||||
|
@ -1725,7 +1757,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
if (links.length) {
|
||||
promiseList.push(
|
||||
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails)
|
||||
this.queueInScopeUrls(seedId, links, depth, extraHops, logDetails),
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -1737,7 +1769,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
urls: string[],
|
||||
depth: number,
|
||||
extraHops = 0,
|
||||
logDetails: LogDetails = {}
|
||||
logDetails: LogDetails = {},
|
||||
) {
|
||||
try {
|
||||
depth += 1;
|
||||
|
@ -1748,7 +1780,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
for (const possibleUrl of urls) {
|
||||
const res = this.isInScope(
|
||||
{ url: possibleUrl, extraHops: newExtraHops, depth, seedId },
|
||||
logDetails
|
||||
logDetails,
|
||||
);
|
||||
|
||||
if (!res) {
|
||||
|
@ -1763,7 +1795,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
url,
|
||||
depth,
|
||||
isOOS ? newExtraHops : extraHops,
|
||||
logDetails
|
||||
logDetails,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -1784,12 +1816,12 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
"Cloudflare check timed out",
|
||||
logDetails,
|
||||
"general",
|
||||
true
|
||||
true,
|
||||
)
|
||||
) {
|
||||
logger.debug(
|
||||
"Cloudflare Check Detected, waiting for reload...",
|
||||
logDetails
|
||||
logDetails,
|
||||
);
|
||||
await sleep(5.5);
|
||||
}
|
||||
|
@ -1803,7 +1835,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
url: string,
|
||||
depth: number,
|
||||
extraHops: number,
|
||||
logDetails: LogDetails = {}
|
||||
logDetails: LogDetails = {},
|
||||
) {
|
||||
if (this.limitHit) {
|
||||
return false;
|
||||
|
@ -1811,7 +1843,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
|
||||
const result = await this.crawlState.addToQueue(
|
||||
{ url, seedId, depth, extraHops },
|
||||
this.pageLimit
|
||||
this.pageLimit,
|
||||
);
|
||||
|
||||
switch (result) {
|
||||
|
@ -1823,7 +1855,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.debug(
|
||||
"Not queued page url, at page limit",
|
||||
{ url, ...logDetails },
|
||||
"links"
|
||||
"links",
|
||||
);
|
||||
this.limitHit = true;
|
||||
return false;
|
||||
|
@ -1832,7 +1864,7 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.debug(
|
||||
"Not queued page url, already seen",
|
||||
{ url, ...logDetails },
|
||||
"links"
|
||||
"links",
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
@ -1961,14 +1993,14 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
logger.info(
|
||||
"Fetching full sitemap (fromDate not specified/valid)",
|
||||
{ url, sitemapFromDate },
|
||||
"sitemap"
|
||||
"sitemap",
|
||||
);
|
||||
} else {
|
||||
lastmodFromTimestamp = dateObj.getTime();
|
||||
logger.info(
|
||||
"Fetching and filtering sitemap by date",
|
||||
{ url, sitemapFromDate },
|
||||
"sitemap"
|
||||
"sitemap",
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -2166,8 +2198,11 @@ self.__bx_behaviors.selectMainBehavior();
|
|||
function shouldIgnoreAbort(req: HTTPRequest) {
|
||||
try {
|
||||
const failure = req.failure();
|
||||
const failureText = failure && failure.errorText || "";
|
||||
if (failureText !== "net::ERR_ABORTED" || req.resourceType() !== "document") {
|
||||
const failureText = (failure && failure.errorText) || "";
|
||||
if (
|
||||
failureText !== "net::ERR_ABORTED" ||
|
||||
req.resourceType() !== "document"
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -2178,8 +2213,10 @@ function shouldIgnoreAbort(req: HTTPRequest) {
|
|||
return false;
|
||||
}
|
||||
|
||||
if (headers["content-disposition"] ||
|
||||
(headers["content-type"] && !headers["content-type"].startsWith("text/"))) {
|
||||
if (
|
||||
headers["content-disposition"] ||
|
||||
(headers["content-type"] && !headers["content-type"].startsWith("text/"))
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
} catch (e) {
|
||||
|
@ -2188,4 +2225,3 @@ function shouldIgnoreAbort(req: HTTPRequest) {
|
|||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -15,81 +15,99 @@ import { Browser } from "./util/browser.js";
|
|||
import { initStorage } from "./util/storage.js";
|
||||
import { CDPSession, Page, PuppeteerLifeCycleEvent } from "puppeteer-core";
|
||||
|
||||
const profileHTML = fs.readFileSync(new URL("../html/createProfile.html", import.meta.url), {encoding: "utf8"});
|
||||
const vncHTML = fs.readFileSync(new URL("../html/vnc_lite.html", import.meta.url), {encoding: "utf8"});
|
||||
const profileHTML = fs.readFileSync(
|
||||
new URL("../html/createProfile.html", import.meta.url),
|
||||
{ encoding: "utf8" },
|
||||
);
|
||||
const vncHTML = fs.readFileSync(
|
||||
new URL("../html/vnc_lite.html", import.meta.url),
|
||||
{ encoding: "utf8" },
|
||||
);
|
||||
|
||||
const behaviors = fs.readFileSync(new URL("../node_modules/browsertrix-behaviors/dist/behaviors.js", import.meta.url), {encoding: "utf8"});
|
||||
const behaviors = fs.readFileSync(
|
||||
new URL(
|
||||
"../node_modules/browsertrix-behaviors/dist/behaviors.js",
|
||||
import.meta.url,
|
||||
),
|
||||
{ encoding: "utf8" },
|
||||
);
|
||||
|
||||
function cliOpts(): { [key: string]: Options } {
|
||||
return {
|
||||
"url": {
|
||||
url: {
|
||||
describe: "The URL of the login page",
|
||||
type: "string",
|
||||
demandOption: true,
|
||||
},
|
||||
|
||||
"user": {
|
||||
describe: "The username for the login. If not specified, will be prompted",
|
||||
user: {
|
||||
describe:
|
||||
"The username for the login. If not specified, will be prompted",
|
||||
},
|
||||
|
||||
"password": {
|
||||
describe: "The password for the login. If not specified, will be prompted (recommended)",
|
||||
password: {
|
||||
describe:
|
||||
"The password for the login. If not specified, will be prompted (recommended)",
|
||||
},
|
||||
|
||||
"filename": {
|
||||
filename: {
|
||||
describe: "The filename for the profile tarball",
|
||||
default: "/crawls/profiles/profile.tar.gz",
|
||||
},
|
||||
|
||||
"debugScreenshot": {
|
||||
describe: "If specified, take a screenshot after login and save as this filename"
|
||||
debugScreenshot: {
|
||||
describe:
|
||||
"If specified, take a screenshot after login and save as this filename",
|
||||
},
|
||||
|
||||
"headless": {
|
||||
headless: {
|
||||
describe: "Run in headless mode, otherwise start xvfb",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"automated": {
|
||||
automated: {
|
||||
describe: "Start in automated mode, no interactive browser",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"interactive": {
|
||||
interactive: {
|
||||
describe: "Deprecated. Now the default option!",
|
||||
type: "boolean",
|
||||
default: false
|
||||
default: false,
|
||||
},
|
||||
|
||||
"shutdownWait": {
|
||||
describe: "Shutdown browser in interactive after this many seconds, if no pings received",
|
||||
shutdownWait: {
|
||||
describe:
|
||||
"Shutdown browser in interactive after this many seconds, if no pings received",
|
||||
type: "number",
|
||||
default: 0
|
||||
default: 0,
|
||||
},
|
||||
|
||||
"profile": {
|
||||
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
|
||||
profile: {
|
||||
describe:
|
||||
"Path to tar.gz file which will be extracted and used as the browser profile",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"windowSize": {
|
||||
windowSize: {
|
||||
type: "string",
|
||||
describe: "Browser window dimensions, specified as: width,height",
|
||||
default: getDefaultWindowSize()
|
||||
default: getDefaultWindowSize(),
|
||||
},
|
||||
|
||||
"proxy": {
|
||||
proxy: {
|
||||
type: "boolean",
|
||||
default: false
|
||||
default: false,
|
||||
},
|
||||
|
||||
"cookieDays": {
|
||||
cookieDays: {
|
||||
type: "number",
|
||||
describe: "If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
|
||||
default: 7
|
||||
}
|
||||
describe:
|
||||
"If >0, set all cookies, including session cookies, to have this duration in days before saving profile",
|
||||
default: 7,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -100,14 +118,11 @@ function getDefaultWindowSize() {
|
|||
return `${x},${y}`;
|
||||
}
|
||||
|
||||
|
||||
|
||||
async function main() {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const params: any = yargs(process.argv)
|
||||
.usage("browsertrix-crawler profile [options]")
|
||||
.option(cliOpts())
|
||||
.argv;
|
||||
.option(cliOpts()).argv;
|
||||
|
||||
logger.setDebugLogging(true);
|
||||
|
||||
|
@ -122,7 +137,7 @@ async function main() {
|
|||
process.env.GEOMETRY || "",
|
||||
"-ac",
|
||||
"+extension",
|
||||
"RANDR"
|
||||
"RANDR",
|
||||
]);
|
||||
|
||||
//await fsp.mkdir(path.join(homedir(), ".vnc"), {recursive: true});
|
||||
|
@ -140,7 +155,7 @@ async function main() {
|
|||
"-passwd",
|
||||
process.env.VNC_PASS || "",
|
||||
"-display",
|
||||
process.env.DISPLAY || ""
|
||||
process.env.DISPLAY || "",
|
||||
]);
|
||||
}
|
||||
|
||||
|
@ -156,13 +171,15 @@ async function main() {
|
|||
"--window-position=0,0",
|
||||
`--window-size=${params.windowSize}`,
|
||||
// to disable the 'stability will suffer' infobar
|
||||
"--test-type"
|
||||
]
|
||||
}
|
||||
"--test-type",
|
||||
],
|
||||
},
|
||||
});
|
||||
|
||||
if (params.interactive) {
|
||||
logger.warn("Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode");
|
||||
logger.warn(
|
||||
"Note: the '--interactive' flag is now deprecated and is the default profile creation option. Use the --automated flag to specify non-interactive mode",
|
||||
);
|
||||
}
|
||||
|
||||
if (params.user || params.password) {
|
||||
|
@ -187,7 +204,10 @@ async function main() {
|
|||
await browser.setupPage({ page, cdp });
|
||||
|
||||
// for testing, inject browsertrix-behaviors
|
||||
await browser.addInitScript(page, behaviors + ";\nself.__bx_behaviors.init();");
|
||||
await browser.addInitScript(
|
||||
page,
|
||||
behaviors + ";\nself.__bx_behaviors.init();",
|
||||
);
|
||||
}
|
||||
|
||||
logger.info(`Loading page: ${params.url}`);
|
||||
|
@ -204,17 +224,26 @@ async function main() {
|
|||
}
|
||||
}
|
||||
|
||||
async function automatedProfile(
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
async function automatedProfile(params: any, browser: Browser, page: Page, cdp: CDPSession,
|
||||
waitUntil: PuppeteerLifeCycleEvent) {
|
||||
params: any,
|
||||
browser: Browser,
|
||||
page: Page,
|
||||
cdp: CDPSession,
|
||||
waitUntil: PuppeteerLifeCycleEvent,
|
||||
) {
|
||||
let u, p;
|
||||
|
||||
logger.debug("Looking for username and password entry fields on page...");
|
||||
|
||||
try {
|
||||
u = await page.waitForSelector("//input[contains(@name, 'user') or contains(@name, 'email')]");
|
||||
p = await page.waitForSelector("//input[contains(@name, 'pass') and @type='password']");
|
||||
|
||||
u = await page.waitForSelector(
|
||||
"//input[contains(@name, 'user') or contains(@name, 'email')]",
|
||||
);
|
||||
p = await page.waitForSelector(
|
||||
"//input[contains(@name, 'pass') and @type='password']",
|
||||
);
|
||||
} catch (e) {
|
||||
if (params.debugScreenshot) {
|
||||
await page.screenshot({ path: params.debugScreenshot });
|
||||
|
@ -231,7 +260,7 @@ async function automatedProfile(params: any, browser: Browser, page: Page, cdp:
|
|||
|
||||
await Promise.allSettled([
|
||||
p!.press("Enter"),
|
||||
page.waitForNavigation({waitUntil})
|
||||
page.waitForNavigation({ waitUntil }),
|
||||
]);
|
||||
|
||||
if (params.debugScreenshot) {
|
||||
|
@ -243,8 +272,15 @@ async function automatedProfile(params: any, browser: Browser, page: Page, cdp:
|
|||
process.exit(0);
|
||||
}
|
||||
|
||||
async function createProfile(
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
async function createProfile(params: any, browser: Browser, page: Page, cdp: CDPSession, targetFilename = "") {
|
||||
params: any,
|
||||
browser: Browser,
|
||||
page: Page,
|
||||
cdp: CDPSession,
|
||||
targetFilename = "",
|
||||
) {
|
||||
await cdp.send("Network.clearBrowserCache");
|
||||
|
||||
await browser.close();
|
||||
|
@ -276,7 +312,7 @@ function promptInput(msg: string, hidden = false) {
|
|||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const rl: any = readline.createInterface({
|
||||
input: process.stdin,
|
||||
output: process.stdout
|
||||
output: process.stdout,
|
||||
});
|
||||
|
||||
if (hidden) {
|
||||
|
@ -303,7 +339,6 @@ function promptInput(msg: string, hidden = false) {
|
|||
});
|
||||
}
|
||||
|
||||
|
||||
class InteractiveBrowser {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
params: any;
|
||||
|
@ -323,7 +358,7 @@ class InteractiveBrowser {
|
|||
browser: Browser,
|
||||
page: Page,
|
||||
cdp: CDPSession,
|
||||
targetId: string
|
||||
targetId: string,
|
||||
) {
|
||||
logger.info("Creating Profile Interactively...");
|
||||
child_process.spawn("socat", [
|
||||
|
@ -359,19 +394,19 @@ class InteractiveBrowser {
|
|||
if (this.shutdownWait) {
|
||||
this.shutdownTimer = setTimeout(() => process.exit(0), this.shutdownWait);
|
||||
logger.debug(
|
||||
`Shutting down in ${this.shutdownWait}ms if no ping received`
|
||||
`Shutting down in ${this.shutdownWait}ms if no ping received`,
|
||||
);
|
||||
} else {
|
||||
this.shutdownTimer = null;
|
||||
}
|
||||
|
||||
const httpServer = http.createServer((req, res) =>
|
||||
this.handleRequest(req, res)
|
||||
this.handleRequest(req, res),
|
||||
);
|
||||
const port = 9223;
|
||||
httpServer.listen(port);
|
||||
logger.info(
|
||||
`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`
|
||||
`Browser Profile UI Server started. Load http://localhost:${port}/ to interact with a Chromium-based browser, click 'Create Profile' when done.`,
|
||||
);
|
||||
|
||||
if (!params.headless) {
|
||||
|
@ -452,8 +487,8 @@ class InteractiveBrowser {
|
|||
res.end(
|
||||
profileHTML.replace(
|
||||
"$DEVTOOLS_SRC",
|
||||
targetUrl.replaceAll("$HOST", parsedUrl.hostname)
|
||||
)
|
||||
targetUrl.replaceAll("$HOST", parsedUrl.hostname),
|
||||
),
|
||||
);
|
||||
return;
|
||||
|
||||
|
@ -469,10 +504,10 @@ class InteractiveBrowser {
|
|||
clearTimeout(this.shutdownTimer as any);
|
||||
this.shutdownTimer = setTimeout(
|
||||
() => process.exit(0),
|
||||
this.shutdownWait
|
||||
this.shutdownWait,
|
||||
);
|
||||
logger.debug(
|
||||
`Ping received, delaying shutdown for ${this.shutdownWait}ms`
|
||||
`Ping received, delaying shutdown for ${this.shutdownWait}ms`,
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -530,7 +565,7 @@ class InteractiveBrowser {
|
|||
this.browser,
|
||||
this.page,
|
||||
this.cdp,
|
||||
targetFilename
|
||||
targetFilename,
|
||||
);
|
||||
origins = Array.from(this.originSet.values());
|
||||
|
||||
|
@ -558,13 +593,13 @@ class InteractiveBrowser {
|
|||
|
||||
res.writeHead(200, { "Content-Type": "text/html" });
|
||||
res.end(
|
||||
"<html><body>Profile Created! You may now close this window.</body></html>"
|
||||
"<html><body>Profile Created! You may now close this window.</body></html>",
|
||||
);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
} catch (e: any) {
|
||||
res.writeHead(500, { "Content-Type": "text/html" });
|
||||
res.end(
|
||||
"<html><body>Profile creation failed! See the browsertrix-crawler console for more info"
|
||||
"<html><body>Profile creation failed! See the browsertrix-crawler console for more info",
|
||||
);
|
||||
logger.warn("HTTP Error", e);
|
||||
}
|
||||
|
@ -576,7 +611,7 @@ class InteractiveBrowser {
|
|||
if (pathname.startsWith("/vnc/")) {
|
||||
const fileUrl = new URL(
|
||||
"../node_modules/@novnc/novnc/" + pathname.slice("/vnc/".length),
|
||||
import.meta.url
|
||||
import.meta.url,
|
||||
);
|
||||
const file = fs.readFileSync(fileUrl, { encoding: "utf-8" });
|
||||
res.writeHead(200, { "Content-Type": "application/javascript" });
|
||||
|
@ -607,6 +642,4 @@ class InteractiveBrowser {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
main();
|
||||
|
||||
|
|
|
@ -2,6 +2,14 @@ import { Page } from "puppeteer-core";
import { PageState } from "./util/state.js";
import { Crawler } from "./crawler.js";

export default async ({data, page, crawler} : {data: PageState, page: Page, crawler: Crawler}) => {
export default async ({
data,
page,
crawler,
}: {
data: PageState;
page: Page;
crawler: Crawler;
}) => {
await crawler.loadPage(page, data);
};

@ -4,13 +4,11 @@ import { logger } from "./util/logger.js";
import { setExitOnRedisError } from "./util/redis.js";
import { Crawler } from "./crawler.js";

let crawler: Crawler | null = null;

let lastSigInt = 0;
let forceTerm = false;

async function handleTerminate(signame: string) {
logger.info(`${signame} received...`);
if (!crawler || !crawler.crawlState) {

@ -53,5 +51,3 @@ process.on("SIGABRT", async () => {

crawler = new Crawler();
crawler.run();

@ -7,199 +7,225 @@ import { KnownDevices as devices } from "puppeteer-core";
|
|||
import yargs, { Options } from "yargs";
|
||||
import { hideBin } from "yargs/helpers";
|
||||
|
||||
import { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES } from "./constants.js";
|
||||
import {
|
||||
BEHAVIOR_LOG_FUNC,
|
||||
WAIT_UNTIL_OPTS,
|
||||
EXTRACT_TEXT_TYPES,
|
||||
} from "./constants.js";
|
||||
import { ScopedSeed } from "./seeds.js";
|
||||
import { interpolateFilename } from "./storage.js";
|
||||
import { screenshotTypes } from "./screenshots.js";
|
||||
import { logger } from "./logger.js";
|
||||
|
||||
|
||||
// ============================================================================
|
||||
class ArgParser {
|
||||
get cliOpts(): { [key: string]: Options } {
|
||||
const coerce = (array: string[]) => {
|
||||
return array.flatMap(v => v.split(",")).filter(x => !!x);
|
||||
return array.flatMap((v) => v.split(",")).filter((x) => !!x);
|
||||
};
|
||||
|
||||
return {
|
||||
"seeds": {
|
||||
seeds: {
|
||||
alias: "url",
|
||||
describe: "The URL to start crawling from",
|
||||
type: "array",
|
||||
default: [],
|
||||
},
|
||||
|
||||
"seedFile": {
|
||||
seedFile: {
|
||||
alias: ["urlFile"],
|
||||
describe: "If set, read a list of seed urls, one per line, from the specified",
|
||||
describe:
|
||||
"If set, read a list of seed urls, one per line, from the specified",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"workers": {
|
||||
workers: {
|
||||
alias: "w",
|
||||
describe: "The number of workers to run in parallel",
|
||||
default: 1,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"crawlId": {
|
||||
crawlId: {
|
||||
alias: "id",
|
||||
describe: "A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to hostname)",
|
||||
describe:
|
||||
"A user provided ID for this crawl or crawl configuration (can also be set via CRAWL_ID env var, defaults to hostname)",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"waitUntil": {
|
||||
describe: "Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
|
||||
waitUntil: {
|
||||
describe:
|
||||
"Puppeteer page.goto() condition to wait for before continuing, can be multiple separated by ','",
|
||||
type: "array",
|
||||
default: ["load", "networkidle2"],
|
||||
choices: WAIT_UNTIL_OPTS,
|
||||
coerce,
|
||||
},
|
||||
|
||||
"depth": {
|
||||
depth: {
|
||||
describe: "The depth of the crawl for all seeds",
|
||||
default: -1,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"extraHops": {
|
||||
extraHops: {
|
||||
describe: "Number of extra 'hops' to follow, beyond the current scope",
|
||||
default: 0,
|
||||
type: "number"
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"pageLimit": {
|
||||
pageLimit: {
|
||||
alias: "limit",
|
||||
describe: "Limit crawl to this number of pages",
|
||||
default: 0,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"maxPageLimit": {
|
||||
describe: "Maximum pages to crawl, overriding pageLimit if both are set",
|
||||
maxPageLimit: {
|
||||
describe:
|
||||
"Maximum pages to crawl, overriding pageLimit if both are set",
|
||||
default: 0,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"pageLoadTimeout": {
|
||||
pageLoadTimeout: {
|
||||
alias: "timeout",
|
||||
describe: "Timeout for each page to load (in seconds)",
|
||||
default: 90,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"scopeType": {
|
||||
describe: "A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes",
|
||||
scopeType: {
|
||||
describe:
|
||||
"A predefined scope of the crawl. For more customization, use 'custom' and set scopeIncludeRx regexes",
|
||||
type: "string",
|
||||
choices: ["page", "page-spa", "prefix", "host", "domain", "any", "custom"]
|
||||
choices: [
|
||||
"page",
|
||||
"page-spa",
|
||||
"prefix",
|
||||
"host",
|
||||
"domain",
|
||||
"any",
|
||||
"custom",
|
||||
],
|
||||
},
|
||||
|
||||
"scopeIncludeRx": {
|
||||
scopeIncludeRx: {
|
||||
alias: "include",
|
||||
describe: "Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
|
||||
describe:
|
||||
"Regex of page URLs that should be included in the crawl (defaults to the immediate directory of URL)",
|
||||
},
|
||||
|
||||
"scopeExcludeRx": {
|
||||
scopeExcludeRx: {
|
||||
alias: "exclude",
|
||||
describe: "Regex of page URLs that should be excluded from the crawl."
|
||||
describe: "Regex of page URLs that should be excluded from the crawl.",
|
||||
},
|
||||
|
||||
"allowHashUrls": {
|
||||
describe: "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
|
||||
allowHashUrls: {
|
||||
describe:
|
||||
"Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
|
||||
},
|
||||
|
||||
"blockRules": {
|
||||
describe: "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
|
||||
blockRules: {
|
||||
describe:
|
||||
"Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
|
||||
type: "array",
|
||||
default: [],
|
||||
},
|
||||
|
||||
"blockMessage": {
|
||||
describe: "If specified, when a URL is blocked, a record with this error message is added instead",
|
||||
blockMessage: {
|
||||
describe:
|
||||
"If specified, when a URL is blocked, a record with this error message is added instead",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"blockAds": {
|
||||
blockAds: {
|
||||
alias: "blockads",
|
||||
describe: "If set, block advertisements from being loaded (based on Stephen Black's blocklist)",
|
||||
describe:
|
||||
"If set, block advertisements from being loaded (based on Stephen Black's blocklist)",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"adBlockMessage": {
|
||||
describe: "If specified, when an ad is blocked, a record with this error message is added instead",
|
||||
adBlockMessage: {
|
||||
describe:
|
||||
"If specified, when an ad is blocked, a record with this error message is added instead",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"collection": {
|
||||
collection: {
|
||||
alias: "c",
|
||||
describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
|
||||
describe:
|
||||
"Collection name to crawl to (replay will be accessible under this name in pywb preview)",
|
||||
type: "string",
|
||||
default: "crawl-@ts"
|
||||
default: "crawl-@ts",
|
||||
},
|
||||
|
||||
"headless": {
|
||||
headless: {
|
||||
describe: "Run in headless mode, otherwise start xvfb",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"driver": {
|
||||
driver: {
|
||||
describe: "JS driver for the crawler",
|
||||
type: "string",
|
||||
default: "./defaultDriver.js",
|
||||
},
|
||||
|
||||
"generateCDX": {
|
||||
generateCDX: {
|
||||
alias: ["generatecdx", "generateCdx"],
|
||||
describe: "If set, generate index (CDXJ) for use with pywb after crawl is done",
|
||||
describe:
|
||||
"If set, generate index (CDXJ) for use with pywb after crawl is done",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"combineWARC": {
|
||||
combineWARC: {
|
||||
alias: ["combinewarc", "combineWarc"],
|
||||
describe: "If set, combine the warcs",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"rolloverSize": {
|
||||
rolloverSize: {
|
||||
describe: "If set, declare the rollover size",
|
||||
default: 1000000000,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"generateWACZ": {
|
||||
generateWACZ: {
|
||||
alias: ["generatewacz", "generateWacz"],
|
||||
describe: "If set, generate wacz",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"logging": {
|
||||
describe: "Logging options for crawler, can include: stats (enabled by default), jserrors, debug",
|
||||
logging: {
|
||||
describe:
|
||||
"Logging options for crawler, can include: stats (enabled by default), jserrors, debug",
|
||||
type: "array",
|
||||
default: ["stats"],
|
||||
coerce,
|
||||
},
|
||||
|
||||
"logLevel": {
|
||||
logLevel: {
|
||||
describe: "Comma-separated list of log levels to include in logs",
|
||||
type: "array",
|
||||
default: [],
|
||||
coerce,
|
||||
},
|
||||
|
||||
"context": {
|
||||
context: {
|
||||
describe: "Comma-separated list of contexts to include in logs",
|
||||
type: "array",
|
||||
default: [],
|
||||
coerce,
|
||||
},
|
||||
|
||||
"text": {
|
||||
describe: "Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
|
||||
text: {
|
||||
describe:
|
||||
"Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
|
||||
type: "array",
|
||||
choices: EXTRACT_TEXT_TYPES,
|
||||
coerce: (array) => {
|
||||
|
@ -211,45 +237,51 @@ class ArgParser {
|
|||
return [];
|
||||
}
|
||||
return coerce(array);
|
||||
}
|
||||
},
|
||||
},
|
||||
|
||||
"cwd": {
|
||||
describe: "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
|
||||
cwd: {
|
||||
describe:
|
||||
"Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()",
|
||||
type: "string",
|
||||
default: process.cwd(),
|
||||
},
|
||||
|
||||
"mobileDevice": {
|
||||
describe: "Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
|
||||
mobileDevice: {
|
||||
describe:
|
||||
"Emulate mobile device by name from: https://github.com/puppeteer/puppeteer/blob/main/src/common/DeviceDescriptors.ts",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"userAgent": {
|
||||
userAgent: {
|
||||
describe: "Override user-agent with specified string",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"userAgentSuffix": {
|
||||
describe: "Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
|
||||
userAgentSuffix: {
|
||||
describe:
|
||||
"Append suffix to existing browser user-agent (ex: +MyCrawler, info@example.com)",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"useSitemap": {
|
||||
useSitemap: {
|
||||
alias: "sitemap",
|
||||
describe: "If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
|
||||
describe:
|
||||
"If enabled, check for sitemaps at /sitemap.xml, or custom URL if URL is specified",
|
||||
},
|
||||
|
||||
"sitemapFromDate": {
|
||||
sitemapFromDate: {
|
||||
alias: "sitemapFrom",
|
||||
describe: "If set, filter URLs from sitemaps to those greater than or equal to provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
|
||||
describe:
|
||||
"If set, filter URLs from sitemaps to those greater than or equal to provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
|
||||
},
|
||||
|
||||
"statsFilename": {
|
||||
describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
|
||||
statsFilename: {
|
||||
describe:
|
||||
"If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)",
|
||||
},
|
||||
|
||||
"behaviors": {
|
||||
behaviors: {
|
||||
describe: "Which background behaviors to enable on each page",
|
||||
type: "array",
|
||||
default: ["autoplay", "autofetch", "autoscroll", "siteSpecific"],
|
||||
|
@ -257,179 +289,204 @@ class ArgParser {
|
|||
coerce,
|
||||
},
|
||||
|
||||
"behaviorTimeout": {
|
||||
describe: "If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish.",
|
||||
behaviorTimeout: {
|
||||
describe:
|
||||
"If >0, timeout (in seconds) for in-page behavior will run on each page. If 0, a behavior can run until finish.",
|
||||
default: 90,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"pageExtraDelay": {
|
||||
pageExtraDelay: {
|
||||
alias: "delay",
|
||||
describe: "If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page",
|
||||
describe:
|
||||
"If >0, amount of time to sleep (in seconds) after behaviors before moving on to next page",
|
||||
default: 0,
|
||||
type: "number",
|
||||
},
|
||||
|
||||
"dedupPolicy": {
|
||||
dedupPolicy: {
|
||||
describe: "Deduplication policy",
|
||||
default: "skip",
|
||||
type: "string",
|
||||
choices: ["skip", "revisit", "keep"],
|
||||
},
|
||||
|
||||
"profile": {
|
||||
describe: "Path to tar.gz file which will be extracted and used as the browser profile",
|
||||
profile: {
|
||||
describe:
|
||||
"Path to tar.gz file which will be extracted and used as the browser profile",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"screenshot": {
|
||||
describe: "Screenshot options for crawler, can include: view, thumbnail, fullPage",
|
||||
screenshot: {
|
||||
describe:
|
||||
"Screenshot options for crawler, can include: view, thumbnail, fullPage",
|
||||
type: "array",
|
||||
default: [],
|
||||
choices: Array.from(Object.keys(screenshotTypes)),
|
||||
coerce,
|
||||
},
|
||||
|
||||
"screencastPort": {
|
||||
describe: "If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
|
||||
type: "number",
|
||||
default: 0
|
||||
},
|
||||
|
||||
"screencastRedis": {
|
||||
describe: "If set, will use the state store redis pubsub for screencasting. Requires --redisStoreUrl to be set",
|
||||
type: "boolean",
|
||||
default: false
|
||||
},
|
||||
|
||||
"warcInfo": {
|
||||
alias: ["warcinfo"],
|
||||
describe: "Optional fields added to the warcinfo record in combined WARCs",
|
||||
//type: "object"
|
||||
},
|
||||
|
||||
"redisStoreUrl": {
|
||||
describe: "If set, url for remote redis server to store state. Otherwise, using in-memory store",
|
||||
type: "string",
|
||||
default: "redis://localhost:6379/0"
|
||||
},
|
||||
|
||||
"saveState": {
|
||||
describe: "If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
|
||||
type: "string",
|
||||
default: "partial",
|
||||
choices: ["never", "partial", "always"]
|
||||
},
|
||||
|
||||
"saveStateInterval": {
|
||||
describe: "If save state is set to 'always', also save state during the crawl at this interval (in seconds)",
|
||||
type: "number",
|
||||
default: 300,
|
||||
},
|
||||
|
||||
"saveStateHistory": {
|
||||
describe: "Number of save states to keep during the duration of a crawl",
|
||||
type: "number",
|
||||
default: 5,
|
||||
},
|
||||
|
||||
"sizeLimit": {
|
||||
describe: "If set, save state and exit if size limit exceeds this value",
|
||||
screencastPort: {
|
||||
describe:
|
||||
"If set to a non-zero value, starts an HTTP server with screencast accessible on this port",
|
||||
type: "number",
|
||||
default: 0,
|
||||
},
|
||||
|
||||
"diskUtilization": {
|
||||
describe: "If set, save state and exit if disk utilization exceeds this percentage value",
|
||||
screencastRedis: {
|
||||
describe:
|
||||
"If set, will use the state store redis pubsub for screencasting. Requires --redisStoreUrl to be set",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
warcInfo: {
|
||||
alias: ["warcinfo"],
|
||||
describe:
|
||||
"Optional fields added to the warcinfo record in combined WARCs",
|
||||
//type: "object"
|
||||
},
|
||||
|
||||
redisStoreUrl: {
|
||||
describe:
|
||||
"If set, url for remote redis server to store state. Otherwise, using in-memory store",
|
||||
type: "string",
|
||||
default: "redis://localhost:6379/0",
|
||||
},
|
||||
|
||||
saveState: {
|
||||
describe:
|
||||
"If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
|
||||
type: "string",
|
||||
default: "partial",
|
||||
choices: ["never", "partial", "always"],
|
||||
},
|
||||
|
||||
saveStateInterval: {
|
||||
describe:
|
||||
"If save state is set to 'always', also save state during the crawl at this interval (in seconds)",
|
||||
type: "number",
|
||||
default: 300,
|
||||
},
|
||||
|
||||
saveStateHistory: {
|
||||
describe:
|
||||
"Number of save states to keep during the duration of a crawl",
|
||||
type: "number",
|
||||
default: 5,
|
||||
},
|
||||
|
||||
sizeLimit: {
|
||||
describe:
|
||||
"If set, save state and exit if size limit exceeds this value",
|
||||
type: "number",
|
||||
default: 0,
|
||||
},
|
||||
|
||||
diskUtilization: {
|
||||
describe:
|
||||
"If set, save state and exit if disk utilization exceeds this percentage value",
|
||||
type: "number",
|
||||
default: 90,
|
||||
},
|
||||
|
||||
"timeLimit": {
|
||||
timeLimit: {
|
||||
describe: "If set, save state and exit after time limit, in seconds",
|
||||
type: "number",
|
||||
default: 0,
|
||||
},
|
||||
|
||||
"healthCheckPort": {
|
||||
healthCheckPort: {
|
||||
describe: "port to run healthcheck on",
|
||||
type: "number",
|
||||
default: 0,
|
||||
},
|
||||
|
||||
"overwrite": {
|
||||
describe: "overwrite current crawl data: if set, existing collection directory will be deleted before crawl is started",
|
||||
overwrite: {
|
||||
describe:
|
||||
"overwrite current crawl data: if set, existing collection directory will be deleted before crawl is started",
|
||||
type: "boolean",
|
||||
default: false
|
||||
default: false,
|
||||
},
|
||||
|
||||
"waitOnDone": {
|
||||
describe: "if set, wait for interrupt signal when finished instead of exiting",
|
||||
waitOnDone: {
|
||||
describe:
|
||||
"if set, wait for interrupt signal when finished instead of exiting",
|
||||
type: "boolean",
|
||||
default: false
|
||||
default: false,
|
||||
},
|
||||
|
||||
"restartsOnError": {
|
||||
describe: "if set, assume will be restarted if interrupted, don't run post-crawl processes on interrupt",
|
||||
restartsOnError: {
|
||||
describe:
|
||||
"if set, assume will be restarted if interrupted, don't run post-crawl processes on interrupt",
|
||||
type: "boolean",
|
||||
default: false
|
||||
default: false,
|
||||
},
|
||||
|
||||
"netIdleWait": {
|
||||
describe: "if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
|
||||
netIdleWait: {
|
||||
describe:
|
||||
"if set, wait for network idle after page load and after behaviors are done (in seconds). if -1 (default), determine based on scope",
|
||||
type: "number",
|
||||
default: -1
|
||||
default: -1,
|
||||
},
|
||||
|
||||
"lang": {
|
||||
describe: "if set, sets the language used by the browser, should be ISO 639 language[-country] code",
|
||||
type: "string"
|
||||
lang: {
|
||||
describe:
|
||||
"if set, sets the language used by the browser, should be ISO 639 language[-country] code",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"title": {
|
||||
describe: "If set, write supplied title into WACZ datapackage.json metadata",
|
||||
type: "string"
|
||||
title: {
|
||||
describe:
|
||||
"If set, write supplied title into WACZ datapackage.json metadata",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"description": {
|
||||
description: {
|
||||
alias: ["desc"],
|
||||
describe: "If set, write supplied description into WACZ datapackage.json metadata",
|
||||
type: "string"
|
||||
describe:
|
||||
"If set, write supplied description into WACZ datapackage.json metadata",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"originOverride": {
|
||||
describe: "if set, will redirect requests from each origin in key to origin in the value, eg. --originOverride https://host:port=http://alt-host:alt-port",
|
||||
originOverride: {
|
||||
describe:
|
||||
"if set, will redirect requests from each origin in key to origin in the value, eg. --originOverride https://host:port=http://alt-host:alt-port",
|
||||
type: "array",
|
||||
default: [],
|
||||
},
|
||||
|
||||
"logErrorsToRedis": {
|
||||
logErrorsToRedis: {
|
||||
describe: "If set, write error messages to redis",
|
||||
type: "boolean",
|
||||
default: false,
|
||||
},
|
||||
|
||||
"failOnFailedSeed": {
|
||||
describe: "If set, crawler will fail with exit code 1 if any seed fails",
|
||||
failOnFailedSeed: {
|
||||
describe:
|
||||
"If set, crawler will fail with exit code 1 if any seed fails",
|
||||
type: "boolean",
|
||||
default: false
|
||||
default: false,
|
||||
},
|
||||
|
||||
"failOnFailedLimit": {
|
||||
describe: "If set, save state and exit if number of failed pages exceeds this value",
|
||||
failOnFailedLimit: {
|
||||
describe:
|
||||
"If set, save state and exit if number of failed pages exceeds this value",
|
||||
type: "number",
|
||||
default: 0,
|
||||
},
|
||||
|
||||
"customBehaviors": {
|
||||
describe: "injects a custom behavior file or set of behavior files in a directory",
|
||||
type: "string"
|
||||
customBehaviors: {
|
||||
describe:
|
||||
"injects a custom behavior file or set of behavior files in a directory",
|
||||
type: "string",
|
||||
},
|
||||
|
||||
"debugAccessRedis": {
|
||||
describe: "if set, runs internal redis without protected mode to allow external access (for debugging)",
|
||||
debugAccessRedis: {
|
||||
describe:
|
||||
"if set, runs internal redis without protected mode to allow external access (for debugging)",
|
||||
type: "boolean",
|
||||
}
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -445,16 +502,19 @@ class ArgParser {
|
|||
const parsed = yargs(hideBin(argv))
|
||||
.usage("crawler [options]")
|
||||
.option(this.cliOpts)
|
||||
.config("config", "Path to YAML config file", (configPath : string | number) => {
|
||||
.config(
|
||||
"config",
|
||||
"Path to YAML config file",
|
||||
(configPath: string | number) => {
|
||||
if (configPath === "/crawls/stdin") {
|
||||
configPath = process.stdin.fd;
|
||||
}
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
origConfig = yaml.load(fs.readFileSync(configPath, "utf8")) as any;
|
||||
return origConfig;
|
||||
})
|
||||
.check((argv) => this.validateArgs(argv))
|
||||
.argv;
|
||||
},
|
||||
)
|
||||
.check((argv) => this.validateArgs(argv)).argv;
|
||||
|
||||
return { parsed, origConfig };
|
||||
}
|
||||
|
@ -463,7 +523,7 @@ class ArgParser {
|
|||
// Split process.env.CRAWL_ARGS on spaces but retaining spaces within double quotes
|
||||
const regex = /"[^"]+"|[^\s]+/g;
|
||||
const res = crawlArgs.match(regex);
|
||||
return res ? res.map(e => e.replace(/"(.+)"/, "$1")) : [];
|
||||
return res ? res.map((e) => e.replace(/"(.+)"/, "$1")) : [];
|
||||
}
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
|
@ -473,12 +533,14 @@ class ArgParser {
|
|||
|
||||
// Check that the collection name is valid.
|
||||
if (argv.collection.search(/^[\w][\w-]*$/) === -1) {
|
||||
logger.fatal(`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`);
|
||||
logger.fatal(
|
||||
`\n${argv.collection} is an invalid collection name. Please supply a collection name only using alphanumeric characters and the following characters [_ - ]\n`,
|
||||
);
|
||||
}
|
||||
|
||||
// background behaviors to apply
|
||||
const behaviorOpts: { [key: string]: string | boolean } = {};
|
||||
argv.behaviors.forEach((x: string) => behaviorOpts[x] = true);
|
||||
argv.behaviors.forEach((x: string) => (behaviorOpts[x] = true));
|
||||
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
|
||||
argv.behaviorOpts = JSON.stringify(behaviorOpts);
|
||||
|
||||
|
@ -486,7 +548,9 @@ class ArgParser {
|
|||
|
||||
if (argv.mobileDevice) {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
argv.emulateDevice = (devices as Record<string, any>)[argv.mobileDevice.replace("-", " ")];
|
||||
argv.emulateDevice = (devices as Record<string, any>)[
|
||||
argv.mobileDevice.replace("-", " ")
|
||||
];
|
||||
if (!argv.emulateDevice) {
|
||||
logger.fatal("Unknown device: " + argv.mobileDevice);
|
||||
}
|
||||
|
@ -498,7 +562,7 @@ class ArgParser {
|
|||
const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");
|
||||
const urlSeedFileList = urlSeedFile.split("\n");
|
||||
|
||||
if (typeof(argv.seeds) === "string") {
|
||||
if (typeof argv.seeds === "string") {
|
||||
argv.seeds = [argv.seeds];
|
||||
}
|
||||
|
||||
|
@ -530,7 +594,7 @@ class ArgParser {
|
|||
argv.scopedSeeds = [];
|
||||
|
||||
for (let seed of argv.seeds) {
|
||||
if (typeof(seed) === "string") {
|
||||
if (typeof seed === "string") {
|
||||
seed = { url: seed };
|
||||
}
|
||||
|
||||
|
@ -552,7 +616,7 @@ class ArgParser {
|
|||
argv.statsFilename = path.resolve(argv.cwd, argv.statsFilename);
|
||||
}
|
||||
|
||||
if ((argv.diskUtilization < 0 || argv.diskUtilization > 99)) {
|
||||
if (argv.diskUtilization < 0 || argv.diskUtilization > 99) {
|
||||
argv.diskUtilization = 90;
|
||||
}
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@ const BlockState = {
BLOCK_PAGE_NAV: "page",
BLOCK_IFRAME_NAV: "iframe",
BLOCK_OTHER: "resource",
BLOCK_AD: "advertisement"
BLOCK_AD: "advertisement",
};

type BlockRuleDecl = {

@ -21,30 +21,30 @@ type BlockRuleDecl = {
frameTextMatch?: string;
inFrameUrl?: string;
type?: string;
}

};

// ===========================================================================
class BlockRule
{
class BlockRule {
type: string;
url: RegExp | null;
frameTextMatch?: RegExp | null;
inFrameUrl?: RegExp | null;

constructor(data: string | BlockRuleDecl) {
if (typeof(data) === "string") {
if (typeof data === "string") {
this.url = new RegExp(data);
this.type = "block";
} else {
this.url = data.url ? new RegExp(data.url) : null;
this.frameTextMatch = data.frameTextMatch ? new RegExp(data.frameTextMatch) : null;
this.frameTextMatch = data.frameTextMatch
? new RegExp(data.frameTextMatch)
: null;
this.inFrameUrl = data.inFrameUrl ? new RegExp(data.inFrameUrl) : null;
this.type = data.type || "block";
}

if (!RULE_TYPES.includes(this.type)) {
logger.fatal("Rule \"type\" must be: " + RULE_TYPES.join(", "));
logger.fatal('Rule "type" must be: ' + RULE_TYPES.join(", "));
}
}

@ -59,16 +59,18 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
}
}

// ===========================================================================
export class BlockRules
{
export class BlockRules {
rules: BlockRule[];
blockPutUrl: string;
blockErrMsg: string;
blockedUrlSet = new Set();

constructor(blockRules: BlockRuleDecl[], blockPutUrl: string, blockErrMsg: string) {
constructor(
blockRules: BlockRuleDecl[],
blockPutUrl: string,
blockErrMsg: string,
) {
this.rules = [];
this.blockPutUrl = blockPutUrl;
this.blockErrMsg = blockErrMsg;

@ -93,7 +95,11 @@ export class BlockRules
try {
await this.handleRequest(request, logDetails);
} catch (e) {
logger.warn("Error handling request", {...errJSON(e), ...logDetails}, "blocking");
logger.warn(
"Error handling request",
{ ...errJSON(e), ...logDetails },
"blocking",
);
}
};
await browser.interceptRequest(page, onRequest);

@ -113,14 +119,22 @@ export class BlockRules
} else {
await request.abort("blockedbyclient", 1);
}

} catch (e) {
logger.debug(`Block: (${blockState}) Failed On: ${url}`, {...errJSON(e), ...logDetails}, "blocking");
logger.debug(
`Block: (${blockState}) Failed On: ${url}`,
{ ...errJSON(e), ...logDetails },
"blocking",
);
}
}

async shouldBlock(
request: HTTPRequest,
url: string,
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
async shouldBlock(request: HTTPRequest, url: string, logDetails: Record<string, any>) {
logDetails: Record<string, any>,
) {
if (!url.startsWith("http:") && !url.startsWith("https:")) {
return BlockState.ALLOW;
}

@ -162,14 +176,29 @@ export class BlockRules
}

for (const rule of this.rules) {
const {done, block} = await this.ruleCheck(rule, request, url, frameUrl, isNavReq, logDetails);
const { done, block } = await this.ruleCheck(
rule,
request,
url,
frameUrl,
isNavReq,
logDetails,
);

if (block) {
if (blockState === BlockState.BLOCK_PAGE_NAV) {
logger.warn("Block rule match for page request ignored, set --exclude to block full pages", {url, ...logDetails}, "blocking");
logger.warn(
"Block rule match for page request ignored, set --exclude to block full pages",
{ url, ...logDetails },
"blocking",
);
return BlockState.ALLOW;
}
logger.debug("URL Blocked in iframe", {url, frameUrl, ...logDetails}, "blocking");
logger.debug(
"URL Blocked in iframe",
{ url, frameUrl, ...logDetails },
"blocking",
);
await this.recordBlockMsg(url);
return blockState;
}

@ -181,19 +210,27 @@ export class BlockRules
return BlockState.ALLOW;
}

async ruleCheck(
rule: BlockRule,
request: HTTPRequest,
reqUrl: string,
frameUrl: string,
isNavReq: boolean,
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
async ruleCheck(rule: BlockRule, request: HTTPRequest, reqUrl: string, frameUrl: string, isNavReq: boolean, logDetails: Record<string, any>) {
logDetails: Record<string, any>,
) {
const { url, inFrameUrl, frameTextMatch } = rule;

const type = rule.type || "block";
const allowOnly = (type === "allowOnly");
const allowOnly = type === "allowOnly";

// not a frame match, skip rule
if (inFrameUrl && !frameUrl.match(inFrameUrl)) {
return { block: false, done: false };
}

const urlMatched = (url && reqUrl.match(url));
const urlMatched = url && reqUrl.match(url);

// if frame text-based rule: if url matched and a frame request
// frame text-based match: only applies to nav requests, never block otherwise

@ -202,8 +239,19 @@ export class BlockRules
return { block: false, done: false };
}

const block = await this.isTextMatch(request, reqUrl, frameTextMatch, logDetails) ? !allowOnly : allowOnly;
logger.debug("URL Conditional rule in iframe", {...logDetails, url, rule: block ? "BLOCKED" : "ALLOWED", frameUrl}, "blocking");
const block = (await this.isTextMatch(
request,
reqUrl,
frameTextMatch,
logDetails,
))
? !allowOnly
: allowOnly;
logger.debug(
"URL Conditional rule in iframe",
{ ...logDetails, url, rule: block ? "BLOCKED" : "ALLOWED", frameUrl },
"blocking",
);
return { block, done: true };
}

@ -212,16 +260,25 @@ export class BlockRules
return { block, done: false };
}

async isTextMatch(
request: HTTPRequest,
reqUrl: string,
frameTextMatch: RegExp,
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
async isTextMatch(request: HTTPRequest, reqUrl: string, frameTextMatch: RegExp, logDetails: Record<string, any>) {
logDetails: Record<string, any>,
) {
try {
const res = await fetch(reqUrl);
const text = await res.text();

return !!text.match(frameTextMatch);

} catch (e) {
logger.debug("Error determining text match", {...errJSON(e), ...logDetails}, "blocking");
logger.debug(
"Error determining text match",
{ ...errJSON(e), ...logDetails },
"blocking",
);
}
}

@ -239,19 +296,29 @@ export class BlockRules
const body = this.blockErrMsg;
const putUrl = new URL(this.blockPutUrl);
putUrl.searchParams.set("url", url);
await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
await fetch(putUrl.href, {
method: "PUT",
headers: { "Content-Type": "text/html" },
body,
});
}
}

// ===========================================================================
export class AdBlockRules extends BlockRules
{
export class AdBlockRules extends BlockRules {
adhosts: string[];

constructor(blockPutUrl: string, blockErrMsg: string, adhostsFilePath = "../../ad-hosts.json") {
constructor(
blockPutUrl: string,
blockErrMsg: string,
adhostsFilePath = "../../ad-hosts.json",
) {
super([], blockPutUrl, blockErrMsg);
this.adhosts = JSON.parse(fs.readFileSync(new URL(adhostsFilePath, import.meta.url), {"encoding": "utf-8"}));
this.adhosts = JSON.parse(
fs.readFileSync(new URL(adhostsFilePath, import.meta.url), {
encoding: "utf-8",
}),
);
}

isAdUrl(url: string) {

@ -260,10 +327,19 @@ export class AdBlockRules extends BlockRules
return domain && this.adhosts.includes(domain);
}

async shouldBlock(
request: HTTPRequest,
url: string,
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
async shouldBlock(request: HTTPRequest, url: string, logDetails: Record<string, any>) {
logDetails: Record<string, any>,
) {
if (this.isAdUrl(url)) {
logger.debug("URL blocked for being an ad", {url, ...logDetails}, "blocking");
logger.debug(
"URL blocked for being an ad",
{ url, ...logDetails },
"blocking",
);
await this.recordBlockMsg(url);
return BlockState.BLOCK_AD;
}

@ -9,28 +9,32 @@ import path from "path";
|
|||
import { logger } from "./logger.js";
|
||||
import { initStorage } from "./storage.js";
|
||||
|
||||
import puppeteer, { Frame, HTTPRequest, Page, PuppeteerLaunchOptions, Viewport } from "puppeteer-core";
|
||||
import puppeteer, {
|
||||
Frame,
|
||||
HTTPRequest,
|
||||
Page,
|
||||
PuppeteerLaunchOptions,
|
||||
Viewport,
|
||||
} from "puppeteer-core";
|
||||
import { CDPSession, Target, Browser as PptrBrowser } from "puppeteer-core";
|
||||
|
||||
type LaunchOpts = {
|
||||
profileUrl: string;
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
chromeOptions: Record<string, any>
|
||||
chromeOptions: Record<string, any>;
|
||||
signals: boolean;
|
||||
headless: boolean;
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
emulateDevice?: Record<string, any>
|
||||
emulateDevice?: Record<string, any>;
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
ondisconnect?: ((err: any) => NonNullable<unknown>) | null
|
||||
ondisconnect?: ((err: any) => NonNullable<unknown>) | null;
|
||||
};
|
||||
|
||||
|
||||
// ==================================================================
|
||||
export class Browser
|
||||
{
|
||||
export class Browser {
|
||||
profileDir: string;
|
||||
customProfile = false;
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
|
@ -48,7 +52,15 @@ export class Browser
|
|||
this.profileDir = fs.mkdtempSync(path.join(os.tmpdir(), "profile-"));
|
||||
}
|
||||
|
||||
async launch({profileUrl, chromeOptions, signals = false, headless = false, emulateDevice = {}, ondisconnect = null} : LaunchOpts) { if (this.isLaunched()) {
|
||||
async launch({
|
||||
profileUrl,
|
||||
chromeOptions,
|
||||
signals = false,
|
||||
headless = false,
|
||||
emulateDevice = {},
|
||||
ondisconnect = null,
|
||||
}: LaunchOpts) {
|
||||
if (this.isLaunched()) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -81,14 +93,17 @@ export class Browser
|
|||
|
||||
defaultViewport,
|
||||
waitForInitialPage: false,
|
||||
userDataDir: this.profileDir
|
||||
userDataDir: this.profileDir,
|
||||
};
|
||||
|
||||
await this._init(launchOpts, ondisconnect);
|
||||
}
|
||||
|
||||
async setupPage({page} : {page: Page, cdp: CDPSession}) {
|
||||
await this.addInitScript(page, "Object.defineProperty(navigator, \"webdriver\", {value: false});");
|
||||
async setupPage({ page }: { page: Page; cdp: CDPSession }) {
|
||||
await this.addInitScript(
|
||||
page,
|
||||
'Object.defineProperty(navigator, "webdriver", {value: false});',
|
||||
);
|
||||
|
||||
if (this.customProfile) {
|
||||
logger.info("Disabling Service Workers for profile", {}, "browser");
|
||||
|
@ -100,17 +115,23 @@ export class Browser
|
|||
async loadProfile(profileFilename: string): Promise<boolean> {
|
||||
const targetFilename = "/tmp/profile.tar.gz";
|
||||
|
||||
if (profileFilename &&
|
||||
(profileFilename.startsWith("http:") || profileFilename.startsWith("https:"))) {
|
||||
|
||||
logger.info(`Downloading ${profileFilename} to ${targetFilename}`, {}, "browserProfile");
|
||||
if (
|
||||
profileFilename &&
|
||||
(profileFilename.startsWith("http:") ||
|
||||
profileFilename.startsWith("https:"))
|
||||
) {
|
||||
logger.info(
|
||||
`Downloading ${profileFilename} to ${targetFilename}`,
|
||||
{},
|
||||
"browserProfile",
|
||||
);
|
||||
|
||||
const resp = await fetch(profileFilename);
|
||||
await pipeline(
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
Readable.fromWeb(resp.body as any),
|
||||
fs.createWriteStream(targetFilename)
|
||||
fs.createWriteStream(targetFilename),
|
||||
);
|
||||
|
||||
profileFilename = targetFilename;
|
||||
|
@ -118,7 +139,9 @@ export class Browser
|
|||
const storage = initStorage();
|
||||
|
||||
if (!storage) {
|
||||
logger.fatal("Profile specified relative to s3 storage, but no S3 storage defined");
|
||||
logger.fatal(
|
||||
"Profile specified relative to s3 storage, but no S3 storage defined",
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -129,7 +152,9 @@ export class Browser
|
|||
|
||||
if (profileFilename) {
|
||||
try {
|
||||
child_process.execSync("tar xvfz " + profileFilename, {cwd: this.profileDir});
|
||||
child_process.execSync("tar xvfz " + profileFilename, {
|
||||
cwd: this.profileDir,
|
||||
});
|
||||
return true;
|
||||
} catch (e) {
|
||||
logger.error(`Profile filename ${profileFilename} not a valid tar.gz`);
|
||||
|
@ -140,7 +165,9 @@ export class Browser
|
|||
}
|
||||
|
||||
saveProfile(profileFilename: string) {
|
||||
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {cwd: this.profileDir});
|
||||
child_process.execFileSync("tar", ["cvfz", profileFilename, "./"], {
|
||||
cwd: this.profileDir,
|
||||
});
|
||||
}
|
||||
|
||||
chromeArgs({ proxy = true, userAgent = null, extraArgs = [] } = {}) {
|
||||
|
@ -162,7 +189,9 @@ export class Browser
|
|||
|
||||
if (proxy) {
|
||||
args.push("--ignore-certificate-errors");
|
||||
args.push(`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`);
|
||||
args.push(
|
||||
`--proxy-server=http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}`,
|
||||
);
|
||||
}
|
||||
|
||||
return args;
|
||||
|
@ -174,7 +203,9 @@ export class Browser
|
|||
try {
|
||||
const browser = this.getBrowserExe();
|
||||
if (browser) {
|
||||
version = child_process.execFileSync(browser, ["--version"], {encoding: "utf8"});
|
||||
version = child_process.execFileSync(browser, ["--version"], {
|
||||
encoding: "utf8",
|
||||
});
|
||||
const match = version && version.match(/[\d.]+/);
|
||||
if (match) {
|
||||
version = match[0];
|
||||
|
@ -188,7 +219,11 @@ export class Browser
|
|||
}
|
||||
|
||||
getBrowserExe() {
|
||||
const files = [process.env.BROWSER_BIN, "/usr/bin/google-chrome", "/usr/bin/chromium-browser"];
|
||||
const files = [
|
||||
process.env.BROWSER_BIN,
|
||||
"/usr/bin/google-chrome",
|
||||
"/usr/bin/chromium-browser",
|
||||
];
|
||||
for (const file of files) {
|
||||
if (file && fs.existsSync(file)) {
|
||||
return file;
|
||||
|
@ -196,14 +231,25 @@ export class Browser
|
|||
}
|
||||
}
|
||||
|
||||
async evaluateWithCLI_(cdp: CDPSession, frame: Frame, cdpContextId: number, funcString: string, logData: Record<string, string>, contextName: string) {
|
||||
async evaluateWithCLI_(
|
||||
cdp: CDPSession,
|
||||
frame: Frame,
|
||||
cdpContextId: number,
|
||||
funcString: string,
|
||||
logData: Record<string, string>,
|
||||
contextName: string,
|
||||
) {
|
||||
const frameUrl = frame.url();
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
let details: Record<string, any> = { frameUrl, ...logData };
|
||||
|
||||
if (!frameUrl || frame.isDetached()) {
|
||||
logger.info("Run Script Skipped, frame no longer attached or has no URL", details, contextName);
|
||||
logger.info(
|
||||
"Run Script Skipped, frame no longer attached or has no URL",
|
||||
details,
|
||||
contextName,
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -213,8 +259,7 @@ export class Browser
|
|||
//const contextId = context._contextId;
|
||||
const expression = funcString + "\n//# sourceURL=__evaluation_script__";
|
||||
|
||||
const { exceptionDetails, result } = await cdp
|
||||
.send("Runtime.evaluate", {
|
||||
const { exceptionDetails, result } = await cdp.send("Runtime.evaluate", {
|
||||
expression,
|
||||
contextId: cdpContextId,
|
||||
returnByValue: true,
|
||||
|
@ -225,7 +270,11 @@ export class Browser
|
|||
|
||||
if (exceptionDetails) {
|
||||
if (exceptionDetails.stackTrace) {
|
||||
details = {...exceptionDetails.stackTrace, text: exceptionDetails.text, ...details};
|
||||
details = {
|
||||
...exceptionDetails.stackTrace,
|
||||
text: exceptionDetails.text,
|
||||
...details,
|
||||
};
|
||||
}
|
||||
logger.error("Run Script Failed", details, contextName);
|
||||
} else {
|
||||
|
@ -256,8 +305,11 @@ export class Browser
|
|||
return page.evaluateOnNewDocument(script);
|
||||
}
|
||||
|
||||
async _init(
|
||||
launchOpts: PuppeteerLaunchOptions,
|
||||
// eslint-disable-next-line @typescript-eslint/ban-types
|
||||
async _init(launchOpts: PuppeteerLaunchOptions, ondisconnect : Function | null = null) {
|
||||
ondisconnect: Function | null = null,
|
||||
) {
|
||||
this.browser = await puppeteer.launch(launchOpts);
|
||||
|
||||
const target = this.browser.target();
|
||||
|
@ -274,9 +326,10 @@ export class Browser
|
|||
});
|
||||
}
|
||||
|
||||
async newWindowPageWithCDP() : Promise<{cdp: CDPSession, page: Page}> {
|
||||
async newWindowPageWithCDP(): Promise<{ cdp: CDPSession; page: Page }> {
|
||||
// unique url to detect new pages
|
||||
const startPage = "about:blank?_browsertrix" + Math.random().toString(36).slice(2);
|
||||
const startPage =
|
||||
"about:blank?_browsertrix" + Math.random().toString(36).slice(2);
|
||||
|
||||
const p = new Promise<Target>((resolve) => {
|
||||
const listener = (target: Target) => {
|
||||
|
@ -298,7 +351,10 @@ export class Browser
|
|||
}
|
||||
|
||||
try {
|
||||
await this.firstCDP.send("Target.createTarget", {url: startPage, newWindow: true});
|
||||
await this.firstCDP.send("Target.createTarget", {
|
||||
url: startPage,
|
||||
newWindow: true,
|
||||
});
|
||||
} catch (e) {
|
||||
if (!this.browser) {
|
||||
throw e;
|
||||
|
@ -307,7 +363,10 @@ export class Browser
|
|||
|
||||
this.firstCDP = await target.createCDPSession();
|
||||
|
||||
await this.firstCDP.send("Target.createTarget", {url: startPage, newWindow: true});
|
||||
await this.firstCDP.send("Target.createTarget", {
|
||||
url: startPage,
|
||||
newWindow: true,
|
||||
});
|
||||
}
|
||||
|
||||
const target = await p;
|
||||
|
@ -350,7 +409,11 @@ export class Browser
|
|||
try {
|
||||
await this.firstCDP.send("Fetch.continueResponse", { requestId });
|
||||
} catch (e) {
|
||||
logger.warn("continueResponse failed", {url: request.url}, "recorder");
|
||||
logger.warn(
|
||||
"continueResponse failed",
|
||||
{ url: request.url },
|
||||
"recorder",
|
||||
);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -369,12 +432,20 @@ export class Browser
|
|||
}
|
||||
|
||||
if (!foundRecorder) {
|
||||
logger.debug("Skipping URL from unknown frame", {url: request.url, frameId}, "recorder");
|
||||
logger.debug(
|
||||
"Skipping URL from unknown frame",
|
||||
{ url: request.url, frameId },
|
||||
"recorder",
|
||||
);
|
||||
|
||||
try {
|
||||
await this.firstCDP.send("Fetch.continueResponse", { requestId });
|
||||
} catch (e) {
|
||||
logger.warn("continueResponse failed", {url: request.url}, "recorder");
|
||||
logger.warn(
|
||||
"continueResponse failed",
|
||||
{ url: request.url },
|
||||
"recorder",
|
||||
);
|
||||
}
|
||||
|
||||
return;
|
||||
|
@ -383,7 +454,9 @@ export class Browser
|
|||
await foundRecorder.handleRequestPaused(params, this.firstCDP, true);
|
||||
});
|
||||
|
||||
await this.firstCDP.send("Fetch.enable", {patterns: [{urlPattern: "*", requestStage: "Response"}]});
|
||||
await this.firstCDP.send("Fetch.enable", {
|
||||
patterns: [{ urlPattern: "*", requestStage: "Response" }],
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
|
@ -395,14 +468,21 @@ export class Browser
|
|||
funcString: string,
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
logData: Record<string, any>,
|
||||
contextName: string
|
||||
contextName: string,
|
||||
) {
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
const context = await (frame as any).executionContext();
|
||||
cdp = context._client;
|
||||
const cdpContextId = context._contextId;
|
||||
return await this.evaluateWithCLI_(cdp, frame, cdpContextId, funcString, logData, contextName);
|
||||
return await this.evaluateWithCLI_(
|
||||
cdp,
|
||||
frame,
|
||||
cdpContextId,
|
||||
funcString,
|
||||
logData,
|
||||
contextName,
|
||||
);
|
||||
}
|
||||
|
||||
interceptRequest(page: Page, callback: (event: HTTPRequest) => void) {
|
||||
|
@ -428,7 +508,6 @@ export class Browser
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// ==================================================================
|
||||
// Default Chromium args from playwright
|
||||
export const defaultArgs = [
|
||||
|
@ -470,5 +549,5 @@ export const defaultArgs = [
|
|||
"--apps-gallery-url=https://invalid.webstore.example.com/",
|
||||
"--apps-gallery-update-url=https://invalid.webstore.example.com/",
|
||||
"--component-updater=url-source=http://invalid.dev/",
|
||||
"--brave-stats-updater-server=url-source=http://invalid.dev/"
|
||||
"--brave-stats-updater-server=url-source=http://invalid.dev/",
|
||||
];
|
||||
|
|
|
@ -1,15 +1,24 @@
|
|||
|
||||
export const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
|
||||
export const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
|
||||
export const HTML_TYPES = [
|
||||
"text/html",
|
||||
"application/xhtml",
|
||||
"application/xhtml+xml",
|
||||
];
|
||||
export const WAIT_UNTIL_OPTS = [
|
||||
"load",
|
||||
"domcontentloaded",
|
||||
"networkidle0",
|
||||
"networkidle2",
|
||||
];
|
||||
export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];
|
||||
|
||||
export const BEHAVIOR_LOG_FUNC = "__bx_log";
|
||||
export const ADD_LINK_FUNC = "__bx_addLink";
|
||||
export const MAX_DEPTH = 1000000;
|
||||
|
||||
export const DEFAULT_SELECTORS = [{
|
||||
export const DEFAULT_SELECTORS = [
|
||||
{
|
||||
selector: "a[href]",
|
||||
extract: "href",
|
||||
isAttribute: false
|
||||
}];
|
||||
|
||||
isAttribute: false,
|
||||
},
|
||||
];
|
||||
|
|
|
@ -3,11 +3,17 @@ import path from "path";
const MAX_DEPTH = 2;

export function collectAllFileSources(fileOrDir: string, ext?: string, depth = 0) : string[] {
export function collectAllFileSources(
fileOrDir: string,
ext?: string,
depth = 0,
): string[] {
const resolvedPath = path.resolve(fileOrDir);

if (depth >= MAX_DEPTH) {
console.warn(`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`);
console.warn(
`WARN: MAX_DEPTH of ${MAX_DEPTH} reached traversing "${resolvedPath}"`,
);
return [];
}

@ -27,7 +33,9 @@ export function collectAllFileSources(fileOrDir: string, ext?: string, depth = 0
}

if (depth === 0) {
console.warn(`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`);
console.warn(
`WARN: The provided path "${resolvedPath}" is not a .js file or directory.`,
);
}

return [];

@ -2,10 +2,8 @@ import http from "http";
import url from "url";
import { logger } from "./logger.js";

// ===========================================================================
export class HealthChecker
{
export class HealthChecker {
port: number;
errorThreshold: number;
healthServer: http.Server;

@ -16,7 +14,9 @@ export class HealthChecker
this.port = port;
this.errorThreshold = errorThreshold;

this.healthServer = http.createServer((...args) => this.healthCheck(...args));
this.healthServer = http.createServer((...args) =>
this.healthCheck(...args),
);
logger.info(`Healthcheck server started on ${port}`, {}, "healthcheck");
this.healthServer.listen(port);
}

@ -26,21 +26,33 @@ export class HealthChecker
switch (pathname) {
case "/healthz":
if (this.errorCount < this.errorThreshold) {
logger.debug(`health check ok, num errors ${this.errorCount} < ${this.errorThreshold}`, {}, "healthcheck");
logger.debug(
`health check ok, num errors ${this.errorCount} < ${this.errorThreshold}`,
{},
"healthcheck",
);
res.writeHead(200);
res.end();
}
return;
}

logger.error(`health check failed: ${this.errorCount} >= ${this.errorThreshold}`, {}, "healthcheck");
logger.error(
`health check failed: ${this.errorCount} >= ${this.errorThreshold}`,
{},
"healthcheck",
);
res.writeHead(503);
res.end();
}

resetErrors() {
if (this.errorCount > 0) {
logger.info(`Page loaded, resetting error count ${this.errorCount} to 0`, {}, "healthcheck");
logger.info(
`Page loaded, resetting error count ${this.errorCount} to 0`,
{},
"healthcheck",
);
this.errorCount = 0;
}
}

@ -49,4 +61,3 @@ export class HealthChecker
this.errorCount++;
}
}

@ -5,24 +5,23 @@ import { Writable } from "node:stream";
import { RedisCrawlState } from "./state.js";

// RegExp.prototype.toJSON = RegExp.prototype.toString;
Object.defineProperty(RegExp.prototype, "toJSON", { value: RegExp.prototype.toString });

Object.defineProperty(RegExp.prototype, "toJSON", {
value: RegExp.prototype.toString,
});

// ===========================================================================
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export function errJSON(e: any) {
if (e instanceof Error) {
return {"type": "exception", "message": e.message, "stack": e.stack};
return { type: "exception", message: e.message, stack: e.stack };
} else {
return {"message": e.toString()};
return { message: e.toString() };
}
}

// ===========================================================================
class Logger
{
class Logger {
logStream: Writable | null = null;
debugLogging = false;
logErrorsToRedis = false;

@ -66,12 +65,12 @@ class Logger
// eslint-disable-next-line @typescript-eslint/no-explicit-any
data: Record<string, string> | Error | any,
context: string,
logLevel="info"
logLevel = "info",
) {
if (data instanceof Error) {
data = errJSON(data);
} else if (typeof data !== "object") {
data = {"message": data.toString()};
data = { message: data.toString() };
}

if (this.logLevels.length) {

@ -87,11 +86,11 @@ class Logger
}

const dataToLog = {
"timestamp": new Date().toISOString(),
"logLevel": logLevel,
"context": context,
"message": message,
"details": data ? data : {}
timestamp: new Date().toISOString(),
logLevel: logLevel,
context: context,
message: message,
details: data ? data : {},
};
const string = JSON.stringify(dataToLog);
console.log(string);

@ -100,7 +99,11 @@ class Logger
}

const toLogToRedis = ["error", "fatal"];
if (this.logErrorsToRedis && this.crawlState && toLogToRedis.includes(logLevel)) {
if (
this.logErrorsToRedis &&
this.crawlState &&
toLogToRedis.includes(logLevel)
) {
this.crawlState.logError(string);
}
}

@ -2,9 +2,8 @@ import { HTTPRequest, Page } from "puppeteer-core";
import { errJSON, logger } from "./logger.js";
import { Browser } from "./browser.js";

export class OriginOverride
{
originOverride: {origUrl: URL, destUrl: URL}[];
export class OriginOverride {
originOverride: { origUrl: URL; destUrl: URL }[];

constructor(originOverride: string[]) {
this.originOverride = originOverride.map((override) => {

@ -50,12 +49,19 @@ export class OriginOverride
const respHeaders = Object.fromEntries(resp.headers);
const status = resp.status;

logger.debug("Origin overridden", {orig: url, dest: newUrl, status, body: body.length}, "originoverride");
logger.debug(
"Origin overridden",
{ orig: url, dest: newUrl, status, body: body.length },
"originoverride",
);

request.respond({ body, headers: respHeaders, status }, -1);

} catch (e) {
logger.warn("Error overriding origin", {...errJSON(e), url: page.url()}, "originoverride");
logger.warn(
"Error overriding origin",
{ ...errJSON(e), url: page.url() },
"originoverride",
);
request.continue({}, -1);
}
};

@ -12,8 +12,11 @@ import { RequestResponseInfo } from "./reqresp.js";
|
|||
|
||||
// @ts-expect-error TODO fill in why error is expected
|
||||
import { baseRules as baseDSRules } from "@webrecorder/wabac/src/rewrite/index.js";
|
||||
import {
|
||||
rewriteDASH,
|
||||
rewriteHLS,
|
||||
// @ts-expect-error TODO fill in why error is expected
|
||||
import { rewriteDASH, rewriteHLS } from "@webrecorder/wabac/src/rewrite/rewriteVideo.js";
|
||||
} from "@webrecorder/wabac/src/rewrite/rewriteVideo.js";
|
||||
|
||||
import { WARCRecord } from "warcio";
|
||||
import { TempFileBuffer, WARCSerializer } from "warcio/node";
|
||||
|
@ -30,7 +33,6 @@ const WRITE_DUPE_KEY = "s:writedupe";
|
|||
|
||||
const encoder = new TextEncoder();
|
||||
|
||||
|
||||
// =================================================================
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unused-vars
|
||||
|
@ -39,8 +41,7 @@ function logNetwork(msg: string, data: any) {
|
|||
}
|
||||
|
||||
// =================================================================
|
||||
export class Recorder
|
||||
{
|
||||
export class Recorder {
|
||||
workerid: WorkerId;
|
||||
collDir: string;
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
|
@ -76,12 +77,17 @@ export class Recorder
|
|||
|
||||
pageid!: string;
|
||||
|
||||
constructor({
|
||||
workerid,
|
||||
collDir,
|
||||
crawler,
|
||||
}: {
|
||||
workerid: WorkerId;
|
||||
collDir: string;
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
|
||||
constructor(
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
{workerid, collDir, crawler} : {workerid: WorkerId, collDir: string, crawler: any}
|
||||
) {
|
||||
crawler: any;
|
||||
}) {
|
||||
this.workerid = workerid;
|
||||
this.crawler = crawler;
|
||||
this.crawlState = crawler.crawlState;
|
||||
|
@ -108,7 +114,7 @@ export class Recorder
|
|||
tempCdxDir: this.tempCdxDir,
|
||||
filename,
|
||||
gzip: this.gzip,
|
||||
logDetails: this.logDetails
|
||||
logDetails: this.logDetails,
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -119,17 +125,25 @@ export class Recorder
|
|||
this.handleRequestPaused(params, cdp);
|
||||
});
|
||||
|
||||
await cdp.send("Fetch.enable", {patterns: [{urlPattern: "*", requestStage: "Response"}]});
|
||||
await cdp.send("Fetch.enable", {
|
||||
patterns: [{ urlPattern: "*", requestStage: "Response" }],
|
||||
});
|
||||
|
||||
// Response
|
||||
cdp.on("Network.responseReceived", (params) => {
|
||||
// handling to fill in security details
|
||||
logNetwork("Network.responseReceived", {requestId: params.requestId, ...this.logDetails});
|
||||
logNetwork("Network.responseReceived", {
|
||||
requestId: params.requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
this.handleResponseReceived(params);
|
||||
});
|
||||
|
||||
cdp.on("Network.responseReceivedExtraInfo", (params) => {
|
||||
logNetwork("Network.responseReceivedExtraInfo", {requestId: params.requestId, ...this.logDetails});
|
||||
logNetwork("Network.responseReceivedExtraInfo", {
|
||||
requestId: params.requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
const reqresp = this.pendingReqResp(params.requestId, true);
|
||||
if (reqresp) {
|
||||
reqresp.fillResponseReceivedExtraInfo(params);
|
||||
|
@ -142,29 +156,44 @@ export class Recorder
|
|||
// only handling redirect here, committing last response in redirect chain
|
||||
// request data stored from requestPaused
|
||||
if (params.redirectResponse) {
|
||||
logNetwork("Network.requestWillBeSent after redirect", {requestId: params.requestId, ...this.logDetails});
|
||||
logNetwork("Network.requestWillBeSent after redirect", {
|
||||
requestId: params.requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
this.handleRedirectResponse(params);
|
||||
}
|
||||
});
|
||||
|
||||
cdp.on("Network.requestServedFromCache", (params) => {
|
||||
logNetwork("Network.requestServedFromCache", {requestId: params.requestId, ...this.logDetails});
|
||||
logNetwork("Network.requestServedFromCache", {
|
||||
requestId: params.requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
this.removeReqResp(params.requestId);
|
||||
});
|
||||
|
||||
cdp.on("Network.requestWillBeSentExtraInfo", (params) => {
|
||||
logNetwork("Network.requestWillBeSentExtraInfo", {requestId: params.requestId, ...this.logDetails});
|
||||
logNetwork("Network.requestWillBeSentExtraInfo", {
|
||||
requestId: params.requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
this.handleRequestExtraInfo(params);
|
||||
});
|
||||
|
||||
// Loading
|
||||
cdp.on("Network.loadingFinished", (params) => {
|
||||
logNetwork("Network.loadingFinished", {requestId: params.requestId, ...this.logDetails});
|
||||
logNetwork("Network.loadingFinished", {
|
||||
requestId: params.requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
this.handleLoadingFinished(params);
|
||||
});
|
||||
|
||||
cdp.on("Network.loadingFailed", (params) => {
|
||||
logNetwork("Network.loadingFailed", {requestId: params.requestId, ...this.logDetails});
|
||||
logNetwork("Network.loadingFailed", {
|
||||
requestId: params.requestId,
|
||||
...this.logDetails,
|
||||
});
|
||||
this.handleLoadingFailed(params);
|
||||
});
|
||||
|
||||
|
@ -189,7 +218,11 @@ export class Recorder
|
|||
}
|
||||
});
|
||||
|
||||
await cdp.send("Target.setAutoAttach", {autoAttach: true, waitForDebuggerOnStart: false, flatten: true});
|
||||
await cdp.send("Target.setAutoAttach", {
|
||||
autoAttach: true,
|
||||
waitForDebuggerOnStart: false,
|
||||
flatten: true,
|
||||
});
|
||||
}
|
||||
|
||||
handleResponseReceived(params: Protocol.Network.ResponseReceivedEvent) {
|
||||
|
@ -203,7 +236,9 @@ export class Recorder
|
|||
reqresp.fillResponse(response);
|
||||
}
|
||||
|
||||
handleRequestExtraInfo(params: Protocol.Network.RequestWillBeSentExtraInfoEvent) {
|
||||
handleRequestExtraInfo(
|
||||
params: Protocol.Network.RequestWillBeSentExtraInfoEvent,
|
||||
) {
|
||||
if (!this.shouldSkip(params.headers)) {
|
||||
const reqresp = this.pendingReqResp(params.requestId, true);
|
||||
if (reqresp) {
|
||||
|
@ -225,7 +260,11 @@ export class Recorder
|
|||
reqresp.fillResponse(redirectResponse);
|
||||
|
||||
if (reqresp.isSelfRedirect()) {
|
||||
logger.warn("Skipping self redirect", {url: reqresp. url, status: reqresp.status, ...this.logDetails}, "recorder");
|
||||
logger.warn(
|
||||
"Skipping self redirect",
|
||||
{ url: reqresp.url, status: reqresp.status, ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -253,17 +292,34 @@ export class Recorder
|
|||
if (type === "Document" && reqresp.isValidBinary()) {
|
||||
this.serializeToWARC(reqresp);
|
||||
//} else if (url) {
|
||||
} else if (url && reqresp.requestHeaders && reqresp.requestHeaders["x-browsertrix-fetch"]) {
|
||||
} else if (
|
||||
url &&
|
||||
reqresp.requestHeaders &&
|
||||
reqresp.requestHeaders["x-browsertrix-fetch"]
|
||||
) {
|
||||
delete reqresp.requestHeaders["x-browsertrix-fetch"];
|
||||
logger.warn("Attempt direct fetch of failed request", {url, ...this.logDetails}, "recorder");
|
||||
const fetcher = new AsyncFetcher({tempdir: this.tempdir, reqresp, recorder: this, networkId: requestId});
|
||||
logger.warn(
|
||||
"Attempt direct fetch of failed request",
|
||||
{ url, ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
const fetcher = new AsyncFetcher({
|
||||
tempdir: this.tempdir,
|
||||
reqresp,
|
||||
recorder: this,
|
||||
networkId: requestId,
|
||||
});
|
||||
this.fetcherQ.add(() => fetcher.load());
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
logger.warn("Request failed", {url, errorText, ...this.logDetails}, "recorder");
|
||||
logger.warn(
|
||||
"Request failed",
|
||||
{ url, errorText, ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
}
|
||||
this.removeReqResp(requestId);
|
||||
}
|
||||
|
@ -284,40 +340,82 @@ export class Recorder
|
|||
this.serializeToWARC(reqresp);
|
||||
}
|
||||
|
||||
async handleRequestPaused(params: Protocol.Fetch.RequestPausedEvent, cdp: CDPSession, isSWorker = false) {
|
||||
const { requestId, request, responseStatusCode, responseErrorReason, resourceType, networkId } = params;
|
||||
async handleRequestPaused(
|
||||
params: Protocol.Fetch.RequestPausedEvent,
|
||||
cdp: CDPSession,
|
||||
isSWorker = false,
|
||||
) {
|
||||
const {
|
||||
requestId,
|
||||
request,
|
||||
responseStatusCode,
|
||||
responseErrorReason,
|
||||
resourceType,
|
||||
networkId,
|
||||
} = params;
|
||||
const { method, headers, url } = request;
|
||||
|
||||
logNetwork("Fetch.requestPaused", {requestId, networkId, url, ...this.logDetails});
|
||||
logNetwork("Fetch.requestPaused", {
|
||||
requestId,
|
||||
networkId,
|
||||
url,
|
||||
...this.logDetails,
|
||||
});
|
||||
|
||||
let continued = false;
|
||||
|
||||
try {
|
||||
if (responseStatusCode && !responseErrorReason && !this.shouldSkip(headers, url, method, resourceType) && !(isSWorker && networkId)) {
|
||||
if (
|
||||
responseStatusCode &&
|
||||
!responseErrorReason &&
|
||||
!this.shouldSkip(headers, url, method, resourceType) &&
|
||||
!(isSWorker && networkId)
|
||||
) {
|
||||
continued = await this.handleFetchResponse(params, cdp, isSWorker);
|
||||
}
|
||||
} catch (e) {
|
||||
logger.error("Error handling response, probably skipping URL", {url, ...errJSON(e), ...this.logDetails}, "recorder");
|
||||
logger.error(
|
||||
"Error handling response, probably skipping URL",
|
||||
{ url, ...errJSON(e), ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
}
|
||||
|
||||
if (!continued) {
|
||||
try {
|
||||
await cdp.send("Fetch.continueResponse", { requestId });
|
||||
} catch (e) {
|
||||
logger.debug("continueResponse failed", {requestId, networkId, url, ...errJSON(e), ...this.logDetails}, "recorder");
|
||||
logger.debug(
|
||||
"continueResponse failed",
|
||||
{ requestId, networkId, url, ...errJSON(e), ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async handleFetchResponse(params: Protocol.Fetch.RequestPausedEvent, cdp: CDPSession, isSWorker: boolean) {
|
||||
async handleFetchResponse(
|
||||
params: Protocol.Fetch.RequestPausedEvent,
|
||||
cdp: CDPSession,
|
||||
isSWorker: boolean,
|
||||
) {
|
||||
const { request } = params;
|
||||
const { url } = request;
|
||||
const {requestId, responseErrorReason, responseStatusCode, responseHeaders} = params;
|
||||
const {
|
||||
requestId,
|
||||
responseErrorReason,
|
||||
responseStatusCode,
|
||||
responseHeaders,
|
||||
} = params;
|
||||
|
||||
const networkId = params.networkId || requestId;
|
||||
|
||||
if (responseErrorReason) {
|
||||
logger.warn("Skipping failed response", {url, reason: responseErrorReason, ...this.logDetails}, "recorder");
|
||||
logger.warn(
|
||||
"Skipping failed response",
|
||||
{ url, reason: responseErrorReason, ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -325,10 +423,21 @@ export class Recorder
|
|||
|
||||
if (responseStatusCode === 206) {
|
||||
const range = this._getContentRange(responseHeaders);
|
||||
if (this.allowFull206 && range === `bytes 0-${contentLen - 1}/${contentLen}`) {
|
||||
logger.debug("Keep 206 Response, Full Range", {range, contentLen, url, networkId, ...this.logDetails}, "recorder");
|
||||
if (
|
||||
this.allowFull206 &&
|
||||
range === `bytes 0-${contentLen - 1}/${contentLen}`
|
||||
) {
|
||||
logger.debug(
|
||||
"Keep 206 Response, Full Range",
|
||||
{ range, contentLen, url, networkId, ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
} else {
|
||||
logger.debug("Skip 206 Response", {range, contentLen, url, ...this.logDetails}, "recorder");
|
||||
logger.debug(
|
||||
"Skip 206 Response",
|
||||
{ range, contentLen, url, ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
this.removeReqResp(networkId);
|
||||
return false;
|
||||
}
|
||||
|
@ -355,11 +464,22 @@ export class Recorder
|
|||
let streamingConsume = false;
|
||||
|
||||
if (contentLen < 0 || contentLen > MAX_BROWSER_FETCH_SIZE) {
|
||||
const opts = {tempdir: this.tempdir, reqresp, expectedSize: contentLen, recorder: this, networkId, cdp};
|
||||
const opts = {
|
||||
tempdir: this.tempdir,
|
||||
reqresp,
|
||||
expectedSize: contentLen,
|
||||
recorder: this,
|
||||
networkId,
|
||||
cdp,
|
||||
};
|
||||
|
||||
// fetching using response stream, await here and then either call fulFill, or if not started, return false
|
||||
if (contentLen < 0) {
|
||||
const fetcher = new ResponseStreamAsyncFetcher({...opts, requestId, cdp });
|
||||
const fetcher = new ResponseStreamAsyncFetcher({
|
||||
...opts,
|
||||
requestId,
|
||||
cdp,
|
||||
});
|
||||
const res = await fetcher.load();
|
||||
switch (res) {
|
||||
case "dupe":
|
||||
|
@ -384,15 +504,31 @@ export class Recorder
|
|||
this.fetcherQ.add(() => fetcher.load());
|
||||
return false;
|
||||
}
|
||||
|
||||
} else {
|
||||
try {
|
||||
logNetwork("Fetching response", {sizeExpected: this._getContentLen(responseHeaders), url, networkId, ...this.logDetails});
|
||||
const { body, base64Encoded } = await cdp.send("Fetch.getResponseBody", {requestId});
|
||||
logNetwork("Fetching response", {
|
||||
sizeExpected: this._getContentLen(responseHeaders),
|
||||
url,
|
||||
networkId,
|
||||
...this.logDetails,
|
||||
});
|
||||
const { body, base64Encoded } = await cdp.send(
|
||||
"Fetch.getResponseBody",
|
||||
{ requestId },
|
||||
);
|
||||
reqresp.payload = Buffer.from(body, base64Encoded ? "base64" : "utf-8");
|
||||
logNetwork("Fetch done", {size: reqresp.payload.length, url, networkId, ...this.logDetails});
|
||||
logNetwork("Fetch done", {
|
||||
size: reqresp.payload.length,
|
||||
url,
|
||||
networkId,
|
||||
...this.logDetails,
|
||||
});
|
||||
} catch (e) {
|
||||
logger.warn("Failed to load response body", {url, networkId, ...errJSON(e), ...this.logDetails}, "recorder");
|
||||
logger.warn(
|
||||
"Failed to load response body",
|
||||
{ url, networkId, ...errJSON(e), ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -409,39 +545,58 @@ export class Recorder
|
|||
// not rewritten, and not streaming, return false to continue
|
||||
if (!rewritten && !streamingConsume) {
|
||||
if (!reqresp.payload) {
|
||||
logger.error("Unable to get payload skipping recording", {url, ...this.logDetails}, "recorder");
|
||||
logger.error(
|
||||
"Unable to get payload skipping recording",
|
||||
{ url, ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
this.removeReqResp(networkId);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// if has payload, encode it, otherwise return empty string
|
||||
const body = reqresp.payload && reqresp.payload.length ? Buffer.from(reqresp.payload).toString("base64") : "";
|
||||
const body =
|
||||
reqresp.payload && reqresp.payload.length
|
||||
? Buffer.from(reqresp.payload).toString("base64")
|
||||
: "";
|
||||
|
||||
try {
|
||||
await cdp.send("Fetch.fulfillRequest", {
|
||||
requestId,
|
||||
responseCode: responseStatusCode || 0,
|
||||
responseHeaders,
|
||||
body
|
||||
body,
|
||||
});
|
||||
} catch (e) {
|
||||
const type = reqresp.resourceType;
|
||||
if (type === "Document") {
|
||||
logger.debug("document not loaded in browser, possibly other URLs missing", {url, type: reqresp.resourceType}, "recorder");
|
||||
logger.debug(
|
||||
"document not loaded in browser, possibly other URLs missing",
|
||||
{ url, type: reqresp.resourceType },
|
||||
"recorder",
|
||||
);
|
||||
} else {
|
||||
logger.debug("URL not loaded in browser", {url, type: reqresp.resourceType}, "recorder");
|
||||
logger.debug(
|
||||
"URL not loaded in browser",
|
||||
{ url, type: reqresp.resourceType },
|
||||
"recorder",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
startPage({pageid, url} : {pageid: string, url: string}) {
|
||||
startPage({ pageid, url }: { pageid: string; url: string }) {
|
||||
this.pageid = pageid;
|
||||
this.logDetails = { page: url, workerid: this.workerid };
|
||||
if (this.pendingRequests && this.pendingRequests.size) {
|
||||
logger.debug("Interrupting timed out requests, moving to next page", this.logDetails, "recorder");
|
||||
logger.debug(
|
||||
"Interrupting timed out requests, moving to next page",
|
||||
this.logDetails,
|
||||
"recorder",
|
||||
);
|
||||
}
|
||||
this.pendingRequests = new Map();
|
||||
this.skipIds = new Set();
|
||||
|
@ -465,7 +620,12 @@ export class Recorder
|
|||
const pending = [];
|
||||
for (const [requestId, reqresp] of this.pendingRequests.entries()) {
|
||||
const url = reqresp.url || "";
|
||||
const entry : {requestId: string, url: string, expectedSize?: number, readSize?: number} = {requestId, url};
|
||||
const entry: {
|
||||
requestId: string;
|
||||
url: string;
|
||||
expectedSize?: number;
|
||||
readSize?: number;
|
||||
} = { requestId, url };
|
||||
if (reqresp.expectedSize) {
|
||||
entry.expectedSize = reqresp.expectedSize;
|
||||
}
|
||||
|
@ -475,7 +635,11 @@ export class Recorder
|
|||
pending.push(entry);
|
||||
}
|
||||
|
||||
logger.debug("Finishing pending requests for page", {numPending, pending, ...this.logDetails}, "recorder");
|
||||
logger.debug(
|
||||
"Finishing pending requests for page",
|
||||
{ numPending, pending, ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
await sleep(5.0);
|
||||
numPending = this.pendingRequests.size;
|
||||
}
|
||||
|
@ -497,7 +661,12 @@ export class Recorder
|
|||
await this.writer.flush();
|
||||
}
|
||||
|
||||
shouldSkip(headers: Protocol.Network.Headers, url?: string, method?: string, resourceType?: string) {
|
||||
shouldSkip(
|
||||
headers: Protocol.Network.Headers,
|
||||
url?: string,
|
||||
method?: string,
|
||||
resourceType?: string,
|
||||
) {
|
||||
if (headers && !method) {
|
||||
method = headers[":method"];
|
||||
}
|
||||
|
@ -520,7 +689,11 @@ export class Recorder
|
|||
}
|
||||
|
||||
// skip eventsource, resourceType may not be set correctly
|
||||
if (headers && (headers["accept"] === "text/event-stream" || headers["Accept"] === "text/event-stream")) {
|
||||
if (
|
||||
headers &&
|
||||
(headers["accept"] === "text/event-stream" ||
|
||||
headers["Accept"] === "text/event-stream")
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -572,7 +745,11 @@ export class Recorder
|
|||
|
||||
if (newString !== string) {
|
||||
extraOpts.rewritten = 1;
|
||||
logger.debug("Content Rewritten", {url, ...this.logDetails}, "recorder");
|
||||
logger.debug(
|
||||
"Content Rewritten",
|
||||
{ url, ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
reqresp.payload = encoder.encode(newString);
|
||||
return true;
|
||||
} else {
|
||||
|
@ -582,7 +759,9 @@ export class Recorder
|
|||
//return Buffer.from(newString).toString("base64");
|
||||
}
|
||||
|
||||
_getContentType(headers? : Protocol.Fetch.HeaderEntry[] | {name: string, value: string}[]) {
|
||||
_getContentType(
|
||||
headers?: Protocol.Fetch.HeaderEntry[] | { name: string; value: string }[],
|
||||
) {
|
||||
if (!headers) {
|
||||
return null;
|
||||
}
|
||||
|
@ -622,7 +801,7 @@ export class Recorder
|
|||
}
|
||||
|
||||
noResponseForStatus(status: number | undefined | null) {
|
||||
return (!status || status === 204 || (status >= 300 && status < 400));
|
||||
return !status || status === 204 || (status >= 300 && status < 400);
|
||||
}
|
||||
|
||||
isValidUrl(url?: string) {
|
||||
|
@ -648,7 +827,11 @@ export class Recorder
|
|||
} else {
|
||||
const reqresp = this.pendingRequests.get(requestId);
|
||||
if (reqresp && requestId !== reqresp.requestId) {
|
||||
logger.warn("Invalid request id", {requestId, actualRequestId: reqresp.requestId}, "recorder");
|
||||
logger.warn(
|
||||
"Invalid request id",
|
||||
{ requestId, actualRequestId: reqresp.requestId },
|
||||
"recorder",
|
||||
);
|
||||
}
|
||||
return reqresp;
|
||||
}
|
||||
|
@ -669,7 +852,11 @@ export class Recorder
|
|||
return;
|
||||
}
|
||||
|
||||
if (reqresp.url && reqresp.method === "GET" && !(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, reqresp.url))) {
|
||||
if (
|
||||
reqresp.url &&
|
||||
reqresp.method === "GET" &&
|
||||
!(await this.crawlState.addIfNoDupe(WRITE_DUPE_KEY, reqresp.url))
|
||||
) {
|
||||
logNetwork("Skipping dupe", { url: reqresp.url });
|
||||
return;
|
||||
}
|
||||
|
@ -677,32 +864,52 @@ export class Recorder
|
|||
const responseRecord = createResponse(reqresp, this.pageid);
|
||||
const requestRecord = createRequest(reqresp, responseRecord, this.pageid);
|
||||
|
||||
this.warcQ.add(() => this.writer.writeRecordPair(responseRecord, requestRecord));
|
||||
this.warcQ.add(() =>
|
||||
this.writer.writeRecordPair(responseRecord, requestRecord),
|
||||
);
|
||||
}
|
||||
|
||||
async directFetchCapture(url: string) : Promise<{fetched: boolean, mime: string}>{
|
||||
async directFetchCapture(
|
||||
url: string,
|
||||
): Promise<{ fetched: boolean; mime: string }> {
|
||||
const reqresp = new RequestResponseInfo("0");
|
||||
reqresp.url = url;
|
||||
reqresp.method = "GET";
|
||||
|
||||
logger.debug("Directly fetching page URL without browser", {url, ...this.logDetails}, "recorder");
|
||||
logger.debug(
|
||||
"Directly fetching page URL without browser",
|
||||
{ url, ...this.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
|
||||
const filter = (resp: Response) => resp.status === 200 && !resp.headers.get("set-cookie");
|
||||
const filter = (resp: Response) =>
|
||||
resp.status === 200 && !resp.headers.get("set-cookie");
|
||||
|
||||
// ignore dupes: if previous URL was not a page, still load as page. if previous was page,
|
||||
// should not get here, as dupe pages tracked via seen list
|
||||
const fetcher = new AsyncFetcher({tempdir: this.tempdir, reqresp, recorder: this, networkId: "0", filter, ignoreDupe: true});
|
||||
const fetcher = new AsyncFetcher({
|
||||
tempdir: this.tempdir,
|
||||
reqresp,
|
||||
recorder: this,
|
||||
networkId: "0",
|
||||
filter,
|
||||
ignoreDupe: true,
|
||||
});
|
||||
const res = await fetcher.load();
|
||||
|
||||
const mime = reqresp && reqresp.responseHeaders && reqresp.responseHeaders["content-type"] && reqresp.responseHeaders["content-type"].split(";")[0] || "";
|
||||
const mime =
|
||||
(reqresp &&
|
||||
reqresp.responseHeaders &&
|
||||
reqresp.responseHeaders["content-type"] &&
|
||||
reqresp.responseHeaders["content-type"].split(";")[0]) ||
|
||||
"";
|
||||
|
||||
return { fetched: res === "fetched", mime };
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
class AsyncFetcher
|
||||
{
|
||||
class AsyncFetcher {
|
||||
reqresp: RequestResponseInfo;
|
||||
|
||||
networkId: string;
|
||||
|
@ -714,9 +921,23 @@ class AsyncFetcher
|
|||
tempdir: string;
|
||||
filename: string;
|
||||
|
||||
constructor({tempdir, reqresp, expectedSize = -1, recorder, networkId, filter = undefined, ignoreDupe = false} :
|
||||
{tempdir: string, reqresp: RequestResponseInfo, expectedSize?: number, recorder: Recorder,
|
||||
networkId: string, filter?: (resp: Response) => boolean, ignoreDupe?: boolean }) {
|
||||
constructor({
|
||||
tempdir,
|
||||
reqresp,
|
||||
expectedSize = -1,
|
||||
recorder,
|
||||
networkId,
|
||||
filter = undefined,
|
||||
ignoreDupe = false,
|
||||
}: {
|
||||
tempdir: string;
|
||||
reqresp: RequestResponseInfo;
|
||||
expectedSize?: number;
|
||||
recorder: Recorder;
|
||||
networkId: string;
|
||||
filter?: (resp: Response) => boolean;
|
||||
ignoreDupe?: boolean;
|
||||
}) {
|
||||
this.reqresp = reqresp;
|
||||
this.reqresp.expectedSize = expectedSize;
|
||||
this.reqresp.asyncLoading = true;
|
||||
|
@ -728,7 +949,10 @@ class AsyncFetcher
|
|||
this.recorder = recorder;
|
||||
|
||||
this.tempdir = tempdir;
|
||||
this.filename = path.join(this.tempdir, `${timestampNow()}-${uuidv4()}.data`);
|
||||
this.filename = path.join(
|
||||
this.tempdir,
|
||||
`${timestampNow()}-${uuidv4()}.data`,
|
||||
);
|
||||
}
|
||||
|
||||
async load() {
|
||||
|
@ -740,7 +964,11 @@ class AsyncFetcher
|
|||
let fetched = "notfetched";
|
||||
|
||||
try {
|
||||
if (reqresp.method === "GET" && url && !(await crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url))) {
|
||||
if (
|
||||
reqresp.method === "GET" &&
|
||||
url &&
|
||||
!(await crawlState.addIfNoDupe(ASYNC_FETCH_DUPE_KEY, url))
|
||||
) {
|
||||
if (!this.ignoreDupe) {
|
||||
this.reqresp.asyncLoading = false;
|
||||
return "dupe";
|
||||
|
@ -753,7 +981,10 @@ class AsyncFetcher
|
|||
const responseRecord = createResponse(reqresp, pageid, body);
|
||||
const requestRecord = createRequest(reqresp, responseRecord, pageid);
|
||||
|
||||
const serializer = new WARCSerializer(responseRecord, {gzip, maxMemSize: MAX_BROWSER_FETCH_SIZE});
|
||||
const serializer = new WARCSerializer(responseRecord, {
|
||||
gzip,
|
||||
maxMemSize: MAX_BROWSER_FETCH_SIZE,
|
||||
});
|
||||
|
||||
try {
|
||||
let readSize = await serializer.digestRecord();
|
||||
|
@ -762,19 +993,45 @@ class AsyncFetcher
|
|||
}
|
||||
reqresp.readSize = readSize;
|
||||
} catch (e) {
|
||||
logger.error("Error reading + digesting payload", {url, filename, ...errJSON(e), ...logDetails}, "recorder");
|
||||
logger.error(
|
||||
"Error reading + digesting payload",
|
||||
{ url, filename, ...errJSON(e), ...logDetails },
|
||||
"recorder",
|
||||
);
|
||||
}
|
||||
|
||||
if (reqresp.readSize === reqresp.expectedSize || reqresp.expectedSize < 0) {
|
||||
logger.debug("Async fetch: streaming done", {size: reqresp.readSize, expected: reqresp.expectedSize, networkId, url, ...logDetails}, "recorder");
|
||||
|
||||
if (
|
||||
reqresp.readSize === reqresp.expectedSize ||
|
||||
reqresp.expectedSize < 0
|
||||
) {
|
||||
logger.debug(
|
||||
"Async fetch: streaming done",
|
||||
{
|
||||
size: reqresp.readSize,
|
||||
expected: reqresp.expectedSize,
|
||||
networkId,
|
||||
url,
|
||||
...logDetails,
|
||||
},
|
||||
"recorder",
|
||||
);
|
||||
} else {
|
||||
logger.warn("Async fetch: possible response size mismatch", {size: reqresp.readSize, expected: reqresp.expectedSize, url, ...logDetails}, "recorder");
|
||||
logger.warn(
|
||||
"Async fetch: possible response size mismatch",
|
||||
{
|
||||
size: reqresp.readSize,
|
||||
expected: reqresp.expectedSize,
|
||||
url,
|
||||
...logDetails,
|
||||
},
|
||||
"recorder",
|
||||
);
|
||||
//await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url);
|
||||
//return fetched;
|
||||
}
|
||||
|
||||
const externalBuffer : TempFileBuffer = serializer.externalBuffer as TempFileBuffer;
|
||||
const externalBuffer: TempFileBuffer =
|
||||
serializer.externalBuffer as TempFileBuffer;
|
||||
|
||||
if (externalBuffer) {
|
||||
const { currSize, buffers, fh } = externalBuffer;
|
||||
|
@ -786,13 +1043,25 @@ class AsyncFetcher
|
|||
}
|
||||
|
||||
if (Object.keys(reqresp.extraOpts).length) {
|
||||
responseRecord.warcHeaders.headers.set("WARC-JSON-Metadata", JSON.stringify(reqresp.extraOpts));
|
||||
responseRecord.warcHeaders.headers.set(
|
||||
"WARC-JSON-Metadata",
|
||||
JSON.stringify(reqresp.extraOpts),
|
||||
);
|
||||
}
|
||||
|
||||
recorder.warcQ.add(() => recorder.writer.writeRecordPair(responseRecord, requestRecord, serializer));
|
||||
|
||||
recorder.warcQ.add(() =>
|
||||
recorder.writer.writeRecordPair(
|
||||
responseRecord,
|
||||
requestRecord,
|
||||
serializer,
|
||||
),
|
||||
);
|
||||
} catch (e) {
|
||||
logger.error("Streaming Fetch Error", {url, networkId, filename, ...errJSON(e), ...logDetails}, "recorder");
|
||||
logger.error(
|
||||
"Streaming Fetch Error",
|
||||
{ url, networkId, filename, ...errJSON(e), ...logDetails },
|
||||
"recorder",
|
||||
);
|
||||
await crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url!);
|
||||
} finally {
|
||||
recorder.removeReqResp(networkId);
|
||||
|
@ -816,21 +1085,29 @@ class AsyncFetcher
|
|||
signal = abort.signal;
|
||||
}
|
||||
|
||||
const resp = await fetch(url!, {method, headers, body: reqresp.postData || undefined, signal});
|
||||
const resp = await fetch(url!, {
|
||||
method,
|
||||
headers,
|
||||
body: reqresp.postData || undefined,
|
||||
signal,
|
||||
});
|
||||
|
||||
if (this.filter && !this.filter(resp) && abort) {
|
||||
abort.abort();
|
||||
throw new Error("invalid response, ignoring fetch");
|
||||
}
|
||||
|
||||
if (reqresp.expectedSize < 0 && resp.headers.get("content-length") && !resp.headers.get("content-encoding")) {
|
||||
if (
|
||||
reqresp.expectedSize < 0 &&
|
||||
resp.headers.get("content-length") &&
|
||||
!resp.headers.get("content-encoding")
|
||||
) {
|
||||
reqresp.expectedSize = Number(resp.headers.get("content-length") || -1);
|
||||
}
|
||||
|
||||
if (reqresp.expectedSize === 0) {
|
||||
reqresp.payload = new Uint8Array();
|
||||
return;
|
||||
|
||||
} else if (!resp.body) {
|
||||
logger.error("Empty body, stopping fetch", { url }, "recorder");
|
||||
await this.recorder.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url!);
|
||||
|
@ -853,7 +1130,11 @@ class AsyncFetcher
|
|||
yield value;
|
||||
}
|
||||
} catch (e) {
|
||||
logger.warn("takeReader interrupted", {...errJSON(e), url: this.reqresp.url, ...this.recorder.logDetails}, "recorder");
|
||||
logger.warn(
|
||||
"takeReader interrupted",
|
||||
{ ...errJSON(e), url: this.reqresp.url, ...this.recorder.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
this.reqresp.truncated = "disconnect";
|
||||
}
|
||||
}
|
||||
|
@ -861,7 +1142,9 @@ class AsyncFetcher
|
|||
async *takeStreamIter(cdp: CDPSession, stream: Protocol.IO.StreamHandle) {
|
||||
try {
|
||||
while (true) {
|
||||
const {data, base64Encoded, eof} = await cdp.send("IO.read", {handle: stream});
|
||||
const { data, base64Encoded, eof } = await cdp.send("IO.read", {
|
||||
handle: stream,
|
||||
});
|
||||
const buff = Buffer.from(data, base64Encoded ? "base64" : "utf-8");
|
||||
|
||||
yield buff;
|
||||
|
@ -871,15 +1154,18 @@ class AsyncFetcher
|
|||
}
|
||||
}
|
||||
} catch (e) {
|
||||
logger.warn("takeStream interrupted", {...errJSON(e), url: this.reqresp.url, ...this.recorder.logDetails}, "recorder");
|
||||
logger.warn(
|
||||
"takeStream interrupted",
|
||||
{ ...errJSON(e), url: this.reqresp.url, ...this.recorder.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
this.reqresp.truncated = "disconnect";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
class ResponseStreamAsyncFetcher extends AsyncFetcher
|
||||
{
|
||||
class ResponseStreamAsyncFetcher extends AsyncFetcher {
|
||||
cdp: CDPSession;
|
||||
requestId: string;
|
||||
|
||||
|
@ -896,15 +1182,16 @@ class ResponseStreamAsyncFetcher extends AsyncFetcher
|
|||
const { url } = reqresp;
|
||||
logger.debug("Async started: takeStream", { url }, "recorder");
|
||||
|
||||
const { stream } = await cdp.send("Fetch.takeResponseBodyAsStream", {requestId});
|
||||
const { stream } = await cdp.send("Fetch.takeResponseBodyAsStream", {
|
||||
requestId,
|
||||
});
|
||||
|
||||
return this.takeStreamIter(cdp, stream);
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
class NetworkLoadStreamAsyncFetcher extends AsyncFetcher
|
||||
{
|
||||
class NetworkLoadStreamAsyncFetcher extends AsyncFetcher {
|
||||
cdp: CDPSession;
|
||||
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
|
@ -924,21 +1211,45 @@ class NetworkLoadStreamAsyncFetcher extends AsyncFetcher
|
|||
let result = null;
|
||||
|
||||
try {
|
||||
result = await cdp.send("Network.loadNetworkResource", {frameId: reqresp.frameId, url, options});
|
||||
result = await cdp.send("Network.loadNetworkResource", {
|
||||
frameId: reqresp.frameId,
|
||||
url,
|
||||
options,
|
||||
});
|
||||
} catch (e) {
|
||||
logger.debug("Network.loadNetworkResource failed, attempting node fetch", {url, ...errJSON(e), ...this.recorder.logDetails}, "recorder");
|
||||
logger.debug(
|
||||
"Network.loadNetworkResource failed, attempting node fetch",
|
||||
{ url, ...errJSON(e), ...this.recorder.logDetails },
|
||||
"recorder",
|
||||
);
|
||||
return await super._doFetch();
|
||||
}
|
||||
|
||||
const { stream, headers, httpStatusCode, success, netError, netErrorName } = result.resource;
|
||||
const { stream, headers, httpStatusCode, success, netError, netErrorName } =
|
||||
result.resource;
|
||||
|
||||
if (!success || !stream) {
|
||||
//await this.recorder.crawlState.removeDupe(ASYNC_FETCH_DUPE_KEY, url);
|
||||
logger.debug("Network.loadNetworkResource failed, attempting node fetch", {url, netErrorName, netError, httpStatusCode, ...this.recorder.logDetails}, "recorder");
|
||||
logger.debug(
|
||||
"Network.loadNetworkResource failed, attempting node fetch",
|
||||
{
|
||||
url,
|
||||
netErrorName,
|
||||
netError,
|
||||
httpStatusCode,
|
||||
...this.recorder.logDetails,
|
||||
},
|
||||
"recorder",
|
||||
);
|
||||
return await super._doFetch();
|
||||
}
|
||||
|
||||
if (reqresp.expectedSize < 0 && headers && headers["content-length"] && !headers["content-encoding"]) {
|
||||
if (
|
||||
reqresp.expectedSize < 0 &&
|
||||
headers &&
|
||||
headers["content-length"] &&
|
||||
!headers["content-encoding"]
|
||||
) {
|
||||
reqresp.expectedSize = Number(headers["content-length"] || -1);
|
||||
}
|
||||
|
||||
|
@ -956,13 +1267,19 @@ class NetworkLoadStreamAsyncFetcher extends AsyncFetcher
|
|||
|
||||
// =================================================================
|
||||
// response
|
||||
function createResponse(reqresp: RequestResponseInfo, pageid: string, contentIter?: AsyncIterable<Uint8Array> | Iterable<Uint8Array>) {
|
||||
function createResponse(
|
||||
reqresp: RequestResponseInfo,
|
||||
pageid: string,
|
||||
contentIter?: AsyncIterable<Uint8Array> | Iterable<Uint8Array>,
|
||||
) {
|
||||
const url = reqresp.url;
|
||||
const warcVersion = "WARC/1.1";
|
||||
const statusline = `HTTP/1.1 ${reqresp.status} ${reqresp.statusText}`;
|
||||
const date = new Date().toISOString();
|
||||
|
||||
const httpHeaders = reqresp.getResponseHeadersDict(reqresp.payload ? reqresp.payload.length : 0);
|
||||
const httpHeaders = reqresp.getResponseHeadersDict(
|
||||
reqresp.payload ? reqresp.payload.length : 0,
|
||||
);
|
||||
|
||||
const warcHeaders: Record<string, string> = {
|
||||
"WARC-Page-ID": pageid,
|
||||
|
@ -980,14 +1297,27 @@ function createResponse(reqresp: RequestResponseInfo, pageid: string, contentIte
|
|||
warcHeaders["WARC-JSON-Metadata"] = JSON.stringify(reqresp.extraOpts);
|
||||
}
|
||||
|
||||
return WARCRecord.create({
|
||||
url, date, warcVersion, type: "response", warcHeaders,
|
||||
httpHeaders, statusline}, contentIter);
|
||||
return WARCRecord.create(
|
||||
{
|
||||
url,
|
||||
date,
|
||||
warcVersion,
|
||||
type: "response",
|
||||
warcHeaders,
|
||||
httpHeaders,
|
||||
statusline,
|
||||
},
|
||||
contentIter,
|
||||
);
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
// request
|
||||
function createRequest(reqresp: RequestResponseInfo, responseRecord: WARCRecord, pageid: string) {
|
||||
function createRequest(
|
||||
reqresp: RequestResponseInfo,
|
||||
responseRecord: WARCRecord,
|
||||
pageid: string,
|
||||
) {
|
||||
const url = reqresp.url;
|
||||
const warcVersion = "WARC/1.1";
|
||||
const method = reqresp.method;
|
||||
|
@ -996,7 +1326,9 @@ function createRequest(reqresp: RequestResponseInfo, responseRecord: WARCRecord,
|
|||
|
||||
const statusline = `${method} ${url.slice(urlParsed.origin.length)} HTTP/1.1`;
|
||||
|
||||
const requestBody = reqresp.postData ? [encoder.encode(reqresp.postData)] : [];
|
||||
const requestBody = reqresp.postData
|
||||
? [encoder.encode(reqresp.postData)]
|
||||
: [];
|
||||
|
||||
const httpHeaders = reqresp.getRequestHeadersDict();
|
||||
|
||||
|
@ -1007,7 +1339,16 @@ function createRequest(reqresp: RequestResponseInfo, responseRecord: WARCRecord,
|
|||
|
||||
const date = responseRecord.warcDate || undefined;
|
||||
|
||||
return WARCRecord.create({
|
||||
url, date, warcVersion, type: "request", warcHeaders,
|
||||
httpHeaders, statusline}, requestBody);
|
||||
return WARCRecord.create(
|
||||
{
|
||||
url,
|
||||
date,
|
||||
warcVersion,
|
||||
type: "request",
|
||||
warcHeaders,
|
||||
httpHeaders,
|
||||
statusline,
|
||||
},
|
||||
requestBody,
|
||||
);
|
||||
}
|
||||
|
|
|
@ -14,10 +14,9 @@ console.error = function (...args) {
typeof args[0] === "string" &&
args[0].indexOf("[ioredis] Unhandled error event") === 0
) {

const now = Date.now();

if ((now - lastLogTime) > REDIS_ERROR_LOG_INTERVAL_SECS) {
if (now - lastLogTime > REDIS_ERROR_LOG_INTERVAL_SECS) {
if (lastLogTime && exitOnError) {
logger.fatal("Crawl interrupted, redis gone, exiting", {}, "redis");
}

@ -7,10 +7,8 @@ const CONTENT_LENGTH = "content-length";
|
|||
const CONTENT_TYPE = "content-type";
|
||||
const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"];
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
export class RequestResponseInfo
|
||||
{
|
||||
export class RequestResponseInfo {
|
||||
_created: Date = new Date();
|
||||
|
||||
requestId: string;
|
||||
|
@ -33,7 +31,7 @@ export class RequestResponseInfo
|
|||
statusText?: string;
|
||||
|
||||
responseHeaders?: Record<string, string>;
|
||||
responseHeadersList?: {name: string, value: string}[];
|
||||
responseHeadersList?: { name: string; value: string }[];
|
||||
responseHeadersText?: string;
|
||||
|
||||
payload?: Uint8Array;
|
||||
|
@ -79,7 +77,6 @@ export class RequestResponseInfo
|
|||
if (params.type) {
|
||||
this.resourceType = params.type;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
|
@ -100,7 +97,12 @@ export class RequestResponseInfo
|
|||
|
||||
fillResponse(response: Protocol.Network.Response) {
|
||||
// if initial fetch was a 200, but now replacing with 304, don't!
|
||||
if (response.status == 304 && this.status && this.status != 304 && this.url) {
|
||||
if (
|
||||
response.status == 304 &&
|
||||
this.status &&
|
||||
this.status != 304 &&
|
||||
this.url
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -128,7 +130,11 @@ export class RequestResponseInfo
|
|||
|
||||
if (response.securityDetails) {
|
||||
const issuer: string = response.securityDetails.issuer || "";
|
||||
const ctc : string = response.securityDetails.certificateTransparencyCompliance === "compliant" ? "1" : "0";
|
||||
const ctc: string =
|
||||
response.securityDetails.certificateTransparencyCompliance ===
|
||||
"compliant"
|
||||
? "1"
|
||||
: "0";
|
||||
this.extraOpts.cert = { issuer, ctc };
|
||||
}
|
||||
}
|
||||
|
@ -161,7 +167,6 @@ export class RequestResponseInfo
|
|||
this.responseHeaders = Object.fromEntries(response.headers);
|
||||
this.status = response.status;
|
||||
this.statusText = response.statusText || getStatusText(this.status);
|
||||
|
||||
}
|
||||
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
|
@ -175,7 +180,10 @@ export class RequestResponseInfo
|
|||
|
||||
if (this.responseHeaders) {
|
||||
for (const header of Object.keys(this.responseHeaders)) {
|
||||
headers += `${header}: ${this.responseHeaders[header].replace(/\n/g, ", ")}\r\n`;
|
||||
headers += `${header}: ${this.responseHeaders[header].replace(
|
||||
/\n/g,
|
||||
", ",
|
||||
)}\r\n`;
|
||||
}
|
||||
}
|
||||
headers += "\r\n";
|
||||
|
@ -191,10 +199,18 @@ export class RequestResponseInfo
|
|||
}
|
||||
|
||||
getResponseHeadersDict(length = 0) {
|
||||
return this._getHeadersDict(this.responseHeaders, this.responseHeadersList, length);
|
||||
return this._getHeadersDict(
|
||||
this.responseHeaders,
|
||||
this.responseHeadersList,
|
||||
length,
|
||||
);
|
||||
}
|
||||
|
||||
_getHeadersDict(headersDict?: Record<string, string>, headersList?: {name: string, value: string}[], actualContentLength = 0) {
|
||||
_getHeadersDict(
|
||||
headersDict?: Record<string, string>,
|
||||
headersList?: { name: string; value: string }[],
|
||||
actualContentLength = 0,
|
||||
) {
|
||||
if (!headersDict && headersList) {
|
||||
headersDict = {};
|
||||
|
||||
|
|
|
@ -9,12 +9,13 @@ import { Duplex } from "stream";
|
|||
import { CDPSession, Page } from "puppeteer-core";
|
||||
import { WorkerId } from "./state.js";
|
||||
|
||||
const indexHTML = fs.readFileSync(new URL("../../html/screencast.html", import.meta.url), {encoding: "utf8"});
|
||||
|
||||
const indexHTML = fs.readFileSync(
|
||||
new URL("../../html/screencast.html", import.meta.url),
|
||||
{ encoding: "utf8" },
|
||||
);
|
||||
|
||||
// ===========================================================================
|
||||
class WSTransport
|
||||
{
|
||||
class WSTransport {
|
||||
allWS = new Set<WebSocket>();
|
||||
// eslint-disable-next-line no-use-before-define
|
||||
caster!: ScreenCaster;
|
||||
|
@ -23,7 +24,6 @@ class WSTransport
|
|||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
httpServer: any;
|
||||
|
||||
|
||||
constructor(port: number) {
|
||||
this.allWS = new Set();
|
||||
|
||||
|
@ -31,8 +31,12 @@ class WSTransport
|
|||
|
||||
this.wss.on("connection", (ws: WebSocket) => this.initWebSocket(ws));
|
||||
|
||||
this.httpServer = http.createServer((...args) => this.handleRequest(...args));
|
||||
this.httpServer.on("upgrade", (request: IncomingMessage, socket: Duplex, head: Buffer) => {
|
||||
this.httpServer = http.createServer((...args) =>
|
||||
this.handleRequest(...args),
|
||||
);
|
||||
this.httpServer.on(
|
||||
"upgrade",
|
||||
(request: IncomingMessage, socket: Duplex, head: Buffer) => {
|
||||
const pathname = url.parse(request.url || "").pathname;
|
||||
|
||||
if (pathname === "/ws") {
|
||||
|
@ -40,7 +44,8 @@ class WSTransport
|
|||
this.wss.emit("connection", ws, request);
|
||||
});
|
||||
}
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
this.httpServer.listen(port);
|
||||
}
|
||||
|
@ -65,7 +70,11 @@ class WSTransport
|
|||
|
||||
this.allWS.add(ws);
|
||||
|
||||
logger.debug("New Screencast Conn", {total: this.allWS.size}, "screencast");
|
||||
logger.debug(
|
||||
"New Screencast Conn",
|
||||
{ total: this.allWS.size },
|
||||
"screencast",
|
||||
);
|
||||
|
||||
if (this.allWS.size === 1) {
|
||||
this.caster.startCastAll();
|
||||
|
@ -95,10 +104,8 @@ class WSTransport
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
class RedisPubSubTransport
|
||||
{
|
||||
class RedisPubSubTransport {
|
||||
numConnections: number = 0;
|
||||
castChannel: string;
|
||||
// eslint-disable-next-line no-use-before-define
|
||||
|
@ -157,14 +164,12 @@ class RedisPubSubTransport
|
|||
|
||||
async isActive() {
|
||||
const result = await this.redis.pubsub("numsub", this.castChannel);
|
||||
return (result.length > 1 ? result[1] > 0: false);
|
||||
return result.length > 1 ? result[1] > 0 : false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
class ScreenCaster
|
||||
{
|
||||
class ScreenCaster {
|
||||
transport: WSTransport;
|
||||
caches = new Map<WorkerId, string>();
|
||||
urls = new Map<WorkerId, string>();
|
||||
|
@ -183,7 +188,7 @@ class ScreenCaster
|
|||
msg: "init",
|
||||
width: this.maxWidth,
|
||||
height: this.maxHeight,
|
||||
browsers: numWorkers
|
||||
browsers: numWorkers,
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -277,7 +282,12 @@ class ScreenCaster
|
|||
|
||||
logger.info("Started Screencast", { workerid: id }, "screencast");
|
||||
|
||||
await cdp.send("Page.startScreencast", {format: "png", everyNthFrame: 1, maxWidth: this.maxWidth, maxHeight: this.maxHeight});
|
||||
await cdp.send("Page.startScreencast", {
|
||||
format: "png",
|
||||
everyNthFrame: 1,
|
||||
maxWidth: this.maxWidth,
|
||||
maxHeight: this.maxHeight,
|
||||
});
|
||||
}
|
||||
|
||||
async stopCast(cdp: CDPSession, id: WorkerId) {
|
||||
|
|
|
@ -4,31 +4,30 @@ import { WARCResourceWriter } from "./warcresourcewriter.js";
|
|||
import { logger, errJSON } from "./logger.js";
|
||||
import { Browser } from "./browser.js";
|
||||
|
||||
|
||||
// ============================================================================
|
||||
|
||||
type ScreenShotType = {
|
||||
type: string;
|
||||
omitBackground: boolean;
|
||||
fullPage: boolean;
|
||||
}
|
||||
};
|
||||
|
||||
export const screenshotTypes: Record<string, ScreenShotType> = {
|
||||
"view": {
|
||||
view: {
|
||||
type: "png",
|
||||
omitBackground: true,
|
||||
fullPage: false
|
||||
fullPage: false,
|
||||
},
|
||||
"thumbnail": {
|
||||
thumbnail: {
|
||||
type: "jpeg",
|
||||
omitBackground: true,
|
||||
fullPage: false
|
||||
fullPage: false,
|
||||
},
|
||||
"fullPage": {
|
||||
fullPage: {
|
||||
type: "png",
|
||||
omitBackground: true,
|
||||
fullPage: true
|
||||
}
|
||||
fullPage: true,
|
||||
},
|
||||
};
|
||||
|
||||
export class Screenshots extends WARCResourceWriter {
|
||||
|
@ -48,14 +47,27 @@ export class Screenshots extends WARCResourceWriter {
|
|||
async take(screenshotType = "view") {
|
||||
try {
|
||||
if (screenshotType !== "fullPage") {
|
||||
await this.browser.setViewport(this.page, {width: 1920, height: 1080});
|
||||
await this.browser.setViewport(this.page, {
|
||||
width: 1920,
|
||||
height: 1080,
|
||||
});
|
||||
}
|
||||
const options = screenshotTypes[screenshotType];
|
||||
const screenshotBuffer = await this.page.screenshot(options);
|
||||
await this.writeBufferToWARC(screenshotBuffer, screenshotType, "image/" + options.type);
|
||||
logger.info(`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`);
|
||||
await this.writeBufferToWARC(
|
||||
screenshotBuffer,
|
||||
screenshotType,
|
||||
"image/" + options.type,
|
||||
);
|
||||
logger.info(
|
||||
`Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.warcName}`,
|
||||
);
|
||||
} catch (e) {
|
||||
logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
|
||||
logger.error(
|
||||
"Taking screenshot failed",
|
||||
{ page: this.url, type: screenshotType, ...errJSON(e) },
|
||||
"screenshots",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -73,10 +85,20 @@ export class Screenshots extends WARCResourceWriter {
|
|||
// 16:9 thumbnail
|
||||
.resize(640, 360)
|
||||
.toBuffer();
|
||||
await this.writeBufferToWARC(thumbnailBuffer, screenshotType, "image/" + options.type);
|
||||
logger.info(`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`);
|
||||
await this.writeBufferToWARC(
|
||||
thumbnailBuffer,
|
||||
screenshotType,
|
||||
"image/" + options.type,
|
||||
);
|
||||
logger.info(
|
||||
`Screenshot (type: thumbnail) for ${this.url} written to ${this.warcName}`,
|
||||
);
|
||||
} catch (e) {
|
||||
logger.error("Taking screenshot failed", {"page": this.url, type: screenshotType, ...errJSON(e)}, "screenshots");
|
||||
logger.error(
|
||||
"Taking screenshot failed",
|
||||
{ page: this.url, type: screenshotType, ...errJSON(e) },
|
||||
"screenshots",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,8 +10,7 @@ type ScopeType =
|
|||
| "any"
|
||||
| "custom";
|
||||
|
||||
export class ScopedSeed
|
||||
{
|
||||
export class ScopedSeed {
|
||||
url: string;
|
||||
scopeType: ScopeType;
|
||||
include: RegExp[];
|
||||
|
@ -24,11 +23,25 @@ export class ScopedSeed
|
|||
maxExtraHops = 0;
|
||||
maxDepth = 0;
|
||||
|
||||
|
||||
constructor(
|
||||
{url, scopeType, include, exclude = [], allowHash = false, depth = -1, sitemap = false, extraHops = 0} :
|
||||
{url: string, scopeType: ScopeType, include: string[], exclude?: string[], allowHash?: boolean, depth?: number, sitemap?: string | boolean | null, extraHops?: number}
|
||||
) {
|
||||
constructor({
|
||||
url,
|
||||
scopeType,
|
||||
include,
|
||||
exclude = [],
|
||||
allowHash = false,
|
||||
depth = -1,
|
||||
sitemap = false,
|
||||
extraHops = 0,
|
||||
}: {
|
||||
url: string;
|
||||
scopeType: ScopeType;
|
||||
include: string[];
|
||||
exclude?: string[];
|
||||
allowHash?: boolean;
|
||||
depth?: number;
|
||||
sitemap?: string | boolean | null;
|
||||
extraHops?: number;
|
||||
}) {
|
||||
const parsedUrl = this.parseUrl(url);
|
||||
if (!parsedUrl) {
|
||||
throw new Error("Invalid URL");
|
||||
|
@ -43,7 +56,10 @@ export class ScopedSeed
|
|||
}
|
||||
|
||||
if (this.scopeType !== "custom") {
|
||||
const [includeNew, allowHashNew] = this.scopeFromType(this.scopeType, parsedUrl);
|
||||
const [includeNew, allowHashNew] = this.scopeFromType(
|
||||
this.scopeType,
|
||||
parsedUrl,
|
||||
);
|
||||
this.include = [...includeNew, ...this.include];
|
||||
allowHash = allowHashNew;
|
||||
}
|
||||
|
@ -69,7 +85,7 @@ export class ScopedSeed
|
|||
} else if (!(value instanceof Array)) {
|
||||
return [new RegExp(value)];
|
||||
} else {
|
||||
return value.map(e => (e instanceof RegExp) ? e : new RegExp(e));
|
||||
return value.map((e) => (e instanceof RegExp ? e : new RegExp(e)));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -102,7 +118,10 @@ export class ScopedSeed
|
|||
}
|
||||
|
||||
if (parsedUrl.protocol !== "http:" && parsedUrl.protocol != "https:") {
|
||||
logger.warn("Invalid Page - URL must start with http:// or https://", {url, ...logDetails});
|
||||
logger.warn("Invalid Page - URL must start with http:// or https://", {
|
||||
url,
|
||||
...logDetails,
|
||||
});
|
||||
parsedUrl = null;
|
||||
}
|
||||
|
||||
|
@ -114,7 +133,7 @@ export class ScopedSeed
|
|||
const url = new URL(this.url);
|
||||
url.pathname = "/sitemap.xml";
|
||||
return url.href;
|
||||
} else if (typeof(sitemap) === "string") {
|
||||
} else if (typeof sitemap === "string") {
|
||||
const url = new URL(sitemap, this.url);
|
||||
return url.href;
|
||||
}
|
||||
|
@ -133,23 +152,47 @@ export class ScopedSeed
|
|||
|
||||
case "page-spa":
|
||||
// allow scheme-agnostic URLS as likely redirects
|
||||
include = [new RegExp("^" + urlRxEscape(parsedUrl.href, parsedUrl) + "#.+")];
|
||||
include = [
|
||||
new RegExp("^" + urlRxEscape(parsedUrl.href, parsedUrl) + "#.+"),
|
||||
];
|
||||
allowHash = true;
|
||||
break;
|
||||
|
||||
case "prefix":
|
||||
include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + parsedUrl.pathname.slice(0, parsedUrl.pathname.lastIndexOf("/") + 1), parsedUrl))];
|
||||
include = [
|
||||
new RegExp(
|
||||
"^" +
|
||||
urlRxEscape(
|
||||
parsedUrl.origin +
|
||||
parsedUrl.pathname.slice(
|
||||
0,
|
||||
parsedUrl.pathname.lastIndexOf("/") + 1,
|
||||
),
|
||||
parsedUrl,
|
||||
),
|
||||
),
|
||||
];
|
||||
break;
|
||||
|
||||
case "host":
|
||||
include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl))];
|
||||
include = [
|
||||
new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl)),
|
||||
];
|
||||
break;
|
||||
|
||||
case "domain":
|
||||
if (parsedUrl.hostname.startsWith("www.")) {
|
||||
parsedUrl.hostname = parsedUrl.hostname.replace("www.", "");
|
||||
}
|
||||
include = [new RegExp("^" + urlRxEscape(parsedUrl.origin + "/", parsedUrl).replace("\\/\\/", "\\/\\/([^/]+\\.)*"))];
|
||||
include = [
|
||||
new RegExp(
|
||||
"^" +
|
||||
urlRxEscape(parsedUrl.origin + "/", parsedUrl).replace(
|
||||
"\\/\\/",
|
||||
"\\/\\/([^/]+\\.)*",
|
||||
),
|
||||
),
|
||||
];
|
||||
break;
|
||||
|
||||
case "any":
|
||||
|
@ -157,7 +200,9 @@ export class ScopedSeed
|
|||
break;
|
||||
|
||||
default:
|
||||
logger.fatal(`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`);
|
||||
logger.fatal(
|
||||
`Invalid scope type "${scopeType}" specified, valid types are: page, page-spa, prefix, host, domain, any`,
|
||||
);
|
||||
}
|
||||
|
||||
return [include, allowHash];
|
||||
|
@ -232,7 +277,3 @@ export function rxEscape(string: string) {
|
|||
export function urlRxEscape(url: string, parsedUrl: URL) {
|
||||
return rxEscape(url).replace(parsedUrl.protocol, "https?:");
|
||||
}
@ -6,7 +6,6 @@ import { MAX_DEPTH } from "./constants.js";
|
|||
import { ScopedSeed } from "./seeds.js";
|
||||
import { Frame } from "puppeteer-core";
|
||||
|
||||
|
||||
// ============================================================================
|
||||
export enum LoadState {
|
||||
FAILED = 0,
|
||||
|
@ -16,7 +15,6 @@ export enum LoadState {
|
|||
BEHAVIORS_DONE = 4,
|
||||
}
|
||||
|
||||
|
||||
// ============================================================================
|
||||
export enum QueueState {
|
||||
ADDED = 0,
|
||||
|
@ -24,14 +22,11 @@ export enum QueueState {
|
|||
DUPE_URL = 2,
|
||||
}
|
||||
|
||||
|
||||
// ============================================================================
|
||||
export type WorkerId = number;
|
||||
|
||||
|
||||
// ============================================================================
|
||||
export class PageState
|
||||
{
|
||||
export class PageState {
|
||||
url: string;
|
||||
seedId: number;
|
||||
depth: number;
|
||||
|
@ -57,7 +52,12 @@ export class PageState
|
|||
|
||||
logDetails = {};
|
||||
|
||||
constructor(redisData: {url: string, seedId: number, depth: number, extraHops: number}) {
|
||||
constructor(redisData: {
|
||||
url: string;
|
||||
seedId: number;
|
||||
depth: number;
|
||||
extraHops: number;
|
||||
}) {
|
||||
this.url = redisData.url;
|
||||
this.seedId = redisData.seedId;
|
||||
this.depth = redisData.depth;
|
||||
|
@ -78,10 +78,7 @@ declare module "ioredis" {
|
|||
limit: number,
|
||||
): Result<number, Context>;
|
||||
|
||||
getnext(
|
||||
qkey: string,
|
||||
pkey: string,
|
||||
): Result<string, Context>;
|
||||
getnext(qkey: string, pkey: string): Result<string, Context>;
|
||||
|
||||
markstarted(
|
||||
pkey: string,
|
||||
|
@ -103,7 +100,7 @@ declare module "ioredis" {
|
|||
unlockpending(
|
||||
pkeyUrl: string,
|
||||
uid: string,
|
||||
callback?: Callback<string>
|
||||
callback?: Callback<string>,
|
||||
): Result<void, Context>;
|
||||
|
||||
requeue(
|
||||
|
@ -113,13 +110,11 @@ declare module "ioredis" {
|
|||
url: string,
|
||||
maxRetryPending: number,
|
||||
): Result<number, Context>;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
export class RedisCrawlState
|
||||
{
|
||||
export class RedisCrawlState {
|
||||
redis: Redis;
|
||||
maxRetryPending = 1;
|
||||
_lastSize = 0;
|
||||
|
@ -138,8 +133,6 @@ export class RedisCrawlState
|
|||
constructor(redis: Redis, key: string, maxPageTime: number, uid: string) {
|
||||
this.redis = redis;
|
||||
|
||||
|
||||
|
||||
this.uid = uid;
|
||||
this.key = key;
|
||||
this.maxPageTime = maxPageTime;
|
||||
|
@ -172,7 +165,7 @@ end
|
|||
redis.call('zadd', KEYS[2], ARGV[2], ARGV[3]);
|
||||
redis.call('hdel', KEYS[1], ARGV[1]);
|
||||
return 0;
|
||||
`
|
||||
`,
|
||||
});
|
||||
|
||||
redis.defineCommand("getnext", {
|
||||
|
@ -187,7 +180,7 @@ if json then
|
|||
end
|
||||
|
||||
return json;
|
||||
`
|
||||
`,
|
||||
});
|
||||
|
||||
redis.defineCommand("markstarted", {
|
||||
|
@ -203,7 +196,7 @@ if json then
|
|||
redis.call('setex', KEYS[2], ARGV[3], ARGV[4]);
|
||||
end
|
||||
|
||||
`
|
||||
`,
|
||||
});
|
||||
|
||||
redis.defineCommand("unlockpending", {
|
||||
|
@ -215,7 +208,7 @@ if value == ARGV[1] then
|
|||
redis.call('del', KEYS[1])
|
||||
end
|
||||
|
||||
`
|
||||
`,
|
||||
});
|
||||
|
||||
redis.defineCommand("movefailed", {
|
||||
|
@ -232,7 +225,7 @@ if json then
|
|||
redis.call('hdel', KEYS[1], ARGV[1]);
|
||||
end
|
||||
|
||||
`
|
||||
`,
|
||||
});
|
||||
|
||||
redis.defineCommand("requeue", {
|
||||
|
@ -255,9 +248,8 @@ if not res then
|
|||
end
|
||||
end
|
||||
return 0;
|
||||
`
|
||||
`,
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
async _getNext() {
|
||||
|
@ -271,7 +263,14 @@ return 0;
|
|||
async markStarted(url: string) {
|
||||
const started = this._timestamp();
|
||||
|
||||
return await this.redis.markstarted(this.pkey, this.pkey + ":" + url, url, started, this.maxPageTime, this.uid);
|
||||
return await this.redis.markstarted(
|
||||
this.pkey,
|
||||
this.pkey + ":" + url,
|
||||
url,
|
||||
started,
|
||||
this.maxPageTime,
|
||||
this.uid,
|
||||
);
|
||||
}
|
||||
|
||||
async markFinished(url: string) {
|
||||
|
@ -292,14 +291,17 @@ return 0;
|
|||
await this.redis.srem(this.skey, url);
|
||||
}
|
||||
|
||||
recheckScope(data: {url: string, depth: number, extraHops: number, seedId: number}, seeds: ScopedSeed[]) {
|
||||
recheckScope(
|
||||
data: { url: string; depth: number; extraHops: number; seedId: number },
|
||||
seeds: ScopedSeed[],
|
||||
) {
|
||||
const seed = seeds[data.seedId];
|
||||
|
||||
return seed.isIncluded(data.url, data.depth, data.extraHops);
|
||||
}
|
||||
|
||||
async isFinished() {
|
||||
return ((await this.queueSize()) == 0) && ((await this.numDone()) > 0);
|
||||
return (await this.queueSize()) == 0 && (await this.numDone()) > 0;
|
||||
}
|
||||
|
||||
async setStatus(status_: string) {
|
||||
|
@ -369,9 +371,9 @@ return 0;
|
|||
}
|
||||
break;
|
||||
}
|
||||
} // TODO: Fix this the next time the file is edited.
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
catch (e: any) {
|
||||
} catch (e: any) {
|
||||
logger.warn("Error processing message", e, "redisMessage");
|
||||
}
|
||||
}
|
||||
|
@ -389,7 +391,7 @@ return 0;
|
|||
|
||||
// regexStr just a string, optimize by using glob matching
|
||||
if (this.isStrMatch(regexStr)) {
|
||||
matcher = {"match": `*${regexStr}*`};
|
||||
matcher = { match: `*${regexStr}*` };
|
||||
}
|
||||
|
||||
const stream = this.redis.zscanStream(this.qkey, matcher);
|
||||
|
@ -404,14 +406,18 @@ return 0;
|
|||
//if (removed) {
|
||||
await this.markExcluded(url);
|
||||
//}
|
||||
logger.debug("Removing excluded URL", {url, regex, removed}, "exclusion");
|
||||
logger.debug(
|
||||
"Removing excluded URL",
|
||||
{ url, regex, removed },
|
||||
"exclusion",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
stream.resume();
|
||||
});
|
||||
|
||||
return new Promise<void>(resolve => {
|
||||
return new Promise<void>((resolve) => {
|
||||
stream.on("end", () => {
|
||||
resolve();
|
||||
});
|
||||
|
@ -424,11 +430,19 @@ return 0;
|
|||
|
||||
// consider failed if 3 failed retries in 60 secs
|
||||
await this.redis.expire(key, 60);
|
||||
return (res >= 3);
|
||||
return res >= 3;
|
||||
}
|
||||
|
||||
//async addToQueue({url : string, seedId, depth = 0, extraHops = 0} = {}, limit = 0) {
|
||||
async addToQueue({url, seedId, depth = 0, extraHops = 0} : {url: string, seedId: number, depth?: number, extraHops?: number}, limit = 0) {
|
||||
async addToQueue(
|
||||
{
|
||||
url,
|
||||
seedId,
|
||||
depth = 0,
|
||||
extraHops = 0,
|
||||
}: { url: string; seedId: number; depth?: number; extraHops?: number },
|
||||
limit = 0,
|
||||
) {
|
||||
const added = this._timestamp();
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
|
@ -441,7 +455,15 @@ return 0;
|
|||
// 0 - url queued successfully
|
||||
// 1 - url queue size limit reached
|
||||
// 2 - url is a dupe
|
||||
return await this.redis.addqueue(this.pkey, this.qkey, this.skey, url, this._getScore(data), JSON.stringify(data), limit);
|
||||
return await this.redis.addqueue(
|
||||
this.pkey,
|
||||
this.qkey,
|
||||
this.skey,
|
||||
url,
|
||||
this._getScore(data),
|
||||
JSON.stringify(data),
|
||||
limit,
|
||||
);
|
||||
}
|
||||
|
||||
async nextFromQueue() {
|
||||
|
@ -479,7 +501,7 @@ return 0;
|
|||
return { done, queued, pending, failed, errors };
|
||||
}
|
||||
|
||||
_getScore(data: {depth: number, extraHops: number}) {
|
||||
_getScore(data: { depth: number; extraHops: number }) {
|
||||
return (data.depth || 0) + (data.extraHops || 0) * MAX_DEPTH;
|
||||
}
|
||||
|
||||
|
@ -489,7 +511,14 @@ return 0;
|
|||
const len = await this.redis.zcard(key);
|
||||
|
||||
for (let i = 0; i < len; i += inc) {
|
||||
const someResults = await this.redis.zrangebyscore(key, 0, "inf", "LIMIT", i, inc);
|
||||
const someResults = await this.redis.zrangebyscore(
|
||||
key,
|
||||
0,
|
||||
"inf",
|
||||
"LIMIT",
|
||||
i,
|
||||
inc,
|
||||
);
|
||||
results.push(...someResults);
|
||||
}
|
||||
|
||||
|
@ -508,9 +537,13 @@ return 0;
|
|||
return results;
|
||||
}
|
||||
|
||||
async load(
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
async load(state: Record<string, any>, seeds: ScopedSeed[], checkScope: boolean) {
|
||||
state: Record<string, any>,
|
||||
seeds: ScopedSeed[],
|
||||
checkScope: boolean,
|
||||
) {
|
||||
const seen: string[] = [];
|
||||
|
||||
// need to delete existing keys, if exist to fully reset state
|
||||
|
@ -545,7 +578,7 @@ return 0;
|
|||
seen.push(data.url);
|
||||
}
|
||||
|
||||
if (typeof(state.done) === "number") {
|
||||
if (typeof state.done === "number") {
|
||||
// done key is just an int counter
|
||||
await this.redis.set(this.dkey, state.done);
|
||||
} else if (state.done instanceof Array) {
|
||||
|
@ -601,7 +634,7 @@ return 0;
|
|||
|
||||
async getPendingList() {
|
||||
const list = await this.redis.hvals(this.pkey);
|
||||
return list.map(x => JSON.parse(x));
|
||||
return list.map((x) => JSON.parse(x));
|
||||
}
|
||||
|
||||
async getErrorList() {
|
||||
|
@ -615,9 +648,9 @@ return 0;
|
|||
for (const url of pendingUrls) {
|
||||
await this.redis.unlockpending(this.pkey + ":" + url, this.uid);
|
||||
}
|
||||
} // TODO: Fix this the next time the file is edited.
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
catch (e: any) {
|
||||
} catch (e: any) {
|
||||
logger.error("Redis Del Pending Failed", e, "state");
|
||||
}
|
||||
}
|
||||
|
@ -626,7 +659,13 @@ return 0;
|
|||
const pendingUrls = await this.redis.hkeys(this.pkey);
|
||||
|
||||
for (const url of pendingUrls) {
|
||||
const res = await this.redis.requeue(this.pkey, this.qkey, this.pkey + ":" + url, url, this.maxRetryPending);
|
||||
const res = await this.redis.requeue(
|
||||
this.pkey,
|
||||
this.qkey,
|
||||
this.pkey + ":" + url,
|
||||
url,
|
||||
this.maxRetryPending,
|
||||
);
|
||||
switch (res) {
|
||||
case 1:
|
||||
logger.info(`Requeued: ${url}`);
|
||||
|
@ -656,4 +695,3 @@ return 0;
|
|||
return await this.redis.lpush(this.ekey, error);
|
||||
}
|
||||
}
@ -16,10 +16,8 @@ import { logger } from "./logger.js";
|
|||
// @ts-expect-error TODO fill in why error is expected
|
||||
import getFolderSize from "get-folder-size";
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
export class S3StorageSync
|
||||
{
|
||||
export class S3StorageSync {
|
||||
fullPrefix: string;
|
||||
client: Minio.Client;
|
||||
|
||||
|
@ -36,21 +34,23 @@ export class S3StorageSync
|
|||
constructor(
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
urlOrData: string | any,
|
||||
{webhookUrl, userId, crawlId} :
|
||||
{webhookUrl?: string, userId: string, crawlId: string}
|
||||
{
|
||||
webhookUrl,
|
||||
userId,
|
||||
crawlId,
|
||||
}: { webhookUrl?: string; userId: string; crawlId: string },
|
||||
) {
|
||||
let url;
|
||||
let accessKey;
|
||||
let secretKey;
|
||||
|
||||
if (typeof(urlOrData) === "string") {
|
||||
if (typeof urlOrData === "string") {
|
||||
url = new URL(urlOrData);
|
||||
accessKey = url.username;
|
||||
secretKey = url.password;
|
||||
url.username = "";
|
||||
url.password = "";
|
||||
this.fullPrefix = url.href;
|
||||
|
||||
} else {
|
||||
url = new URL(urlOrData.endpointUrl);
|
||||
accessKey = urlOrData.accessKey;
|
||||
|
@ -64,7 +64,7 @@ export class S3StorageSync
|
|||
useSSL: url.protocol === "https:",
|
||||
accessKey,
|
||||
secretKey,
|
||||
partSize: 100*1024*1024
|
||||
partSize: 100 * 1024 * 1024,
|
||||
});
|
||||
|
||||
this.bucketName = url.pathname.slice(1).split("/")[0];
|
||||
|
@ -80,14 +80,18 @@ export class S3StorageSync
|
|||
|
||||
async uploadFile(srcFilename: string, targetFilename: string) {
|
||||
const fileUploadInfo = {
|
||||
"bucket": this.bucketName,
|
||||
"crawlId": this.crawlId,
|
||||
"prefix": this.objectPrefix,
|
||||
targetFilename
|
||||
bucket: this.bucketName,
|
||||
crawlId: this.crawlId,
|
||||
prefix: this.objectPrefix,
|
||||
targetFilename,
|
||||
};
|
||||
logger.info("S3 file upload information", fileUploadInfo, "s3Upload");
|
||||
|
||||
await this.client.fPutObject(this.bucketName, this.objectPrefix + targetFilename, srcFilename);
|
||||
await this.client.fPutObject(
|
||||
this.bucketName,
|
||||
this.objectPrefix + targetFilename,
|
||||
srcFilename,
|
||||
);
|
||||
|
||||
const { hash, crc32 } = await checksumFile("sha256", srcFilename);
|
||||
const path = targetFilename;
|
||||
|
@ -99,12 +103,24 @@ export class S3StorageSync
|
|||
}
|
||||
|
||||
async downloadFile(srcFilename: string, destFilename: string) {
|
||||
await this.client.fGetObject(this.bucketName, this.objectPrefix + srcFilename, destFilename);
|
||||
await this.client.fGetObject(
|
||||
this.bucketName,
|
||||
this.objectPrefix + srcFilename,
|
||||
destFilename,
|
||||
);
|
||||
}
|
||||
|
||||
async uploadCollWACZ(srcFilename: string, targetFilename: string, completed = true) {
|
||||
async uploadCollWACZ(
|
||||
srcFilename: string,
|
||||
targetFilename: string,
|
||||
completed = true,
|
||||
) {
|
||||
const resource = await this.uploadFile(srcFilename, targetFilename);
|
||||
logger.info("WACZ S3 file upload resource", {targetFilename, resource}, "s3Upload");
|
||||
logger.info(
|
||||
"WACZ S3 file upload resource",
|
||||
{ targetFilename, resource },
|
||||
"s3Upload",
|
||||
);
|
||||
|
||||
if (this.webhookUrl) {
|
||||
const body = {
|
||||
|
@ -115,17 +131,25 @@ export class S3StorageSync
|
|||
filename: this.fullPrefix + targetFilename,
|
||||
|
||||
...resource,
|
||||
completed
|
||||
completed,
|
||||
};
|
||||
|
||||
logger.info(`Pinging Webhook: ${this.webhookUrl}`);
|
||||
|
||||
if (this.webhookUrl.startsWith("http://") || this.webhookUrl.startsWith("https://")) {
|
||||
await fetch(this.webhookUrl, {method: "POST", body: JSON.stringify(body)});
|
||||
if (
|
||||
this.webhookUrl.startsWith("http://") ||
|
||||
this.webhookUrl.startsWith("https://")
|
||||
) {
|
||||
await fetch(this.webhookUrl, {
|
||||
method: "POST",
|
||||
body: JSON.stringify(body),
|
||||
});
|
||||
} else if (this.webhookUrl.startsWith("redis://")) {
|
||||
const parts = this.webhookUrl.split("/");
|
||||
if (parts.length !== 5) {
|
||||
logger.fatal("redis webhook url must be in format: redis://<host>:<port>/<db>/<key>");
|
||||
logger.fatal(
|
||||
"redis webhook url must be in format: redis://<host>:<port>/<db>/<key>",
|
||||
);
|
||||
}
|
||||
const redis = await initRedis(parts.slice(0, 4).join("/"));
|
||||
await redis.rpush(parts[4], JSON.stringify(body));
|
||||
|
@ -139,7 +163,8 @@ export function initStorage() {
|
|||
return null;
|
||||
}
|
||||
|
||||
const endpointUrl = process.env.STORE_ENDPOINT_URL + (process.env.STORE_PATH || "");
|
||||
const endpointUrl =
|
||||
process.env.STORE_ENDPOINT_URL + (process.env.STORE_PATH || "");
|
||||
const storeInfo = {
|
||||
endpointUrl,
|
||||
accessKey: process.env.STORE_ACCESS_KEY,
|
||||
|
@ -156,7 +181,6 @@ export function initStorage() {
|
|||
return new S3StorageSync(storeInfo, opts);
|
||||
}
|
||||
|
||||
|
||||
export async function getFileSize(filename: string) {
|
||||
const stats = await fsp.stat(filename);
|
||||
return stats.size;
|
||||
|
@ -170,20 +194,29 @@ export async function getDirSize(dir: string) {
|
|||
return size;
|
||||
}
|
||||
|
||||
export async function checkDiskUtilization(
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
export async function checkDiskUtilization(params: Record<string, any>, archiveDirSize: number, dfOutput=null) {
|
||||
const diskUsage : Record<string, string> = await getDiskUsage("/crawls", dfOutput);
|
||||
params: Record<string, any>,
|
||||
archiveDirSize: number,
|
||||
dfOutput = null,
|
||||
) {
|
||||
const diskUsage: Record<string, string> = await getDiskUsage(
|
||||
"/crawls",
|
||||
dfOutput,
|
||||
);
|
||||
const usedPercentage = parseInt(diskUsage["Use%"].slice(0, -1));
|
||||
|
||||
// Check that disk usage isn't already above threshold
|
||||
if (usedPercentage >= params.diskUtilization) {
|
||||
logger.info(`Disk utilization threshold reached ${usedPercentage}% > ${params.diskUtilization}%, stopping`);
|
||||
logger.info(
|
||||
`Disk utilization threshold reached ${usedPercentage}% > ${params.diskUtilization}%, stopping`,
|
||||
);
|
||||
return {
|
||||
stop: true,
|
||||
used: usedPercentage,
|
||||
projected: null,
|
||||
threshold: params.diskUtilization
|
||||
threshold: params.diskUtilization,
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -199,15 +232,20 @@ export async function checkDiskUtilization(params: Record<string, any>, archiveD
|
|||
}
|
||||
|
||||
const projectedTotal = kbUsed + kbArchiveDirSize;
|
||||
const projectedUsedPercentage = calculatePercentageUsed(projectedTotal, kbTotal);
|
||||
const projectedUsedPercentage = calculatePercentageUsed(
|
||||
projectedTotal,
|
||||
kbTotal,
|
||||
);
|
||||
|
||||
if (projectedUsedPercentage >= params.diskUtilization) {
|
||||
logger.info(`Disk utilization projected to reach threshold ${projectedUsedPercentage}% > ${params.diskUtilization}%, stopping`);
|
||||
logger.info(
|
||||
`Disk utilization projected to reach threshold ${projectedUsedPercentage}% > ${params.diskUtilization}%, stopping`,
|
||||
);
|
||||
return {
|
||||
stop: true,
|
||||
used: usedPercentage,
|
||||
projected: projectedUsedPercentage,
|
||||
threshold: params.diskUtilization
|
||||
threshold: params.diskUtilization,
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -215,7 +253,7 @@ export async function checkDiskUtilization(params: Record<string, any>, archiveD
|
|||
stop: false,
|
||||
used: usedPercentage,
|
||||
projected: projectedUsedPercentage,
|
||||
threshold: params.diskUtilization
|
||||
threshold: params.diskUtilization,
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -228,9 +266,9 @@ export async function getDFOutput(path: string) {
|
|||
export async function getDiskUsage(path = "/crawls", dfOutput = null) {
|
||||
const result = dfOutput || (await getDFOutput(path));
|
||||
const lines = result.split("\n");
|
||||
const keys = lines[0].split(/\s+/ig);
|
||||
const rows = lines.slice(1).map(line => {
|
||||
const values = line.split(/\s+/ig);
|
||||
const keys = lines[0].split(/\s+/gi);
|
||||
const rows = lines.slice(1).map((line) => {
|
||||
const values = line.split(/\s+/gi);
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
return keys.reduce((o: Record<string, any>, k, index) => {
|
||||
|
@ -245,13 +283,16 @@ export function calculatePercentageUsed(used: number, total: number) {
|
|||
return Math.round((used / total) * 100);
|
||||
}
|
||||
|
||||
function checksumFile(hashName: string, path: string) : Promise<{hash: string, crc32: number}>{
|
||||
function checksumFile(
|
||||
hashName: string,
|
||||
path: string,
|
||||
): Promise<{ hash: string; crc32: number }> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const hash = createHash(hashName);
|
||||
let crc: number = 0;
|
||||
|
||||
const stream = fs.createReadStream(path);
|
||||
stream.on("error", err => reject(err));
|
||||
stream.on("error", (err) => reject(err));
|
||||
stream.on("data", (chunk) => {
|
||||
hash.update(chunk);
|
||||
crc = crc32(chunk, crc);
|
||||
|
@ -261,10 +302,12 @@ function checksumFile(hashName: string, path: string) : Promise<{hash: string, c
|
|||
}
|
||||
|
||||
export function interpolateFilename(filename: string, crawlId: string) {
|
||||
filename = filename.replace("@ts", new Date().toISOString().replace(/[:TZz.-]/g, ""));
|
||||
filename = filename.replace(
|
||||
"@ts",
|
||||
new Date().toISOString().replace(/[:TZz.-]/g, ""),
|
||||
);
|
||||
filename = filename.replace("@hostname", os.hostname());
|
||||
filename = filename.replace("@hostsuffix", os.hostname().slice(-14));
|
||||
filename = filename.replace("@id", crawlId);
|
||||
return filename;
|
||||
}
@ -15,25 +15,39 @@ export abstract class BaseTextExtract extends WARCResourceWriter {
this.cdp = cdp;
}

async extractAndStoreText(resourceType: string, ignoreIfMatchesLast = false, saveToWarc = false) {
async extractAndStoreText(
resourceType: string,
ignoreIfMatchesLast = false,
saveToWarc = false,
) {
try {
const text = await this.doGetText();

if (ignoreIfMatchesLast && text === this.lastText) {
this.lastText = this.text;
logger.debug("Skipping, extracted text unchanged from last extraction", {url: this.url}, "text");
logger.debug(
"Skipping, extracted text unchanged from last extraction",
{ url: this.url },
"text",
);
return { changed: false, text };
}
if (saveToWarc) {
await this.writeBufferToWARC(new TextEncoder().encode(text), resourceType, "text/plain");
logger.debug(`Text Extracted (type: ${resourceType}) for ${this.url} written to ${this.warcName}`);
await this.writeBufferToWARC(
new TextEncoder().encode(text),
resourceType,
"text/plain",
);
logger.debug(
`Text Extracted (type: ${resourceType}) for ${this.url} written to ${this.warcName}`,
);
}

this.lastText = text;
return { changed: true, text };
} // TODO: Fix this the next time the file is edited.
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
catch (e: any) {
} catch (e: any) {
logger.debug("Error extracting text", e, "text");
return { changed: false, text: null };
}

@ -42,19 +56,30 @@ export abstract class BaseTextExtract extends WARCResourceWriter {
abstract doGetText(): Promise<string>;
}

// ============================================================================
export class TextExtractViaSnapshot extends BaseTextExtract {
async doGetText(): Promise<string> {
const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {computedStyles: []});
const result = await this.cdp.send("DOMSnapshot.captureSnapshot", {
computedStyles: [],
});
return this.parseTextFromDOMSnapshot(result);
}

parseTextFromDOMSnapshot(result: Protocol.DOMSnapshot.CaptureSnapshotResponse) : string {
parseTextFromDOMSnapshot(
result: Protocol.DOMSnapshot.CaptureSnapshotResponse,
): string {
const TEXT_NODE = 3;
const ELEMENT_NODE = 1;

const SKIPPED_NODES = ["SCRIPT", "STYLE", "HEADER", "FOOTER", "BANNER-DIV", "NOSCRIPT", "TITLE"];
const SKIPPED_NODES = [
"SCRIPT",
"STYLE",
"HEADER",
"FOOTER",
"BANNER-DIV",
"NOSCRIPT",
"TITLE",
];

const { strings, documents } = result;

@ -91,11 +116,13 @@ export class TextExtractViaSnapshot extends BaseTextExtract {
}
}

// ============================================================================
export class TextExtractViaDocument extends BaseTextExtract {
async doGetText(): Promise<string> {
const result = await this.cdp.send("DOM.getDocument", {"depth": -1, "pierce": true});
const result = await this.cdp.send("DOM.getDocument", {
depth: -1,
pierce: true,
});
return this.parseTextFromDOM(result);
}

@ -108,8 +135,20 @@ export class TextExtractViaDocument extends BaseTextExtract {
return accum.join("\n");
}

parseText(node: Protocol.DOM.Node, metadata: Record<string, string> | null, accum: string[]) {
const SKIPPED_NODES = ["head", "script", "style", "header", "footer", "banner-div", "noscript"];
parseText(
node: Protocol.DOM.Node,
metadata: Record<string, string> | null,
accum: string[],
) {
const SKIPPED_NODES = [
"head",
"script",
"style",
"header",
"footer",
"banner-div",
"noscript",
];
const EMPTY_LIST: Protocol.DOM.Node[] = [];
const TEXT = "#text";
const TITLE = "title";

@ -150,4 +189,3 @@ export class TextExtractViaDocument extends BaseTextExtract {
}
}
}

@ -1,7 +1,7 @@
import { logger } from "./logger.js";

export function sleep(seconds: number) {
return new Promise(resolve => setTimeout(resolve, seconds * 1000));
return new Promise((resolve) => setTimeout(resolve, seconds * 1000));
}

// TODO: Fix this the next time the file is edited.

@ -13,27 +13,33 @@ export function timedRun(
message = "Promise timed out",
logDetails = {},
context = "general",
isWarn=false
isWarn = false,
) {
// return Promise return value or log error if timeout is reached first
const timeout = seconds * 1000;

const rejectPromiseOnTimeout = (timeout: number) => {
return new Promise((resolve, reject) => {
setTimeout(() => (reject("timeout reached")), timeout);
setTimeout(() => reject("timeout reached"), timeout);
});
};

return Promise.race([promise, rejectPromiseOnTimeout(timeout)])
.catch((err) => {
return Promise.race([promise, rejectPromiseOnTimeout(timeout)]).catch(
(err) => {
if (err == "timeout reached") {
const logFunc = isWarn ? logger.warn : logger.error;
logFunc.call(logger, message, {"seconds": seconds, ...logDetails}, context);
logFunc.call(
logger,
message,
{ seconds: seconds, ...logDetails },
context,
);
} else {
//logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context);
throw err;
}
});
},
);
}

export function secondsElapsed(startTime: number, nowDate: Date | null = null) {

@ -2,8 +2,7 @@ import fs from "fs";
import path from "path";
import * as warcio from "warcio";

export class WARCResourceWriter
{
export class WARCResourceWriter {
// TODO: Fix this the next time the file is edited.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
page: any;

@ -12,16 +11,32 @@ export class WARCResourceWriter
warcName: string;
date: Date;

constructor({url, directory, date, warcName} : {url: string, directory: string, date: Date, warcName: string}) {
constructor({
url,
directory,
date,
warcName,
}: {
url: string;
directory: string;
date: Date;
warcName: string;
}) {
this.url = url;
this.directory = directory;
this.warcName = path.join(this.directory, warcName);
this.date = date ? date : new Date();
}

async writeBufferToWARC(contents: Uint8Array, resourceType: string, contentType: string) {
async writeBufferToWARC(
contents: Uint8Array,
resourceType: string,
contentType: string,
) {
const warcRecord = await this.wrap(contents, resourceType, contentType);
const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {gzip: true});
const warcRecordBuffer = await warcio.WARCSerializer.serialize(warcRecord, {
gzip: true,
});
fs.appendFileSync(this.warcName, warcRecordBuffer);
}

@ -34,12 +49,15 @@ export class WARCResourceWriter
}
const resourceUrl = `urn:${resourceType}:${this.url}`;

return warcio.WARCRecord.create({
return warcio.WARCRecord.create(
{
url: resourceUrl,
date: this.date.toISOString(),
type: warcRecordType,
warcVersion,
warcHeaders
}, content());
warcHeaders,
},
content(),
);
}
}

@ -7,10 +7,8 @@ import { WARCSerializer } from "warcio/node";
import { logger, errJSON } from "./logger.js";
import type { IndexerOffsetLength, WARCRecord } from "warcio";

// =================================================================
export class WARCWriter implements IndexerOffsetLength
{
export class WARCWriter implements IndexerOffsetLength {
archivesDir: string;
tempCdxDir: string;
filename: string;

@ -25,8 +23,19 @@ export class WARCWriter implements IndexerOffsetLength
fh?: Writable | null;
cdxFH?: Writable | null;

constructor({archivesDir, tempCdxDir, filename, gzip, logDetails} :
{archivesDir: string, tempCdxDir: string, filename: string, gzip: boolean, logDetails: Record<string, string>}) {
constructor({
archivesDir,
tempCdxDir,
filename,
gzip,
logDetails,
}: {
archivesDir: string;
tempCdxDir: string;
filename: string;
gzip: boolean;
logDetails: Record<string, string>;
}) {
this.archivesDir = archivesDir;
this.tempCdxDir = tempCdxDir;
this.filename = filename;

@ -43,14 +52,22 @@ export class WARCWriter implements IndexerOffsetLength

async initFH() {
if (!this.fh) {
this.fh = fs.createWriteStream(path.join(this.archivesDir, this.filename));
this.fh = fs.createWriteStream(
path.join(this.archivesDir, this.filename),
);
}
if (!this.cdxFH && this.tempCdxDir) {
this.cdxFH = fs.createWriteStream(path.join(this.tempCdxDir, this.filename + ".cdx"));
this.cdxFH = fs.createWriteStream(
path.join(this.tempCdxDir, this.filename + ".cdx"),
);
}
}

async writeRecordPair(responseRecord: WARCRecord, requestRecord: WARCRecord, responseSerializer: WARCSerializer | undefined = undefined) {
async writeRecordPair(
responseRecord: WARCRecord,
requestRecord: WARCRecord,
responseSerializer: WARCSerializer | undefined = undefined,
) {
const opts = { gzip: this.gzip };

if (!responseSerializer) {

@ -59,15 +76,20 @@ export class WARCWriter implements IndexerOffsetLength

await this.initFH();

this.recordLength = await this._writeRecord(responseRecord, responseSerializer);
this.recordLength = await this._writeRecord(
responseRecord,
responseSerializer,
);

this._writeCDX(responseRecord);

const requestSerializer = new WARCSerializer(requestRecord, opts);
this.recordLength = await this._writeRecord(requestRecord, requestSerializer);
this.recordLength = await this._writeRecord(
requestRecord,
requestSerializer,
);

this._writeCDX(requestRecord);

}

async _writeRecord(record: WARCRecord, serializer: WARCSerializer) {

@ -83,7 +105,11 @@ export class WARCWriter implements IndexerOffsetLength
try {
this.fh.write(chunk);
} catch (e) {
logger.error("Error writing to WARC, corruption possible", {...errJSON(e), url, ...this.logDetails}, "writer");
logger.error(
"Error writing to WARC, corruption possible",
{ ...errJSON(e), url, ...this.logDetails },
"writer",
);
}
}

@ -119,7 +145,7 @@ export class WARCWriter implements IndexerOffsetLength

// =================================================================
export function streamFinish(fh: Writable) {
const p = new Promise<void>(resolve => {
const p = new Promise<void>((resolve) => {
fh.once("finish", () => resolve());
});
fh.end();

@ -16,9 +16,14 @@ const TEARDOWN_TIMEOUT = 10;
|
|||
const FINISHED_TIMEOUT = 60;
|
||||
|
||||
// ===========================================================================
|
||||
export function runWorkers(
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
export function runWorkers(crawler: any, numWorkers: number, maxPageTime: number, collDir: string) {
|
||||
crawler: any,
|
||||
numWorkers: number,
|
||||
maxPageTime: number,
|
||||
collDir: string,
|
||||
) {
|
||||
logger.info(`Creating ${numWorkers} workers`, {}, "worker");
|
||||
|
||||
const workers = [];
|
||||
|
@ -39,13 +44,12 @@ export function runWorkers(crawler: any, numWorkers: number, maxPageTime: number
|
|||
}
|
||||
|
||||
for (let i = 0; i < numWorkers; i++) {
|
||||
workers.push(new PageWorker((i + offset), crawler, maxPageTime, collDir));
|
||||
workers.push(new PageWorker(i + offset, crawler, maxPageTime, collDir));
|
||||
}
|
||||
|
||||
return Promise.allSettled(workers.map((worker) => worker.run()));
|
||||
}
|
||||
|
||||
|
||||
// ===========================================================================
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
|
@ -55,17 +59,18 @@ export type WorkerOpts = Record<string, any> & {
|
|||
workerid: WorkerId;
|
||||
// eslint-disable-next-line @typescript-eslint/ban-types
|
||||
callbacks: Record<string, Function>;
|
||||
directFetchCapture?: ((url: string) => Promise<{fetched: boolean, mime: string}>) | null;
|
||||
directFetchCapture?:
|
||||
| ((url: string) => Promise<{ fetched: boolean; mime: string }>)
|
||||
| null;
|
||||
};
|
||||
|
||||
// ===========================================================================
|
||||
export type WorkerState = WorkerOpts & {
|
||||
data: PageState
|
||||
data: PageState;
|
||||
};
|
||||
|
||||
// ===========================================================================
|
||||
export class PageWorker
|
||||
{
|
||||
export class PageWorker {
|
||||
id: WorkerId;
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
|
@ -91,16 +96,25 @@ export class PageWorker
|
|||
|
||||
recorder: Recorder;
|
||||
|
||||
constructor(
|
||||
id: WorkerId,
|
||||
// TODO: Fix this the next time the file is edited.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
constructor(id: WorkerId, crawler: any, maxPageTime: number, collDir: string) {
|
||||
crawler: any,
|
||||
maxPageTime: number,
|
||||
collDir: string,
|
||||
) {
|
||||
this.id = id;
|
||||
this.crawler = crawler;
|
||||
this.maxPageTime = maxPageTime;
|
||||
|
||||
this.logDetails = { workerid: this.id };
|
||||
|
||||
this.recorder = new Recorder({workerid: id, collDir, crawler: this.crawler});
|
||||
this.recorder = new Recorder({
|
||||
workerid: id,
|
||||
collDir,
|
||||
crawler: this.crawler,
|
||||
});
|
||||
|
||||
this.crawler.browser.recorders.push(this.recorder);
|
||||
}
|
||||
|
@ -121,7 +135,7 @@ export class PageWorker
|
|||
TEARDOWN_TIMEOUT,
|
||||
"Page Teardown Timed Out",
|
||||
this.logDetails,
|
||||
"worker"
|
||||
"worker",
|
||||
);
|
||||
} catch (e) {
|
||||
// ignore
|
||||
|
@ -129,13 +143,17 @@ export class PageWorker
|
|||
}
|
||||
|
||||
try {
|
||||
logger.debug("Closing page", {crashed: this.crashed, workerid: this.id}, "worker");
|
||||
logger.debug(
|
||||
"Closing page",
|
||||
{ crashed: this.crashed, workerid: this.id },
|
||||
"worker",
|
||||
);
|
||||
await timedRun(
|
||||
this.page.close(),
|
||||
TEARDOWN_TIMEOUT,
|
||||
"Page Close Timed Out",
|
||||
this.logDetails,
|
||||
"worker"
|
||||
"worker",
|
||||
);
|
||||
} catch (e) {
|
||||
// ignore
|
||||
|
@ -156,8 +174,18 @@ export class PageWorker
|
|||
}
|
||||
|
||||
async initPage(url: string): Promise<WorkerOpts> {
|
||||
if (!this.crashed && this.page && this.opts && ++this.reuseCount <= MAX_REUSE && this.isSameOrigin(url)) {
|
||||
logger.debug("Reusing page", {reuseCount: this.reuseCount, ...this.logDetails}, "worker");
|
||||
if (
|
||||
!this.crashed &&
|
||||
this.page &&
|
||||
this.opts &&
|
||||
++this.reuseCount <= MAX_REUSE &&
|
||||
this.isSameOrigin(url)
|
||||
) {
|
||||
logger.debug(
|
||||
"Reusing page",
|
||||
{ reuseCount: this.reuseCount, ...this.logDetails },
|
||||
"worker",
|
||||
);
|
||||
return this.opts;
|
||||
} else if (this.page) {
|
||||
await this.closePage();
|
||||
|
@ -176,7 +204,7 @@ export class PageWorker
|
|||
NEW_WINDOW_TIMEOUT,
|
||||
"New Window Timed Out",
|
||||
{ workerid },
|
||||
"worker"
|
||||
"worker",
|
||||
);
|
||||
|
||||
if (!result) {
|
||||
|
@ -188,7 +216,9 @@ export class PageWorker
|
|||
this.page = page;
|
||||
this.cdp = cdp;
|
||||
this.callbacks = {};
|
||||
const directFetchCapture = this.recorder ? (x: string) => this.recorder.directFetchCapture(x) : null;
|
||||
const directFetchCapture = this.recorder
|
||||
? (x: string) => this.recorder.directFetchCapture(x)
|
||||
: null;
|
||||
this.opts = {
|
||||
page,
|
||||
cdp,
|
||||
|
@ -203,7 +233,9 @@ export class PageWorker
|
|||
|
||||
// updated per page crawl
|
||||
this.crashed = false;
|
||||
this.crashBreak = new Promise((resolve, reject) => this.markCrashed = reject);
|
||||
this.crashBreak = new Promise(
|
||||
(resolve, reject) => (this.markCrashed = reject),
|
||||
);
|
||||
|
||||
this.logDetails = { page: page.url(), workerid };
|
||||
|
||||
|
@ -213,7 +245,11 @@ export class PageWorker
|
|||
page.on("error", (err: any) => {
|
||||
// ensure we're still on this page, otherwise ignore!
|
||||
if (this.page === page) {
|
||||
logger.error("Page Crashed", {...errJSON(err), ...this.logDetails}, "worker");
|
||||
logger.error(
|
||||
"Page Crashed",
|
||||
{ ...errJSON(err), ...this.logDetails },
|
||||
"worker",
|
||||
);
|
||||
this.crashed = true;
|
||||
if (this.markCrashed) {
|
||||
this.markCrashed("crashed");
|
||||
|
@ -224,9 +260,12 @@ export class PageWorker
|
|||
await this.crawler.setupPage(this.opts);
|
||||
|
||||
return this.opts;
|
||||
|
||||
} catch (err) {
|
||||
logger.warn("Error getting new page", {"workerid": this.id, ...errJSON(err)}, "worker");
|
||||
logger.warn(
|
||||
"Error getting new page",
|
||||
{ workerid: this.id, ...errJSON(err) },
|
||||
"worker",
|
||||
);
|
||||
retry++;
|
||||
|
||||
if (!this.crawler.browser.browser) {
|
||||
|
@ -234,7 +273,11 @@ export class PageWorker
|
|||
}
|
||||
|
||||
if (retry >= MAX_REUSE) {
|
||||
logger.fatal("Unable to get new page, browser likely crashed", this.logDetails, "worker");
|
||||
logger.fatal(
|
||||
"Unable to get new page, browser likely crashed",
|
||||
this.logDetails,
|
||||
"worker",
|
||||
);
|
||||
}
|
||||
|
||||
await sleep(0.5);
|
||||
|
@ -262,7 +305,7 @@ export class PageWorker
|
|||
const { data } = opts;
|
||||
const { url } = data;
|
||||
|
||||
logger.info("Starting page", {workerid, "page": url}, "worker");
|
||||
logger.info("Starting page", { workerid, page: url }, "worker");
|
||||
|
||||
this.logDetails = { page: url, workerid };
|
||||
|
||||
|
@ -281,14 +324,17 @@ export class PageWorker
|
|||
this.maxPageTime,
|
||||
"Page Worker Timeout",
|
||||
this.logDetails,
|
||||
"worker"
|
||||
"worker",
|
||||
),
|
||||
this.crashBreak
|
||||
this.crashBreak,
|
||||
]);
|
||||
|
||||
} catch (e) {
|
||||
if (e instanceof Error && e.message !== "logged" && !this.crashed) {
|
||||
logger.error("Worker Exception", {...errJSON(e), ...this.logDetails}, "worker");
|
||||
logger.error(
|
||||
"Worker Exception",
|
||||
{ ...errJSON(e), ...this.logDetails },
|
||||
"worker",
|
||||
);
|
||||
}
|
||||
} finally {
|
||||
await timedRun(
|
||||
|
@ -296,7 +342,7 @@ export class PageWorker
|
|||
FINISHED_TIMEOUT,
|
||||
"Page Finished Timed Out",
|
||||
this.logDetails,
|
||||
"worker"
|
||||
"worker",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -306,9 +352,17 @@ export class PageWorker
|
|||
|
||||
try {
|
||||
await this.runLoop();
|
||||
logger.info("Worker done, all tasks complete", {workerid: this.id}, "worker");
|
||||
logger.info(
|
||||
"Worker done, all tasks complete",
|
||||
{ workerid: this.id },
|
||||
"worker",
|
||||
);
|
||||
} catch (e) {
|
||||
logger.error("Worker error, exiting", {...errJSON(e), workerid: this.id}, "worker");
|
||||
logger.error(
|
||||
"Worker error, exiting",
|
||||
{ ...errJSON(e), workerid: this.id },
|
||||
"worker",
|
||||
);
|
||||
} finally {
|
||||
if (this.recorder) {
|
||||
await this.recorder.onDone();
|
||||
|
@ -342,7 +396,6 @@ export class PageWorker
|
|||
await this.timedCrawlPage({ ...opts, data });
|
||||
|
||||
loggedWaiting = false;
|
||||
|
||||
} else {
|
||||
// indicate that the worker has no more work (mostly for screencasting, status, etc...)
|
||||
// depending on other works, will either get more work or crawl will end
|
||||
|
@ -354,7 +407,11 @@ export class PageWorker
|
|||
// if pending, sleep and check again
|
||||
if (pending) {
|
||||
if (!loggedWaiting) {
|
||||
logger.debug("No crawl tasks, but pending tasks remain, waiting", {pending, workerid: this.id}, "worker");
|
||||
logger.debug(
|
||||
"No crawl tasks, but pending tasks remain, waiting",
|
||||
{ pending, workerid: this.id },
|
||||
"worker",
|
||||
);
|
||||
loggedWaiting = true;
|
||||
}
|
||||
await sleep(0.5);
|
||||
|
@ -368,5 +425,3 @@ export class PageWorker
|
|||
}
|
||||
}
|
||||
}
@ -10,17 +10,21 @@ function runCrawl(name, config, commandExtra = "") {
const configYaml = yaml.dump(config);

try {
const proc = child_process.execSync(`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`, {input: configYaml, stdin: "inherit", encoding: "utf8"});
const proc = child_process.execSync(
`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`,
{ input: configYaml, stdin: "inherit", encoding: "utf8" },
);

console.log(proc);
}
catch (error) {
} catch (error) {
console.log(error);
}
}

function doesCDXContain(coll, value) {
const data = fs.readFileSync(`test-crawls/collections/${coll}/indexes/index.cdxj`);
const data = fs.readFileSync(
`test-crawls/collections/${coll}/indexes/index.cdxj`,
);
return data.indexOf(value) >= 0;
}

@ -41,11 +45,13 @@ test("test crawl without ad block for specific URL", () => {

test("testcrawl with ad block for specific URL", () => {
const config = {
"url": "https://www.mozilla.org/en-US/firefox/",
"blockAds": true,
url: "https://www.mozilla.org/en-US/firefox/",
blockAds: true,
};

runCrawl("adblock-block", config);

expect(doesCDXContain("adblock-block", "www.googletagmanager.com")).toBe(false);
expect(doesCDXContain("adblock-block", "www.googletagmanager.com")).toBe(
false,
);
});

@ -11,7 +11,11 @@ test("dynamically add exclusion while crawl is running", async () => {
});

try {
exec("docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis", {"shell": "/bin/bash"}, callback);
exec(
"docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis",
{ shell: "/bin/bash" },
callback,
);
} catch (error) {
console.log(error);
}

@ -33,7 +37,10 @@ test("dynamically add exclusion while crawl is running", async () => {
const uids = await redis.hkeys("test:status");

// exclude all pages containing 'webrecorder', should clear out the queue and end the crawl
await redis.rpush(`${uids[0]}:msg`, JSON.stringify({type: "addExclusion", regex: "webrecorder"}));
await redis.rpush(
`${uids[0]}:msg`,
JSON.stringify({ type: "addExclusion", regex: "webrecorder" }),
);

// ensure 'Add Exclusion is contained in the debug logs
const { stdout } = await p;

@ -44,4 +51,3 @@ test("dynamically add exclusion while crawl is running", async () => {

await redis.disconnect();
});

@ -3,16 +3,18 @@ import fs from "fs";
|
|||
import path from "path";
|
||||
import md5 from "md5";
|
||||
|
||||
|
||||
|
||||
|
||||
test("ensure basic crawl run with docker run passes", async () => {
|
||||
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title \"test title\" --description \"test description\"");
|
||||
child_process.execSync(
|
||||
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description"',
|
||||
);
|
||||
|
||||
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz");
|
||||
|
||||
child_process.execSync("unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz");
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz",
|
||||
);
|
||||
|
||||
child_process.execSync(
|
||||
"unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz",
|
||||
);
|
||||
});
|
||||
|
||||
test("check that a combined warc file exists in the archive folder", () => {
|
||||
|
@ -27,9 +29,10 @@ test("check that a combined warc file exists in the archive folder", () => {
|
|||
expect(captureFound).toEqual(1);
|
||||
});
|
||||
|
||||
|
||||
test("check that a combined warc file is under the rolloverSize", () => {
|
||||
const warcLists = fs.readdirSync(path.join("test-crawls/collections/wr-net/wacz", "archive"));
|
||||
const warcLists = fs.readdirSync(
|
||||
path.join("test-crawls/collections/wr-net/wacz", "archive"),
|
||||
);
|
||||
let rolloverSize = 0;
|
||||
|
||||
function getFileSize(filename) {
|
||||
|
@ -37,7 +40,9 @@ test("check that a combined warc file is under the rolloverSize", () => {
|
|||
}
|
||||
|
||||
for (let i = 0; i < warcLists.length; i++) {
|
||||
const size = getFileSize(path.join("test-crawls/collections/wr-net/wacz/archive/", warcLists[i]));
|
||||
const size = getFileSize(
|
||||
path.join("test-crawls/collections/wr-net/wacz/archive/", warcLists[i]),
|
||||
);
|
||||
if (size < 10000) {
|
||||
rolloverSize = 1;
|
||||
}
|
||||
|
@ -46,27 +51,57 @@ test("check that a combined warc file is under the rolloverSize", () => {
|
|||
});
|
||||
|
||||
test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
|
||||
expect(fs.existsSync("test-crawls/collections/wr-net/pages/pages.jsonl")).toBe(true);
|
||||
expect(
|
||||
fs.existsSync("test-crawls/collections/wr-net/pages/pages.jsonl"),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
|
||||
expect(fs.existsSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl")).toBe(true);
|
||||
expect(
|
||||
fs.existsSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl"),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
|
||||
const crawl_hash = md5(JSON.parse(fs.readFileSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
|
||||
const wacz_hash = md5(JSON.parse(fs.readFileSync("test-crawls/collections/wr-net/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
|
||||
const fixture_hash = md5(JSON.parse(fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1])["text"]);
|
||||
const crawl_hash = md5(
|
||||
JSON.parse(
|
||||
fs
|
||||
.readFileSync(
|
||||
"test-crawls/collections/wr-net/wacz/pages/pages.jsonl",
|
||||
"utf8",
|
||||
)
|
||||
.split("\n")[1],
|
||||
)["text"],
|
||||
);
|
||||
const wacz_hash = md5(
|
||||
JSON.parse(
|
||||
fs
|
||||
.readFileSync(
|
||||
"test-crawls/collections/wr-net/pages/pages.jsonl",
|
||||
"utf8",
|
||||
)
|
||||
.split("\n")[1],
|
||||
)["text"],
|
||||
);
|
||||
const fixture_hash = md5(
|
||||
JSON.parse(
|
||||
fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1],
|
||||
)["text"],
|
||||
);
|
||||
|
||||
expect(wacz_hash).toEqual(fixture_hash);
|
||||
expect(wacz_hash).toEqual(crawl_hash);
|
||||
|
||||
});
|
||||
|
||||
test("check that the supplied title and description made it into datapackage.json", () => {
|
||||
expect(fs.existsSync("test-crawls/collections/wr-net/wacz/datapackage.json")).toBe(true);
|
||||
expect(
|
||||
fs.existsSync("test-crawls/collections/wr-net/wacz/datapackage.json"),
|
||||
).toBe(true);
|
||||
|
||||
const data = fs.readFileSync("test-crawls/collections/wr-net/wacz/datapackage.json", "utf8");
|
||||
const data = fs.readFileSync(
|
||||
"test-crawls/collections/wr-net/wacz/datapackage.json",
|
||||
"utf8",
|
||||
);
|
||||
const dataPackageJSON = JSON.parse(data);
|
||||
expect(dataPackageJSON.title).toEqual("test title");
|
||||
expect(dataPackageJSON.description).toEqual("test description");
|
@ -10,17 +10,21 @@ function runCrawl(name, config, commandExtra = "") {
|
|||
const configYaml = yaml.dump(config);
|
||||
|
||||
try {
|
||||
const proc = child_process.execSync(`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`, {input: configYaml, stdin: "inherit", encoding: "utf8"});
|
||||
const proc = child_process.execSync(
|
||||
`docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`,
|
||||
{ input: configYaml, stdin: "inherit", encoding: "utf8" },
|
||||
);
|
||||
|
||||
console.log(proc);
|
||||
}
|
||||
catch (error) {
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
}
|
||||
}
|
||||
|
||||
function doesCDXContain(coll, value) {
|
||||
const data = fs.readFileSync(`test-crawls/collections/${coll}/indexes/index.cdxj`);
|
||||
const data = fs.readFileSync(
|
||||
`test-crawls/collections/${coll}/indexes/index.cdxj`,
|
||||
);
|
||||
return data.indexOf(value) >= 0;
|
||||
}
|
||||
|
||||
|
@ -39,131 +43,154 @@ test("test crawl without block for specific URL", () => {
});
*/

test("test block rule on specific URL", () => {
const config = {
"url": "https://www.iana.org/",
"blockRules": [
{"url": "adsense"}
]
url: "https://www.iana.org/",
blockRules: [{ url: "adsense" }],
};

runCrawl("block-1", config);

expect(doesCDXContain("block-1", "https://cse.google.com/adsense/search/async-ads.js")).toBe(false);
expect(
doesCDXContain(
"block-1",
"https://cse.google.com/adsense/search/async-ads.js",
),
).toBe(false);
});

test("test block rule based on iframe text, content included due to match", () => {
const config = {
"url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
"blockRules": [{
"url": "https://www.youtube.com/embed/",
"frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"",
"type": "allowOnly"
}]
url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
blockRules: [
{
url: "https://www.youtube.com/embed/",
frameTextMatch:
'\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
type: "allowOnly",
},
],
};

runCrawl("block-2", config);

expect(doesCDXContain("block-2", "\"video/mp4\"")).toBe(true);
expect(doesCDXContain("block-2", '"video/mp4"')).toBe(true);
});

test("test block rule based on iframe text, wrong text, content should be excluded", () => {
const config = {
"url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
"blockRules": [{
"url": "https://www.youtube.com/embed/",
"frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_R\\\\\"",
"type": "allowOnly"
}]
url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
blockRules: [
{
url: "https://www.youtube.com/embed/",
frameTextMatch:
'\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_R\\\\"',
type: "allowOnly",
},
],
};

runCrawl("block-3", config);

expect(doesCDXContain("block-3", "\"video/mp4\"")).toBe(false);
expect(doesCDXContain("block-3", '"video/mp4"')).toBe(false);
});

test("test block rule based on iframe text, block matched", () => {
const config = {
"url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
"blockRules": [{
"url": "https://www.youtube.com/embed/",
"frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"",
}]
url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
blockRules: [
{
url: "https://www.youtube.com/embed/",
frameTextMatch:
'\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
},
],
};

runCrawl("block-4", config);

expect(doesCDXContain("block-4", "\"video/mp4\"")).toBe(false);
expect(doesCDXContain("block-4", '"video/mp4"')).toBe(false);
});

test("test rule based on iframe text not matching, plus allowOnly iframe", () => {
const config = {
"url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
"blockRules": [{
"url": "example.com/embed/",
"frameTextMatch": "\\\\\"channelId\\\\\":\\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\\"",
"type": "block"
}, {
"url": "(youtube.com|example.com)/embed/",
"type": "allowOnly",
"inFrameUrl": "oembed.link/",
}]
url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
blockRules: [
{
url: "example.com/embed/",
frameTextMatch:
'\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"',
type: "block",
},
{
url: "(youtube.com|example.com)/embed/",
type: "allowOnly",
inFrameUrl: "oembed.link/",
},
],
};

runCrawl("non-block-5", config);

expect(doesCDXContain("non-block-5", "\"video/mp4\"")).toBe(true);
expect(doesCDXContain("non-block-5", '"video/mp4"')).toBe(true);
});

test("test block url in frame url", () => {
const config = {
"url": "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
"blockRules": [{
"url": "maxresdefault.jpg",
"type": "block",
"inFrameUrl": "youtube.com/embed",
}]
url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI",
blockRules: [
{
url: "maxresdefault.jpg",
type: "block",
inFrameUrl: "youtube.com/embed",
},
],
};

runCrawl("block-6", config);

expect(doesCDXContain("block-6", "\"https://i.ytimg.com/vi/aT-Up5Y4uRI/maxresdefault.jpg\"")).toBe(false);
expect(
doesCDXContain(
"block-6",
'"https://i.ytimg.com/vi/aT-Up5Y4uRI/maxresdefault.jpg"',
),
).toBe(false);
});

test("test block rules complex example, block external urls on main frame, but not on youtube", () => {
const config = {
"seeds": [
"https://archiveweb.page/en/troubleshooting/errors/",
seeds: ["https://archiveweb.page/en/troubleshooting/errors/"],
depth: "0",
blockRules: [
{
url: "(archiveweb.page|www.youtube.com)",
type: "allowOnly",
inFrameUrl: "archiveweb.page",
},
{
url: "https://archiveweb.page/assets/js/vendor/lunr.min.js",
inFrameUrl: "archiveweb.page",
},
{
url: "https://www.youtube.com/embed/",
type: "allowOnly",
frameTextMatch:
'(\\\\"channelId\\\\":\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\")',
},
],
"depth": "0",
"blockRules": [{
"url": "(archiveweb.page|www.youtube.com)",
"type": "allowOnly",
"inFrameUrl": "archiveweb.page"
}, {
"url": "https://archiveweb.page/assets/js/vendor/lunr.min.js",
"inFrameUrl": "archiveweb.page"
}, {
"url": "https://www.youtube.com/embed/",
"type": "allowOnly",
"frameTextMatch": "(\\\\\"channelId\\\\\":\\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\\")"
}],

"combineWARC": true,
combineWARC: true,

"logging": "stats,debug"
logging: "stats,debug",
};

runCrawl("block-7", config);

expect(doesCDXContain("block-7", "\"https://archiveweb.page/assets/js/vendor/lunr.min.js\"")).toBe(false);
expect(doesCDXContain("block-7", "\"video/mp4\"")).toBe(true);
expect(
doesCDXContain(
"block-7",
'"https://archiveweb.page/assets/js/vendor/lunr.min.js"',
),
).toBe(false);
expect(doesCDXContain("block-7", '"video/mp4"')).toBe(true);
});
@ -3,31 +3,30 @@ import {exec as execCallback } from "child_process";

const exec = util.promisify(execCallback);

test("check that the collection name is properly validated", async () => {
let passed = "";

try {
await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid");
await exec(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid",
);
passed = true;
}
catch (error) {
} catch (error) {
passed = false;
}
expect(passed).toBe(true);
});

test("check that the collection name is not accepted if it doesn't meets our standards", async () => {
let passed = "";

try {
await exec("docker run webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid");
await exec(
"docker run webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid",
);
passed = true;
}
catch(e){
} catch (e) {
passed = false;
}
expect(passed).toBe(false);

});
@ -6,17 +6,19 @@ import {exec as execCallback } from "child_process";

const exec = util.promisify(execCallback);

test("check yaml config file with seed list is used", async () => {
try {

await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --config /tests/fixtures/crawl-1.yaml --depth 0");
}
catch (error) {
await exec(
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --config /tests/fixtures/crawl-1.yaml --depth 0",
);
} catch (error) {
console.log(error);
}

const crawledPages = fs.readFileSync("test-crawls/collections/configtest/pages/pages.jsonl", "utf8");
const crawledPages = fs.readFileSync(
"test-crawls/collections/configtest/pages/pages.jsonl",
"utf8",
);
const pages = new Set();

for (const line of crawledPages.trim().split("\n")) {

@ -26,7 +28,9 @@ test("check yaml config file with seed list is used", async () => {
}
}

const config = yaml.load(fs.readFileSync("tests/fixtures/crawl-1.yaml", "utf8"));
const config = yaml.load(
fs.readFileSync("tests/fixtures/crawl-1.yaml", "utf8"),
);

let foundAllSeeds = true;

@ -38,20 +42,24 @@ test("check yaml config file with seed list is used", async () => {
}
expect(foundAllSeeds).toBe(true);

expect(fs.existsSync("test-crawls/collections/configtest/configtest.wacz")).toBe(true);

expect(
fs.existsSync("test-crawls/collections/configtest/configtest.wacz"),
).toBe(true);
});

test("check yaml config file will be overwritten by command line", async () => {
try {

await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://specs.webrecorder.net/ --scopeType page --timeout 20000");
}
catch (error) {
await exec(
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://specs.webrecorder.net/ --scopeType page --timeout 20000",
);
} catch (error) {
console.log(error);
}

const crawledPages = fs.readFileSync("test-crawls/collections/configtest-2/pages/pages.jsonl", "utf8");
const crawledPages = fs.readFileSync(
"test-crawls/collections/configtest-2/pages/pages.jsonl",
"utf8",
);
const pages = new Set();

for (const line of crawledPages.trim().split("\n")) {

@ -63,5 +71,4 @@ test("check yaml config file will be overwritten by command line", async () => {

expect(pages.has("https://specs.webrecorder.net/")).toBe(true);
expect(pages.size).toBe(1);

});
@ -7,15 +7,20 @@ test("pass config file via stdin", async () => {
const config = yaml.load(configYaml);

try {
const proc = child_process.execSync("docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202", {input: configYaml, stdin: "inherit", encoding: "utf8"});
const proc = child_process.execSync(
"docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202",
{ input: configYaml, stdin: "inherit", encoding: "utf8" },
);

console.log(proc);
}
catch (error) {
} catch (error) {
console.log(error);
}

const crawledPages = fs.readFileSync("test-crawls/collections/config-stdin/pages/pages.jsonl", "utf8");
const crawledPages = fs.readFileSync(
"test-crawls/collections/config-stdin/pages/pages.jsonl",
"utf8",
);
const pages = new Set();

for (const line of crawledPages.trim().split("\n")) {

@ -37,6 +42,7 @@ test("pass config file via stdin", async () => {
}
expect(foundAllSeeds).toBe(true);

expect(fs.existsSync("test-crawls/collections/config-stdin/config-stdin.wacz")).toBe(true);

expect(
fs.existsSync("test-crawls/collections/config-stdin/config-stdin.wacz"),
).toBe(true);
});
@ -1,31 +1,48 @@
import child_process from "child_process";
import fs from "fs";

test("ensure --overwrite with existing collection results in a successful crawl", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite");
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite",
);

child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite --overwrite");
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite --overwrite",
);
});

test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
expect(fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl")).toBe(true);
expect(
fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl"),
).toBe(true);
});

test("check that the WACZ file exists in the collection", () => {
expect(fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl")).toBe(true);
expect(
fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl"),
).toBe(true);
});

//-----------

test("ensure --overwrite results in a successful crawl even if collection didn't exist", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite-nothing --overwrite");
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite-nothing --overwrite",
);
});

test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
expect(fs.existsSync("test-crawls/collections/overwrite-nothing/pages/pages.jsonl")).toBe(true);
expect(
fs.existsSync(
"test-crawls/collections/overwrite-nothing/pages/pages.jsonl",
),
).toBe(true);
});

test("check that the WACZ file exists in the collection", () => {
expect(fs.existsSync("test-crawls/collections/overwrite-nothing/pages/pages.jsonl")).toBe(true);
expect(
fs.existsSync(
"test-crawls/collections/overwrite-nothing/pages/pages.jsonl",
),
).toBe(true);
});
@ -1,23 +1,36 @@
import child_process from "child_process";

test("test custom behaviors", async () => {
const res = child_process.execSync("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page");
const res = child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --url https://example.org/ --url https://webrecorder.net/ --customBehaviors /custom-behaviors/ --scopeType page",
);

const log = res.toString();

// custom behavior ran for example.com
expect(log.indexOf("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.com/\",\"workerid\":0}}") > 0).toBe(true);
expect(
log.indexOf(
'{"state":{},"msg":"test-stat","page":"https://example.com/","workerid":0}}',
) > 0,
).toBe(true);

// but not for example.org
expect(log.indexOf("{\"state\":{},\"msg\":\"test-stat\",\"page\":\"https://example.org/\",\"workerid\":0}}") > 0).toBe(false);
expect(
log.indexOf(
'{"state":{},"msg":"test-stat","page":"https://example.org/","workerid":0}}',
) > 0,
).toBe(false);

expect(log.indexOf("{\"state\":{\"segments\":1},\"msg\":\"Skipping autoscroll, page seems to not be responsive to scrolling events\",\"page\":\"https://example.org/\",\"workerid\":0}}") > 0).toBe(true);
expect(
log.indexOf(
'{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events","page":"https://example.org/","workerid":0}}',
) > 0,
).toBe(true);

// another custom behavior ran for webrecorder.net
expect(log.indexOf("{\"state\":{},\"msg\":\"test-stat-2\",\"page\":\"https://webrecorder.net/\",\"workerid\":0}}") > 0).toBe(true);

expect(
log.indexOf(
'{"state":{},"msg":"test-stat-2","page":"https://webrecorder.net/","workerid":0}}',
) > 0,
).toBe(true);
});
@ -1,9 +1,8 @@
/* eslint-disable @typescript-eslint/no-unused-vars */
class TestBehavior2
{
class TestBehavior2 {
static init() {
return {
state: {}
state: {},
};
}

@ -15,7 +14,6 @@ class TestBehavior2
return window.location.origin === "https://webrecorder.net";
}

async *run(ctx) {
ctx.log("In Test Behavior 2!");
yield ctx.Lib.getState(ctx, "test-stat-2");

@ -1,9 +1,8 @@
/* eslint-disable @typescript-eslint/no-unused-vars */
class TestBehavior
{
class TestBehavior {
static init() {
return {
state: {}
state: {},
};
}

@ -15,7 +14,6 @@ class TestBehavior
return window.location.origin === "https://example.com";
}

async *run(ctx) {
ctx.log("In Test Behavior!");
yield ctx.Lib.getState(ctx, "test-stat");
@ -1,16 +1,19 @@
import child_process from "child_process";
import fs from "fs";

test("ensure custom driver with custom selector crawls JS files as pages", async () => {
try {
child_process.execSync("docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs");
}
catch (error) {
child_process.execSync(
"docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs",
);
} catch (error) {
console.log(error);
}

const crawledPages = fs.readFileSync("test-crawls/collections/custom-driver-1/pages/pages.jsonl", "utf8");
const crawledPages = fs.readFileSync(
"test-crawls/collections/custom-driver-1/pages/pages.jsonl",
"utf8",
);
const pages = new Set();

for (const line of crawledPages.trim().split("\n")) {

@ -26,9 +29,8 @@ test("ensure custom driver with custom selector crawls JS files as pages", async
const expectedPages = new Set([
"https://www.iana.org/",
"https://www.iana.org/_js/jquery.js",
"https://www.iana.org/_js/iana.js"
"https://www.iana.org/_js/iana.js",
]);

expect(pages).toEqual(expectedPages);

});
@ -7,16 +7,21 @@ const exec = util.promisify(execCallback);

const extraHopsTimeout = 180000;

test("check that URLs are crawled 2 extra hops beyond depth", async () => {
test(
"check that URLs are crawled 2 extra hops beyond depth",
async () => {
try {
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://webrecorder.net/ --limit 7");
}
catch (error) {
await exec(
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://webrecorder.net/ --limit 7",
);
} catch (error) {
console.log(error);
}

const crawledPages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8");
const crawledPages = fs.readFileSync(
"test-crawls/collections/extra-hops-beyond/pages/pages.jsonl",
"utf8",
);
const crawledPagesArray = crawledPages.trim().split("\n");

const expectedPages = [

@ -39,4 +44,6 @@ test("check that URLs are crawled 2 extra hops beyond depth", async () => {
}
expect(expectedPages.indexOf(url) >= 0).toBe(true);
}
}, extraHopsTimeout);
},
extraHopsTimeout,
);
@ -2,8 +2,9 @@ import child_process from "child_process";
import fs from "fs";

test("ensure that stats file is modified", async () => {

const child = child_process.exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --limit 3 --collection file-stats --statsFilename progress.json");
const child = child_process.exec(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --generateWACZ --text --limit 3 --collection file-stats --statsFilename progress.json",
);

// detect crawler exit
let crawler_exited = false;

@ -12,7 +13,7 @@ test("ensure that stats file is modified", async () => {
});

// helper function to sleep
const sleep = ms => new Promise(res => setTimeout(res, ms));
const sleep = (ms) => new Promise((res) => setTimeout(res, ms));

// wait for stats file creation up to 30 secs (to not wait indefinitely)
let counter = 0;

@ -23,7 +24,9 @@ test("ensure that stats file is modified", async () => {
}

// get initial modification time
const initial_mtime = fs.fstatSync(fs.openSync("test-crawls/progress.json", "r")).mtime;
const initial_mtime = fs.fstatSync(
fs.openSync("test-crawls/progress.json", "r"),
).mtime;

// wait for crawler exit
while (!crawler_exited) {

@ -31,12 +34,13 @@ test("ensure that stats file is modified", async () => {
}

// get final modification time
const final_mtime = fs.fstatSync(fs.openSync("test-crawls/progress.json", "r")).mtime;
const final_mtime = fs.fstatSync(
fs.openSync("test-crawls/progress.json", "r"),
).mtime;

// compare initial and final modification time
const diff = Math.abs(final_mtime - initial_mtime);
expect(diff > 0).toBe(true);

});

test("check that stats file format is correct", () => {
1
tests/fixtures/crawl-1.yaml
vendored
@ -5,4 +5,3 @@ seeds:
- https://specs.webrecorder.net/

generateWACZ: true

5
tests/fixtures/driver-1.mjs
vendored
@ -1,4 +1,5 @@
export default async ({ data, page, crawler }) => {
await crawler.loadPage(page, data, [{selector: "script[src]", extract: "src", isAttribute: false}]);
await crawler.loadPage(page, data, [
{ selector: "script[src]", extract: "src", isAttribute: false },
]);
};
@ -2,8 +2,9 @@ import child_process from "child_process";
import fs from "fs";

test("ensure page limit reached", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors \"\" --url https://webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json");

child_process.execSync(
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors "" --url https://webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json',
);
});

test("check limit written to stats file is as expected", () => {
@ -2,9 +2,9 @@ import child_process from "child_process";
import fs from "fs";
import path from "path";

function jsonLinesToArray(string) {
return string.split("\n")
return string
.split("\n")
.filter((line) => {
try {
JSON.parse(line);

@ -13,19 +13,19 @@ function jsonLinesToArray(string) {
return false;
}
})
.map(line => JSON.parse(line));
.map((line) => JSON.parse(line));
}

test("ensure crawl run with log options passes", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://specs.webrecorder.net --generateWACZ --collection wr-specs-logs --logging debug,stats --logLevel debug,warn --context general");
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://specs.webrecorder.net --generateWACZ --collection wr-specs-logs --logging debug,stats --logLevel debug,warn --context general",
);
});

test("check that log files exist and were filtered according to options", () => {
const logDir = "test-crawls/collections/wr-specs-logs/logs/";
const logFiles = [];
fs.readdirSync(logDir).forEach(file => {
fs.readdirSync(logDir).forEach((file) => {
if (file.startsWith("crawl-") && file.endsWith(".log")) {
logFiles.push(path.join(logDir, file));
}

@ -40,7 +40,9 @@ test("check that log files exist and were filtered according to options", () =>
expect(parsedJSONLines.length).toBeGreaterThan(0);

parsedJSONLines.forEach((jsonLine) => {
expect(jsonLine.logLevel === "debug" || jsonLine.logLevel === "warn").toBe(true);
expect(
jsonLine.logLevel === "debug" || jsonLine.logLevel === "warn",
).toBe(true);
expect(jsonLine.context).toBe("general");
});
}
@ -2,24 +2,47 @@ import child_process from "child_process";
import fs from "fs";

test("ensure multi url crawl run with docker run passes", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title \"test title\" --description \"test description\" --pages 2 --limit 2");

child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz");
child_process.execSync(
'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2',
);

child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz",
);
});

test("check that the favicon made it into the pages jsonl file", () => {
expect(fs.existsSync("test-crawls/collections/advanced/pages/pages.jsonl")).toBe(true);
expect(
fs.existsSync("test-crawls/collections/advanced/pages/pages.jsonl"),
).toBe(true);

const data1 = JSON.parse(fs.readFileSync("test-crawls/collections/advanced/pages/pages.jsonl", "utf8").split("\n")[1]);
const data2 = JSON.parse(fs.readFileSync("test-crawls/collections/advanced/pages/pages.jsonl", "utf8").split("\n")[2]);
const data1 = JSON.parse(
fs
.readFileSync(
"test-crawls/collections/advanced/pages/pages.jsonl",
"utf8",
)
.split("\n")[1],
);
const data2 = JSON.parse(
fs
.readFileSync(
"test-crawls/collections/advanced/pages/pages.jsonl",
"utf8",
)
.split("\n")[2],
);
const data = [data1, data2];
for (const d of data) {
if (d.url === "https://webrecorder.net/") {
expect(d.favIconUrl).toEqual("https://webrecorder.net/assets/favicon.ico");
expect(d.favIconUrl).toEqual(
"https://webrecorder.net/assets/favicon.ico",
);
}
if (d.url === "https://iana.org/") {
expect(d.favIconUrl).toEqual("https://www.iana.org/_img/bookmark_icon.ico");
expect(d.favIconUrl).toEqual(
"https://www.iana.org/_img/bookmark_icon.ico",
);
}
}
});
@ -1,14 +1,19 @@
import child_process from "child_process";

test("ensure crawl run with redis passes", async () => {
const redis = child_process.spawn("docker run -d --name test-crawl-redis -p 6379:6379 redis");
const redis = child_process.spawn(
"docker run -d --name test-crawl-redis -p 6379:6379 redis",
);

child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection redis-crawl --redisStoreUrl redis://127.0.0.1:6379 --workers 2");
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection redis-crawl --redisStoreUrl redis://127.0.0.1:6379 --workers 2",
);

redis.kill("SIGINT");
});

test("check that wacz created is valid", () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/redis-crawl/redis-crawl.wacz");
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/redis-crawl/redis-crawl.wacz",
);
});
@ -28,9 +28,12 @@ test("check crawl interrupted + saved state written", async () => {
const wait = waitForProcess();

try {
proc = exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --limit 20", {"shell": "/bin/bash"}, wait.callback);
}
catch (error) {
proc = exec(
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --limit 20",
{ shell: "/bin/bash" },
wait.callback,
);
} catch (error) {
console.log(error);
}

@ -45,7 +48,10 @@ test("check crawl interrupted + saved state written", async () => {

while (true) {
try {
const pages = fs.readFileSync(pagesFile, {encoding: "utf-8"}).trim().split("\n");
const pages = fs
.readFileSync(pagesFile, { encoding: "utf-8" })
.trim()
.split("\n");

if (pages.length >= 2) {
break;

@ -61,17 +67,21 @@ test("check crawl interrupted + saved state written", async () => {

await wait.p;

const savedStates = fs.readdirSync("test-crawls/collections/int-state-test/crawls");
const savedStates = fs.readdirSync(
"test-crawls/collections/int-state-test/crawls",
);
expect(savedStates.length > 0).toEqual(true);

savedStateFile = savedStates[savedStates.length - 1];
});

test("check parsing saved state + page done + queue present", () => {
expect(savedStateFile).toBeTruthy();

const savedState = fs.readFileSync(path.join("test-crawls/collections/int-state-test/crawls", savedStateFile), "utf-8");
const savedState = fs.readFileSync(
path.join("test-crawls/collections/int-state-test/crawls", savedStateFile),
"utf-8",
);

const saved = yaml.load(savedState);

@ -82,17 +92,19 @@ test("check parsing saved state + page done + queue present", () => {

expect(state.done > 0).toEqual(true);
expect(state.queued.length > 0).toEqual(true);

});

test("check crawl restarted with saved state", async () => {
let proc = null;

const wait = waitForProcess();

try {
proc = exec(`docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`, {shell: "/bin/bash"}, wait.callback);
proc = exec(
`docker run -p 36379:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 5`,
{ shell: "/bin/bash" },
wait.callback,
);
} catch (error) {
console.log(error);
}

@ -106,7 +118,7 @@ test("check crawl restarted with saved state", async () => {
maxRetriesPerRequest: 100,
retryStrategy(times) {
return times < 100 ? 1000 : null;
}
},
});

await new Promise((resolve) => setTimeout(resolve, 2000));

@ -126,5 +138,3 @@ test("interrupt crawl and exit", async () => {

expect(res[0].value).toBe(0);
});
@ -23,12 +23,10 @@ seeds:

`);

expect(seeds.length).toEqual(1);
expect(seeds[0].scopeType).toEqual("prefix");
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
expect(seeds[0].exclude).toEqual([]);

});

test("default scope + exclude", async () => {

@ -40,15 +38,12 @@ exclude: https://example.com/pathexclude

`);

expect(seeds.length).toEqual(1);
expect(seeds[0].scopeType).toEqual("prefix");
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);

});

test("default scope + exclude is numeric", async () => {
const seeds = getSeeds(`
seeds:

@ -58,17 +53,12 @@ exclude: "2022"

`);

expect(seeds.length).toEqual(1);
expect(seeds[0].scopeType).toEqual("prefix");
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
expect(seeds[0].exclude).toEqual([/2022/]);

});

test("prefix scope global + exclude", async () => {
const seeds = getSeeds(`
seeds:

@ -79,15 +69,12 @@ exclude: https://example.com/pathexclude

`);

expect(seeds.length).toEqual(1);
expect(seeds[0].scopeType).toEqual("prefix");
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);

});

test("prefix scope per seed + exclude", async () => {
const seeds = getSeeds(`
seeds:

@ -98,15 +85,12 @@ exclude: https://example.com/pathexclude

`);

expect(seeds.length).toEqual(1);
expect(seeds[0].scopeType).toEqual("prefix");
expect(seeds[0].include).toEqual([/^https?:\/\/example\.com\//]);
expect(seeds[0].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);

});

test("host scope and domain scope", async () => {
const seeds = getSeeds(`

@ -123,20 +107,26 @@ seeds:
expect(seeds[0].include).toEqual([/^https?:\/\/([^/]+\.)*example\.com\//]);
expect(!!seeds[0].include[0].exec("https://example.com/")).toEqual(true);
expect(!!seeds[0].include[0].exec("https://example.com/path")).toEqual(true);
expect(!!seeds[0].include[0].exec("https://sub.example.com/path")).toEqual(true);
expect(!!seeds[0].include[0].exec("https://sub.domain.example.com/path")).toEqual(true);
expect(!!seeds[0].include[0].exec("https://notsub.domainexample.com/path")).toEqual(false);
expect(!!seeds[0].include[0].exec("https://sub.example.com/path")).toEqual(
true,
);
expect(
!!seeds[0].include[0].exec("https://sub.domain.example.com/path"),
).toEqual(true);
expect(
!!seeds[0].include[0].exec("https://notsub.domainexample.com/path"),
).toEqual(false);

expect(seeds[1].scopeType).toEqual("host");
expect(seeds[1].include).toEqual([/^https?:\/\/example\.org\//]);
expect(!!seeds[1].include[0].exec("https://example.org/")).toEqual(true);
expect(!!seeds[1].include[0].exec("https://example.org/path")).toEqual(true);
expect(!!seeds[1].include[0].exec("https://sub.example.com/path")).toEqual(false);
expect(!!seeds[1].include[0].exec("https://sub.example.com/path")).toEqual(
false,
);
});

test("domain scope drop www.", async () => {

const seeds = getSeeds(`
seeds:
- url: https://www.example.com/

@ -146,11 +136,8 @@ seeds:
expect(seeds.length).toEqual(1);
expect(seeds[0].scopeType).toEqual("domain");
expect(seeds[0].include).toEqual([/^https?:\/\/([^/]+\.)*example\.com\//]);

});

test("custom scope", async () => {
const seeds = getSeeds(`
seeds:

@ -159,14 +146,12 @@ seeds:
exclude: https?://example.com/pathexclude
`);

expect(seeds.length).toEqual(1);
expect(seeds[0].scopeType).toEqual("custom");
expect(seeds[0].include).toEqual([/https?:\/\/example.com\/(path|other)/]);
expect(seeds[0].exclude).toEqual([/https?:\/\/example.com\/pathexclude/]);
});

test("inherit scope", async () => {
const seeds = getSeeds(`

@ -178,7 +163,6 @@ include: https?://example.com/(path|other)
exclude: https://example.com/pathexclude
`);

expect(seeds.length).toEqual(2);

expect(seeds[0].scopeType).toEqual("custom");

@ -190,10 +174,8 @@ exclude: https://example.com/pathexclude
expect(seeds[1].url).toEqual("https://example.com/2");
expect(seeds[1].include).toEqual([/https?:\/\/example.com\/(path|other)/]);
expect(seeds[1].exclude).toEqual([/https:\/\/example.com\/pathexclude/]);

});

test("override scope", async () => {
const seeds = getSeeds(`

@ -225,7 +207,10 @@ include: https://example.com/onlythispath

expect(seeds[2].scopeType).toEqual("prefix");
expect(seeds[2].url).toEqual("https://example.com/subpath/file.html");
expect(seeds[2].include).toEqual([/^https?:\/\/example\.com\/subpath\//, /https:\/\/example.com\/onlythispath/]);
expect(seeds[2].include).toEqual([
/^https?:\/\/example\.com\/subpath\//,
/https:\/\/example.com\/onlythispath/,
]);
expect(seeds[2].exclude).toEqual([]);

expect(seeds[3].scopeType).toEqual("custom");

@ -234,7 +219,6 @@ include: https://example.com/onlythispath
expect(seeds[3].exclude).toEqual([]);
});

test("override scope with exclude", async () => {
const seeds = getSeeds(`

@ -288,10 +272,8 @@ exclude:
expect(seeds[4].url).toEqual("https://example.com/4");
expect(seeds[4].include).toEqual([]);
expect(seeds[4].exclude).toEqual([]);

});

test("with exclude non-string types", async () => {
const seeds = getSeeds(`
seeds:

@ -342,5 +324,4 @@ seeds:
expect(seeds[7].exclude).toEqual([/null/]);
expect(seeds[8].exclude).toEqual([/false/]);
expect(seeds[9].exclude).toEqual([/true/]);

});
@ -4,48 +4,66 @@ import fs from "fs";
// screenshot

test("ensure basic crawl run with --screenshot passes", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection test --url http://www.example.com/ --screenshot view --workers 2");
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection test --url http://www.example.com/ --screenshot view --workers 2",
);
});

test("check that a screenshots warc file exists in the test collection", () => {
const screenshotWarcExists = fs.existsSync("test-crawls/collections/test/archive/screenshots.warc.gz");
const screenshotWarcExists = fs.existsSync(
"test-crawls/collections/test/archive/screenshots.warc.gz",
);
expect(screenshotWarcExists).toBe(true);
});

// fullPageScreenshot

test("ensure basic crawl run with --fullPageScreenshot passes", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection fullpage --url http://www.example.com/ --screenshot fullPage --workers 2");
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection fullpage --url http://www.example.com/ --screenshot fullPage --workers 2",
);
});

test("check that a screenshots warc file exists in the fullpage collection", () => {
const screenshotWarcExists = fs.existsSync("test-crawls/collections/fullpage/archive/screenshots.warc.gz");
const screenshotWarcExists = fs.existsSync(
"test-crawls/collections/fullpage/archive/screenshots.warc.gz",
);
expect(screenshotWarcExists).toBe(true);
});

// thumbnail

test("ensure basic crawl run with --thumbnail passes", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection thumbnail --url http://www.example.com/ --screenshot thumbnail --workers 2");
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection thumbnail --url http://www.example.com/ --screenshot thumbnail --workers 2",
);
});

test("check that a screenshots warc file exists in the thumbnail collection", () => {
const screenshotWarcExists = fs.existsSync("test-crawls/collections/thumbnail/archive/screenshots.warc.gz");
const screenshotWarcExists = fs.existsSync(
"test-crawls/collections/thumbnail/archive/screenshots.warc.gz",
);
expect(screenshotWarcExists).toBe(true);
});

// combination

test("ensure basic crawl run with multiple screenshot types and --generateWACZ passes", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection combined --url http://www.example.com/ --screenshot thumbnail,view,fullPage --generateWACZ --workers 2");
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection combined --url http://www.example.com/ --screenshot thumbnail,view,fullPage --generateWACZ --workers 2",
);
});

test("check that a screenshots warc file exists in the combined collection", () => {
const screenshotWarcExists = fs.existsSync("test-crawls/collections/combined/archive/screenshots.warc.gz");
const screenshotWarcExists = fs.existsSync(
"test-crawls/collections/combined/archive/screenshots.warc.gz",
);
expect(screenshotWarcExists).toBe(true);
});

test("check that a wacz file exists in the combined collection", () => {
const waczExists = fs.existsSync("test-crawls/collections/combined/combined.wacz");
const waczExists = fs.existsSync(
"test-crawls/collections/combined/combined.wacz",
);
expect(waczExists).toBe(true);
});
@ -3,11 +3,12 @@ import {exec as execCallback } from "child_process";

const exec = util.promisify(execCallback);

test("ensure one invalid seed doesn't end crawl if failOnFailedSeed is not set", async () => {
let passed = true;
try {
await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection invalidseed");
await exec(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection invalidseed",
);
} catch (error) {
console.log(error);
passed = false;

@ -18,9 +19,10 @@ test("ensure one invalid seed doesn't end crawl if failOnFailedSeed is not set",
test("ensure one invalid seed fails crawl if failOnFailedSeed is set", async () => {
let passed = true;
try {
await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --failOnFailedSeed --collection failseed");
}
catch (error) {
await exec(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --failOnFailedSeed --collection failseed",
);
} catch (error) {
passed = false;
}
expect(passed).toBe(false);

@ -29,9 +31,10 @@ test("ensure one invalid seed fails crawl if failOnFailedSeed is set", async ()
test("ensure crawl fails if no valid seeds are passed", async () => {
let passed = true;
try {
await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection allinvalidseeds");
}
catch (error) {
await exec(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url iana.org/ --url example.com/invalid-seed --generateWACZ --limit 1 --collection allinvalidseeds",
);
} catch (error) {
passed = false;
}
expect(passed).toBe(false);
@ -1,5 +1,7 @@
import { calculatePercentageUsed, checkDiskUtilization } from "../dist/util/storage.js";

import {
calculatePercentageUsed,
checkDiskUtilization,
} from "../dist/util/storage.js";

test("ensure calculatePercentageUsed returns expected values", () => {
expect(calculatePercentageUsed(30, 100)).toEqual(30);

@ -13,13 +15,11 @@ test("ensure calculatePercentageUsed returns expected values", () => {
expect(calculatePercentageUsed(0, 5)).toEqual(0);
});

test("verify end-to-end disk utilization not exceeded threshold", async () => {

const params = {
diskUtilization: 90,
combineWARC: true,
generateWACZ: true
generateWACZ: true,
};

const mockDfOutput = `\

@ -28,22 +28,24 @@ grpcfuse 1000000 285000 715000 28% /crawls`;

// with combineWARC + generateWACZ, projected is 285k + 4 * 5k = 310k = 31%
// does not exceed 90% threshold
const returnValue = await checkDiskUtilization(params, 5000 * 1024, mockDfOutput);
const returnValue = await checkDiskUtilization(
params,
5000 * 1024,
mockDfOutput,
);
expect(returnValue).toEqual({
stop: false,
used: 28,
projected: 31,
threshold: 90
threshold: 90,
});
});

test("verify end-to-end disk utilization exceeds threshold", async () => {

const params = {
diskUtilization: 90,
combineWARC: false,
generateWACZ: true
generateWACZ: true,
};

const mockDfOutput = `\

@ -52,11 +54,15 @@ grpcfuse 100000 85000 15000 85% /crawls`;

// with generateWACZ, projected is 85k + 3k x 2 = 91k = 91%
// exceeds 90% threshold
const returnValue = await checkDiskUtilization(params, 3000 * 1024, mockDfOutput);
const returnValue = await checkDiskUtilization(
params,
3000 * 1024,
mockDfOutput,
);
expect(returnValue).toEqual({
stop: true,
used: 85,
projected: 91,
threshold: 90
threshold: 90,
});
});
@ -3,16 +3,20 @@ import child_process from "child_process";

test("check that urn:text and urn:textfinal records are written to WARC", async () => {
try {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc");
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc",
);
} catch (error) {
//console.log(new TextDecoder().decode(error));
console.log(error.stderr);
}

const data = fs.readFileSync("test-crawls/collections/text-extract/indexes/index.cdxj", {"encoding": "utf-8"});
const data = fs.readFileSync(
"test-crawls/collections/text-extract/indexes/index.cdxj",
{ encoding: "utf-8" },
);

expect(data.indexOf("urn:text:https://www.nytimes.com/") > 0).toBe(true);

expect(data.indexOf("urn:textFinal:https://www.nytimes.com/") > 0).toBe(true);
});
@ -6,15 +6,21 @@ const exec = util.promisify(execCallback);

test("check that URLs in seed-list are crawled", async () => {
try {

await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000");
}
catch (error) {
await exec(
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000",
);
} catch (error) {
console.log(error);
}

let crawled_pages = fs.readFileSync("test-crawls/collections/filelisttest/pages/pages.jsonl", "utf8");
let seed_file = fs.readFileSync("tests/fixtures/urlSeedFile.txt", "utf8").split("\n").sort();
let crawled_pages = fs.readFileSync(
"test-crawls/collections/filelisttest/pages/pages.jsonl",
"utf8",
);
let seed_file = fs
.readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")
.split("\n")
.sort();

let seed_file_list = [];
for (var j = 0; j < seed_file.length; j++) {
@ -5,15 +5,19 @@ import child_process from "child_process";
test("check that the warcinfo file works as expected on the command line", async () => {
try {
const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
const proc = child_process.execSync("docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC", {input: configYaml, stdin: "inherit", encoding: "utf8"});
const proc = child_process.execSync(
"docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC",
{ input: configYaml, stdin: "inherit", encoding: "utf8" },
);

console.log(proc);
}
catch (error) {
} catch (error) {
console.log(error);
}

const warcData = fs.readFileSync("test-crawls/collections/warcinfo/warcinfo_0.warc.gz");
const warcData = fs.readFileSync(
"test-crawls/collections/warcinfo/warcinfo_0.warc.gz",
);

const data = zlib.gunzipSync(warcData);

@ -21,8 +25,8 @@ test("check that the warcinfo file works as expected on the command line", async

expect(string.indexOf("operator: test")).toBeGreaterThan(-1);
expect(string.indexOf("host: hostname")).toBeGreaterThan(-1);
expect(string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/)).not.toEqual(null);
expect(
string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/),
).not.toEqual(null);
expect(string.indexOf("format: WARC File Format 1.0")).toBeGreaterThan(-1);

});
@ -11,8 +11,12 @@
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */

/* Language and Environment */
"target": "es2022", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
"lib": ["es2022", "dom", "dom.iterable"], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
"target": "es2022" /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */,
"lib": [
"es2022",
"dom",
"dom.iterable"
] /* Specify a set of bundled library declaration files that describe the target runtime environment. */,
// "jsx": "preserve", /* Specify what JSX code is generated. */
// "experimentalDecorators": true, /* Enable experimental support for TC39 stage 2 draft decorators. */
// "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */

@ -25,9 +29,9 @@
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */

/* Modules */
"module": "NodeNext", /* Specify what module code is generated. */
"rootDir": "./src", /* Specify the root folder within your source files. */
"moduleResolution": "NodeNext", /* Specify how TypeScript looks up a file from a given module specifier. */
"module": "NodeNext" /* Specify what module code is generated. */,
"rootDir": "./src" /* Specify the root folder within your source files. */,
"moduleResolution": "NodeNext" /* Specify how TypeScript looks up a file from a given module specifier. */,
//"baseUrl": "./src", /* Specify the base directory to resolve non-relative module names. */
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */

@ -39,8 +43,8 @@
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */

/* JavaScript Support */
"allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
"checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
"allowJs": true /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */,
"checkJs": true /* Enable error reporting in type-checked JavaScript files. */,
// "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */

/* Emit */

@ -49,7 +53,7 @@
// "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
// "sourceMap": true, /* Create source map files for emitted JavaScript files. */
// "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
"outDir": "./dist/", /* Specify an output folder for all emitted files. */
"outDir": "./dist/" /* Specify an output folder for all emitted files. */,
// "removeComments": true, /* Disable emitting comments. */
// "noEmit": true, /* Disable emitting files from a compilation. */
// "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */

@ -73,10 +77,10 @@
// "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
//"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
// "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
"forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
"forceConsistentCasingInFileNames": true /* Ensure that casing is correct in imports. */,

/* Type Checking */
"strict": true, /* Enable all strict type-checking options. */
"strict": true /* Enable all strict type-checking options. */,
// "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
// "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
// "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */

@ -101,7 +105,5 @@
"skipLibCheck": true /* Skip type checking all .d.ts files. */
},

"include": [
"src/**/*",
]
"include": ["src/**/*"]
}
10
yarn.lock
@ -1914,6 +1914,11 @@ escodegen@^2.1.0:
optionalDependencies:
source-map "~0.6.1"

eslint-config-prettier@^9.0.0:
version "9.0.0"
resolved "https://registry.yarnpkg.com/eslint-config-prettier/-/eslint-config-prettier-9.0.0.tgz#eb25485946dd0c66cd216a46232dc05451518d1f"
integrity sha512-IcJsTkJae2S35pRsRAwoCE+925rJJStOdkKnLVgtE+tEpqU0EVVM7OqrwxqgptKdX29NUwC82I5pXsGFIgSevw==

eslint-plugin-react@^7.22.0:
version "7.23.2"
resolved "https://registry.yarnpkg.com/eslint-plugin-react/-/eslint-plugin-react-7.23.2.tgz#2d2291b0f95c03728b55869f01102290e792d494"

@ -3829,6 +3834,11 @@ prelude-ls@^1.2.1:
resolved "https://registry.yarnpkg.com/prelude-ls/-/prelude-ls-1.2.1.tgz#debc6489d7a6e6b0e7611888cec880337d316396"
integrity sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==

prettier@3.0.3:
version "3.0.3"
resolved "https://registry.yarnpkg.com/prettier/-/prettier-3.0.3.tgz#432a51f7ba422d1469096c0fdc28e235db8f9643"
integrity sha512-L/4pUDMxcNa8R/EthV08Zt42WBO4h1rarVtK0K+QJG0X187OLo7l699jWw0GKuwzkPQ//jMFA/8Xm6Fh3J/DAg==

pretty-format@^29.2.1:
version "29.2.1"
resolved "https://registry.yarnpkg.com/pretty-format/-/pretty-format-29.2.1.tgz#86e7748fe8bbc96a6a4e04fa99172630907a9611"