mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
tests text extraction (#30)
* new tests
* add jest to eslint, lint fixes

Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local>
This commit is contained in:
parent
748b0399e9
commit
fb0f1d8db9
10 changed files with 14382 additions and 1320 deletions
|
@ -2,7 +2,8 @@ module.exports = {
|
||||||
"env": {
|
"env": {
|
||||||
"browser": true,
|
"browser": true,
|
||||||
"es2021": true,
|
"es2021": true,
|
||||||
"node": true
|
"node": true,
|
||||||
|
"jest": true
|
||||||
},
|
},
|
||||||
"extends": "eslint:recommended",
|
"extends": "eslint:recommended",
|
||||||
"parserOptions": {
|
"parserOptions": {
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
env:
|
env:
|
||||||
|
jest: true
|
||||||
browser: true
|
browser: true
|
||||||
node: true
|
node: true
|
||||||
es2021: true
|
es2021: true
|
||||||
|
|
11
.github/workflows/ci.yaml
vendored
11
.github/workflows/ci.yaml
vendored
|
@ -36,11 +36,20 @@ jobs:
|
||||||
uses: actions/setup-node@v1
|
uses: actions/setup-node@v1
|
||||||
with:
|
with:
|
||||||
node-version: ${{ matrix.node-version }}
|
node-version: ${{ matrix.node-version }}
|
||||||
|
- name: install requirements
|
||||||
|
run: npm install
|
||||||
- name: build docker
|
- name: build docker
|
||||||
run: docker-compose build
|
run: docker-compose build
|
||||||
- name: run crawl
|
- name: run crawl
|
||||||
run: docker-compose run crawler crawl --url http://www.example.com/ --generateWACZ --collection wr-net --workers 2
|
run: docker-compose run crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --workers 2
|
||||||
- name: validate existing wacz
|
- name: validate existing wacz
|
||||||
run: docker-compose run crawler wacz validate --file collections/wr-net/wr-net.wacz
|
run: docker-compose run crawler wacz validate --file collections/wr-net/wr-net.wacz
|
||||||
|
- name: unzip wacz
|
||||||
|
run: docker-compose run crawler unzip collections/wr-net/wr-net.wacz -d collections/wr-net/wacz
|
||||||
|
- name: run jest
|
||||||
|
run: sudo yarn jest
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -420,7 +420,7 @@ class Crawler {
|
||||||
if (this.params.text){
|
if (this.params.text){
|
||||||
const client = await page.target().createCDPSession();
|
const client = await page.target().createCDPSession();
|
||||||
const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
|
const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
|
||||||
var text = await new TextExtract(result).parseTextFromDom()
|
text = await new TextExtract(result).parseTextFromDom()
|
||||||
}
|
}
|
||||||
|
|
||||||
this.writePage(data.url, title, this.params.text, text);
|
this.writePage(data.url, title, this.params.text, text);
|
||||||
|
|
9938
package-lock.json
generated
9938
package-lock.json
generated
File diff suppressed because it is too large
Load diff
|
@ -16,6 +16,8 @@
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"eslint": "^7.20.0",
|
"eslint": "^7.20.0",
|
||||||
"eslint-plugin-react": "^7.22.0"
|
"eslint-plugin-react": "^7.22.0",
|
||||||
|
"jest": "^26.6.3",
|
||||||
|
"md5": "^2.3.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
BIN
tests/.DS_Store
vendored
Normal file
BIN
tests/.DS_Store
vendored
Normal file
Binary file not shown.
2
tests/fixtures/pages.jsonl
vendored
Normal file
2
tests/fixtures/pages.jsonl
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
{"format":"json-pages-1.0","id":"pages","title":"All Pages","hasText":true}
|
||||||
|
{"title":"Example Domain","url":"http://www.example.com/","id":"2qok7uessksqo91vt90x8q","size":1256,"ts":"2021-02-24T02:31:27.538Z","text":"Example Domain\nThis domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.\nMore information..."}
|
22
tests/text.test.js
Normal file
22
tests/text.test.js
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
const fs = require("fs");
|
||||||
|
const md5 = require('md5');
|
||||||
|
|
||||||
|
|
||||||
|
test('check that the pages.jsonl file exists in the collection under the pages folder', () => {
|
||||||
|
expect(fs.existsSync('crawls/collections/wr-net/pages/pages.jsonl')).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('check that the pages.jsonl file exists in the wacz under the pages folder', () => {
|
||||||
|
expect(fs.existsSync('crawls/collections/wr-net/wacz/pages/pages.jsonl')).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('check that the hash in the pages folder and in the unzipped wacz folders match', () => {
|
||||||
|
const crawl_hash = md5(JSON.parse(fs.readFileSync('crawls/collections/wr-net/wacz/pages/pages.jsonl', 'utf8').split('\n')[1])['text']);
|
||||||
|
const wacz_hash = md5(JSON.parse(fs.readFileSync('crawls/collections/wr-net/pages/pages.jsonl', 'utf8').split('\n')[1])['text']);
|
||||||
|
const fixture_hash = md5(JSON.parse(fs.readFileSync('tests/fixtures/pages.jsonl', 'utf8').split('\n')[1])['text']);
|
||||||
|
|
||||||
|
expect(wacz_hash).toEqual(fixture_hash);
|
||||||
|
expect(wacz_hash).toEqual(crawl_hash);
|
||||||
|
|
||||||
|
});
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue