tests text extraction (#30)

* new tests

* add jest to eslint, lint fixes

Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local>
This commit is contained in:
Emma Dickson 2021-03-01 19:00:23 -05:00 committed by GitHub
parent 748b0399e9
commit fb0f1d8db9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 14382 additions and 1320 deletions

View file

@ -2,7 +2,8 @@ module.exports = {
"env": {
"browser": true,
"es2021": true,
"node": true
"node": true,
"jest": true
},
"extends": "eslint:recommended",
"parserOptions": {

View file

@ -1,4 +1,5 @@
env:
jest: true
browser: true
node: true
es2021: true

View file

@ -36,11 +36,20 @@ jobs:
uses: actions/setup-node@v1
with:
node-version: ${{ matrix.node-version }}
- name: install requirements
run: npm install
- name: build docker
run: docker-compose build
- name: run crawl
run: docker-compose run crawler crawl --url http://www.example.com/ --generateWACZ --collection wr-net --workers 2
run: docker-compose run crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --workers 2
- name: validate existing wacz
run: docker-compose run crawler wacz validate --file collections/wr-net/wr-net.wacz
- name: unzip wacz
run: docker-compose run crawler unzip collections/wr-net/wr-net.wacz -d collections/wr-net/wacz
- name: run jest
run: sudo yarn jest

View file

@ -420,7 +420,7 @@ class Crawler {
if (this.params.text){
const client = await page.target().createCDPSession();
const result = await client.send("DOM.getDocument", {"depth": -1, "pierce": true});
var text = await new TextExtract(result).parseTextFromDom()
text = await new TextExtract(result).parseTextFromDom()
}
this.writePage(data.url, title, this.params.text, text);

9938
package-lock.json generated

File diff suppressed because it is too large Load diff

View file

@ -16,6 +16,8 @@
},
"devDependencies": {
"eslint": "^7.20.0",
"eslint-plugin-react": "^7.22.0"
"eslint-plugin-react": "^7.22.0",
"jest": "^26.6.3",
"md5": "^2.3.0"
}
}

BIN
tests/.DS_Store vendored Normal file

Binary file not shown.

2
tests/fixtures/pages.jsonl vendored Normal file
View file

@ -0,0 +1,2 @@
{"format":"json-pages-1.0","id":"pages","title":"All Pages","hasText":true}
{"title":"Example Domain","url":"http://www.example.com/","id":"2qok7uessksqo91vt90x8q","size":1256,"ts":"2021-02-24T02:31:27.538Z","text":"Example Domain\nThis domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.\nMore information..."}

22
tests/text.test.js Normal file
View file

@ -0,0 +1,22 @@
const fs = require("fs");
const md5 = require('md5');
// Verifies that pages.jsonl is present in the collection's pages folder.
test('check that the pages.jsonl file exists in the collection under the pages folder', () => {
  const collectionPagesPath = 'crawls/collections/wr-net/pages/pages.jsonl';
  const isPresent = fs.existsSync(collectionPagesPath);
  expect(isPresent).toBe(true);
});
// Verifies that pages.jsonl is present under the pages folder of the wacz directory.
test('check that the pages.jsonl file exists in the wacz under the pages folder', () => {
  const waczPagesPath = 'crawls/collections/wr-net/wacz/pages/pages.jsonl';
  const isPresent = fs.existsSync(waczPagesPath);
  expect(isPresent).toBe(true);
});
// Verifies that the extracted page text is identical in three places: the
// collection's pages folder, the wacz folder, and the checked-in fixture.
test('check that the hash in the pages folder and in the unzipped wacz folders match', () => {
  /**
   * Returns the md5 hash of the `text` field of the second line of a
   * pages.jsonl file.
   *
   * In the fixture, line 0 is the header record ("format", "id", "title",
   * "hasText") and line 1 is the page record; the generated files are
   * assumed to follow the same layout.
   *
   * @param {string} jsonlPath - path of the pages.jsonl file to read
   * @returns {string} md5 hash of that record's `text` value
   */
  const hashFirstPageText = (jsonlPath) => {
    const lines = fs.readFileSync(jsonlPath, 'utf8').split('\n');
    const pageRecord = JSON.parse(lines[1]);
    return md5(pageRecord['text']);
  };

  // Each name now matches the file it is read from (they were swapped before).
  const waczHash = hashFirstPageText('crawls/collections/wr-net/wacz/pages/pages.jsonl');
  const crawlHash = hashFirstPageText('crawls/collections/wr-net/pages/pages.jsonl');
  const fixtureHash = hashFirstPageText('tests/fixtures/pages.jsonl');

  // The two comparisons together require all three hashes to be equal.
  expect(waczHash).toEqual(fixtureHash);
  expect(waczHash).toEqual(crawlHash);
});

5719
yarn.lock

File diff suppressed because it is too large Load diff