Customizable extract selectors + typo fix (0.4.2) (#72)

* fix typo in setting crawler.capturePrefix which caused directFetchCapture() to fail, resulting in non-HTML URLs not being captured.
- wrap directFetchCapture() in a try/catch so the crawler falls back to browser-based loading if the direct fetch fails

* custom link extraction improvements (for #25)
- extractLinks() now returns a list of link URLs, allowing more flexibility in custom drivers
- rename queueUrls() to queueInScopeUrls() to indicate that scope filtering is performed
- loadPage() accepts a list of selector options {selector, extract, isAttribute} and defaults to {"a[href]", "href", false} (see the sketch below)
- tests: add test for custom driver which uses custom selector
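
For illustration, a custom driver using the new selector options might look like the sketch below (modeled on the tests/fixtures/driver-1.js fixture added in this commit; the extra "a[href]" entry and the commented filtering variant are illustrative only):

```js
// sketch of a custom driver passing custom selector options to loadPage()
module.exports = async ({data, page, crawler}) => {
  // load the page, then extract and queue both regular links and script URLs;
  // scope filtering is applied by queueInScopeUrls() inside loadPage()
  await crawler.loadPage(page, data, [
    {selector: "a[href]", extract: "href", isAttribute: false},
    {selector: "script[src]", extract: "src", isAttribute: false}
  ]);

  // alternatively, a driver can extract links itself and filter before queueing:
  // const links = await crawler.extractLinks(page, {selector: "a[href]", extract: "href", isAttribute: false});
  // crawler.queueInScopeUrls(data.seedId, links.filter((url) => !url.includes("/login")), data.depth);
};
```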

* tests
- tests: all tests use 'test-crawls' instead of 'crawls'
- consolidation: combine the initial crawl + rollover, combined WARC, and text tests into basic_crawl.test.js
- add custom driver test and fixture to test custom link extraction

* add to CHANGES, bump to 0.4.2
Ilya Kreymer 2021-07-23 18:31:43 -07:00 committed by GitHub
parent 36ac3cb905
commit 0e0b85d7c3
18 changed files with 167 additions and 108 deletions

View file

@@ -20,7 +20,7 @@ jobs:
- name: install requirements
run: yarn install
- name: run linter
run: yarn run eslint .
run: yarn lint
build:
@@ -40,12 +40,6 @@ jobs:
run: yarn install
- name: build docker
run: docker-compose build
- name: run crawl
run: docker-compose run crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2
- name: validate existing wacz
run: docker-compose run crawler wacz validate --file collections/wr-net/wr-net.wacz
- name: unzip wacz
run: sudo unzip crawls/collections/wr-net/wr-net.wacz -d crawls/collections/wr-net/wacz
- name: run jest
run: sudo yarn jest

View file

@@ -1,5 +1,12 @@
## CHANGES
v0.4.2
- Compose/docs: Build latest image by default, update README to refer to latest image
- Fix typo in `crawler.capturePrefix` that resulted in `directFetchCapture()` always failing
- Tests: Update all tests to use `test-crawls` directory
- extractLinks() just extracts links from default selectors, allows custom driver to filter results
- loadPage() accepts a list of selector options with selector, extract, and isAttribute settings for further customization of link extraction
v0.4.1
- BlockRules Optimizations: don't intercept requests if no blockRules
- Profile Creation: Support extending existing profile by passing a --profile param to load on startup

View file

@@ -28,7 +28,7 @@ const { parseArgs } = require("./util/argParser");
const { getBrowserExe, loadProfile } = require("./util/browser");
const { BEHAVIOR_LOG_FUNC, HTML_TYPES } = require("./util/constants");
const { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } = require("./util/constants");
const { BlockRules } = require("./util/blockrules");
@@ -67,7 +67,7 @@ class Crawler {
this.debugLog("Seeds", this.params.scopedSeeds);
this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
this.capturePrefix = this.captureBasePrerix + "/id_/";
this.capturePrefix = this.captureBasePrefix + "/id_/";
this.gotoOpts = {
waitUntil: this.params.waitUntil,
@@ -405,12 +405,16 @@ class Crawler {
}
}
async loadPage(page, urlData, selector = "a[href]") {
async loadPage(page, urlData, selectorOptsList = DEFAULT_SELECTORS) {
const {url, seedId, depth} = urlData;
if (!await this.isHTML(url)) {
try {
await this.directFetchCapture(url);
return;
} catch (e) {
// ignore failed direct fetch attempt, do browser-based capture
}
}
if (this.blockRules) {
@@ -423,36 +427,38 @@ class Crawler {
console.warn(`Load timeout for ${url}`, e);
}
if (selector) {
await this.extractLinks(page, seedId, depth, selector);
}
}
async extractLinks(page, seedId, depth, selector = "a[href]", prop = "href", isAttribute = false) {
const results = [];
const seed = this.params.scopedSeeds[seedId];
// skip extraction if at max depth
if (seed.isAtMaxDepth(depth)) {
if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
return;
}
const loadProp = (selector, prop) => {
return [...document.querySelectorAll(selector)].map(elem => elem[prop]);
for (const opts of selectorOptsList) {
const links = await this.extractLinks(page, opts);
this.queueInScopeUrls(seedId, links, depth);
}
}
async extractLinks(page, {selector = "a[href]", extract = "href", isAttribute = false} = {}) {
const results = [];
const loadProp = (selector, extract) => {
return [...document.querySelectorAll(selector)].map(elem => elem[extract]);
};
const loadAttr = (selector, attr) => {
return [...document.querySelectorAll(selector)].map(elem => elem.getAttribute(attr));
const loadAttr = (selector, extract) => {
return [...document.querySelectorAll(selector)].map(elem => elem.getAttribute(extract));
};
const loadFunc = isAttribute ? loadAttr : loadProp;
try {
const linkResults = await Promise.allSettled(page.frames().map(frame => frame.evaluate(loadFunc, selector, prop)));
const linkResults = await Promise.allSettled(page.frames().map(frame => frame.evaluate(loadFunc, selector, extract)));
if (linkResults) {
for (const linkResult of linkResults) {
if (!linkResult.value) continue;
for (const link of linkResult.value) {
results.push(link);
}
@@ -461,12 +467,11 @@
} catch (e) {
console.warn("Link Extraction failed", e);
return;
}
this.queueUrls(seedId, results, depth);
return results;
}
queueUrls(seedId, urls, depth) {
queueInScopeUrls(seedId, urls, depth) {
try {
depth += 1;
const seed = this.params.scopedSeeds[seedId];
@@ -619,7 +624,7 @@ class Crawler {
try {
const { sites } = await sitemapper.fetch();
this.queueUrls(seedId, sites, 0);
this.queueInScopeUrls(seedId, sites, 0);
} catch(e) {
console.warn(e);
}

View file

@@ -1,12 +1,12 @@
{
"name": "browsertrix-crawler",
"version": "0.4.1",
"version": "0.4.2",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
"license": "MIT",
"scripts": {
"lint": "eslint *.js util/*.js"
"lint": "eslint *.js util/*.js tests/*.test.js"
},
"dependencies": {
"abort-controller": "^3.0.0",

tests/basic_crawl.test.js (new file)
View file

@@ -0,0 +1,63 @@
const child_process = require("child_process");
const fs = require("fs");
const path = require("path");
const md5 = require("md5");
test("ensure basic crawl run with docker run passes", async () => {
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2");
child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz");
child_process.execSync("unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz");
});
test("check that a combined warc file exists in the archive folder", () => {
const warcLists = fs.readdirSync("test-crawls/collections/wr-net");
var captureFound = 0;
for (var i = 0; i < warcLists.length; i++) {
if (warcLists[i].endsWith("_0.warc.gz")){
captureFound = 1;
}
}
expect(captureFound).toEqual(1);
});
test("check that a combined warc file is under the rolloverSize", () => {
const warcLists = fs.readdirSync(path.join("test-crawls/collections/wr-net/wacz", "archive"));
let rolloverSize = 0;
function getFileSize(filename) {
return fs.statSync(filename).size;
}
for (let i = 0; i < warcLists.length; i++) {
const size = getFileSize(path.join("test-crawls/collections/wr-net/wacz/archive/", warcLists[i]));
if (size < 10000){
rolloverSize = 1;
}
}
expect(rolloverSize).toEqual(1);
});
test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
expect(fs.existsSync("test-crawls/collections/wr-net/pages/pages.jsonl")).toBe(true);
});
test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
expect(fs.existsSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl")).toBe(true);
});
test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
const crawl_hash = md5(JSON.parse(fs.readFileSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
const wacz_hash = md5(JSON.parse(fs.readFileSync("test-crawls/collections/wr-net/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
const fixture_hash = md5(JSON.parse(fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1])["text"]);
expect(wacz_hash).toEqual(fixture_hash);
expect(wacz_hash).toEqual(crawl_hash);
});

View file

@ -6,7 +6,7 @@ test("check that the collection name is properly validated", async () => {
let passed = "";
try{
await exec("docker-compose run crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid");
await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid");
passed = true;
}
catch (error) {
@ -21,7 +21,7 @@ test("check that the collection name is not accepted if it doesn't meets our sta
let passed = "";
try{
await exec("docker-compose run crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid");
await exec("docker run webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid");
passed = true;
}
catch(e){

View file

@@ -1,13 +0,0 @@
const fs = require("fs");
test("check that a combined warc file exists in the archive folder", () => {
const warcLists = fs.readdirSync("crawls/collections/wr-net");
var captureFound = 0;
for (var i = 0; i < warcLists.length; i++) {
if (warcLists[i].endsWith("_0.warc.gz")){
captureFound = 1;
}
}
expect(captureFound).toEqual(1);
});

View file

@ -8,13 +8,13 @@ test("check yaml config file with seed list is used", async () => {
try{
await exec("docker-compose run -v $PWD/tests/fixtures:/tests/fixtures crawler crawl --collection configtest --config /tests/fixtures/crawl-1.yaml --depth 0");
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --config /tests/fixtures/crawl-1.yaml --depth 0");
}
catch (error) {
console.log(error);
}
const crawledPages = fs.readFileSync("crawls/collections/configtest/pages/pages.jsonl", "utf8");
const crawledPages = fs.readFileSync("test-crawls/collections/configtest/pages/pages.jsonl", "utf8");
const pages = new Set();
for (const line of crawledPages.trim().split("\n")) {
@ -36,7 +36,7 @@ test("check yaml config file with seed list is used", async () => {
}
expect(foundAllSeeds).toBe(true);
expect(fs.existsSync("crawls/collections/configtest/configtest.wacz")).toBe(true);
expect(fs.existsSync("test-crawls/collections/configtest/configtest.wacz")).toBe(true);
});
@ -45,22 +45,23 @@ test("check yaml config file will be overwritten by command line", async () => {
try{
await exec("docker-compose run -v $PWD/crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures crawler crawl --collection configtest --config /tests/fixtures/crawl-1.yaml --url https://www.example.com --timeout 20000");
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://www.example.com --timeout 20000");
}
catch (error) {
console.log(error);
}
const crawledPages = fs.readFileSync("crawls/collections/configtest/pages/pages.jsonl", "utf8");
const crawledPages = fs.readFileSync("test-crawls/collections/configtest-2/pages/pages.jsonl", "utf8");
const pages = new Set();
for (const line of crawledPages.trim().split("\n")) {
pages.add(JSON.parse(line).url);
const url = JSON.parse(line).url;
if (url) {
pages.add(url);
}
}
expect(pages.has("https://www.example.com/")).toBe(true);
expect(pages.size).toBe(1);
});

View file

@ -9,7 +9,7 @@ test("pass config file via stdin", async () => {
const config = yaml.load(configYaml);
try {
const proc = child_process.execSync("docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202", {input: configYaml, stdin: "inherit", encoding: "utf8"});
const proc = child_process.execSync("docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202", {input: configYaml, stdin: "inherit", encoding: "utf8"});
console.log(proc);
}
@ -17,7 +17,7 @@ test("pass config file via stdin", async () => {
console.log(error);
}
const crawledPages = fs.readFileSync("crawls/collections/config-stdin/pages/pages.jsonl", "utf8");
const crawledPages = fs.readFileSync("test-crawls/collections/config-stdin/pages/pages.jsonl", "utf8");
const pages = new Set();
for (const line of crawledPages.trim().split("\n")) {
@ -39,6 +39,6 @@ test("pass config file via stdin", async () => {
}
expect(foundAllSeeds).toBe(true);
expect(fs.existsSync("crawls/collections/config-stdin/config-stdin.wacz")).toBe(true);
expect(fs.existsSync("test-crawls/collections/config-stdin/config-stdin.wacz")).toBe(true);
});

View file

@@ -0,0 +1,34 @@
const child_process = require("child_process");
const fs = require("fs");
test("ensure custom driver with custom selector crawls JS files as pages", async () => {
jest.setTimeout(30000);
try {
child_process.execSync("docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.js");
}
catch (error) {
console.log(error);
}
const crawledPages = fs.readFileSync("test-crawls/collections/custom-driver-1/pages/pages.jsonl", "utf8");
const pages = new Set();
for (const line of crawledPages.trim().split("\n")) {
const url = JSON.parse(line).url;
if (!url) {
continue;
}
pages.add(url);
}
const expectedPages = new Set([
"https://www.iana.org/",
"https://www.iana.org/_js/2013.1/jquery.js",
"https://www.iana.org/_js/2013.1/iana.js"
]);
expect(pages).toEqual(expectedPages);
});

View file

@@ -1,5 +1,5 @@
name: crawl-test-1
collection: configtest
seeds:
- https://www.example.org
- https://www.iana.org/

tests/fixtures/driver-1.js (new file)
View file

@@ -0,0 +1,4 @@
module.exports = async ({data, page, crawler}) => {
await crawler.loadPage(page, data, [{selector: "script[src]", extract: "src", isAttribute: false}]);
};

View file

@@ -1,20 +0,0 @@
const fs = require("fs");
const path = require("path");
function getFileSize(filename) {
var stats = fs.statSync(filename);
return stats.size;
}
test("check that a combined warc file is under the rolloverSize", () => {
const warcLists = fs.readdirSync(path.join("crawls/collections/wr-net/wacz", "archive"));
var rolloverSize = 0;
for (var i = 0; i < warcLists.length; i++) {
var size = getFileSize(path.join("crawls/collections/wr-net/wacz/archive/", warcLists[i]));
if (size < 10000){
rolloverSize = 1;
}
}
expect(rolloverSize).toEqual(1);
});

View file

@@ -6,13 +6,13 @@ function getSeeds(config) {
const orig = fs.readFileSync;
fs.readFileSync = (name, ...args) => {
if (name.endsWith("/configtest")) {
if (name.endsWith("/stdinconfig")) {
return config;
}
return orig(name, ...args);
};
return parseArgs(["node", "crawler", "--config", "configtest"]).scopedSeeds;
return parseArgs(["node", "crawler", "--config", "stdinconfig"]).scopedSeeds;
}
test("default scope", async () => {

View file

@@ -1,22 +0,0 @@
const fs = require("fs");
const md5 = require("md5");
test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
expect(fs.existsSync("crawls/collections/wr-net/pages/pages.jsonl")).toBe(true);
});
test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
expect(fs.existsSync("crawls/collections/wr-net/wacz/pages/pages.jsonl")).toBe(true);
});
test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
const crawl_hash = md5(JSON.parse(fs.readFileSync("crawls/collections/wr-net/wacz/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
const wacz_hash = md5(JSON.parse(fs.readFileSync("crawls/collections/wr-net/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
const fixture_hash = md5(JSON.parse(fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1])["text"]);
expect(wacz_hash).toEqual(fixture_hash);
expect(wacz_hash).toEqual(crawl_hash);
});

View file

@ -7,13 +7,13 @@ test("check that all urls in a file list are crawled when the filelisturl param
try{
await exec("docker-compose run -v $PWD/tests/fixtures:/tests/fixtures crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000");
await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000");
}
catch (error) {
console.log(error);
}
let crawled_pages = fs.readFileSync("crawls/collections/filelisttest/pages/pages.jsonl", "utf8");
let crawled_pages = fs.readFileSync("test-crawls/collections/filelisttest/pages/pages.jsonl", "utf8");
let seed_file = fs.readFileSync("tests/fixtures/urlSeedFile.txt", "utf8").split("\n").sort();
let seed_file_list = [];

View file

@ -7,7 +7,7 @@ test("check that the warcinfo file works as expected on the command line", async
try{
const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
const proc = child_process.execSync("docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC", {input: configYaml, stdin: "inherit", encoding: "utf8"});
const proc = child_process.execSync("docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC", {input: configYaml, stdin: "inherit", encoding: "utf8"});
console.log(proc);
}
@ -15,7 +15,7 @@ test("check that the warcinfo file works as expected on the command line", async
console.log(error);
}
const warcData = fs.readFileSync("crawls/collections/warcinfo/warcinfo_0.warc.gz");
const warcData = fs.readFileSync("test-crawls/collections/warcinfo/warcinfo_0.warc.gz");
const data = zlib.gunzipSync(warcData);

View file

@@ -3,3 +3,9 @@ module.exports.HTML_TYPES = ["text/html", "application/xhtml", "application/xhtm
module.exports.WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
module.exports.BEHAVIOR_LOG_FUNC = "__bx_log";
module.exports.DEFAULT_SELECTORS = [{
selector: "a[href]",
extract: "href",
isAttribute: false
}];