Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 14:33:17 +00:00
Customizable extract selectors + typo fix (0.4.2) (#72)
* Fix typo in setting crawler.capturePrefix which caused directFetchCapture() to fail, causing non-HTML URLs to fail.
  - Wrap directFetchCapture() to retry browser loading in case of failure.

* Custom link extraction improvements (improvements for #25):
  - extractLinks() returns a list of link URLs to allow for more flexibility in a custom driver.
  - Rename queueUrls() to queueInScopeUrls() to indicate that scope filtering is performed.
  - loadPage() accepts a list of selector opts {selector, extract, isAttribute} and defaults to {"a[href]", "href", false}.
  - Tests: add a test for a custom driver which uses a custom selector.

* Tests:
  - All tests use 'test-crawls' instead of 'crawls'.
  - Consolidation: combine the initial crawl + rollover, combined WARC, and text tests into basic_crawl.test.js.
  - Add a custom driver test and fixture to test custom link extraction.

* Add to CHANGES, bump to 0.4.2.
Parent: 36ac3cb905
Commit: 0e0b85d7c3

18 changed files with 167 additions and 108 deletions
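The new link-extraction API added here is easiest to see from a custom driver. Below is a minimal sketch of a driver using the new selector options; it mirrors the tests/fixtures/driver-1.js fixture added in this commit, and the script[src] selector is simply the example used there.

// Custom driver sketch: passing a selector-options list to loadPage()
// overrides DEFAULT_SELECTORS ({selector: "a[href]", extract: "href",
// isAttribute: false}), so here <script src> URLs are extracted and
// queued instead of anchor hrefs.
module.exports = async ({data, page, crawler}) => {
  await crawler.loadPage(page, data, [
    {selector: "script[src]", extract: "src", isAttribute: false}
  ]);
};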
.github/workflows/ci.yaml (8 lines changed)

@@ -20,7 +20,7 @@ jobs:
       - name: install requirements
         run: yarn install
       - name: run linter
-        run: yarn run eslint .
+        run: yarn lint
 
   build:
 
@@ -40,12 +40,6 @@ jobs:
         run: yarn install
       - name: build docker
         run: docker-compose build
-      - name: run crawl
-        run: docker-compose run crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2
-      - name: validate existing wacz
-        run: docker-compose run crawler wacz validate --file collections/wr-net/wr-net.wacz
-      - name: unzip wacz
-        run: sudo unzip crawls/collections/wr-net/wr-net.wacz -d crawls/collections/wr-net/wacz
       - name: run jest
         run: sudo yarn jest
 
CHANGES.md

@@ -1,5 +1,12 @@
 ## CHANGES
 
+v0.4.2
+- Compose/docs: Build latest image by default, update README to refer to latest image
+- Fix typo in `crawler.capturePrefix` that resulted in `directFetchCapture()` always failing
+- Tests: Update all tests to use `test-crawls` directory
+- extractLinks() just extracts links from default selectors, allows custom driver to filter results
+- loadPage() accepts a list of selector options with selector, extract, and isAttribute settings for further customization of link extraction
+
 v0.4.1
 - BlockRules Optimizations: don't intercept requests if no blockRules
 - Profile Creation: Support extending existing profile by passing a --profile param to load on startup
 
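As the v0.4.2 notes say, extractLinks() now only extracts and returns link URLs, while queueInScopeUrls() applies the seed's scope filtering when queueing, so a custom driver can post-process links in between. A hypothetical sketch of that pattern (not part of this commit; the .pdf filter, the null selector list used to skip default extraction, and the || [] guard are illustrative assumptions — data is the {url, seedId, depth} record handed to the driver):

// Hypothetical driver: extract links, filter them, then queue only the kept ones.
module.exports = async ({data, page, crawler}) => {
  // Load the page; a falsy selector list skips the built-in extract-and-queue step.
  await crawler.loadPage(page, data, null);

  // Extract anchor hrefs; guard in case extraction returns nothing.
  const links = (await crawler.extractLinks(page, {selector: "a[href]", extract: "href"})) || [];

  // Illustrative filter: drop PDF links before queueing.
  const keep = links.filter(url => !url.endsWith(".pdf"));

  crawler.queueInScopeUrls(data.seedId, keep, data.depth);
};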
crawler.js (47 lines changed)

@@ -28,7 +28,7 @@ const { parseArgs } = require("./util/argParser");
 
 const { getBrowserExe, loadProfile } = require("./util/browser");
 
-const { BEHAVIOR_LOG_FUNC, HTML_TYPES } = require("./util/constants");
+const { BEHAVIOR_LOG_FUNC, HTML_TYPES, DEFAULT_SELECTORS } = require("./util/constants");
 
 const { BlockRules } = require("./util/blockrules");
 
@@ -67,7 +67,7 @@ class Crawler {
     this.debugLog("Seeds", this.params.scopedSeeds);
 
     this.captureBasePrefix = `http://${process.env.PROXY_HOST}:${process.env.PROXY_PORT}/${this.params.collection}/record`;
-    this.capturePrefix = this.captureBasePrerix + "/id_/";
+    this.capturePrefix = this.captureBasePrefix + "/id_/";
 
     this.gotoOpts = {
       waitUntil: this.params.waitUntil,
@@ -405,12 +405,16 @@ class Crawler {
     }
   }
 
-  async loadPage(page, urlData, selector = "a[href]") {
+  async loadPage(page, urlData, selectorOptsList = DEFAULT_SELECTORS) {
     const {url, seedId, depth} = urlData;
 
     if (!await this.isHTML(url)) {
-      await this.directFetchCapture(url);
-      return;
+      try {
+        await this.directFetchCapture(url);
+        return;
+      } catch (e) {
+        // ignore failed direct fetch attempt, do browser-based capture
+      }
     }
 
     if (this.blockRules) {
@@ -423,36 +427,38 @@ class Crawler {
       console.warn(`Load timeout for ${url}`, e);
     }
 
-    if (selector) {
-      await this.extractLinks(page, seedId, depth, selector);
-    }
-  }
-
-  async extractLinks(page, seedId, depth, selector = "a[href]", prop = "href", isAttribute = false) {
-    const results = [];
-
     const seed = this.params.scopedSeeds[seedId];
 
     // skip extraction if at max depth
-    if (seed.isAtMaxDepth(depth)) {
+    if (seed.isAtMaxDepth(depth) || !selectorOptsList) {
       return;
     }
 
-    const loadProp = (selector, prop) => {
-      return [...document.querySelectorAll(selector)].map(elem => elem[prop]);
+    for (const opts of selectorOptsList) {
+      const links = await this.extractLinks(page, opts);
+      this.queueInScopeUrls(seedId, links, depth);
+    }
+  }
+
+  async extractLinks(page, {selector = "a[href]", extract = "href", isAttribute = false} = {}) {
+    const results = [];
+
+    const loadProp = (selector, extract) => {
+      return [...document.querySelectorAll(selector)].map(elem => elem[extract]);
     };
 
-    const loadAttr = (selector, attr) => {
-      return [...document.querySelectorAll(selector)].map(elem => elem.getAttribute(attr));
+    const loadAttr = (selector, extract) => {
+      return [...document.querySelectorAll(selector)].map(elem => elem.getAttribute(extract));
     };
 
     const loadFunc = isAttribute ? loadAttr : loadProp;
 
     try {
-      const linkResults = await Promise.allSettled(page.frames().map(frame => frame.evaluate(loadFunc, selector, prop)));
+      const linkResults = await Promise.allSettled(page.frames().map(frame => frame.evaluate(loadFunc, selector, extract)));
 
       if (linkResults) {
         for (const linkResult of linkResults) {
           if (!linkResult.value) continue;
           for (const link of linkResult.value) {
             results.push(link);
           }
@@ -461,12 +467,11 @@ class Crawler {
 
     } catch (e) {
       console.warn("Link Extraction failed", e);
-      return;
     }
-    this.queueUrls(seedId, results, depth);
+    return results;
   }
 
-  queueUrls(seedId, urls, depth) {
+  queueInScopeUrls(seedId, urls, depth) {
     try {
       depth += 1;
       const seed = this.params.scopedSeeds[seedId];
@@ -619,7 +624,7 @@ class Crawler {
 
     try {
       const { sites } = await sitemapper.fetch();
-      this.queueUrls(seedId, sites, 0);
+      this.queueInScopeUrls(seedId, sites, 0);
     } catch(e) {
       console.warn(e);
     }
 
package.json

@@ -1,12 +1,12 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.4.1",
+  "version": "0.4.2",
   "main": "browsertrix-crawler",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
   "author": "Ilya Kreymer <ikreymer@gmail.com>, Webrecorder Software",
   "license": "MIT",
   "scripts": {
-    "lint": "eslint *.js util/*.js"
+    "lint": "eslint *.js util/*.js tests/*.test.js"
   },
   "dependencies": {
     "abort-controller": "^3.0.0",
 
tests/basic_crawl.test.js (new file, 63 lines)

@@ -0,0 +1,63 @@
const child_process = require("child_process");
const fs = require("fs");
const path = require("path");
const md5 = require("md5");


test("ensure basic crawl run with docker run passes", async () => {
  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2");

  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/wr-net/wr-net.wacz");

  child_process.execSync("unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz");
});

test("check that a combined warc file exists in the archive folder", () => {
  const warcLists = fs.readdirSync("test-crawls/collections/wr-net");
  var captureFound = 0;

  for (var i = 0; i < warcLists.length; i++) {
    if (warcLists[i].endsWith("_0.warc.gz")){
      captureFound = 1;
    }
  }
  expect(captureFound).toEqual(1);
});


test("check that a combined warc file is under the rolloverSize", () => {
  const warcLists = fs.readdirSync(path.join("test-crawls/collections/wr-net/wacz", "archive"));
  let rolloverSize = 0;

  function getFileSize(filename) {
    return fs.statSync(filename).size;
  }

  for (let i = 0; i < warcLists.length; i++) {
    const size = getFileSize(path.join("test-crawls/collections/wr-net/wacz/archive/", warcLists[i]));
    if (size < 10000){
      rolloverSize = 1;
    }
  }
  expect(rolloverSize).toEqual(1);
});

test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
  expect(fs.existsSync("test-crawls/collections/wr-net/pages/pages.jsonl")).toBe(true);
});

test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
  expect(fs.existsSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl")).toBe(true);
});

test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
  const crawl_hash = md5(JSON.parse(fs.readFileSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
  const wacz_hash = md5(JSON.parse(fs.readFileSync("test-crawls/collections/wr-net/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
  const fixture_hash = md5(JSON.parse(fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1])["text"]);

  expect(wacz_hash).toEqual(fixture_hash);
  expect(wacz_hash).toEqual(crawl_hash);

});
 
@@ -6,7 +6,7 @@ test("check that the collection name is properly validated", async () => {
   let passed = "";
 
   try{
-    await exec("docker-compose run crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid");
+    await exec("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid");
     passed = true;
   }
   catch (error) {
@@ -21,7 +21,7 @@ test("check that the collection name is not accepted if it doesn't meets our sta
   let passed = "";
 
   try{
-    await exec("docker-compose run crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid");
+    await exec("docker run webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid");
     passed = true;
   }
   catch(e){
 
(deleted file, 13 lines)

@@ -1,13 +0,0 @@
-const fs = require("fs");
-
-test("check that a combined warc file exists in the archive folder", () => {
-  const warcLists = fs.readdirSync("crawls/collections/wr-net");
-  var captureFound = 0;
-
-  for (var i = 0; i < warcLists.length; i++) {
-    if (warcLists[i].endsWith("_0.warc.gz")){
-      captureFound = 1;
-    }
-  }
-  expect(captureFound).toEqual(1);
-});
 
@@ -8,13 +8,13 @@ test("check yaml config file with seed list is used", async () => {
 
   try{
 
-    await exec("docker-compose run -v $PWD/tests/fixtures:/tests/fixtures crawler crawl --collection configtest --config /tests/fixtures/crawl-1.yaml --depth 0");
+    await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --config /tests/fixtures/crawl-1.yaml --depth 0");
   }
   catch (error) {
     console.log(error);
   }
 
-  const crawledPages = fs.readFileSync("crawls/collections/configtest/pages/pages.jsonl", "utf8");
+  const crawledPages = fs.readFileSync("test-crawls/collections/configtest/pages/pages.jsonl", "utf8");
   const pages = new Set();
 
   for (const line of crawledPages.trim().split("\n")) {
@@ -36,7 +36,7 @@ test("check yaml config file with seed list is used", async () => {
   }
   expect(foundAllSeeds).toBe(true);
 
-  expect(fs.existsSync("crawls/collections/configtest/configtest.wacz")).toBe(true);
+  expect(fs.existsSync("test-crawls/collections/configtest/configtest.wacz")).toBe(true);
 
 });
 
@@ -45,22 +45,23 @@ test("check yaml config file will be overwritten by command line", async () => {
 
   try{
 
-    await exec("docker-compose run -v $PWD/crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures crawler crawl --collection configtest --config /tests/fixtures/crawl-1.yaml --url https://www.example.com --timeout 20000");
+    await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://www.example.com --timeout 20000");
   }
   catch (error) {
     console.log(error);
   }
 
-  const crawledPages = fs.readFileSync("crawls/collections/configtest/pages/pages.jsonl", "utf8");
+  const crawledPages = fs.readFileSync("test-crawls/collections/configtest-2/pages/pages.jsonl", "utf8");
   const pages = new Set();
 
   for (const line of crawledPages.trim().split("\n")) {
-    pages.add(JSON.parse(line).url);
+    const url = JSON.parse(line).url;
+    if (url) {
+      pages.add(url);
+    }
   }
 
   expect(pages.has("https://www.example.com/")).toBe(true);
 
   expect(pages.size).toBe(1);
 
 });
 
@@ -9,7 +9,7 @@ test("pass config file via stdin", async () => {
   const config = yaml.load(configYaml);
 
   try {
-    const proc = child_process.execSync("docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202", {input: configYaml, stdin: "inherit", encoding: "utf8"});
+    const proc = child_process.execSync("docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202", {input: configYaml, stdin: "inherit", encoding: "utf8"});
 
     console.log(proc);
   }
@@ -17,7 +17,7 @@ test("pass config file via stdin", async () => {
     console.log(error);
   }
 
-  const crawledPages = fs.readFileSync("crawls/collections/config-stdin/pages/pages.jsonl", "utf8");
+  const crawledPages = fs.readFileSync("test-crawls/collections/config-stdin/pages/pages.jsonl", "utf8");
   const pages = new Set();
 
   for (const line of crawledPages.trim().split("\n")) {
@@ -39,6 +39,6 @@ test("pass config file via stdin", async () => {
   }
   expect(foundAllSeeds).toBe(true);
 
-  expect(fs.existsSync("crawls/collections/config-stdin/config-stdin.wacz")).toBe(true);
+  expect(fs.existsSync("test-crawls/collections/config-stdin/config-stdin.wacz")).toBe(true);
 
 });
 
tests/custom_driver.test.js (new file, 34 lines)

@@ -0,0 +1,34 @@
const child_process = require("child_process");
const fs = require("fs");

test("ensure custom driver with custom selector crawls JS files as pages", async () => {
  jest.setTimeout(30000);

  try {
    child_process.execSync("docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.js");
  }
  catch (error) {
    console.log(error);
  }

  const crawledPages = fs.readFileSync("test-crawls/collections/custom-driver-1/pages/pages.jsonl", "utf8");
  const pages = new Set();

  for (const line of crawledPages.trim().split("\n")) {
    const url = JSON.parse(line).url;
    if (!url) {
      continue;
    }
    pages.add(url);
  }

  const expectedPages = new Set([
    "https://www.iana.org/",
    "https://www.iana.org/_js/2013.1/jquery.js",
    "https://www.iana.org/_js/2013.1/iana.js"
  ]);

  expect(pages).toEqual(expectedPages);
});
 
tests/fixtures/crawl-1.yaml (2 lines changed)

@@ -1,5 +1,5 @@
 name: crawl-test-1
+collection: configtest
 seeds:
   - https://www.example.org
   - https://www.iana.org/
 
tests/fixtures/driver-1.js (new file, 4 lines)

@@ -0,0 +1,4 @@
module.exports = async ({data, page, crawler}) => {
  await crawler.loadPage(page, data, [{selector: "script[src]", extract: "src", isAttribute: false}]);
};
 
(deleted file, 20 lines)

@@ -1,20 +0,0 @@
-const fs = require("fs");
-const path = require("path");
-
-function getFileSize(filename) {
-  var stats = fs.statSync(filename);
-  return stats.size;
-}
-
-test("check that a combined warc file is under the rolloverSize", () => {
-  const warcLists = fs.readdirSync(path.join("crawls/collections/wr-net/wacz", "archive"));
-  var rolloverSize = 0;
-
-  for (var i = 0; i < warcLists.length; i++) {
-    var size = getFileSize(path.join("crawls/collections/wr-net/wacz/archive/", warcLists[i]));
-    if (size < 10000){
-      rolloverSize = 1;
-    }
-  }
-  expect(rolloverSize).toEqual(1);
-});
 
@@ -6,13 +6,13 @@ function getSeeds(config) {
   const orig = fs.readFileSync;
 
   fs.readFileSync = (name, ...args) => {
-    if (name.endsWith("/configtest")) {
+    if (name.endsWith("/stdinconfig")) {
       return config;
     }
     return orig(name, ...args);
   };
 
-  return parseArgs(["node", "crawler", "--config", "configtest"]).scopedSeeds;
+  return parseArgs(["node", "crawler", "--config", "stdinconfig"]).scopedSeeds;
 }
 
 test("default scope", async () => {
 
(deleted file, 22 lines)

@@ -1,22 +0,0 @@
-const fs = require("fs");
-const md5 = require("md5");
-
-
-test("check that the pages.jsonl file exists in the collection under the pages folder", () => {
-  expect(fs.existsSync("crawls/collections/wr-net/pages/pages.jsonl")).toBe(true);
-});
-
-test("check that the pages.jsonl file exists in the wacz under the pages folder", () => {
-  expect(fs.existsSync("crawls/collections/wr-net/wacz/pages/pages.jsonl")).toBe(true);
-});
-
-test("check that the hash in the pages folder and in the unzipped wacz folders match", () => {
-  const crawl_hash = md5(JSON.parse(fs.readFileSync("crawls/collections/wr-net/wacz/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
-  const wacz_hash = md5(JSON.parse(fs.readFileSync("crawls/collections/wr-net/pages/pages.jsonl", "utf8").split("\n")[1])["text"]);
-  const fixture_hash = md5(JSON.parse(fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1])["text"]);
-
-  expect(wacz_hash).toEqual(fixture_hash);
-  expect(wacz_hash).toEqual(crawl_hash);
-
-});
 
@@ -7,13 +7,13 @@ test("check that all urls in a file list are crawled when the filelisturl param
 
   try{
 
-    await exec("docker-compose run -v $PWD/tests/fixtures:/tests/fixtures crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000");
+    await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 10000");
   }
   catch (error) {
     console.log(error);
   }
 
-  let crawled_pages = fs.readFileSync("crawls/collections/filelisttest/pages/pages.jsonl", "utf8");
+  let crawled_pages = fs.readFileSync("test-crawls/collections/filelisttest/pages/pages.jsonl", "utf8");
   let seed_file = fs.readFileSync("tests/fixtures/urlSeedFile.txt", "utf8").split("\n").sort();
 
   let seed_file_list = [];
 
@@ -7,7 +7,7 @@ test("check that the warcinfo file works as expected on the command line", async
 
   try{
     const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8");
-    const proc = child_process.execSync("docker run -i -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC", {input: configYaml, stdin: "inherit", encoding: "utf8"});
+    const proc = child_process.execSync("docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC", {input: configYaml, stdin: "inherit", encoding: "utf8"});
 
     console.log(proc);
   }
@@ -15,7 +15,7 @@ test("check that the warcinfo file works as expected on the command line", async
     console.log(error);
   }
 
-  const warcData = fs.readFileSync("crawls/collections/warcinfo/warcinfo_0.warc.gz");
+  const warcData = fs.readFileSync("test-crawls/collections/warcinfo/warcinfo_0.warc.gz");
 
   const data = zlib.gunzipSync(warcData);
 
util/constants.js

@@ -3,3 +3,9 @@ module.exports.HTML_TYPES = ["text/html", "application/xhtml", "application/xhtm
 module.exports.WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
 module.exports.BEHAVIOR_LOG_FUNC = "__bx_log";
+
+module.exports.DEFAULT_SELECTORS = [{
+  selector: "a[href]",
+  extract: "href",
+  isAttribute: false
+}];