const util = require("util");
const exec = util.promisify(require("child_process").exec);
const fs = require("fs");
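
// Crawl https://example.com/ with --extraHops 2 and check that pages up to
// two hops beyond the seed are captured in the collection.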
test("check that URLs are crawled 2 extra hops beyond depth", async () => {
  jest.setTimeout(120000);
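
  // Run the crawler in Docker against the seed URL with --extraHops 2, capped at 7 pages.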
  try {
    await exec("docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://example.com/ --limit 7");
  }
  catch (error) {
    console.log(error);
  }
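
  // The crawler writes one JSON line per crawled page to pages.jsonl.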
  const crawled_pages = fs.readFileSync("test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", "utf8");
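
  // Pages reachable from https://example.com/ within two extra hops.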
  const expectedPages = [
    "https://example.com/",
    "https://www.iana.org/domains/example",
    "http://www.iana.org/",
    "http://www.iana.org/domains",
    "http://www.iana.org/protocols",
    "http://www.iana.org/numbers",
    "http://www.iana.org/about",
  ];
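
  // Every crawled page URL must be one of the expected pages.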
  for (const page of crawled_pages.trim().split("\n")) {
    const url = JSON.parse(page).url;
    if (!url) {
      continue;
    }
    expect(expectedPages.indexOf(url) >= 0).toBe(true);
  }
});