browsertrix-crawler/tests/exclude-redirected.test.js

import fs from "fs";
import { execSync } from "child_process";

// example.com includes a link to 'https://www.iana.org/domains/example' which redirects to 'https://www.iana.org/help/example-domains'
// page loading should be blocked on the redirect due to the exclusion of 'help', though the initial link is loaded

// Runs a one-extra-hop crawl of example.com with 'help' excluded, then checks
// that the extra-pages list for the collection holds a single line only.
test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
  const crawlCommand =
    "docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --exclude help --collection redir-exclude-test --extraHops 1";
  const extraPagesPath =
    "test-crawls/collections/redir-exclude-test/pages/extraPages.jsonl";

  // Synchronous: the file below is only read once the crawl command has returned.
  execSync(crawlCommand);

  // no entries besides header
  const extraPageLines = fs
    .readFileSync(extraPagesPath, "utf8")
    .trim()
    .split("\n");
  expect(extraPageLines.length).toBe(1);
});
Apply exclusions to redirects (#745) - if redirected page is excluded, block loading of page - mark page as excluded, don't retry, and don't write to page list - support generic blocking of pages based on initial page response - fixes #744 2025-01-28 11:28:23 -08:00			`import fs from "fs";`
			`import { execSync } from "child_process";`

			`// example.com includes a link to 'https://www.iana.org/domains/example' which redirects to 'https://www.iana.org/help/example-domains'`
			`// page loading should be blocked on the redirect due to the exclusion of 'help', though the initial link is loaded`

			`test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {`
			`execSync(`
			`"docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --exclude help --collection redir-exclude-test --extraHops 1");`

			`// no entries besides header`
			`expect(`
			`fs`
			`.readFileSync(`
			`"test-crawls/collections/redir-exclude-test/pages/extraPages.jsonl",`
			`"utf8",`
			`).trim().split("\n").length`
			`).toBe(1);`

			`});`