SAX-based sitemap parser (#497)

Adds a new SAX-based sitemap parser, inspired by:
https://www.npmjs.com/package/sitemap-stream-parser

Supports:
- recursively parsing sitemap indexes, using p-queue to process N at a time (currently 5)
- `fromDate` and `toDate` filter dates, to include only URLs between the given dates, with the filter also applied to nested sitemap lists
- async parsing: after the first 100 URLs, parsing continues in the background
- a 30-second timeout for the initial fetch / first 100 URLs, to avoid slowing down the crawl
- save/load state integration: sitemaps that have already been parsed are marked in redis and serialized to the saved state, to avoid reparsing them (they will be reparsed if parsing did not fully finish)
- awareness of `pageLimit`: URLs past the page limit are not added, and further parsing is interrupted once the limit is reached
- robots.txt `sitemap:` parsing, with checks on URL extension and mime type
- automatic detection of sitemaps for a seed URL if no sitemap URL is provided: robots.txt is checked first, then /sitemap.xml (see the sketch below)
- tests: full sitemap autodetect, sitemap with limit, and sitemap from a specific URL

Fixes #496
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2024-03-18 19:14:07 -07:00
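As a rough illustration of the autodetection order described above (a minimal sketch only, not the parser's actual code; `detectSitemaps` and its logic are hypothetical), a seed's sitemaps can be discovered by scanning robots.txt for `Sitemap:` entries and falling back to `/sitemap.xml`:

```js
// Illustrative sketch: sitemap autodetection for a seed URL, following the
// order described above (robots.txt first, then /sitemap.xml).
// Not the crawler's implementation; the function name is hypothetical.
async function detectSitemaps(seedUrl) {
  const origin = new URL(seedUrl).origin;
  const found = [];

  try {
    const resp = await fetch(new URL("/robots.txt", origin));
    if (resp.ok) {
      const text = await resp.text();
      for (const line of text.split("\n")) {
        // robots.txt "Sitemap:" directives are case-insensitive
        const m = line.match(/^\s*sitemap:\s*(\S+)/i);
        if (m) {
          found.push(m[1]);
        }
      }
    }
  } catch (e) {
    // no robots.txt (or fetch failed): fall back to the default location
  }

  if (!found.length) {
    found.push(new URL("/sitemap.xml", origin).href);
  }
  return found;
}
```
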
import child_process from "child_process";
import Redis from "ioredis";

function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
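
// Stop the crawler container with SIGINT, then poll "docker ps -q" until it has exited.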
async function waitContainer(containerId) {
  try {
    child_process.execSync(`docker kill -s SIGINT ${containerId}`);
  } catch (e) {
    return;
  }

  // containerId is initially the full id, but docker ps
  // only prints the short id (first 12 characters)
  containerId = containerId.slice(0, 12);

  while (true) {
    try {
      const res = child_process.execSync("docker ps -q", { encoding: "utf-8" });
      if (res.indexOf(containerId) < 0) {
        return;
      }
    } catch (e) {
      console.error(e);
    }
    await sleep(500);
  }
}
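
// Run a crawl with the given URL/sitemap/limit, poll the crawler's Redis queue
// until the sitemap is marked done or numExpected URLs have been queued, then
// stop the container and check the final queue size against the expected bounds.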
async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") {
  // start the crawler detached, mapping its internal Redis to host port 36381
  // so the test can poll the crawl queue (--debugAccessRedis)
  const command = `docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`;

  const containerId = child_process.execSync(command, { encoding: "utf-8" });

  // give the container a few seconds to start up and bring up its Redis
  await sleep(3000);

  const redis = new Redis("redis://127.0.0.1:36381/0", { lazyConnect: true, retryStrategy: () => null });

  let finished = 0;

  try {
    await redis.connect({
      maxRetriesPerRequest: 100,
    });

    while (true) {
      // number of URLs queued so far
      finished = await redis.zcard("test:q");

      // stop once the crawler marks sitemap parsing as fully finished...
      if (await redis.get("test:sitemapDone")) {
        break;
      }
      // ...or once the expected number of URLs has been queued
      if (finished >= numExpected) {
        break;
      }
    }
  } catch (e) {
    console.error(e);
  } finally {
    await waitContainer(containerId);
  }

  expect(finished).toBeGreaterThanOrEqual(numExpected);

  if (numExpectedLessThan) {
    expect(finished).toBeLessThanOrEqual(numExpectedLessThan);
  }
}
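
// End-to-end tests: each one starts the crawler in Docker against a live site,
// so Docker and network access are required.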
test("test sitemap fully finish", async () => {
  await runCrawl(3500, "https://www.mozilla.org/", "", 0);
});

test("test sitemap with limit", async () => {
  await runCrawl(1900, "https://www.mozilla.org/", "", 2000);
});

test("test sitemap with limit, specific URL", async () => {
  await runCrawl(1900, "https://www.mozilla.org/", "https://www.mozilla.org/sitemap.xml", 2000);
});

test("test sitemap with application/xml content-type", async () => {
  await runCrawl(10, "https://bitarchivist.net/", "", 0);
});

test("test sitemap with narrow scope, extraHops, to ensure out-of-scope sitemap URLs do not count as extraHops", async () => {
  await runCrawl(0, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page");
});