browsertrix-crawler/tests/url_file_list.test.js

import util from "util";
import { exec as execCallback } from "child_process";
import fs from "fs";

const exec = util.promisify(execCallback);

// Runs the dockerized crawler against the URL seed file fixture, then checks
// that every URL listed in that fixture shows up in the generated pages.jsonl.
test("check that URLs in seed-list are crawled", async () => {
  try {
    await exec(
      "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000",
    );
  } catch (error) {
    // Log instead of rethrowing so the checks below still run against
    // whatever output the crawl managed to produce.
    console.log(error);
  }

  const crawledPages = fs.readFileSync(
    "test-crawls/collections/filelisttest/pages/pages.jsonl",
    "utf8",
  );

  // Keep only real entries: blank lines (e.g. from a trailing newline) would
  // otherwise match trivially, since every string contains the empty string.
  const seedUrls = fs
    .readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line !== "");

  // Guard against a vacuous pass when the fixture contains no URLs at all.
  expect(seedUrls.length).toBeGreaterThan(0);

  // Check every seed URL (including the first one) and report the ones that
  // are missing, so a failure names the offending URL.
  const missingSeedUrls = seedUrls.filter(
    (seedUrl) => !crawledPages.includes(seedUrl),
  );
  expect(missingSeedUrls).toEqual([]);
});
Convert to ESM (#179) * switch base image to chrome/chromium 105 with node 18.x * convert all source to esm for node 18.x, remove unneeded node-fetch dependency * ci: use node 18.x, update to latest actions * tests: convert to esm, run with --experimental-vm-modules * tests: set higher default timeout (90s) for all tests * tests: rename driver test fixture to .mjs for loading in jest * bump to 0.8.0 2022-10-24 15:30:10 +02:00			`import util from "util";`
Add Prettier to the repo, and format all the files! (#428) This adds prettier to the repo, and sets up the pre-commit hook to auto-format as well as lint. Also updates ignores files to exclude crawls, test-crawls, scratch, dist as needed. 2023-11-09 19:11:11 -05:00			`import { exec as execCallback } from "child_process";`
Convert to ESM (#179) * switch base image to chrome/chromium 105 with node 18.x * convert all source to esm for node 18.x, remove unneeded node-fetch dependency * ci: use node 18.x, update to latest actions * tests: convert to esm, run with --experimental-vm-modules * tests: set higher default timeout (90s) for all tests * tests: rename driver test fixture to .mjs for loading in jest * bump to 0.8.0 2022-10-24 15:30:10 +02:00			`import fs from "fs";`
Add --urlFile param to specify text file with a list of URLs to crawl (#38) * Resolves #12 * Make --url param optional. Only one of --url of --urlFile should be specified. * Add ignoreScope option queueUrls() to support adding specific URLs * add tests for urlFile * bump version to 0.3.2 Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-05-13 01:57:06 -04:00
Convert to ESM (#179) * switch base image to chrome/chromium 105 with node 18.x * convert all source to esm for node 18.x, remove unneeded node-fetch dependency * ci: use node 18.x, update to latest actions * tests: convert to esm, run with --experimental-vm-modules * tests: set higher default timeout (90s) for all tests * tests: rename driver test fixture to .mjs for loading in jest * bump to 0.8.0 2022-10-24 15:30:10 +02:00			`const exec = util.promisify(execCallback);`
Add --urlFile param to specify text file with a list of URLs to crawl (#38) * Resolves #12 * Make --url param optional. Only one of --url of --urlFile should be specified. * Add ignoreScope option queueUrls() to support adding specific URLs * add tests for urlFile * bump version to 0.3.2 Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-05-13 01:57:06 -04:00
Implement improved json-l logging - Add Logger class with methods for info, error, warn, debug, fatal - Add context, timestamp, and details fields to log entries - Log messages as JSON Lines - Replace puppeteer-cluster stats with custom stats implementation - Log behaviors by default - Amend argParser to reflect logging changes - Capture and log stdout/stderr from awaited child_processes - Modify tests to use webrecorder.net to avoid timeouts 2022-12-15 12:38:41 -05:00			`test("check that URLs in seed-list are crawled", async () => {`
Support Extra Hops beyond current scope with --extraHops option (#98) * extra hops depth: add support for --extraHops option, which expands the inclusion scope to go N 'extra hops' beyond the existing scope. fixes most common use case in #83 * update README with info on `extraHops`, add tests for extraHops * dependency fix: use pywb 2.6.3, warcio 1.5.0 * bump to 0.5.0-beta.2 2022-01-15 09:03:09 -08:00			`try {`
Add Prettier to the repo, and format all the files! (#428) This adds prettier to the repo, and sets up the pre-commit hook to auto-format as well as lint. Also updates ignores files to exclude crawls, test-crawls, scratch, dist as needed. 2023-11-09 19:11:11 -05:00			`await exec(`
			`"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000",`
			`);`
			`} catch (error) {`
Add --urlFile param to specify text file with a list of URLs to crawl (#38) * Resolves #12 * Make --url param optional. Only one of --url of --urlFile should be specified. * Add ignoreScope option queueUrls() to support adding specific URLs * add tests for urlFile * bump version to 0.3.2 Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-05-13 01:57:06 -04:00			`console.log(error);`
			`}`

Add Prettier to the repo, and format all the files! (#428) This adds prettier to the repo, and sets up the pre-commit hook to auto-format as well as lint. Also updates ignores files to exclude crawls, test-crawls, scratch, dist as needed. 2023-11-09 19:11:11 -05:00			`let crawled_pages = fs.readFileSync(`
			`"test-crawls/collections/filelisttest/pages/pages.jsonl",`
			`"utf8",`
			`);`
			`let seed_file = fs`
			`.readFileSync("tests/fixtures/urlSeedFile.txt", "utf8")`
			`.split("\n")`
			`.sort();`
Add --urlFile param to specify text file with a list of URLs to crawl (#38) * Resolves #12 * Make --url param optional. Only one of --url of --urlFile should be specified. * Add ignoreScope option queueUrls() to support adding specific URLs * add tests for urlFile * bump version to 0.3.2 Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-05-13 01:57:06 -04:00
			`let seed_file_list = [];`
			`for (var j = 0; j < seed_file.length; j++) {`
Add Prettier to the repo, and format all the files! (#428) This adds prettier to the repo, and sets up the pre-commit hook to auto-format as well as lint. Also updates ignores files to exclude crawls, test-crawls, scratch, dist as needed. 2023-11-09 19:11:11 -05:00			`if (seed_file[j] != undefined) {`
Add --urlFile param to specify text file with a list of URLs to crawl (#38) * Resolves #12 * Make --url param optional. Only one of --url of --urlFile should be specified. * Add ignoreScope option queueUrls() to support adding specific URLs * add tests for urlFile * bump version to 0.3.2 Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-05-13 01:57:06 -04:00			`seed_file_list.push(seed_file[j]);`
			`}`
			`}`
Add screenshot functionality (#188) * Add screenshot and thumbnail functionality Introduces a --screenshot CLI option, which takes a comma-separated list of screenshot types: view,fullPage,thumbnail. In addition, this commit: - Adds '--experimental-global-webcrypto' to ensure webcrypto is available in node - Deprecates newContext, instead always using page context for 1 worker and window context for >1 worker * Separate screenshotTypes into exported const Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Air.local> 2022-12-21 12:06:13 -05:00
			`let foundSeedUrl = true;`
Add --urlFile param to specify text file with a list of URLs to crawl (#38) * Resolves #12 * Make --url param optional. Only one of --url of --urlFile should be specified. * Add ignoreScope option queueUrls() to support adding specific URLs * add tests for urlFile * bump version to 0.3.2 Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-05-13 01:57:06 -04:00
			`for (var i = 1; i < seed_file_list.length; i++) {`
Add Prettier to the repo, and format all the files! (#428) This adds prettier to the repo, and sets up the pre-commit hook to auto-format as well as lint. Also updates ignores files to exclude crawls, test-crawls, scratch, dist as needed. 2023-11-09 19:11:11 -05:00			`if (crawled_pages.indexOf(seed_file_list[i]) == -1) {`
Add --urlFile param to specify text file with a list of URLs to crawl (#38) * Resolves #12 * Make --url param optional. Only one of --url of --urlFile should be specified. * Add ignoreScope option queueUrls() to support adding specific URLs * add tests for urlFile * bump version to 0.3.2 Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> 2021-05-13 01:57:06 -04:00			`foundSeedUrl = false;`
			`}`
			`}`
			`expect(foundSeedUrl).toBe(true);`
			`});`