SAX-based sitemap parser (#497)

Adds a new SAX-based sitemap parser, inspired by:
https://www.npmjs.com/package/sitemap-stream-parser

Supports:
- recursively parsing sitemap indexes, using p-queue to process N at a
time (currently 5)
- `fromDate` and `toDate` filter dates, to only include URLs between the given
dates, filtering nested sitemap lists included
- async parsing, continue parsing in the background after 100 URLs
- timeout for initial fetch / first 100 URLs set to 30 seconds to avoid
slowing down the crawl
- save/load state integration: mark if sitemaps have already been parsed
in redis, serialize to save state, to avoid reparsing again. (Will
reparse if parsing did not fully finish)
- Aware of `pageLimit`: don't add URLs past the page limit, and interrupt
further parsing when the limit is reached.
- robots.txt `sitemap:` parsing, check URL extension and mime type
- automatic detection of sitemaps for a seed URL if no sitemap url provided - first check robots.txt,
then /sitemap.xml
- tests: test for full sitemap autodetect, sitemap with limit, and sitemap from specific URL.

Fixes #496 

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
Ilya Kreymer 2024-03-18 19:14:07 -07:00 committed by GitHub
parent 5060e6b0b1
commit 56053534c5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 665 additions and 208 deletions

View file

@ -10,12 +10,20 @@ See [page.goto waitUntil options](https://pptr.dev/api/puppeteer.page.goto#remar
The `--pageLoadTimeout`/`--timeout` option sets the timeout in seconds for page load, defaulting to 90 seconds. Behaviors will run on the page once either the page load condition or the page load timeout is met, whichever happens first. The `--pageLoadTimeout`/`--timeout` option sets the timeout in seconds for page load, defaulting to 90 seconds. Behaviors will run on the page once either the page load condition or the page load timeout is met, whichever happens first.
## Ad blocking ## Ad Blocking
Brave Browser, the browser used by Browsertrix Crawler for crawling, has some ad and tracker blocking features enabled by default. These [Shields](https://brave.com/shields/) be disabled or customized using [Browser Profiles](browser-profiles.md). Brave Browser, the browser used by Browsertrix Crawler for crawling, has some ad and tracker blocking features enabled by default. These [Shields](https://brave.com/shields/) be disabled or customized using [Browser Profiles](browser-profiles.md).
Browsertrix Crawler also supports blocking ads from being loaded during capture based on [Stephen Black's list of known ad hosts](https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts). To enable ad blocking based on this list, use the `--blockAds` option. If `--adBlockMessage` is set, a record with the specified error message will be added in the ad's place. Browsertrix Crawler also supports blocking ads from being loaded during capture based on [Stephen Black's list of known ad hosts](https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts). To enable ad blocking based on this list, use the `--blockAds` option. If `--adBlockMessage` is set, a record with the specified error message will be added in the ad's place.
## Sitemap Parsing
The `--sitemap` option can be used to have the crawler parse a sitemap and queue any found URLs while respecting the crawl's scoping rules and limits. Browsertrix Crawler is able to parse regular sitemaps as well as sitemap indices that point out to nested sitemaps.
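By default, `--sitemap` will try to detect the sitemap automatically: it first checks `<your-seed>/robots.txt` for `Sitemap:` entries and, if no usable robots.txt is found, falls back to `<your-seed>/sitemap.xml`. If a website's sitemap is hosted at a different URL, pass the URL with the flag like `--sitemap <sitemap url>`.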
The `--sitemapFrom`/`--sitemapFromDate` and `--sitemapTo`/`--sitemapToDate` options allow for only extracting pages within a specific date range. If set, these options will filter URLs from sitemaps to those greater than or equal to (>=) or lesser than or equal to (<=) a provided ISO Date string (`YYYY-MM-DD`, `YYYY-MM-DDTHH:MM:SS`, or partial date), respectively.
## Custom Warcinfo Fields ## Custom Warcinfo Fields
Custom fields can be added to the `warcinfo` WARC record, generated for each combined WARC. The fields can be specified in the YAML config under `warcinfo` section or specifying individually via the command-line. Custom fields can be added to the `warcinfo` WARC record, generated for each combined WARC. The fields can be specified in the YAML config under `warcinfo` section or specifying individually via the command-line.

View file

@ -17,6 +17,7 @@
}, },
"dependencies": { "dependencies": {
"@novnc/novnc": "^1.4.0", "@novnc/novnc": "^1.4.0",
"@types/sax": "^1.2.7",
"@webrecorder/wabac": "^2.16.12", "@webrecorder/wabac": "^2.16.12",
"browsertrix-behaviors": "^0.5.3", "browsertrix-behaviors": "^0.5.3",
"crc": "^4.3.2", "crc": "^4.3.2",
@ -27,8 +28,8 @@
"minio": "^7.1.3", "minio": "^7.1.3",
"p-queue": "^7.3.4", "p-queue": "^7.3.4",
"puppeteer-core": "^20.8.2", "puppeteer-core": "^20.8.2",
"sax": "^1.3.0",
"sharp": "^0.32.6", "sharp": "^0.32.6",
"sitemapper": "^3.2.6",
"tsc": "^2.0.4", "tsc": "^2.0.4",
"uuid": "8.3.2", "uuid": "8.3.2",
"warcio": "^2.2.1", "warcio": "^2.2.1",

View file

@ -13,7 +13,6 @@ import {
PageCallbacks, PageCallbacks,
} from "./util/state.js"; } from "./util/state.js";
import Sitemapper from "sitemapper";
import yaml from "js-yaml"; import yaml from "js-yaml";
import * as warcio from "warcio"; import * as warcio from "warcio";
@ -53,6 +52,8 @@ import { OriginOverride } from "./util/originoverride.js";
import { Agent as HTTPAgent } from "http"; import { Agent as HTTPAgent } from "http";
import { Agent as HTTPSAgent } from "https"; import { Agent as HTTPSAgent } from "https";
import { CDPSession, Frame, HTTPRequest, Page } from "puppeteer-core"; import { CDPSession, Frame, HTTPRequest, Page } from "puppeteer-core";
import { SitemapReader } from "./util/sitemapper.js";
import { ScopedSeed } from "./util/seeds.js";
const HTTPS_AGENT = new HTTPSAgent({ const HTTPS_AGENT = new HTTPSAgent({
rejectUnauthorized: false, rejectUnauthorized: false,
@ -70,6 +71,7 @@ const behaviors = fs.readFileSync(
const FETCH_TIMEOUT_SECS = 30; const FETCH_TIMEOUT_SECS = 30;
const PAGE_OP_TIMEOUT_SECS = 5; const PAGE_OP_TIMEOUT_SECS = 5;
const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
const POST_CRAWL_STATES = [ const POST_CRAWL_STATES = [
"generate-wacz", "generate-wacz",
@ -1225,7 +1227,13 @@ self.__bx_behaviors.selectMainBehavior();
} }
if (seed.sitemap) { if (seed.sitemap) {
await this.parseSitemap(seed.sitemap, i, this.params.sitemapFromDate); await timedRun(
this.parseSitemap(seed, i),
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
"Sitemap initial fetch timed out",
{ sitemap: seed.sitemap, seed: seed.url },
"sitemap",
);
} }
} }
@ -2036,40 +2044,86 @@ self.__bx_behaviors.selectMainBehavior();
return false; return false;
} }
async parseSitemap(url: string, seedId: number, sitemapFromDate: number) { async parseSitemap({ url, sitemap }: ScopedSeed, seedId: number) {
// handle sitemap last modified date if passed if (!sitemap) {
let lastmodFromTimestamp = undefined; return;
const dateObj = new Date(sitemapFromDate);
if (isNaN(dateObj.getTime())) {
logger.info(
"Fetching full sitemap (fromDate not specified/valid)",
{ url, sitemapFromDate },
"sitemap",
);
} else {
lastmodFromTimestamp = dateObj.getTime();
logger.info(
"Fetching and filtering sitemap by date",
{ url, sitemapFromDate },
"sitemap",
);
} }
// eslint-disable-next-line @typescript-eslint/no-explicit-any if (await this.crawlState.isSitemapDone()) {
const sitemapper = new (Sitemapper as any)({ logger.info("Sitemap already processed, skipping", "sitemap");
url, return;
timeout: 15000, }
requestHeaders: this.headers,
lastmod: lastmodFromTimestamp, const fromDate = this.params.sitemapFromDate;
const toDate = this.params.sitemapToDate;
const headers = this.headers;
// Log the effective date window; each bound falls back to "<any date>" when unset.
logger.info(
  "Fetching sitemap",
  { from: fromDate || "<any date>", to: toDate || "<any date>" },
  "sitemap",
);
const sitemapper = new SitemapReader({
headers,
fromDate,
toDate,
limit: this.pageLimit,
}); });
try { try {
const { sites } = await sitemapper.fetch(); await sitemapper.parse(sitemap, url);
logger.info("Sitemap Urls Found", { urls: sites.length }, "sitemap");
await this.queueInScopeUrls(seedId, sites, 0);
} catch (e) { } catch (e) {
logger.warn("Error fetching sites from sitemap", e, "sitemap"); logger.warn(
"Sitemap for seed failed",
{ url, sitemap, ...formatErr(e) },
"sitemap",
);
return;
} }
let power = 1;
let resolved = false;
let finished = false;
await new Promise<void>((resolve) => {
sitemapper.on("end", () => {
resolve();
if (!finished) {
logger.info(
"Sitemap Parsing Finished",
{ urlsFound: sitemapper.count, limitHit: sitemapper.atLimit() },
"sitemap",
);
this.crawlState.markSitemapDone();
finished = true;
}
});
sitemapper.on("url", ({ url }) => {
const count = sitemapper.count;
if (count % 10 ** power === 0) {
if (count % 10 ** (power + 1) === 0 && power <= 3) {
power++;
}
const sitemapsQueued = sitemapper.getSitemapsQueued();
logger.debug(
"Sitemap URLs processed so far",
{ count, sitemapsQueued },
"sitemap",
);
}
this.queueInScopeUrls(seedId, [url], 0);
if (count >= 100 && !resolved) {
logger.info(
"Sitemap partially parsed, continue parsing large sitemap in the background",
{ urlsFound: count },
"sitemap",
);
resolve();
resolved = true;
}
});
});
} }
async combineWARC() { async combineWARC() {

View file

@ -287,7 +287,13 @@ class ArgParser {
sitemapFromDate: { sitemapFromDate: {
alias: "sitemapFrom", alias: "sitemapFrom",
describe: describe:
"If set, filter URLs from sitemaps to those greater than or equal to provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)", "If set, filter URLs from sitemaps to those greater than or equal to (>=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
},
sitemapToDate: {
alias: "sitemapTo",
describe:
"If set, filter URLs from sitemaps to those less than or equal to (<=) provided ISO Date string (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or partial date)",
}, },
statsFilename: { statsFilename: {

View file

@ -9,6 +9,9 @@ export const WAIT_UNTIL_OPTS = [
"networkidle0", "networkidle0",
"networkidle2", "networkidle2",
]; ];
export const DETECT_SITEMAP = "<detect>";
export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"]; export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];
export const BEHAVIOR_LOG_FUNC = "__bx_log"; export const BEHAVIOR_LOG_FUNC = "__bx_log";

View file

@ -144,12 +144,9 @@ export class ScopedSeed {
resolveSiteMap(sitemap: boolean | string | null): string | null { resolveSiteMap(sitemap: boolean | string | null): string | null {
if (sitemap === true) { if (sitemap === true) {
const url = new URL(this.url); return "<detect>";
url.pathname = "/sitemap.xml";
return url.href;
} else if (typeof sitemap === "string") { } else if (typeof sitemap === "string") {
const url = new URL(sitemap, this.url); return sitemap;
return url.href;
} }
return null; return null;

441
src/util/sitemapper.ts Normal file
View file

@ -0,0 +1,441 @@
import { Readable } from "stream";
import { ReadableStream } from "node:stream/web";
import EventEmitter from "events";
import sax from "sax";
import PQueue from "p-queue";
import { logger, formatErr } from "./logger.js";
import { DETECT_SITEMAP } from "./constants.js";
import { sleep } from "./timing.js";
// Number of queued sitemap fetches processed at a time (passed as the
// p-queue `concurrency` option by SitemapReader).
const SITEMAP_CONCURRENCY = 5;
// Options accepted by the SitemapReader constructor.
export type SitemapOpts = {
// Request headers sent with every sitemap / robots.txt fetch.
headers?: Record<string, string>;
// Lower bound for <lastmod>: entries dated earlier than this are skipped.
fromDate?: Date;
// Upper bound for <lastmod>: entries dated later than this are skipped.
toDate?: Date;
// Maximum number of URLs to emit; unset or 0 is treated as "no limit".
limit?: number;
};
/**
 * Streaming (SAX-based) sitemap reader.
 *
 * Fetches a sitemap, a sitemap index or a robots.txt file, follows nested
 * sitemaps through a bounded-concurrency queue and reports results as events:
 *
 *  - "url": `{ url, lastmod }` for every accepted `<url>` entry
 *  - "end": no parser is pending and the fetch queue is idle
 *
 * NOTE(review): "end" can be emitted more than once (e.g. a parser closed at
 * the limit ends again when its source finishes), so listeners should
 * tolerate repeated "end" events.
 */
export class SitemapReader extends EventEmitter {
  // Content types that are treated as interchangeable for sitemap XML.
  private static readonly XML_CONTENT_TYPES = new Set([
    "text/xml",
    "application/xml",
  ]);

  // Headers sent with every fetch.
  headers?: Record<string, string>;

  // Inclusive <lastmod> window; undefined means "unbounded" on that side.
  fromDate?: Date;
  toDate?: Date;

  // Queue of nested sitemaps waiting to be fetched.
  queue: PQueue;

  // Sitemap URLs already queued or fetched, to avoid processing them twice.
  seenSitemapSet: Set<string>;

  // Sitemap URLs whose XML stream is currently being parsed.
  pending: Set<string>;

  // Number of URLs emitted so far (incremented after each "url" event).
  count = 0;

  // Maximum number of URLs to emit; 0 means no limit.
  limit: number;

  /**
   * @param opts headers, optional date window and URL limit
   */
  constructor(opts: SitemapOpts) {
    super();
    this.headers = opts.headers;

    this.queue = new PQueue({ concurrency: SITEMAP_CONCURRENCY });

    // NOTE(review): callers appear to be able to pass raw option values here
    // (presumably strings from the CLI -- verify against caller). Comparing a
    // Date with a non-numeric string is always false, which would silently
    // disable filtering, so normalize to real Date objects up front.
    this.fromDate = SitemapReader._coerceDate(opts.fromDate);
    this.toDate = SitemapReader._coerceDate(opts.toDate);

    this.seenSitemapSet = new Set<string>();

    this.limit = opts.limit || 0;

    this.pending = new Set<string>();
  }

  /**
   * Converts a Date / date string / timestamp into a valid Date.
   *
   * @returns the Date, or undefined when the value is missing or unparseable
   */
  private static _coerceDate(value: unknown): Date | undefined {
    if (value === undefined || value === null || value === "") {
      return undefined;
    }
    const date =
      value instanceof Date ? value : new Date(value as string | number);
    return Number.isNaN(date.getTime()) ? undefined : date;
  }

  /**
   * Returns the media type of a response without parameters
   * (e.g. `text/xml` for `Text/XML; charset=utf-8`), or null if absent.
   * The value is trimmed and lowercased because media types are
   * case-insensitive.
   */
  getCT(headers: Headers) {
    const ct = headers.get("content-type");
    if (!ct) {
      return null;
    }
    return ct.split(";")[0].trim().toLowerCase();
  }

  /**
   * Interprets a Retry-After header value, which is either a number of
   * seconds or an HTTP date.
   *
   * @returns seconds to wait (never negative), or null if unparseable
   */
  private _parseRetryAfter(value: string): number | null {
    const trimmed = value.trim();
    if (/^\d+$/.test(trimmed)) {
      return Number.parseInt(trimmed, 10);
    }
    const when = Date.parse(trimmed);
    if (Number.isNaN(when)) {
      return null;
    }
    return Math.max(0, Math.ceil((when - Date.now()) / 1000));
  }

  /**
   * Fetches `url`, retrying for as long as the server answers a non-OK
   * status together with a usable Retry-After header.
   *
   * @param url URL to fetch
   * @param message debug message logged when the fetch finally fails
   * @returns the successful Response, or null on a non-retryable failure
   */
  async _fetchWithRetry(url: string, message: string) {
    while (true) {
      const resp = await fetch(url, { headers: this.headers });

      if (resp.ok) {
        return resp;
      }

      const retry = resp.headers.get("retry-after");
      // An unparseable value must not be passed on: sleeping for NaN returns
      // almost immediately and would turn this loop into a hot retry loop.
      const retrySeconds = retry ? this._parseRetryAfter(retry) : null;

      if (retrySeconds !== null) {
        logger.debug(
          "Sitemap Fetch: Retry after",
          { retrySeconds },
          "sitemap",
        );
        await sleep(retrySeconds);
        continue;
      }

      logger.debug(message, { status: resp.status }, "sitemap");
      return null;
    }
  }

  /**
   * Returns true if `actual` satisfies `expected`. The XML media types
   * `text/xml` and `application/xml` are accepted for one another, since
   * sitemaps are commonly served with either.
   */
  private _matchesContentType(actual: string, expected: string) {
    const wanted = expected.trim().toLowerCase();
    if (actual === wanted) {
      return true;
    }
    const xmlTypes = SitemapReader.XML_CONTENT_TYPES;
    return xmlTypes.has(wanted) && xmlTypes.has(actual);
  }

  /**
   * Fetches `url` for sitemap detection without throwing.
   *
   * @param url URL to fetch
   * @param expectedCT if set, a response that declares a different content
   *   type is rejected (a response without a content type is accepted)
   * @returns the Response, or null on any failure
   */
  async tryFetch(url: string, expectedCT?: string | null) {
    try {
      logger.debug(
        "Detecting Sitemap: fetching",
        { url, expectedCT },
        "sitemap",
      );

      const resp = await this._fetchWithRetry(
        url,
        "Detecting Sitemap: invalid status code",
      );

      if (!resp) {
        return null;
      }

      const ct = this.getCT(resp.headers);

      if (expectedCT && ct && !this._matchesContentType(ct, expectedCT)) {
        logger.debug(
          "Detecting Sitemap: invalid content-type",
          { ct },
          "sitemap",
        );
        return null;
      }

      return resp;
    } catch (e) {
      logger.debug("Detecting Sitemap: unknown error", e, "sitemap");
      return null;
    }
  }

  /**
   * Entry point: locates and starts parsing the sitemap(s) for a seed.
   *
   * @param sitemap explicit sitemap / robots.txt URL (resolved against
   *   `seedUrl`), or DETECT_SITEMAP to auto-detect
   * @param seedUrl seed URL the sitemap belongs to
   * @throws Error("not found") if nothing could be fetched
   * @throws Error("not xml or robots.txt") if the URL is neither kind
   */
  async parse(sitemap: string, seedUrl: string) {
    let resp: Response | null = null;
    let fullUrl: string | null = null;
    let isRobots = false;
    let isSitemap = false;

    // if set to auto-detect, eg. --sitemap / --useSitemap with no URL
    // 1. first check robots.txt
    // 2. if not found, check /sitemap.xml
    if (sitemap === DETECT_SITEMAP) {
      logger.debug("Detecting sitemap for seed", { seedUrl }, "sitemap");
      fullUrl = new URL("/robots.txt", seedUrl).href;
      resp = await this.tryFetch(fullUrl, "text/plain");
      if (resp) {
        isRobots = true;
      } else {
        fullUrl = new URL("/sitemap.xml", seedUrl).href;
        resp = await this.tryFetch(fullUrl, "text/xml");
        if (resp) {
          isSitemap = true;
        }
      }
    } else {
      // if specific URL provided, check if its a .xml file or a robots.txt file
      const resolved = new URL(sitemap, seedUrl);
      fullUrl = resolved.href;

      // Check the path as well as the full URL so that a query string or
      // fragment (e.g. /sitemap.xml?page=2) does not hide the extension.
      const hasExtension = (ext: string) =>
        resolved.href.endsWith(ext) || resolved.pathname.endsWith(ext);

      let expected: string | null = null;
      if (hasExtension(".xml")) {
        expected = "text/xml";
        isSitemap = true;
      } else if (hasExtension(".txt")) {
        expected = "text/plain";
        isRobots = true;
      }
      resp = await this.tryFetch(fullUrl, expected);
    }

    // fail if no successful response fetched
    if (!resp) {
      logger.debug(
        "Sitemap not found",
        { sitemap, seedUrl, fullUrl },
        "sitemap",
      );
      throw new Error("not found");
    }

    // fail if neither an xml nor robots.txt
    if (!isRobots && !isSitemap) {
      logger.info("Sitemap not detected for seed", { seedUrl }, "sitemap");
      throw new Error("not xml or robots.txt");
    }

    if (isRobots) {
      logger.debug(
        "Sitemap: parsing from robots.txt",
        { fullUrl, seedUrl },
        "sitemap",
      );
      await this._parseRobotsFromResponse(resp);
    } else if (isSitemap) {
      logger.debug(
        "Sitemap: parsing from top-level sitemap XML",
        { fullUrl, seedUrl },
        "sitemap",
      );
      // Remember the top-level sitemap so an index that lists itself is not
      // fetched a second time.
      this.seenSitemapSet.add(fullUrl);
      await this._parseSitemapFromResponse(fullUrl, resp);
    }
  }

  /**
   * Fetches a robots.txt file and queues every sitemap it lists.
   * Does nothing if the fetch fails.
   */
  async parseFromRobots(url: string) {
    const resp = await this._fetchWithRetry(
      url,
      "Sitemap robots.txt parse failed",
    );
    if (!resp) {
      return;
    }
    await this._parseRobotsFromResponse(resp);
  }

  /**
   * Queues every `Sitemap: <url>` line found in a robots.txt response.
   * The field name is matched case-insensitively, and spaces or tabs around
   * the field name and value are tolerated.
   */
  private async _parseRobotsFromResponse(resp: Response) {
    const text = await resp.text();

    const sitemapLine = /^[ \t]*sitemap:[ \t]*(\S+)[ \t]*$/gim;

    let match: RegExpExecArray | null;
    while ((match = sitemapLine.exec(text)) !== null) {
      this.addNewSitemap(match[1], null);
    }
  }

  /**
   * Fetches a single sitemap (or sitemap index) and starts parsing it.
   * Does nothing if the fetch fails.
   */
  async parseSitemap(url: string) {
    this.seenSitemapSet.add(url);

    const resp = await this._fetchWithRetry(url, "Sitemap parse failed");
    if (!resp) {
      return;
    }

    await this._parseSitemapFromResponse(url, resp);
  }

  /**
   * Bridges the fetch response body to a Node stream and hands it to the
   * SAX parser. Returns once parsing has been started, not when it is done.
   */
  private async _parseSitemapFromResponse(url: string, resp: Response) {
    const readableNodeStream = Readable.fromWeb(
      resp.body as ReadableStream<Uint8Array>,
    );
    this.initSaxParser(url, readableNodeStream);
  }

  /**
   * Marks the stream for `url` as finished and emits "end" once nothing is
   * left to do.
   */
  private async _finishStream(url: string) {
    this.pending.delete(url);

    if (this.pending.size) {
      return;
    }

    await this.queue.onIdle();

    // A queued sitemap only joins `pending` once its fetch has completed, so
    // new streams may have started while waiting for the queue to drain.
    // Whichever of them finishes last will emit "end" instead.
    if (!this.pending.size) {
      this.emit("end");
    }
  }

  /**
   * Parses one sitemap XML document from `sourceStream`.
   *
   * `<url>` entries are reported via emitEntry(), `<sitemap>` entries of a
   * sitemap index are queued via addNewSitemap().
   *
   * @param url URL of the document, used to track the pending parser
   * @param sourceStream stream of the XML document
   */
  initSaxParser(url: string, sourceStream: Readable) {
    this.pending.add(url);

    const parserStream = sax.createStream(false, {
      trim: true,
      normalize: true,
      lowercase: true,
    });

    let parsingSitemapIndex = false;
    let parsingSitemap = false;

    let parsingUrlset = false;
    let parsingUrl = false;
    let parsingLoc = false;
    let parsingLastmod = false;

    let currUrl: string | null = null;
    let lastmod: Date | null = null;

    // nesting depth of tags that are not part of the sitemap vocabulary
    let otherTags = 0;

    // Handles character data, whether it arrives as plain text or as CDATA.
    const processText = (text: string) => {
      if (parsingLoc) {
        currUrl = text;
      } else if (parsingLastmod) {
        // new Date() never throws; an unparseable value yields an invalid
        // Date, which is treated the same as a missing <lastmod>
        const parsed = new Date(text);
        lastmod = Number.isNaN(parsed.getTime()) ? null : parsed;
      } else if (!otherTags) {
        let container: string | null = null;
        if (parsingUrl) {
          container = "url";
        } else if (parsingUrlset) {
          container = "urlset";
        } else if (parsingSitemap) {
          container = "sitemap";
        } else if (parsingSitemapIndex) {
          container = "sitemapindex";
        }
        if (container) {
          logger.warn(`text in ${container}, ignoring`, { url }, "sitemap");
        }
      }
    };

    parserStream.on("end", () => {
      void this._finishStream(url);
    });

    parserStream.on("opentag", (node: sax.Tag) => {
      switch (node.name) {
        // Single Sitemap
        case "url":
          parsingUrl = true;
          break;

        case "loc":
          parsingLoc = true;
          break;

        case "lastmod":
          parsingLastmod = true;
          break;

        case "urlset":
          parsingUrlset = true;
          break;

        // Sitemap Index
        case "sitemapindex":
          parsingSitemapIndex = true;
          break;

        case "sitemap":
          parsingSitemap = true;
          break;

        default:
          otherTags++;
      }
    });

    parserStream.on("closetag", (tagName: string) => {
      switch (tagName) {
        // Single Sitemap
        case "url":
          this.emitEntry(currUrl, lastmod);
          if (this.atLimit()) {
            // stop parsing this document as soon as the limit is reached
            parserStream._parser.close();
          }
          currUrl = null;
          lastmod = null;
          parsingUrl = false;
          break;

        case "loc":
          parsingLoc = false;
          break;

        case "lastmod":
          parsingLastmod = false;
          break;

        case "urlset":
          parsingUrlset = false;
          break;

        // Sitemap Index
        case "sitemapindex":
          parsingSitemapIndex = false;
          break;

        case "sitemap":
          if (currUrl) {
            this.addNewSitemap(currUrl, lastmod);
          }
          currUrl = null;
          lastmod = null;
          parsingSitemap = false;
          break;

        default:
          otherTags--;
      }
    });

    parserStream.on("text", (text: string) => processText(text));

    // <loc><![CDATA[...]]></loc> is reported as "cdata" rather than "text";
    // CDATA content is trimmed here before it is used.
    parserStream.on("cdata", (cdata: string) => {
      const text = cdata.trim();
      if (text) {
        processText(text);
      }
    });

    parserStream.on("error", (err: Error) => {
      if (this.atLimit()) {
        // errors after the parser was closed at the limit are expected
        this.pending.delete(url);
        return;
      }
      logger.warn(
        "Sitemap error parsing XML",
        { url, ...formatErr(err) },
        "sitemap",
      );
      // clear the error and keep going, so one malformed entry does not
      // discard the rest of the document
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      (parserStream._parser as any).error = null;
      parserStream._parser.resume();
    });

    // pipe() does not forward source errors to the parser. Without a
    // listener a failed download would be an unhandled "error" event, and
    // this document would stay in `pending` forever.
    sourceStream.on("error", (err: Error) => {
      logger.warn(
        "Sitemap error reading response",
        { url, ...formatErr(err) },
        "sitemap",
      );
      void this._finishStream(url);
    });

    sourceStream.pipe(parserStream);
  }

  /**
   * Returns true if a limit is set and at least that many URLs were emitted.
   */
  atLimit(): boolean {
    return Boolean(this.limit && this.count >= this.limit);
  }

  /**
   * Returns true if `lastmod` lies within [fromDate, toDate]. Entries without
   * a date are always accepted.
   */
  isWithinRange(lastmod: Date | null) {
    // always accept entries with no date -- add option to change?
    if (!lastmod) {
      return true;
    }

    // earlier than fromDate
    if (this.fromDate && lastmod < this.fromDate) {
      return false;
    }

    // later than toDate
    if (this.toDate && lastmod > this.toDate) {
      return false;
    }

    return true;
  }

  /**
   * Queues a nested sitemap for fetching unless it was already seen, is
   * outside the date window, or the URL limit has been reached.
   */
  addNewSitemap(url: string, lastmod: Date | null) {
    if (this.seenSitemapSet.has(url)) {
      return;
    }

    if (!this.isWithinRange(lastmod)) {
      return;
    }

    if (this.atLimit()) {
      return;
    }

    // Mark as seen when queued, not only when the fetch starts, so a sitemap
    // listed twice is not queued twice.
    this.seenSitemapSet.add(url);

    this.queue.add(async () => {
      try {
        await this.parseSitemap(url);
      } catch (e) {
        logger.warn(
          "Sitemap parse failed",
          { url, ...formatErr(e) },
          "sitemap",
        );
      }
    });
  }

  /**
   * Emits a "url" event for an accepted entry and counts it. Once the limit
   * is reached, sitemaps still waiting in the queue are dropped instead.
   */
  emitEntry(url: string | null, lastmod: Date | null) {
    if (!url) {
      return;
    }

    if (!this.isWithinRange(lastmod)) {
      return;
    }

    if (this.atLimit()) {
      this.queue.clear();
      return;
    }

    // listeners observe `count` before it includes this entry
    this.emit("url", { url, lastmod });

    this.count++;
  }

  /**
   * Returns the number of sitemaps waiting in the queue (`queue.size`).
   */
  getSitemapsQueued() {
    return this.queue.size;
  }
}

View file

@ -142,6 +142,7 @@ type SaveState = {
failed: string[]; failed: string[];
errors: string[]; errors: string[];
extraSeeds: string[]; extraSeeds: string[];
sitemapDone: boolean;
}; };
// ============================================================================ // ============================================================================
@ -162,6 +163,8 @@ export class RedisCrawlState {
pageskey: string; pageskey: string;
esKey: string; esKey: string;
sitemapDoneKey: string;
constructor(redis: Redis, key: string, maxPageTime: number, uid: string) { constructor(redis: Redis, key: string, maxPageTime: number, uid: string) {
this.redis = redis; this.redis = redis;
@ -183,6 +186,8 @@ export class RedisCrawlState {
this.esKey = this.key + ":extraSeeds"; this.esKey = this.key + ":extraSeeds";
this.sitemapDoneKey = this.key + ":sitemapDone";
this._initLuaCommands(this.redis); this._initLuaCommands(this.redis);
} }
@ -521,10 +526,19 @@ return 0;
const failed = await this._iterListKeys(this.fkey, seen); const failed = await this._iterListKeys(this.fkey, seen);
const errors = await this.getErrorList(); const errors = await this.getErrorList();
const extraSeeds = await this._iterListKeys(this.esKey, seen); const extraSeeds = await this._iterListKeys(this.esKey, seen);
const sitemapDone = await this.isSitemapDone();
const finished = [...seen.values()]; const finished = [...seen.values()];
return { extraSeeds, finished, queued, pending, failed, errors }; return {
extraSeeds,
finished,
queued,
pending,
sitemapDone,
failed,
errors,
};
} }
_getScore(data: QueueEntry) { _getScore(data: QueueEntry) {
@ -643,6 +657,10 @@ return 0;
await this.redis.zadd(this.qkey, this._getScore(data), json); await this.redis.zadd(this.qkey, this._getScore(data), json);
seen.push(data.url); seen.push(data.url);
if (state.sitemapDone) {
await this.markSitemapDone();
}
} }
// backwards compatibility: not using done, instead 'finished' // backwards compatibility: not using done, instead 'finished'
@ -793,4 +811,12 @@ return 0;
} }
return seeds; return seeds;
} }
async isSitemapDone() {
return (await this.redis.get(this.sitemapDoneKey)) == "1";
}
async markSitemapDone() {
await this.redis.set(this.sitemapDoneKey, "1");
}
} }

View file

@ -0,0 +1,82 @@
import child_process from "child_process";
import Redis from "ioredis";
/**
 * Returns a promise that resolves after roughly `ms` milliseconds.
 *
 * @param {number} ms delay in milliseconds
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Sends SIGINT to a docker container and waits until it no longer shows up
 * in `docker ps`. Returns immediately if the signal could not be sent.
 *
 * @param {string} containerId full container id
 */
async function waitContainer(containerId) {
  try {
    child_process.execSync(`docker kill -s SIGINT ${containerId}`);
  } catch (e) {
    return;
  }

  // `docker ps -q` prints only the short id (first 12 characters),
  // while the id passed in is the full one
  const shortId = containerId.slice(0, 12);

  for (;;) {
    // if the listing fails, assume the container is still running and retry
    let stillRunning = true;

    try {
      const listing = child_process.execSync("docker ps -q", {
        encoding: "utf-8",
      });
      stillRunning = listing.includes(shortId);
    } catch (e) {
      console.error(e);
    }

    if (!stillRunning) {
      return;
    }

    await sleep(500);
  }
}
/**
 * Starts a crawl in a docker container and polls the crawler's redis until
 * the crawl queue holds at least `numExpected` URLs, then stops the
 * container and asserts on the number seen.
 *
 * @param {number} numExpected minimum number of queued URLs to wait for
 * @param {string} url seed URL
 * @param {string} [sitemap] sitemap URL; empty string passes `--sitemap`
 *   without a value
 * @param {number} [limit] value for `--limit`
 */
async function runCrawl(numExpected, url, sitemap="", limit=0) {
  const containerId = child_process.execSync(`docker run -d -p 36379:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis`, {encoding: "utf-8"});

  // give the container a moment to start redis
  await sleep(2000);

  // Retry settings are constructor options. connect() only takes an optional
  // callback, so options passed to it are silently ignored.
  const redis = new Redis("redis://127.0.0.1:36379/0", {
    lazyConnect: true,
    maxRetriesPerRequest: 100,
    retryStrategy(times) {
      return times < 100 ? 1000 : null;
    },
  });

  // size of the crawl queue as last read from redis
  let finished = 0;

  try {
    await redis.connect();

    while (true) {
      finished = await redis.zcard("test:q");

      if (finished >= numExpected) {
        break;
      }
    }
  } catch (e) {
    console.error(e);
  } finally {
    await waitContainer(containerId);

    try {
      await redis.disconnect();
    } catch (e) {
      // ignore
    }
  }

  expect(finished).toBeGreaterThanOrEqual(numExpected);
}
// Seed used by all sitemap tests, and its explicit sitemap URL.
const SITEMAP_TEST_SEED = "https://www.mozilla.org/";
const SITEMAP_TEST_XML = "https://www.mozilla.org/sitemap.xml";

// Auto-detected sitemap, no page limit: the full sitemap should be queued.
test("test sitemap fully finish", () =>
  runCrawl(8036, SITEMAP_TEST_SEED, "", 0));

// Auto-detected sitemap with a page limit of 2000.
test("test sitemap with limit", () =>
  runCrawl(1900, SITEMAP_TEST_SEED, "", 2000));

// Explicit sitemap URL with a page limit of 2000.
test("test sitemap with limit, specific URL", () =>
  runCrawl(1900, SITEMAP_TEST_SEED, SITEMAP_TEST_XML, 2000));

179
yarn.lock
View file

@ -822,11 +822,6 @@
resolved "https://registry.yarnpkg.com/@sinclair/typebox/-/typebox-0.24.50.tgz#35ee4db4ab8f3a8ff56490c51f92445d2776451e" resolved "https://registry.yarnpkg.com/@sinclair/typebox/-/typebox-0.24.50.tgz#35ee4db4ab8f3a8ff56490c51f92445d2776451e"
integrity sha512-k8ETQOOQDg5FtK7y9KJWpsGLik+QlPmIi8zzl/dGUgshV2QitprkFlCR/AemjWOTyKn9UwSSGRTzLVotvgCjYQ== integrity sha512-k8ETQOOQDg5FtK7y9KJWpsGLik+QlPmIi8zzl/dGUgshV2QitprkFlCR/AemjWOTyKn9UwSSGRTzLVotvgCjYQ==
"@sindresorhus/is@^4.0.0":
version "4.0.1"
resolved "https://registry.yarnpkg.com/@sindresorhus/is/-/is-4.0.1.tgz#d26729db850fa327b7cacc5522252194404226f5"
integrity sha512-Qm9hBEBu18wt1PO2flE7LPb30BHMQt1eQgbV76YntdNk73XZGpn3izvGTYxbGgzXKgbCjiia0uxTd3aTNQrY/g==
"@sinonjs/commons@^1.7.0": "@sinonjs/commons@^1.7.0":
version "1.8.3" version "1.8.3"
resolved "https://registry.yarnpkg.com/@sinonjs/commons/-/commons-1.8.3.tgz#3802ddd21a50a949b6721ddd72da36e67e7f1b2d" resolved "https://registry.yarnpkg.com/@sinonjs/commons/-/commons-1.8.3.tgz#3802ddd21a50a949b6721ddd72da36e67e7f1b2d"
@ -841,13 +836,6 @@
dependencies: dependencies:
"@sinonjs/commons" "^1.7.0" "@sinonjs/commons" "^1.7.0"
"@szmarczak/http-timer@^4.0.5":
version "4.0.5"
resolved "https://registry.yarnpkg.com/@szmarczak/http-timer/-/http-timer-4.0.5.tgz#bfbd50211e9dfa51ba07da58a14cdfd333205152"
integrity sha512-PyRA9sm1Yayuj5OIoJ1hGt2YISX45w9WcFbh6ddT0Z/0yaFxOtGLInr4jUfU1EAFVs0Yfyfev4RNwBlUaHdlDQ==
dependencies:
defer-to-connect "^2.0.0"
"@tootallnate/quickjs-emscripten@^0.23.0": "@tootallnate/quickjs-emscripten@^0.23.0":
version "0.23.0" version "0.23.0"
resolved "https://registry.yarnpkg.com/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz#db4ecfd499a9765ab24002c3b696d02e6d32a12c" resolved "https://registry.yarnpkg.com/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz#db4ecfd499a9765ab24002c3b696d02e6d32a12c"
@ -886,16 +874,6 @@
dependencies: dependencies:
"@babel/types" "^7.3.0" "@babel/types" "^7.3.0"
"@types/cacheable-request@^6.0.1":
version "6.0.1"
resolved "https://registry.yarnpkg.com/@types/cacheable-request/-/cacheable-request-6.0.1.tgz#5d22f3dded1fd3a84c0bbeb5039a7419c2c91976"
integrity sha512-ykFq2zmBGOCbpIXtoVbz4SKY5QriWPh3AjyU4G74RYbtt5yOc5OfaY75ftjg7mikMOla1CTGpX3lLbuJh8DTrQ==
dependencies:
"@types/http-cache-semantics" "*"
"@types/keyv" "*"
"@types/node" "*"
"@types/responselike" "*"
"@types/graceful-fs@^4.1.3": "@types/graceful-fs@^4.1.3":
version "4.1.5" version "4.1.5"
resolved "https://registry.yarnpkg.com/@types/graceful-fs/-/graceful-fs-4.1.5.tgz#21ffba0d98da4350db64891f92a9e5db3cdb4e15" resolved "https://registry.yarnpkg.com/@types/graceful-fs/-/graceful-fs-4.1.5.tgz#21ffba0d98da4350db64891f92a9e5db3cdb4e15"
@ -903,11 +881,6 @@
dependencies: dependencies:
"@types/node" "*" "@types/node" "*"
"@types/http-cache-semantics@*":
version "4.0.0"
resolved "https://registry.yarnpkg.com/@types/http-cache-semantics/-/http-cache-semantics-4.0.0.tgz#9140779736aa2655635ee756e2467d787cfe8a2a"
integrity sha512-c3Xy026kOF7QOTn00hbIllV1dLR9hG9NkSrLQgCVs8NF6sBU+VGWjD3wLPhmh1TYAc7ugCFsvHYMN4VcBN1U1A==
"@types/istanbul-lib-coverage@*", "@types/istanbul-lib-coverage@^2.0.0", "@types/istanbul-lib-coverage@^2.0.1": "@types/istanbul-lib-coverage@*", "@types/istanbul-lib-coverage@^2.0.0", "@types/istanbul-lib-coverage@^2.0.1":
version "2.0.3" version "2.0.3"
resolved "https://registry.yarnpkg.com/@types/istanbul-lib-coverage/-/istanbul-lib-coverage-2.0.3.tgz#4ba8ddb720221f432e443bd5f9117fd22cfd4762" resolved "https://registry.yarnpkg.com/@types/istanbul-lib-coverage/-/istanbul-lib-coverage-2.0.3.tgz#4ba8ddb720221f432e443bd5f9117fd22cfd4762"
@ -937,13 +910,6 @@
resolved "https://registry.yarnpkg.com/@types/json-schema/-/json-schema-7.0.15.tgz#596a1747233694d50f6ad8a7869fcb6f56cf5841" resolved "https://registry.yarnpkg.com/@types/json-schema/-/json-schema-7.0.15.tgz#596a1747233694d50f6ad8a7869fcb6f56cf5841"
integrity sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA== integrity sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==
"@types/keyv@*":
version "3.1.1"
resolved "https://registry.yarnpkg.com/@types/keyv/-/keyv-3.1.1.tgz#e45a45324fca9dab716ab1230ee249c9fb52cfa7"
integrity sha512-MPtoySlAZQ37VoLaPcTHCu1RWJ4llDkULYZIzOYxlhxBqYPB0RsRlmMU0R6tahtFe27mIdkHV+551ZWV4PLmVw==
dependencies:
"@types/node" "*"
"@types/node@*": "@types/node@*":
version "15.3.0" version "15.3.0"
resolved "https://registry.yarnpkg.com/@types/node/-/node-15.3.0.tgz#d6fed7d6bc6854306da3dea1af9f874b00783e26" resolved "https://registry.yarnpkg.com/@types/node/-/node-15.3.0.tgz#d6fed7d6bc6854306da3dea1af9f874b00783e26"
@ -961,10 +927,10 @@
resolved "https://registry.yarnpkg.com/@types/prettier/-/prettier-2.7.1.tgz#dfd20e2dc35f027cdd6c1908e80a5ddc7499670e" resolved "https://registry.yarnpkg.com/@types/prettier/-/prettier-2.7.1.tgz#dfd20e2dc35f027cdd6c1908e80a5ddc7499670e"
integrity sha512-ri0UmynRRvZiiUJdiz38MmIblKK+oH30MztdBVR95dv/Ubw6neWSb8u1XpRb72L4qsZOhz+L+z9JD40SJmfWow== integrity sha512-ri0UmynRRvZiiUJdiz38MmIblKK+oH30MztdBVR95dv/Ubw6neWSb8u1XpRb72L4qsZOhz+L+z9JD40SJmfWow==
"@types/responselike@*", "@types/responselike@^1.0.0": "@types/sax@^1.2.7":
version "1.0.0" version "1.2.7"
resolved "https://registry.yarnpkg.com/@types/responselike/-/responselike-1.0.0.tgz#251f4fe7d154d2bad125abe1b429b23afd262e29" resolved "https://registry.yarnpkg.com/@types/sax/-/sax-1.2.7.tgz#ba5fe7df9aa9c89b6dff7688a19023dd2963091d"
integrity sha512-85Y2BjiufFzaMIlvJDvTTB8Fxl2xfLo4HgmHzVBz08w4wDePCTjYw66PdrolO0kzli3yam/YCgRufyo1DdQVTA== integrity sha512-rO73L89PJxeYM3s3pPPjiPgVVcymqU490g0YO5n5By0k2Erzj6tay/4lr1CHAAU4JyOWd1rpQ8bCf6cZfHU96A==
dependencies: dependencies:
"@types/node" "*" "@types/node" "*"
@ -1463,24 +1429,6 @@ buffer@^6.0.3:
base64-js "^1.3.1" base64-js "^1.3.1"
ieee754 "^1.2.1" ieee754 "^1.2.1"
cacheable-lookup@^5.0.3:
version "5.0.4"
resolved "https://registry.yarnpkg.com/cacheable-lookup/-/cacheable-lookup-5.0.4.tgz#5a6b865b2c44357be3d5ebc2a467b032719a7005"
integrity sha512-2/kNscPhpcxrOigMZzbiWF7dz8ilhb/nIHU3EyZiXWXpeq/au8qJ8VhdftMkty3n7Gj6HIGalQG8oiBNB3AJgA==
cacheable-request@^7.0.1:
version "7.0.1"
resolved "https://registry.yarnpkg.com/cacheable-request/-/cacheable-request-7.0.1.tgz#062031c2856232782ed694a257fa35da93942a58"
integrity sha512-lt0mJ6YAnsrBErpTMWeu5kl/tg9xMAWjavYTN6VQXM1A/teBITuNcccXsCxF0tDQQJf9DfAaX5O4e0zp0KlfZw==
dependencies:
clone-response "^1.0.2"
get-stream "^5.1.0"
http-cache-semantics "^4.0.0"
keyv "^4.0.0"
lowercase-keys "^2.0.0"
normalize-url "^4.1.0"
responselike "^2.0.0"
call-bind@^1.0.0, call-bind@^1.0.2: call-bind@^1.0.0, call-bind@^1.0.2:
version "1.0.2" version "1.0.2"
resolved "https://registry.yarnpkg.com/call-bind/-/call-bind-1.0.2.tgz#b1d4e89e688119c3c9a903ad30abb2f6a919be3c" resolved "https://registry.yarnpkg.com/call-bind/-/call-bind-1.0.2.tgz#b1d4e89e688119c3c9a903ad30abb2f6a919be3c"
@ -1567,13 +1515,6 @@ cliui@^8.0.1:
strip-ansi "^6.0.1" strip-ansi "^6.0.1"
wrap-ansi "^7.0.0" wrap-ansi "^7.0.0"
clone-response@^1.0.2:
version "1.0.2"
resolved "https://registry.yarnpkg.com/clone-response/-/clone-response-1.0.2.tgz#d1dc973920314df67fbeb94223b4ee350239e96b"
integrity sha1-0dyXOSAxTfZ/vrlCI7TuNQI56Ws=
dependencies:
mimic-response "^1.0.0"
cluster-key-slot@^1.1.0: cluster-key-slot@^1.1.0:
version "1.1.0" version "1.1.0"
resolved "https://registry.yarnpkg.com/cluster-key-slot/-/cluster-key-slot-1.1.0.tgz#30474b2a981fb12172695833052bc0d01336d10d" resolved "https://registry.yarnpkg.com/cluster-key-slot/-/cluster-key-slot-1.1.0.tgz#30474b2a981fb12172695833052bc0d01336d10d"
@ -1725,11 +1666,6 @@ deepmerge@^4.2.2:
resolved "https://registry.yarnpkg.com/deepmerge/-/deepmerge-4.2.2.tgz#44d2ea3679b8f4d4ffba33f03d865fc1e7bf4955" resolved "https://registry.yarnpkg.com/deepmerge/-/deepmerge-4.2.2.tgz#44d2ea3679b8f4d4ffba33f03d865fc1e7bf4955"
integrity sha512-FJ3UgI4gIl+PHZm53knsuSFpE+nESMr7M4v9QcgB7S63Kj/6WqMiFQJpBBYz1Pt+66bZpP3Q7Lye0Oo9MPKEdg== integrity sha512-FJ3UgI4gIl+PHZm53knsuSFpE+nESMr7M4v9QcgB7S63Kj/6WqMiFQJpBBYz1Pt+66bZpP3Q7Lye0Oo9MPKEdg==
defer-to-connect@^2.0.0:
version "2.0.1"
resolved "https://registry.yarnpkg.com/defer-to-connect/-/defer-to-connect-2.0.1.tgz#8016bdb4143e4632b77a3449c6236277de520587"
integrity sha512-4tvttepXG1VaYGrRibk5EwJd1t4udunSOVMdLSAL6mId1ix438oPwPZMALY41FCijukO1L0twNcGsdzS7dHgDg==
define-properties@^1.1.3: define-properties@^1.1.3:
version "1.1.3" version "1.1.3"
resolved "https://registry.yarnpkg.com/define-properties/-/define-properties-1.1.3.tgz#cf88da6cbee26fe6db7094f61d870cbd84cee9f1" resolved "https://registry.yarnpkg.com/define-properties/-/define-properties-1.1.3.tgz#cf88da6cbee26fe6db7094f61d870cbd84cee9f1"
@ -2358,23 +2294,6 @@ globby@^11.1.0:
merge2 "^1.4.1" merge2 "^1.4.1"
slash "^3.0.0" slash "^3.0.0"
got@^11.8.0:
version "11.8.2"
resolved "https://registry.yarnpkg.com/got/-/got-11.8.2.tgz#7abb3959ea28c31f3576f1576c1effce23f33599"
integrity sha512-D0QywKgIe30ODs+fm8wMZiAcZjypcCodPNuMz5H9Mny7RJ+IjJ10BdmGW7OM7fHXP+O7r6ZwapQ/YQmMSvB0UQ==
dependencies:
"@sindresorhus/is" "^4.0.0"
"@szmarczak/http-timer" "^4.0.5"
"@types/cacheable-request" "^6.0.1"
"@types/responselike" "^1.0.0"
cacheable-lookup "^5.0.3"
cacheable-request "^7.0.1"
decompress-response "^6.0.0"
http2-wrapper "^1.0.0-beta.5.2"
lowercase-keys "^2.0.0"
p-cancelable "^2.0.0"
responselike "^2.0.0"
graceful-fs@^4.1.6, graceful-fs@^4.2.0: graceful-fs@^4.1.6, graceful-fs@^4.2.0:
version "4.2.11" version "4.2.11"
resolved "https://registry.yarnpkg.com/graceful-fs/-/graceful-fs-4.2.11.tgz#4183e4e8bf08bb6e05bbb2f7d2e0c8f712ca40e3" resolved "https://registry.yarnpkg.com/graceful-fs/-/graceful-fs-4.2.11.tgz#4183e4e8bf08bb6e05bbb2f7d2e0c8f712ca40e3"
@ -2434,11 +2353,6 @@ html-escaper@^2.0.0:
resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453" resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453"
integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg== integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==
http-cache-semantics@^4.0.0:
version "4.1.0"
resolved "https://registry.yarnpkg.com/http-cache-semantics/-/http-cache-semantics-4.1.0.tgz#49e91c5cbf36c9b94bcfcd71c23d5249ec74e390"
integrity sha512-carPklcUh7ROWRK7Cv27RPtdhYhUsela/ue5/jKzjegVvXDqM2ILE9Q2BGn9JZJh1g87cp56su/FgQSzcWS8cQ==
http-link-header@^1.1.0: http-link-header@^1.1.0:
version "1.1.0" version "1.1.0"
resolved "https://registry.yarnpkg.com/http-link-header/-/http-link-header-1.1.0.tgz#a1ca87efdbcb7778d8d0d4525de1e6964ec1f129" resolved "https://registry.yarnpkg.com/http-link-header/-/http-link-header-1.1.0.tgz#a1ca87efdbcb7778d8d0d4525de1e6964ec1f129"
@ -2457,14 +2371,6 @@ http-status-codes@^2.1.4:
resolved "https://registry.yarnpkg.com/http-status-codes/-/http-status-codes-2.2.0.tgz#bb2efe63d941dfc2be18e15f703da525169622be" resolved "https://registry.yarnpkg.com/http-status-codes/-/http-status-codes-2.2.0.tgz#bb2efe63d941dfc2be18e15f703da525169622be"
integrity sha512-feERVo9iWxvnejp3SEfm/+oNG517npqL2/PIA8ORjyOZjGC7TwCRQsZylciLS64i6pJ0wRYz3rkXLRwbtFa8Ng== integrity sha512-feERVo9iWxvnejp3SEfm/+oNG517npqL2/PIA8ORjyOZjGC7TwCRQsZylciLS64i6pJ0wRYz3rkXLRwbtFa8Ng==
http2-wrapper@^1.0.0-beta.5.2:
version "1.0.3"
resolved "https://registry.yarnpkg.com/http2-wrapper/-/http2-wrapper-1.0.3.tgz#b8f55e0c1f25d4ebd08b3b0c2c079f9590800b3d"
integrity sha512-V+23sDMr12Wnz7iTcDeJr3O6AIxlnvT/bmaAAAP/Xda35C90p9599p0F1eHR/N1KILWSoWVAiOMFjBBXaXSMxg==
dependencies:
quick-lru "^5.1.1"
resolve-alpn "^1.0.0"
https-proxy-agent@^7.0.0: https-proxy-agent@^7.0.0:
version "7.0.0" version "7.0.0"
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-7.0.0.tgz#75cb70d04811685667183b31ab158d006750418a" resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-7.0.0.tgz#75cb70d04811685667183b31ab158d006750418a"
@ -2684,11 +2590,6 @@ is-glob@^4.0.1, is-glob@^4.0.3:
dependencies: dependencies:
is-extglob "^2.1.1" is-extglob "^2.1.1"
is-gzip@2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/is-gzip/-/is-gzip-2.0.0.tgz#f4fed2bbd9f96bf2cb39e19262797fdb15aad933"
integrity sha512-jtO4Njg6q58zDo/Pu4027beSZ0VdsZlt8/5Moco6yAg+DIxb5BK/xUYqYG2+MD4+piKldXJNHxRkhEYI2fvrxA==
is-negative-zero@^2.0.1: is-negative-zero@^2.0.1:
version "2.0.1" version "2.0.1"
resolved "https://registry.yarnpkg.com/is-negative-zero/-/is-negative-zero-2.0.1.tgz#3de746c18dda2319241a53675908d8f766f11c24" resolved "https://registry.yarnpkg.com/is-negative-zero/-/is-negative-zero-2.0.1.tgz#3de746c18dda2319241a53675908d8f766f11c24"
@ -3220,11 +3121,6 @@ jsesc@^2.5.1:
resolved "https://registry.yarnpkg.com/jsesc/-/jsesc-2.5.2.tgz#80564d2e483dacf6e8ef209650a67df3f0c283a4" resolved "https://registry.yarnpkg.com/jsesc/-/jsesc-2.5.2.tgz#80564d2e483dacf6e8ef209650a67df3f0c283a4"
integrity sha512-OYu7XEzjkCQ3C5Ps3QIZsQfNpqoJyZZA99wd9aWd05NCtC5pWOkShK2mkL6HXQR6/Cy2lbNdPlZBpuQHXE63gA== integrity sha512-OYu7XEzjkCQ3C5Ps3QIZsQfNpqoJyZZA99wd9aWd05NCtC5pWOkShK2mkL6HXQR6/Cy2lbNdPlZBpuQHXE63gA==
json-buffer@3.0.1:
version "3.0.1"
resolved "https://registry.yarnpkg.com/json-buffer/-/json-buffer-3.0.1.tgz#9338802a30d3b6605fbe0613e094008ca8c05a13"
integrity sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==
json-parse-even-better-errors@^2.3.0: json-parse-even-better-errors@^2.3.0:
version "2.3.1" version "2.3.1"
resolved "https://registry.yarnpkg.com/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz#7c47805a94319928e05777405dc12e1f7a4ee02d" resolved "https://registry.yarnpkg.com/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz#7c47805a94319928e05777405dc12e1f7a4ee02d"
@ -3265,13 +3161,6 @@ jsonfile@^4.0.0:
array-includes "^3.1.2" array-includes "^3.1.2"
object.assign "^4.1.2" object.assign "^4.1.2"
keyv@^4.0.0:
version "4.0.3"
resolved "https://registry.yarnpkg.com/keyv/-/keyv-4.0.3.tgz#4f3aa98de254803cafcd2896734108daa35e4254"
integrity sha512-zdGa2TOpSZPq5mU6iowDARnMBZgtCqJ11dJROFi6tg6kTn4nuUdU09lFyLFSaHrWqpIJ+EBq4E8/Dc0Vx5vLdA==
dependencies:
json-buffer "3.0.1"
kleur@^3.0.3: kleur@^3.0.3:
version "3.0.3" version "3.0.3"
resolved "https://registry.yarnpkg.com/kleur/-/kleur-3.0.3.tgz#a79c9ecc86ee1ce3fa6206d1216c501f147fc07e" resolved "https://registry.yarnpkg.com/kleur/-/kleur-3.0.3.tgz#a79c9ecc86ee1ce3fa6206d1216c501f147fc07e"
@ -3336,11 +3225,6 @@ loose-envify@^1.4.0:
dependencies: dependencies:
js-tokens "^3.0.0 || ^4.0.0" js-tokens "^3.0.0 || ^4.0.0"
lowercase-keys@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/lowercase-keys/-/lowercase-keys-2.0.0.tgz#2603e78b7b4b0006cbca2fbcc8a3202558ac9479"
integrity sha512-tqNXrS78oMOE73NMxK4EMLQsQowWf8jKooH9g7xPavRT706R6bkQJ6DY2Te7QukaZsulxa30wQ7bk0pm4XiHmA==
lru-cache@^6.0.0: lru-cache@^6.0.0:
version "6.0.0" version "6.0.0"
resolved "https://registry.yarnpkg.com/lru-cache/-/lru-cache-6.0.0.tgz#6d6fe6570ebd96aaf90fcad1dafa3b2566db3a94" resolved "https://registry.yarnpkg.com/lru-cache/-/lru-cache-6.0.0.tgz#6d6fe6570ebd96aaf90fcad1dafa3b2566db3a94"
@ -3411,11 +3295,6 @@ mimic-fn@^2.1.0:
resolved "https://registry.yarnpkg.com/mimic-fn/-/mimic-fn-2.1.0.tgz#7ed2c2ccccaf84d3ffcb7a69b57711fc2083401b" resolved "https://registry.yarnpkg.com/mimic-fn/-/mimic-fn-2.1.0.tgz#7ed2c2ccccaf84d3ffcb7a69b57711fc2083401b"
integrity sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg== integrity sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==
mimic-response@^1.0.0:
version "1.0.1"
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-1.0.1.tgz#4923538878eef42063cb8a3e3b0798781487ab1b"
integrity sha512-j5EctnkH7amfV/q5Hgmoal1g2QHFJRraOtmx0JpIqkxhBhI/lJSl1nMpQ45hVarwNETOoWEimndZ4QK0RHxuxQ==
mimic-response@^3.1.0: mimic-response@^3.1.0:
version "3.1.0" version "3.1.0"
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9" resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9"
@ -3524,11 +3403,6 @@ normalize-path@^3.0.0:
resolved "https://registry.yarnpkg.com/normalize-path/-/normalize-path-3.0.0.tgz#0dcd69ff23a1c9b11fd0978316644a0388216a65" resolved "https://registry.yarnpkg.com/normalize-path/-/normalize-path-3.0.0.tgz#0dcd69ff23a1c9b11fd0978316644a0388216a65"
integrity sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA== integrity sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==
normalize-url@^4.1.0:
version "4.5.0"
resolved "https://registry.yarnpkg.com/normalize-url/-/normalize-url-4.5.0.tgz#453354087e6ca96957bd8f5baf753f5982142129"
integrity sha512-2s47yzUxdexf1OhyRi4Em83iQk0aPvwTddtFz4hnSSw9dCEsLEGf6SwIO8ss/19S9iBb5sJaOuTvTGDeZI00BQ==
npm-run-path@^4.0.1: npm-run-path@^4.0.1:
version "4.0.1" version "4.0.1"
resolved "https://registry.yarnpkg.com/npm-run-path/-/npm-run-path-4.0.1.tgz#b7ecd1e5ed53da8e37a55e1c2269e0b97ed748ea" resolved "https://registry.yarnpkg.com/npm-run-path/-/npm-run-path-4.0.1.tgz#b7ecd1e5ed53da8e37a55e1c2269e0b97ed748ea"
@ -3622,11 +3496,6 @@ optionator@^0.9.3:
prelude-ls "^1.2.1" prelude-ls "^1.2.1"
type-check "^0.4.0" type-check "^0.4.0"
p-cancelable@^2.0.0:
version "2.1.1"
resolved "https://registry.yarnpkg.com/p-cancelable/-/p-cancelable-2.1.1.tgz#aab7fbd416582fa32a3db49859c122487c5ed2cf"
integrity sha512-BZOr3nRQHOntUjTrH8+Lh54smKHoHyur8We1V8DSMVrl5A2malOOwuJRnKRDjSnkoeBh4at6BwEnb5I7Jl31wg==
p-limit@^2.2.0: p-limit@^2.2.0:
version "2.3.0" version "2.3.0"
resolved "https://registry.yarnpkg.com/p-limit/-/p-limit-2.3.0.tgz#3dd33c647a214fdfffd835933eb086da0dc21db1" resolved "https://registry.yarnpkg.com/p-limit/-/p-limit-2.3.0.tgz#3dd33c647a214fdfffd835933eb086da0dc21db1"
@ -3951,11 +3820,6 @@ queue-tick@^1.0.1:
resolved "https://registry.yarnpkg.com/queue-tick/-/queue-tick-1.0.1.tgz#f6f07ac82c1fd60f82e098b417a80e52f1f4c142" resolved "https://registry.yarnpkg.com/queue-tick/-/queue-tick-1.0.1.tgz#f6f07ac82c1fd60f82e098b417a80e52f1f4c142"
integrity sha512-kJt5qhMxoszgU/62PLP1CJytzd2NKetjSRnyuj31fDd3Rlcz3fzlFdFLD1SItunPwyqEOkca6GbV612BWfaBag== integrity sha512-kJt5qhMxoszgU/62PLP1CJytzd2NKetjSRnyuj31fDd3Rlcz3fzlFdFLD1SItunPwyqEOkca6GbV612BWfaBag==
quick-lru@^5.1.1:
version "5.1.1"
resolved "https://registry.yarnpkg.com/quick-lru/-/quick-lru-5.1.1.tgz#366493e6b3e42a3a6885e2e99d18f80fb7a8c932"
integrity sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==
rc@^1.2.7: rc@^1.2.7:
version "1.2.8" version "1.2.8"
resolved "https://registry.yarnpkg.com/rc/-/rc-1.2.8.tgz#cd924bf5200a075b83c188cd6b9e211b7fc0d3ed" resolved "https://registry.yarnpkg.com/rc/-/rc-1.2.8.tgz#cd924bf5200a075b83c188cd6b9e211b7fc0d3ed"
@ -4024,11 +3888,6 @@ require-directory@^2.1.1:
resolved "https://registry.yarnpkg.com/require-directory/-/require-directory-2.1.1.tgz#8c64ad5fd30dab1c976e2344ffe7f792a6a6df42" resolved "https://registry.yarnpkg.com/require-directory/-/require-directory-2.1.1.tgz#8c64ad5fd30dab1c976e2344ffe7f792a6a6df42"
integrity sha1-jGStX9MNqxyXbiNE/+f3kqam30I= integrity sha1-jGStX9MNqxyXbiNE/+f3kqam30I=
resolve-alpn@^1.0.0:
version "1.1.2"
resolved "https://registry.yarnpkg.com/resolve-alpn/-/resolve-alpn-1.1.2.tgz#30b60cfbb0c0b8dc897940fe13fe255afcdd4d28"
integrity sha512-8OyfzhAtA32LVUsJSke3auIyINcwdh5l3cvYKdKO0nvsYSKuiLfTM5i78PJswFPT8y6cPW+L1v6/hE95chcpDA==
resolve-cwd@^3.0.0: resolve-cwd@^3.0.0:
version "3.0.0" version "3.0.0"
resolved "https://registry.yarnpkg.com/resolve-cwd/-/resolve-cwd-3.0.0.tgz#0f0075f1bb2544766cf73ba6a6e2adfebcb13f2d" resolved "https://registry.yarnpkg.com/resolve-cwd/-/resolve-cwd-3.0.0.tgz#0f0075f1bb2544766cf73ba6a6e2adfebcb13f2d"
@ -4068,13 +3927,6 @@ resolve@^2.0.0-next.3:
is-core-module "^2.2.0" is-core-module "^2.2.0"
path-parse "^1.0.6" path-parse "^1.0.6"
responselike@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/responselike/-/responselike-2.0.0.tgz#26391bcc3174f750f9a79eacc40a12a5c42d7723"
integrity sha512-xH48u3FTB9VsZw7R+vvgaKeLKzT6jOogbQhEe/jewwnZgzPcnyWui2Av6JpoYZF/91uueC+lqhWqeURw5/qhCw==
dependencies:
lowercase-keys "^2.0.0"
reusify@^1.0.4: reusify@^1.0.4:
version "1.0.4" version "1.0.4"
resolved "https://registry.yarnpkg.com/reusify/-/reusify-1.0.4.tgz#90da382b1e126efc02146e90845a88db12925d76" resolved "https://registry.yarnpkg.com/reusify/-/reusify-1.0.4.tgz#90da382b1e126efc02146e90845a88db12925d76"
@ -4109,6 +3961,11 @@ sax@>=0.6.0:
resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9" resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9"
integrity sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw== integrity sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw==
sax@^1.3.0:
version "1.3.0"
resolved "https://registry.yarnpkg.com/sax/-/sax-1.3.0.tgz#a5dbe77db3be05c9d1ee7785dbd3ea9de51593d0"
integrity sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA==
search-params@3.0.0: search-params@3.0.0:
version "3.0.0" version "3.0.0"
resolved "https://registry.yarnpkg.com/search-params/-/search-params-3.0.0.tgz#dbc7c243058e5a33ae1e9870be91f5aced4100d8" resolved "https://registry.yarnpkg.com/search-params/-/search-params-3.0.0.tgz#dbc7c243058e5a33ae1e9870be91f5aced4100d8"
@ -4199,16 +4056,6 @@ sisteransi@^1.0.5:
resolved "https://registry.yarnpkg.com/sisteransi/-/sisteransi-1.0.5.tgz#134d681297756437cc05ca01370d3a7a571075ed" resolved "https://registry.yarnpkg.com/sisteransi/-/sisteransi-1.0.5.tgz#134d681297756437cc05ca01370d3a7a571075ed"
integrity sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg== integrity sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==
sitemapper@^3.2.6:
version "3.2.6"
resolved "https://registry.yarnpkg.com/sitemapper/-/sitemapper-3.2.6.tgz#892ebdade9a1b0839bd3dee3b67f3d57b10b3a89"
integrity sha512-AZbim4lmKgchUj6yyJ9ru0eLJ4/S6QAqy5QEbpCpvBbBnXxTERLMC6rzgKy1gHM19YUEtYJFTC2t8lxDWO0wkQ==
dependencies:
got "^11.8.0"
is-gzip "2.0.0"
p-limit "^3.1.0"
xml2js "^0.4.23"
slash@^3.0.0: slash@^3.0.0:
version "3.0.0" version "3.0.0"
resolved "https://registry.yarnpkg.com/slash/-/slash-3.0.0.tgz#6539be870c165adbd5240220dbe361f1bc4d4634" resolved "https://registry.yarnpkg.com/slash/-/slash-3.0.0.tgz#6539be870c165adbd5240220dbe361f1bc4d4634"
@ -4812,14 +4659,6 @@ ws@^7.4.4:
resolved "https://registry.yarnpkg.com/ws/-/ws-7.5.9.tgz#54fa7db29f4c7cec68b1ddd3a89de099942bb591" resolved "https://registry.yarnpkg.com/ws/-/ws-7.5.9.tgz#54fa7db29f4c7cec68b1ddd3a89de099942bb591"
integrity sha512-F+P9Jil7UiSKSkppIiD94dN07AwvFixvLIj1Og1Rl9GGMuNipJnV9JzjD6XuqmAeiswGvUmNLjr5cFuXwNS77Q== integrity sha512-F+P9Jil7UiSKSkppIiD94dN07AwvFixvLIj1Og1Rl9GGMuNipJnV9JzjD6XuqmAeiswGvUmNLjr5cFuXwNS77Q==
xml2js@^0.4.23:
version "0.4.23"
resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.4.23.tgz#a0c69516752421eb2ac758ee4d4ccf58843eac66"
integrity sha512-ySPiMjM0+pLDftHgXY4By0uswI3SPKLDw/i3UXbnO8M/p28zqexCUoPmQFrYD+/1BzhGJSs2i1ERWKJAtiLrug==
dependencies:
sax ">=0.6.0"
xmlbuilder "~11.0.0"
xml2js@^0.5.0: xml2js@^0.5.0:
version "0.5.0" version "0.5.0"
resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.5.0.tgz#d9440631fbb2ed800203fad106f2724f62c493b7" resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.5.0.tgz#d9440631fbb2ed800203fad106f2724f62c493b7"