Mirror of https://github.com/webrecorder/browsertrix-crawler.git
Synced 2025-10-19 14:33:17 +00:00
Support custom css selectors for extracting links (#689)

Support an array of selectors via the --selectLinks option, in the form [css selector]->[property] or [css selector]->@[attribute].

Parent: 2a9b152531
Commit: d04509639a

11 changed files with 194 additions and 109 deletions
@@ -50,6 +50,11 @@ Options:
                                        e-page-application crawling or when
                                        different hashtags load dynamic cont
                                        ent
+      --selectLinks                    one or more selectors for extracting
+                                       links, in the format [css selector]
+                                       ->[property to use],[css selector]->
+                                       @[attribute to use]
+                                          [array] [default: ["a[href]->href"]]
       --blockRules                     Additional rules for blocking certai
                                        n URLs from being loaded, by URL reg
                                        ex and optionally via text match in

@@ -70,8 +75,7 @@ Options:
                                          [string] [default: "crawl-@ts"]
       --headless                       Run in headless mode, otherwise star
                                        t xvfb        [boolean] [default: false]
-      --driver                         JS driver for the crawler
-                                         [string] [default: "./defaultDriver.js"]
+      --driver                         JS driver for the crawler       [string]
       --generateCDX, --generatecdx, --gene  If set, generate index (CDXJ) for us
       rateCdx                          e with pywb after crawl is done
                                          [boolean] [default: false]

@@ -248,8 +252,8 @@ Options:
                                          [boolean] [default: false]
       --customBehaviors                Custom behavior files to inject. Val
                                        ues can be URLs, paths to individual
-                                       behavior files, or paths to a direct
-                                       ory of behavior files.
+                                       behavior files, or paths to a direc
+                                       tory of behavior files
                                          [array] [default: []]
       --debugAccessRedis               if set, runs internal redis without
                                        protected mode to allow external acc

@@ -289,14 +293,14 @@ Options:
   --version          Show version number                              [boolean]
   --url              The URL of the login page              [string] [required]
   --user             The username for the login. If not specified, will b
-                     e prompted
+                     e prompted                                        [string]
   --password         The password for the login. If not specified, will b
-                     e prompted (recommended)
+                     e prompted (recommended)                          [string]
   --filename         The filename for the profile tarball, stored within
                      /crawls/profiles if absolute path not provided
-                       [default: "/crawls/profiles/profile.tar.gz"]
+                       [string] [default: "/crawls/profiles/profile.tar.gz"]
   --debugScreenshot  If specified, take a screenshot after login and save
-                     as this filename
+                     as this filename               [boolean] [default: false]
   --headless         Run in headless mode, otherwise start xvfb
                        [boolean] [default: false]
   --automated        Start in automated mode, no interactive browser
@@ -17,6 +17,16 @@ can be used to specify additional seconds to wait after the page appears to have
 
 (On the other hand, the `--pageExtraDelay`/`--delay` option adds an extra delay after all post-load actions have taken place, and can be useful for rate-limiting.)
 
+## Link Extraction
+
+By default, the crawler will extract all `href` properties from all `<a>` tags that have an `href`.
+This can be customized with the `--selectLinks` option, which can provide alternative selectors of the form
+`[css selector]->[property to use]` or `[css selector]->@[attribute to use]`. The default value is `a[href]->href`.
+
+For example, to keep the default but also treat the `custom-href` attribute of every `div` with class `mylink` as a link, use `--selectLinks 'a[href]->href' --selectLinks 'div.mylink->@custom-href'`.
+
+Any number of selectors can be specified in this way, and each will be applied in sequence on each page.
+
 ## Ad Blocking
 
 Brave Browser, the browser used by Browsertrix Crawler for crawling, has some ad and tracker blocking features enabled by default. These [Shields](https://brave.com/shields/) can be disabled or customized using [Browser Profiles](browser-profiles.md).
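The distinction between the `->[property]` and `->@[attribute]` forms is the usual DOM one: a property read (e.g. `el.href`, which resolves to an absolute URL) versus a raw attribute read (e.g. `el.getAttribute("custom-href")`). A rough standalone sketch of what each form yields; `sketchExtract` is an illustrative helper name, not the crawler's actual in-frame extraction code:

```js
// Illustrative only: how one selector entry is applied to a document.
function sketchExtract(doc, { selector, extract, isAttribute }) {
  const results = [];
  for (const el of doc.querySelectorAll(selector)) {
    // property form ("->href") reads the DOM property;
    // attribute form ("->@custom-href") reads the attribute text as written in the HTML.
    const value = isAttribute ? el.getAttribute(extract) : el[extract];
    if (value) results.push(value);
  }
  return results;
}

// sketchExtract(document, { selector: "a[href]", extract: "href", isAttribute: false });
// sketchExtract(document, { selector: "div.mylink", extract: "custom-href", isAttribute: true });
```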
@@ -20,6 +20,7 @@
     "@webrecorder/wabac": "^2.20.0",
     "browsertrix-behaviors": "^0.6.4",
     "client-zip": "^2.4.5",
+    "css-selector-parser": "^3.0.5",
     "fetch-socks": "^1.3.0",
     "get-folder-size": "^4.0.0",
     "husky": "^8.0.3",
@@ -46,8 +46,8 @@ import { Browser } from "./util/browser.js";
 import {
   ADD_LINK_FUNC,
   BEHAVIOR_LOG_FUNC,
-  DEFAULT_SELECTORS,
   DISPLAY,
+  ExtractSelector,
   PAGE_OP_TIMEOUT_SECS,
   SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
 } from "./util/constants.js";
@@ -191,12 +191,14 @@ export class Crawler {
 
   proxyServer?: string;
 
-  driver!: (opts: {
-    page: Page;
-    data: PageState;
-    // eslint-disable-next-line no-use-before-define
-    crawler: Crawler;
-  }) => Promise<void>;
+  driver:
+    | ((opts: {
+        page: Page;
+        data: PageState;
+        // eslint-disable-next-line no-use-before-define
+        crawler: Crawler;
+      }) => Promise<void>)
+    | null = null;
 
   recording: boolean;
@@ -491,6 +493,8 @@ export class Crawler {
 
     logger.info("Seeds", this.seeds);
 
+    logger.info("Link Selectors", this.params.selectLinks);
+
     if (this.params.behaviorOpts) {
       logger.info("Behavior Options", this.params.behaviorOpts);
     } else {
@@ -930,8 +934,12 @@ self.__bx_behaviors.selectMainBehavior();
       await page.setExtraHTTPHeaders({});
     }
 
-    // run custom driver here
-    await this.driver({ page, data, crawler: this });
+    // run custom driver here, if any
+    if (this.driver) {
+      await this.driver({ page, data, crawler: this });
+    } else {
+      await this.loadPage(page, data);
+    }
 
     data.title = await timedRun(
       page.title(),
@@ -1347,12 +1355,14 @@ self.__bx_behaviors.selectMainBehavior();
       );
     }
 
-    try {
-      const driverUrl = new URL(this.params.driver, import.meta.url);
-      this.driver = (await import(driverUrl.href)).default;
-    } catch (e) {
-      logger.warn(`Error importing driver ${this.params.driver}`, e);
-      return;
+    if (this.params.driver) {
+      try {
+        const driverUrl = new URL(this.params.driver, import.meta.url);
+        this.driver = (await import(driverUrl.href)).default;
+      } catch (e) {
+        logger.warn(`Error importing driver ${this.params.driver}`, e);
+        return;
+      }
     }
 
     await this.initCrawlState();
@@ -1741,11 +1751,7 @@ self.__bx_behaviors.selectMainBehavior();
     }
   }
 
-  async loadPage(
-    page: Page,
-    data: PageState,
-    selectorOptsList = DEFAULT_SELECTORS,
-  ) {
+  async loadPage(page: Page, data: PageState) {
     const { url, depth } = data;
 
     const logDetails = data.logDetails;
@@ -1946,14 +1952,18 @@ self.__bx_behaviors.selectMainBehavior();
     await this.awaitPageLoad(page.mainFrame(), logDetails);
 
     // skip extraction if at max depth
-    if (seed.isAtMaxDepth(depth, extraHops) || !selectorOptsList) {
-      logger.debug("Skipping Link Extraction, At Max Depth");
+    if (seed.isAtMaxDepth(depth, extraHops)) {
+      logger.debug("Skipping Link Extraction, At Max Depth", {}, "links");
       return;
     }
 
-    logger.debug("Extracting links", logDetails);
+    logger.debug(
+      "Extracting links",
+      { selectors: this.params.selectLinks, ...logDetails },
+      "links",
+    );
 
-    await this.extractLinks(page, data, selectorOptsList, logDetails);
+    await this.extractLinks(page, data, this.params.selectLinks, logDetails);
   }
 
   async netIdle(page: Page, details: LogDetails) {
@@ -1999,7 +2009,7 @@ self.__bx_behaviors.selectMainBehavior();
   async extractLinks(
     page: Page,
     data: PageState,
-    selectors = DEFAULT_SELECTORS,
+    selectors: ExtractSelector[],
    logDetails: LogDetails,
   ) {
     const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;
@@ -2045,11 +2055,7 @@ self.__bx_behaviors.selectMainBehavior();
     const frames = filteredFrames || page.frames();
 
     try {
-      for (const {
-        selector = "a[href]",
-        extract = "href",
-        isAttribute = false,
-      } of selectors) {
+      for (const { selector, extract, isAttribute } of selectors) {
         await Promise.allSettled(
           frames.map((frame) => {
             const getLinks = frame
@@ -1,15 +0,0 @@
-import { Page } from "puppeteer-core";
-import { PageState } from "./util/state.js";
-import { Crawler } from "./crawler.js";
-
-export default async ({
-  data,
-  page,
-  crawler,
-}: {
-  data: PageState;
-  page: Page;
-  crawler: Crawler;
-}) => {
-  await crawler.loadPage(page, data);
-};
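With this default driver deleted and `--driver` now optional (see the conditional `if (this.driver)` call above), a custom driver is still an ES module whose default export receives `{ data, page, crawler }`. A minimal sketch, following the shape of the removed default and of the updated `tests/fixtures/driver-1.mjs` later in this commit:

```js
// Minimal custom driver sketch (same shape as the removed default driver).
export default async ({ data, page, crawler }) => {
  // Let the crawler perform its normal page load and link extraction...
  await crawler.loadPage(page, data);
  // ...then do any extra per-page work, e.g. saving a PDF as the test fixture does:
  // await page.pdf({ path: `${crawler.collDir}/${data.pageid}.pdf` });
};
```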
@@ -7,11 +7,15 @@ import { KnownDevices as devices } from "puppeteer-core";
 import yargs from "yargs";
 import { hideBin } from "yargs/helpers";
 
+import { createParser } from "css-selector-parser";
+
 import {
   BEHAVIOR_LOG_FUNC,
   WAIT_UNTIL_OPTS,
   EXTRACT_TEXT_TYPES,
   SERVICE_WORKER_OPTS,
+  DEFAULT_SELECTORS,
+  ExtractSelector,
 } from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { interpolateFilename } from "./storage.js";
@@ -32,6 +36,8 @@ export type CrawlerArgs = ReturnType<typeof parseArgs> & {
 
   scopedSeeds: ScopedSeed[];
 
+  selectLinks: ExtractSelector[];
+
   crawlId: string;
 
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -156,6 +162,14 @@ class ArgParser {
         "Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
     },
 
+    selectLinks: {
+      describe:
+        "One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]",
+      type: "array",
+      default: ["a[href]->href"],
+      coerce,
+    },
+
     blockRules: {
       describe:
         "Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
@@ -200,9 +214,8 @@ class ArgParser {
     },
 
     driver: {
-      describe: "JS driver for the crawler",
+      describe: "Custom driver for the crawler, if any",
       type: "string",
-      default: "./defaultDriver.js",
     },
 
     generateCDX: {
@@ -714,6 +727,30 @@ class ArgParser {
       }
     }
 
+    let selectLinks: ExtractSelector[];
+
+    const parser = createParser();
+
+    if (argv.selectLinks) {
+      selectLinks = argv.selectLinks.map((x: string) => {
+        const parts = x.split("->");
+        const selector = parts[0];
+        const value = parts[1] || "";
+        const extract = parts.length > 1 ? value.replace("@", "") : "href";
+        const isAttribute = value.startsWith("@");
+        try {
+          parser(selector);
+        } catch (e) {
+          logger.fatal("Invalid Link Extraction CSS Selector", { selector });
+        }
+        return { selector, extract, isAttribute };
+      });
+    } else {
+      selectLinks = DEFAULT_SELECTORS;
+    }
+
+    argv.selectLinks = selectLinks;
+
     if (argv.netIdleWait === -1) {
       if (argv.scopeType === "page" || argv.scopeType === "page-spa") {
         argv.netIdleWait = 15;
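The `->` split above turns each `--selectLinks` value into an `ExtractSelector`. The same mapping shown in isolation (`parseSelectLink` is an illustrative name; the real parser additionally validates the selector with `css-selector-parser` and fails the crawl on invalid input):

```js
// Same split("->") logic as the argument parser, without the CSS validity check.
function parseSelectLink(x) {
  const parts = x.split("->");
  const selector = parts[0];
  const value = parts[1] || "";
  const extract = parts.length > 1 ? value.replace("@", "") : "href";
  const isAttribute = value.startsWith("@");
  return { selector, extract, isAttribute };
}

console.log(parseSelectLink("a[href]->href"));
// { selector: "a[href]", extract: "href", isAttribute: false }
console.log(parseSelectLink("div.mylink->@custom-href"));
// { selector: "div.mylink", extract: "custom-href", isAttribute: true }
```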
@@ -30,7 +30,13 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30;
 export const PAGE_OP_TIMEOUT_SECS = 5;
 export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
 
-export const DEFAULT_SELECTORS = [
+export type ExtractSelector = {
+  selector: string;
+  extract: string;
+  isAttribute: boolean;
+};
+
+export const DEFAULT_SELECTORS: ExtractSelector[] = [
   {
     selector: "a[href]",
     extract: "href",
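The hunk above cuts off after `extract: "href",`; given the documented default (extract the `href` property of every `<a href>`), the complete default list presumably looks like the following sketch:

```ts
export type ExtractSelector = {
  selector: string;
  extract: string;
  isAttribute: boolean;
};

// Presumed complete default: a single selector reading the href property
// (not a raw attribute) of every <a> element that has an href.
export const DEFAULT_SELECTORS: ExtractSelector[] = [
  { selector: "a[href]", extract: "href", isAttribute: false },
];
```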
@@ -1,52 +1,15 @@
 import child_process from "child_process";
 import fs from "fs";
 
-test("ensure custom driver with custom selector crawls JS files as pages", async () => {
+test("ensure custom driver creates PDF", async () => {
   try {
     child_process.execSync(
-      "docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs",
+      "docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs --limit 1",
     );
   } catch (error) {
     console.log(error);
   }
 
-  const crawledPages = fs.readFileSync(
-    "test-crawls/collections/custom-driver-1/pages/pages.jsonl",
-    "utf8",
-  );
-  const pages = new Set();
-
-  for (const line of crawledPages.trim().split("\n")) {
-    const url = JSON.parse(line).url;
-    if (!url) {
-      continue;
-    }
-    pages.add(url);
-  }
-
-  const crawledExtraPages = fs.readFileSync(
-    "test-crawls/collections/custom-driver-1/pages/extraPages.jsonl",
-    "utf8",
-  );
-  const extraPages = new Set();
-
-  for (const line of crawledExtraPages.trim().split("\n")) {
-    const url = JSON.parse(line).url;
-    if (!url) {
-      continue;
-    }
-    extraPages.add(url);
-  }
-
-  const expectedPages = new Set([
-    "https://www.iana.org/",
-  ]);
-
-  const expectedExtraPages = new Set([
-    "https://www.iana.org/_js/jquery.js",
-    "https://www.iana.org/_js/iana.js",
-  ]);
-
-  expect(pages).toEqual(expectedPages);
-  expect(extraPages).toEqual(expectedExtraPages);
+  const pdfs = fs.readdirSync("test-crawls/collections/custom-driver-1").filter(x => x.endsWith(".pdf"));
+  expect(pdfs.length).toBe(1);
 });
tests/custom_selector.test.js (new file, 68 lines)
@@ -0,0 +1,68 @@
+import child_process from "child_process";
+import fs from "fs";
+
+test("test custom selector crawls JS files as pages", async () => {
+  try {
+    child_process.execSync(
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-1 --selectLinks \"script[src]->src\"",
+    );
+  } catch (error) {
+    console.log(error);
+  }
+
+  const crawledPages = fs.readFileSync(
+    "test-crawls/collections/custom-sel-1/pages/pages.jsonl",
+    "utf8",
+  );
+  const pages = new Set();
+
+  for (const line of crawledPages.trim().split("\n")) {
+    const url = JSON.parse(line).url;
+    if (!url) {
+      continue;
+    }
+    pages.add(url);
+  }
+
+  const crawledExtraPages = fs.readFileSync(
+    "test-crawls/collections/custom-sel-1/pages/extraPages.jsonl",
+    "utf8",
+  );
+  const extraPages = new Set();
+
+  for (const line of crawledExtraPages.trim().split("\n")) {
+    const url = JSON.parse(line).url;
+    if (!url) {
+      continue;
+    }
+    extraPages.add(url);
+  }
+
+  const expectedPages = new Set([
+    "https://www.iana.org/",
+  ]);
+
+  const expectedExtraPages = new Set([
+    "https://www.iana.org/_js/jquery.js",
+    "https://www.iana.org/_js/iana.js",
+  ]);
+
+  expect(pages).toEqual(expectedPages);
+  expect(extraPages).toEqual(expectedExtraPages);
+});
+
+
+test("test invalid selector, crawl fails", async () => {
+  let failed = false;
+  try {
+    child_process.execSync(
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-invalid --selectLinks \"script[\"",
+    );
+  } catch (error) {
+    failed = true;
+  }
+
+  expect(failed).toBe(true);
+});
tests/fixtures/driver-1.mjs (vendored, 6 lines changed)
@@ -1,5 +1,5 @@
 export default async ({ data, page, crawler }) => {
-  await crawler.loadPage(page, data, [
-    { selector: "script[src]", extract: "src", isAttribute: false },
-  ]);
+  await crawler.loadPage(page, data);
+  await page.pdf({"path": `${crawler.collDir}/${data.pageid}.pdf`});
 };
yarn.lock (23 lines changed)
@@ -1312,16 +1312,16 @@
   resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
   integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==
 
-"@webrecorder/wabac@^2.20.0-beta.4":
-  version "2.20.0-beta.4"
-  resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.20.0-beta.4.tgz#c60fcd00f449cca52ce1a0bef305a06922c9e3e8"
-  integrity sha512-enHYcZoqs7cOu2tdTqVeB/zB27uL4wmCMzvF55bJqdB8d5zgPpY+/fpRA3eLxGrPc0nFYAjsI/aNaa62FH7WKQ==
+"@webrecorder/wabac@^2.20.0":
+  version "2.20.1"
+  resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.20.1.tgz#58e397e2ef1c33de1bb37aa4f51fc7f3eec8a1f7"
+  integrity sha512-RX+U6m7aVgvsAfLb9FuLY/PcHCNL5dc1FPaD0GnUiFgswSSe5v4MjIhqJNOnbrJYEcbib81AJfxNuvOyXAJDJQ==
   dependencies:
     "@peculiar/asn1-ecc" "^2.3.4"
     "@peculiar/asn1-schema" "^2.3.3"
     "@peculiar/x509" "^1.9.2"
     "@types/js-levenshtein" "^1.1.3"
-    "@webrecorder/wombat" "^3.8.2"
+    "@webrecorder/wombat" "^3.8.3"
     acorn "^8.10.0"
     auto-js-ipfs "^2.1.1"
     base64-js "^1.5.1"

@@ -1342,10 +1342,10 @@
     stream-browserify "^3.0.0"
     warcio "^2.3.1"
 
-"@webrecorder/wombat@^3.8.2":
-  version "3.8.2"
-  resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.2.tgz#e46e18719834d633175eec52ce753a4dc4e48e27"
-  integrity sha512-uUZr9V4UYpVOpM64Tm27ND/hMjDbT37+/qyNaNV6loqDuVzBVQh5w7SfTEy0Bbjj1MYyNZP244mOtWtotTpUEA==
+"@webrecorder/wombat@^3.8.3":
+  version "3.8.3"
+  resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.3.tgz#c5a077225d1a70def9fbbbfcd50fa4465d236546"
+  integrity sha512-dqgoxigB3OdX5JeB3yxJrUNwFwUBlYC+LmGrLEgGeP259MFzXQLD2pmfuqGt5ygWvIv56SrAMV4sUceux07X2A==
   dependencies:
     warcio "^2.3.1"

@@ -1963,6 +1963,11 @@ crypto-random-string@^4.0.0:
   dependencies:
     type-fest "^1.0.1"
 
+css-selector-parser@^3.0.5:
+  version "3.0.5"
+  resolved "https://registry.yarnpkg.com/css-selector-parser/-/css-selector-parser-3.0.5.tgz#9b636ebccf7c4bcce5c1ac21ae27de9f01180ae9"
+  integrity sha512-3itoDFbKUNx1eKmVpYMFyqKX04Ww9osZ+dLgrk6GEv6KMVeXUhUnp4I5X+evw+u3ZxVU6RFXSSRxlTeMh8bA+g==
+
 data-uri-to-buffer@^5.0.1:
   version "5.0.1"
   resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-5.0.1.tgz#db89a9e279c2ffe74f50637a59a32fb23b3e4d7c"