Support custom CSS selectors for extracting links (#689)

Support an array of selectors via the --selectLinks option, in the
form [css selector]->[property] or [css selector]->@[attribute].
Ilya Kreymer 2024-11-08 08:04:41 -08:00 committed by GitHub
parent 2a9b152531
commit d04509639a
11 changed files with 194 additions and 109 deletions

@ -50,6 +50,11 @@ Options:
e-page-application crawling or when
different hashtags load dynamic cont
ent
--selectLinks one or more selectors for extracting
links, in the format [css selector]
->[property to use],[css selector]->
@[attribute to use]
[array] [default: ["a[href]->href"]]
--blockRules Additional rules for blocking certai
n URLs from being loaded, by URL reg
ex and optionally via text match in
@ -70,8 +75,7 @@ Options:
[string] [default: "crawl-@ts"]
--headless Run in headless mode, otherwise star
t xvfb [boolean] [default: false]
--driver JS driver for the crawler
[string] [default: "./defaultDriver.js"]
--driver JS driver for the crawler [string]
--generateCDX, --generatecdx, --gene If set, generate index (CDXJ) for us
rateCdx e with pywb after crawl is done
[boolean] [default: false]
@ -248,8 +252,8 @@ Options:
[boolean] [default: false]
--customBehaviors Custom behavior files to inject. Val
ues can be URLs, paths to individual
behavior files, or paths to a direct
ory of behavior files.
behavior files, or paths to a direc
tory of behavior files
[array] [default: []]
--debugAccessRedis if set, runs internal redis without
protected mode to allow external acc
@ -289,14 +293,14 @@ Options:
--version Show version number [boolean]
--url The URL of the login page [string] [required]
--user The username for the login. If not specified, will b
e prompted
e prompted [string]
--password The password for the login. If not specified, will b
e prompted (recommended)
e prompted (recommended) [string]
--filename The filename for the profile tarball, stored within
/crawls/profiles if absolute path not provided
[default: "/crawls/profiles/profile.tar.gz"]
[string] [default: "/crawls/profiles/profile.tar.gz"]
--debugScreenshot If specified, take a screenshot after login and save
as this filename
as this filename [boolean] [default: false]
--headless Run in headless mode, otherwise start xvfb
[boolean] [default: false]
--automated Start in automated mode, no interactive browser

@ -17,6 +17,16 @@ can be used to specify additional seconds to wait after the page appears to have
(On the other hand, the `--pageExtraDelay`/`--delay` option adds an extra delay after all post-load actions have taken place, and can be useful for rate-limiting.)
## Link Extraction
By default, the crawler will extract all `href` properties from all `<a>` tags that have an `href`.
This can be customized with the `--selectLinks` option, which accepts one or more alternative selectors of the form:
`[css selector]->[property to use]` or `[css selector]->@[attribute to use]`. The default value is `a[href]->href`.
For example, to keep the default but also include all `div` elements that have class `mylink`, using their `custom-href` attribute as the link, use `--selectLinks 'a[href]->href' --selectLinks 'div.mylink->@custom-href'`.
Any number of selectors can be specified in this way, and each will be applied in sequence on each page.
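As a rough illustration, the sketch below shows how each value breaks down into a selector plus either a property or an attribute. The `parseSelectLink` helper is hypothetical and only mirrors the shape the crawler uses internally (`selector`, `extract`, `isAttribute`):

```ts
// Illustrative only: how "[css selector]->[property]" and
// "[css selector]->@[attribute]" values split apart.
type ExtractSelector = { selector: string; extract: string; isAttribute: boolean };

function parseSelectLink(value: string): ExtractSelector {
  const [selector, rest = ""] = value.split("->");
  const isAttribute = rest.startsWith("@");
  // No "->" part falls back to the href property, matching the default "a[href]->href".
  const extract = rest ? rest.replace("@", "") : "href";
  return { selector, extract, isAttribute };
}

parseSelectLink("a[href]->href");
// { selector: "a[href]", extract: "href", isAttribute: false }
parseSelectLink("div.mylink->@custom-href");
// { selector: "div.mylink", extract: "custom-href", isAttribute: true }
```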
## Ad Blocking
Brave Browser, the browser used by Browsertrix Crawler for crawling, has some ad and tracker blocking features enabled by default. These [Shields](https://brave.com/shields/) can be disabled or customized using [Browser Profiles](browser-profiles.md).

@ -20,6 +20,7 @@
"@webrecorder/wabac": "^2.20.0",
"browsertrix-behaviors": "^0.6.4",
"client-zip": "^2.4.5",
"css-selector-parser": "^3.0.5",
"fetch-socks": "^1.3.0",
"get-folder-size": "^4.0.0",
"husky": "^8.0.3",

@ -46,8 +46,8 @@ import { Browser } from "./util/browser.js";
import {
ADD_LINK_FUNC,
BEHAVIOR_LOG_FUNC,
DEFAULT_SELECTORS,
DISPLAY,
ExtractSelector,
PAGE_OP_TIMEOUT_SECS,
SITEMAP_INITIAL_FETCH_TIMEOUT_SECS,
} from "./util/constants.js";
@ -191,12 +191,14 @@ export class Crawler {
proxyServer?: string;
driver!: (opts: {
page: Page;
data: PageState;
// eslint-disable-next-line no-use-before-define
crawler: Crawler;
}) => Promise<void>;
driver:
| ((opts: {
page: Page;
data: PageState;
// eslint-disable-next-line no-use-before-define
crawler: Crawler;
}) => Promise<void>)
| null = null;
recording: boolean;
@ -491,6 +493,8 @@ export class Crawler {
logger.info("Seeds", this.seeds);
logger.info("Link Selectors", this.params.selectLinks);
if (this.params.behaviorOpts) {
logger.info("Behavior Options", this.params.behaviorOpts);
} else {
@ -930,8 +934,12 @@ self.__bx_behaviors.selectMainBehavior();
await page.setExtraHTTPHeaders({});
}
// run custom driver here
await this.driver({ page, data, crawler: this });
// run custom driver here, if any
if (this.driver) {
await this.driver({ page, data, crawler: this });
} else {
await this.loadPage(page, data);
}
data.title = await timedRun(
page.title(),
@ -1347,12 +1355,14 @@ self.__bx_behaviors.selectMainBehavior();
);
}
try {
const driverUrl = new URL(this.params.driver, import.meta.url);
this.driver = (await import(driverUrl.href)).default;
} catch (e) {
logger.warn(`Error importing driver ${this.params.driver}`, e);
return;
if (this.params.driver) {
try {
const driverUrl = new URL(this.params.driver, import.meta.url);
this.driver = (await import(driverUrl.href)).default;
} catch (e) {
logger.warn(`Error importing driver ${this.params.driver}`, e);
return;
}
}
await this.initCrawlState();
@ -1741,11 +1751,7 @@ self.__bx_behaviors.selectMainBehavior();
}
}
async loadPage(
page: Page,
data: PageState,
selectorOptsList = DEFAULT_SELECTORS,
) {
async loadPage(page: Page, data: PageState) {
const { url, depth } = data;
const logDetails = data.logDetails;
@ -1946,14 +1952,18 @@ self.__bx_behaviors.selectMainBehavior();
await this.awaitPageLoad(page.mainFrame(), logDetails);
// skip extraction if at max depth
if (seed.isAtMaxDepth(depth, extraHops) || !selectorOptsList) {
logger.debug("Skipping Link Extraction, At Max Depth");
if (seed.isAtMaxDepth(depth, extraHops)) {
logger.debug("Skipping Link Extraction, At Max Depth", {}, "links");
return;
}
logger.debug("Extracting links", logDetails);
logger.debug(
"Extracting links",
{ selectors: this.params.selectLinks, ...logDetails },
"links",
);
await this.extractLinks(page, data, selectorOptsList, logDetails);
await this.extractLinks(page, data, this.params.selectLinks, logDetails);
}
async netIdle(page: Page, details: LogDetails) {
@ -1999,7 +2009,7 @@ self.__bx_behaviors.selectMainBehavior();
async extractLinks(
page: Page,
data: PageState,
selectors = DEFAULT_SELECTORS,
selectors: ExtractSelector[],
logDetails: LogDetails,
) {
const { seedId, depth, extraHops = 0, filteredFrames, callbacks } = data;
@ -2045,11 +2055,7 @@ self.__bx_behaviors.selectMainBehavior();
const frames = filteredFrames || page.frames();
try {
for (const {
selector = "a[href]",
extract = "href",
isAttribute = false,
} of selectors) {
for (const { selector, extract, isAttribute } of selectors) {
await Promise.allSettled(
frames.map((frame) => {
const getLinks = frame

@ -1,15 +0,0 @@
import { Page } from "puppeteer-core";
import { PageState } from "./util/state.js";
import { Crawler } from "./crawler.js";
export default async ({
data,
page,
crawler,
}: {
data: PageState;
page: Page;
crawler: Crawler;
}) => {
await crawler.loadPage(page, data);
};

@ -7,11 +7,15 @@ import { KnownDevices as devices } from "puppeteer-core";
import yargs from "yargs";
import { hideBin } from "yargs/helpers";
import { createParser } from "css-selector-parser";
import {
BEHAVIOR_LOG_FUNC,
WAIT_UNTIL_OPTS,
EXTRACT_TEXT_TYPES,
SERVICE_WORKER_OPTS,
DEFAULT_SELECTORS,
ExtractSelector,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
@ -32,6 +36,8 @@ export type CrawlerArgs = ReturnType<typeof parseArgs> & {
scopedSeeds: ScopedSeed[];
selectLinks: ExtractSelector[];
crawlId: string;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
@ -156,6 +162,14 @@ class ArgParser {
"Allow Hashtag URLs, useful for single-page-application crawling or when different hashtags load dynamic content",
},
selectLinks: {
describe:
"One or more selectors for extracting links, in the format [css selector]->[property to use],[css selector]->@[attribute to use]",
type: "array",
default: ["a[href]->href"],
coerce,
},
blockRules: {
describe:
"Additional rules for blocking certain URLs from being loaded, by URL regex and optionally via text match in an iframe",
@ -200,9 +214,8 @@ class ArgParser {
},
driver: {
describe: "JS driver for the crawler",
describe: "Custom driver for the crawler, if any",
type: "string",
default: "./defaultDriver.js",
},
generateCDX: {
@ -714,6 +727,30 @@ class ArgParser {
}
}
let selectLinks: ExtractSelector[];
const parser = createParser();
if (argv.selectLinks) {
selectLinks = argv.selectLinks.map((x: string) => {
const parts = x.split("->");
const selector = parts[0];
const value = parts[1] || "";
const extract = parts.length > 1 ? value.replace("@", "") : "href";
const isAttribute = value.startsWith("@");
try {
parser(selector);
} catch (e) {
logger.fatal("Invalid Link Extraction CSS Selector", { selector });
}
return { selector, extract, isAttribute };
});
} else {
selectLinks = DEFAULT_SELECTORS;
}
argv.selectLinks = selectLinks;
if (argv.netIdleWait === -1) {
if (argv.scopeType === "page" || argv.scopeType === "page-spa") {
argv.netIdleWait = 15;
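The new `css-selector-parser` dependency is used here just to validate each selector up front, since `createParser()` returns a parse function that throws on malformed input. A minimal sketch of that check in isolation (the `isValidSelector` wrapper is illustrative, not part of the crawler):

```ts
import { createParser } from "css-selector-parser";

const parse = createParser();

// Returns true if the selector parses, false otherwise.
function isValidSelector(selector: string): boolean {
  try {
    parse(selector);
    return true;
  } catch {
    return false;
  }
}

isValidSelector("a[href]");  // true
isValidSelector("script[");  // false -- the case exercised by the invalid-selector test below
```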

@ -30,7 +30,13 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30;
export const PAGE_OP_TIMEOUT_SECS = 5;
export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
export const DEFAULT_SELECTORS = [
export type ExtractSelector = {
selector: string;
extract: string;
isAttribute: boolean;
};
export const DEFAULT_SELECTORS: ExtractSelector[] = [
{
selector: "a[href]",
extract: "href",

@ -1,52 +1,15 @@
import child_process from "child_process";
import fs from "fs";
test("ensure custom driver with custom selector crawls JS files as pages", async () => {
test("ensure custom driver creates PDF", async () => {
try {
child_process.execSync(
"docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs",
"docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs --limit 1",
);
} catch (error) {
console.log(error);
}
const crawledPages = fs.readFileSync(
"test-crawls/collections/custom-driver-1/pages/pages.jsonl",
"utf8",
);
const pages = new Set();
for (const line of crawledPages.trim().split("\n")) {
const url = JSON.parse(line).url;
if (!url) {
continue;
}
pages.add(url);
}
const crawledExtraPages = fs.readFileSync(
"test-crawls/collections/custom-driver-1/pages/extraPages.jsonl",
"utf8",
);
const extraPages = new Set();
for (const line of crawledExtraPages.trim().split("\n")) {
const url = JSON.parse(line).url;
if (!url) {
continue;
}
extraPages.add(url);
}
const expectedPages = new Set([
"https://www.iana.org/",
]);
const expectedExtraPages = new Set([
"https://www.iana.org/_js/jquery.js",
"https://www.iana.org/_js/iana.js",
]);
expect(pages).toEqual(expectedPages);
expect(extraPages).toEqual(expectedExtraPages);
const pdfs = fs.readdirSync("test-crawls/collections/custom-driver-1").filter(x => x.endsWith(".pdf"));
expect(pdfs.length).toBe(1);
});

@ -0,0 +1,68 @@
import child_process from "child_process";
import fs from "fs";
test("test custom selector crawls JS files as pages", async () => {
try {
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-1 --selectLinks \"script[src]->src\"",
);
} catch (error) {
console.log(error);
}
const crawledPages = fs.readFileSync(
"test-crawls/collections/custom-sel-1/pages/pages.jsonl",
"utf8",
);
const pages = new Set();
for (const line of crawledPages.trim().split("\n")) {
const url = JSON.parse(line).url;
if (!url) {
continue;
}
pages.add(url);
}
const crawledExtraPages = fs.readFileSync(
"test-crawls/collections/custom-sel-1/pages/extraPages.jsonl",
"utf8",
);
const extraPages = new Set();
for (const line of crawledExtraPages.trim().split("\n")) {
const url = JSON.parse(line).url;
if (!url) {
continue;
}
extraPages.add(url);
}
const expectedPages = new Set([
"https://www.iana.org/",
]);
const expectedExtraPages = new Set([
"https://www.iana.org/_js/jquery.js",
"https://www.iana.org/_js/iana.js",
]);
expect(pages).toEqual(expectedPages);
expect(extraPages).toEqual(expectedExtraPages);
});
test("test invalid selector, crawl fails", async () => {
let failed = false;
try {
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-invalid --selectLinks \"script[\"",
);
} catch (error) {
failed = true;
}
expect(failed).toBe(true);
});

@ -1,5 +1,5 @@
export default async ({ data, page, crawler }) => {
await crawler.loadPage(page, data, [
{ selector: "script[src]", extract: "src", isAttribute: false },
]);
await crawler.loadPage(page, data);
await page.pdf({"path": `${crawler.collDir}/${data.pageid}.pdf`});
};

@ -1312,16 +1312,16 @@
resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==
"@webrecorder/wabac@^2.20.0-beta.4":
version "2.20.0-beta.4"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.20.0-beta.4.tgz#c60fcd00f449cca52ce1a0bef305a06922c9e3e8"
integrity sha512-enHYcZoqs7cOu2tdTqVeB/zB27uL4wmCMzvF55bJqdB8d5zgPpY+/fpRA3eLxGrPc0nFYAjsI/aNaa62FH7WKQ==
"@webrecorder/wabac@^2.20.0":
version "2.20.1"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.20.1.tgz#58e397e2ef1c33de1bb37aa4f51fc7f3eec8a1f7"
integrity sha512-RX+U6m7aVgvsAfLb9FuLY/PcHCNL5dc1FPaD0GnUiFgswSSe5v4MjIhqJNOnbrJYEcbib81AJfxNuvOyXAJDJQ==
dependencies:
"@peculiar/asn1-ecc" "^2.3.4"
"@peculiar/asn1-schema" "^2.3.3"
"@peculiar/x509" "^1.9.2"
"@types/js-levenshtein" "^1.1.3"
"@webrecorder/wombat" "^3.8.2"
"@webrecorder/wombat" "^3.8.3"
acorn "^8.10.0"
auto-js-ipfs "^2.1.1"
base64-js "^1.5.1"
@ -1342,10 +1342,10 @@
stream-browserify "^3.0.0"
warcio "^2.3.1"
"@webrecorder/wombat@^3.8.2":
version "3.8.2"
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.2.tgz#e46e18719834d633175eec52ce753a4dc4e48e27"
integrity sha512-uUZr9V4UYpVOpM64Tm27ND/hMjDbT37+/qyNaNV6loqDuVzBVQh5w7SfTEy0Bbjj1MYyNZP244mOtWtotTpUEA==
"@webrecorder/wombat@^3.8.3":
version "3.8.3"
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.8.3.tgz#c5a077225d1a70def9fbbbfcd50fa4465d236546"
integrity sha512-dqgoxigB3OdX5JeB3yxJrUNwFwUBlYC+LmGrLEgGeP259MFzXQLD2pmfuqGt5ygWvIv56SrAMV4sUceux07X2A==
dependencies:
warcio "^2.3.1"
@ -1963,6 +1963,11 @@ crypto-random-string@^4.0.0:
dependencies:
type-fest "^1.0.1"
css-selector-parser@^3.0.5:
version "3.0.5"
resolved "https://registry.yarnpkg.com/css-selector-parser/-/css-selector-parser-3.0.5.tgz#9b636ebccf7c4bcce5c1ac21ae27de9f01180ae9"
integrity sha512-3itoDFbKUNx1eKmVpYMFyqKX04Ww9osZ+dLgrk6GEv6KMVeXUhUnp4I5X+evw+u3ZxVU6RFXSSRxlTeMh8bA+g==
data-uri-to-buffer@^5.0.1:
version "5.0.1"
resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-5.0.1.tgz#db89a9e279c2ffe74f50637a59a32fb23b3e4d7c"