Validate Autoclick selector, fail crawl if invalid (#800)

Fixes #798 

Also modifies the existing test for link selector validation to check for
exit status code 17 when link selectors fail validation.

---------

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
This commit is contained in:
Tessa Walsh 2025-03-30 16:48:41 -04:00 committed by GitHub
parent 47d61a6baf
commit 8f581a587c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 47 additions and 5 deletions

View file

@ -700,6 +700,9 @@ class ArgParser {
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname(); argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname();
argv.collection = interpolateFilename(argv.collection, argv.crawlId); argv.collection = interpolateFilename(argv.collection, argv.crawlId);
// css selector parser
const parser = createParser();
// Check that the collection name is valid. // Check that the collection name is valid.
if (argv.collection.search(/^[\w][\w-]*$/) === -1) { if (argv.collection.search(/^[\w][\w-]*$/) === -1) {
logger.fatal( logger.fatal(
@ -710,6 +713,16 @@ class ArgParser {
// background behaviors to apply // background behaviors to apply
const behaviorOpts: { [key: string]: string | boolean } = {}; const behaviorOpts: { [key: string]: string | boolean } = {};
if (argv.behaviors.length > 0) { if (argv.behaviors.length > 0) {
if (argv.clickSelector) {
try {
parser(argv.clickSelector);
} catch (e) {
logger.fatal("Invalid Autoclick CSS Selector", {
selector: argv.clickSelector,
});
}
}
argv.behaviors.forEach((x: string) => { argv.behaviors.forEach((x: string) => {
if (BEHAVIOR_TYPES.includes(x)) { if (BEHAVIOR_TYPES.includes(x)) {
behaviorOpts[x] = true; behaviorOpts[x] = true;
@ -761,8 +774,6 @@ class ArgParser {
let selectLinks: ExtractSelector[]; let selectLinks: ExtractSelector[];
const parser = createParser();
if (argv.selectLinks) { if (argv.selectLinks) {
selectLinks = argv.selectLinks.map((x: string) => { selectLinks = argv.selectLinks.map((x: string) => {
const parts = x.split("->"); const parts = x.split("->");

View file

@ -53,16 +53,47 @@ test("test custom selector crawls JS files as pages", async () => {
test("test invalid selector, crawl fails", async () => { test("test invalid selector, crawl fails", async () => {
let failed = false; let status = 0;
try { try {
child_process.execSync( child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-invalid --selectLinks \"script[\"", "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-invalid --selectLinks \"script[\"",
); );
} catch (error) { } catch (e) {
status = e.status;
}
// logger fatal exit code
expect(status).toBe(17);
});
test("test valid autoclick selector passes validation", async () => {
let failed = false;
try {
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --clickSelector button --scopeType page",
);
} catch (e) {
failed = true; failed = true;
} }
expect(failed).toBe(true); // valid clickSelector
expect(failed).toBe(false);
}); });
// Runs the crawler in Docker with an invalid --clickSelector value (",") and
// checks that the process exits with status 17.
test("test invalid autoclick selector fails validation, crawl fails", async () => {
// Stays 0 unless execSync throws below.
let status = 0;
try {
child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --clickSelector \",\" --scopeType page",
);
} catch (e) {
// execSync throws when the child exits non-zero; the thrown error exposes
// the child's exit code as `status`.
status = e.status;
}
// logger fatal exit code
expect(status).toBe(17);
});