mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
Validate Autoclick selector, fail crawl if invalid (#800)
Fixes #798. Also modifies the existing link selector validation test to check for exit status code 17 when link selectors fail validation. --------- Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
This commit is contained in:
parent
47d61a6baf
commit
8f581a587c
2 changed files with 47 additions and 5 deletions
|
@ -700,6 +700,9 @@ class ArgParser {
|
|||
argv.crawlId = argv.crawlId || process.env.CRAWL_ID || os.hostname();
|
||||
argv.collection = interpolateFilename(argv.collection, argv.crawlId);
|
||||
|
||||
// css selector parser
|
||||
const parser = createParser();
|
||||
|
||||
// Check that the collection name is valid.
|
||||
if (argv.collection.search(/^[\w][\w-]*$/) === -1) {
|
||||
logger.fatal(
|
||||
|
@ -710,6 +713,16 @@ class ArgParser {
|
|||
// background behaviors to apply
|
||||
const behaviorOpts: { [key: string]: string | boolean } = {};
|
||||
if (argv.behaviors.length > 0) {
|
||||
if (argv.clickSelector) {
|
||||
try {
|
||||
parser(argv.clickSelector);
|
||||
} catch (e) {
|
||||
logger.fatal("Invalid Autoclick CSS Selector", {
|
||||
selector: argv.clickSelector,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
argv.behaviors.forEach((x: string) => {
|
||||
if (BEHAVIOR_TYPES.includes(x)) {
|
||||
behaviorOpts[x] = true;
|
||||
|
@ -761,8 +774,6 @@ class ArgParser {
|
|||
|
||||
let selectLinks: ExtractSelector[];
|
||||
|
||||
const parser = createParser();
|
||||
|
||||
if (argv.selectLinks) {
|
||||
selectLinks = argv.selectLinks.map((x: string) => {
|
||||
const parts = x.split("->");
|
||||
|
|
|
@ -53,16 +53,47 @@ test("test custom selector crawls JS files as pages", async () => {
|
|||
|
||||
|
||||
test("test invalid selector, crawl fails", async () => {
|
||||
let failed = false;
|
||||
let status = 0;
|
||||
try {
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-invalid --selectLinks \"script[\"",
|
||||
);
|
||||
} catch (error) {
|
||||
} catch (e) {
|
||||
status = e.status;
|
||||
}
|
||||
|
||||
// logger fatal exit code
|
||||
expect(status).toBe(17);
|
||||
});
|
||||
|
||||
test("test valid autoclick selector passes validation", async () => {
|
||||
let failed = false;
|
||||
|
||||
try {
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --clickSelector button --scopeType page",
|
||||
);
|
||||
} catch (e) {
|
||||
failed = true;
|
||||
}
|
||||
|
||||
expect(failed).toBe(true);
|
||||
// valid clickSelector
|
||||
expect(failed).toBe(false);
|
||||
});
|
||||
|
||||
|
||||
test("test invalid autoclick selector fails validation, crawl fails", async () => {
|
||||
let status = 0;
|
||||
|
||||
try {
|
||||
child_process.execSync(
|
||||
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --clickSelector \",\" --scopeType page",
|
||||
);
|
||||
} catch (e) {
|
||||
status = e.status;
|
||||
}
|
||||
|
||||
// logger fatal exit code
|
||||
expect(status).toBe(17);
|
||||
});
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue