mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 06:23:16 +00:00
lang code fixes: (#834)
- validate --lang values, fail immediately on an invalid ISO-639-1 language code - ignore the --lang value when using a profile, print a warning that the profile language takes precedence - fixes #833
This commit is contained in:
parent
e39d5a31eb
commit
71de8d6582
5 changed files with 44 additions and 1 deletions
|
@ -26,6 +26,7 @@
|
|||
"get-folder-size": "^4.0.0",
|
||||
"husky": "^8.0.3",
|
||||
"ioredis": "^5.3.2",
|
||||
"iso-639-1": "^3.1.5",
|
||||
"js-levenshtein": "^1.1.6",
|
||||
"js-yaml": "^4.1.0",
|
||||
"minio": "^7.1.3",
|
||||
|
|
|
@ -592,7 +592,14 @@ export class Crawler {
|
|||
extraChromeArgs() {
|
||||
const args = [];
|
||||
if (this.params.lang) {
|
||||
args.push(`--accept-lang=${this.params.lang}`);
|
||||
if (this.params.profile) {
|
||||
logger.warn(
|
||||
"Ignoring --lang option with profile, using language configured in the profile",
|
||||
{ lang: this.params.lang },
|
||||
);
|
||||
} else {
|
||||
args.push(`--accept-lang=${this.params.lang}`);
|
||||
}
|
||||
}
|
||||
return args;
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import path from "path";
|
||||
import fs from "fs";
|
||||
import ISO6391 from "iso-639-1";
|
||||
|
||||
import yaml from "js-yaml";
|
||||
import { KnownDevices as devices } from "puppeteer-core";
|
||||
|
@ -770,6 +771,12 @@ class ArgParser {
|
|||
argv.emulateDevice = { viewport: null };
|
||||
}
|
||||
|
||||
if (argv.lang) {
|
||||
if (!ISO6391.validate(argv.lang)) {
|
||||
logger.fatal("Invalid ISO-639-1 country code for --lang: " + argv.lang);
|
||||
}
|
||||
}
|
||||
|
||||
if (argv.seedFile) {
|
||||
const urlSeedFile = fs.readFileSync(argv.seedFile, "utf8");
|
||||
const urlSeedFileList = urlSeedFile.split("\n");
|
||||
|
|
23
tests/lang-code.test.js
Normal file
23
tests/lang-code.test.js
Normal file
|
@ -0,0 +1,23 @@
|
|||
import { execSync } from "child_process";
|
||||
|
||||
// A --lang value that is not a valid code ("e") is expected to make the
// crawler process exit with status 17 instead of running the crawl.
test("run crawl with invalid lang", () => {
  const exitStatus = (() => {
    try {
      execSync(`docker run --rm webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/feed.xml --lang e --limit 1`);
      // Command completed without throwing: report a clean exit.
      return 0;
    } catch (err) {
      // execSync throws on a non-zero exit; the error carries the exit status.
      return err.status;
    }
  })();

  expect(exitStatus).toBe(17);
});
|
||||
|
||||
// A valid --lang value ("en") is expected to let the crawl run to
// completion with a zero exit status.
test("run crawl with valid lang", () => {
  const exitStatus = (() => {
    try {
      execSync(`docker run --rm webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/feed.xml --lang en --limit 1`);
      // Command completed without throwing: report a clean exit.
      return 0;
    } catch (err) {
      // execSync throws on a non-zero exit; the error carries the exit status.
      return err.status;
    }
  })();

  expect(exitStatus).toBe(0);
});
|
||||
|
||||
|
|
@ -3269,6 +3269,11 @@ isexe@^2.0.0:
|
|||
resolved "https://registry.yarnpkg.com/isexe/-/isexe-2.0.0.tgz#e8fbf374dc556ff8947a10dcb0572d633f2cfa10"
|
||||
integrity sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==
|
||||
|
||||
iso-639-1@^3.1.5:
|
||||
version "3.1.5"
|
||||
resolved "https://registry.yarnpkg.com/iso-639-1/-/iso-639-1-3.1.5.tgz#e8205aceeeea0f64d6b12f5fac6a943b0d5b452c"
|
||||
integrity sha512-gXkz5+KN7HrG0Q5UGqSMO2qB9AsbEeyLP54kF1YrMsIxmu+g4BdB7rflReZTSTZGpfj8wywu6pfPBCylPIzGQA==
|
||||
|
||||
istanbul-lib-coverage@^3.0.0, istanbul-lib-coverage@^3.2.0:
|
||||
version "3.2.2"
|
||||
resolved "https://registry.yarnpkg.com/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz#2d166c4b0644d43a39f04bf6c2edd1e585f31756"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue