Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-10-19 06:23:16 +00:00
case insensitive params (#27)

* make --generateWacz, --generateCdx case insensitive with alias option
* fix eslint config and eslint issues

Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local>
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
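The alias option named above is yargs's: listing the lowercase and camelCase spellings as aliases of the canonical flag makes yargs accept any of them on the command line and mirror the parsed value onto every spelling. A minimal self-contained sketch of the technique (a standalone example, not the crawler's actual option table):

    const yargs = require("yargs");

    // Any listed spelling is accepted; yargs sets the value on all of them.
    const argv = yargs
      .option("generateWACZ", {
        alias: ["generatewacz", "generateWacz"],
        describe: "If set, generate wacz",
        type: "boolean",
        default: false,
      })
      .parse(["--generatewacz"]);

    console.log(argv.generateWACZ, argv.generateWacz, argv.generatewacz);
    // -> true true true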
parent 4d6dcbc3d6
commit 0688674f6f

7 changed files with 1245 additions and 1258 deletions
.eslintrc.js (new file, 14 additions)

@@ -0,0 +1,14 @@
+module.exports = {
+    "env": {
+        "browser": true,
+        "es2021": true,
+        "node": true
+    },
+    "extends": "eslint:recommended",
+    "parserOptions": {
+        "ecmaVersion": 12,
+        "sourceType": "module"
+    },
+    "rules": {
+    }
+};
autofetcher.js

@@ -11,7 +11,7 @@ function autofetcher() {
     'video > source[srcset], video > source[data-srcset], video > source[data-src], ' +
     'audio > source[srcset], audio > source[data-srcset], audio > source[data-src]';
 
-  const SRCSET_REGEX = /\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;
+  const SRCSET_REGEX = /\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/; // eslint-disable-line no-useless-escape
 
   const STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
   const IMPORT_REGEX = /(@import\s*[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
@@ -35,6 +35,7 @@ function autofetcher() {
   }
 
   async run() {
+    /*eslint no-constant-condition: ["error", { "checkLoops": false }]*/
     while (true) {
       this.extractSrcSrcSetAll(document);
       this.extractStyleSheets();
@@ -211,7 +212,7 @@ function autofetcher() {
   }
 
   new AutoFetcher().init();
-};
+}
 
 
 // ===========================================================================
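A note on the first hunk: the eslint-disable comment only silences no-useless-escape for the escaped dot inside the character class; the regex itself is unchanged. For reference, SRCSET_REGEX splits a srcset attribute value into its candidate entries. An illustration of that behavior (the input string is made up for this example, not taken from the repo):

    const SRCSET_REGEX = /\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/; // eslint-disable-line no-useless-escape

    // Split a srcset value on entry boundaries, keeping each candidate.
    const srcset = "small.jpg 320w, large.jpg 2x, https://example.com/huge.jpg";
    const candidates = srcset
      .split(SRCSET_REGEX)
      .filter((v) => v)      // drop empty/undefined split pieces
      .map((v) => v.trim());

    console.log(candidates);
    // -> ["small.jpg 320w", "large.jpg 2x", "https://example.com/huge.jpg"]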
autoplay.js

@@ -88,7 +88,7 @@ function autoplay() {
     }
   }, 3000);
 
-};
+}
 
 
 // ===========================================================================
autoscroll.js

@@ -22,12 +22,12 @@ async function autoScroll() {
 class AutoScrollBehavior
 {
 
-  async beforeLoad(page, crawler) {
+  async beforeLoad() {
   }
 
   async afterLoad(page, crawler) {
     try {
-      await Promise.race([page.evaluate(autoscroll), crawler.sleep(30000)]);
+      await Promise.race([page.evaluate(autoScroll), crawler.sleep(30000)]);
     } catch (e) {
       console.warn("Autoscroll Behavior Failed", e);
     }
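Besides dropping the unused page and crawler parameters from beforeLoad, this hunk fixes a real bug, presumably surfaced by eslint's no-undef rule: autoscroll (lowercase s) is not a defined identifier, so the old call failed with a ReferenceError and the behavior never ran. The surrounding Promise.race is a timeout guard, racing the injected behavior against a fixed sleep so a page that scrolls forever cannot stall the crawl. A minimal sketch of that pattern, with a local sleep helper standing in for crawler.sleep:

    // Whichever promise settles first wins the race; the sleep acts as a cap.
    const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    async function runWithTimeout(page, behavior, timeoutMs = 30000) {
      try {
        await Promise.race([page.evaluate(behavior), sleep(timeoutMs)]);
      } catch (e) {
        console.warn("Behavior failed", e);
      }
    }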
crawler.js

@@ -209,12 +209,14 @@ class Crawler {
     },
 
     "generateCDX": {
+      alias: ["generatecdx", "generateCdx"],
       describe: "If set, generate index (CDXJ) for use with pywb after crawl is done",
       type: "boolean",
       default: false,
     },
 
     "generateWACZ": {
+      alias: ["generatewacz", "generateWacz"],
       describe: "If set, generate wacz",
       type: "boolean",
       default: false,
@@ -464,7 +466,7 @@ class Crawler {
       child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd});
     }
 
-    if (this.params.generateWACZ) {
+    if (this.params.generateWACZ || this.params.generateWacz || this.params.generatewacz ) {
       console.log("Generating WACZ");
 
       const archiveDir = path.join(this.collDir, "archive");
@@ -476,7 +478,7 @@ class Crawler {
       const waczFilename = this.params.collection.concat(".wacz");
       const waczPath = path.join(this.collDir, waczFilename);
       const argument_list = ["create", "-o", waczPath, "--pages", this.pagesFile, "-f"];
-      warcFileList.forEach((val, index) => argument_list.push(path.join(archiveDir, val)));
+      warcFileList.forEach((val, index) => argument_list.push(path.join(archiveDir, val))); // eslint-disable-line no-unused-vars
 
       // Run the wacz create command
       child_process.spawnSync("wacz" , argument_list);
@@ -557,7 +559,6 @@ class Crawler {
 
   writePage(url, title){
     const id = uuidv4();
-    const today = new Date();
     const row = {"id": id, "url": url, "title": title};
     const processedRow = JSON.stringify(row).concat("\n");
     try {
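Two small observations on the crawler.js hunks. First, because the aliases are declared on the option itself, yargs already mirrors the parsed value onto every spelling, so checking all three spellings in the if is defensive rather than strictly necessary. Second, the eslint-disable-line no-unused-vars comment suppresses the warning about the unused index parameter of the forEach callback; an alternative would be to drop the parameter instead (a sketch of that alternative, not what the commit does):

    // Same loop without the unused callback parameter.
    warcFileList.forEach((val) => argument_list.push(path.join(archiveDir, val)));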
package.json

@@ -11,11 +11,11 @@
     "puppeteer-cluster": "^0.22.0",
     "puppeteer-core": "^5.3.1",
     "sitemapper": "^3.1.2",
-    "yargs": "^16.0.3",
-    "uuid": "8.3.2"
+    "uuid": "8.3.2",
+    "yargs": "^16.0.3"
   },
   "devDependencies": {
-    "eslint": "^7.19.0",
+    "eslint": "^7.20.0",
     "eslint-plugin-react": "^7.22.0"
   }
 }