mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
case insensitive params (#27)
* make --generateWacz, --generateCdx case insensitive with alias option * fix eslint config and eslint issues Co-authored-by: Emma Dickson <emmadickson@Emmas-MacBook-Pro.local> Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
parent
4d6dcbc3d6
commit
0688674f6f
7 changed files with 1245 additions and 1258 deletions
14
.eslintrc.js
Normal file
14
.eslintrc.js
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
module.exports = {
|
||||||
|
"env": {
|
||||||
|
"browser": true,
|
||||||
|
"es2021": true,
|
||||||
|
"node": true
|
||||||
|
},
|
||||||
|
"extends": "eslint:recommended",
|
||||||
|
"parserOptions": {
|
||||||
|
"ecmaVersion": 12,
|
||||||
|
"sourceType": "module"
|
||||||
|
},
|
||||||
|
"rules": {
|
||||||
|
}
|
||||||
|
};
|
|
@ -11,7 +11,7 @@ function autofetcher() {
|
||||||
'video > source[srcset], video > source[data-srcset], video > source[data-src], ' +
|
'video > source[srcset], video > source[data-srcset], video > source[data-src], ' +
|
||||||
'audio > source[srcset], audio > source[data-srcset], audio > source[data-src]';
|
'audio > source[srcset], audio > source[data-srcset], audio > source[data-src]';
|
||||||
|
|
||||||
const SRCSET_REGEX = /\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;
|
const SRCSET_REGEX = /\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/; // eslint-disable-line no-useless-escape
|
||||||
|
|
||||||
const STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
|
const STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
|
||||||
const IMPORT_REGEX = /(@import\s*[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
|
const IMPORT_REGEX = /(@import\s*[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
|
||||||
|
@ -35,6 +35,7 @@ function autofetcher() {
|
||||||
}
|
}
|
||||||
|
|
||||||
async run() {
|
async run() {
|
||||||
|
/*eslint no-constant-condition: ["error", { "checkLoops": false }]*/
|
||||||
while (true) {
|
while (true) {
|
||||||
this.extractSrcSrcSetAll(document);
|
this.extractSrcSrcSetAll(document);
|
||||||
this.extractStyleSheets();
|
this.extractStyleSheets();
|
||||||
|
@ -211,7 +212,7 @@ function autofetcher() {
|
||||||
}
|
}
|
||||||
|
|
||||||
new AutoFetcher().init();
|
new AutoFetcher().init();
|
||||||
};
|
}
|
||||||
|
|
||||||
|
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
|
|
|
@ -88,7 +88,7 @@ function autoplay() {
|
||||||
}
|
}
|
||||||
}, 3000);
|
}, 3000);
|
||||||
|
|
||||||
};
|
}
|
||||||
|
|
||||||
|
|
||||||
// ===========================================================================
|
// ===========================================================================
|
||||||
|
|
|
@ -22,12 +22,12 @@ async function autoScroll() {
|
||||||
class AutoScrollBehavior
|
class AutoScrollBehavior
|
||||||
{
|
{
|
||||||
|
|
||||||
async beforeLoad(page, crawler) {
|
async beforeLoad() {
|
||||||
}
|
}
|
||||||
|
|
||||||
async afterLoad(page, crawler) {
|
async afterLoad(page, crawler) {
|
||||||
try {
|
try {
|
||||||
await Promise.race([page.evaluate(autoscroll), crawler.sleep(30000)]);
|
await Promise.race([page.evaluate(autoScroll), crawler.sleep(30000)]);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn("Autoscroll Behavior Failed", e);
|
console.warn("Autoscroll Behavior Failed", e);
|
||||||
}
|
}
|
||||||
|
|
|
@ -209,12 +209,14 @@ class Crawler {
|
||||||
},
|
},
|
||||||
|
|
||||||
"generateCDX": {
|
"generateCDX": {
|
||||||
|
alias: ["generatecdx", "generateCdx"],
|
||||||
describe: "If set, generate index (CDXJ) for use with pywb after crawl is done",
|
describe: "If set, generate index (CDXJ) for use with pywb after crawl is done",
|
||||||
type: "boolean",
|
type: "boolean",
|
||||||
default: false,
|
default: false,
|
||||||
},
|
},
|
||||||
|
|
||||||
"generateWACZ": {
|
"generateWACZ": {
|
||||||
|
alias: ["generatewacz", "generateWacz"],
|
||||||
describe: "If set, generate wacz",
|
describe: "If set, generate wacz",
|
||||||
type: "boolean",
|
type: "boolean",
|
||||||
default: false,
|
default: false,
|
||||||
|
@ -464,7 +466,7 @@ class Crawler {
|
||||||
child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd});
|
child_process.spawnSync("wb-manager", ["reindex", this.params.collection], {stdio: "inherit", cwd: this.params.cwd});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.params.generateWACZ) {
|
if (this.params.generateWACZ || this.params.generateWacz || this.params.generatewacz ) {
|
||||||
console.log("Generating WACZ");
|
console.log("Generating WACZ");
|
||||||
|
|
||||||
const archiveDir = path.join(this.collDir, "archive");
|
const archiveDir = path.join(this.collDir, "archive");
|
||||||
|
@ -476,7 +478,7 @@ class Crawler {
|
||||||
const waczFilename = this.params.collection.concat(".wacz");
|
const waczFilename = this.params.collection.concat(".wacz");
|
||||||
const waczPath = path.join(this.collDir, waczFilename);
|
const waczPath = path.join(this.collDir, waczFilename);
|
||||||
const argument_list = ["create", "-o", waczPath, "--pages", this.pagesFile, "-f"];
|
const argument_list = ["create", "-o", waczPath, "--pages", this.pagesFile, "-f"];
|
||||||
warcFileList.forEach((val, index) => argument_list.push(path.join(archiveDir, val)));
|
warcFileList.forEach((val, index) => argument_list.push(path.join(archiveDir, val))); // eslint-disable-line no-unused-vars
|
||||||
|
|
||||||
// Run the wacz create command
|
// Run the wacz create command
|
||||||
child_process.spawnSync("wacz" , argument_list);
|
child_process.spawnSync("wacz" , argument_list);
|
||||||
|
@ -557,7 +559,6 @@ class Crawler {
|
||||||
|
|
||||||
writePage(url, title){
|
writePage(url, title){
|
||||||
const id = uuidv4();
|
const id = uuidv4();
|
||||||
const today = new Date();
|
|
||||||
const row = {"id": id, "url": url, "title": title};
|
const row = {"id": id, "url": url, "title": title};
|
||||||
const processedRow = JSON.stringify(row).concat("\n");
|
const processedRow = JSON.stringify(row).concat("\n");
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -11,11 +11,11 @@
|
||||||
"puppeteer-cluster": "^0.22.0",
|
"puppeteer-cluster": "^0.22.0",
|
||||||
"puppeteer-core": "^5.3.1",
|
"puppeteer-core": "^5.3.1",
|
||||||
"sitemapper": "^3.1.2",
|
"sitemapper": "^3.1.2",
|
||||||
"yargs": "^16.0.3",
|
"uuid": "8.3.2",
|
||||||
"uuid": "8.3.2"
|
"yargs": "^16.0.3"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"eslint": "^7.19.0",
|
"eslint": "^7.20.0",
|
||||||
"eslint-plugin-react": "^7.22.0"
|
"eslint-plugin-react": "^7.22.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue