mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-10-19 14:33:17 +00:00
Capture Favicon (#362)
- get favicon from CDP debug page, if available, log warning if not
- store in favIconUrl in pages.jsonl
- test: add test for favIcon and additional multi-page crawls
This commit is contained in:
parent
d42010a598
commit
1c486ea1f3
2 changed files with 44 additions and 1 deletions
20
crawler.js
20
crawler.js
|
@ -435,6 +435,19 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
return str;
|
return str;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async getFavicon(page, logDetails) {
|
||||||
|
const resp = await fetch("http://localhost:9221/json");
|
||||||
|
if (resp.status === 200) {
|
||||||
|
const browserJson = await resp.json();
|
||||||
|
for (const jsons of browserJson) {
|
||||||
|
if (jsons.id === page.target()._targetId) {
|
||||||
|
return jsons.faviconUrl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logger.warn("Failed to fetch Favicon from localhost debugger", logDetails);
|
||||||
|
}
|
||||||
|
|
||||||
async crawlPage(opts) {
|
async crawlPage(opts) {
|
||||||
await this.writeStats();
|
await this.writeStats();
|
||||||
|
|
||||||
|
@ -455,6 +468,7 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
await this.driver({page, data, crawler: this});
|
await this.driver({page, data, crawler: this});
|
||||||
|
|
||||||
data.title = await page.title();
|
data.title = await page.title();
|
||||||
|
data.favicon = await this.getFavicon(page, logDetails);
|
||||||
|
|
||||||
if (this.params.screenshot) {
|
if (this.params.screenshot) {
|
||||||
if (!data.isHTMLPage) {
|
if (!data.isHTMLPage) {
|
||||||
|
@ -1313,7 +1327,7 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async writePage({url, depth, title, text, loadState}) {
|
async writePage({url, depth, title, text, loadState, favicon}) {
|
||||||
const id = uuidv4();
|
const id = uuidv4();
|
||||||
const row = {id, url, title, loadState};
|
const row = {id, url, title, loadState};
|
||||||
|
|
||||||
|
@ -1325,6 +1339,10 @@ self.__bx_behaviors.selectMainBehavior();
|
||||||
row.text = text;
|
row.text = text;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (favicon !== null) {
|
||||||
|
row.favIconUrl = favicon;
|
||||||
|
}
|
||||||
|
|
||||||
const processedRow = JSON.stringify(row) + "\n";
|
const processedRow = JSON.stringify(row) + "\n";
|
||||||
try {
|
try {
|
||||||
await this.pagesFH.writeFile(processedRow);
|
await this.pagesFH.writeFile(processedRow);
|
||||||
|
|
25
tests/mult_url_crawl_with_favicon.test.js
Normal file
25
tests/mult_url_crawl_with_favicon.test.js
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
import child_process from "child_process";
|
||||||
|
import fs from "fs";
|
||||||
|
|
||||||
|
// Runs a two-seed crawl in the crawler docker image, then validates the
// generated WACZ. execSync throws on a non-zero exit code, which fails the test.
test("ensure multi url crawl run with docker run passes", async () => {
  const crawlCommand = "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title \"test title\" --description \"test description\" --pages 2 --limit 2";
  const validateCommand = "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz";

  // Order matters: the WACZ must exist before it can be validated.
  for (const command of [crawlCommand, validateCommand]) {
    child_process.execSync(command);
  }
});
|
||||||
|
|
||||||
|
// Verifies that pages.jsonl carries the expected favIconUrl for each crawled
// seed page.
test("check that the favicon made it into the pages jsonl file", () => {
  const pagesFile = "test-crawls/collections/advanced/pages/pages.jsonl";
  expect(fs.existsSync(pagesFile)).toBe(true);

  // Read the file once. Entries at index 1 and 2 are the two page records;
  // index 0 is skipped (presumably a header record -- confirm against writer).
  const lines = fs.readFileSync(pagesFile, "utf8").split("\n");
  const data = [lines[1], lines[2]].map((line) => JSON.parse(line));

  // Keys must match the seed URLs given to the crawl exactly; the iana seed
  // is "https://www.iana.org/" (with "www."), otherwise its check never runs.
  const expectedFavicons = {
    "https://webrecorder.net/": "https://webrecorder.net/assets/favicon.ico",
    "https://www.iana.org/": "https://www.iana.org/_img/bookmark_icon.ico",
  };

  for (const d of data) {
    const expected = expectedFavicons[d.url];
    if (expected !== undefined) {
      expect(d.favIconUrl).toEqual(expected);
    }
  }
});
|
Loading…
Add table
Add a link
Reference in a new issue