Capture Favicon (#362)

- get favicon from CDP debug page, if available, log warning if not
- store in favIconUrl in pages.jsonl
- test: add test for favIcon and additional multi-page crawls
This commit is contained in:
Anish Lakhwara 2023-09-10 11:29:35 -07:00 committed by GitHub
parent d42010a598
commit 1c486ea1f3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 44 additions and 1 deletions

View file

@ -435,6 +435,19 @@ self.__bx_behaviors.selectMainBehavior();
return str; return str;
} }
async getFavicon(page, logDetails) {
const resp = await fetch("http://localhost:9221/json");
if (resp.status === 200) {
const browserJson = await resp.json();
for (const jsons of browserJson) {
if (jsons.id === page.target()._targetId) {
return jsons.faviconUrl;
}
}
}
logger.warn("Failed to fetch Favicon from localhost debugger", logDetails);
}
async crawlPage(opts) { async crawlPage(opts) {
await this.writeStats(); await this.writeStats();
@ -455,6 +468,7 @@ self.__bx_behaviors.selectMainBehavior();
await this.driver({page, data, crawler: this}); await this.driver({page, data, crawler: this});
data.title = await page.title(); data.title = await page.title();
data.favicon = await this.getFavicon(page, logDetails);
if (this.params.screenshot) { if (this.params.screenshot) {
if (!data.isHTMLPage) { if (!data.isHTMLPage) {
@ -1313,7 +1327,7 @@ self.__bx_behaviors.selectMainBehavior();
} }
} }
async writePage({url, depth, title, text, loadState}) { async writePage({url, depth, title, text, loadState, favicon}) {
const id = uuidv4(); const id = uuidv4();
const row = {id, url, title, loadState}; const row = {id, url, title, loadState};
@ -1325,6 +1339,10 @@ self.__bx_behaviors.selectMainBehavior();
row.text = text; row.text = text;
} }
if (favicon !== null) {
row.favIconUrl = favicon;
}
const processedRow = JSON.stringify(row) + "\n"; const processedRow = JSON.stringify(row) + "\n";
try { try {
await this.pagesFH.writeFile(processedRow); await this.pagesFH.writeFile(processedRow);

View file

@ -0,0 +1,25 @@
import child_process from "child_process";
import fs from "fs";
// Runs a two-seed crawl in docker, then validates the WACZ it produced.
// execSync throws if a command exits with a non-zero status, which fails the test.
test("ensure multi url crawl run with docker run passes", async () => {
  const crawlCommand = "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title \"test title\" --description \"test description\" --pages 2 --limit 2";
  const validateCommand = "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz";

  // Order matters: the WACZ must exist before it can be validated.
  for (const command of [crawlCommand, validateCommand]) {
    child_process.execSync(command);
  }
});
// Checks that the favicon URL of each crawled page was written to
// pages.jsonl under the favIconUrl key.
test("check that the favicon made it into the pages jsonl file", () => {
  const pagesPath = "test-crawls/collections/advanced/pages/pages.jsonl";
  expect(fs.existsSync(pagesPath)).toBe(true);

  // Expected favicon per page URL. The IANA key uses the "www." host so it
  // matches the seed URL given to the crawl; the previous "https://iana.org/"
  // key never matched, so that assertion was silently skipped.
  const expectedFavicons = new Map([
    ["https://webrecorder.net/", "https://webrecorder.net/assets/favicon.ico"],
    ["https://www.iana.org/", "https://www.iana.org/_img/bookmark_icon.ico"],
  ]);

  // Read the file once; lines at index 1 and 2 are parsed as the page records
  // (index 0 is not inspected — presumably a header line; verify against the writer).
  const lines = fs.readFileSync(pagesPath, "utf8").split("\n");
  const pages = [lines[1], lines[2]].map((line) => JSON.parse(line));

  for (const page of pages) {
    if (expectedFavicons.has(page.url)) {
      expect(page.favIconUrl).toEqual(expectedFavicons.get(page.url));
    }
  }
});