Mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-10-19 14:33:17 +00:00)
Capture Favicon (#362)
- get favicon from CDP debug page, if available; log warning if not
- store in favIconUrl in pages.jsonl
- test: add test for favIcon and additional multi-page crawls
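For context: the new getFavicon helper queries the browser's DevTools HTTP endpoint (exposed here on port 9221), which returns a JSON array of target descriptions, and matches the current page's target id against the list to read its faviconUrl. An illustrative entry is sketched below; the values are made up and the exact field set varies by Chromium version:

    [
      {
        "id": "E8C24D6B...",
        "type": "page",
        "title": "Webrecorder",
        "url": "https://webrecorder.net/",
        "faviconUrl": "https://webrecorder.net/assets/favicon.ico",
        "webSocketDebuggerUrl": "ws://localhost:9221/devtools/page/E8C24D6B..."
      }
    ]

The "id" field is what the helper compares against page.target()._targetId.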
Parent: d42010a598
Commit: 1c486ea1f3
2 changed files with 44 additions and 1 deletion
crawler.js | 20 +++++++++++++++++++-
@@ -435,6 +435,19 @@ self.__bx_behaviors.selectMainBehavior();
     return str;
   }
 
+  async getFavicon(page, logDetails) {
+    const resp = await fetch("http://localhost:9221/json");
+    if (resp.status === 200) {
+      const browserJson = await resp.json();
+      for (const jsons of browserJson) {
+        if (jsons.id === page.target()._targetId) {
+          return jsons.faviconUrl;
+        }
+      }
+    }
+    logger.warn("Failed to fetch Favicon from localhost debugger", logDetails);
+  }
+
   async crawlPage(opts) {
     await this.writeStats();
 
@@ -455,6 +468,7 @@ self.__bx_behaviors.selectMainBehavior();
     await this.driver({page, data, crawler: this});
 
     data.title = await page.title();
+    data.favicon = await this.getFavicon(page, logDetails);
 
     if (this.params.screenshot) {
       if (!data.isHTMLPage) {
@@ -1313,7 +1327,7 @@ self.__bx_behaviors.selectMainBehavior();
     }
   }
 
-  async writePage({url, depth, title, text, loadState}) {
+  async writePage({url, depth, title, text, loadState, favicon}) {
     const id = uuidv4();
     const row = {id, url, title, loadState};
 
@@ -1325,6 +1339,10 @@ self.__bx_behaviors.selectMainBehavior();
       row.text = text;
     }
 
+    if (favicon !== null) {
+      row.favIconUrl = favicon;
+    }
+
     const processedRow = JSON.stringify(row) + "\n";
     try {
       await this.pagesFH.writeFile(processedRow);
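With the writePage change, each page record written to pages.jsonl carries a favIconUrl field whenever a favicon was found. An illustrative row is shown below; the id and loadState values are made up, only the favIconUrl field is what this commit adds:

    {"id": "6c3f5b1e-...", "url": "https://webrecorder.net/", "title": "Webrecorder", "loadState": 4, "favIconUrl": "https://webrecorder.net/assets/favicon.ico"}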
tests/mult_url_crawl_with_favicon.test.js (new file) | 25 +++++++++++++++++++++++++
@@ -0,0 +1,25 @@
+import child_process from "child_process";
+import fs from "fs";
+
+test("ensure multi url crawl run with docker run passes", async () => {
+  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title \"test title\" --description \"test description\" --pages 2 --limit 2");
+
+  child_process.execSync("docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler wacz validate --file collections/advanced/advanced.wacz");
+
+});
+
+test("check that the favicon made it into the pages jsonl file", () => {
+  expect(fs.existsSync("test-crawls/collections/advanced/pages/pages.jsonl")).toBe(true);
+
+  const data1 = JSON.parse(fs.readFileSync("test-crawls/collections/advanced/pages/pages.jsonl", "utf8").split("\n")[1]);
+  const data2 = JSON.parse(fs.readFileSync("test-crawls/collections/advanced/pages/pages.jsonl", "utf8").split("\n")[2]);
+  const data = [ data1, data2 ];
+  for (const d of data) {
+    if (d.url === "https://webrecorder.net/") {
+      expect(d.favIconUrl).toEqual("https://webrecorder.net/assets/favicon.ico");
+    }
+    if (d.url === "https://iana.org/") {
+      expect(d.favIconUrl).toEqual("https://www.iana.org/_img/bookmark_icon.ico");
+    }
+  }
+});
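The test parses lines [1] and [2] of pages.jsonl because the first line is the file's header record, so page entries start at index 1. A minimal sketch of how a consumer might read favicons back out of a crawl, under the same path and header-line assumptions as the test:

    import fs from "fs";

    // Skip the header record on the first line; remaining non-empty lines are page entries.
    const lines = fs
      .readFileSync("test-crawls/collections/advanced/pages/pages.jsonl", "utf8")
      .split("\n")
      .filter((line) => line.length > 0)
      .slice(1);

    for (const line of lines) {
      const { url, favIconUrl } = JSON.parse(line);
      console.log(url, favIconUrl);
    }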