From 67e7bcc6fb90ec869d3e61543baa8f788b8c5f9c Mon Sep 17 00:00:00 2001
From: W13R <9070224-W13R@users.noreply.gitlab.com>
Date: Sun, 12 Dec 2021 17:45:38 +0100
Subject: [PATCH] Added concurrency for requests to speed things up

---
 lib/graph.py                  |   2 +-
 lib/linkmap.py                | 102 --------------------------
 lib/linkmap_from_sitelinks.py | 130 ++++++++++++++++++++++++++++++++++
 linkmapy.py                   |   7 +-
 4 files changed, 132 insertions(+), 109 deletions(-)
 create mode 100644 lib/linkmap_from_sitelinks.py

diff --git a/lib/graph.py b/lib/graph.py
index 744a5a2..2c8c61e 100644
--- a/lib/graph.py
+++ b/lib/graph.py
@@ -6,7 +6,7 @@ from pyvis.network import Network
 
 
 def pyvis_graph_from_pandas_DF(pandas_df:DataFrame, source_column:str="link1", target_column:str="link2", heading:str=None) -> Network:
     nx = from_pandas_edgelist(pandas_df, source=source_column, target=target_column)
-    pyvis_net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%")
+    pyvis_net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%", directed=True)
     pyvis_net.from_nx(nx, default_node_size=8)
     if heading != None:
diff --git a/lib/linkmap.py b/lib/linkmap.py
index daf61e2..c3db5f1 100644
--- a/lib/linkmap.py
+++ b/lib/linkmap.py
@@ -1,30 +1,4 @@
-
-from html.parser import HTMLParser
-from random import sample
-from requests import get as request_get
-from sys import stderr
-from urllib.parse import urlparse
-
-
-class _HTMLExternalLinkFinder(HTMLParser):
-
-    links = []
-
-    def handle_starttag(self, tag, attrs):
-        if tag == "a":
-            for a in attrs:
-                attr, val = a
-                if attr == "href":
-                    if val.startswith("https://") or val.startswith("http://"):
-                        if not val in self.links:
-                            self.links.append(val)
-
-    def get_links(self, input_html:str):
-        self.feed(input_html)
-        return self.links
-
-
 class LinkMap:
 
     links = []
 
@@ -42,79 +16,3 @@ class LinkMap:
     def add_link(self, link):
         if not link in self.links:
             self.links.append(link)
-
-
-class LinkMapFromSitelinksGenerator:
-
-    site_request_max_len = 10000000 # bytes
-    site_request_timeout = 10 # seconds
-    already_visited = []
-    generated_linkmap = LinkMap()
-    max_links_per_site = 3
-    max_depth = 3
-    enable_log = False
-
-    def log(self, something):
-        if self.enable_log:
-            print(something, file=stderr)
-
-    def _get_html(self, url:str) -> str:
-        html_content = ""
-        # receive up to self.site_request_max_len bytes after a maximum of self.site_request_timeout seconds
-        self.log("-----" + url)
-        response = request_get(url, stream=True, timeout=self.site_request_timeout)
-        response.raise_for_status()
-        content_size = 0
-        content_chunks = []
-        for chunk in response.iter_content(1024, decode_unicode=True):
-            content_size += len(chunk)
-            if content_size > self.site_request_max_len:
-                self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
-                break
-            else:
-                content_chunks.append(chunk)
-        html_content = "".join(content_chunks)
-        return html_content
-
-    def _get_linked_sites(self, url:str):
-        sites = []
-        if not url in self.already_visited:
-            try:
-                html = self._get_html(url)
-                found_links = _HTMLExternalLinkFinder().get_links(html)
-                found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
-                self.log("\n".join(found_links))
-                for l in found_links:
-                    if l != None:
-                        sites.append(l)
-                self.already_visited.append(url)
-            except KeyboardInterrupt:
-                exit("KeyboardInterrupt")
-            except Exception as e:
-                self.log("An exception occcured while trying to get links from '" + url + "': ")
-                self.log(e)
-        return sites
-
-    def _generate_linkmap(self, start_urls:list, _current_depth:int):
-        linkdict = {}
-        for url in start_urls:
-            linkdict[url] = {}
-            self.generated_linkmap.add_link(url)
-            linked_sites = self._get_linked_sites(url)
-            for l in linked_sites:
-                if l != url:
-                    linkdict[url][l] = {}
-                    self.generated_linkmap.add_link_connection(url, l)
-        if _current_depth < self.max_depth:
-            for url in linkdict:
-                linkdict[url] = self._generate_linkmap(list(linkdict[url]), _current_depth + 1)
-
-    def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
-        self.already_visited = []
-        self.generated_linkmap = LinkMap()
-        self.max_links_per_site = max_links_per_site
-        self.max_depth = max_depth
-        self._generate_linkmap([start_url], 1)
-
-    def get_linkmap(self) -> LinkMap:
-        return self.generated_linkmap
diff --git a/lib/linkmap_from_sitelinks.py b/lib/linkmap_from_sitelinks.py
new file mode 100644
index 0000000..43d1855
--- /dev/null
+++ b/lib/linkmap_from_sitelinks.py
@@ -0,0 +1,130 @@
+
+
+from concurrent.futures import ThreadPoolExecutor
+from html.parser import HTMLParser
+from random import sample
+from requests import get as request_get
+from sys import stderr
+from urllib.parse import urlparse
+
+from .linkmap import LinkMap
+
+
+class _HTMLExternalLinkFinder(HTMLParser):
+
+    links = []
+
+    def handle_starttag(self, tag, attrs):
+        if tag == "a":
+            for a in attrs:
+                attr, val = a
+                if attr == "href":
+                    if val.startswith("https://") or val.startswith("http://"):
+                        if not val in self.links:
+                            self.links.append(val)
+
+    def get_links(self, input_html:str):
+        self.feed(input_html)
+        return self.links
+
+
+class LinkMapFromSitelinksGenerator:
+
+    site_request_max_len = 10000000 # bytes
+    site_request_timeout = 10 # seconds
+    generated_linkmap = LinkMap()
+    max_links_per_site = 3
+    max_depth = 3
+    max_threads = 4
+    enable_log = False
+
+    def log(self, something):
+        if self.enable_log:
+            print(something, file=stderr)
+
+    def _get_html(self, url:str) -> str:
+        html_content = ""
+        # receive up to self.site_request_max_len bytes after a maximum of self.site_request_timeout seconds
+        self.log("----" + url)
+        response = request_get(url, stream=True, timeout=self.site_request_timeout)
+        response.raise_for_status()
+        content_size = 0
+        content_chunks = []
+        for chunk in response.iter_content(1024, decode_unicode=True):
+            content_size += len(chunk)
+            if content_size > self.site_request_max_len:
+                self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
+                break
+            else:
+                content_chunks.append(chunk)
+        html_content = "".join(content_chunks)
+        return html_content
+
+    def _get_linked_sites_thread(self, urls:list):
+        def _get_links(url:str):
+            sites = []
+            try:
+                html = self._get_html(url)
+                found_links = _HTMLExternalLinkFinder().get_links(html)
+                found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
+                self.log("\n".join(found_links))
+                for l in found_links:
+                    if l != None:
+                        sites.append(l)
+            except KeyboardInterrupt:
+                exit("KeyboardInterrupt")
+            except Exception as e:
+                self.log("An exception occurred while trying to get links from '" + url + "': ")
+                self.log(e)
+            return sites
+        links = {}
+        for url in urls:
+            links[url] = _get_links(url)
+        return links
+
+    def _get_linked_sites(self, urls:list):
+        # split urls into self.max_threads chunks
+        urlchunks = []
+        chunk_size = max(int(len(urls) / self.max_threads), 1)
+        for i in range(self.max_threads):
+            start = i*chunk_size
+            end = (i*chunk_size)+chunk_size
+            new_chunk = urls[start:end]
+            if len(new_chunk) > 0:
+                urlchunks.append(new_chunk)
+        results = []
+        # threads
+        with ThreadPoolExecutor() as tpe:
+            self.log(f"--Using {len(urlchunks)} concurrent connections...")
+            futures = [tpe.submit(self._get_linked_sites_thread, chunk) for chunk in urlchunks]
+            for f in futures:
+                # wait for results
+                results.append(f.result())
+        results_combined = {}
+        for result_chunk in results:
+            for url in result_chunk:
+                results_combined[url] = result_chunk[url]
+        return results_combined
+
+    def _generate_linkmap(self, start_urls:list, _current_depth:int):
+        linkdict = {}
+        linked_sites = self._get_linked_sites(start_urls)
+        for url in linked_sites:
+            linkdict[url] = {}
+            self.generated_linkmap.add_link(url)
+            for l in linked_sites[url]:
+                if l != url:
+                    linkdict[url][l] = {}
+                    self.generated_linkmap.add_link_connection(url, l)
+        if _current_depth < self.max_depth:
+            for url in linkdict:
+                linkdict[url] = self._generate_linkmap(list(linkdict[url]), _current_depth + 1)
+
+    def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
+        self.generated_linkmap = LinkMap()
+        self.max_links_per_site = max_links_per_site
+        self.max_depth = max_depth
+        self._generate_linkmap([start_url], 1)
+
+    def get_linkmap(self) -> LinkMap:
+        return self.generated_linkmap
diff --git a/linkmapy.py b/linkmapy.py
index 768819e..1bd520b 100755
--- a/linkmapy.py
+++ b/linkmapy.py
@@ -1,15 +1,10 @@
 #!/usr/bin/env python3
 
-import json
-
-from sys import stderr
-
 from lib.args import argparser
 from lib.graph import pyvis_graph_from_pandas_DF
-from lib.linkmap import LinkMapFromSitelinksGenerator
+from lib.linkmap_from_sitelinks import LinkMapFromSitelinksGenerator
 from lib.linkmap2pandasdf import linkmap2pandasDF
-
 
 
 if __name__ == "__main__":
     args = argparser.parse_args()
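
A minimal usage sketch of the new generator, for reference; the start URL, depth, and output file name are illustrative, and linkmap2pandasDF is assumed to accept the returned LinkMap the same way the existing linkmapy.py flow does:

from lib.graph import pyvis_graph_from_pandas_DF
from lib.linkmap2pandasdf import linkmap2pandasDF
from lib.linkmap_from_sitelinks import LinkMapFromSitelinksGenerator

# Crawl two levels deep, following at most 3 external links per page.
# Each depth level's URLs are split into chunks and fetched concurrently
# by the generator's thread pool (max_threads, 4 by default).
generator = LinkMapFromSitelinksGenerator()
generator.enable_log = True   # log progress to stderr
generator.generate("https://example.com", max_depth=2, max_links_per_site=3)

# Turn the collected link connections into an edge list and render the
# directed pyvis graph (directed=True is the graph.py change above).
df = linkmap2pandasDF(generator.get_linkmap())   # assumed to take a LinkMap
net = pyvis_graph_from_pandas_DF(df, heading="example.com link map")
net.show("linkmap.html")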