From 2d2afe7930e61721daabe339ae3f0d134c34eec2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julian=20M=C3=BCller?= <9070224-W13R@users.noreply.gitlab.com>
Date: Sat, 13 Aug 2022 13:15:50 +0200
Subject: [PATCH] Refactor: switch to httpx and asyncio instead of requests
 and threads

---
 lib/args.py                   |   2 -
 lib/graph.py                  |   4 -
 lib/linkmap_from_sitelinks.py | 141 ++++++++++++++++------------------
 linkmapy.py                   |  25 ++--
 requirements.txt              |   2 +-
 5 files changed, 84 insertions(+), 90 deletions(-)

diff --git a/lib/args.py b/lib/args.py
index 24a99a7..c697394 100644
--- a/lib/args.py
+++ b/lib/args.py
@@ -2,9 +2,7 @@
 from argparse import ArgumentParser
 
 argparser = ArgumentParser(description="Map all links on a site (and links on resulting sites)")
-
 argparser.add_argument("url", help="The URL of the site you want to start from")
-
 argparser.add_argument("--dump", action="store_true", help="Only output the found connections to the console and exit")
 argparser.add_argument("--max-depth", metavar="N", type=int, help="The maximum depth at which links will be followed (default: 3)", default=3)
 argparser.add_argument("--max-links-per-site", metavar="N", type=int, help="The maximum amount of links on a page that will be included (default: 3)", default=3)
diff --git a/lib/graph.py b/lib/graph.py
index 34c76aa..2fc6970 100644
--- a/lib/graph.py
+++ b/lib/graph.py
@@ -1,14 +1,11 @@
-
 from pyvis.network import Network
 
 from .linkmap import LinkMap
 
 
 def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
-
     pyvis_net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%", directed=True)
-
     pyvis_net.add_nodes(linkmap.links, size=[8]*len(linkmap.links))
 
     pyvis_net.add_edges(linkmap.link_connections)
@@ -66,6 +63,5 @@ def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
 
     # pyvis_net.show_buttons()
     pyvis_net.set_options(pyvis_options)
-
     return pyvis_net
diff --git a/lib/linkmap_from_sitelinks.py b/lib/linkmap_from_sitelinks.py
index 43d1855..b5c2b93 100644
--- a/lib/linkmap_from_sitelinks.py
+++ b/lib/linkmap_from_sitelinks.py
@@ -1,27 +1,30 @@
+import asyncio
 
-from concurrent.futures import ThreadPoolExecutor
 from html.parser import HTMLParser
 from random import sample
-from requests import get as request_get
 from sys import stderr
 from urllib.parse import urlparse
 
+from httpx import AsyncClient
+
 from .linkmap import LinkMap
 
 
-class _HTMLExternalLinkFinder(HTMLParser):
+class HTMLLinkFinder(HTMLParser):
 
     links = []
 
     def handle_starttag(self, tag, attrs):
-        if tag == "a":
-            for a in attrs:
-                attr, val = a
-                if attr == "href":
-                    if val.startswith("https://") or val.startswith("http://"):
-                        if not val in self.links:
-                            self.links.append(val)
+        if tag != "a":
+            return
+        for a in attrs:
+            attr, val = a
+            if attr != "href":
+                continue
+            if val.startswith("https://") or val.startswith("http://"):
+                if not val in self.links:
+                    self.links.append(val)
 
     def get_links(self, input_html:str):
         self.feed(input_html)
@@ -35,80 +38,67 @@ class LinkMapFromSitelinksGenerator:
 
     generated_linkmap = LinkMap()
     max_links_per_site = 3
     max_depth = 3
-    max_threads = 4
     enable_log = False
 
     def log(self, something):
         if self.enable_log:
             print(something, file=stderr)
 
-    def _get_html(self, url:str) -> str:
-        html_content = ""
-        # receive up to self.site_request_max_len bytes after a maximum of self.site_request_timeout seconds
-        self.log("----" + url)
-        response = request_get(url, stream=True, timeout=self.site_request_timeout)
-        response.raise_for_status()
+    async def _get_html(self, url:str, client:AsyncClient) -> str:
+        content = bytearray()
         content_size = 0
-        content_chunks = []
-        for chunk in response.iter_content(1024, decode_unicode=True):
-            content_size += len(chunk)
-            if content_size > self.site_request_max_len:
-                self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
-                break
-            else:
-                content_chunks.append(chunk)
-        html_content = "".join(content_chunks)
+        # receive up to self.site_request_max_len bytes,
+        # timing out after self.site_request_timeout seconds
+        self.log(f"Request: {url}")
+        async with client.stream(
+            "GET",
+            url,
+            timeout=self.site_request_timeout,
+            follow_redirects=True
+        ) as stream:
+            async for chunk in stream.aiter_bytes(1024):
+                content_size += len(chunk)
+                if content_size > self.site_request_max_len:
+                    self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
+                    break
+                else:
+                    content.extend(chunk)
+        # decode
+        try:
+            html_content = content.decode()
+        except UnicodeDecodeError:
+            self.log(f"Couldn't decode {url}")
+            html_content = ""
         return html_content
 
-    def _get_linked_sites_thread(self, urls:list):
-        def _get_links(url:str):
-            sites = []
-            try:
-                html = self._get_html(url)
-                found_links = _HTMLExternalLinkFinder().get_links(html)
-                found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
-                self.log("\n".join(found_links))
-                for l in found_links:
-                    if l != None:
-                        sites.append(l)
-            except KeyboardInterrupt:
-                exit("KeyboardInterrupt")
-            except Exception as e:
-                self.log("An exception occcured while trying to get links from '" + url + "': ")
-                self.log(e)
-            return sites
-        links = {}
-        for url in urls:
-            links[url] = _get_links(url)
-        return links
+    async def _get_linked_sites_coro(self, url, client:AsyncClient):
+        linked_sites = []
+        try:
+            html = await self._get_html(url, client)
+            found_links = HTMLLinkFinder().get_links(html)
+            found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
+            for l in found_links:
+                self.log(f"Found {l}")
+                if l != None:
+                    linked_sites.append(l)
+        except KeyboardInterrupt:
+            exit("KeyboardInterrupt")
+        except Exception as e:
+            self.log("An exception occurred while trying to get links from '" + url + "': ")
+            self.log(e)
+        return url, linked_sites
 
-    def _get_linked_sites(self, urls:list):
-        # split urls into self.max_threads chunks
-        urlchunks = []
-        chunk_size = max(int(len(urls) / self.max_threads), 1)
-        for i in range(self.max_threads):
-            start = i*chunk_size
-            end = (i*chunk_size)+chunk_size
-            new_chunk = urls[start:end]
-            if len(new_chunk) > 0:
-                urlchunks.append(new_chunk)
-        results = []
-        # threads
-        with ThreadPoolExecutor() as tpe:
-            self.log(f"--Using {len(urlchunks)} concurrent connections...")
-            futures = [tpe.submit(self._get_linked_sites_thread, chunk) for chunk in urlchunks]
-            for f in futures:
-                # wait for results
-                results.append(f.result())
-        results_combined = {}
-        for result_chunk in results:
-            for url in result_chunk:
-                results_combined[url] = result_chunk[url]
-        return results_combined
+    async def _get_linked_sites(self, urls:list, client:AsyncClient):
+        # get results
+        results = await asyncio.gather(*[self._get_linked_sites_coro(url, client) for url in urls])
+        results_as_dict = {}
+        for url, links in results:
+            results_as_dict[url] = links
+        return results_as_dict
 
-    def _generate_linkmap(self, start_urls:list, _current_depth:int):
+    async def _generate_linkmap(self, start_urls:list, _current_depth:int, client:AsyncClient):
         linkdict = {}
-        linked_sites = self._get_linked_sites(start_urls)
+        linked_sites = await self._get_linked_sites(start_urls, client)
         for url in linked_sites:
             linkdict[url] = {}
             self.generated_linkmap.add_link(url)
@@ -118,13 +108,14 @@ class LinkMapFromSitelinksGenerator:
                 self.generated_linkmap.add_link_connection(url, l)#
         if _current_depth < self.max_depth:
             for url in linkdict:
-                linkdict[url] = self._generate_linkmap(list(linkdict[url]), _current_depth + 1)
+                linkdict[url] = await self._generate_linkmap(list(linkdict[url]), _current_depth + 1, client)
 
-    def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
+    async def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
         self.generated_linkmap = LinkMap()
         self.max_links_per_site = max_links_per_site
         self.max_depth = max_depth
-        self._generate_linkmap([start_url], 1)
+        async with AsyncClient() as client:
+            await self._generate_linkmap([start_url], 1, client)
 
     def get_linkmap(self) -> LinkMap:
         return self.generated_linkmap
diff --git a/linkmapy.py b/linkmapy.py
index fb7617d..2a828d9 100755
--- a/linkmapy.py
+++ b/linkmapy.py
@@ -1,13 +1,16 @@
 #!/usr/bin/env python3
 
+# Copyright (c) 2021 Julian Müller (W13R)
+
+
+import asyncio
+
 from lib.args import argparser
 from lib.graph import pyvis_graph_from_linkmap
 from lib.linkmap_from_sitelinks import LinkMapFromSitelinksGenerator
 
 
-if __name__ == "__main__":
-
-    args = argparser.parse_args()
+async def main(args):
 
     nm = LinkMapFromSitelinksGenerator()
     nm.site_request_max_len = args.http_download_limit
     nm.enable_log = args.log
@@ -15,16 +18,22 @@ if __name__ == "__main__":
     starturl = args.url
     if not (starturl.startswith("https://") or starturl.startswith("http://")):
         starturl = "https://" + starturl
-
-    nm.generate(starturl, max_depth=args.max_depth, max_links_per_site=args.max_links_per_site)
+
+    await nm.generate(
+        starturl,
+        max_depth=args.max_depth,
+        max_links_per_site=args.max_links_per_site
+    )
 
     if args.dump:
-        print( "\n".join(str(c) for c in nm.get_linkmap().link_connections) )
-
     else:
-
         pyvis_network_graph = pyvis_graph_from_linkmap(nm.get_linkmap(), heading=starturl)
         pyvis_network_graph.show("output.html")
+
+
+if __name__ == "__main__":
+    args = argparser.parse_args()
+    asyncio.run(main(args))
diff --git a/requirements.txt b/requirements.txt
index 71d9dfe..35bfa31 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
 pyvis
-requests
\ No newline at end of file
+httpx
\ No newline at end of file
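
Note on the pattern introduced here: instead of splitting the URLs into chunks and handing each chunk to a thread, the patched code creates one coroutine per URL, passes a single shared AsyncClient through every call, and collects the results with asyncio.gather. The sketch below is a minimal, self-contained illustration of that pattern, not code from this patch; the names fetch_capped, fetch_all, MAX_BYTES and TIMEOUT, the cap and timeout values, and the example URLs are invented for the example.

import asyncio
from httpx import AsyncClient, HTTPError

MAX_BYTES = 200_000   # assumed download cap, analogous to site_request_max_len
TIMEOUT = 10          # assumed per-request timeout in seconds

async def fetch_capped(client: AsyncClient, url: str) -> str:
    # Stream the response and stop reading once the byte cap is exceeded,
    # mirroring the aiter_bytes() loop in the patched _get_html().
    content = bytearray()
    try:
        async with client.stream("GET", url, timeout=TIMEOUT, follow_redirects=True) as response:
            async for chunk in response.aiter_bytes(1024):
                if len(content) + len(chunk) > MAX_BYTES:
                    break
                content.extend(chunk)
    except HTTPError:
        return ""  # per-URL errors are swallowed, as the patch does in its coroutine
    try:
        return content.decode()
    except UnicodeDecodeError:
        return ""

async def fetch_all(urls: list) -> dict:
    # One shared client, one coroutine per URL, gathered concurrently.
    async with AsyncClient() as client:
        pages = await asyncio.gather(*(fetch_capped(client, u) for u in urls))
    return dict(zip(urls, pages))

if __name__ == "__main__":
    pages = asyncio.run(fetch_all(["https://example.com", "https://example.org"]))
    print({url: len(html) for url, html in pages.items()})

Sharing one AsyncClient for the whole run, as generate() now does with "async with AsyncClient() as client", also lets connections be reused across requests, which the old per-call requests.get approach did not.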