From 2d2afe7930e61721daabe339ae3f0d134c34eec2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julian=20M=C3=BCller?= <9070224-W13R@users.noreply.gitlab.com>
Date: Sat, 13 Aug 2022 13:15:50 +0200
Subject: [PATCH] Refactor: switch to httpx and asyncio instead of requests
 and threads

---
 lib/args.py                   |   2 -
 lib/graph.py                  |   4 -
 lib/linkmap_from_sitelinks.py | 141 ++++++++++++++++------------------
 linkmapy.py                   |  25 ++--
 requirements.txt              |   2 +-
 5 files changed, 84 insertions(+), 90 deletions(-)

diff --git a/lib/args.py b/lib/args.py
index 24a99a7..c697394 100644
--- a/lib/args.py
+++ b/lib/args.py
@@ -2,9 +2,7 @@
 from argparse import ArgumentParser
 
 argparser = ArgumentParser(description="Map all links on a site (and links on resulting sites)")
-
 argparser.add_argument("url", help="The URL of the site you want to start from")
-
 argparser.add_argument("--dump", action="store_true", help="Only output the found connections to the console and exit")
 argparser.add_argument("--max-depth", metavar="N", type=int, help="The maximum depth at which links will be followed (default: 3)", default=3)
 argparser.add_argument("--max-links-per-site", metavar="N", type=int, help="The maximum amount of links on a page that will be included (default: 3)", default=3)
diff --git a/lib/graph.py b/lib/graph.py
index 34c76aa..2fc6970 100644
--- a/lib/graph.py
+++ b/lib/graph.py
@@ -1,14 +1,11 @@
-
 from pyvis.network import Network
 
 from .linkmap import LinkMap
 
 
 def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
-
     pyvis_net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%", directed=True)
-
     pyvis_net.add_nodes(linkmap.links, size=[8]*len(linkmap.links))
 
     pyvis_net.add_edges(linkmap.link_connections)
@@ -66,6 +63,5 @@ def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
 
     # pyvis_net.show_buttons()
     pyvis_net.set_options(pyvis_options)
-
     return pyvis_net
diff --git a/lib/linkmap_from_sitelinks.py b/lib/linkmap_from_sitelinks.py
index 43d1855..b5c2b93 100644
--- a/lib/linkmap_from_sitelinks.py
+++ b/lib/linkmap_from_sitelinks.py
@@ -1,27 +1,30 @@
+import asyncio
 
-from concurrent.futures import ThreadPoolExecutor
 from html.parser import HTMLParser
 from random import sample
-from requests import get as request_get
 from sys import stderr
 from urllib.parse import urlparse
 
+from httpx import AsyncClient
+
 from .linkmap import LinkMap
 
 
-class _HTMLExternalLinkFinder(HTMLParser):
+class HTMLLinkFinder(HTMLParser):
 
     links = []
 
     def handle_starttag(self, tag, attrs):
-        if tag == "a":
-            for a in attrs:
-                attr, val = a
-                if attr == "href":
-                    if val.startswith("https://") or val.startswith("http://"):
-                        if not val in self.links:
-                            self.links.append(val)
+        if tag != "a":
+            return
+        for a in attrs:
+            attr, val = a
+            if attr != "href":
+                continue
+            if val.startswith("https://") or val.startswith("http://"):
+                if not val in self.links:
+                    self.links.append(val)
 
     def get_links(self, input_html:str):
         self.feed(input_html)
@@ -35,80 +38,67 @@ class LinkMapFromSitelinksGenerator:
 
     generated_linkmap = LinkMap()
     max_links_per_site = 3
     max_depth = 3
-    max_threads = 4
     enable_log = False
 
     def log(self, something):
         if self.enable_log:
             print(something, file=stderr)
 
-    def _get_html(self, url:str) -> str:
-        html_content = ""
-        # receive up to self.site_request_max_len bytes after a maximum of self.site_request_timeout seconds
-        self.log("----" + url)
-        response = request_get(url, stream=True, timeout=self.site_request_timeout)
-        response.raise_for_status()
+    async def _get_html(self, url:str, client:AsyncClient) -> str:
+        content = bytearray()
         content_size = 0
-        content_chunks = []
-        for chunk in response.iter_content(1024, decode_unicode=True):
-            content_size += len(chunk)
-            if content_size > self.site_request_max_len:
-                self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
-                break
-            else:
-                content_chunks.append(chunk)
-        html_content = "".join(content_chunks)
+        # receive up to self.site_request_max_len bytes,
+        # timing out after self.site_request_timeout seconds
+        self.log(f"Request: {url}")
+        async with client.stream(
+            "GET",
+            url,
+            timeout=self.site_request_timeout,
+            follow_redirects=True
+        ) as stream:
+            async for chunk in stream.aiter_bytes(1024):
+                content_size += len(chunk)
+                if content_size > self.site_request_max_len:
+                    self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
+                    break
+                else:
+                    content.extend(chunk)
+        # decode
+        try:
+            html_content = content.decode()
+        except UnicodeDecodeError:
+            self.log(f"Couldn't decode {url}")
+            html_content = ""
         return html_content
 
-    def _get_linked_sites_thread(self, urls:list):
-        def _get_links(url:str):
-            sites = []
-            try:
-                html = self._get_html(url)
-                found_links = _HTMLExternalLinkFinder().get_links(html)
-                found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
-                self.log("\n".join(found_links))
-                for l in found_links:
-                    if l != None:
-                        sites.append(l)
-            except KeyboardInterrupt:
-                exit("KeyboardInterrupt")
-            except Exception as e:
-                self.log("An exception occcured while trying to get links from '" + url + "': ")
-                self.log(e)
-            return sites
-        links = {}
-        for url in urls:
-            links[url] = _get_links(url)
-        return links
+    async def _get_linked_sites_coro(self, url, client:AsyncClient):
+        linked_sites = []
+        try:
+            html = await self._get_html(url, client)
+            found_links = HTMLLinkFinder().get_links(html)
+            found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
+            for l in found_links:
+                self.log(f"Found {l}")
+                if l != None:
+                    linked_sites.append(l)
+        except KeyboardInterrupt:
+            exit("KeyboardInterrupt")
+        except Exception as e:
+            self.log("An exception occurred while trying to get links from '" + url + "': ")
+            self.log(e)
+        return url, linked_sites
 
-    def _get_linked_sites(self, urls:list):
-        # split urls into self.max_threads chunks
-        urlchunks = []
-        chunk_size = max(int(len(urls) / self.max_threads), 1)
-        for i in range(self.max_threads):
-            start = i*chunk_size
-            end = (i*chunk_size)+chunk_size
-            new_chunk = urls[start:end]
-            if len(new_chunk) > 0:
-                urlchunks.append(new_chunk)
-        results = []
-        # threads
-        with ThreadPoolExecutor() as tpe:
-            self.log(f"--Using {len(urlchunks)} concurrent connections...")
-            futures = [tpe.submit(self._get_linked_sites_thread, chunk) for chunk in urlchunks]
-            for f in futures:
-                # wait for results
-                results.append(f.result())
-        results_combined = {}
-        for result_chunk in results:
-            for url in result_chunk:
-                results_combined[url] = result_chunk[url]
-        return results_combined
+    async def _get_linked_sites(self, urls:list, client:AsyncClient):
+        # get results
+        results = await asyncio.gather(*[self._get_linked_sites_coro(url, client) for url in urls])
+        results_as_dict = {}
+        for url, links in results:
+            results_as_dict[url] = links
+        return results_as_dict
 
-    def _generate_linkmap(self, start_urls:list, _current_depth:int):
+    async def _generate_linkmap(self, start_urls:list, _current_depth:int, client:AsyncClient):
         linkdict = {}
-        linked_sites = self._get_linked_sites(start_urls)
+        linked_sites = await self._get_linked_sites(start_urls, client)
         for url in linked_sites:
             linkdict[url] = {}
             self.generated_linkmap.add_link(url)
@@ -118,13 +108,14 @@ class LinkMapFromSitelinksGenerator:
                 self.generated_linkmap.add_link_connection(url, l)#
         if _current_depth < self.max_depth:
             for url in linkdict:
-                linkdict[url] = self._generate_linkmap(list(linkdict[url]), _current_depth + 1)
+                linkdict[url] = await self._generate_linkmap(list(linkdict[url]), _current_depth + 1, client)
 
-    def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
+    async def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
         self.generated_linkmap = LinkMap()
         self.max_links_per_site = max_links_per_site
         self.max_depth = max_depth
-        self._generate_linkmap([start_url], 1)
+        async with AsyncClient() as client:
+            await self._generate_linkmap([start_url], 1, client)
 
     def get_linkmap(self) -> LinkMap:
         return self.generated_linkmap
diff --git a/linkmapy.py b/linkmapy.py
index fb7617d..2a828d9 100755
--- a/linkmapy.py
+++ b/linkmapy.py
@@ -1,13 +1,16 @@
 #!/usr/bin/env python3
 
+# Copyright (c) 2021 Julian Müller (W13R)
+
+
+import asyncio
+
 from lib.args import argparser
 from lib.graph import pyvis_graph_from_linkmap
 from lib.linkmap_from_sitelinks import LinkMapFromSitelinksGenerator
 
 
-if __name__ == "__main__":
-
-    args = argparser.parse_args()
+async def main(args):
 
     nm = LinkMapFromSitelinksGenerator()
     nm.site_request_max_len = args.http_download_limit
     nm.enable_log = args.log
@@ -15,16 +18,22 @@ if __name__ == "__main__":
     starturl = args.url
     if not (starturl.startswith("https://") or starturl.startswith("http://")):
         starturl = "https://" + starturl
-
-    nm.generate(starturl, max_depth=args.max_depth, max_links_per_site=args.max_links_per_site)
+
+    await nm.generate(
+        starturl,
+        max_depth=args.max_depth,
+        max_links_per_site=args.max_links_per_site
+    )
 
     if args.dump:
-        print( "\n".join(str(c) for c in nm.get_linkmap().link_connections) )
-
     else:
-
         pyvis_network_graph = pyvis_graph_from_linkmap(nm.get_linkmap(), heading=starturl)
         pyvis_network_graph.show("output.html")
+
+
+if __name__ == "__main__":
+    args = argparser.parse_args()
+    asyncio.run(main(args))
diff --git a/requirements.txt b/requirements.txt
index 71d9dfe..35bfa31 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
 pyvis
-requests
\ No newline at end of file
+httpx
\ No newline at end of file
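
Note on the pattern introduced here: instead of splitting the URLs into chunks and handing each chunk to a thread, the patched code creates one coroutine per URL, passes a single shared AsyncClient through every call, and collects the results with asyncio.gather. The sketch below is a minimal, self-contained illustration of that pattern, not code from this patch; the names fetch_capped, fetch_all, MAX_BYTES and TIMEOUT, the cap and timeout values, and the example URLs are invented for the example.

import asyncio
from httpx import AsyncClient, HTTPError

MAX_BYTES = 200_000   # assumed download cap, analogous to site_request_max_len
TIMEOUT = 10          # assumed per-request timeout in seconds

async def fetch_capped(client: AsyncClient, url: str) -> str:
    # Stream the response and stop reading once the byte cap is exceeded,
    # mirroring the aiter_bytes() loop in the patched _get_html().
    content = bytearray()
    try:
        async with client.stream("GET", url, timeout=TIMEOUT, follow_redirects=True) as response:
            async for chunk in response.aiter_bytes(1024):
                if len(content) + len(chunk) > MAX_BYTES:
                    break
                content.extend(chunk)
    except HTTPError:
        return ""  # per-URL errors are swallowed, as the patch does in its coroutine
    try:
        return content.decode()
    except UnicodeDecodeError:
        return ""

async def fetch_all(urls: list) -> dict:
    # One shared client, one coroutine per URL, gathered concurrently.
    async with AsyncClient() as client:
        pages = await asyncio.gather(*(fetch_capped(client, u) for u in urls))
    return dict(zip(urls, pages))

if __name__ == "__main__":
    pages = asyncio.run(fetch_all(["https://example.com", "https://example.org"]))
    print({url: len(html) for url, html in pages.items()})

Sharing one AsyncClient for the whole run, as generate() now does with "async with AsyncClient() as client", also lets connections be reused across requests, which the old per-call requests.get approach did not.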