Compare commits


No commits in common. "main" and "1" have entirely different histories.
main ... 1

8 changed files with 95 additions and 107 deletions

LICENSE (deleted, 21 lines)

@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2022 Julian Müller (ChaoticByte)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

@@ -8,10 +8,13 @@ Create a graph from the links on a website and the following sites.
### Pip Dependencies
- pyvis
- requests
You can install those dependencies with
```
pip install -r requirements.txt
pip install -r pip-dependencies.txt
```
## Usage

lib/args.py

@@ -2,7 +2,9 @@
from argparse import ArgumentParser
argparser = ArgumentParser(description="Map all links on a site (and links on resulting sites)")
argparser.add_argument("url", help="The URL of the site you want to start from")
argparser.add_argument("--dump", action="store_true", help="Only output the found connections to the console and exit")
argparser.add_argument("--max-depth", metavar="N", type=int, help="The maximum depth at which links will be followed (default: 3)", default=3)
argparser.add_argument("--max-links-per-site", metavar="N", type=int, help="The maximum amount of links on a page that will be included (default: 3)", default=3)

lib/graph.py

@@ -1,11 +1,14 @@
from pyvis.network import Network
from .linkmap import LinkMap
def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
pyvis_net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%", directed=True)
pyvis_net.add_nodes(linkmap.links, size=[8]*len(linkmap.links))
pyvis_net.add_edges(linkmap.link_connections)
@@ -63,5 +66,6 @@ def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
# pyvis_net.show_buttons()
pyvis_net.set_options(pyvis_options)
return pyvis_net
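Below is a self-contained sketch of the same pyvis pattern with hard-coded dummy data, since LinkMap itself is not part of this hunk; the node URLs and output filename are made up.
```
from pyvis.network import Network

# Same construction as pyvis_graph_from_linkmap above, but with dummy nodes and edges.
net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%", directed=True)
nodes = ["https://a.example", "https://b.example", "https://c.example"]
net.add_nodes(nodes, size=[8] * len(nodes))
net.add_edges([("https://a.example", "https://b.example"),
               ("https://a.example", "https://c.example")])
# write_html() (recent pyvis releases) writes the file without the notebook-oriented behaviour of show().
net.write_html("example.html")
```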

lib/linkmap_from_sitelinks.py

@@ -1,30 +1,27 @@
import asyncio
from concurrent.futures import ThreadPoolExecutor
from html.parser import HTMLParser
from random import sample
from requests import get as request_get
from sys import stderr
from urllib.parse import urlparse
from httpx import AsyncClient
from .linkmap import LinkMap
class HTMLLinkFinder(HTMLParser):
class _HTMLExternalLinkFinder(HTMLParser):
links = []
def handle_starttag(self, tag, attrs):
if tag != "a":
return
for a in attrs:
attr, val = a
if attr != "href":
continue
if val.startswith("https://") or val.startswith("http://"):
if not val in self.links:
self.links.append(val)
if tag == "a":
for a in attrs:
attr, val = a
if attr == "href":
if val.startswith("https://") or val.startswith("http://"):
if not val in self.links:
self.links.append(val)
def get_links(self, input_html:str):
self.feed(input_html)
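A rough usage sketch for the link-finder class above; the HTML snippet is made up, and the import path is an assumption based on the entry-point imports shown further down.
```
# Illustrative only: feed a small HTML snippet and inspect the collected links.
from lib.linkmap_from_sitelinks import HTMLLinkFinder  # named _HTMLExternalLinkFinder on branch "1"

finder = HTMLLinkFinder()
finder.get_links('<a href="https://example.org">ext</a> <a href="/local">rel</a>')
print(finder.links)  # only absolute http(s) links are kept -> ['https://example.org']
# Note: `links` is a class attribute, so it is shared between parser instances.
```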
@@ -38,67 +35,80 @@ class LinkMapFromSitelinksGenerator:
generated_linkmap = LinkMap()
max_links_per_site = 3
max_depth = 3
max_threads = 4
enable_log = False
def log(self, something):
if self.enable_log:
print(something, file=stderr)
async def _get_html(self, url:str, client:AsyncClient) -> str:
content = bytearray()
def _get_html(self, url:str) -> str:
html_content = ""
# receive up to self.site_request_max_len bytes after a maximum of self.site_request_timeout seconds
self.log("----" + url)
response = request_get(url, stream=True, timeout=self.site_request_timeout)
response.raise_for_status()
content_size = 0
# receive up to self.site_request_max_len bytes after
# a maximum of self.site_request_timeout seconds
self.log(f"Request: {url}")
async with client.stream(
"GET",
url,
timeout=self.site_request_timeout,
follow_redirects=True
) as stream:
async for chunk in stream.aiter_bytes(1024):
content_size += len(chunk)
if content_size > self.site_request_max_len:
self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
break
else:
content.extend(chunk)
# decode
try:
html_content = content.decode()
except UnicodeDecodeError:
self.log(f"Couldn't decode {url}")
html_content = ""
content_chunks = []
for chunk in response.iter_content(1024, decode_unicode=True):
content_size += len(chunk)
if content_size > self.site_request_max_len:
self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
break
else:
content_chunks.append(chunk)
html_content = "".join(content_chunks)
return html_content
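Because this hunk interleaves the async download helper (httpx, on main) with the threaded one (requests, on branch 1) without diff markers, here is a de-interleaved minimal sketch of the size-capped streaming download on the httpx side; the cap, timeout, and URL are made up stand-ins for the instance attributes used above.
```
import asyncio
from httpx import AsyncClient

MAX_LEN = 512 * 1024  # illustrative cap, standing in for self.site_request_max_len

async def capped_download(url: str) -> str:
    content = bytearray()
    received = 0
    async with AsyncClient() as client:
        async with client.stream("GET", url, timeout=10, follow_redirects=True) as response:
            async for chunk in response.aiter_bytes(1024):
                received += len(chunk)
                if received > MAX_LEN:
                    break  # stop once the cap is exceeded, as _get_html does above
                content.extend(chunk)
    try:
        return content.decode()
    except UnicodeDecodeError:
        return ""

print(len(asyncio.run(capped_download("https://example.org"))))
```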
async def _get_linked_sites_coro(self, url, client:AsyncClient):
linked_sites = []
try:
html = await self._get_html(url, client)
found_links = HTMLLinkFinder().get_links(html)
found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
for l in found_links:
self.log(f"Found {l}")
if l != None:
linked_sites.append(l)
except KeyboardInterrupt:
exit("KeyboardInterrupt")
except Exception as e:
self.log("An exception occcured while trying to get links from '" + url + "': ")
self.log(e)
return url, linked_sites
def _get_linked_sites_thread(self, urls:list):
def _get_links(url:str):
sites = []
try:
html = self._get_html(url)
found_links = _HTMLExternalLinkFinder().get_links(html)
found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
self.log("\n".join(found_links))
for l in found_links:
if l != None:
sites.append(l)
except KeyboardInterrupt:
exit("KeyboardInterrupt")
except Exception as e:
self.log("An exception occcured while trying to get links from '" + url + "': ")
self.log(e)
return sites
links = {}
for url in urls:
links[url] = _get_links(url)
return links
async def _get_linked_sites(self, urls:list, client:AsyncClient):
# get results
results = await asyncio.gather(*[self._get_linked_sites_coro(url, client) for url in urls])
results_as_dict = {}
for url, links in results:
results_as_dict[url] = links
return results_as_dict
def _get_linked_sites(self, urls:list):
# split urls into self.max_threads chunks
urlchunks = []
chunk_size = max(int(len(urls) / self.max_threads), 1)
for i in range(self.max_threads):
start = i*chunk_size
end = (i*chunk_size)+chunk_size
new_chunk = urls[start:end]
if len(new_chunk) > 0:
urlchunks.append(new_chunk)
results = []
# threads
with ThreadPoolExecutor() as tpe:
self.log(f"--Using {len(urlchunks)} concurrent connections...")
futures = [tpe.submit(self._get_linked_sites_thread, chunk) for chunk in urlchunks]
for f in futures:
# wait for results
results.append(f.result())
results_combined = {}
for result_chunk in results:
for url in result_chunk:
results_combined[url] = result_chunk[url]
return results_combined
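One detail worth noting in the threaded variant above: with `chunk_size = max(int(len(urls) / self.max_threads), 1)`, trailing URLs appear to be left out of every chunk whenever the URL count is larger than max_threads but not divisible by it (for example, 10 URLs across 4 threads yields four chunks of 2, dropping two URLs). A remainder-safe split, as a standalone sketch with my own naming, could look like this:
```
import math

def split_into_chunks(urls: list, max_threads: int = 4) -> list:
    # Ceiling division keeps every URL in some chunk: 10 URLs / 4 threads -> sizes 3, 3, 3, 1.
    chunk_size = max(math.ceil(len(urls) / max_threads), 1)
    return [urls[i:i + chunk_size] for i in range(0, len(urls), chunk_size)]

print(split_into_chunks([f"https://example.org/{n}" for n in range(10)]))
```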
async def _generate_linkmap(self, start_urls:list, _current_depth:int, client:AsyncClient):
def _generate_linkmap(self, start_urls:list, _current_depth:int):
linkdict = {}
linked_sites = await self._get_linked_sites(start_urls, client)
linked_sites = self._get_linked_sites(start_urls)
for url in linked_sites:
linkdict[url] = {}
self.generated_linkmap.add_link(url)
@@ -108,14 +118,13 @@ class LinkMapFromSitelinksGenerator:
self.generated_linkmap.add_link_connection(url, l)
if _current_depth < self.max_depth:
for url in linkdict:
linkdict[url] = await self._generate_linkmap(list(linkdict[url]), _current_depth + 1, client)
linkdict[url] = self._generate_linkmap(list(linkdict[url]), _current_depth + 1)
async def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
self.generated_linkmap = LinkMap()
self.max_links_per_site = max_links_per_site
self.max_depth = max_depth
async with AsyncClient() as client:
await self._generate_linkmap([start_url], 1, client)
self._generate_linkmap([start_url], 1)
def get_linkmap(self) -> LinkMap:
return self.generated_linkmap

(entry-point script)

@@ -1,16 +1,13 @@
#!/usr/bin/env python3
# Copyright (c) 2021 Julian Müller (ChaoticByte)
import asyncio
from lib.args import argparser
from lib.graph import pyvis_graph_from_linkmap
from lib.linkmap_from_sitelinks import LinkMapFromSitelinksGenerator
if __name__ == "__main__":
args = argparser.parse_args()
async def main(args):
nm = LinkMapFromSitelinksGenerator()
nm.site_request_max_len = args.http_download_limit
nm.enable_log = args.log
@@ -18,22 +15,16 @@ async def main(args):
starturl = args.url
if not (starturl.startswith("https://") or starturl.startswith("http://")):
starturl = "https://" + starturl
await nm.generate(
starturl,
max_depth=args.max_depth,
max_links_per_site=args.max_links_per_site
)
nm.generate(starturl, max_depth=args.max_depth, max_links_per_site=args.max_links_per_site)
if args.dump:
print(
"\n".join(str(c) for c in nm.get_linkmap().link_connections)
)
else:
pyvis_network_graph = pyvis_graph_from_linkmap(nm.get_linkmap(), heading=starturl)
pyvis_network_graph.show("output.html")
if __name__ == "__main__":
args = argparser.parse_args()
asyncio.run(main(args))
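For completeness, a hedged sketch of driving the same pipeline programmatically on the async (main) side; the example URL and output filename are made up, and write_html() is used in place of show() only to avoid notebook-specific behaviour.
```
import asyncio

from lib.graph import pyvis_graph_from_linkmap
from lib.linkmap_from_sitelinks import LinkMapFromSitelinksGenerator

async def demo():
    gen = LinkMapFromSitelinksGenerator()
    await gen.generate("https://example.org", max_depth=2, max_links_per_site=3)
    graph = pyvis_graph_from_linkmap(gen.get_linkmap(), heading="https://example.org")
    graph.write_html("demo.html")  # or .show("output.html"), as the script above does

asyncio.run(demo())
```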

pip-dependencies.txt (new file, 2 lines)

@@ -0,0 +1,2 @@
pyvis
requests

requirements.txt (deleted)

@@ -1,2 +0,0 @@
pyvis
httpx