Refactored: switched from requests and threads to httpx and async

Julian Müller 2022-08-13 13:15:50 +02:00
parent 56b850dc4f
commit 2d2afe7930
5 changed files with 84 additions and 90 deletions
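In short, every HTTP fetch now runs on a single asyncio event loop through a shared httpx.AsyncClient instead of requests calls spread across a ThreadPoolExecutor. Below is a minimal, self-contained sketch of that pattern as it appears in the diffs that follow; the helper name fetch_html, the MAX_BYTES cap, and the example URLs are placeholders for illustration only, not taken from this repository.

import asyncio
from httpx import AsyncClient

MAX_BYTES = 512 * 1024  # illustrative cap, stands in for site_request_max_len

async def fetch_html(url: str, client: AsyncClient) -> str:
    # Stream the response so oversized pages can be cut off early,
    # mirroring the size check in _get_html below.
    content = bytearray()
    async with client.stream("GET", url, timeout=10, follow_redirects=True) as response:
        async for chunk in response.aiter_bytes(1024):
            if len(content) + len(chunk) > MAX_BYTES:
                break
            content.extend(chunk)
    try:
        return content.decode()
    except UnicodeDecodeError:
        return ""

async def main():
    async with AsyncClient() as client:
        # Fetch several pages concurrently on one event loop,
        # replacing the old ThreadPoolExecutor fan-out.
        pages = await asyncio.gather(
            fetch_html("https://example.org", client),
            fetch_html("https://example.com", client),
        )
        print([len(p) for p in pages])

asyncio.run(main())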

View file

@@ -2,9 +2,7 @@
 from argparse import ArgumentParser
 argparser = ArgumentParser(description="Map all links on a site (and links on resulting sites)")
 argparser.add_argument("url", help="The URL of the site you want to start from")
 argparser.add_argument("--dump", action="store_true", help="Only output the found connections to the console and exit")
 argparser.add_argument("--max-depth", metavar="N", type=int, help="The maximum depth at which links will be followed (default: 3)", default=3)
 argparser.add_argument("--max-links-per-site", metavar="N", type=int, help="The maximum amount of links on a page that will be included (default: 3)", default=3)

View file

@@ -1,14 +1,11 @@
 from pyvis.network import Network
 from .linkmap import LinkMap
 def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
     pyvis_net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%", directed=True)
     pyvis_net.add_nodes(linkmap.links, size=[8]*len(linkmap.links))
     pyvis_net.add_edges(linkmap.link_connections)
@@ -66,6 +63,5 @@ def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
     # pyvis_net.show_buttons()
     pyvis_net.set_options(pyvis_options)
     return pyvis_net

View file

@@ -1,24 +1,27 @@
+import asyncio
-from concurrent.futures import ThreadPoolExecutor
 from html.parser import HTMLParser
 from random import sample
-from requests import get as request_get
 from sys import stderr
 from urllib.parse import urlparse
+from httpx import AsyncClient
 from .linkmap import LinkMap
-class _HTMLExternalLinkFinder(HTMLParser):
+class HTMLLinkFinder(HTMLParser):
     links = []
     def handle_starttag(self, tag, attrs):
-        if tag == "a":
+        if tag != "a":
+            return
         for a in attrs:
             attr, val = a
-            if attr == "href":
+            if attr != "href":
+                continue
             if val.startswith("https://") or val.startswith("http://"):
                 if not val in self.links:
                     self.links.append(val)
@@ -35,80 +38,67 @@ class LinkMapFromSitelinksGenerator:
     generated_linkmap = LinkMap()
     max_links_per_site = 3
     max_depth = 3
-    max_threads = 4
     enable_log = False
     def log(self, something):
         if self.enable_log:
             print(something, file=stderr)
-    def _get_html(self, url:str) -> str:
+    async def _get_html(self, url:str, client:AsyncClient) -> str:
-        html_content = ""
+        content = bytearray()
-        # receive up to self.site_request_max_len bytes after a maximum of self.site_request_timeout seconds
-        self.log("----" + url)
-        response = request_get(url, stream=True, timeout=self.site_request_timeout)
-        response.raise_for_status()
         content_size = 0
-        content_chunks = []
+        # receive up to self.site_request_max_len bytes after
-        for chunk in response.iter_content(1024, decode_unicode=True):
+        # a maximum of self.site_request_timeout seconds
+        self.log(f"Request: {url}")
+        async with client.stream(
+            "GET",
+            url,
+            timeout=self.site_request_timeout,
+            follow_redirects=True
+        ) as stream:
+            async for chunk in stream.aiter_bytes(1024):
                 content_size += len(chunk)
                 if content_size > self.site_request_max_len:
                     self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
                     break
                 else:
-                content_chunks.append(chunk)
+                    content.extend(chunk)
-        html_content = "".join(content_chunks)
+        # decode
+        try:
+            html_content = content.decode()
+        except UnicodeDecodeError:
+            self.log(f"Couldn't decode {url}")
+            html_content = ""
         return html_content
-    def _get_linked_sites_thread(self, urls:list):
+    async def _get_linked_sites_coro(self, url, client:AsyncClient):
-        def _get_links(url:str):
+        linked_sites = []
-            sites = []
         try:
-                html = self._get_html(url)
+            html = await self._get_html(url, client)
-                found_links = _HTMLExternalLinkFinder().get_links(html)
+            found_links = HTMLLinkFinder().get_links(html)
             found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
-                self.log("\n".join(found_links))
             for l in found_links:
+                self.log(f"Found {l}")
                 if l != None:
-                        sites.append(l)
+                    linked_sites.append(l)
         except KeyboardInterrupt:
             exit("KeyboardInterrupt")
         except Exception as e:
             self.log("An exception occcured while trying to get links from '" + url + "': ")
             self.log(e)
-            return sites
+        return url, linked_sites
-        links = {}
-        for url in urls:
-            links[url] = _get_links(url)
-        return links
-    def _get_linked_sites(self, urls:list):
+    async def _get_linked_sites(self, urls:list, client:AsyncClient):
-        # split urls into self.max_threads chunks
+        # get results
-        urlchunks = []
+        results = await asyncio.gather(*[self._get_linked_sites_coro(url, client) for url in urls])
-        chunk_size = max(int(len(urls) / self.max_threads), 1)
+        results_as_dict = {}
-        for i in range(self.max_threads):
+        for url, links in results:
-            start = i*chunk_size
+            results_as_dict[url] = links
-            end = (i*chunk_size)+chunk_size
+        return results_as_dict
-            new_chunk = urls[start:end]
-            if len(new_chunk) > 0:
-                urlchunks.append(new_chunk)
-        results = []
-        # threads
-        with ThreadPoolExecutor() as tpe:
-            self.log(f"--Using {len(urlchunks)} concurrent connections...")
-            futures = [tpe.submit(self._get_linked_sites_thread, chunk) for chunk in urlchunks]
-            for f in futures:
-                # wait for results
-                results.append(f.result())
-        results_combined = {}
-        for result_chunk in results:
-            for url in result_chunk:
-                results_combined[url] = result_chunk[url]
-        return results_combined
-    def _generate_linkmap(self, start_urls:list, _current_depth:int):
+    async def _generate_linkmap(self, start_urls:list, _current_depth:int, client:AsyncClient):
         linkdict = {}
-        linked_sites = self._get_linked_sites(start_urls)
+        linked_sites = await self._get_linked_sites(start_urls, client)
         for url in linked_sites:
             linkdict[url] = {}
             self.generated_linkmap.add_link(url)
@@ -118,13 +108,14 @@ class LinkMapFromSitelinksGenerator:
                 self.generated_linkmap.add_link_connection(url, l)#
         if _current_depth < self.max_depth:
             for url in linkdict:
-                linkdict[url] = self._generate_linkmap(list(linkdict[url]), _current_depth + 1)
+                linkdict[url] = await self._generate_linkmap(list(linkdict[url]), _current_depth + 1, client)
-    def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
+    async def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
         self.generated_linkmap = LinkMap()
         self.max_links_per_site = max_links_per_site
         self.max_depth = max_depth
-        self._generate_linkmap([start_url], 1)
+        async with AsyncClient() as client:
+            await self._generate_linkmap([start_url], 1, client)
     def get_linkmap(self) -> LinkMap:
         return self.generated_linkmap
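One behavioral difference worth noting: the old _get_linked_sites capped concurrency by splitting the URLs into at most max_threads (4) chunks, while the new asyncio.gather call launches one request per URL with no upper bound. If a cap is still wanted, an asyncio.Semaphore is a common way to restore it. The helper below is only a hypothetical sketch of that idea and is not part of this commit; gather_limited, its limit default, and the timeout value are made up for illustration.

import asyncio
from httpx import AsyncClient

# Hypothetical helper, not part of this commit: restores an upper bound on
# concurrent requests (the role the removed max_threads attribute used to play).
async def gather_limited(urls, client: AsyncClient, limit: int = 4):
    semaphore = asyncio.Semaphore(limit)

    async def fetch(url):
        async with semaphore:  # at most `limit` requests in flight at once
            response = await client.get(url, timeout=10, follow_redirects=True)
            return url, response.text

    return await asyncio.gather(*(fetch(u) for u in urls))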

View file

@@ -1,13 +1,16 @@
 #!/usr/bin/env python3
+# Copyright (c) 2021 Julian Müller (W13R)
+import asyncio
 from lib.args import argparser
 from lib.graph import pyvis_graph_from_linkmap
 from lib.linkmap_from_sitelinks import LinkMapFromSitelinksGenerator
-if __name__ == "__main__":
-    args = argparser.parse_args()
+async def main(args):
     nm = LinkMapFromSitelinksGenerator()
     nm.site_request_max_len = args.http_download_limit
     nm.enable_log = args.log
@@ -16,15 +19,21 @@ if __name__ == "__main__":
     if not (starturl.startswith("https://") or starturl.startswith("http://")):
         starturl = "https://" + starturl
-    nm.generate(starturl, max_depth=args.max_depth, max_links_per_site=args.max_links_per_site)
+    await nm.generate(
+        starturl,
+        max_depth=args.max_depth,
+        max_links_per_site=args.max_links_per_site
+    )
     if args.dump:
         print(
             "\n".join(str(c) for c in nm.get_linkmap().link_connections)
         )
     else:
         pyvis_network_graph = pyvis_graph_from_linkmap(nm.get_linkmap(), heading=starturl)
         pyvis_network_graph.show("output.html")
+if __name__ == "__main__":
+    args = argparser.parse_args()
+    asyncio.run(main(args))

View file

@@ -1,2 +1,2 @@
 pyvis
-requests
+httpx