Compare commits


No commits in common. "main" and "1" have entirely different histories.
main ... 1

8 changed files with 95 additions and 107 deletions

LICENSE (deleted, 21 lines)

@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2022 Julian Müller (ChaoticByte)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

@@ -8,10 +8,13 @@ Create a graph from the links on a website and the following sites.
### Pip Dependencies
- pyvis
- requests
You can install those dependencies with
```
pip install -r requirements.txt
pip install -r pip-dependencies.txt
```
## Usage

lib/args.py

@@ -2,7 +2,9 @@
from argparse import ArgumentParser
argparser = ArgumentParser(description="Map all links on a site (and links on resulting sites)")
argparser.add_argument("url", help="The URL of the site you want to start from")
argparser.add_argument("--dump", action="store_true", help="Only output the found connections to the console and exit")
argparser.add_argument("--max-depth", metavar="N", type=int, help="The maximum depth at which links will be followed (default: 3)", default=3)
argparser.add_argument("--max-links-per-site", metavar="N", type=int, help="The maximum amount of links on a page that will be included (default: 3)", default=3)

lib/graph.py

@@ -1,11 +1,14 @@
from pyvis.network import Network
from .linkmap import LinkMap
def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
pyvis_net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%", directed=True)
pyvis_net.add_nodes(linkmap.links, size=[8]*len(linkmap.links))
pyvis_net.add_edges(linkmap.link_connections)
@@ -63,5 +66,6 @@ def pyvis_graph_from_linkmap(linkmap:LinkMap, heading:str=None) -> Network:
# pyvis_net.show_buttons()
pyvis_net.set_options(pyvis_options)
return pyvis_net
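Below is a self-contained sketch of the same pyvis pattern with hard-coded dummy data, since LinkMap itself is not part of this hunk; the node URLs and output filename are made up.
```
from pyvis.network import Network

# Same construction as pyvis_graph_from_linkmap above, but with dummy nodes and edges.
net = Network(bgcolor="#222222", font_color="#fafafa", width="100%", height="95%", directed=True)
nodes = ["https://a.example", "https://b.example", "https://c.example"]
net.add_nodes(nodes, size=[8] * len(nodes))
net.add_edges([("https://a.example", "https://b.example"),
               ("https://a.example", "https://c.example")])
# write_html() (recent pyvis releases) writes the file without the notebook-oriented behaviour of show().
net.write_html("example.html")
```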

lib/linkmap_from_sitelinks.py

@@ -1,30 +1,27 @@
import asyncio
from concurrent.futures import ThreadPoolExecutor
from html.parser import HTMLParser
from random import sample
from requests import get as request_get
from sys import stderr
from urllib.parse import urlparse
from httpx import AsyncClient
from .linkmap import LinkMap
class HTMLLinkFinder(HTMLParser):
class _HTMLExternalLinkFinder(HTMLParser):
links = []
def handle_starttag(self, tag, attrs):
if tag != "a":
return
for a in attrs:
attr, val = a
if attr != "href":
continue
if val.startswith("https://") or val.startswith("http://"):
if not val in self.links:
self.links.append(val)
if tag == "a":
for a in attrs:
attr, val = a
if attr == "href":
if val.startswith("https://") or val.startswith("http://"):
if not val in self.links:
self.links.append(val)
def get_links(self, input_html:str):
self.feed(input_html)
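A rough usage sketch for the link-finder class above; the HTML snippet is made up, and the import path is an assumption based on the entry-point imports shown further down.
```
# Illustrative only: feed a small HTML snippet and inspect the collected links.
from lib.linkmap_from_sitelinks import HTMLLinkFinder  # named _HTMLExternalLinkFinder on branch "1"

finder = HTMLLinkFinder()
finder.get_links('<a href="https://example.org">ext</a> <a href="/local">rel</a>')
print(finder.links)  # only absolute http(s) links are kept -> ['https://example.org']
# Note: `links` is a class attribute, so it is shared between parser instances.
```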
@@ -38,67 +35,80 @@ class LinkMapFromSitelinksGenerator:
generated_linkmap = LinkMap()
max_links_per_site = 3
max_depth = 3
max_threads = 4
enable_log = False
def log(self, something):
if self.enable_log:
print(something, file=stderr)
async def _get_html(self, url:str, client:AsyncClient) -> str:
content = bytearray()
def _get_html(self, url:str) -> str:
html_content = ""
# receive up to self.site_request_max_len bytes after a maximum of self.site_request_timeout seconds
self.log("----" + url)
response = request_get(url, stream=True, timeout=self.site_request_timeout)
response.raise_for_status()
content_size = 0
# receive up to self.site_request_max_len bytes after
# a maximum of self.site_request_timeout seconds
self.log(f"Request: {url}")
async with client.stream(
"GET",
url,
timeout=self.site_request_timeout,
follow_redirects=True
) as stream:
async for chunk in stream.aiter_bytes(1024):
content_size += len(chunk)
if content_size > self.site_request_max_len:
self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
break
else:
content.extend(chunk)
# decode
try:
html_content = content.decode()
except UnicodeDecodeError:
self.log(f"Couldn't decode {url}")
html_content = ""
content_chunks = []
for chunk in response.iter_content(1024, decode_unicode=True):
content_size += len(chunk)
if content_size > self.site_request_max_len:
self.log(f"Maximum content length exceeded! received: {content_size} (maximum: {self.site_request_max_len})")
break
else:
content_chunks.append(chunk)
html_content = "".join(content_chunks)
return html_content
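Because this hunk interleaves the async download helper (httpx, on main) with the threaded one (requests, on branch 1) without diff markers, here is a de-interleaved minimal sketch of the size-capped streaming download on the httpx side; the cap, timeout, and URL are made up stand-ins for the instance attributes used above.
```
import asyncio
from httpx import AsyncClient

MAX_LEN = 512 * 1024  # illustrative cap, standing in for self.site_request_max_len

async def capped_download(url: str) -> str:
    content = bytearray()
    received = 0
    async with AsyncClient() as client:
        async with client.stream("GET", url, timeout=10, follow_redirects=True) as response:
            async for chunk in response.aiter_bytes(1024):
                received += len(chunk)
                if received > MAX_LEN:
                    break  # stop once the cap is exceeded, as _get_html does above
                content.extend(chunk)
    try:
        return content.decode()
    except UnicodeDecodeError:
        return ""

print(len(asyncio.run(capped_download("https://example.org"))))
```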
async def _get_linked_sites_coro(self, url, client:AsyncClient):
linked_sites = []
try:
html = await self._get_html(url, client)
found_links = HTMLLinkFinder().get_links(html)
found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
for l in found_links:
self.log(f"Found {l}")
if l != None:
linked_sites.append(l)
except KeyboardInterrupt:
exit("KeyboardInterrupt")
except Exception as e:
self.log("An exception occcured while trying to get links from '" + url + "': ")
self.log(e)
return url, linked_sites
def _get_linked_sites_thread(self, urls:list):
def _get_links(url:str):
sites = []
try:
html = self._get_html(url)
found_links = _HTMLExternalLinkFinder().get_links(html)
found_links = sample(found_links, min(self.max_links_per_site, len(found_links)))
self.log("\n".join(found_links))
for l in found_links:
if l != None:
sites.append(l)
except KeyboardInterrupt:
exit("KeyboardInterrupt")
except Exception as e:
self.log("An exception occcured while trying to get links from '" + url + "': ")
self.log(e)
return sites
links = {}
for url in urls:
links[url] = _get_links(url)
return links
async def _get_linked_sites(self, urls:list, client:AsyncClient):
# get results
results = await asyncio.gather(*[self._get_linked_sites_coro(url, client) for url in urls])
results_as_dict = {}
for url, links in results:
results_as_dict[url] = links
return results_as_dict
def _get_linked_sites(self, urls:list):
# split urls into self.max_threads chunks
urlchunks = []
chunk_size = max(int(len(urls) / self.max_threads), 1)
for i in range(self.max_threads):
start = i*chunk_size
end = (i*chunk_size)+chunk_size
new_chunk = urls[start:end]
if len(new_chunk) > 0:
urlchunks.append(new_chunk)
results = []
# threads
with ThreadPoolExecutor() as tpe:
self.log(f"--Using {len(urlchunks)} concurrent connections...")
futures = [tpe.submit(self._get_linked_sites_thread, chunk) for chunk in urlchunks]
for f in futures:
# wait for results
results.append(f.result())
results_combined = {}
for result_chunk in results:
for url in result_chunk:
results_combined[url] = result_chunk[url]
return results_combined
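One detail worth noting in the threaded variant above: with `chunk_size = max(int(len(urls) / self.max_threads), 1)`, trailing URLs appear to be left out of every chunk whenever the URL count is larger than max_threads but not divisible by it (for example, 10 URLs across 4 threads yields four chunks of 2, dropping two URLs). A remainder-safe split, as a standalone sketch with my own naming, could look like this:
```
import math

def split_into_chunks(urls: list, max_threads: int = 4) -> list:
    # Ceiling division keeps every URL in some chunk: 10 URLs / 4 threads -> sizes 3, 3, 3, 1.
    chunk_size = max(math.ceil(len(urls) / max_threads), 1)
    return [urls[i:i + chunk_size] for i in range(0, len(urls), chunk_size)]

print(split_into_chunks([f"https://example.org/{n}" for n in range(10)]))
```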
async def _generate_linkmap(self, start_urls:list, _current_depth:int, client:AsyncClient):
def _generate_linkmap(self, start_urls:list, _current_depth:int):
linkdict = {}
linked_sites = await self._get_linked_sites(start_urls, client)
linked_sites = self._get_linked_sites(start_urls)
for url in linked_sites:
linkdict[url] = {}
self.generated_linkmap.add_link(url)
@@ -108,14 +118,13 @@ class LinkMapFromSitelinksGenerator:
self.generated_linkmap.add_link_connection(url, l)
if _current_depth < self.max_depth:
for url in linkdict:
linkdict[url] = await self._generate_linkmap(list(linkdict[url]), _current_depth + 1, client)
linkdict[url] = self._generate_linkmap(list(linkdict[url]), _current_depth + 1)
async def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
def generate(self, start_url:str, max_depth:int=3, max_links_per_site:int=3):
self.generated_linkmap = LinkMap()
self.max_links_per_site = max_links_per_site
self.max_depth = max_depth
async with AsyncClient() as client:
await self._generate_linkmap([start_url], 1, client)
self._generate_linkmap([start_url], 1)
def get_linkmap(self) -> LinkMap:
return self.generated_linkmap

(entry-point script)

@@ -1,16 +1,13 @@
#!/usr/bin/env python3
# Copyright (c) 2021 Julian Müller (ChaoticByte)
import asyncio
from lib.args import argparser
from lib.graph import pyvis_graph_from_linkmap
from lib.linkmap_from_sitelinks import LinkMapFromSitelinksGenerator
if __name__ == "__main__":
args = argparser.parse_args()
async def main(args):
nm = LinkMapFromSitelinksGenerator()
nm.site_request_max_len = args.http_download_limit
nm.enable_log = args.log
@@ -18,22 +15,16 @@ async def main(args):
starturl = args.url
if not (starturl.startswith("https://") or starturl.startswith("http://")):
starturl = "https://" + starturl
await nm.generate(
starturl,
max_depth=args.max_depth,
max_links_per_site=args.max_links_per_site
)
nm.generate(starturl, max_depth=args.max_depth, max_links_per_site=args.max_links_per_site)
if args.dump:
print(
"\n".join(str(c) for c in nm.get_linkmap().link_connections)
)
else:
pyvis_network_graph = pyvis_graph_from_linkmap(nm.get_linkmap(), heading=starturl)
pyvis_network_graph.show("output.html")
if __name__ == "__main__":
args = argparser.parse_args()
asyncio.run(main(args))
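For completeness, a hedged sketch of driving the same pipeline programmatically on the async (main) side; the example URL and output filename are made up, and write_html() is used in place of show() only to avoid notebook-specific behaviour.
```
import asyncio

from lib.graph import pyvis_graph_from_linkmap
from lib.linkmap_from_sitelinks import LinkMapFromSitelinksGenerator

async def demo():
    gen = LinkMapFromSitelinksGenerator()
    await gen.generate("https://example.org", max_depth=2, max_links_per_site=3)
    graph = pyvis_graph_from_linkmap(gen.get_linkmap(), heading="https://example.org")
    graph.write_html("demo.html")  # or .show("output.html"), as the script above does

asyncio.run(demo())
```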

pip-dependencies.txt (new file, 2 lines)

@@ -0,0 +1,2 @@
pyvis
requests

requirements.txt (deleted)

@@ -1,2 +0,0 @@
pyvis
httpx