import urllib.parse

import pdfkit
import requests
from bs4 import BeautifulSoup

# Seconds to wait on a server before giving up, so one stalled host cannot
# hang the whole run indefinitely.
_REQUEST_TIMEOUT = 30


def _fetch_soup(url):
    """Download *url* and return its content parsed with ``html.parser``.

    The response body is always decoded as UTF-8, whatever charset the
    server declared.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    response.encoding = 'UTF-8'
    return BeautifulSoup(response.text, 'html.parser')


def _content_root(soup):
    """Return the ``<body>`` tag of *soup*, or *soup* itself if it has none.

    ``html.parser`` does not synthesise a ``<body>`` for fragments, so
    ``soup.body`` can be ``None``.
    """
    body = soup.body
    return body if body is not None else soup


class MLWerk:
    """An index page plus the sibling pages it links to, exportable as a PDF.

    Attributes:
        url: Address of the index page.
        baseurl: ``url`` cut after its last ``/``; relative links are
            resolved against it.
        subsites: ``SubSite`` objects collected by ``getsubsites``.
        soup: Parsed index page; ``concatsubsites`` appends the sub-page
            content to it.
    """

    class SubSite:
        """A single linked page, downloaded and parsed on construction."""

        def __init__(self, url):
            self.url = url
            self.getsoup()

        def getsoup(self):
            """Fetch ``self.url`` and store the parsed page in ``self.soup``."""
            self.soup = _fetch_soup(self.url)

    def __init__(self, url):
        self.url = url
        self.baseurl = url[:url.rfind('/')] + '/'
        self.subsites = []
        self.getsoup()

    def getsoup(self):
        """Fetch ``self.url`` and store the parsed page in ``self.soup``."""
        self.soup = _fetch_soup(self.url)

    def getsubsites(self, limit=None):
        """Download the pages linked from the index page.

        Only links that are plain file names are followed: anchors without
        an ``href``, with an empty ``href``, or whose ``href`` contains
        ``#`` or ``/`` are skipped.

        Args:
            limit: Maximum number of sub-pages to download on this call;
                ``None`` means no limit.
        """
        added = 0
        for anchor in self.soup.find_all('a'):
            if limit is not None and added >= limit:
                break
            href = anchor.get('href')
            if not href or '#' in href or '/' in href:
                continue
            target = urllib.parse.urljoin(self.baseurl, href)
            self.subsites.append(self.SubSite(target))
            added += 1

    def concatsubsites(self):
        """Move the content of every collected sub-page into the index page."""
        destination = _content_root(self.soup)
        for subsite in self.subsites:
            source = _content_root(subsite.soup)
            # append() detaches each node from its old parent, so walk a
            # snapshot; iterating the live child list would skip nodes.
            for element in list(source.contents):
                destination.append(element)

    def genpdf(self, path):
        """Render the current ``self.soup`` to a PDF file at *path*."""
        pdfkit.from_string(
            str(self.soup),
            path,
            options={"enable-local-file-access": ""},
        )

    def text2pdf(self, path, limit=None):
        """Collect the sub-pages, merge them into the index page, write a PDF.

        Args:
            path: Output path of the PDF file.
            limit: Optional maximum number of sub-pages to include.
        """
        print("gettings subsites")
        self.getsubsites(limit)
        print("concatting subsites")
        self.concatsubsites()
        print("generating pdf")
        self.genpdf(path)