"""Download an index page with its linked sub-pages and render them as one PDF.

The index page is fetched, the pages it links to are downloaded, their body
contents are appended to the index page's body, and the combined document is
handed to pdfkit.
"""
import urllib.parse

import pdfkit
import requests
from bs4 import BeautifulSoup


def _fetch_soup(url):
    """Download *url* and return it parsed with the ``html.parser`` backend."""
    response = requests.get(url)
    # Decode as UTF-8 regardless of the charset the response declares.
    response.encoding = 'UTF-8'
    return BeautifulSoup(response.text, 'html.parser')


class MLWerk:
    """An index page plus the sub-pages it links to.

    Instance fields:
        url:      address of the index page.
        baseurl:  ``url`` cut after its last ``/``; links are resolved
                  against it.
        subsites: ``SubSite`` objects collected by ``getsubsites``.
        soup:     parsed index page; after ``concatsubsites`` it also holds
                  the content of every sub-page.
    """

    class SubSite:
        """A single linked page, downloaded and parsed on construction."""

        def __init__(self, url):
            self.url = url
            self.getsoup()

        def getsoup(self):
            """Download ``self.url`` and store the parsed page in ``self.soup``."""
            self.soup = _fetch_soup(self.url)

    def __init__(self, url):
        """Remember *url*, derive ``baseurl`` from it and fetch the index page."""
        self.url = url
        self.baseurl = url[:url.rfind('/')] + '/'
        self.subsites = []
        self.getsoup()

    def getsoup(self):
        """Download ``self.url`` and store the parsed page in ``self.soup``."""
        self.soup = _fetch_soup(self.url)

    def getsubsites(self, limit=None):
        """Download the pages linked from the index into ``self.subsites``.

        Anchors without a usable ``href`` are skipped, and so are targets
        containing ``#`` or ``/``; what remains are plain file names that
        are resolved against ``self.baseurl``.

        Args:
            limit: maximum number of sub-pages to download; ``None`` means
                no limit.
        """
        fetched = 0
        for anchor in self.soup.find_all('a'):
            # Stop once exactly `limit` pages were fetched (the previous
            # `>` comparison let one extra page through).
            if limit is not None and fetched >= limit:
                break
            href = anchor.get('href')
            # The previous condition raised TypeError for anchors without
            # href and never rejected anything else; filtering out missing
            # hrefs, fragments and paths is the evident intent.
            if not href or '#' in href or '/' in href:
                continue
            target = urllib.parse.urljoin(self.baseurl, href)
            self.subsites.append(self.SubSite(target))
            fetched += 1

    def concatsubsites(self):
        """Append the body content of every sub-page to the index page body."""
        destination = self.soup.body
        for subsite in self.subsites:
            source = subsite.soup.body
            if source is None:
                # Page without a <body>: nothing to take over.
                continue
            # append() moves a node out of the tree it currently lives in,
            # so iterate over a snapshot; iterating the live child list
            # while it shrinks would skip elements.
            for element in list(source.contents):
                destination.append(element)

    def genpdf(self, path):
        """Render the current ``self.soup`` to a PDF file at *path*."""
        pdfkit.from_string(
            str(self.soup),
            path,
            options={"enable-local-file-access": ""},
        )

    def text2pdf(self, path):
        """Run the whole pipeline: fetch sub-pages, merge them, write *path*."""
        print("gettings subsites")
        self.getsubsites()
        print("concatting subsites")
        self.concatsubsites()
        print("generating pdf")
        self.genpdf(path)