diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f7275bb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+venv/
diff --git a/requirements.txt b/requirements.txt
index d5b1119..bbd7c24 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,3 @@
 simplematrixbotlib
+beautifulsoup4
+requests
diff --git a/scraping.py b/scraping.py
new file mode 100644
index 0000000..203f9a6
--- /dev/null
+++ b/scraping.py
@@ -0,0 +1,103 @@
+"""Scrape the pvv.ntnu.no event page into a flat list of event tuples."""
+
+from bs4 import BeautifulSoup
+import requests
+from operator import add
+from functools import reduce
+
+
+EVENTS_URL = "http://www.pvv.ntnu.no/hendelser/"
+EVENT_LINK_PREFIX = "/hendelser"
+REQUEST_TIMEOUT = 10  # seconds; requests waits forever without one
+
+
+def _parse_event_list(event_list):
+    """Extract the events found in one ``<ul class="events">`` element.
+
+    Every field is collected as its own column over all ``<li>`` elements
+    below *event_list* (outer entries and nested detail items alike); the
+    columns are then zipped together by position.
+
+    Args:
+        event_list: BeautifulSoup tag of one event list.
+
+    Returns:
+        A list of ``(id, time, place, organizer, title, description)``
+        tuples, truncated to the shortest column. Empty when no ``<li>``
+        has nested detail items.
+
+    Raises:
+        ValueError: If the detail rows do not form exactly three columns,
+            or the text after "=" in an event link is not an integer.
+        AttributeError: If a detail item has no ``<strong>`` child.
+    """
+    items = event_list.find_all("li")
+
+    # One row per <li> that has nested <li> children; a row holds the
+    # <strong> text of each of those children.
+    detail_rows = [
+        [detail.find("strong").text for detail in details]
+        for details in (item.find_all("li") for item in items)
+        if details
+    ]
+    if not detail_rows:
+        # zip() of nothing yields nothing, so the three-way unpack below
+        # would raise ValueError.
+        return []
+    times, places, organizers = zip(*detail_rows)
+
+    # The title is the text of the first link in each <li> that has one.
+    titles = [
+        links[0].text
+        for links in (item.find_all("a") for item in items)
+        if links
+    ]
+    descriptions = [
+        paragraph.text
+        for paragraph in (item.find("p") for item in items)
+        if paragraph is not None
+    ]
+
+    ids = []
+    for link in event_list.find_all("a", href=True):
+        href = link["href"]
+        if not href.startswith(EVENT_LINK_PREFIX):
+            continue
+        # The id is whatever follows the first "="; a link without one has
+        # no id to parse and is skipped instead of crashing int().
+        _, separator, raw_id = href.partition("=")
+        if separator:
+            ids.append(int(raw_id))
+
+    # NOTE(review): columns are matched purely by position, so an entry
+    # lacking e.g. its <p> shifts later descriptions onto earlier events.
+    return list(zip(ids, times, places, organizers, titles, descriptions))
+
+
+def fetch_events(url=EVENTS_URL, timeout=REQUEST_TIMEOUT):
+    """Download the event page and return every event listed on it.
+
+    Args:
+        url: Address of the event listing page.
+        timeout: Seconds to wait on the server before giving up.
+
+    Returns:
+        One list holding the tuples of every event list on the page, in
+        page order; empty when the page has no event lists.
+
+    Raises:
+        requests.RequestException: On connection failure, timeout, or an
+            HTTP error status.
+    """
+    response = requests.get(url, timeout=timeout)
+    # Fail loudly on 4xx/5xx instead of scraping an error page.
+    response.raise_for_status()
+    soup = BeautifulSoup(response.text, "html.parser")
+    zips = [_parse_event_list(ul) for ul in soup.find_all("ul", "events")]
+    # The initial [] keeps reduce() from raising TypeError when the page
+    # has no event lists.
+    return reduce(add, zips, [])
+
+
+if __name__ == "__main__":
+    events = fetch_events()