From 956013c7caca31a4d8f6c6e38b5d1d1f73e0b9c0 Mon Sep 17 00:00:00 2001
From: fredrikr79
Date: Thu, 17 Aug 2023 22:52:36 +0200
Subject: [PATCH] scraping.py

---
Review notes (this area is ignored by `git am`). The line breaks and
indentation below were reconstructed after the patch had been collapsed onto
a single line; confirm the Python indentation against the commit.
- scraping.py only runs under `__main__`: it fetches
  http://www.pvv.ntnu.no/hendelser/ and builds a flat list of
  (id, time, place, organizer, title, description) tuples in `events`.
- `requests.get` is called without `timeout=` and the response status is
  never checked.
- `reduce(add, zips)` has no initial value, so it raises TypeError when no
  `ul.events` element is found.
- The three-way unpacking of `zip(*...)` raises ValueError when an events
  list contains no nested `li` elements.

 .gitignore       |  1 +
 requirements.txt |  2 ++
 scraping.py      | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 37 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 scraping.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f7275bb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+venv/
diff --git a/requirements.txt b/requirements.txt
index d5b1119..bbd7c24 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,3 @@
 simplematrixbotlib
+beautifulsoup4
+requests
diff --git a/scraping.py b/scraping.py
new file mode 100644
index 0000000..203f9a6
--- /dev/null
+++ b/scraping.py
@@ -0,0 +1,34 @@
+from bs4 import BeautifulSoup
+import requests
+from operator import add
+from functools import reduce
+
+
+if __name__ == "__main__":
+    r = requests.get("http://www.pvv.ntnu.no/hendelser/")
+    soup = BeautifulSoup(r.text, "html.parser")
+    zips = []
+    events = soup.find_all("ul", "events")
+    for event in events:
+        times, places, organizers = zip(*(list(
+            map(lambda x: map(lambda y: y.find("strong").text, x),
+                filter(lambda x: x != [],
+                       map(lambda x: x.find_all("li"),
+                           event.find_all("li")))))))
+        titles = list(map(lambda x: x[0].text,
+                          filter(lambda x: x != [],
+                                 map(lambda x: x.find_all("a"),
+                                     event.find_all("li")))))
+        descriptions = list(map(lambda x: x.text,
+                                filter(lambda x: x is not None,
+                                       map(lambda x: x.find("p"),
+                                           event.find_all("li")))))
+        ids = []
+        for a in event.find_all("a", href=True):
+            if a['href'][:10] == '/hendelser':
+                ids.append(int(a['href'][a['href'].find("=")+1:]))
+
+        zips.append(list(zip(ids, times, places, organizers, titles, descriptions)))
+
+    events = reduce(add, zips)
+