scraping.py

2023-08-17 22:52:36 +02:00 · 2023-08-17 22:52:36 +02:00 · 956013c7ca
commit 956013c7ca
parent 10ae419d7f
3 changed files with 37 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+venv/
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,3 @@
 simplematrixbotlib
+beautifulsoup4
+requests
--- a/scraping.py
+++ b/scraping.py
@@ -0,0 +1,34 @@
+from bs4 import BeautifulSoup
+import requests
+from operator import add
+from functools import reduce
+
+
+if __name__ == "__main__":
+    # Build `events`: flat (id, time, place, organizer, title, description) tuples.
+    # Bounded wait + status check: never hang or parse an HTTP error page.
+    r = requests.get("http://www.pvv.ntnu.no/hendelser/", timeout=10)
+    r.raise_for_status()
+    soup = BeautifulSoup(r.text, "html.parser")
+    zips = []
+    for event in soup.find_all("ul", "events"):
+        items = event.find_all("li")
+        # Items with nested <li> children: each nested <li> gives its <strong> text.
+        details = [[li.find("strong").text for li in nested]
+                   for nested in (item.find_all("li") for item in items)
+                   if nested]
+        if not details:
+            continue  # nothing to transpose; zip(*[]) cannot fill three names
+        times, places, organizers = zip(*details)
+        titles = [links[0].text
+                  for links in (item.find_all("a") for item in items) if links]
+        descriptions = [p.text for p in (item.find("p") for item in items)
+                        if p is not None]
+        # Event ids are the text after the first "=" in each /hendelser link.
+        ids = [int(a["href"].split("=", 1)[-1])
+               for a in event.find_all("a", href=True)
+               if a["href"].startswith("/hendelser")]
+        zips.append(list(zip(ids, times, places, organizers, titles, descriptions)))
+    # Initial [] keeps the fold from raising TypeError when nothing was scraped.
+    events = reduce(add, zips, [])
+