scraping.py

This commit is contained in:
fredrikr79 2023-08-17 22:52:36 +02:00
parent 10ae419d7f
commit 956013c7ca
3 changed files with 37 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
venv/

View File

@ -1 +1,3 @@
simplematrixbotlib
beautifulsoup4
requests

34
scraping.py Normal file
View File

@ -0,0 +1,34 @@
from bs4 import BeautifulSoup
import requests
from operator import add
from functools import reduce
def parse_event_id(href):
    """Return the integer event id encoded in *href*, or ``None``.

    A link counts as an event link only when it starts with ``/hendelser``
    and contains a ``=``; the id is everything after the first ``=``.

    Args:
        href: The raw ``href`` attribute value of an ``<a>`` tag.

    Returns:
        The id as an ``int``, or ``None`` when the link is not an event
        link (wrong prefix, or no ``=`` to take a value from).

    Raises:
        ValueError: If the text after the first ``=`` is not an integer.
    """
    if not href.startswith("/hendelser"):
        return None
    _, separator, value = href.partition("=")
    if not separator:
        # Without this guard a link such as "/hendelser/" would be handed
        # to int() in full and abort the whole scrape.
        return None
    return int(value)


def parse_event_list(event_list):
    """Extract one tuple per event from a single events ``<ul>`` element.

    Args:
        event_list: A BeautifulSoup tag whose ``<li>`` descendants describe
            events (the script passes each ``<ul class="events">``).

    Returns:
        A list of ``(id, time, place, organizer, title, description)``
        tuples. The columns are collected independently and paired
        positionally, so the result is as long as the shortest column.
        An element without any detail rows yields an empty list.

    Raises:
        ValueError: If the detail rows do not transpose into exactly
            three columns, or an event link carries a non-numeric id.
        AttributeError: If a nested ``<li>`` has no ``<strong>`` child.
    """
    # find_all is recursive, so this holds the outer <li> items as well as
    # the <li> elements nested inside them; the filters below sort them out.
    items = event_list.find_all("li")

    # Only items that themselves contain <li> children contribute a row;
    # each row is the <strong> text of those children.
    detail_rows = [
        [detail.find("strong").text for detail in nested]
        for nested in (item.find_all("li") for item in items)
        if nested
    ]
    if not detail_rows:
        # zip(*[]) yields nothing, which cannot be unpacked into three names.
        return []
    times, places, organizers = zip(*detail_rows)

    # The title is the text of the first link inside each item that has one.
    titles = [
        links[0].text
        for links in (item.find_all("a") for item in items)
        if links
    ]
    descriptions = [
        paragraph.text
        for paragraph in (item.find("p") for item in items)
        if paragraph is not None
    ]

    ids = []
    for link in event_list.find_all("a", href=True):
        event_id = parse_event_id(link["href"])
        if event_id is not None:
            ids.append(event_id)

    return list(zip(ids, times, places, organizers, titles, descriptions))


if __name__ == "__main__":
    # A timeout keeps the script from hanging forever on a dead connection,
    # and raise_for_status stops an HTTP error page from being parsed as if
    # it were the event listing.
    r = requests.get("http://www.pvv.ntnu.no/hendelser/", timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    zips = [parse_event_list(event) for event in soup.find_all("ul", "events")]
    # The [] initial value makes an empty page produce [] instead of the
    # TypeError reduce raises on an empty sequence.
    events = reduce(add, zips, [])