scraping.py
This commit is contained in:
parent
10ae419d7f
commit
956013c7ca
|
@ -0,0 +1 @@
|
||||||
|
venv/
|
|
@ -1 +1,3 @@
|
||||||
simplematrixbotlib
|
simplematrixbotlib
|
||||||
|
beautifulsoup4
|
||||||
|
requests
|
||||||
|
|
|
@ -0,0 +1,34 @@
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
from operator import add
|
||||||
|
from functools import reduce
|
||||||
|
|
||||||
|
|
||||||
|
def parse_event_id(href):
    """Return the integer that follows the first ``=`` in *href*.

    Args:
        href: The ``href`` attribute of an event link, e.g. a URL whose
            query string ends in ``id=<number>``.

    Returns:
        The text after the first ``=`` converted to ``int``.

    Raises:
        ValueError: If the text after the first ``=`` is not an integer.
    """
    return int(href[href.find("=") + 1:])


def parse_event_list(event_list):
    """Extract the events contained in one ``<ul class="events">`` element.

    The fields are collected column by column from the list's ``<li>``
    descendants and then paired up by position, so the result is as long
    as the shortest column.

    Args:
        event_list: A BeautifulSoup tag for one event list.

    Returns:
        A list of ``(id, time, place, organizer, title, description)``
        tuples; empty when the list holds no event entries.
    """
    # find_all is recursive, so this holds both the event entries and the
    # detail items nested inside them.
    items = event_list.find_all("li")

    # Only event entries have nested <li> children; each nested item's
    # <strong> text is one detail field of that event.
    detail_rows = [
        [detail.find("strong").text for detail in nested]
        for nested in (item.find_all("li") for item in items)
        if nested
    ]
    if not detail_rows:
        # Transposing an empty list cannot be unpacked into three columns.
        return []
    times, places, organizers = zip(*detail_rows)

    # The title is the text of the first link inside each item that has one.
    titles = [
        links[0].text
        for links in (item.find_all("a") for item in items)
        if links
    ]
    descriptions = [
        paragraph.text
        for paragraph in (item.find("p") for item in items)
        if paragraph is not None
    ]
    ids = [
        parse_event_id(link["href"])
        for link in event_list.find_all("a", href=True)
        if link["href"].startswith("/hendelser")
    ]

    return list(zip(ids, times, places, organizers, titles, descriptions))


if __name__ == "__main__":
    # A timeout keeps the script from hanging forever on an unresponsive
    # server, and raise_for_status avoids parsing an HTTP error page.
    r = requests.get("http://www.pvv.ntnu.no/hendelser/", timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # One list of event tuples per <ul class="events"> found on the page.
    zips = [parse_event_list(event) for event in soup.find_all("ul", "events")]

    # The [] initializer keeps reduce from raising when no list was found.
    events = reduce(add, zips, [])
|
||||||
|
|
Loading…
Reference in New Issue