Files
lrc-backend/backend/tools/scrape_rooms.py

61 lines
1.7 KiB
Python

from pprint import pprint
import re
import requests
from bs4 import BeautifulSoup
def scrape_rooms():
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
room_url = "https://campus.kit.edu/live-stud/campus/all/roomgroup.asp?roomgroupcolumn1=H%F6r%2D%2FLehrsaal&tguid=0x1A35C3A1490748388EBEBA3943EFCDD5"
page = requests.get(room_url, headers=headers)
# soup = BeautifulSoup(page.content, 'html5lib')
soup = BeautifulSoup(page.content, 'html.parser')
# pprint(page.content)
# pprint(soup.prettify())
idx = 0
rooms = []
re_string = r"^(\d\d.\d\d)?\s(.*)"
re_exp = re.compile(re_string)
for tr in soup.find_all('tr'):
idx += 1
if idx == 1: # skip first row
continue
a_name = tr.find_all('a')[0].string
a_building = tr.find_all('a')[3].string
match = re_exp.match(a_name)
if match is not None:
building_number, name = re_exp.match(a_name).groups()
else:
name = a_name
building_number = None
match = re_exp.match(a_building)
if match is not None:
building_number, building_name = re_exp.match(a_building).groups()
else:
building_name = a_name
building_number = None
room = {'name': name,
'room_number': tr.find_all('a')[1].string if tr.find_all('a')[0].string != "None" else tr.find_all('a')[
1].string,
'building_name': building_name,
'building_number': building_number}
rooms.append(room)
return rooms
if __name__ == '__main__':
pprint(scrape_rooms())