61 lines
1.7 KiB
Python
61 lines
1.7 KiB
Python
from pprint import pprint
|
|
|
|
import re
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
# Matches link texts of the form "<building number> <text>". The building
# number ("dd.dd") is optional; the whitespace separating it from the text is
# required. The dot is escaped so only a literal "." is accepted as separator.
_NUMBERED_LABEL_RE = re.compile(r"^(\d\d\.\d\d)?\s(.*)")


def _split_numbered_label(label):
    """Split a link text into ``(building_number, remaining_text)``.

    Returns ``(None, label)`` when *label* does not match the numbered-label
    pattern, and ``(None, None)`` when *label* is ``None`` (BeautifulSoup's
    ``.string`` is ``None`` for tags that do not wrap exactly one string).
    """
    if label is None:
        return None, None
    match = _NUMBERED_LABEL_RE.match(label)
    if match is None:
        return None, label
    return match.groups()


def scrape_rooms(timeout=30):
    """Scrape the room table from campus.kit.edu and return it as dicts.

    The page requested is the room-group listing selected by the hard-coded
    URL below (``roomgroupcolumn1=H%F6r%2D%2FLehrsaal``). Every ``<tr>`` after
    the first one is treated as a room row; within a row the first link is
    read as the room name, the second as the room number and the fourth as
    the building.

    Args:
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A list of dicts with the keys ``'name'``, ``'room_number'``,
        ``'building_name'`` and ``'building_number'``. ``'building_number'``
        is taken from the building link and falls back to the number prefixed
        to the room name; it is ``None`` when neither carries one.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-2xx HTTP status.
    """
    # NOTE(review): a browser-like User-Agent is sent explicitly; presumably
    # the server rejects the default one -- confirm before removing.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    room_url = "https://campus.kit.edu/live-stud/campus/all/roomgroup.asp?roomgroupcolumn1=H%F6r%2D%2FLehrsaal&tguid=0x1A35C3A1490748388EBEBA3943EFCDD5"

    # Without a timeout requests may block forever on an unresponsive server.
    page = requests.get(room_url, headers=headers, timeout=timeout)
    # Fail loudly instead of parsing an HTTP error page as if it were the table.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    rooms = []
    # The first row is skipped unconditionally (it is not a room row).
    for row in soup.find_all('tr')[1:]:
        links = row.find_all('a')
        if len(links) < 4:
            # Not a room row (links 0, 1 and 3 are required); skip it rather
            # than abort the whole scrape with an IndexError.
            continue

        number_from_name, name = _split_numbered_label(links[0].string)
        number_from_building, building_name = _split_numbered_label(
            links[3].string)

        rooms.append({
            'name': name,
            'room_number': links[1].string,
            'building_name': building_name,
            # Prefer the number attached to the building itself, but keep the
            # one parsed from the room name when the building has none.
            'building_number': number_from_building or number_from_name,
        })

    return rooms
|
|
|
|
|
|
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported. The returned list is not used here.
if __name__ == "__main__":
    scrape_rooms()
|