"""Scrape the lecture-room list published on campus.kit.edu."""

from pprint import pprint
import re

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with the request.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

_ROOM_URL = "https://campus.kit.edu/live-stud/campus/all/roomgroup.asp?roomgroupcolumn1=H%F6r%2D%2FLehrsaal&tguid=0x1A35C3A1490748388EBEBA3943EFCDD5"

# Without a timeout ``requests`` may block forever on an unresponsive server.
_REQUEST_TIMEOUT_SECONDS = 30

# Optional "NN.NN" prefix, one whitespace character, then the remaining text.
# NOTE(review): the "." is unescaped and therefore matches any character;
# presumably a literal dot was intended -- confirm against real data before
# tightening the pattern.
_NUMBER_PREFIX_RE = re.compile(r"^(\d\d.\d\d)?\s(.*)")

# The code reads links 0 (room name), 1 (room number) and 3 (building).
_MIN_LINKS_PER_ROW = 4


def _split_number_prefix(text):
    """Split ``text`` into ``(number, remainder)`` using the prefix pattern.

    Returns ``(None, text)`` when the pattern does not match, and
    ``(None, None)`` when ``text`` itself is ``None`` (BeautifulSoup's
    ``.string`` is ``None`` for tags without a single string child).
    """
    if text is None:
        return None, None
    match = _NUMBER_PREFIX_RE.match(text)
    if match is None:
        return None, text
    return match.groups()


def scrape_rooms():
    """Download the room list page and return one dict per table row.

    Each dict has the keys ``'name'``, ``'room_number'``,
    ``'building_name'`` and ``'building_number'``. The building number is
    taken from the building link; it is ``None`` when that link's text does
    not start with a number prefix.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    page = requests.get(_ROOM_URL, headers=_HEADERS,
                        timeout=_REQUEST_TIMEOUT_SECONDS)
    # Fail loudly instead of parsing an error page as if it were room data.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    rooms = []
    # The first row is skipped (presumably the table header).
    for tr in soup.find_all('tr')[1:]:
        links = tr.find_all('a')
        if len(links) < _MIN_LINKS_PER_ROW:
            # Not a room row: it lacks the links read below.
            continue

        # Any number prefix on the room name is dropped; only the building
        # link determines 'building_number'.
        _, name = _split_number_prefix(links[0].string)
        # When the building text has no number prefix, the building's own
        # text is kept as its name (it previously fell back to the room name).
        building_number, building_name = _split_number_prefix(links[3].string)

        rooms.append({
            'name': name,
            'room_number': links[1].string,
            'building_name': building_name,
            'building_number': building_number,
        })
    return rooms


if __name__ == '__main__':
    pprint(scrape_rooms())