Проблема
Я пытаюсь разобрать таблицу HTML со строками в ней, как и в, я пытаюсь разобрать расписание моего колледжа.
Я столкнулся с проблемой, когда, если последняя строка содержит строку rowspan, в следующей строке отсутствует TD, где теперь строка rowspan отсутствует в TD.
У меня нет подсказки, как объяснить это, и я надеюсь, что смогу разобрать это расписание.
Что я пробовал
Довольно многое, о чем я могу думать.
Результат:
[
{
'blok_eind': 4,
'blok_start': 3,
'dag': 4, # Should be 5
'leraar': 'DOODF000',
'lokaal': 'ALK C212',
'vak': 'PROJ-T',
},
]
Как вы можете видеть, есть ключ vak
со значением PROJ-T
в вышеприведенном фрагменте вывода, dag
- 4
, в то время как он должен быть 5
(aka Friday/Vrijdag), как видно здесь:
В результате я хочу
Python dict(), который выглядит как выше, но с правильным значением
Где:
-
day
/dag
является int от 1 ~ 5, представляющим понедельник-пятницу -
block_start
/blok_start
- это int, который представляет, когда начинается курс (блок времени, левая сторона таблицы) -
block_end
/blok_eind
- это int, которые представляют, в каком блоке заканчивается курс -
classroom
/lokaal
- это код класса, курс которого находится в -
teacher
/leraar
- идентификатор учителя -
course
/vak
- это идентификатор курса
Основная структура HTML для данных выше
<center>
<table>
<tr>
<td>
<table>
<tbody>
<tr>
<td>
<font>
TEACHER-ID
</font>
</td>
<td>
<font>
<b>
CLASSROOM ID
</b>
</font>
</td>
</tr>
<tr>
<td>
<font>
COURSE ID
</font>
</td>
</tr>
</tbody>
</table>
</td>
</tr>
</table>
</center>
Код
HTML
<CENTER><font size="3" face="Arial" color="#000000">
<BR></font>
<font size="6" face="Arial" color="#0000FF">
16AO4EIO1B
</font> <font size="4" face="Arial">
IO1B
</font>
<BR>
<TABLE border="3" rules="all" cellpadding="1" cellspacing="1">
<TR>
<TD align="center">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 align="center" nowrap="1">
<TABLE>
<TR>
<TD align="center" nowrap=1><font size="2" face="Arial" color="#000000">
Maandag 29-08
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 align="center" nowrap="1">
<TABLE>
<TR>
<TD align="center" nowrap=1><font size="2" face="Arial">
Dinsdag 30-08
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 align="center" nowrap="1">
<TABLE>
<TR>
<TD align="center" nowrap=1><font size="2" face="Arial">
Woensdag 31-08
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 align="center" nowrap="1">
<TABLE>
<TR>
<TD align="center" nowrap=1><font size="2" face="Arial">
Donderdag 01-09
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 align="center" nowrap="1">
<TABLE>
<TR>
<TD align="center" nowrap=1><font size="2" face="Arial">
Vrijdag 02-09
</font> </TD>
</TR>
</TABLE>
</TD>
</TR>
<TR>
<TD rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD align="center" rowspan="2" nowrap=1><font size="3" face="Arial">
<B>1</B>
</font> </TD>
<TD align="center" nowrap=1><font size="2" face="Arial">
8:30
</font> </TD>
</TR>
<TR>
<TD align="center" nowrap=1><font size="2" face="Arial">
9:20
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=4 align="center" nowrap="1">
<TABLE>
<TR>
<TD width="50%" nowrap=1><font size="2" face="Arial">
BLEEJ002
</font> </TD>
<TD width="50%" nowrap=1><font size="2" face="Arial">
<B>ALK B021</B>
</font> </TD>
</TR>
<TR>
<TD colspan="2" width="50%" nowrap=1><font size="2" face="Arial">
WEBD
</font> </TD>
</TR>
</TABLE>
</TD>
</TR>
<TR>
</TR>
<TR>
<TD rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD align="center" rowspan="2" nowrap=1><font size="3" face="Arial">
<B>2</B>
</font> </TD>
<TD align="center" nowrap=1><font size="2" face="Arial">
9:20
</font> </TD>
</TR>
<TR>
<TD align="center" nowrap=1><font size="2" face="Arial">
10:10
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=4 align="center" nowrap="1">
<TABLE>
<TR>
<TD width="50%" nowrap=1><font size="2" face="Arial">
BLEEJ002
</font> </TD>
<TD width="50%" nowrap=1><font size="2" face="Arial">
<B>ALK B021B</B>
</font> </TD>
</TR>
<TR>
<TD colspan="2" width="50%" nowrap=1><font size="2" face="Arial">
WEBD
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
</TR>
<TR>
</TR>
<TR>
<TD rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD align="center" rowspan="2" nowrap=1><font size="3" face="Arial">
<B>3</B>
</font> </TD>
<TD align="center" nowrap=1><font size="2" face="Arial">
10:25
</font> </TD>
</TR>
<TR>
<TD align="center" nowrap=1><font size="2" face="Arial">
11:15
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=4 align="center" nowrap="1">
<TABLE>
<TR>
<TD width="50%" nowrap=1><font size="2" face="Arial">
DOODF000
</font> </TD>
<TD width="50%" nowrap=1><font size="2" face="Arial">
<B>ALK C212</B>
</font> </TD>
</TR>
<TR>
<TD colspan="2" width="50%" nowrap=1><font size="2" face="Arial">
PROJ-T
</font> </TD>
</TR>
</TABLE>
</TD>
</TR>
<TR>
</TR>
<TR>
<TD rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD align="center" rowspan="2" nowrap=1><font size="3" face="Arial">
<B>4</B>
</font> </TD>
<TD align="center" nowrap=1><font size="2" face="Arial">
11:15
</font> </TD>
</TR>
<TR>
<TD align="center" nowrap=1><font size="2" face="Arial">
12:05
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=4 align="center" nowrap="1">
<TABLE>
<TR>
<TD width="50%" nowrap=1><font size="2" face="Arial">
BLEEJ002
</font> </TD>
<TD width="50%" nowrap=1><font size="2" face="Arial">
<B>ALK B021B</B>
</font> </TD>
</TR>
<TR>
<TD colspan="2" width="50%" nowrap=1><font size="2" face="Arial">
MENT
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
</TR>
<TR>
</TR>
<TR>
<TD rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD align="center" rowspan="2" nowrap=1><font size="3" face="Arial">
<B>5</B>
</font> </TD>
<TD align="center" nowrap=1><font size="2" face="Arial">
12:05
</font> </TD>
</TR>
<TR>
<TD align="center" nowrap=1><font size="2" face="Arial">
12:55
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
</TR>
<TR>
</TR>
<TR>
<TD rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD align="center" rowspan="2" nowrap=1><font size="3" face="Arial">
<B>6</B>
</font> </TD>
<TD align="center" nowrap=1><font size="2" face="Arial">
12:55
</font> </TD>
</TR>
<TR>
<TD align="center" nowrap=1><font size="2" face="Arial">
13:45
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=4 align="center" nowrap="1">
<TABLE>
<TR>
<TD width="50%" nowrap=1><font size="2" face="Arial">
JONGJ003
</font> </TD>
<TD width="50%" nowrap=1><font size="2" face="Arial">
<B>ALK B008</B>
</font> </TD>
</TR>
<TR>
<TD colspan="2" width="50%" nowrap=1><font size="2" face="Arial">
BURG
</font> </TD>
</TR>
</TABLE>
</TD>
</TR>
<TR>
</TR>
<TR>
<TD rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD align="center" rowspan="2" nowrap=1><font size="3" face="Arial">
<B>7</B>
</font> </TD>
<TD align="center" nowrap=1><font size="2" face="Arial">
13:45
</font> </TD>
</TR>
<TR>
<TD align="center" nowrap=1><font size="2" face="Arial">
14:35
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=4 align="center" nowrap="1">
<TABLE>
<TR>
<TD width="50%" nowrap=1><font size="2" face="Arial">
FLUIP000
</font> </TD>
<TD width="50%" nowrap=1><font size="2" face="Arial">
<B>ALK B004</B>
</font> </TD>
</TR>
<TR>
<TD colspan="2" width="50%" nowrap=1><font size="2" face="Arial">
ICT algemeen Prakti
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
</TR>
<TR>
</TR>
<TR>
<TD rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD align="center" rowspan="2" nowrap=1><font size="3" face="Arial">
<B>8</B>
</font> </TD>
<TD align="center" nowrap=1><font size="2" face="Arial">
14:50
</font> </TD>
</TR>
<TR>
<TD align="center" nowrap=1><font size="2" face="Arial">
15:40
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=4 align="center" nowrap="1">
<TABLE>
<TR>
<TD width="50%" nowrap=1><font size="2" face="Arial">
KOOLE000
</font> </TD>
<TD width="50%" nowrap=1><font size="2" face="Arial">
<B>ALK B008</B>
</font> </TD>
</TR>
<TR>
<TD colspan="2" width="50%" nowrap=1><font size="2" face="Arial">
NED
</font> </TD>
</TR>
</TABLE>
</TD>
</TR>
<TR>
</TR>
<TR>
<TD rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD align="center" rowspan="2" nowrap=1><font size="3" face="Arial">
<B>9</B>
</font> </TD>
<TD align="center" nowrap=1><font size="2" face="Arial">
15:40
</font> </TD>
</TR>
<TR>
<TD align="center" nowrap=1><font size="2" face="Arial">
16:30
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
</TR>
<TR>
</TR>
<TR>
<TD rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD align="center" rowspan="2" nowrap=1><font size="3" face="Arial">
<B>10</B>
</font> </TD>
<TD align="center" nowrap=1><font size="2" face="Arial">
16:30
</font> </TD>
</TR>
<TR>
<TD align="center" nowrap=1><font size="2" face="Arial">
17:20
</font> </TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
<TD colspan=12 rowspan=2 align="center" nowrap="1">
<TABLE>
<TR>
<TD></TD>
</TR>
</TABLE>
</TD>
</TR>
<TR>
</TR>
</TABLE>
<TABLE cellspacing="1" cellpadding="1">
<TR>
<TD valign=bottom> <font size="4" face="Arial" color="#0000FF"></TR></TABLE><font size="3" face="Arial">
Periode1 29-08-2016 (35) - 04-09-2016 (35) G r u b e r & P e t t e r s S o f t w a r e
</font></CENTER>
Python
from pprint import pprint
from bs4 import BeautifulSoup
import requests
r = requests.get("http://rooster.horizoncollege.nl/rstr/ECO/AMR/400-ECO/Roosters/36"
"/c/c00025.htm")
daytable = {
1: "Maandag",
2: "Dinsdag",
3: "Woensdag",
4: "Donderdag",
5: "Vrijdag"
}
timetable = {
1: ("8:30", "9:20"),
2: ("9:20", "10:10"),
3: ("10:25", "11:15"),
4: ("11:15", "12:05"),
5: ("12:05", "12:55"),
6: ("12:55", "13:45"),
7: ("13:45", "14:35"),
8: ("14:50", "15:40"),
9: ("15:40", "16:30"),
10: ("16:30", "17:20"),
}
page = BeautifulSoup(r.content, "lxml")
roster = []
big_rows = 2
last_row_big = False
# There are 10 blocks, each made up out of 2 TR's, run through them
for block_count in range(2, 22, 2):
# There are 5 days, first column is not data we want
for day in range(2, 7):
dayroster = {
"dag": 0,
"blok_start": 0,
"blok_eind": 0,
"lokaal": "",
"leraar": "",
"vak": ""
}
# This selector provides the classroom
table_bold = page.select(
"html > body > center > table > tr:nth-of-type(" + str(block_count) + ") > td:nth-of-type(" + str(
day) + ") > table > tr > td > font > b")
# This selector provides the teacher code and the course ID
table = page.select(
"html > body > center > table > tr:nth-of-type(" + str(block_count) + ") > td:nth-of-type(" + str(
day) + ") > table > tr > td > font")
# This gets the rowspan on the current row and column
rowspan = page.select(
"html > body > center > table > tr:nth-of-type(" + str(block_count) + ") > td:nth-of-type(" + str(
day) + ")")
try:
if table or table_bold and rowspan[0].attrs.get("rowspan") == "4":
last_row_big = True
# Setting end of class
dayroster["blok_eind"] = (block_count // 2) + 1
else:
last_row_big = False
# Setting end of class
dayroster["blok_eind"] = (block_count // 2)
except IndexError:
pass
if table_bold:
x = table_bold[0]
# Classroom ID
dayroster["lokaal"] = x.contents[0]
if table:
iter = 0
for x in table:
content = x.contents[0].lstrip("\r\n").rstrip("\r\n")
# Cell has data
if content != "":
# Set start of class
dayroster["blok_start"] = block_count // 2
# Set day of class
dayroster["dag"] = day - 1
if iter == 0:
# Teacher ID
dayroster["leraar"] = content
elif iter == 1:
# Course ID
dayroster["vak"] = content
iter += 1
if table or table_bold:
# Store the data
roster.append(dayroster)
# Remove duplicates
seen = set()
new_l = []
for d in roster:
t = tuple(d.items())
if t not in seen:
seen.add(t)
new_l.append(d)
pprint(new_l)