Fix: VDH-Scraper <br> in handle_starttag statt handle_endtag (void elements)
This commit is contained in:
parent
bff54dcfd3
commit
4e5a13d9e2
1 changed file with 14 additions and 12 deletions
|
|
@@ -112,6 +112,13 @@ class _SpezialParser(HTMLParser):
|
|||
self._in_b = True
|
||||
self._buf = ""
|
||||
|
||||
# <br> ist void — kein handle_endtag, muss hier behandelt werden
|
||||
if self._in_span and tag == "br":
|
||||
part = self._buf.strip()
|
||||
if part and not self._in_b:
|
||||
self._parts.append(part)
|
||||
self._buf = ""
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if self._in_span:
|
||||
if tag == "b" and self._in_b:
|
||||
|
|
@@ -119,12 +126,6 @@ class _SpezialParser(HTMLParser):
|
|||
self._title = self._buf.strip()
|
||||
self._buf = ""
|
||||
|
||||
elif tag == "br":
|
||||
part = self._buf.strip()
|
||||
if part:
|
||||
self._parts.append(part)
|
||||
self._buf = ""
|
||||
|
||||
elif self._depth <= self._span_d:
|
||||
# Ende der linken span6 — auswerten
|
||||
self._in_span = False
|
||||
|
|
@@ -199,6 +200,13 @@ class _SportParser(HTMLParser):
|
|||
if self._in_li and tag == "b":
|
||||
self._in_b = True
|
||||
|
||||
# <br> ist void — kein handle_endtag, muss hier behandelt werden
|
||||
if self._in_li and tag == "br":
|
||||
part = self._buf.strip()
|
||||
if part:
|
||||
self._parts.append(part)
|
||||
self._buf = ""
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag == "h2" and self._in_h2:
|
||||
self._in_h2 = False
|
||||
|
|
@@ -209,12 +217,6 @@ class _SportParser(HTMLParser):
|
|||
if tag == "b":
|
||||
self._in_b = False
|
||||
|
||||
elif tag == "br":
|
||||
part = self._buf.strip()
|
||||
if part:
|
||||
self._parts.append(part)
|
||||
self._buf = ""
|
||||
|
||||
elif tag == "li" and self._depth <= self._li_d:
|
||||
self._in_li = False
|
||||
# parts: [date, title, "Ort: Stadt"] oder ähnlich
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue