Fix: VDH-Scraper <br> in handle_starttag statt handle_endtag (void elements)

This commit is contained in:
rene 2026-05-08 13:38:13 +02:00
parent bff54dcfd3
commit 4e5a13d9e2

View file

@@ -112,6 +112,13 @@ class _SpezialParser(HTMLParser):
self._in_b = True self._in_b = True
self._buf = "" self._buf = ""
# <br> ist void — kein handle_endtag, muss hier behandelt werden
if self._in_span and tag == "br":
part = self._buf.strip()
if part and not self._in_b:
self._parts.append(part)
self._buf = ""
def handle_endtag(self, tag): def handle_endtag(self, tag):
if self._in_span: if self._in_span:
if tag == "b" and self._in_b: if tag == "b" and self._in_b:
@@ -119,12 +126,6 @@ class _SpezialParser(HTMLParser):
self._title = self._buf.strip() self._title = self._buf.strip()
self._buf = "" self._buf = ""
elif tag == "br":
part = self._buf.strip()
if part:
self._parts.append(part)
self._buf = ""
elif self._depth <= self._span_d: elif self._depth <= self._span_d:
# Ende der linken span6 — auswerten # Ende der linken span6 — auswerten
self._in_span = False self._in_span = False
@@ -199,6 +200,13 @@ class _SportParser(HTMLParser):
if self._in_li and tag == "b": if self._in_li and tag == "b":
self._in_b = True self._in_b = True
# <br> ist void — kein handle_endtag, muss hier behandelt werden
if self._in_li and tag == "br":
part = self._buf.strip()
if part:
self._parts.append(part)
self._buf = ""
def handle_endtag(self, tag): def handle_endtag(self, tag):
if tag == "h2" and self._in_h2: if tag == "h2" and self._in_h2:
self._in_h2 = False self._in_h2 = False
@@ -209,12 +217,6 @@ class _SportParser(HTMLParser):
if tag == "b": if tag == "b":
self._in_b = False self._in_b = False
elif tag == "br":
part = self._buf.strip()
if part:
self._parts.append(part)
self._buf = ""
elif tag == "li" and self._depth <= self._li_d: elif tag == "li" and self._depth <= self._li_d:
self._in_li = False self._in_li = False
# parts: [date, title, "Ort: Stadt"] oder ähnlich # parts: [date, title, "Ort: Stadt"] oder ähnlich