diff --git a/backend/scraper/events_vdh.py b/backend/scraper/events_vdh.py
index 585c570..72313af 100644
--- a/backend/scraper/events_vdh.py
+++ b/backend/scraper/events_vdh.py
@@ -7,18 +7,17 @@ Bei Fehler oder 0 Ergebnissen: Fallback auf hartcodierte Events.
import logging
import re
from datetime import datetime
-from html.parser import HTMLParser
import httpx
logger = logging.getLogger(__name__)
# Hard-coded events used as the fallback result; per the module docstring
# they are served when scraping fails or yields zero results.
# Every entry carries the same keys: titel, datum (ISO YYYY-MM-DD), ort_name,
# typ, link and a stable external_id.
FALLBACK_EVENTS = [
    {
        "titel": "VDH-Europasiegershow 2026",
        "datum": "2026-06-14",
        "ort_name": "Dortmund",
        "typ": "ausstellung",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-europasieger-2026",
    },
    {
        "titel": "Internationale Hundeausstellung Frankfurt",
        "datum": "2026-05-03",
        "ort_name": "Frankfurt am Main",
        "typ": "ausstellung",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-frankfurt-2026",
    },
    {
        "titel": "VDH-Bundessiegerprüfung Agility",
        "datum": "2026-07-19",
        "ort_name": "Leipzig",
        "typ": "wettkampf",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-agility-2026",
    },
    {
        "titel": "Rassehundetreffen München",
        "datum": "2026-08-22",
        "ort_name": "München",
        "typ": "treffen",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-muenchen-2026",
    },
    {
        "titel": "Hundesport-Turnier Berlin",
        "datum": "2026-09-12",
        "ort_name": "Berlin",
        "typ": "wettkampf",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-berlin-2026",
    },
]
_TYP_MAP = {
@@ -28,13 +27,6 @@ _TYP_MAP = {
"training": "training", "treffen": "treffen", "markt": "markt",
}
-_MONATE = {
- "januar": 1, "februar": 2, "märz": 3, "maerz": 3,
- "april": 4, "mai": 5, "juni": 6, "juli": 7,
- "august": 8, "september": 9, "oktober": 10,
- "november": 11, "dezember": 12,
-}
-
def _guess_typ(text: str) -> str:
t = text.lower()
@@ -45,215 +37,21 @@ def _guess_typ(text: str) -> str:
def _parse_date(raw: str) -> str | None:
- raw = raw.strip()
- # Datumsbereich: "DD.MM.YYYY - DD.MM.YYYY" → erstes Datum nehmen
- raw = raw.split(" - ")[0].strip()
- # ISO: 2026-05-03
- m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw)
- if m:
- return raw
- # DD.MM.YYYY oder D.M.YYYY
+ """DD.MM.YYYY oder DD.MM.YYYY - DD.MM.YYYY → YYYY-MM-DD (erstes Datum)."""
+ raw = raw.strip().split(" - ")[0].strip()
m = re.match(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$', raw)
if m:
d, mo, y = m.groups()
return f"{y}-{int(mo):02d}-{int(d):02d}"
- # DD. Monatsname YYYY
- m = re.match(r'^(\d{1,2})\.\s*(\w+)\s+(\d{4})$', raw)
+ m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw)
if m:
- d, mon_str, y = m.groups()
- mon_num = _MONATE.get(mon_str.lower())
- if mon_num:
- return f"{y}-{mon_num:02d}-{int(d):02d}"
+ return raw
return None
-# ── PARSER 1: /ausstellungen/liste/typ/spezial/ ──────────────────────────────
-# Struktur: div.ausstellung_liste > div.row > div.span6
-# Linke span6: <b>Rassen</b><br>DD.MM.YYYY<br>Verein<br>Straße<br>PLZ Ort
-
-class _SpezialParser(HTMLParser):
- def __init__(self):
- super().__init__()
- self._events = []
- self._in_liste = False
- self._row_d = 0 # depth beim row-Start
- self._span_d = 0 # depth beim span6-Start
- self._depth = 0
- self._in_row = False
- self._in_span = False # linke span6 (erste im row)
- self._span_done = False # linke span6 fertig geparst
- self._in_b = False
- self._buf = ""
- self._parts: list[str] = [] # Teile zwischen
- self._title = ""
-
- def handle_starttag(self, tag, attrs):
- self._depth += 1
- a = dict(attrs)
- cls = a.get("class", "")
-
- if "ausstellung_liste" in cls:
- self._in_liste = True
-
- if self._in_liste and tag == "div" and "row" in cls.split():
- self._in_row = True
- self._row_d = self._depth
- self._span_done = False
- self._title = ""
- self._parts = []
-
- if self._in_row and not self._span_done and tag == "div" and "span6" in cls.split():
- if not self._in_span:
- self._in_span = True
- self._span_d = self._depth
- self._buf = ""
-
- if self._in_span and tag == "b":
- self._in_b = True
- self._buf = ""
-
- # <br> ist void — kein handle_endtag, muss hier behandelt werden
- if self._in_span and tag == "br":
- part = self._buf.strip()
- if part and not self._in_b:
- self._parts.append(part)
- self._buf = ""
-
- def handle_endtag(self, tag):
- if self._in_span:
- if tag == "b" and self._in_b:
- self._in_b = False
- self._title = self._buf.strip()
- self._buf = ""
-
- elif self._depth <= self._span_d:
- # Ende der linken span6 — auswerten
- self._in_span = False
- self._span_done = True
- date_str, ort = "", ""
- for part in self._parts:
- if not date_str:
- parsed = _parse_date(part)
- if parsed:
- date_str = parsed
- continue
- # PLZ Ort: 5 Ziffern gefolgt von Stadtname
- m = re.match(r'^\d{5}\s+(.+)$', part)
- if m:
- ort = m.group(1).strip()
-
- if self._title and date_str:
- self._events.append({
- "titel": self._title,
- "datum": date_str,
- "ort_name": ort,
- "link": "https://www.vdh.de/ausstellungen/",
- })
-
- if self._in_row and self._depth < self._row_d:
- self._in_row = False
-
- self._depth -= 1
-
- def handle_data(self, data):
- if self._in_b:
- self._buf += data
- elif self._in_span:
- self._buf += data
-
- def get_events(self) -> list[dict]:
- return self._events
-
-
-# ── PARSER 2: /hundesport/termine/ ───────────────────────────────────────────
-# Struktur: h2.h2ash1 (Kategorie) → ul > li
-# li-Text: "DD.MM.YYYY<br>Titel<br>Ort: Stadt<br>"
-
-class _SportParser(HTMLParser):
- def __init__(self):
- super().__init__()
- self._events = []
- self._cat = "" # aktuelle Kategorie (h2.h2ash1)
- self._in_h2 = False
- self._in_li = False
- self._in_b = False
- self._buf = ""
- self._parts: list[str] = []
- self._depth = 0
- self._li_d = 0
-
- def handle_starttag(self, tag, attrs):
- self._depth += 1
- a = dict(attrs)
- cls = a.get("class", "")
-
- if tag == "h2" and "h2ash1" in cls:
- self._in_h2 = True
- self._buf = ""
-
- if tag == "li" and not self._in_li:
- self._in_li = True
- self._li_d = self._depth
- self._parts = []
- self._buf = ""
-
- if self._in_li and tag == "b":
- self._in_b = True
-
- # <br> ist void — kein handle_endtag, muss hier behandelt werden
- if self._in_li and tag == "br":
- part = self._buf.strip()
- if part:
- self._parts.append(part)
- self._buf = ""
-
- def handle_endtag(self, tag):
- if tag == "h2" and self._in_h2:
- self._in_h2 = False
- self._cat = self._buf.strip()
- self._buf = ""
-
- if self._in_li:
- if tag == "b":
- self._in_b = False
-
- elif tag == "li" and self._depth <= self._li_d:
- self._in_li = False
- # parts: [date, title, "Ort: Stadt"] oder ähnlich
- date_str, title, ort = "", "", ""
- for i, part in enumerate(self._parts):
- if not date_str:
- parsed = _parse_date(part.split(" - ")[0].strip())
- if parsed:
- date_str = parsed
- continue
- if not title and date_str:
- # Titel darf nicht mit "Ort:" beginnen
- if not part.lower().startswith("ort:"):
- title = part
- continue
- m = re.match(r'^Ort:\s*(.+)$', part, re.IGNORECASE)
- if m:
- ort = m.group(1).strip()
-
- if title and date_str:
- self._events.append({
- "titel": title,
- "datum": date_str,
- "ort_name": ort,
- "link": "https://www.vdh.de/hundesport/termine/",
- })
-
- self._depth -= 1
-
- def handle_data(self, data):
- if self._in_h2:
- self._buf += data
- elif self._in_li:
- self._buf += data
-
- def get_events(self) -> list[dict]:
- return self._events
+def _strip_tags(html: str) -> str:
+ """Entfernt HTML-Tags."""
+ return re.sub(r'<[^>]+>', '', html).strip()
def _build_external_id(ev: dict) -> str:
@@ -262,23 +60,125 @@ def _build_external_id(ev: dict) -> str:
return key[:120]
+# ── PARSER 1: /ausstellungen/liste/typ/spezial/ ──────────────────────────────
+# Struktur innerhalb div.ausstellung_liste:
+#