diff --git a/backend/scraper/events_vdh.py b/backend/scraper/events_vdh.py
index afdea43..020908c 100644
--- a/backend/scraper/events_vdh.py
+++ b/backend/scraper/events_vdh.py
@@ -21,21 +21,13 @@ FALLBACK_EVENTS = [
{"titel": "Hundesport-Turnier Berlin", "datum": "2026-09-12", "ort_name": "Berlin", "typ": "wettkampf", "link": "https://www.vdh.de", "external_id": "vdh-fallback-berlin-2026"},
]
-# Mapping VDH-Kategorienamen → interne Typen
+# Keyword → internal event type. _guess_typ() lowercases the title and tests
+# each key as a substring, returning the value of the first key that matches
+# (dict insertion order), so the order of the entries below is significant.
_TYP_MAP = {
- "ausstellung": "ausstellung",
- "show": "ausstellung",
- "siegershow": "ausstellung",
- "agility": "wettkampf",
- "wettkampf": "wettkampf",
- "turnier": "wettkampf",
- "prüfung": "wettkampf",
- "training": "training",
- "treffen": "treffen",
- "markt": "markt",
+ "ausstellung": "ausstellung", "show": "ausstellung", "siegershow": "ausstellung",
+ "agility": "wettkampf", "wettkampf": "wettkampf", "turnier": "wettkampf",
+ "prüfung": "wettkampf", "meisterschaft": "wettkampf",
+ "training": "training", "treffen": "treffen", "markt": "markt",
}
-# Monatsnamen Deutsch → Zahl
_MONATE = {
"januar": 1, "februar": 2, "märz": 3, "maerz": 3,
"april": 4, "mai": 5, "juni": 6, "juli": 7,
@@ -45,171 +37,217 @@ _MONATE = {
def _guess_typ(text: str) -> str:
- """Bestimmt den Event-Typ anhand des Titels."""
+    """Return the internal event type for an event title.
+
+    The title is lowercased and each key of ``_TYP_MAP`` is tested as a
+    substring, in the dict's insertion order; the value of the first
+    matching key is returned.
+
+    Returns:
+        The mapped type, or ``"ausstellung"`` when no keyword occurs in
+        ``text``. (Before this change the fallback was ``"sonstiges"``.)
+    """
t = text.lower()
for keyword, typ in _TYP_MAP.items():
if keyword in t:
return typ
- return "sonstiges"
+ return "ausstellung"
def _parse_date(raw: str) -> str | None:
- """
- Versucht verschiedene Datumsformate zu parsen.
- Gibt YYYY-MM-DD zurück oder None.
- """
+    """Normalise a scraped date string to ``YYYY-MM-DD``.
+
+    Surrounding whitespace is stripped and, for a range written with
+    ``" - "`` between the two dates, only the part before the first
+    separator is kept. The remainder is then matched against, in order:
+
+    * ``YYYY-MM-DD``: returned unchanged,
+    * ``D.M.YYYY`` / ``DD.MM.YYYY``: day and month are zero-padded,
+    * ``D. <month name> YYYY``: the lowercased name is looked up in ``_MONATE``.
+
+    Args:
+        raw: Date text as found on the page.
+
+    Returns:
+        The date as ``YYYY-MM-DD``, or ``None`` if no format matches or the
+        month name is not a key of ``_MONATE``.
+
+    NOTE(review): the patterns only check the digit layout; values such as
+    month 13 or day 32 are passed through without calendar validation.
+    NOTE(review): the former English fallback (``"%B %d, %Y"``) is removed by
+    this change, so strings in that format now yield ``None``.
+    """
raw = raw.strip()
-
+    # Date range "DD.MM.YYYY - DD.MM.YYYY": keep only the first (start) date.
+    # Only the exact separator " - " (space, hyphen, space) is recognised; a
+    # range written with another separator is not split and ends up as None.
+ raw = raw.split(" - ")[0].strip()
# ISO: 2026-05-03
m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw)
if m:
return raw
-
# DD.MM.YYYY oder D.M.YYYY
m = re.match(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$', raw)
if m:
d, mo, y = m.groups()
return f"{y}-{int(mo):02d}-{int(d):02d}"
-
- # DD. Monatsname YYYY (z.B. "14. Juni 2026")
+    # "DD. <month name> YYYY"; the month name is resolved through _MONATE.
m = re.match(r'^(\d{1,2})\.\s*(\w+)\s+(\d{4})$', raw)
if m:
d, mon_str, y = m.groups()
mon_num = _MONATE.get(mon_str.lower())
if mon_num:
return f"{y}-{mon_num:02d}-{int(d):02d}"
-
- # Monatsname DD, YYYY (englisch, Fallback)
- try:
- dt = datetime.strptime(raw, "%B %d, %Y")
- return dt.strftime("%Y-%m-%d")
- except ValueError:
- pass
-
return None
-class _VDHParser(HTMLParser):
- """
- Einfacher Zustandsautomat-Parser für die VDH-Veranstaltungsseite.
- Sucht nach typischen Strukturen: article, li.event, div mit Datums-/Titel-Klassen.
- """
+# ── PARSER 1: /ausstellungen/liste/typ/spezial/ ──────────────────────────────
+# Struktur: div.ausstellung_liste > div.row > div.span6
+# Linke span6: <b>Rassen</b><br>DD.MM.YYYY<br>Verein<br>Straße<br>PLZ Ort
+class _SpezialParser(HTMLParser):
def __init__(self):
super().__init__()
- self._events: list[dict] = []
- self._current: dict | None = None
- self._depth = 0
- self._start_depth = 0
- self._capture = None # 'titel' | 'datum' | 'ort'
- self._buf = ""
- self._in_event = False
-
- # ---------- Hilfsmethoden ----------
-
- def _is_event_container(self, tag, attrs):
- """Erkennt Start eines Event-Blocks."""
- a = dict(attrs)
- cls = a.get("class", "")
- return (
- tag == "article"
- or (tag in ("li", "div") and any(
- kw in cls for kw in ("event", "veranstaltung", "termin", "entry", "item")
- ))
- )
-
- def _is_title_tag(self, tag, attrs):
- a = dict(attrs)
- cls = a.get("class", "")
- return tag in ("h2", "h3", "h4") or any(
- kw in cls for kw in ("title", "titel", "name", "heading")
- )
-
- def _is_date_tag(self, tag, attrs):
- a = dict(attrs)
- cls = a.get("class", "")
- it = a.get("itemprop", "")
- return (
- tag in ("time",)
- or any(kw in cls for kw in ("date", "datum", "time"))
- or it in ("startDate", "endDate")
- )
-
- def _is_location_tag(self, tag, attrs):
- a = dict(attrs)
- cls = a.get("class", "")
- it = a.get("itemprop", "")
- return (
- any(kw in cls for kw in ("location", "ort", "venue", "place", "city"))
- or it in ("location", "addressLocality")
- )
-
- # ---------- SAX-Events ----------
+ self._events = []
+ self._in_liste = False
+ self._row_d = 0 # depth beim row-Start
+ self._span_d = 0 # depth beim span6-Start
+ self._depth = 0
+ self._in_row = False
+ self._in_span = False # linke span6 (erste im row)
+ self._span_done = False # linke span6 fertig geparst
+ self._in_b = False
+ self._buf = ""
+ self._parts: list[str] = [] # Teile zwischen
+ self._title = ""
def handle_starttag(self, tag, attrs):
self._depth += 1
a = dict(attrs)
+ cls = a.get("class", "")
- if not self._in_event and self._is_event_container(tag, attrs):
- self._in_event = True
- self._start_depth = self._depth
- self._current = {"titel": "", "datum": "", "ort_name": "", "link": ""}
- # Direkter Link auf dem Container?
- if tag == "a" and "href" in a:
- self._current["link"] = a["href"]
- return
+ if "ausstellung_liste" in cls:
+ self._in_liste = True
- if self._in_event:
- # Link innerhalb des Event-Blocks
- if tag == "a" and "href" in a and not self._current.get("link"):
- href = a["href"]
- if "vdh.de" in href or href.startswith("/"):
- self._current["link"] = href
+ if self._in_liste and tag == "div" and "row" in cls.split():
+ self._in_row = True
+ self._row_d = self._depth
+ self._span_done = False
+ self._title = ""
+ self._parts = []
- #