"""
BAN YARO — VDH event scraper.

Scrapes dog events from vdh.de.
On error or when no events are found, falls back to hard-coded events.
"""
import logging
import re
from datetime import datetime
from html.parser import HTMLParser

import httpx

logger = logging.getLogger(__name__)

# Events returned when scraping fails or yields nothing usable.
FALLBACK_EVENTS = [
    {
        "titel": "VDH-Europasiegershow 2026",
        "datum": "2026-06-14",
        "ort_name": "Dortmund",
        "typ": "ausstellung",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-europasieger-2026",
    },
    {
        "titel": "Internationale Hundeausstellung Frankfurt",
        "datum": "2026-05-03",
        "ort_name": "Frankfurt am Main",
        "typ": "ausstellung",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-frankfurt-2026",
    },
    {
        "titel": "VDH-Bundessiegerprüfung Agility",
        "datum": "2026-07-19",
        "ort_name": "Leipzig",
        "typ": "wettkampf",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-agility-2026",
    },
    {
        "titel": "Rassehundetreffen München",
        "datum": "2026-08-22",
        "ort_name": "München",
        "typ": "treffen",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-muenchen-2026",
    },
    {
        "titel": "Hundesport-Turnier Berlin",
        "datum": "2026-09-12",
        "ort_name": "Berlin",
        "typ": "wettkampf",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-berlin-2026",
    },
]

# Keyword (matched as a lowercase substring) → normalized event type.
# Insertion order matters: the first keyword found in the text wins.
_TYP_MAP = {
    "ausstellung": "ausstellung",
    "show": "ausstellung",
    "siegershow": "ausstellung",
    "agility": "wettkampf",
    "wettkampf": "wettkampf",
    "turnier": "wettkampf",
    "prüfung": "wettkampf",
    "meisterschaft": "wettkampf",
    "training": "training",
    "treffen": "treffen",
    "markt": "markt",
}

# German month names (lowercase) → month number.
_MONATE = {
    "januar": 1,
    "februar": 2,
    "märz": 3,
    "maerz": 3,
    "april": 4,
    "mai": 5,
    "juni": 6,
    "juli": 7,
    "august": 8,
    "september": 9,
    "oktober": 10,
    "november": 11,
    "dezember": 12,
}

# Separator between the two ends of a date range, e.g. "03.05.2026 - 04.05.2026".
# Whitespace is required on both sides so the hyphens of ISO dates are untouched.
_RANGE_SEP_RE = re.compile(r"\s+(?:-|–|—|bis)\s+", re.IGNORECASE)
_ISO_DATE_RE = re.compile(r'^(\d{4})-(\d{2})-(\d{2})$')
_NUMERIC_DATE_RE = re.compile(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$')
_NAMED_MONTH_DATE_RE = re.compile(r'^(\d{1,2})\.\s*(\w+)\s+(\d{4})$')


def _guess_typ(text: str) -> str:
    """Return the event type for *text* based on the keywords in ``_TYP_MAP``.

    The text is lowercased and the keywords are tried in map order; the type
    of the first keyword contained in the text is returned. Without any match
    the type defaults to ``"ausstellung"``.
    """
    lowered = text.lower()
    for keyword, typ in _TYP_MAP.items():
        if keyword in lowered:
            return typ
    return "ausstellung"


def _to_iso(year, month, day) -> str | None:
    """Return ``YYYY-MM-DD`` for the given parts, or None if no such day exists."""
    try:
        return datetime(int(year), int(month), int(day)).date().isoformat()
    except ValueError:
        # e.g. 31.02. or month 13 — not a real calendar date.
        return None


def _parse_date(raw: str) -> str | None:
    """Parse a German or ISO date string into ``YYYY-MM-DD``.

    Accepted forms (surrounding whitespace is ignored):

    * ``YYYY-MM-DD``
    * ``DD.MM.YYYY`` / ``D.M.YYYY``
    * ``DD. Monthname YYYY`` with a German month name from ``_MONATE``

    For a date range ("DD.MM.YYYY - DD.MM.YYYY", also with an en/em dash or
    "bis" as separator) the first date is used.

    Returns:
        The ISO date string, or None if *raw* matches none of the forms or
        does not denote an existing calendar day.
    """
    # Date range → keep only the start date.
    first = _RANGE_SEP_RE.split(raw.strip(), maxsplit=1)[0].strip()

    # ISO: 2026-05-03
    m = _ISO_DATE_RE.match(first)
    if m:
        return _to_iso(*m.groups())

    # DD.MM.YYYY or D.M.YYYY
    m = _NUMERIC_DATE_RE.match(first)
    if m:
        day, month, year = m.groups()
        return _to_iso(year, month, day)

    # DD. Monthname YYYY
    m = _NAMED_MONTH_DATE_RE.match(first)
    if m:
        day, month_name, year = m.groups()
        month = _MONATE.get(month_name.lower())
        if month:
            return _to_iso(year, month, day)

    return None


# ── PARSER 1: /ausstellungen/liste/typ/spezial/ ──────────────────────────────
# Structure: div.ausstellung_liste > div.row > div.span6
# Left span6: <b>breeds</b><br>DD.MM.YYYY<br>club<br>street<br>postcode city
class _SpezialParser(HTMLParser):
    """Extract events from the specialty-show list page.

    Only the first ``div.span6`` of every ``div.row`` that appears after an
    element whose class contains ``ausstellung_liste`` is evaluated. Inside
    that cell the text of ``<b>`` becomes the title and the remaining text is
    split into parts at every ``<br>``. The first part that parses as a date
    is the event date; a part of the form "<5 digits> <city>" supplies the
    city. An event is recorded only when both a title and a date were found.
    """

    # Elements without an end tag. They must not change the nesting depth,
    # otherwise a plain "<br>" would keep the surrounding cell open forever.
    _VOID_TAGS = frozenset({
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    })
    # Postcode + city: five digits followed by the city name.
    _PLZ_ORT_RE = re.compile(r'^\d{5}\s+(.+)$')

    def __init__(self):
        super().__init__()
        self._events: list[dict] = []
        self._in_liste = False
        self._row_d = 0          # depth at which the current row started
        self._span_d = 0         # depth at which the current span6 started
        self._depth = 0          # number of currently open non-void elements
        self._in_row = False
        self._in_span = False    # inside the left span6 (first one in the row)
        self._span_done = False  # left span6 of the current row already parsed
        self._in_b = False
        self._buf = ""
        self._parts: list[str] = []  # text segments between <br> tags
        self._title = ""

    def _flush_part(self) -> None:
        """Move the buffered text (whitespace collapsed) into the parts list."""
        part = " ".join(self._buf.split())
        if part:
            self._parts.append(part)
        self._buf = ""

    def _finish_span(self) -> None:
        """Evaluate the collected cell content and record an event if complete."""
        # The last segment (postcode + city) is not followed by a <br>.
        self._flush_part()
        self._in_span = False
        self._in_b = False
        self._span_done = True

        date_str, ort = "", ""
        for part in self._parts:
            if not date_str:
                parsed = _parse_date(part)
                if parsed:
                    date_str = parsed
                    continue
            m = self._PLZ_ORT_RE.match(part)
            if m:
                ort = m.group(1).strip()

        if self._title and date_str:
            self._events.append({
                "titel": self._title,
                "datum": date_str,
                "ort_name": ort,
                "link": "https://www.vdh.de/ausstellungen/",
            })

    def handle_starttag(self, tag, attrs):
        if tag in self._VOID_TAGS:
            # "<br>" and "<br/>" both arrive here; either one ends a segment.
            if tag == "br" and self._in_span:
                self._flush_part()
            return

        self._depth += 1
        # A valueless attribute (<div class>) is reported as None.
        cls = dict(attrs).get("class") or ""
        classes = cls.split()

        if "ausstellung_liste" in cls:
            self._in_liste = True

        if self._in_liste and tag == "div" and "row" in classes:
            self._in_row = True
            self._row_d = self._depth
            self._span_done = False
            self._title = ""
            self._parts = []

        if (
            self._in_row
            and not self._span_done
            and not self._in_span
            and tag == "div"
            and "span6" in classes
        ):
            self._in_span = True
            self._span_d = self._depth
            self._buf = ""

        if self._in_span and tag == "b":
            self._in_b = True
            self._buf = ""

    def handle_endtag(self, tag):
        if tag in self._VOID_TAGS:
            # Already handled in handle_starttag; never counted in the depth.
            return

        if self._in_span:
            if tag == "b" and self._in_b:
                self._in_b = False
                self._title = " ".join(self._buf.split())
                self._buf = ""
            elif self._depth <= self._span_d:
                # End of the left span6 — evaluate it.
                self._finish_span()

        if self._in_row and self._depth < self._row_d:
            self._in_row = False
        self._depth -= 1

    def handle_data(self, data):
        # _in_b is only ever set while inside the span, so one check suffices.
        if self._in_span:
            self._buf += data

    def get_events(self) -> list[dict]:
        """Return the events collected so far (titel, datum, ort_name, link)."""
        return self._events


# ── PARSER 2: /hundesport/termine/ ───────────────────────────────────────────
# Structure: h2.h2ash1 (category) → ul > li
# li content: "DD.MM.YYYY<br>title<br>Ort: city<br>"
class _SportParser(HTMLParser):
    """Extract events from the dog-sport dates page.

    The text of every ``h2`` whose class contains ``h2ash1`` is remembered as
    the current category. The text of every ``<li>`` is split into parts at
    each ``<br>``: the first part that parses as a date is the event date, the
    first following part that does not start with "Ort:" is the title, and a
    part of the form "Ort: <city>" supplies the city. An event is recorded
    only when both a title and a date were found.
    """

    # Elements without an end tag. They must not change the nesting depth,
    # otherwise a plain "<br>" would keep the surrounding <li> open forever.
    _VOID_TAGS = frozenset({
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    })
    _ORT_RE = re.compile(r'^Ort:\s*(.+)$', re.IGNORECASE)

    def __init__(self):
        super().__init__()
        self._events: list[dict] = []
        self._cat = ""        # current category (h2.h2ash1)
        self._in_h2 = False
        self._in_li = False
        self._buf = ""
        self._parts: list[str] = []  # text segments between <br> tags
        self._depth = 0       # number of currently open non-void elements
        self._li_d = 0        # depth at which the current <li> started

    def _flush_part(self) -> None:
        """Move the buffered text (whitespace collapsed) into the parts list."""
        part = " ".join(self._buf.split())
        if part:
            self._parts.append(part)
        self._buf = ""

    def _finish_item(self) -> None:
        """Evaluate the parts of the finished <li> and record an event if complete."""
        # Keep a last segment that is not terminated by a <br>.
        self._flush_part()
        self._in_li = False

        # parts: [date, title, "Ort: city"] or similar
        date_str, title, ort = "", "", ""
        for part in self._parts:
            if not date_str:
                parsed = _parse_date(part)
                if parsed:
                    date_str = parsed
                    continue
            # The title must not start with "Ort:".
            if date_str and not title and not part.lower().startswith("ort:"):
                title = part
                continue
            m = self._ORT_RE.match(part)
            if m:
                ort = m.group(1).strip()

        if title and date_str:
            self._events.append({
                "titel": title,
                "datum": date_str,
                "ort_name": ort,
                "link": "https://www.vdh.de/hundesport/termine/",
                "kategorie": self._cat,
            })

    def handle_starttag(self, tag, attrs):
        if tag in self._VOID_TAGS:
            # "<br>" and "<br/>" both arrive here; either one ends a segment.
            if tag == "br" and self._in_li:
                self._flush_part()
            return

        self._depth += 1
        # A valueless attribute (<h2 class>) is reported as None.
        cls = dict(attrs).get("class") or ""

        if tag == "h2" and "h2ash1" in cls:
            self._in_h2 = True
            self._buf = ""

        # Nested <li> elements are treated as part of the outer item.
        if tag == "li" and not self._in_li:
            self._in_li = True
            self._li_d = self._depth
            self._parts = []
            self._buf = ""

    def handle_endtag(self, tag):
        if tag in self._VOID_TAGS:
            # Already handled in handle_starttag; never counted in the depth.
            return

        if tag == "h2" and self._in_h2:
            self._in_h2 = False
            self._cat = " ".join(self._buf.split())
            self._buf = ""

        if self._in_li and tag == "li" and self._depth <= self._li_d:
            self._finish_item()

        self._depth -= 1

    def handle_data(self, data):
        if self._in_h2 or self._in_li:
            self._buf += data

    def get_events(self) -> list[dict]:
        """Return the events collected so far (titel, datum, ort_name, link, kategorie)."""
        return self._events


def _build_external_id(ev: dict) -> str:
    """Build a stable identifier from the event's ``datum`` and ``titel``.

    The string "vdh-<datum>-<titel>" is lowercased, every run of characters
    other than a-z and 0-9 is replaced by a single hyphen, leading and
    trailing hyphens are removed, and the result is cut to 120 characters.
    """
    raw = f"vdh-{ev['datum']}-{ev['titel']}"
    key = re.sub(r'[^a-z0-9]+', '-', raw.lower()).strip('-')
    return key[:120]


async def fetch_vdh_events() -> list[dict]:
    """
    Scrape VDH events from ausstellungen/liste and hundesport/termine.

    Each source is fetched and parsed independently; a failure of one source
    is logged and does not prevent the other from being used. Events dated
    before today, events with a title shorter than three characters and
    duplicates (same external_id) are dropped.

    Returns:
        A list of dicts {titel, datum, ort_name, typ, link, external_id}.
        On error or when no upcoming events remain, copies of
        FALLBACK_EVENTS are returned instead.
    """
    sources = [
        ("https://www.vdh.de/ausstellungen/liste/typ/spezial/", _SpezialParser),
        ("https://www.vdh.de/hundesport/termine/", _SportParser),
    ]

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-DE,de;q=0.9,en;q=0.5",
    }

    raw_events: list[dict] = []

    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        for url, parser_cls in sources:
            try:
                resp = await client.get(url, headers=headers)
                resp.raise_for_status()
                parser = parser_cls()
                parser.feed(resp.text)
                # Process any input the parser is still buffering.
                parser.close()
                found = parser.get_events()
                if found:
                    logger.info(f"VDH-Scraper: {len(found)} Events von {url}")
                    raw_events.extend(found)
                else:
                    logger.info(f"VDH-Scraper: Keine Events auf {url}")
            except httpx.HTTPStatusError as e:
                logger.warning(f"VDH-Scraper HTTP-Fehler {e.response.status_code} für {url}: {e}")
            except Exception as e:
                # Deliberate best effort: one broken source must not abort the run.
                logger.warning(f"VDH-Scraper Fehler für {url}: {e}")

    if not raw_events:
        logger.warning("VDH-Scraper: Keine Daten — verwende Fallback-Events.")
        # Hand out copies so callers cannot mutate the module-level constants.
        return [dict(ev) for ev in FALLBACK_EVENTS]

    # ISO date strings compare chronologically as plain strings.
    today = datetime.today().strftime("%Y-%m-%d")
    result = []
    seen_ids: set[str] = set()

    for ev in raw_events:
        datum = ev.get("datum", "")
        if datum < today:
            continue
        titel = ev.get("titel", "").strip()
        if len(titel) < 3:
            continue

        external_id = _build_external_id(ev)
        if external_id in seen_ids:
            continue
        seen_ids.add(external_id)

        # The title decides the type; the page category (if the parser
        # supplied one) is only consulted when the title has no known keyword.
        titel_lower = titel.lower()
        if any(keyword in titel_lower for keyword in _TYP_MAP):
            typ_text = titel
        else:
            typ_text = f"{titel} {ev.get('kategorie') or ''}"

        result.append({
            "titel": titel,
            "datum": datum,
            "ort_name": ev.get("ort_name") or None,
            "typ": _guess_typ(typ_text),
            "link": ev.get("link", "https://www.vdh.de"),
            "external_id": external_id,
        })

    if not result:
        logger.warning("VDH-Scraper: Nach Filterung 0 Events — verwende Fallback.")
        return [dict(ev) for ev in FALLBACK_EVENTS]

    logger.info(f"VDH-Scraper: {len(result)} zukünftige Events nach Normalisierung.")
    return result