""" BAN YARO — VDH Veranstaltungs-Scraper Scrapt Hundeveranstaltungen von vdh.de. Bei Fehler oder 0 Ergebnissen: Fallback auf hartcodierte Events. """ import logging import re from datetime import datetime from html.parser import HTMLParser import httpx logger = logging.getLogger(__name__) FALLBACK_EVENTS = [ {"titel": "VDH-Europasiegershow 2026", "datum": "2026-06-14", "ort_name": "Dortmund", "typ": "ausstellung", "link": "https://www.vdh.de", "external_id": "vdh-fallback-europasieger-2026"}, {"titel": "Internationale Hundeausstellung Frankfurt", "datum": "2026-05-03", "ort_name": "Frankfurt am Main", "typ": "ausstellung", "link": "https://www.vdh.de", "external_id": "vdh-fallback-frankfurt-2026"}, {"titel": "VDH-Bundessiegerprüfung Agility", "datum": "2026-07-19", "ort_name": "Leipzig", "typ": "wettkampf", "link": "https://www.vdh.de", "external_id": "vdh-fallback-agility-2026"}, {"titel": "Rassehundetreffen München", "datum": "2026-08-22", "ort_name": "München", "typ": "treffen", "link": "https://www.vdh.de", "external_id": "vdh-fallback-muenchen-2026"}, {"titel": "Hundesport-Turnier Berlin", "datum": "2026-09-12", "ort_name": "Berlin", "typ": "wettkampf", "link": "https://www.vdh.de", "external_id": "vdh-fallback-berlin-2026"}, ] # Mapping VDH-Kategorienamen → interne Typen _TYP_MAP = { "ausstellung": "ausstellung", "show": "ausstellung", "siegershow": "ausstellung", "agility": "wettkampf", "wettkampf": "wettkampf", "turnier": "wettkampf", "prüfung": "wettkampf", "training": "training", "treffen": "treffen", "markt": "markt", } # Monatsnamen Deutsch → Zahl _MONATE = { "januar": 1, "februar": 2, "märz": 3, "maerz": 3, "april": 4, "mai": 5, "juni": 6, "juli": 7, "august": 8, "september": 9, "oktober": 10, "november": 11, "dezember": 12, } def _guess_typ(text: str) -> str: """Bestimmt den Event-Typ anhand des Titels.""" t = text.lower() for keyword, typ in _TYP_MAP.items(): if keyword in t: return typ return "sonstiges" def _parse_date(raw: str) -> str | None: """ Versucht verschiedene Datumsformate zu parsen. Gibt YYYY-MM-DD zurück oder None. """ raw = raw.strip() # ISO: 2026-05-03 m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw) if m: return raw # DD.MM.YYYY oder D.M.YYYY m = re.match(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$', raw) if m: d, mo, y = m.groups() return f"{y}-{int(mo):02d}-{int(d):02d}" # DD. Monatsname YYYY (z.B. "14. Juni 2026") m = re.match(r'^(\d{1,2})\.\s*(\w+)\s+(\d{4})$', raw) if m: d, mon_str, y = m.groups() mon_num = _MONATE.get(mon_str.lower()) if mon_num: return f"{y}-{mon_num:02d}-{int(d):02d}" # Monatsname DD, YYYY (englisch, Fallback) try: dt = datetime.strptime(raw, "%B %d, %Y") return dt.strftime("%Y-%m-%d") except ValueError: pass return None class _VDHParser(HTMLParser): """ Einfacher Zustandsautomat-Parser für die VDH-Veranstaltungsseite. Sucht nach typischen Strukturen: article, li.event, div mit Datums-/Titel-Klassen. """ def __init__(self): super().__init__() self._events: list[dict] = [] self._current: dict | None = None self._depth = 0 self._start_depth = 0 self._capture = None # 'titel' | 'datum' | 'ort' self._buf = "" self._in_event = False # ---------- Hilfsmethoden ---------- def _is_event_container(self, tag, attrs): """Erkennt Start eines Event-Blocks.""" a = dict(attrs) cls = a.get("class", "") return ( tag == "article" or (tag in ("li", "div") and any( kw in cls for kw in ("event", "veranstaltung", "termin", "entry", "item") )) ) def _is_title_tag(self, tag, attrs): a = dict(attrs) cls = a.get("class", "") return tag in ("h2", "h3", "h4") or any( kw in cls for kw in ("title", "titel", "name", "heading") ) def _is_date_tag(self, tag, attrs): a = dict(attrs) cls = a.get("class", "") it = a.get("itemprop", "") return ( tag in ("time",) or any(kw in cls for kw in ("date", "datum", "time")) or it in ("startDate", "endDate") ) def _is_location_tag(self, tag, attrs): a = dict(attrs) cls = a.get("class", "") it = a.get("itemprop", "") return ( any(kw in cls for kw in ("location", "ort", "venue", "place", "city")) or it in ("location", "addressLocality") ) # ---------- SAX-Events ---------- def handle_starttag(self, tag, attrs): self._depth += 1 a = dict(attrs) if not self._in_event and self._is_event_container(tag, attrs): self._in_event = True self._start_depth = self._depth self._current = {"titel": "", "datum": "", "ort_name": "", "link": ""} # Direkter Link auf dem Container? if tag == "a" and "href" in a: self._current["link"] = a["href"] return if self._in_event: # Link innerhalb des Event-Blocks if tag == "a" and "href" in a and not self._current.get("link"): href = a["href"] if "vdh.de" in href or href.startswith("/"): self._current["link"] = href #