diff --git a/backend/scraper/events_vdh.py b/backend/scraper/events_vdh.py index 585c570..72313af 100644 --- a/backend/scraper/events_vdh.py +++ b/backend/scraper/events_vdh.py @@ -7,18 +7,17 @@ Bei Fehler oder 0 Ergebnissen: Fallback auf hartcodierte Events. import logging import re from datetime import datetime -from html.parser import HTMLParser import httpx logger = logging.getLogger(__name__) FALLBACK_EVENTS = [ - {"titel": "VDH-Europasiegershow 2026", "datum": "2026-06-14", "ort_name": "Dortmund", "typ": "ausstellung", "link": "https://www.vdh.de", "external_id": "vdh-fallback-europasieger-2026"}, - {"titel": "Internationale Hundeausstellung Frankfurt", "datum": "2026-05-03", "ort_name": "Frankfurt am Main", "typ": "ausstellung", "link": "https://www.vdh.de", "external_id": "vdh-fallback-frankfurt-2026"}, - {"titel": "VDH-Bundessiegerprüfung Agility", "datum": "2026-07-19", "ort_name": "Leipzig", "typ": "wettkampf", "link": "https://www.vdh.de", "external_id": "vdh-fallback-agility-2026"}, - {"titel": "Rassehundetreffen München", "datum": "2026-08-22", "ort_name": "München", "typ": "treffen", "link": "https://www.vdh.de", "external_id": "vdh-fallback-muenchen-2026"}, - {"titel": "Hundesport-Turnier Berlin", "datum": "2026-09-12", "ort_name": "Berlin", "typ": "wettkampf", "link": "https://www.vdh.de", "external_id": "vdh-fallback-berlin-2026"}, + {"titel": "VDH-Europasiegershow 2026", "datum": "2026-06-14", "ort_name": "Dortmund", "typ": "ausstellung", "link": "https://www.vdh.de", "external_id": "vdh-fallback-europasieger-2026"}, + {"titel": "Internationale Hundeausstellung Frankfurt","datum": "2026-05-03", "ort_name": "Frankfurt am Main","typ": "ausstellung", "link": "https://www.vdh.de", "external_id": "vdh-fallback-frankfurt-2026"}, + {"titel": "VDH-Bundessiegerprüfung Agility", "datum": "2026-07-19", "ort_name": "Leipzig", "typ": "wettkampf", "link": "https://www.vdh.de", "external_id": "vdh-fallback-agility-2026"}, + {"titel": "Rassehundetreffen München", "datum": "2026-08-22", "ort_name": "München", "typ": "treffen", "link": "https://www.vdh.de", "external_id": "vdh-fallback-muenchen-2026"}, + {"titel": "Hundesport-Turnier Berlin", "datum": "2026-09-12", "ort_name": "Berlin", "typ": "wettkampf", "link": "https://www.vdh.de", "external_id": "vdh-fallback-berlin-2026"}, ] _TYP_MAP = { @@ -28,13 +27,6 @@ _TYP_MAP = { "training": "training", "treffen": "treffen", "markt": "markt", } -_MONATE = { - "januar": 1, "februar": 2, "märz": 3, "maerz": 3, - "april": 4, "mai": 5, "juni": 6, "juli": 7, - "august": 8, "september": 9, "oktober": 10, - "november": 11, "dezember": 12, -} - def _guess_typ(text: str) -> str: t = text.lower() @@ -45,215 +37,21 @@ def _guess_typ(text: str) -> str: def _parse_date(raw: str) -> str | None: - raw = raw.strip() - # Datumsbereich: "DD.MM.YYYY - DD.MM.YYYY" → erstes Datum nehmen - raw = raw.split(" - ")[0].strip() - # ISO: 2026-05-03 - m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw) - if m: - return raw - # DD.MM.YYYY oder D.M.YYYY + """DD.MM.YYYY oder DD.MM.YYYY - DD.MM.YYYY → YYYY-MM-DD (erstes Datum).""" + raw = raw.strip().split(" - ")[0].strip() m = re.match(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$', raw) if m: d, mo, y = m.groups() return f"{y}-{int(mo):02d}-{int(d):02d}" - # DD. Monatsname YYYY - m = re.match(r'^(\d{1,2})\.\s*(\w+)\s+(\d{4})$', raw) + m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw) if m: - d, mon_str, y = m.groups() - mon_num = _MONATE.get(mon_str.lower()) - if mon_num: - return f"{y}-{mon_num:02d}-{int(d):02d}" + return raw return None -# ── PARSER 1: /ausstellungen/liste/typ/spezial/ ────────────────────────────── -# Struktur: div.ausstellung_liste > div.row > div.span6 -# Linke span6: Rassen
DD.MM.YYYY
Verein
Straße
PLZ Ort
- -class _SpezialParser(HTMLParser): - def __init__(self): - super().__init__() - self._events = [] - self._in_liste = False - self._row_d = 0 # depth beim row-Start - self._span_d = 0 # depth beim span6-Start - self._depth = 0 - self._in_row = False - self._in_span = False # linke span6 (erste im row) - self._span_done = False # linke span6 fertig geparst - self._in_b = False - self._buf = "" - self._parts: list[str] = [] # Teile zwischen
- self._title = "" - - def handle_starttag(self, tag, attrs): - self._depth += 1 - a = dict(attrs) - cls = a.get("class", "") - - if "ausstellung_liste" in cls: - self._in_liste = True - - if self._in_liste and tag == "div" and "row" in cls.split(): - self._in_row = True - self._row_d = self._depth - self._span_done = False - self._title = "" - self._parts = [] - - if self._in_row and not self._span_done and tag == "div" and "span6" in cls.split(): - if not self._in_span: - self._in_span = True - self._span_d = self._depth - self._buf = "" - - if self._in_span and tag == "b": - self._in_b = True - self._buf = "" - - #
ist void — kein handle_endtag, muss hier behandelt werden - if self._in_span and tag == "br": - part = self._buf.strip() - if part and not self._in_b: - self._parts.append(part) - self._buf = "" - - def handle_endtag(self, tag): - if self._in_span: - if tag == "b" and self._in_b: - self._in_b = False - self._title = self._buf.strip() - self._buf = "" - - elif self._depth <= self._span_d: - # Ende der linken span6 — auswerten - self._in_span = False - self._span_done = True - date_str, ort = "", "" - for part in self._parts: - if not date_str: - parsed = _parse_date(part) - if parsed: - date_str = parsed - continue - # PLZ Ort: 5 Ziffern gefolgt von Stadtname - m = re.match(r'^\d{5}\s+(.+)$', part) - if m: - ort = m.group(1).strip() - - if self._title and date_str: - self._events.append({ - "titel": self._title, - "datum": date_str, - "ort_name": ort, - "link": "https://www.vdh.de/ausstellungen/", - }) - - if self._in_row and self._depth < self._row_d: - self._in_row = False - - self._depth -= 1 - - def handle_data(self, data): - if self._in_b: - self._buf += data - elif self._in_span: - self._buf += data - - def get_events(self) -> list[dict]: - return self._events - - -# ── PARSER 2: /hundesport/termine/ ─────────────────────────────────────────── -# Struktur: h2.h2ash1 (Kategorie) → ul > li -# li-Text: "DD.MM.YYYY
Titel
Ort: Stadt
" - -class _SportParser(HTMLParser): - def __init__(self): - super().__init__() - self._events = [] - self._cat = "" # aktuelle Kategorie (h2.h2ash1) - self._in_h2 = False - self._in_li = False - self._in_b = False - self._buf = "" - self._parts: list[str] = [] - self._depth = 0 - self._li_d = 0 - - def handle_starttag(self, tag, attrs): - self._depth += 1 - a = dict(attrs) - cls = a.get("class", "") - - if tag == "h2" and "h2ash1" in cls: - self._in_h2 = True - self._buf = "" - - if tag == "li" and not self._in_li: - self._in_li = True - self._li_d = self._depth - self._parts = [] - self._buf = "" - - if self._in_li and tag == "b": - self._in_b = True - - #
ist void — kein handle_endtag, muss hier behandelt werden - if self._in_li and tag == "br": - part = self._buf.strip() - if part: - self._parts.append(part) - self._buf = "" - - def handle_endtag(self, tag): - if tag == "h2" and self._in_h2: - self._in_h2 = False - self._cat = self._buf.strip() - self._buf = "" - - if self._in_li: - if tag == "b": - self._in_b = False - - elif tag == "li" and self._depth <= self._li_d: - self._in_li = False - # parts: [date, title, "Ort: Stadt"] oder ähnlich - date_str, title, ort = "", "", "" - for i, part in enumerate(self._parts): - if not date_str: - parsed = _parse_date(part.split(" - ")[0].strip()) - if parsed: - date_str = parsed - continue - if not title and date_str: - # Titel darf nicht mit "Ort:" beginnen - if not part.lower().startswith("ort:"): - title = part - continue - m = re.match(r'^Ort:\s*(.+)$', part, re.IGNORECASE) - if m: - ort = m.group(1).strip() - - if title and date_str: - self._events.append({ - "titel": title, - "datum": date_str, - "ort_name": ort, - "link": "https://www.vdh.de/hundesport/termine/", - }) - - self._depth -= 1 - - def handle_data(self, data): - if self._in_h2: - self._buf += data - elif self._in_li: - self._buf += data - - def get_events(self) -> list[dict]: - return self._events +def _strip_tags(html: str) -> str: + """Entfernt HTML-Tags.""" + return re.sub(r'<[^>]+>', '', html).strip() def _build_external_id(ev: dict) -> str: @@ -262,23 +60,125 @@ def _build_external_id(ev: dict) -> str: return key[:120] +# ── PARSER 1: /ausstellungen/liste/typ/spezial/ ────────────────────────────── +# Struktur innerhalb div.ausstellung_liste: +#
+#
+# Rassen
DD.MM.YYYY
Verein
Straße
PLZ Ort
+#
+#
…Kontakt…
+#
+ +def _parse_spezial(html: str) -> list[dict]: + events = [] + + # Ausstellung_liste-Block extrahieren + m = re.search(r'
(.*?)(?=
\s*
)', + html, re.DOTALL) + block = m.group(1) if m else html + + # Jede linke span6 (erstes span6 pro row) extrahieren + # Pattern:
...
INHALT
+ row_pattern = re.compile( + r'
\s*
(.*?)
', + re.DOTALL + ) + + for row_m in row_pattern.finditer(block): + cell = row_m.group(1) + + # Titel aus ... + title_m = re.search(r'(.*?)', cell, re.DOTALL) + if not title_m: + continue + title = _strip_tags(title_m.group(1)).strip() + if not title: + continue + + # Datum: erste DD.MM.YYYY nach dem -Block + after_b = cell[title_m.end():] + date_m = re.search(r'(\d{1,2}\.\d{1,2}\.\d{4})', after_b) + if not date_m: + continue + date_str = _parse_date(date_m.group(1)) + if not date_str: + continue + + # PLZ + Ort: "12345 Stadtname" + ort = "" + ort_m = re.search(r'(\d{5})\s+([^<\n\r]+)', after_b) + if ort_m: + ort = ort_m.group(2).strip() + + events.append({ + "titel": title, + "datum": date_str, + "ort_name": ort, + "link": "https://www.vdh.de/ausstellungen/", + }) + + return events + + +# ── PARSER 2: /hundesport/termine/ ─────────────────────────────────────────── +# Struktur:

Kategorie

dann
  • +#
  • DD.MM.YYYY - DD.MM.YYYY
    Titel
    Ort: Stadt
  • + +def _parse_sport(html: str) -> list[dict]: + events = [] + + #
  • -Blöcke extrahieren + li_pattern = re.compile(r'
  • (.*?)
  • ', re.DOTALL) + + for li_m in li_pattern.finditer(html): + cell = li_m.group(1) + # Datum: erstes DD.MM.YYYY oder DD.MM.YYYY - DD.MM.YYYY + date_m = re.search(r'(\d{1,2}\.\d{1,2}\.\d{4}(?:\s*-\s*\d{1,2}\.\d{1,2}\.\d{4})?)', cell) + if not date_m: + continue + date_str = _parse_date(date_m.group(1)) + if not date_str: + continue + + # Text nach dem Datum ohne Tags + after_date = cell[date_m.end():] + # "Ort:" aus Ort: Stadt entfernen wir für den Titel + parts = [p.strip() for p in re.split(r'|[^<]*', after_date) if p.strip()] + parts = [_strip_tags(p) for p in parts if _strip_tags(p)] + + title = "" + ort = "" + for part in parts: + if re.match(r'^Ort:\s*', part, re.IGNORECASE): + ort = re.sub(r'^Ort:\s*', '', part, flags=re.IGNORECASE).strip() + elif not title and not re.match(r'^\d', part): + title = part + + if not title: + continue + + events.append({ + "titel": title, + "datum": date_str, + "ort_name": ort, + "link": "https://www.vdh.de/hundesport/termine/", + }) + + return events + + async def fetch_vdh_events() -> list[dict]: """ Scrapt VDH-Veranstaltungen von ausstellungen/liste und hundesport/termine. Gibt eine Liste von Dicts zurück: {titel, datum, ort_name, typ, link, external_id} - Bei Fehler oder 0 Ergebnissen: Fallback auf FALLBACK_EVENTS. """ sources = [ - ("https://www.vdh.de/ausstellungen/liste/typ/spezial/", _SpezialParser), - ("https://www.vdh.de/hundesport/termine/", _SportParser), + ("https://www.vdh.de/ausstellungen/liste/typ/spezial/", _parse_spezial), + ("https://www.vdh.de/hundesport/termine/", _parse_sport), ] headers = { - "User-Agent": ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/124.0.0.0 Safari/537.36" - ), + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124.0.0.0 Safari/537.36", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "de-DE,de;q=0.9,en;q=0.5", } @@ -286,13 +186,11 @@ async def fetch_vdh_events() -> list[dict]: raw_events: list[dict] = [] async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client: - for url, ParserClass in sources: + for url, parse_fn in sources: try: resp = await client.get(url, headers=headers) resp.raise_for_status() - parser = ParserClass() - parser.feed(resp.text) - found = parser.get_events() + found = parse_fn(resp.text) if found: logger.info(f"VDH-Scraper: {len(found)} Events von {url}") raw_events.extend(found) @@ -318,14 +216,13 @@ async def fetch_vdh_events() -> list[dict]: titel = ev.get("titel", "").strip() if not titel or len(titel) < 3: continue - link = ev.get("link", "https://www.vdh.de") entry = { "titel": titel, "datum": datum, "ort_name": ev.get("ort_name") or None, "typ": _guess_typ(titel), - "link": link, + "link": ev.get("link", "https://www.vdh.de"), "external_id": _build_external_id(ev), } if entry["external_id"] not in seen_ids: