Fix: VDH-Scraper Regex statt HTMLParser (void-element Depth-Bug), parse_spezial + parse_sport

2026-05-08 13:43:40 +02:00 · 2026-05-08 13:43:40 +02:00 · da036d2b93
commit da036d2b93
parent 4e5a13d9e2
1 changed files with 125 additions and 228 deletions
--- a/backend/scraper/events_vdh.py
+++ b/backend/scraper/events_vdh.py
@ -7,18 +7,17 @@ Bei Fehler oder 0 Ergebnissen: Fallback auf hartcodierte Events.
 import logging
 import re
 from datetime import datetime
-from html.parser import HTMLParser

 import httpx

 logger = logging.getLogger(__name__)

 FALLBACK_EVENTS = [
-    {"titel": "VDH-Europasiegershow 2026", "datum": "2026-06-14", "ort_name": "Dortmund", "typ": "ausstellung", "link": "https://www.vdh.de", "external_id": "vdh-fallback-europasieger-2026"},
-    {"titel": "Internationale Hundeausstellung Frankfurt", "datum": "2026-05-03", "ort_name": "Frankfurt am Main", "typ": "ausstellung", "link": "https://www.vdh.de", "external_id": "vdh-fallback-frankfurt-2026"},
-    {"titel": "VDH-Bundessiegerprüfung Agility", "datum": "2026-07-19", "ort_name": "Leipzig", "typ": "wettkampf", "link": "https://www.vdh.de", "external_id": "vdh-fallback-agility-2026"},
-    {"titel": "Rassehundetreffen München", "datum": "2026-08-22", "ort_name": "München", "typ": "treffen", "link": "https://www.vdh.de", "external_id": "vdh-fallback-muenchen-2026"},
-    {"titel": "Hundesport-Turnier Berlin", "datum": "2026-09-12", "ort_name": "Berlin", "typ": "wettkampf", "link": "https://www.vdh.de", "external_id": "vdh-fallback-berlin-2026"},
+    {"titel": "VDH-Europasiegershow 2026",              "datum": "2026-06-14", "ort_name": "Dortmund",        "typ": "ausstellung", "link": "https://www.vdh.de", "external_id": "vdh-fallback-europasieger-2026"},
+    {"titel": "Internationale Hundeausstellung Frankfurt","datum": "2026-05-03", "ort_name": "Frankfurt am Main","typ": "ausstellung", "link": "https://www.vdh.de", "external_id": "vdh-fallback-frankfurt-2026"},
+    {"titel": "VDH-Bundessiegerprüfung Agility",         "datum": "2026-07-19", "ort_name": "Leipzig",         "typ": "wettkampf",   "link": "https://www.vdh.de", "external_id": "vdh-fallback-agility-2026"},
+    {"titel": "Rassehundetreffen München",               "datum": "2026-08-22", "ort_name": "München",         "typ": "treffen",     "link": "https://www.vdh.de", "external_id": "vdh-fallback-muenchen-2026"},
+    {"titel": "Hundesport-Turnier Berlin",               "datum": "2026-09-12", "ort_name": "Berlin",          "typ": "wettkampf",   "link": "https://www.vdh.de", "external_id": "vdh-fallback-berlin-2026"},
 ]

 _TYP_MAP = {
@ -28,13 +27,6 @@ _TYP_MAP = {
    "training":    "training",    "treffen": "treffen",     "markt": "markt",
 }

-_MONATE = {
-    "januar": 1, "februar": 2, "märz": 3, "maerz": 3,
-    "april": 4, "mai": 5, "juni": 6, "juli": 7,
-    "august": 8, "september": 9, "oktober": 10,
-    "november": 11, "dezember": 12,
-}
-

 def _guess_typ(text: str) -> str:
    t = text.lower()
@ -45,215 +37,21 @@ def _guess_typ(text: str) -> str:


 def _parse_date(raw: str) -> str | None:
-    raw = raw.strip()
-    # Datumsbereich: "DD.MM.YYYY - DD.MM.YYYY" → erstes Datum nehmen
-    raw = raw.split(" - ")[0].strip()
-    # ISO: 2026-05-03
-    m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw)
-    if m:
-        return raw
-    # DD.MM.YYYY oder D.M.YYYY
+    """DD.MM.YYYY oder DD.MM.YYYY - DD.MM.YYYY → YYYY-MM-DD (erstes Datum)."""
+    raw = raw.strip().split(" - ")[0].strip()
    m = re.match(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$', raw)
    if m:
        d, mo, y = m.groups()
        return f"{y}-{int(mo):02d}-{int(d):02d}"
-    # DD. Monatsname YYYY
-    m = re.match(r'^(\d{1,2})\.\s*(\w+)\s+(\d{4})$', raw)
+    m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw)
    if m:
-        d, mon_str, y = m.groups()
-        mon_num = _MONATE.get(mon_str.lower())
-        if mon_num:
-            return f"{y}-{mon_num:02d}-{int(d):02d}"
+        return raw
    return None


-# ── PARSER 1: /ausstellungen/liste/typ/spezial/ ──────────────────────────────
-# Struktur: div.ausstellung_liste > div.row > div.span6
-#   Linke span6: <b>Rassen</b><br>DD.MM.YYYY<br> Verein<br> Straße<br> PLZ Ort<br>
-
-class _SpezialParser(HTMLParser):
-    def __init__(self):
-        super().__init__()
-        self._events    = []
-        self._in_liste  = False
-        self._row_d     = 0       # depth beim row-Start
-        self._span_d    = 0       # depth beim span6-Start
-        self._depth     = 0
-        self._in_row    = False
-        self._in_span   = False   # linke span6 (erste im row)
-        self._span_done = False   # linke span6 fertig geparst
-        self._in_b      = False
-        self._buf       = ""
-        self._parts: list[str] = []   # Teile zwischen <br>
-        self._title     = ""
-
-    def handle_starttag(self, tag, attrs):
-        self._depth += 1
-        a = dict(attrs)
-        cls = a.get("class", "")
-
-        if "ausstellung_liste" in cls:
-            self._in_liste = True
-
-        if self._in_liste and tag == "div" and "row" in cls.split():
-            self._in_row    = True
-            self._row_d     = self._depth
-            self._span_done = False
-            self._title     = ""
-            self._parts     = []
-
-        if self._in_row and not self._span_done and tag == "div" and "span6" in cls.split():
-            if not self._in_span:
-                self._in_span = True
-                self._span_d  = self._depth
-                self._buf     = ""
-
-        if self._in_span and tag == "b":
-            self._in_b = True
-            self._buf  = ""
-
-        # <br> ist void — kein handle_endtag, muss hier behandelt werden
-        if self._in_span and tag == "br":
-            part = self._buf.strip()
-            if part and not self._in_b:
-                self._parts.append(part)
-            self._buf = ""
-
-    def handle_endtag(self, tag):
-        if self._in_span:
-            if tag == "b" and self._in_b:
-                self._in_b    = False
-                self._title   = self._buf.strip()
-                self._buf     = ""
-
-            elif self._depth <= self._span_d:
-                # Ende der linken span6 — auswerten
-                self._in_span   = False
-                self._span_done = True
-                date_str, ort = "", ""
-                for part in self._parts:
-                    if not date_str:
-                        parsed = _parse_date(part)
-                        if parsed:
-                            date_str = parsed
-                            continue
-                    # PLZ Ort: 5 Ziffern gefolgt von Stadtname
-                    m = re.match(r'^\d{5}\s+(.+)$', part)
-                    if m:
-                        ort = m.group(1).strip()
-
-                if self._title and date_str:
-                    self._events.append({
-                        "titel":    self._title,
-                        "datum":    date_str,
-                        "ort_name": ort,
-                        "link":     "https://www.vdh.de/ausstellungen/",
-                    })
-
-        if self._in_row and self._depth < self._row_d:
-            self._in_row = False
-
-        self._depth -= 1
-
-    def handle_data(self, data):
-        if self._in_b:
-            self._buf += data
-        elif self._in_span:
-            self._buf += data
-
-    def get_events(self) -> list[dict]:
-        return self._events
-
-
-# ── PARSER 2: /hundesport/termine/ ───────────────────────────────────────────
-# Struktur: h2.h2ash1 (Kategorie) → ul > li
-#   li-Text: "DD.MM.YYYY<br>Titel<br><b>Ort:</b> Stadt<br>"
-
-class _SportParser(HTMLParser):
-    def __init__(self):
-        super().__init__()
-        self._events   = []
-        self._cat      = ""         # aktuelle Kategorie (h2.h2ash1)
-        self._in_h2    = False
-        self._in_li    = False
-        self._in_b     = False
-        self._buf      = ""
-        self._parts: list[str] = []
-        self._depth    = 0
-        self._li_d     = 0
-
-    def handle_starttag(self, tag, attrs):
-        self._depth += 1
-        a = dict(attrs)
-        cls = a.get("class", "")
-
-        if tag == "h2" and "h2ash1" in cls:
-            self._in_h2 = True
-            self._buf   = ""
-
-        if tag == "li" and not self._in_li:
-            self._in_li = True
-            self._li_d  = self._depth
-            self._parts = []
-            self._buf   = ""
-
-        if self._in_li and tag == "b":
-            self._in_b = True
-
-        # <br> ist void — kein handle_endtag, muss hier behandelt werden
-        if self._in_li and tag == "br":
-            part = self._buf.strip()
-            if part:
-                self._parts.append(part)
-            self._buf = ""
-
-    def handle_endtag(self, tag):
-        if tag == "h2" and self._in_h2:
-            self._in_h2 = False
-            self._cat   = self._buf.strip()
-            self._buf   = ""
-
-        if self._in_li:
-            if tag == "b":
-                self._in_b = False
-
-            elif tag == "li" and self._depth <= self._li_d:
-                self._in_li = False
-                # parts: [date, title, "Ort: Stadt"] oder ähnlich
-                date_str, title, ort = "", "", ""
-                for i, part in enumerate(self._parts):
-                    if not date_str:
-                        parsed = _parse_date(part.split(" - ")[0].strip())
-                        if parsed:
-                            date_str = parsed
-                            continue
-                    if not title and date_str:
-                        # Titel darf nicht mit "Ort:" beginnen
-                        if not part.lower().startswith("ort:"):
-                            title = part
-                            continue
-                    m = re.match(r'^Ort:\s*(.+)$', part, re.IGNORECASE)
-                    if m:
-                        ort = m.group(1).strip()
-
-                if title and date_str:
-                    self._events.append({
-                        "titel":    title,
-                        "datum":    date_str,
-                        "ort_name": ort,
-                        "link":     "https://www.vdh.de/hundesport/termine/",
-                    })
-
-        self._depth -= 1
-
-    def handle_data(self, data):
-        if self._in_h2:
-            self._buf += data
-        elif self._in_li:
-            self._buf += data
-
-    def get_events(self) -> list[dict]:
-        return self._events
+def _strip_tags(html: str) -> str:
+    """Return the plain text of an HTML fragment.
+
+    Tags are removed first, then character references such as ``&amp;`` or
+    ``&nbsp;`` are decoded, and finally surrounding whitespace is stripped.
+    Decoding after tag removal keeps escaped markup (``&lt;b&gt;``) as text
+    instead of deleting it as if it were a real tag.
+    """
+    # Local import: the parameter is called ``html`` and would shadow the module.
+    from html import unescape
+
+    return unescape(re.sub(r'<[^>]+>', '', html)).strip()


 def _build_external_id(ev: dict) -> str:
@ -262,23 +60,125 @@ def _build_external_id(ev: dict) -> str:
    return key[:120]


+# ── PARSER 1: /ausstellungen/liste/typ/spezial/ ──────────────────────────────
+# Struktur innerhalb div.ausstellung_liste:
+#   <div class="row">
+#     <div class="span6">
+#       <b>Rassen</b><br>DD.MM.YYYY<br> Verein<br> Straße<br> PLZ Ort<br>
+#     </div>
+#     <div class="span6">…Kontakt…</div>
+#   </div>
+
+def _parse_spezial(html: str) -> list[dict]:
+    """Extract events from the VDH special-exhibition list page.
+
+    For every ``<div class="row">`` that is directly followed by a
+    ``<div class="span6">``, the content up to the first ``</div>`` is
+    inspected: the text inside ``<b>…</b>`` becomes the title, the first
+    ``DD.MM.YYYY`` after it the date, and the text following the first
+    five-digit number the location. Rows without a title or without a
+    parseable date are skipped.
+
+    Returns a list of dicts with the keys ``titel``, ``datum``, ``ort_name``
+    and ``link``.
+    """
+    events = []
+
+    # Narrow the search to the ausstellung_liste block (it ends where a row
+    # with a span12 starts); fall back to the whole document if not found.
+    m = re.search(r'<div class="ausstellung_liste">(.*?)(?=<div class="row">\s*<div class="span12">)',
+                  html, re.DOTALL)
+    block = m.group(1) if m else html
+
+    # Extract the left span6 (the first span6 of each row).
+    # Pattern: <div class="row"> ... <div class="span6">CONTENT</div>
+    row_pattern = re.compile(
+        r'<div class="row">\s*<div class="span6">(.*?)</div>',
+        re.DOTALL
+    )
+
+    for row_m in row_pattern.finditer(block):
+        cell = row_m.group(1)
+
+        # Title from <b>...</b>
+        title_m = re.search(r'<b>(.*?)</b>', cell, re.DOTALL)
+        if not title_m:
+            continue
+        title = _strip_tags(title_m.group(1)).strip()
+        if not title:
+            continue
+
+        # Date: first DD.MM.YYYY after the <b> block
+        after_b = cell[title_m.end():]
+        date_m = re.search(r'(\d{1,2}\.\d{1,2}\.\d{4})', after_b)
+        if not date_m:
+            continue
+        date_str = _parse_date(date_m.group(1))
+        if not date_str:
+            continue
+
+        # Postal code + town: "12345 Stadtname"; only the town is kept.
+        ort = ""
+        ort_m = re.search(r'(\d{5})\s+([^<\n\r]+)', after_b)
+        if ort_m:
+            ort = ort_m.group(2).strip()
+
+        events.append({
+            "titel":    title,
+            "datum":    date_str,
+            "ort_name": ort,
+            "link":     "https://www.vdh.de/ausstellungen/",
+        })
+
+    return events
+
+
+# ── PARSER 2: /hundesport/termine/ ───────────────────────────────────────────
+# Struktur: <h2 class="h2ash1">Kategorie</h2> dann <ul><li>
+#   <li>DD.MM.YYYY - DD.MM.YYYY<br>Titel<br><b>Ort:</b> Stadt<br></li>
+
+def _parse_sport(html: str) -> list[dict]:
+    """Extract events from the VDH dog-sport schedule page.
+
+    Each ``<li>`` is expected to look like
+    ``DD.MM.YYYY[ - DD.MM.YYYY]<br>Titel<br><b>Ort:</b> Stadt<br>``.
+    The first date becomes ``datum``, the first text line after the date
+    that does not start with a digit becomes the title, and the line
+    labelled ``Ort:`` becomes the location. Items without a parseable date
+    or without a title are skipped.
+
+    Returns a list of dicts with the keys ``titel``, ``datum``, ``ort_name``
+    and ``link``.
+    """
+    events = []
+
+    li_pattern = re.compile(r'<li>(.*?)</li>', re.DOTALL)
+    # Group 1 is the start date. An optional end date is consumed as part of
+    # the match so it cannot be mistaken for the title further down.
+    date_pattern = re.compile(
+        r'(\d{1,2}\.\d{1,2}\.\d{4})(?:\s*-\s*\d{1,2}\.\d{1,2}\.\d{4})?'
+    )
+
+    for li_m in li_pattern.finditer(html):
+        cell = li_m.group(1)
+        date_m = date_pattern.search(cell)
+        if not date_m:
+            continue
+        # Pass only the start date: a range written without spaces around
+        # the dash would otherwise not be split and fail to parse.
+        date_str = _parse_date(date_m.group(1))
+        if not date_str:
+            continue
+
+        # Split on <br> only. Splitting on <b>…</b> as well would discard the
+        # "Ort:" label, so the location line could never be recognised.
+        after_date = cell[date_m.end():]
+        parts = [_strip_tags(p) for p in re.split(r'<br\s*/?>', after_date)]
+
+        title = ""
+        ort   = ""
+        for part in parts:
+            if not part:
+                continue
+            ort_m = re.match(r'^Ort:\s*', part, re.IGNORECASE)
+            if ort_m:
+                ort = part[ort_m.end():].strip()
+            elif not title and not re.match(r'^\d', part):
+                title = part
+
+        if not title:
+            continue
+
+        events.append({
+            "titel":    title,
+            "datum":    date_str,
+            "ort_name": ort,
+            "link":     "https://www.vdh.de/hundesport/termine/",
+        })
+
+    return events
+
+
 async def fetch_vdh_events() -> list[dict]:
    """
    Scrapt VDH-Veranstaltungen von ausstellungen/liste und hundesport/termine.
    Gibt eine Liste von Dicts zurück: {titel, datum, ort_name, typ, link, external_id}
-    Bei Fehler oder 0 Ergebnissen: Fallback auf FALLBACK_EVENTS.
    """
    sources = [
-        ("https://www.vdh.de/ausstellungen/liste/typ/spezial/", _SpezialParser),
-        ("https://www.vdh.de/hundesport/termine/",              _SportParser),
+        ("https://www.vdh.de/ausstellungen/liste/typ/spezial/", _parse_spezial),
+        ("https://www.vdh.de/hundesport/termine/",              _parse_sport),
    ]

    headers = {
-        "User-Agent": (
-            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-            "AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/124.0.0.0 Safari/537.36"
-        ),
+        "User-Agent":      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124.0.0.0 Safari/537.36",
        "Accept":          "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-DE,de;q=0.9,en;q=0.5",
    }
@ -286,13 +186,11 @@ async def fetch_vdh_events() -> list[dict]:
    raw_events: list[dict] = []

    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
-        for url, ParserClass in sources:
+        for url, parse_fn in sources:
            try:
                resp = await client.get(url, headers=headers)
                resp.raise_for_status()
-                parser = ParserClass()
-                parser.feed(resp.text)
-                found = parser.get_events()
+                found = parse_fn(resp.text)
                if found:
                    logger.info(f"VDH-Scraper: {len(found)} Events von {url}")
                    raw_events.extend(found)
@ -318,14 +216,13 @@ async def fetch_vdh_events() -> list[dict]:
        titel = ev.get("titel", "").strip()
        if not titel or len(titel) < 3:
            continue
-        link = ev.get("link", "https://www.vdh.de")

        entry = {
            "titel":       titel,
            "datum":       datum,
            "ort_name":    ev.get("ort_name") or None,
            "typ":         _guess_typ(titel),
-            "link":        link,
+            "link":        ev.get("link", "https://www.vdh.de"),
            "external_id": _build_external_id(ev),
        }
        if entry["external_id"] not in seen_ids: