From bff54dcfd3408304eec238abb6d09ec82f2b283f Mon Sep 17 00:00:00 2001 From: rene Date: Fri, 8 May 2026 13:34:13 +0200 Subject: [PATCH] =?UTF-8?q?Fix:=20VDH-Scraper=20komplett=20neu=20=E2=80=94?= =?UTF-8?q?=20dedizierte=20Parser=20f=C3=BCr=20/ausstellungen/liste/=20und?= =?UTF-8?q?=20/hundesport/termine/=20(neue=20HTML-Struktur)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/scraper/events_vdh.py | 356 ++++++++++++++++++---------------- 1 file changed, 188 insertions(+), 168 deletions(-) diff --git a/backend/scraper/events_vdh.py b/backend/scraper/events_vdh.py index afdea43..020908c 100644 --- a/backend/scraper/events_vdh.py +++ b/backend/scraper/events_vdh.py @@ -21,21 +21,13 @@ FALLBACK_EVENTS = [ {"titel": "Hundesport-Turnier Berlin", "datum": "2026-09-12", "ort_name": "Berlin", "typ": "wettkampf", "link": "https://www.vdh.de", "external_id": "vdh-fallback-berlin-2026"}, ] -# Mapping VDH-Kategorienamen → interne Typen _TYP_MAP = { - "ausstellung": "ausstellung", - "show": "ausstellung", - "siegershow": "ausstellung", - "agility": "wettkampf", - "wettkampf": "wettkampf", - "turnier": "wettkampf", - "prüfung": "wettkampf", - "training": "training", - "treffen": "treffen", - "markt": "markt", + "ausstellung": "ausstellung", "show": "ausstellung", "siegershow": "ausstellung", + "agility": "wettkampf", "wettkampf": "wettkampf", "turnier": "wettkampf", + "prüfung": "wettkampf", "meisterschaft": "wettkampf", + "training": "training", "treffen": "treffen", "markt": "markt", } -# Monatsnamen Deutsch → Zahl _MONATE = { "januar": 1, "februar": 2, "märz": 3, "maerz": 3, "april": 4, "mai": 5, "juni": 6, "juli": 7, @@ -45,171 +37,217 @@ _MONATE = { def _guess_typ(text: str) -> str: - """Bestimmt den Event-Typ anhand des Titels.""" t = text.lower() for keyword, typ in _TYP_MAP.items(): if keyword in t: return typ - return "sonstiges" + return "ausstellung" def _parse_date(raw: str) -> str | None: - """ - Versucht verschiedene Datumsformate zu parsen. - Gibt YYYY-MM-DD zurück oder None. - """ raw = raw.strip() - + # Datumsbereich: "DD.MM.YYYY - DD.MM.YYYY" → erstes Datum nehmen + raw = raw.split(" - ")[0].strip() # ISO: 2026-05-03 m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw) if m: return raw - # DD.MM.YYYY oder D.M.YYYY m = re.match(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$', raw) if m: d, mo, y = m.groups() return f"{y}-{int(mo):02d}-{int(d):02d}" - - # DD. Monatsname YYYY (z.B. "14. Juni 2026") + # DD. Monatsname YYYY m = re.match(r'^(\d{1,2})\.\s*(\w+)\s+(\d{4})$', raw) if m: d, mon_str, y = m.groups() mon_num = _MONATE.get(mon_str.lower()) if mon_num: return f"{y}-{mon_num:02d}-{int(d):02d}" - - # Monatsname DD, YYYY (englisch, Fallback) - try: - dt = datetime.strptime(raw, "%B %d, %Y") - return dt.strftime("%Y-%m-%d") - except ValueError: - pass - return None -class _VDHParser(HTMLParser): - """ - Einfacher Zustandsautomat-Parser für die VDH-Veranstaltungsseite. - Sucht nach typischen Strukturen: article, li.event, div mit Datums-/Titel-Klassen. - """ +# ── PARSER 1: /ausstellungen/liste/typ/spezial/ ────────────────────────────── +# Struktur: div.ausstellung_liste > div.row > div.span6 +# Linke span6: Rassen
DD.MM.YYYY
Verein
Straße
PLZ Ort
+class _SpezialParser(HTMLParser): def __init__(self): super().__init__() - self._events: list[dict] = [] - self._current: dict | None = None - self._depth = 0 - self._start_depth = 0 - self._capture = None # 'titel' | 'datum' | 'ort' - self._buf = "" - self._in_event = False - - # ---------- Hilfsmethoden ---------- - - def _is_event_container(self, tag, attrs): - """Erkennt Start eines Event-Blocks.""" - a = dict(attrs) - cls = a.get("class", "") - return ( - tag == "article" - or (tag in ("li", "div") and any( - kw in cls for kw in ("event", "veranstaltung", "termin", "entry", "item") - )) - ) - - def _is_title_tag(self, tag, attrs): - a = dict(attrs) - cls = a.get("class", "") - return tag in ("h2", "h3", "h4") or any( - kw in cls for kw in ("title", "titel", "name", "heading") - ) - - def _is_date_tag(self, tag, attrs): - a = dict(attrs) - cls = a.get("class", "") - it = a.get("itemprop", "") - return ( - tag in ("time",) - or any(kw in cls for kw in ("date", "datum", "time")) - or it in ("startDate", "endDate") - ) - - def _is_location_tag(self, tag, attrs): - a = dict(attrs) - cls = a.get("class", "") - it = a.get("itemprop", "") - return ( - any(kw in cls for kw in ("location", "ort", "venue", "place", "city")) - or it in ("location", "addressLocality") - ) - - # ---------- SAX-Events ---------- + self._events = [] + self._in_liste = False + self._row_d = 0 # depth beim row-Start + self._span_d = 0 # depth beim span6-Start + self._depth = 0 + self._in_row = False + self._in_span = False # linke span6 (erste im row) + self._span_done = False # linke span6 fertig geparst + self._in_b = False + self._buf = "" + self._parts: list[str] = [] # Teile zwischen
+ self._title = "" def handle_starttag(self, tag, attrs): self._depth += 1 a = dict(attrs) + cls = a.get("class", "") - if not self._in_event and self._is_event_container(tag, attrs): - self._in_event = True - self._start_depth = self._depth - self._current = {"titel": "", "datum": "", "ort_name": "", "link": ""} - # Direkter Link auf dem Container? - if tag == "a" and "href" in a: - self._current["link"] = a["href"] - return + if "ausstellung_liste" in cls: + self._in_liste = True - if self._in_event: - # Link innerhalb des Event-Blocks - if tag == "a" and "href" in a and not self._current.get("link"): - href = a["href"] - if "vdh.de" in href or href.startswith("/"): - self._current["link"] = href + if self._in_liste and tag == "div" and "row" in cls.split(): + self._in_row = True + self._row_d = self._depth + self._span_done = False + self._title = "" + self._parts = [] - #