# banyaro/backend/scraper/events_vdh.py
#
# 340 lines
# 12 KiB
# Python

"""
BAN YARO — VDH Veranstaltungs-Scraper
Scrapt Hundeveranstaltungen von vdh.de.
Bei Fehler oder 0 Ergebnissen: Fallback auf hartcodierte Events.
"""
import logging
import re
from datetime import datetime
from html.parser import HTMLParser
import httpx
logger = logging.getLogger(__name__)
# Hard-coded events that are returned when scraping fails or yields no
# usable (future) events. Each entry already has the final output shape:
# titel, datum (ISO), ort_name, typ, link, external_id.
FALLBACK_EVENTS = [
    {
        "titel": "VDH-Europasiegershow 2026",
        "datum": "2026-06-14",
        "ort_name": "Dortmund",
        "typ": "ausstellung",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-europasieger-2026",
    },
    {
        "titel": "Internationale Hundeausstellung Frankfurt",
        "datum": "2026-05-03",
        "ort_name": "Frankfurt am Main",
        "typ": "ausstellung",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-frankfurt-2026",
    },
    {
        "titel": "VDH-Bundessiegerprüfung Agility",
        "datum": "2026-07-19",
        "ort_name": "Leipzig",
        "typ": "wettkampf",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-agility-2026",
    },
    {
        "titel": "Rassehundetreffen München",
        "datum": "2026-08-22",
        "ort_name": "München",
        "typ": "treffen",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-muenchen-2026",
    },
    {
        "titel": "Hundesport-Turnier Berlin",
        "datum": "2026-09-12",
        "ort_name": "Berlin",
        "typ": "wettkampf",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-berlin-2026",
    },
]
_TYP_MAP = {
"ausstellung": "ausstellung", "show": "ausstellung", "siegershow": "ausstellung",
"agility": "wettkampf", "wettkampf": "wettkampf", "turnier": "wettkampf",
"prüfung": "wettkampf", "meisterschaft": "wettkampf",
"training": "training", "treffen": "treffen", "markt": "markt",
}
_MONATE = {
"januar": 1, "februar": 2, "märz": 3, "maerz": 3,
"april": 4, "mai": 5, "juni": 6, "juli": 7,
"august": 8, "september": 9, "oktober": 10,
"november": 11, "dezember": 12,
}
def _guess_typ(text: str) -> str:
    """Derive the event type from free text (e.g. an event title).

    The text is lower-cased and the keywords of ``_TYP_MAP`` are tried as
    substrings in the map's insertion order; the type of the first keyword
    found is returned. If no keyword occurs, ``"ausstellung"`` is returned.
    """
    lowered = text.lower()
    matches = (typ for keyword, typ in _TYP_MAP.items() if keyword in lowered)
    return next(matches, "ausstellung")
def _parse_date(raw: str) -> str | None:
raw = raw.strip()
# Datumsbereich: "DD.MM.YYYY - DD.MM.YYYY" → erstes Datum nehmen
raw = raw.split(" - ")[0].strip()
# ISO: 2026-05-03
m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw)
if m:
return raw
# DD.MM.YYYY oder D.M.YYYY
m = re.match(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$', raw)
if m:
d, mo, y = m.groups()
return f"{y}-{int(mo):02d}-{int(d):02d}"
# DD. Monatsname YYYY
m = re.match(r'^(\d{1,2})\.\s*(\w+)\s+(\d{4})$', raw)
if m:
d, mon_str, y = m.groups()
mon_num = _MONATE.get(mon_str.lower())
if mon_num:
return f"{y}-{mon_num:02d}-{int(d):02d}"
return None
# ── PARSER 1: /ausstellungen/liste/typ/spezial/ ──────────────────────────────
# Struktur: div.ausstellung_liste > div.row > div.span6
# Linke span6: <b>Rassen</b><br>DD.MM.YYYY<br> Verein<br> Straße<br> PLZ Ort<br>
class _SpezialParser(HTMLParser):
def __init__(self):
super().__init__()
self._events = []
self._in_liste = False
self._row_d = 0 # depth beim row-Start
self._span_d = 0 # depth beim span6-Start
self._depth = 0
self._in_row = False
self._in_span = False # linke span6 (erste im row)
self._span_done = False # linke span6 fertig geparst
self._in_b = False
self._buf = ""
self._parts: list[str] = [] # Teile zwischen <br>
self._title = ""
def handle_starttag(self, tag, attrs):
self._depth += 1
a = dict(attrs)
cls = a.get("class", "")
if "ausstellung_liste" in cls:
self._in_liste = True
if self._in_liste and tag == "div" and "row" in cls.split():
self._in_row = True
self._row_d = self._depth
self._span_done = False
self._title = ""
self._parts = []
if self._in_row and not self._span_done and tag == "div" and "span6" in cls.split():
if not self._in_span:
self._in_span = True
self._span_d = self._depth
self._buf = ""
if self._in_span and tag == "b":
self._in_b = True
self._buf = ""
# <br> ist void — kein handle_endtag, muss hier behandelt werden
if self._in_span and tag == "br":
part = self._buf.strip()
if part and not self._in_b:
self._parts.append(part)
self._buf = ""
def handle_endtag(self, tag):
if self._in_span:
if tag == "b" and self._in_b:
self._in_b = False
self._title = self._buf.strip()
self._buf = ""
elif self._depth <= self._span_d:
# Ende der linken span6 — auswerten
self._in_span = False
self._span_done = True
date_str, ort = "", ""
for part in self._parts:
if not date_str:
parsed = _parse_date(part)
if parsed:
date_str = parsed
continue
# PLZ Ort: 5 Ziffern gefolgt von Stadtname
m = re.match(r'^\d{5}\s+(.+)$', part)
if m:
ort = m.group(1).strip()
if self._title and date_str:
self._events.append({
"titel": self._title,
"datum": date_str,
"ort_name": ort,
"link": "https://www.vdh.de/ausstellungen/",
})
if self._in_row and self._depth < self._row_d:
self._in_row = False
self._depth -= 1
def handle_data(self, data):
if self._in_b:
self._buf += data
elif self._in_span:
self._buf += data
def get_events(self) -> list[dict]:
return self._events
# ── PARSER 2: /hundesport/termine/ ───────────────────────────────────────────
# Struktur: h2.h2ash1 (Kategorie) → ul > li
# li-Text: "DD.MM.YYYY<br>Titel<br><b>Ort:</b> Stadt<br>"
class _SportParser(HTMLParser):
def __init__(self):
super().__init__()
self._events = []
self._cat = "" # aktuelle Kategorie (h2.h2ash1)
self._in_h2 = False
self._in_li = False
self._in_b = False
self._buf = ""
self._parts: list[str] = []
self._depth = 0
self._li_d = 0
def handle_starttag(self, tag, attrs):
self._depth += 1
a = dict(attrs)
cls = a.get("class", "")
if tag == "h2" and "h2ash1" in cls:
self._in_h2 = True
self._buf = ""
if tag == "li" and not self._in_li:
self._in_li = True
self._li_d = self._depth
self._parts = []
self._buf = ""
if self._in_li and tag == "b":
self._in_b = True
# <br> ist void — kein handle_endtag, muss hier behandelt werden
if self._in_li and tag == "br":
part = self._buf.strip()
if part:
self._parts.append(part)
self._buf = ""
def handle_endtag(self, tag):
if tag == "h2" and self._in_h2:
self._in_h2 = False
self._cat = self._buf.strip()
self._buf = ""
if self._in_li:
if tag == "b":
self._in_b = False
elif tag == "li" and self._depth <= self._li_d:
self._in_li = False
# parts: [date, title, "Ort: Stadt"] oder ähnlich
date_str, title, ort = "", "", ""
for i, part in enumerate(self._parts):
if not date_str:
parsed = _parse_date(part.split(" - ")[0].strip())
if parsed:
date_str = parsed
continue
if not title and date_str:
# Titel darf nicht mit "Ort:" beginnen
if not part.lower().startswith("ort:"):
title = part
continue
m = re.match(r'^Ort:\s*(.+)$', part, re.IGNORECASE)
if m:
ort = m.group(1).strip()
if title and date_str:
self._events.append({
"titel": title,
"datum": date_str,
"ort_name": ort,
"link": "https://www.vdh.de/hundesport/termine/",
})
self._depth -= 1
def handle_data(self, data):
if self._in_h2:
self._buf += data
elif self._in_li:
self._buf += data
def get_events(self) -> list[dict]:
return self._events
def _build_external_id(ev: dict) -> str:
raw = f"vdh-{ev['datum']}-{ev['titel']}"
key = re.sub(r'[^a-z0-9]+', '-', raw.lower()).strip('-')
return key[:120]
async def fetch_vdh_events() -> list[dict]:
    """
    Scrape VDH events from ausstellungen/liste and hundesport/termine.

    Each source is fetched and parsed independently; a failure of one source
    is logged and does not prevent the other from being used. The scraped
    events are then normalised: events dated before today, events whose
    stripped title is shorter than 3 characters and duplicates (same
    external_id) are dropped.

    Returns:
        A list of dicts {titel, datum, ort_name, typ, link, external_id}.
        If nothing could be scraped, or nothing is left after filtering,
        copies of FALLBACK_EVENTS are returned instead.
    """
    sources = [
        ("https://www.vdh.de/ausstellungen/liste/typ/spezial/", _SpezialParser),
        ("https://www.vdh.de/hundesport/termine/", _SportParser),
    ]
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-DE,de;q=0.9,en;q=0.5",
    }

    raw_events: list[dict] = []
    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        for url, parser_cls in sources:
            try:
                resp = await client.get(url, headers=headers)
                resp.raise_for_status()
                parser = parser_cls()
                parser.feed(resp.text)
                # Flush whatever the parser still holds in its buffer.
                parser.close()
                found = parser.get_events()
            except httpx.HTTPStatusError as e:
                logger.warning(
                    "VDH-Scraper HTTP-Fehler %s für %s: %s",
                    e.response.status_code, url, e,
                )
            except Exception as e:
                # Deliberately broad: scraping is best effort, any failure of
                # one source must only lead to the fallback, never to a crash.
                logger.warning("VDH-Scraper Fehler für %s: %s", url, e)
            else:
                if found:
                    logger.info("VDH-Scraper: %s Events von %s", len(found), url)
                    raw_events.extend(found)
                else:
                    logger.info("VDH-Scraper: Keine Events auf %s", url)

    if not raw_events:
        logger.warning("VDH-Scraper: Keine Daten — verwende Fallback-Events.")
        # Copy the dicts so callers cannot mutate the module-level constant.
        return [dict(ev) for ev in FALLBACK_EVENTS]

    # ISO dates compare correctly as plain strings.
    today = datetime.today().strftime("%Y-%m-%d")
    result = []
    seen_ids: set[str] = set()
    for ev in raw_events:
        datum = ev.get("datum", "")
        if datum < today:
            continue
        titel = ev.get("titel", "").strip()
        if len(titel) < 3:
            continue
        external_id = _build_external_id(ev)
        if external_id in seen_ids:
            continue
        seen_ids.add(external_id)
        result.append({
            "titel": titel,
            "datum": datum,
            "ort_name": ev.get("ort_name") or None,
            "typ": _guess_typ(titel),
            "link": ev.get("link", "https://www.vdh.de"),
            "external_id": external_id,
        })

    if not result:
        logger.warning("VDH-Scraper: Nach Filterung 0 Events — verwende Fallback.")
        return [dict(ev) for ev in FALLBACK_EVENTS]

    logger.info("VDH-Scraper: %s zukünftige Events nach Normalisierung.", len(result))
    return result