# banyaro/backend/scraper/events_vdh.py (Python, 318 lines, 11 KiB)

"""
BAN YARO — VDH Veranstaltungs-Scraper
Scrapt Hundeveranstaltungen von vdh.de.
Bei Fehler oder 0 Ergebnissen: Fallback auf hartcodierte Events.
"""
import logging
import re
from datetime import datetime
from html.parser import HTMLParser
import httpx
logger = logging.getLogger(__name__)
# Hard-coded events that are returned when scraping fails or yields nothing.
FALLBACK_EVENTS = [
    {
        "titel": "VDH-Europasiegershow 2026",
        "datum": "2026-06-14",
        "ort_name": "Dortmund",
        "typ": "ausstellung",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-europasieger-2026",
    },
    {
        "titel": "Internationale Hundeausstellung Frankfurt",
        "datum": "2026-05-03",
        "ort_name": "Frankfurt am Main",
        "typ": "ausstellung",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-frankfurt-2026",
    },
    {
        "titel": "VDH-Bundessiegerprüfung Agility",
        "datum": "2026-07-19",
        "ort_name": "Leipzig",
        "typ": "wettkampf",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-agility-2026",
    },
    {
        "titel": "Rassehundetreffen München",
        "datum": "2026-08-22",
        "ort_name": "München",
        "typ": "treffen",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-muenchen-2026",
    },
    {
        "titel": "Hundesport-Turnier Berlin",
        "datum": "2026-09-12",
        "ort_name": "Berlin",
        "typ": "wettkampf",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-berlin-2026",
    },
]
# Maps keywords found in VDH event titles to the internal event types.
# The insertion order matters: the first keyword contained in a title wins.
_TYP_MAP = {
    # shows / exhibitions
    "ausstellung": "ausstellung",
    "show": "ausstellung",
    "siegershow": "ausstellung",
    # competitions and trials
    "agility": "wettkampf",
    "wettkampf": "wettkampf",
    "turnier": "wettkampf",
    "prüfung": "wettkampf",
    # everything else
    "training": "training",
    "treffen": "treffen",
    "markt": "markt",
}
# German month names (lower-case) mapped to their month number.
# "maerz" is the ASCII spelling of "märz".
_MONATE = {
    "januar": 1,
    "februar": 2,
    "märz": 3,
    "maerz": 3,
    "april": 4,
    "mai": 5,
    "juni": 6,
    "juli": 7,
    "august": 8,
    "september": 9,
    "oktober": 10,
    "november": 11,
    "dezember": 12,
}
def _guess_typ(text: str) -> str:
    """Derive the internal event type from an event title.

    The title is lower-cased and checked against the keywords of
    ``_TYP_MAP`` in the mapping's insertion order; the first keyword that
    occurs as a substring decides the type.

    Returns:
        The mapped type, or ``"sonstiges"`` when no keyword occurs.
    """
    lowered = text.lower()
    candidates = (typ for keyword, typ in _TYP_MAP.items() if keyword in lowered)
    return next(candidates, "sonstiges")
def _parse_date(raw: str) -> str | None:
"""
Versucht verschiedene Datumsformate zu parsen.
Gibt YYYY-MM-DD zurück oder None.
"""
raw = raw.strip()
# ISO: 2026-05-03
m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw)
if m:
return raw
# DD.MM.YYYY oder D.M.YYYY
m = re.match(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$', raw)
if m:
d, mo, y = m.groups()
return f"{y}-{int(mo):02d}-{int(d):02d}"
# DD. Monatsname YYYY (z.B. "14. Juni 2026")
m = re.match(r'^(\d{1,2})\.\s*(\w+)\s+(\d{4})$', raw)
if m:
d, mon_str, y = m.groups()
mon_num = _MONATE.get(mon_str.lower())
if mon_num:
return f"{y}-{mon_num:02d}-{int(d):02d}"
# Monatsname DD, YYYY (englisch, Fallback)
try:
dt = datetime.strptime(raw, "%B %d, %Y")
return dt.strftime("%Y-%m-%d")
except ValueError:
pass
return None
class _VDHParser(HTMLParser):
    """Simple state-machine parser for the VDH event pages.

    Looks for typical structures: ``<article>`` elements and ``<li>``/``<div>``
    elements whose class hints at an event. Inside such a container it picks
    up the first matching link, the title, the date and the location. An
    event is only collected when both a title and a parseable date were found.

    Nested containers are not treated as separate events; everything up to
    the end tag of the outermost container belongs to one event.
    """

    # HTML void elements never get an end tag. They must not take part in
    # the depth bookkeeping, otherwise an <img> or <br> inside an event
    # would keep the container "open" forever.
    _VOID_TAGS = frozenset({
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "source", "track", "wbr",
    })

    def __init__(self):
        super().__init__()
        self._events: list[dict] = []        # completed events
        self._current: dict | None = None    # event currently being filled
        self._depth = 0                      # nesting depth of open (non-void) tags
        self._start_depth = 0                # depth at which the current container opened
        self._capture = None                 # 'titel' | 'datum' | 'ort'
        self._buf = ""                       # text collected for the active capture
        self._in_event = False

    # ---------- helpers ----------

    @staticmethod
    def _attr_value(attrs, name) -> str:
        """Return the attribute value, or '' if missing or valueless.

        html.parser reports attributes without a value (``<div class>``) as
        None, which would break the substring checks below.
        """
        return dict(attrs).get(name) or ""

    def _is_event_container(self, tag, attrs):
        """Return True if the tag opens an event block."""
        cls = self._attr_value(attrs, "class")
        return (
            tag == "article"
            or (tag in ("li", "div") and any(
                kw in cls for kw in ("event", "veranstaltung", "termin", "entry", "item")
            ))
        )

    def _is_title_tag(self, tag, attrs):
        """Return True for headings and elements whose class hints at a title."""
        cls = self._attr_value(attrs, "class")
        return tag in ("h2", "h3", "h4") or any(
            kw in cls for kw in ("title", "titel", "name", "heading")
        )

    def _is_date_tag(self, tag, attrs):
        """Return True for <time> and elements marked as a date via class/itemprop."""
        cls = self._attr_value(attrs, "class")
        it = self._attr_value(attrs, "itemprop")
        return (
            tag == "time"
            or any(kw in cls for kw in ("date", "datum", "time"))
            or it in ("startDate", "endDate")
        )

    def _is_location_tag(self, tag, attrs):
        """Return True for elements marked as a location via class/itemprop."""
        cls = self._attr_value(attrs, "class")
        it = self._attr_value(attrs, "itemprop")
        return (
            any(kw in cls for kw in ("location", "ort", "venue", "place", "city"))
            or it in ("location", "addressLocality")
        )

    # ---------- SAX events ----------

    def handle_starttag(self, tag, attrs):
        if tag in self._VOID_TAGS:
            # A line break inside captured text separates words.
            if tag == "br" and self._capture:
                self._buf += " "
            return

        self._depth += 1

        if not self._in_event:
            if self._is_event_container(tag, attrs):
                self._in_event = True
                self._start_depth = self._depth
                self._current = {"titel": "", "datum": "", "ort_name": "", "link": ""}
            return

        # First link inside the event block that points to vdh.de wins.
        if tag == "a" and not self._current.get("link"):
            href = self._attr_value(attrs, "href")
            if "vdh.de" in href or href.startswith("/"):
                self._current["link"] = href

        # <time datetime="…">: the machine-readable value is taken directly
        # and replaces any date found so far.
        if tag == "time":
            dt = self._attr_value(attrs, "datetime")
            if dt:
                parsed = _parse_date(dt)
                if parsed:
                    self._current["datum"] = parsed

        # Decide which field the following text belongs to. Starting a new
        # capture discards the text of an unfinished one.
        if self._is_title_tag(tag, attrs):
            self._capture = "titel"
            self._buf = ""
        elif self._is_date_tag(tag, attrs) and not self._current.get("datum"):
            self._capture = "datum"
            self._buf = ""
        elif self._is_location_tag(tag, attrs):
            self._capture = "ort"
            self._buf = ""

    def handle_endtag(self, tag):
        if tag in self._VOID_TAGS:
            # Mirrors handle_starttag: "<br/>" and stray "</br>" are ignored.
            return

        # The first end tag after a capture started finishes that capture.
        if self._capture:
            if self._current is not None:
                # Collapse runs of whitespace/newlines like a browser would.
                val = " ".join(self._buf.split())
                if self._capture == "titel" and val:
                    self._current["titel"] = val
                elif self._capture == "datum" and val and not self._current.get("datum"):
                    parsed = _parse_date(val)
                    if parsed:
                        self._current["datum"] = parsed
                elif self._capture == "ort" and val:
                    self._current["ort_name"] = val
            self._capture = None
            self._buf = ""

        self._depth -= 1

        # Container closed: keep the event only if it has title + date.
        if self._in_event and self._depth < self._start_depth:
            self._in_event = False
            ev = self._current
            if ev and ev.get("titel") and ev.get("datum"):
                self._events.append(ev)
            self._current = None

    def handle_data(self, data):
        if self._capture:
            self._buf += data

    def get_events(self) -> list[dict]:
        """Return the events collected so far (dicts with titel, datum, ort_name, link)."""
        return self._events
def _build_external_id(ev: dict) -> str:
    """Build a stable de-duplication key from an event's date and title.

    The key is ``vdh-<datum>-<titel>`` in lower case, with every run of
    characters outside ``a-z0-9`` replaced by a single hyphen, leading and
    trailing hyphens removed, and the result cut to at most 120 characters.
    """
    combined = f"vdh-{ev['datum']}-{ev['titel']}"
    # Simple normalisation: lower-case, special characters out.
    slug = re.sub(r'[^a-z0-9]+', '-', combined.lower())
    slug = slug.strip('-')
    return slug[:120]
def _normalize_vdh_events(raw_events: list[dict], today: str) -> list[dict]:
    """Filter, clean up and de-duplicate raw parser events.

    Drops events dated before ``today`` (``YYYY-MM-DD``; string comparison),
    events whose title has fewer than 3 characters, and events whose
    ``external_id`` was already seen. Site-relative links are resolved
    against ``https://www.vdh.de``.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    result: list[dict] = []
    seen_ids: set[str] = set()
    for ev in raw_events:
        datum = ev.get("datum", "")
        # Only future events (today included).
        if datum < today:
            continue
        titel = ev.get("titel", "").strip()
        if len(titel) < 3:
            continue
        link = ev.get("link") or ""
        if link.startswith("/"):
            # urljoin also handles protocol-relative links ("//host/path"),
            # which plain string concatenation would turn into a broken URL.
            link = urljoin("https://www.vdh.de", link)
        external_id = _build_external_id(ev)
        if external_id in seen_ids:
            continue
        seen_ids.add(external_id)
        result.append({
            "titel": titel,
            "datum": datum,
            "ort_name": ev.get("ort_name") or None,
            "typ": _guess_typ(titel),
            "link": link or "https://www.vdh.de",
            "external_id": external_id,
        })
    return result


async def fetch_vdh_events() -> list[dict]:
    """Scrape VDH events and return them as a list of dicts.

    Each dict has the keys
    ``{titel, datum, ort_name, typ, link, external_id}``.

    The candidate URLs are tried in order; the first page that yields at
    least one parsed event is used. HTTP, network and parsing errors for a
    URL are logged and the next URL is tried.

    Returns:
        The normalised future events, or a copy of ``FALLBACK_EVENTS`` when
        nothing could be scraped or no future event remains after filtering.
        The fallback entries are copied so callers may modify the result
        without altering the module-level constant.
    """
    urls = [
        "https://www.vdh.de/ausstellungen/liste/typ/spezial/",
        "https://www.vdh.de/ausstellungen/",
        "https://www.vdh.de/hundesport/termine/",
    ]
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-DE,de;q=0.9,en;q=0.5",
    }

    raw_events: list[dict] = []
    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        for url in urls:
            try:
                resp = await client.get(url, headers=headers)
                resp.raise_for_status()
                parser = _VDHParser()
                parser.feed(resp.text)
                parser.close()  # flush any data still buffered by the parser
                found = parser.get_events()
                if found:
                    logger.info(f"VDH-Scraper: {len(found)} Events von {url} geparst.")
                    raw_events = found
                    break
                logger.info(f"VDH-Scraper: Keine Events auf {url} gefunden, nächste URL versuchen.")
            except httpx.HTTPStatusError as e:
                logger.warning(f"VDH-Scraper HTTP-Fehler {e.response.status_code} für {url}: {e}")
            except httpx.RequestError as e:
                logger.warning(f"VDH-Scraper Netzwerkfehler für {url}: {e}")
            except Exception as e:
                # Best-effort scraper: never let one page abort the run, but
                # keep the traceback since the cause is unknown.
                logger.warning(f"VDH-Scraper unbekannter Fehler für {url}: {e}", exc_info=True)

    if not raw_events:
        logger.warning("VDH-Scraper: Keine Daten erhalten — verwende Fallback-Events.")
        return [dict(ev) for ev in FALLBACK_EVENTS]

    # Normalise
    today = datetime.today().strftime("%Y-%m-%d")
    result = _normalize_vdh_events(raw_events, today)

    if not result:
        logger.warning("VDH-Scraper: Nach Filterung 0 zukünftige Events — verwende Fallback-Events.")
        return [dict(ev) for ev in FALLBACK_EVENTS]

    logger.info(f"VDH-Scraper: {len(result)} zukünftige Events nach Normalisierung.")
    return result