Fix: VDH-Scraper komplett neu — dedizierte Parser für /ausstellungen/liste/ und /hundesport/termine/ (neue HTML-Struktur)
This commit is contained in:
parent
f02b9aa4ab
commit
bff54dcfd3
1 changed file with 188 additions and 168 deletions
|
|
@@ -21,21 +21,13 @@ FALLBACK_EVENTS = [
|
||||||
{"titel": "Hundesport-Turnier Berlin", "datum": "2026-09-12", "ort_name": "Berlin", "typ": "wettkampf", "link": "https://www.vdh.de", "external_id": "vdh-fallback-berlin-2026"},
|
{"titel": "Hundesport-Turnier Berlin", "datum": "2026-09-12", "ort_name": "Berlin", "typ": "wettkampf", "link": "https://www.vdh.de", "external_id": "vdh-fallback-berlin-2026"},
|
||||||
]
|
]
|
||||||
|
|
||||||
# Mapping VDH-Kategorienamen → interne Typen
|
|
||||||
_TYP_MAP = {
|
_TYP_MAP = {
|
||||||
"ausstellung": "ausstellung",
|
"ausstellung": "ausstellung", "show": "ausstellung", "siegershow": "ausstellung",
|
||||||
"show": "ausstellung",
|
"agility": "wettkampf", "wettkampf": "wettkampf", "turnier": "wettkampf",
|
||||||
"siegershow": "ausstellung",
|
"prüfung": "wettkampf", "meisterschaft": "wettkampf",
|
||||||
"agility": "wettkampf",
|
"training": "training", "treffen": "treffen", "markt": "markt",
|
||||||
"wettkampf": "wettkampf",
|
|
||||||
"turnier": "wettkampf",
|
|
||||||
"prüfung": "wettkampf",
|
|
||||||
"training": "training",
|
|
||||||
"treffen": "treffen",
|
|
||||||
"markt": "markt",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Monatsnamen Deutsch → Zahl
|
|
||||||
_MONATE = {
|
_MONATE = {
|
||||||
"januar": 1, "februar": 2, "märz": 3, "maerz": 3,
|
"januar": 1, "februar": 2, "märz": 3, "maerz": 3,
|
||||||
"april": 4, "mai": 5, "juni": 6, "juli": 7,
|
"april": 4, "mai": 5, "juni": 6, "juli": 7,
|
||||||
|
|
@@ -45,171 +37,217 @@ _MONATE = {
|
||||||
|
|
||||||
|
|
||||||
def _guess_typ(text: str) -> str:
|
def _guess_typ(text: str) -> str:
|
||||||
"""Bestimmt den Event-Typ anhand des Titels."""
|
|
||||||
t = text.lower()
|
t = text.lower()
|
||||||
for keyword, typ in _TYP_MAP.items():
|
for keyword, typ in _TYP_MAP.items():
|
||||||
if keyword in t:
|
if keyword in t:
|
||||||
return typ
|
return typ
|
||||||
return "sonstiges"
|
return "ausstellung"
|
||||||
|
|
||||||
|
|
||||||
def _parse_date(raw: str) -> str | None:
|
def _parse_date(raw: str) -> str | None:
|
||||||
"""
|
|
||||||
Versucht verschiedene Datumsformate zu parsen.
|
|
||||||
Gibt YYYY-MM-DD zurück oder None.
|
|
||||||
"""
|
|
||||||
raw = raw.strip()
|
raw = raw.strip()
|
||||||
|
# Datumsbereich: "DD.MM.YYYY - DD.MM.YYYY" → erstes Datum nehmen
|
||||||
|
raw = raw.split(" - ")[0].strip()
|
||||||
# ISO: 2026-05-03
|
# ISO: 2026-05-03
|
||||||
m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw)
|
m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw)
|
||||||
if m:
|
if m:
|
||||||
return raw
|
return raw
|
||||||
|
|
||||||
# DD.MM.YYYY oder D.M.YYYY
|
# DD.MM.YYYY oder D.M.YYYY
|
||||||
m = re.match(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$', raw)
|
m = re.match(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$', raw)
|
||||||
if m:
|
if m:
|
||||||
d, mo, y = m.groups()
|
d, mo, y = m.groups()
|
||||||
return f"{y}-{int(mo):02d}-{int(d):02d}"
|
return f"{y}-{int(mo):02d}-{int(d):02d}"
|
||||||
|
# DD. Monatsname YYYY
|
||||||
# DD. Monatsname YYYY (z.B. "14. Juni 2026")
|
|
||||||
m = re.match(r'^(\d{1,2})\.\s*(\w+)\s+(\d{4})$', raw)
|
m = re.match(r'^(\d{1,2})\.\s*(\w+)\s+(\d{4})$', raw)
|
||||||
if m:
|
if m:
|
||||||
d, mon_str, y = m.groups()
|
d, mon_str, y = m.groups()
|
||||||
mon_num = _MONATE.get(mon_str.lower())
|
mon_num = _MONATE.get(mon_str.lower())
|
||||||
if mon_num:
|
if mon_num:
|
||||||
return f"{y}-{mon_num:02d}-{int(d):02d}"
|
return f"{y}-{mon_num:02d}-{int(d):02d}"
|
||||||
|
|
||||||
# Monatsname DD, YYYY (englisch, Fallback)
|
|
||||||
try:
|
|
||||||
dt = datetime.strptime(raw, "%B %d, %Y")
|
|
||||||
return dt.strftime("%Y-%m-%d")
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
class _VDHParser(HTMLParser):
|
# ── PARSER 1: /ausstellungen/liste/typ/spezial/ ──────────────────────────────
|
||||||
"""
|
# Struktur: div.ausstellung_liste > div.row > div.span6
|
||||||
Einfacher Zustandsautomat-Parser für die VDH-Veranstaltungsseite.
|
# Linke span6: <b>Rassen</b><br>DD.MM.YYYY<br> Verein<br> Straße<br> PLZ Ort<br>
|
||||||
Sucht nach typischen Strukturen: article, li.event, div mit Datums-/Titel-Klassen.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
class _SpezialParser(HTMLParser):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self._events: list[dict] = []
|
self._events = []
|
||||||
self._current: dict | None = None
|
self._in_liste = False
|
||||||
self._depth = 0
|
self._row_d = 0 # depth beim row-Start
|
||||||
self._start_depth = 0
|
self._span_d = 0 # depth beim span6-Start
|
||||||
self._capture = None # 'titel' | 'datum' | 'ort'
|
self._depth = 0
|
||||||
self._buf = ""
|
self._in_row = False
|
||||||
self._in_event = False
|
self._in_span = False # linke span6 (erste im row)
|
||||||
|
self._span_done = False # linke span6 fertig geparst
|
||||||
# ---------- Hilfsmethoden ----------
|
self._in_b = False
|
||||||
|
self._buf = ""
|
||||||
def _is_event_container(self, tag, attrs):
|
self._parts: list[str] = [] # Teile zwischen <br>
|
||||||
"""Erkennt Start eines Event-Blocks."""
|
self._title = ""
|
||||||
a = dict(attrs)
|
|
||||||
cls = a.get("class", "")
|
|
||||||
return (
|
|
||||||
tag == "article"
|
|
||||||
or (tag in ("li", "div") and any(
|
|
||||||
kw in cls for kw in ("event", "veranstaltung", "termin", "entry", "item")
|
|
||||||
))
|
|
||||||
)
|
|
||||||
|
|
||||||
def _is_title_tag(self, tag, attrs):
|
|
||||||
a = dict(attrs)
|
|
||||||
cls = a.get("class", "")
|
|
||||||
return tag in ("h2", "h3", "h4") or any(
|
|
||||||
kw in cls for kw in ("title", "titel", "name", "heading")
|
|
||||||
)
|
|
||||||
|
|
||||||
def _is_date_tag(self, tag, attrs):
|
|
||||||
a = dict(attrs)
|
|
||||||
cls = a.get("class", "")
|
|
||||||
it = a.get("itemprop", "")
|
|
||||||
return (
|
|
||||||
tag in ("time",)
|
|
||||||
or any(kw in cls for kw in ("date", "datum", "time"))
|
|
||||||
or it in ("startDate", "endDate")
|
|
||||||
)
|
|
||||||
|
|
||||||
def _is_location_tag(self, tag, attrs):
|
|
||||||
a = dict(attrs)
|
|
||||||
cls = a.get("class", "")
|
|
||||||
it = a.get("itemprop", "")
|
|
||||||
return (
|
|
||||||
any(kw in cls for kw in ("location", "ort", "venue", "place", "city"))
|
|
||||||
or it in ("location", "addressLocality")
|
|
||||||
)
|
|
||||||
|
|
||||||
# ---------- SAX-Events ----------
|
|
||||||
|
|
||||||
def handle_starttag(self, tag, attrs):
|
def handle_starttag(self, tag, attrs):
|
||||||
self._depth += 1
|
self._depth += 1
|
||||||
a = dict(attrs)
|
a = dict(attrs)
|
||||||
|
cls = a.get("class", "")
|
||||||
|
|
||||||
if not self._in_event and self._is_event_container(tag, attrs):
|
if "ausstellung_liste" in cls:
|
||||||
self._in_event = True
|
self._in_liste = True
|
||||||
self._start_depth = self._depth
|
|
||||||
self._current = {"titel": "", "datum": "", "ort_name": "", "link": ""}
|
|
||||||
# Direkter Link auf dem Container?
|
|
||||||
if tag == "a" and "href" in a:
|
|
||||||
self._current["link"] = a["href"]
|
|
||||||
return
|
|
||||||
|
|
||||||
if self._in_event:
|
if self._in_liste and tag == "div" and "row" in cls.split():
|
||||||
# Link innerhalb des Event-Blocks
|
self._in_row = True
|
||||||
if tag == "a" and "href" in a and not self._current.get("link"):
|
self._row_d = self._depth
|
||||||
href = a["href"]
|
self._span_done = False
|
||||||
if "vdh.de" in href or href.startswith("/"):
|
self._title = ""
|
||||||
self._current["link"] = href
|
self._parts = []
|
||||||
|
|
||||||
# <time datetime="…">
|
if self._in_row and not self._span_done and tag == "div" and "span6" in cls.split():
|
||||||
if tag == "time":
|
if not self._in_span:
|
||||||
dt = a.get("datetime", "")
|
self._in_span = True
|
||||||
if dt:
|
self._span_d = self._depth
|
||||||
parsed = _parse_date(dt)
|
|
||||||
if parsed:
|
|
||||||
self._current["datum"] = parsed
|
|
||||||
|
|
||||||
if self._is_title_tag(tag, attrs):
|
|
||||||
self._capture = "titel"
|
|
||||||
self._buf = ""
|
|
||||||
elif self._is_date_tag(tag, attrs) and not self._current.get("datum"):
|
|
||||||
self._capture = "datum"
|
|
||||||
self._buf = ""
|
|
||||||
elif self._is_location_tag(tag, attrs):
|
|
||||||
self._capture = "ort"
|
|
||||||
self._buf = ""
|
self._buf = ""
|
||||||
|
|
||||||
|
if self._in_span and tag == "b":
|
||||||
|
self._in_b = True
|
||||||
|
self._buf = ""
|
||||||
|
|
||||||
def handle_endtag(self, tag):
|
def handle_endtag(self, tag):
|
||||||
if self._capture:
|
if self._in_span:
|
||||||
val = self._buf.strip()
|
if tag == "b" and self._in_b:
|
||||||
if self._capture == "titel" and val:
|
self._in_b = False
|
||||||
self._current["titel"] = val
|
self._title = self._buf.strip()
|
||||||
elif self._capture == "datum" and val and not self._current.get("datum"):
|
self._buf = ""
|
||||||
parsed = _parse_date(val)
|
|
||||||
if parsed:
|
elif tag == "br":
|
||||||
self._current["datum"] = parsed
|
part = self._buf.strip()
|
||||||
elif self._capture == "ort" and val:
|
if part:
|
||||||
self._current["ort_name"] = val
|
self._parts.append(part)
|
||||||
self._capture = None
|
self._buf = ""
|
||||||
self._buf = ""
|
|
||||||
|
elif self._depth <= self._span_d:
|
||||||
|
# Ende der linken span6 — auswerten
|
||||||
|
self._in_span = False
|
||||||
|
self._span_done = True
|
||||||
|
date_str, ort = "", ""
|
||||||
|
for part in self._parts:
|
||||||
|
if not date_str:
|
||||||
|
parsed = _parse_date(part)
|
||||||
|
if parsed:
|
||||||
|
date_str = parsed
|
||||||
|
continue
|
||||||
|
# PLZ Ort: 5 Ziffern gefolgt von Stadtname
|
||||||
|
m = re.match(r'^\d{5}\s+(.+)$', part)
|
||||||
|
if m:
|
||||||
|
ort = m.group(1).strip()
|
||||||
|
|
||||||
|
if self._title and date_str:
|
||||||
|
self._events.append({
|
||||||
|
"titel": self._title,
|
||||||
|
"datum": date_str,
|
||||||
|
"ort_name": ort,
|
||||||
|
"link": "https://www.vdh.de/ausstellungen/",
|
||||||
|
})
|
||||||
|
|
||||||
|
if self._in_row and self._depth < self._row_d:
|
||||||
|
self._in_row = False
|
||||||
|
|
||||||
self._depth -= 1
|
self._depth -= 1
|
||||||
|
|
||||||
if self._in_event and self._depth < self._start_depth:
|
def handle_data(self, data):
|
||||||
self._in_event = False
|
if self._in_b:
|
||||||
ev = self._current
|
self._buf += data
|
||||||
# Nur speichern wenn wir Titel + Datum haben
|
elif self._in_span:
|
||||||
if ev and ev.get("titel") and ev.get("datum"):
|
self._buf += data
|
||||||
self._events.append(ev)
|
|
||||||
self._current = None
|
def get_events(self) -> list[dict]:
|
||||||
|
return self._events
|
||||||
|
|
||||||
|
|
||||||
|
# ── PARSER 2: /hundesport/termine/ ───────────────────────────────────────────
|
||||||
|
# Struktur: h2.h2ash1 (Kategorie) → ul > li
|
||||||
|
# li-Text: "DD.MM.YYYY<br>Titel<br><b>Ort:</b> Stadt<br>"
|
||||||
|
|
||||||
|
class _SportParser(HTMLParser):
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
self._events = []
|
||||||
|
self._cat = "" # aktuelle Kategorie (h2.h2ash1)
|
||||||
|
self._in_h2 = False
|
||||||
|
self._in_li = False
|
||||||
|
self._in_b = False
|
||||||
|
self._buf = ""
|
||||||
|
self._parts: list[str] = []
|
||||||
|
self._depth = 0
|
||||||
|
self._li_d = 0
|
||||||
|
|
||||||
|
def handle_starttag(self, tag, attrs):
|
||||||
|
self._depth += 1
|
||||||
|
a = dict(attrs)
|
||||||
|
cls = a.get("class", "")
|
||||||
|
|
||||||
|
if tag == "h2" and "h2ash1" in cls:
|
||||||
|
self._in_h2 = True
|
||||||
|
self._buf = ""
|
||||||
|
|
||||||
|
if tag == "li" and not self._in_li:
|
||||||
|
self._in_li = True
|
||||||
|
self._li_d = self._depth
|
||||||
|
self._parts = []
|
||||||
|
self._buf = ""
|
||||||
|
|
||||||
|
if self._in_li and tag == "b":
|
||||||
|
self._in_b = True
|
||||||
|
|
||||||
|
def handle_endtag(self, tag):
|
||||||
|
if tag == "h2" and self._in_h2:
|
||||||
|
self._in_h2 = False
|
||||||
|
self._cat = self._buf.strip()
|
||||||
|
self._buf = ""
|
||||||
|
|
||||||
|
if self._in_li:
|
||||||
|
if tag == "b":
|
||||||
|
self._in_b = False
|
||||||
|
|
||||||
|
elif tag == "br":
|
||||||
|
part = self._buf.strip()
|
||||||
|
if part:
|
||||||
|
self._parts.append(part)
|
||||||
|
self._buf = ""
|
||||||
|
|
||||||
|
elif tag == "li" and self._depth <= self._li_d:
|
||||||
|
self._in_li = False
|
||||||
|
# parts: [date, title, "Ort: Stadt"] oder ähnlich
|
||||||
|
date_str, title, ort = "", "", ""
|
||||||
|
for i, part in enumerate(self._parts):
|
||||||
|
if not date_str:
|
||||||
|
parsed = _parse_date(part.split(" - ")[0].strip())
|
||||||
|
if parsed:
|
||||||
|
date_str = parsed
|
||||||
|
continue
|
||||||
|
if not title and date_str:
|
||||||
|
# Titel darf nicht mit "Ort:" beginnen
|
||||||
|
if not part.lower().startswith("ort:"):
|
||||||
|
title = part
|
||||||
|
continue
|
||||||
|
m = re.match(r'^Ort:\s*(.+)$', part, re.IGNORECASE)
|
||||||
|
if m:
|
||||||
|
ort = m.group(1).strip()
|
||||||
|
|
||||||
|
if title and date_str:
|
||||||
|
self._events.append({
|
||||||
|
"titel": title,
|
||||||
|
"datum": date_str,
|
||||||
|
"ort_name": ort,
|
||||||
|
"link": "https://www.vdh.de/hundesport/termine/",
|
||||||
|
})
|
||||||
|
|
||||||
|
self._depth -= 1
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
if self._capture:
|
if self._in_h2:
|
||||||
|
self._buf += data
|
||||||
|
elif self._in_li:
|
||||||
self._buf += data
|
self._buf += data
|
||||||
|
|
||||||
def get_events(self) -> list[dict]:
|
def get_events(self) -> list[dict]:
|
||||||
|
|
@@ -217,24 +255,20 @@ class _VDHParser(HTMLParser):
|
||||||
|
|
||||||
|
|
||||||
def _build_external_id(ev: dict) -> str:
|
def _build_external_id(ev: dict) -> str:
|
||||||
"""Erzeugt einen stabilen Dedup-Key aus Datum + Titel."""
|
|
||||||
raw = f"vdh-{ev['datum']}-{ev['titel']}"
|
raw = f"vdh-{ev['datum']}-{ev['titel']}"
|
||||||
# Einfache Normalisierung: lowercase, Sonderzeichen raus
|
|
||||||
key = re.sub(r'[^a-z0-9]+', '-', raw.lower()).strip('-')
|
key = re.sub(r'[^a-z0-9]+', '-', raw.lower()).strip('-')
|
||||||
return key[:120]
|
return key[:120]
|
||||||
|
|
||||||
|
|
||||||
async def fetch_vdh_events() -> list[dict]:
|
async def fetch_vdh_events() -> list[dict]:
|
||||||
"""
|
"""
|
||||||
Scrapt VDH-Veranstaltungen und gibt eine Liste von Dicts zurück:
|
Scrapt VDH-Veranstaltungen von ausstellungen/liste und hundesport/termine.
|
||||||
{titel, datum, ort_name, typ, link, external_id}
|
Gibt eine Liste von Dicts zurück: {titel, datum, ort_name, typ, link, external_id}
|
||||||
|
|
||||||
Bei Fehler oder 0 Ergebnissen: Fallback auf FALLBACK_EVENTS.
|
Bei Fehler oder 0 Ergebnissen: Fallback auf FALLBACK_EVENTS.
|
||||||
"""
|
"""
|
||||||
urls = [
|
sources = [
|
||||||
"https://www.vdh.de/ausstellungen/liste/typ/spezial/",
|
("https://www.vdh.de/ausstellungen/liste/typ/spezial/", _SpezialParser),
|
||||||
"https://www.vdh.de/ausstellungen/",
|
("https://www.vdh.de/hundesport/termine/", _SportParser),
|
||||||
"https://www.vdh.de/hundesport/termine/",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
|
|
@@ -243,75 +277,61 @@ async def fetch_vdh_events() -> list[dict]:
|
||||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
"Chrome/124.0.0.0 Safari/537.36"
|
"Chrome/124.0.0.0 Safari/537.36"
|
||||||
),
|
),
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
"Accept-Language": "de-DE,de;q=0.9,en;q=0.5",
|
"Accept-Language": "de-DE,de;q=0.9,en;q=0.5",
|
||||||
}
|
}
|
||||||
|
|
||||||
raw_events: list[dict] = []
|
raw_events: list[dict] = []
|
||||||
|
|
||||||
async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
|
async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
|
||||||
for url in urls:
|
for url, ParserClass in sources:
|
||||||
try:
|
try:
|
||||||
resp = await client.get(url, headers=headers)
|
resp = await client.get(url, headers=headers)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
html = resp.text
|
parser = ParserClass()
|
||||||
|
parser.feed(resp.text)
|
||||||
parser = _VDHParser()
|
|
||||||
parser.feed(html)
|
|
||||||
found = parser.get_events()
|
found = parser.get_events()
|
||||||
|
|
||||||
if found:
|
if found:
|
||||||
logger.info(f"VDH-Scraper: {len(found)} Events von {url} geparst.")
|
logger.info(f"VDH-Scraper: {len(found)} Events von {url}")
|
||||||
raw_events = found
|
raw_events.extend(found)
|
||||||
break
|
|
||||||
else:
|
else:
|
||||||
logger.info(f"VDH-Scraper: Keine Events auf {url} gefunden, nächste URL versuchen.")
|
logger.info(f"VDH-Scraper: Keine Events auf {url}")
|
||||||
|
|
||||||
except httpx.HTTPStatusError as e:
|
except httpx.HTTPStatusError as e:
|
||||||
logger.warning(f"VDH-Scraper HTTP-Fehler {e.response.status_code} für {url}: {e}")
|
logger.warning(f"VDH-Scraper HTTP-Fehler {e.response.status_code} für {url}: {e}")
|
||||||
except httpx.RequestError as e:
|
|
||||||
logger.warning(f"VDH-Scraper Netzwerkfehler für {url}: {e}")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"VDH-Scraper unbekannter Fehler für {url}: {e}")
|
logger.warning(f"VDH-Scraper Fehler für {url}: {e}")
|
||||||
|
|
||||||
if not raw_events:
|
if not raw_events:
|
||||||
logger.warning("VDH-Scraper: Keine Daten erhalten — verwende Fallback-Events.")
|
logger.warning("VDH-Scraper: Keine Daten — verwende Fallback-Events.")
|
||||||
return list(FALLBACK_EVENTS)
|
return list(FALLBACK_EVENTS)
|
||||||
|
|
||||||
# Normalisieren
|
|
||||||
today = datetime.today().strftime("%Y-%m-%d")
|
today = datetime.today().strftime("%Y-%m-%d")
|
||||||
result = []
|
result = []
|
||||||
seen_ids: set[str] = set()
|
seen_ids: set[str] = set()
|
||||||
|
|
||||||
for ev in raw_events:
|
for ev in raw_events:
|
||||||
datum = ev.get("datum", "")
|
datum = ev.get("datum", "")
|
||||||
# Nur zukünftige Events
|
|
||||||
if datum < today:
|
if datum < today:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
titel = ev.get("titel", "").strip()
|
titel = ev.get("titel", "").strip()
|
||||||
if not titel or len(titel) < 3:
|
if not titel or len(titel) < 3:
|
||||||
continue
|
continue
|
||||||
|
link = ev.get("link", "https://www.vdh.de")
|
||||||
link = ev.get("link", "")
|
|
||||||
if link and link.startswith("/"):
|
|
||||||
link = "https://www.vdh.de" + link
|
|
||||||
|
|
||||||
entry = {
|
entry = {
|
||||||
"titel": titel,
|
"titel": titel,
|
||||||
"datum": datum,
|
"datum": datum,
|
||||||
"ort_name": ev.get("ort_name") or None,
|
"ort_name": ev.get("ort_name") or None,
|
||||||
"typ": _guess_typ(titel),
|
"typ": _guess_typ(titel),
|
||||||
"link": link or "https://www.vdh.de",
|
"link": link,
|
||||||
"external_id": _build_external_id(ev),
|
"external_id": _build_external_id(ev),
|
||||||
}
|
}
|
||||||
|
|
||||||
if entry["external_id"] not in seen_ids:
|
if entry["external_id"] not in seen_ids:
|
||||||
seen_ids.add(entry["external_id"])
|
seen_ids.add(entry["external_id"])
|
||||||
result.append(entry)
|
result.append(entry)
|
||||||
|
|
||||||
if not result:
|
if not result:
|
||||||
logger.warning("VDH-Scraper: Nach Filterung 0 zukünftige Events — verwende Fallback-Events.")
|
logger.warning("VDH-Scraper: Nach Filterung 0 Events — verwende Fallback.")
|
||||||
return list(FALLBACK_EVENTS)
|
return list(FALLBACK_EVENTS)
|
||||||
|
|
||||||
logger.info(f"VDH-Scraper: {len(result)} zukünftige Events nach Normalisierung.")
|
logger.info(f"VDH-Scraper: {len(result)} zukünftige Events nach Normalisierung.")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue