Fix: VDH-Scraper Regex statt HTMLParser (void-element Depth-Bug), parse_spezial + parse_sport
This commit is contained in:
parent
4e5a13d9e2
commit
da036d2b93
1 changed files with 125 additions and 228 deletions
|
|
@ -7,18 +7,17 @@ Bei Fehler oder 0 Ergebnissen: Fallback auf hartcodierte Events.
|
|||
import logging
|
||||
import re
|
||||
from datetime import datetime
|
||||
from html.parser import HTMLParser
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Hard-coded events used when scraping fails or yields no results.
# Every entry appears exactly once; external_id values are unique.
FALLBACK_EVENTS = [
    {
        "titel": "VDH-Europasiegershow 2026",
        "datum": "2026-06-14",
        "ort_name": "Dortmund",
        "typ": "ausstellung",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-europasieger-2026",
    },
    {
        "titel": "Internationale Hundeausstellung Frankfurt",
        "datum": "2026-05-03",
        "ort_name": "Frankfurt am Main",
        "typ": "ausstellung",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-frankfurt-2026",
    },
    {
        "titel": "VDH-Bundessiegerprüfung Agility",
        "datum": "2026-07-19",
        "ort_name": "Leipzig",
        "typ": "wettkampf",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-agility-2026",
    },
    {
        "titel": "Rassehundetreffen München",
        "datum": "2026-08-22",
        "ort_name": "München",
        "typ": "treffen",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-muenchen-2026",
    },
    {
        "titel": "Hundesport-Turnier Berlin",
        "datum": "2026-09-12",
        "ort_name": "Berlin",
        "typ": "wettkampf",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-berlin-2026",
    },
]
|
||||
|
||||
_TYP_MAP = {
|
||||
|
|
@ -28,13 +27,6 @@ _TYP_MAP = {
|
|||
"training": "training", "treffen": "treffen", "markt": "markt",
|
||||
}
|
||||
|
||||
_MONATE = {
|
||||
"januar": 1, "februar": 2, "märz": 3, "maerz": 3,
|
||||
"april": 4, "mai": 5, "juni": 6, "juli": 7,
|
||||
"august": 8, "september": 9, "oktober": 10,
|
||||
"november": 11, "dezember": 12,
|
||||
}
|
||||
|
||||
|
||||
def _guess_typ(text: str) -> str:
|
||||
t = text.lower()
|
||||
|
|
@ -45,215 +37,21 @@ def _guess_typ(text: str) -> str:
|
|||
|
||||
|
||||
def _parse_date(raw: str) -> str | None:
|
||||
raw = raw.strip()
|
||||
# Datumsbereich: "DD.MM.YYYY - DD.MM.YYYY" → erstes Datum nehmen
|
||||
raw = raw.split(" - ")[0].strip()
|
||||
# ISO: 2026-05-03
|
||||
m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw)
|
||||
if m:
|
||||
return raw
|
||||
# DD.MM.YYYY oder D.M.YYYY
|
||||
"""DD.MM.YYYY oder DD.MM.YYYY - DD.MM.YYYY → YYYY-MM-DD (erstes Datum)."""
|
||||
raw = raw.strip().split(" - ")[0].strip()
|
||||
m = re.match(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$', raw)
|
||||
if m:
|
||||
d, mo, y = m.groups()
|
||||
return f"{y}-{int(mo):02d}-{int(d):02d}"
|
||||
# DD. Monatsname YYYY
|
||||
m = re.match(r'^(\d{1,2})\.\s*(\w+)\s+(\d{4})$', raw)
|
||||
m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw)
|
||||
if m:
|
||||
d, mon_str, y = m.groups()
|
||||
mon_num = _MONATE.get(mon_str.lower())
|
||||
if mon_num:
|
||||
return f"{y}-{mon_num:02d}-{int(d):02d}"
|
||||
return raw
|
||||
return None
|
||||
|
||||
|
||||
# ── PARSER 1: /ausstellungen/liste/typ/spezial/ ──────────────────────────────
|
||||
# Struktur: div.ausstellung_liste > div.row > div.span6
|
||||
# Linke span6: <b>Rassen</b><br>DD.MM.YYYY<br> Verein<br> Straße<br> PLZ Ort<br>
|
||||
|
||||
class _SpezialParser(HTMLParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._events = []
|
||||
self._in_liste = False
|
||||
self._row_d = 0 # depth beim row-Start
|
||||
self._span_d = 0 # depth beim span6-Start
|
||||
self._depth = 0
|
||||
self._in_row = False
|
||||
self._in_span = False # linke span6 (erste im row)
|
||||
self._span_done = False # linke span6 fertig geparst
|
||||
self._in_b = False
|
||||
self._buf = ""
|
||||
self._parts: list[str] = [] # Teile zwischen <br>
|
||||
self._title = ""
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
self._depth += 1
|
||||
a = dict(attrs)
|
||||
cls = a.get("class", "")
|
||||
|
||||
if "ausstellung_liste" in cls:
|
||||
self._in_liste = True
|
||||
|
||||
if self._in_liste and tag == "div" and "row" in cls.split():
|
||||
self._in_row = True
|
||||
self._row_d = self._depth
|
||||
self._span_done = False
|
||||
self._title = ""
|
||||
self._parts = []
|
||||
|
||||
if self._in_row and not self._span_done and tag == "div" and "span6" in cls.split():
|
||||
if not self._in_span:
|
||||
self._in_span = True
|
||||
self._span_d = self._depth
|
||||
self._buf = ""
|
||||
|
||||
if self._in_span and tag == "b":
|
||||
self._in_b = True
|
||||
self._buf = ""
|
||||
|
||||
# <br> ist void — kein handle_endtag, muss hier behandelt werden
|
||||
if self._in_span and tag == "br":
|
||||
part = self._buf.strip()
|
||||
if part and not self._in_b:
|
||||
self._parts.append(part)
|
||||
self._buf = ""
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if self._in_span:
|
||||
if tag == "b" and self._in_b:
|
||||
self._in_b = False
|
||||
self._title = self._buf.strip()
|
||||
self._buf = ""
|
||||
|
||||
elif self._depth <= self._span_d:
|
||||
# Ende der linken span6 — auswerten
|
||||
self._in_span = False
|
||||
self._span_done = True
|
||||
date_str, ort = "", ""
|
||||
for part in self._parts:
|
||||
if not date_str:
|
||||
parsed = _parse_date(part)
|
||||
if parsed:
|
||||
date_str = parsed
|
||||
continue
|
||||
# PLZ Ort: 5 Ziffern gefolgt von Stadtname
|
||||
m = re.match(r'^\d{5}\s+(.+)$', part)
|
||||
if m:
|
||||
ort = m.group(1).strip()
|
||||
|
||||
if self._title and date_str:
|
||||
self._events.append({
|
||||
"titel": self._title,
|
||||
"datum": date_str,
|
||||
"ort_name": ort,
|
||||
"link": "https://www.vdh.de/ausstellungen/",
|
||||
})
|
||||
|
||||
if self._in_row and self._depth < self._row_d:
|
||||
self._in_row = False
|
||||
|
||||
self._depth -= 1
|
||||
|
||||
def handle_data(self, data):
|
||||
if self._in_b:
|
||||
self._buf += data
|
||||
elif self._in_span:
|
||||
self._buf += data
|
||||
|
||||
def get_events(self) -> list[dict]:
|
||||
return self._events
|
||||
|
||||
|
||||
# ── PARSER 2: /hundesport/termine/ ───────────────────────────────────────────
|
||||
# Struktur: h2.h2ash1 (Kategorie) → ul > li
|
||||
# li-Text: "DD.MM.YYYY<br>Titel<br><b>Ort:</b> Stadt<br>"
|
||||
|
||||
class _SportParser(HTMLParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._events = []
|
||||
self._cat = "" # aktuelle Kategorie (h2.h2ash1)
|
||||
self._in_h2 = False
|
||||
self._in_li = False
|
||||
self._in_b = False
|
||||
self._buf = ""
|
||||
self._parts: list[str] = []
|
||||
self._depth = 0
|
||||
self._li_d = 0
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
self._depth += 1
|
||||
a = dict(attrs)
|
||||
cls = a.get("class", "")
|
||||
|
||||
if tag == "h2" and "h2ash1" in cls:
|
||||
self._in_h2 = True
|
||||
self._buf = ""
|
||||
|
||||
if tag == "li" and not self._in_li:
|
||||
self._in_li = True
|
||||
self._li_d = self._depth
|
||||
self._parts = []
|
||||
self._buf = ""
|
||||
|
||||
if self._in_li and tag == "b":
|
||||
self._in_b = True
|
||||
|
||||
# <br> ist void — kein handle_endtag, muss hier behandelt werden
|
||||
if self._in_li and tag == "br":
|
||||
part = self._buf.strip()
|
||||
if part:
|
||||
self._parts.append(part)
|
||||
self._buf = ""
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag == "h2" and self._in_h2:
|
||||
self._in_h2 = False
|
||||
self._cat = self._buf.strip()
|
||||
self._buf = ""
|
||||
|
||||
if self._in_li:
|
||||
if tag == "b":
|
||||
self._in_b = False
|
||||
|
||||
elif tag == "li" and self._depth <= self._li_d:
|
||||
self._in_li = False
|
||||
# parts: [date, title, "Ort: Stadt"] oder ähnlich
|
||||
date_str, title, ort = "", "", ""
|
||||
for i, part in enumerate(self._parts):
|
||||
if not date_str:
|
||||
parsed = _parse_date(part.split(" - ")[0].strip())
|
||||
if parsed:
|
||||
date_str = parsed
|
||||
continue
|
||||
if not title and date_str:
|
||||
# Titel darf nicht mit "Ort:" beginnen
|
||||
if not part.lower().startswith("ort:"):
|
||||
title = part
|
||||
continue
|
||||
m = re.match(r'^Ort:\s*(.+)$', part, re.IGNORECASE)
|
||||
if m:
|
||||
ort = m.group(1).strip()
|
||||
|
||||
if title and date_str:
|
||||
self._events.append({
|
||||
"titel": title,
|
||||
"datum": date_str,
|
||||
"ort_name": ort,
|
||||
"link": "https://www.vdh.de/hundesport/termine/",
|
||||
})
|
||||
|
||||
self._depth -= 1
|
||||
|
||||
def handle_data(self, data):
|
||||
if self._in_h2:
|
||||
self._buf += data
|
||||
elif self._in_li:
|
||||
self._buf += data
|
||||
|
||||
def get_events(self) -> list[dict]:
|
||||
return self._events
|
||||
def _strip_tags(html: str) -> str:
|
||||
"""Entfernt HTML-Tags."""
|
||||
return re.sub(r'<[^>]+>', '', html).strip()
|
||||
|
||||
|
||||
def _build_external_id(ev: dict) -> str:
|
||||
|
|
@ -262,23 +60,125 @@ def _build_external_id(ev: dict) -> str:
|
|||
return key[:120]
|
||||
|
||||
|
||||
# ── PARSER 1: /ausstellungen/liste/typ/spezial/ ──────────────────────────────
|
||||
# Struktur innerhalb div.ausstellung_liste:
|
||||
# <div class="row">
|
||||
# <div class="span6">
|
||||
# <b>Rassen</b><br>DD.MM.YYYY<br> Verein<br> Straße<br> PLZ Ort<br>
|
||||
# </div>
|
||||
# <div class="span6">…Kontakt…</div>
|
||||
# </div>
|
||||
|
||||
def _parse_spezial(html: str) -> list[dict]:
    """Extract events from the ``/ausstellungen/liste/typ/spezial/`` page.

    Only the first ``div.span6`` of each ``div.row`` is read. Its expected
    content is
    ``<b>Rassen</b><br>DD.MM.YYYY<br> Verein<br> Straße<br> PLZ Ort<br>``.

    Args:
        html: Full HTML document of the listing page.

    Returns:
        One dict per event with the keys ``titel``, ``datum`` (ISO date),
        ``ort_name`` (empty string if no "PLZ Ort" line is found) and
        ``link``. Rows without a title or a parsable date are skipped.
    """
    events: list[dict] = []

    # Restrict the search to the listing container, which ends where the
    # first full-width (span12) row starts; fall back to the whole document
    # when the container is not found.
    container_m = re.search(
        r'<div class="ausstellung_liste">(.*?)(?=<div class="row">\s*<div class="span12">)',
        html, re.DOTALL)
    block = container_m.group(1) if container_m else html

    # Left cell of every row: <div class="row"> <div class="span6">CONTENT</div>
    row_pattern = re.compile(
        r'<div class="row">\s*<div class="span6">(.*?)</div>',
        re.DOTALL
    )
    # Five-digit postal code followed by the place name on the same line.
    # The look-behind keeps the tail of a longer number (e.g. a phone number)
    # from being taken for a postal code; the separator excludes line breaks
    # so that the match cannot run into the following line.
    ort_pattern = re.compile(r'(?<!\d)(\d{5})[^\S\r\n]+([^<\n\r]+)')

    for row_m in row_pattern.finditer(block):
        cell = row_m.group(1)

        # Title: content of the first <b>…</b>, with whitespace collapsed so
        # that a title spread over several source lines becomes one line.
        title_m = re.search(r'<b>(.*?)</b>', cell, re.DOTALL)
        if not title_m:
            continue
        title = " ".join(_strip_tags(title_m.group(1)).split())
        if not title:
            continue

        # Date: first DD.MM.YYYY after the <b> block.
        after_b = cell[title_m.end():]
        date_m = re.search(r'(\d{1,2}\.\d{1,2}\.\d{4})', after_b)
        if not date_m:
            continue
        date_str = _parse_date(date_m.group(1))
        if not date_str:
            continue

        ort = ""
        ort_m = ort_pattern.search(after_b)
        if ort_m:
            ort = ort_m.group(2).strip()

        events.append({
            "titel": title,
            "datum": date_str,
            "ort_name": ort,
            "link": "https://www.vdh.de/ausstellungen/",
        })

    return events
|
||||
|
||||
|
||||
# ── PARSER 2: /hundesport/termine/ ───────────────────────────────────────────
|
||||
# Struktur: <h2 class="h2ash1">Kategorie</h2> dann <ul><li>
|
||||
# <li>DD.MM.YYYY - DD.MM.YYYY<br>Titel<br><b>Ort:</b> Stadt<br></li>
|
||||
|
||||
def _parse_sport(html: str) -> list[dict]:
    """Extract events from the ``/hundesport/termine/`` page.

    Every ``<li>`` is expected to look like
    ``DD.MM.YYYY - DD.MM.YYYY<br>Titel<br><b>Ort:</b> Stadt<br>``; the end
    date of the range is optional.

    Args:
        html: Full HTML document of the page.

    Returns:
        One dict per event with the keys ``titel``, ``datum`` (ISO date of
        the first day), ``ort_name`` (empty string if no "Ort:" line is
        found) and ``link``. Items without a parsable date or without a
        title are skipped.
    """
    events: list[dict] = []

    li_pattern = re.compile(r'<li>(.*?)</li>', re.DOTALL)
    # First date, optionally followed by "<dash> end date". Only group 1 is
    # parsed, so a range written without spaces around the dash (or with an
    # en/em dash) does not make the whole item unparsable.
    date_pattern = re.compile(
        r'(\d{1,2}\.\d{1,2}\.\d{4})(?:\s*[-–—]\s*\d{1,2}\.\d{1,2}\.\d{4})?'
    )
    br_pattern = re.compile(r'<br\s*/?>', re.IGNORECASE)
    ort_pattern = re.compile(r'Ort:\s*(.*)', re.IGNORECASE | re.DOTALL)

    for li_m in li_pattern.finditer(html):
        cell = li_m.group(1)

        date_m = date_pattern.search(cell)
        if not date_m:
            continue
        date_str = _parse_date(date_m.group(1))
        if not date_str:
            continue

        title = ""
        ort = ""
        # Split on <br> only and strip tags per fragment. Splitting on
        # <b>…</b> as well would discard the "Ort:" label, and the place
        # could then no longer be recognised.
        for fragment in br_pattern.split(cell[date_m.end():]):
            part = _strip_tags(fragment)
            if not part:
                continue
            ort_m = ort_pattern.match(part)
            if ort_m:
                ort = ort_m.group(1).strip()
            elif not title and not re.match(r'\d', part):
                # First fragment that is neither the place nor starts with
                # a digit is taken as the title.
                title = part

        if not title:
            continue

        events.append({
            "titel": title,
            "datum": date_str,
            "ort_name": ort,
            "link": "https://www.vdh.de/hundesport/termine/",
        })

    return events
|
||||
|
||||
|
||||
async def fetch_vdh_events() -> list[dict]:
|
||||
"""
|
||||
Scrapt VDH-Veranstaltungen von ausstellungen/liste und hundesport/termine.
|
||||
Gibt eine Liste von Dicts zurück: {titel, datum, ort_name, typ, link, external_id}
|
||||
Bei Fehler oder 0 Ergebnissen: Fallback auf FALLBACK_EVENTS.
|
||||
"""
|
||||
sources = [
|
||||
("https://www.vdh.de/ausstellungen/liste/typ/spezial/", _SpezialParser),
|
||||
("https://www.vdh.de/hundesport/termine/", _SportParser),
|
||||
("https://www.vdh.de/ausstellungen/liste/typ/spezial/", _parse_spezial),
|
||||
("https://www.vdh.de/hundesport/termine/", _parse_sport),
|
||||
]
|
||||
|
||||
headers = {
|
||||
"User-Agent": (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/124.0.0.0 Safari/537.36"
|
||||
),
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124.0.0.0 Safari/537.36",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "de-DE,de;q=0.9,en;q=0.5",
|
||||
}
|
||||
|
|
@ -286,13 +186,11 @@ async def fetch_vdh_events() -> list[dict]:
|
|||
raw_events: list[dict] = []
|
||||
|
||||
async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
|
||||
for url, ParserClass in sources:
|
||||
for url, parse_fn in sources:
|
||||
try:
|
||||
resp = await client.get(url, headers=headers)
|
||||
resp.raise_for_status()
|
||||
parser = ParserClass()
|
||||
parser.feed(resp.text)
|
||||
found = parser.get_events()
|
||||
found = parse_fn(resp.text)
|
||||
if found:
|
||||
logger.info(f"VDH-Scraper: {len(found)} Events von {url}")
|
||||
raw_events.extend(found)
|
||||
|
|
@ -318,14 +216,13 @@ async def fetch_vdh_events() -> list[dict]:
|
|||
titel = ev.get("titel", "").strip()
|
||||
if not titel or len(titel) < 3:
|
||||
continue
|
||||
link = ev.get("link", "https://www.vdh.de")
|
||||
|
||||
entry = {
|
||||
"titel": titel,
|
||||
"datum": datum,
|
||||
"ort_name": ev.get("ort_name") or None,
|
||||
"typ": _guess_typ(titel),
|
||||
"link": link,
|
||||
"link": ev.get("link", "https://www.vdh.de"),
|
||||
"external_id": _build_external_id(ev),
|
||||
}
|
||||
if entry["external_id"] not in seen_ids:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue