Sprint 11: Freunde & Chat + Phosphor-Icon-Vollmigration

- Freundschaften (pending/accepted), Nutzersuche, Anfragen per Push
- Direktnachrichten mit Polling, iMessage-Stil, Deep-Links aus Push
- Alle Seiten (map, places, diary, health, dog-profile, sitting, knigge,
  forum, wiki, walks) vollständig auf Phosphor-Icons migriert
- Wikidata-Rassen-Scraper (~833 neue Rassen, lokal gespiegelte Fotos)
- TheDogAPI lokal gespiegelt (169 Rassen + Fotos)
- Quiz-Result-Cards horizontal (korrekte Bildproportionen)
- SW by-v89
This commit is contained in:
rene 2026-04-15 21:33:53 +02:00
parent 96bd57f0ad
commit 097295c628
44 changed files with 9980 additions and 300 deletions

View file

138
backend/scraper/breeds.py Normal file
View file

@ -0,0 +1,138 @@
"""Fetches breed data from TheDogAPI and seeds the wiki_rassen table."""
import httpx, re, logging, os
from database import db
MEDIA_DIR = os.getenv("MEDIA_DIR", "/data/media")
BREEDS_DIR = os.path.join(MEDIA_DIR, "breeds")
logger = logging.getLogger(__name__)
def _slug(name: str) -> str:
return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
def _derive_groesse(weight_max_kg: float) -> str:
if weight_max_kg <= 10: return 'klein'
if weight_max_kg <= 25: return 'mittel'
if weight_max_kg <= 40: return 'gross'
return 'sehr_gross'
def _derive_aktivitaet(bred_for: str, temperament: str, group: str) -> str:
text = f"{bred_for or ''} {temperament or ''} {group or ''}".lower()
high_keywords = ['herding', 'hunting', 'sporting', 'working', 'energetic', 'active', 'agile']
low_keywords = ['companion', 'toy', 'lap', 'gentle', 'calm', 'quiet']
if any(k in text for k in high_keywords): return 'hoch'
if any(k in text for k in low_keywords): return 'niedrig'
return 'mittel'
def _derive_erfahrung(temperament: str, group: str) -> str:
text = f"{temperament or ''} {group or ''}".lower()
expert = ['stubborn', 'independent', 'dominant', 'terrier', 'herding']
advanced = ['protective', 'reserved', 'working', 'guard']
if any(k in text for k in expert): return 'fortgeschritten'
if any(k in text for k in advanced): return 'fortgeschritten'
return 'anfaenger'
def _derive_kinder(temperament: str) -> int:
if not temperament: return 1
bad = ['aggressive', 'aloof', 'reserved with strangers']
return 0 if any(k in temperament.lower() for k in bad) else 1
def _parse_weight_kg(weight_metric: str):
"""Parse '7 - 14' or '14' -> (min, max) in kg"""
try:
parts = [p.strip() for p in weight_metric.replace(',', '.').split('-')]
nums = [float(p) for p in parts if p]
if len(nums) >= 2: return nums[0], nums[1]
if len(nums) == 1: return nums[0], nums[0]
except Exception: pass
return None, None
def _write_photo_atomically(path: str, payload: bytes) -> None:
    """Write *payload* to *path* through a temporary file and a rename.

    A crash or full disk in the middle of a plain write would leave a
    truncated file behind that later runs mistake for a finished download.
    """
    tmp_path = f"{path}.part"
    try:
        with open(tmp_path, "wb") as fh:
            fh.write(payload)
        os.replace(tmp_path, path)
    except OSError:
        # Best-effort cleanup of the partial temp file, then re-raise.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise


async def mirror_breed_photos():
    """Download remote breed photos to local storage and update foto_url in DB.

    Selects every ``wiki_rassen`` row whose ``foto_url`` is still an http(s)
    URL, stores the image as ``<BREEDS_DIR>/<external_id>.jpg`` and rewrites
    ``foto_url`` to ``/media/breeds/<external_id>.jpg``. Files that already
    exist locally are not downloaded again.

    Returns:
        Number of rows whose ``foto_url`` now points at a local file.
    """
    os.makedirs(BREEDS_DIR, exist_ok=True)
    with db() as conn:
        rows = conn.execute(
            "SELECT id, external_id, foto_url FROM wiki_rassen WHERE foto_url LIKE 'http%' AND foto_url NOT LIKE '/media/%'"
        ).fetchall()
    if not rows:
        logger.info("Breed photos: nothing to mirror")
        return 0

    mirrored = 0
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        for row_id, ext_id, cdn_url in rows:
            local_path = os.path.join(BREEDS_DIR, f"{ext_id}.jpg")
            local_url = f"/media/breeds/{ext_id}.jpg"

            # Skip the download if the file is already on disk.
            if os.path.exists(local_path):
                with db() as conn:
                    conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?", (local_url, row_id))
                mirrored += 1
                continue

            # Per-photo failures are logged and skipped so that one broken
            # URL does not abort the whole mirroring run.
            try:
                r = await client.get(cdn_url)
                if r.status_code != 200:
                    logger.warning(f"Breed photo {ext_id}: HTTP {r.status_code}")
                    continue
                content_type = r.headers.get("content-type", "")
                if not content_type.startswith("image") or not r.content:
                    # Do not store HTML error pages or empty bodies as .jpg.
                    logger.warning(
                        f"Breed photo {ext_id}: unexpected response "
                        f"(content-type {content_type or 'missing'}, {len(r.content)} bytes)"
                    )
                    continue
                _write_photo_atomically(local_path, r.content)
                with db() as conn:
                    conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?", (local_url, row_id))
                mirrored += 1
            except Exception as e:
                logger.warning(f"Breed photo {ext_id} download failed: {e}")

    logger.info(f"Breed photos mirrored: {mirrored}/{len(rows)}")
    return mirrored
async def fetch_and_seed_breeds():
    """Fetch all breeds from TheDogAPI and upsert them into wiki_rassen.

    The API key is read from the ``THEDOGAPI_KEY`` environment variable; the
    header is only sent when a key is configured. Size, activity level,
    experience level and the child/flat suitability flags are derived from
    the API fields with the module's ``_derive_*`` helpers.

    On conflict (same ``external_id``) only ``temperament`` and ``foto_url``
    are refreshed. A ``foto_url`` that already points at a local ``/media/``
    file is kept, and a missing API image never erases an existing URL;
    otherwise every re-seed would undo the photo mirroring.

    Returns:
        Number of breeds inserted or updated; 0 if the API call failed or
        returned something other than a list.
    """
    api_key = os.getenv("THEDOGAPI_KEY", "")
    headers = {'x-api-key': api_key} if api_key else {}
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            r = await client.get('https://api.thedogapi.com/v1/breeds', headers=headers)
            r.raise_for_status()
            breeds = r.json()
    except Exception as e:
        logger.error(f"TheDogAPI fetch failed: {e}")
        return 0

    if not isinstance(breeds, list):
        # e.g. an error object; iterating it would yield strings, not breeds.
        logger.error(f"TheDogAPI fetch failed: unexpected payload type {type(breeds).__name__}")
        return 0

    seeded = 0
    with db() as conn:
        for b in breeds:
            if not isinstance(b, dict):
                logger.warning(f"Breed entry skipped, not an object: {b!r}")
                continue
            try:
                # 'weight' / 'image' may be present but null, hence `or {}`.
                weight = b.get('weight') or {}
                image = b.get('image') or {}
                temperament = b.get('temperament')
                group = b.get('breed_group')
                bred_for = b.get('bred_for')

                w_min, w_max = _parse_weight_kg(weight.get('metric') or '')
                # Unknown weight falls back to 20 kg, i.e. the 'mittel' bucket.
                groesse = _derive_groesse(w_max or 20)
                aktivitaet = _derive_aktivitaet(bred_for, temperament, group)
                erfahrung = _derive_erfahrung(temperament, group)
                kinder = _derive_kinder(temperament)
                wohnung = 1 if groesse == 'klein' and aktivitaet in ('niedrig', 'mittel') else 0
                foto_url = image.get('url') or None
                slug = _slug(b['name'])
                conn.execute("""
                    INSERT INTO wiki_rassen
                    (external_id, name, gruppe, herkunft, temperament,
                     gewicht_min_kg, gewicht_max_kg, groesse, lebensdauer,
                     foto_url, bred_for, aktivitaet, wohnung_geeignet,
                     kinder_geeignet, erfahrung, slug)
                    VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
                    ON CONFLICT(external_id) DO UPDATE SET
                        foto_url = CASE
                            WHEN wiki_rassen.foto_url LIKE '/media/%'
                                THEN wiki_rassen.foto_url
                            ELSE COALESCE(excluded.foto_url, wiki_rassen.foto_url)
                        END,
                        temperament = excluded.temperament
                """, (
                    b['id'], b['name'],
                    group, b.get('origin'), temperament,
                    w_min, w_max, groesse, b.get('life_span'),
                    foto_url, bred_for, aktivitaet, wohnung, kinder, erfahrung, slug
                ))
                seeded += 1
            except Exception as e:
                # One malformed breed must not abort the whole seeding run.
                logger.warning(f"Breed {b.get('name')} seed failed: {e}")

    logger.info(f"Breeds seeded: {seeded}")
    return seeded

View file

@ -0,0 +1,317 @@
"""
BAN YARO VDH Veranstaltungs-Scraper
Scrapt Hundeveranstaltungen von vdh.de.
Bei Fehler oder 0 Ergebnissen: Fallback auf hartcodierte Events.
"""
import logging
import re
from datetime import datetime
from html.parser import HTMLParser
import httpx
logger = logging.getLogger(__name__)
# Hard-coded events returned by fetch_vdh_events() when scraping yields nothing.
FALLBACK_EVENTS = [
    {
        "titel": "VDH-Europasiegershow 2026",
        "datum": "2026-06-14",
        "ort_name": "Dortmund",
        "typ": "ausstellung",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-europasieger-2026",
    },
    {
        "titel": "Internationale Hundeausstellung Frankfurt",
        "datum": "2026-05-03",
        "ort_name": "Frankfurt am Main",
        "typ": "ausstellung",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-frankfurt-2026",
    },
    {
        "titel": "VDH-Bundessiegerprüfung Agility",
        "datum": "2026-07-19",
        "ort_name": "Leipzig",
        "typ": "wettkampf",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-agility-2026",
    },
    {
        "titel": "Rassehundetreffen München",
        "datum": "2026-08-22",
        "ort_name": "München",
        "typ": "treffen",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-muenchen-2026",
    },
    {
        "titel": "Hundesport-Turnier Berlin",
        "datum": "2026-09-12",
        "ort_name": "Berlin",
        "typ": "wettkampf",
        "link": "https://www.vdh.de",
        "external_id": "vdh-fallback-berlin-2026",
    },
]
# Mapping VDH-Kategorienamen → interne Typen
_TYP_MAP = {
"ausstellung": "ausstellung",
"show": "ausstellung",
"siegershow": "ausstellung",
"agility": "wettkampf",
"wettkampf": "wettkampf",
"turnier": "wettkampf",
"prüfung": "wettkampf",
"training": "training",
"treffen": "treffen",
"markt": "markt",
}
# Monatsnamen Deutsch → Zahl
_MONATE = {
"januar": 1, "februar": 2, "märz": 3, "maerz": 3,
"april": 4, "mai": 5, "juni": 6, "juli": 7,
"august": 8, "september": 9, "oktober": 10,
"november": 11, "dezember": 12,
}
def _guess_typ(text: str) -> str:
    """Derive the internal event type from an event title.

    The title is lowercased and scanned for the keywords of ``_TYP_MAP`` in
    that mapping's order; the type of the first keyword found wins. Titles
    without any keyword are classified as ``"sonstiges"``.
    """
    lowered = text.lower()
    matches = (typ for keyword, typ in _TYP_MAP.items() if keyword in lowered)
    return next(matches, "sonstiges")
def _parse_date(raw: str) -> str | None:
"""
Versucht verschiedene Datumsformate zu parsen.
Gibt YYYY-MM-DD zurück oder None.
"""
raw = raw.strip()
# ISO: 2026-05-03
m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw)
if m:
return raw
# DD.MM.YYYY oder D.M.YYYY
m = re.match(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$', raw)
if m:
d, mo, y = m.groups()
return f"{y}-{int(mo):02d}-{int(d):02d}"
# DD. Monatsname YYYY (z.B. "14. Juni 2026")
m = re.match(r'^(\d{1,2})\.\s*(\w+)\s+(\d{4})$', raw)
if m:
d, mon_str, y = m.groups()
mon_num = _MONATE.get(mon_str.lower())
if mon_num:
return f"{y}-{mon_num:02d}-{int(d):02d}"
# Monatsname DD, YYYY (englisch, Fallback)
try:
dt = datetime.strptime(raw, "%B %d, %Y")
return dt.strftime("%Y-%m-%d")
except ValueError:
pass
return None
class _VDHParser(HTMLParser):
"""
Einfacher Zustandsautomat-Parser für die VDH-Veranstaltungsseite.
Sucht nach typischen Strukturen: article, li.event, div mit Datums-/Titel-Klassen.
"""
def __init__(self):
super().__init__()
self._events: list[dict] = []
self._current: dict | None = None
self._depth = 0
self._start_depth = 0
self._capture = None # 'titel' | 'datum' | 'ort'
self._buf = ""
self._in_event = False
# ---------- Hilfsmethoden ----------
def _is_event_container(self, tag, attrs):
"""Erkennt Start eines Event-Blocks."""
a = dict(attrs)
cls = a.get("class", "")
return (
tag == "article"
or (tag in ("li", "div") and any(
kw in cls for kw in ("event", "veranstaltung", "termin", "entry", "item")
))
)
def _is_title_tag(self, tag, attrs):
a = dict(attrs)
cls = a.get("class", "")
return tag in ("h2", "h3", "h4") or any(
kw in cls for kw in ("title", "titel", "name", "heading")
)
def _is_date_tag(self, tag, attrs):
a = dict(attrs)
cls = a.get("class", "")
it = a.get("itemprop", "")
return (
tag in ("time",)
or any(kw in cls for kw in ("date", "datum", "time"))
or it in ("startDate", "endDate")
)
def _is_location_tag(self, tag, attrs):
a = dict(attrs)
cls = a.get("class", "")
it = a.get("itemprop", "")
return (
any(kw in cls for kw in ("location", "ort", "venue", "place", "city"))
or it in ("location", "addressLocality")
)
# ---------- SAX-Events ----------
def handle_starttag(self, tag, attrs):
self._depth += 1
a = dict(attrs)
if not self._in_event and self._is_event_container(tag, attrs):
self._in_event = True
self._start_depth = self._depth
self._current = {"titel": "", "datum": "", "ort_name": "", "link": ""}
# Direkter Link auf dem Container?
if tag == "a" and "href" in a:
self._current["link"] = a["href"]
return
if self._in_event:
# Link innerhalb des Event-Blocks
if tag == "a" and "href" in a and not self._current.get("link"):
href = a["href"]
if "vdh.de" in href or href.startswith("/"):
self._current["link"] = href
# <time datetime="…">
if tag == "time":
dt = a.get("datetime", "")
if dt:
parsed = _parse_date(dt)
if parsed:
self._current["datum"] = parsed
if self._is_title_tag(tag, attrs):
self._capture = "titel"
self._buf = ""
elif self._is_date_tag(tag, attrs) and not self._current.get("datum"):
self._capture = "datum"
self._buf = ""
elif self._is_location_tag(tag, attrs):
self._capture = "ort"
self._buf = ""
def handle_endtag(self, tag):
if self._capture:
val = self._buf.strip()
if self._capture == "titel" and val:
self._current["titel"] = val
elif self._capture == "datum" and val and not self._current.get("datum"):
parsed = _parse_date(val)
if parsed:
self._current["datum"] = parsed
elif self._capture == "ort" and val:
self._current["ort_name"] = val
self._capture = None
self._buf = ""
self._depth -= 1
if self._in_event and self._depth < self._start_depth:
self._in_event = False
ev = self._current
# Nur speichern wenn wir Titel + Datum haben
if ev and ev.get("titel") and ev.get("datum"):
self._events.append(ev)
self._current = None
def handle_data(self, data):
if self._capture:
self._buf += data
def get_events(self) -> list[dict]:
return self._events
def _build_external_id(ev: dict) -> str:
"""Erzeugt einen stabilen Dedup-Key aus Datum + Titel."""
raw = f"vdh-{ev['datum']}-{ev['titel']}"
# Einfache Normalisierung: lowercase, Sonderzeichen raus
key = re.sub(r'[^a-z0-9]+', '-', raw.lower()).strip('-')
return key[:120]
async def fetch_vdh_events() -> list[dict]:
    """Scrape VDH events and return them as a list of dicts.

    Each dict has the keys ``titel``, ``datum`` (``YYYY-MM-DD``),
    ``ort_name`` (or ``None``), ``typ``, ``link`` and ``external_id``.

    The candidate URLs are tried in order and the first page that yields
    events is used. Events dated before today, with a title shorter than
    three characters, or with a duplicate ``external_id`` are dropped.
    Links are resolved against ``https://www.vdh.de`` so that relative and
    protocol-relative hrefs become valid absolute URLs.

    On any error, or when no future event remains, a copy of
    ``FALLBACK_EVENTS`` is returned instead.
    """
    from urllib.parse import urljoin  # local import keeps the block self-contained

    base_url = "https://www.vdh.de"
    urls = [
        "https://www.vdh.de/veranstaltungen/ausstellungen/",
        "https://www.vdh.de/veranstaltungen/",
    ]
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-DE,de;q=0.9,en;q=0.5",
    }

    raw_events: list[dict] = []
    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        for url in urls:
            # A failure on one URL is logged and the next URL is tried.
            try:
                resp = await client.get(url, headers=headers)
                resp.raise_for_status()
                parser = _VDHParser()
                parser.feed(resp.text)
                found = parser.get_events()
                if found:
                    logger.info(f"VDH-Scraper: {len(found)} Events von {url} geparst.")
                    raw_events = found
                    break
                logger.info(f"VDH-Scraper: Keine Events auf {url} gefunden, nächste URL versuchen.")
            except httpx.HTTPStatusError as e:
                logger.warning(f"VDH-Scraper HTTP-Fehler {e.response.status_code} für {url}: {e}")
            except httpx.RequestError as e:
                logger.warning(f"VDH-Scraper Netzwerkfehler für {url}: {e}")
            except Exception as e:
                logger.warning(f"VDH-Scraper unbekannter Fehler für {url}: {e}")

    if not raw_events:
        logger.warning("VDH-Scraper: Keine Daten erhalten — verwende Fallback-Events.")
        return list(FALLBACK_EVENTS)

    # Normalise
    today = datetime.today().strftime("%Y-%m-%d")
    result = []
    seen_ids: set[str] = set()
    for ev in raw_events:
        datum = ev.get("datum", "")
        # Future events only; ISO strings compare chronologically.
        if datum < today:
            continue

        # Collapse line breaks / indentation left over from the HTML markup.
        titel = " ".join(ev.get("titel", "").split())
        if len(titel) < 3:
            continue

        # urljoin leaves absolute URLs alone, resolves "/path" and "//host/path",
        # and returns the base URL itself for an empty link.
        link = urljoin(base_url, ev.get("link") or "")

        entry = {
            "titel": titel,
            "datum": datum,
            "ort_name": ev.get("ort_name") or None,
            "typ": _guess_typ(titel),
            "link": link,
            "external_id": _build_external_id(ev),
        }
        if entry["external_id"] not in seen_ids:
            seen_ids.add(entry["external_id"])
            result.append(entry)

    if not result:
        logger.warning("VDH-Scraper: Nach Filterung 0 zukünftige Events — verwende Fallback-Events.")
        return list(FALLBACK_EVENTS)

    logger.info(f"VDH-Scraper: {len(result)} zukünftige Events nach Normalisierung.")
    return result

View file

@ -0,0 +1,196 @@
"""Fetches missing dog breed data from Wikidata SPARQL and seeds wiki_rassen."""
import httpx, re, logging, os
from database import db
logger = logging.getLogger(__name__)
MEDIA_DIR = os.getenv("MEDIA_DIR", "/data/media")
BREEDS_DIR = os.path.join(MEDIA_DIR, "breeds")
SPARQL_URL = "https://query.wikidata.org/sparql"
# SPARQL query sent to SPARQL_URL by fetch_and_seed_wikidata_breeds().
# GROUP BY + SAMPLE so each breed appears once even if it has multiple images
# (or several labels, countries or descriptions).
# Only items with a German or an English label are returned; image, country
# and the descriptions are OPTIONAL and may be missing from a result row.
# NOTE(review): wd:Q39367 is presumably the "dog breed" class, P18 the image
# and P495 the country of origin - verify against Wikidata.
SPARQL_QUERY = """
SELECT ?breed
(SAMPLE(?nameDE) AS ?nameDE)
(SAMPLE(?nameEN) AS ?nameEN)
(SAMPLE(?image) AS ?image)
(SAMPLE(?countryDE) AS ?countryDE)
(SAMPLE(?descDE) AS ?descDE)
(SAMPLE(?descEN) AS ?descEN)
WHERE {
?breed wdt:P31 wd:Q39367 .
OPTIONAL { ?breed rdfs:label ?nameDE FILTER(LANG(?nameDE) = "de") }
OPTIONAL { ?breed rdfs:label ?nameEN FILTER(LANG(?nameEN) = "en") }
FILTER(BOUND(?nameDE) || BOUND(?nameEN))
OPTIONAL { ?breed wdt:P18 ?image }
OPTIONAL {
?breed wdt:P495 ?country .
?country rdfs:label ?countryDE FILTER(LANG(?countryDE) = "de")
}
OPTIONAL { ?breed schema:description ?descDE FILTER(LANG(?descDE) = "de") }
OPTIONAL { ?breed schema:description ?descEN FILTER(LANG(?descEN) = "en") }
}
GROUP BY ?breed
ORDER BY ?nameDE ?nameEN
"""
def _slug(name: str) -> str:
return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
def _normalise(name: str) -> str:
"""Lowercase + remove diacritics for name deduplication."""
import unicodedata
nfkd = unicodedata.normalize('NFKD', name.lower())
return re.sub(r'[^a-z0-9 ]', '', nfkd).strip()
async def fetch_and_seed_wikidata_breeds():
    """Query Wikidata for dog breeds and insert those missing from wiki_rassen.

    A breed counts as already present when its normalised name (see
    ``_normalise``) matches an existing row. New rows get the external id
    ``wd_<QID>``, the German label (English as fallback) as name, the
    description as ``temperament`` and fixed default values for the size,
    activity and suitability columns.

    Returns:
        Number of rows inserted (or updated via the ``external_id``
        conflict clause); 0 if the SPARQL request failed.
    """
    # -- fetch from SPARQL -------------------------------------------------
    try:
        async with httpx.AsyncClient(
            timeout=60,
            headers={"Accept": "application/sparql-results+json",
                     "User-Agent": "BanYaro/1.0 (https://banyaro.app; contact@banyaro.app)"}
        ) as client:
            r = await client.get(SPARQL_URL, params={"query": SPARQL_QUERY})
            r.raise_for_status()
            data = r.json()
    except Exception as e:
        logger.error(f"Wikidata SPARQL fetch failed: {e}")
        return 0

    bindings = data.get("results", {}).get("bindings", [])
    logger.info(f"Wikidata: {len(bindings)} breed entries received")

    # -- load existing names for deduplication -----------------------------
    with db() as conn:
        existing = conn.execute("SELECT name FROM wiki_rassen").fetchall()
    existing_norm = {_normalise(row[0]) for row in existing}

    seeded = 0
    with db() as conn:
        for b in bindings:
            name = (b.get("nameDE", {}).get("value") or
                    b.get("nameEN", {}).get("value") or "").strip()
            if not name:
                continue

            # Skip if already in DB (by normalised name). An empty key means
            # the name has no ASCII letters/digits at all; such names cannot
            # be deduplicated by name and rely on the external_id conflict
            # clause instead - otherwise the first one would block the rest.
            norm = _normalise(name)
            if norm and norm in existing_norm:
                continue

            # Entity URI ends in the QID, e.g. ".../entity/Q312440".
            breed_uri = b.get("breed", {}).get("value") or ""
            qid = breed_uri.rsplit("/", 1)[-1]
            if not qid:
                logger.warning(f"Wikidata breed '{name}' skipped: no entity id")
                continue
            ext_id = f"wd_{qid}"

            image_url = b.get("image", {}).get("value") or None
            herkunft = b.get("countryDE", {}).get("value") or None
            desc = (b.get("descDE", {}).get("value") or
                    b.get("descEN", {}).get("value") or None)

            # Names without ASCII characters slug to ''; use the QID instead.
            slug_base = _slug(name) or qid.lower()

            try:
                # Make the slug unique if another breed already uses it.
                slug = slug_base
                suffix = 1
                while conn.execute(
                    "SELECT 1 FROM wiki_rassen WHERE slug=? AND external_id != ?",
                    (slug, ext_id)
                ).fetchone():
                    slug = f"{slug_base}-{suffix}"
                    suffix += 1

                conn.execute("""
                    INSERT INTO wiki_rassen
                    (external_id, name, gruppe, herkunft, temperament,
                     gewicht_min_kg, gewicht_max_kg, groesse, lebensdauer,
                     foto_url, bred_for, aktivitaet, wohnung_geeignet,
                     kinder_geeignet, erfahrung, slug)
                    VALUES (?,?,?,?,?,NULL,NULL,'mittel',NULL,?,NULL,'mittel',0,1,'anfaenger',?)
                    ON CONFLICT(external_id) DO UPDATE SET
                        foto_url = CASE
                            WHEN excluded.foto_url IS NOT NULL AND wiki_rassen.foto_url IS NULL
                                THEN excluded.foto_url
                            ELSE wiki_rassen.foto_url
                        END,
                        herkunft = COALESCE(wiki_rassen.herkunft, excluded.herkunft),
                        temperament = COALESCE(wiki_rassen.temperament, excluded.temperament)
                """, (ext_id, name, None, herkunft, desc, image_url, slug))
                if norm:
                    existing_norm.add(norm)  # avoid re-inserting within same run
                seeded += 1
            except Exception as e:
                # One bad row must not abort the whole seeding run.
                logger.warning(f"Wikidata breed '{name}' seed failed: {e}")

    logger.info(f"Wikidata breeds seeded: {seeded}")
    return seeded
def _store_photo(path: str, payload: bytes) -> None:
    """Write *payload* to *path* through a temporary file and a rename.

    A plain write that fails midway would leave a truncated file that later
    runs treat as an already finished download.
    """
    tmp_path = f"{path}.part"
    try:
        with open(tmp_path, "wb") as fh:
            fh.write(payload)
        os.replace(tmp_path, path)
    except OSError:
        # Best-effort cleanup of the partial temp file, then re-raise.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise


async def mirror_wikidata_photos():
    """Download photos for Wikidata breeds that still have external URLs.

    Handles ``wiki_rassen`` rows whose ``external_id`` starts with ``wd_``
    and whose ``foto_url`` is an http(s) URL. Each image is stored as
    ``<BREEDS_DIR>/<QID>.jpg`` and ``foto_url`` is rewritten to
    ``/media/breeds/<QID>.jpg``. Requests are spaced 300 ms apart and an
    HTTP 429 is retried once after a pause.

    Returns:
        Number of rows whose ``foto_url`` now points at a local file.
    """
    import asyncio

    os.makedirs(BREEDS_DIR, exist_ok=True)
    with db() as conn:
        rows = conn.execute(
            """SELECT id, external_id, foto_url FROM wiki_rassen
               WHERE external_id LIKE 'wd_%'
               AND foto_url LIKE 'http%'
               AND foto_url NOT LIKE '/media/%'"""
        ).fetchall()
    if not rows:
        logger.info("Wikidata photos: nothing to mirror")
        return 0

    mirrored = 0
    retries = 2
    async with httpx.AsyncClient(
        timeout=30,
        follow_redirects=True,
        headers={"User-Agent": "BanYaro/1.0 (https://banyaro.app)"}
    ) as client:
        for i, (row_id, ext_id, img_url) in enumerate(rows):
            # Strip only the leading "wd_" prefix, not every occurrence.
            qid = ext_id[len("wd_"):] if ext_id.startswith("wd_") else ext_id
            local_path = os.path.join(BREEDS_DIR, f"{qid}.jpg")
            local_url = f"/media/breeds/{qid}.jpg"

            if os.path.exists(local_path):
                with db() as conn:
                    conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?",
                                 (local_url, row_id))
                mirrored += 1
                continue

            # Wikimedia Commons: append ?width=600 for scaled download
            fetch_url = img_url if "?" in img_url else img_url + "?width=600"

            for attempt in range(retries):
                try:
                    await asyncio.sleep(0.3)  # 300 ms between requests -> ~3/s
                    r = await client.get(fetch_url)
                    is_image = r.headers.get("content-type", "").startswith("image")
                    if r.status_code == 200 and is_image and r.content:
                        _store_photo(local_path, r.content)
                        with db() as conn:
                            conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?",
                                         (local_url, row_id))
                        mirrored += 1
                        break
                    if r.status_code == 429:
                        if attempt + 1 >= retries:
                            # No attempt left: waiting again would be pointless.
                            logger.warning(f"Wikidata photo {qid}: HTTP {r.status_code}")
                            break
                        wait = 10 * (attempt + 1)
                        logger.info(f"Rate limited, warte {wait}s…")
                        await asyncio.sleep(wait)
                        continue
                    logger.warning(f"Wikidata photo {qid}: HTTP {r.status_code}")
                    break
                except Exception as e:
                    # Per-photo failures are logged; the run continues.
                    logger.warning(f"Wikidata photo {qid} failed: {e}")
                    break

            if i % 50 == 0 and i > 0:
                logger.info(f"Wikidata photos: {mirrored}/{i+1} bisher")

    logger.info(f"Wikidata photos mirrored: {mirrored}/{len(rows)}")
    return mirrored