Sprint 11: Freunde & Chat + Phosphor-Icon-Vollmigration
- Freundschaften (pending/accepted), Nutzersuche, Anfragen per Push - Direktnachrichten mit Polling, iMessage-Stil, Deep-Links aus Push - Alle Seiten (map, places, diary, health, dog-profile, sitting, knigge, forum, wiki, walks) vollständig auf Phosphor-Icons migriert - Wikidata-Rassen-Scraper (~833 neue Rassen, lokal gespiegelte Fotos) - TheDogAPI lokal gespiegelt (169 Rassen + Fotos) - Quiz-Result-Cards horizontal (korrekte Bildproportionen) - SW by-v89
This commit is contained in:
parent
96bd57f0ad
commit
097295c628
44 changed files with 9980 additions and 300 deletions
0
backend/scraper/__init__.py
Normal file
0
backend/scraper/__init__.py
Normal file
138
backend/scraper/breeds.py
Normal file
138
backend/scraper/breeds.py
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
"""Fetches breed data from TheDogAPI and seeds the wiki_rassen table."""
|
||||
import httpx, re, logging, os
|
||||
from database import db
|
||||
|
||||
# Module-level logger for the TheDogAPI scraper.
logger = logging.getLogger(__name__)

# Root directory of the media storage; the MEDIA_DIR environment variable
# overrides the default.
MEDIA_DIR = os.environ.get("MEDIA_DIR", "/data/media")

# Sub-directory that receives the locally mirrored breed photos.
BREEDS_DIR = os.path.join(MEDIA_DIR, "breeds")
|
||||
|
||||
def _slug(name: str) -> str:
|
||||
return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
|
||||
|
||||
def _derive_groesse(weight_max_kg: float) -> str:
|
||||
if weight_max_kg <= 10: return 'klein'
|
||||
if weight_max_kg <= 25: return 'mittel'
|
||||
if weight_max_kg <= 40: return 'gross'
|
||||
return 'sehr_gross'
|
||||
|
||||
def _derive_aktivitaet(bred_for: str, temperament: str, group: str) -> str:
|
||||
text = f"{bred_for or ''} {temperament or ''} {group or ''}".lower()
|
||||
high_keywords = ['herding', 'hunting', 'sporting', 'working', 'energetic', 'active', 'agile']
|
||||
low_keywords = ['companion', 'toy', 'lap', 'gentle', 'calm', 'quiet']
|
||||
if any(k in text for k in high_keywords): return 'hoch'
|
||||
if any(k in text for k in low_keywords): return 'niedrig'
|
||||
return 'mittel'
|
||||
|
||||
def _derive_erfahrung(temperament: str, group: str) -> str:
|
||||
text = f"{temperament or ''} {group or ''}".lower()
|
||||
expert = ['stubborn', 'independent', 'dominant', 'terrier', 'herding']
|
||||
advanced = ['protective', 'reserved', 'working', 'guard']
|
||||
if any(k in text for k in expert): return 'fortgeschritten'
|
||||
if any(k in text for k in advanced): return 'fortgeschritten'
|
||||
return 'anfaenger'
|
||||
|
||||
def _derive_kinder(temperament: str) -> int:
|
||||
if not temperament: return 1
|
||||
bad = ['aggressive', 'aloof', 'reserved with strangers']
|
||||
return 0 if any(k in temperament.lower() for k in bad) else 1
|
||||
|
||||
def _parse_weight_kg(weight_metric: str):
|
||||
"""Parse '7 - 14' or '14' -> (min, max) in kg"""
|
||||
try:
|
||||
parts = [p.strip() for p in weight_metric.replace(',', '.').split('-')]
|
||||
nums = [float(p) for p in parts if p]
|
||||
if len(nums) >= 2: return nums[0], nums[1]
|
||||
if len(nums) == 1: return nums[0], nums[0]
|
||||
except Exception: pass
|
||||
return None, None
|
||||
|
||||
async def mirror_breed_photos():
    """Download remotely hosted breed photos and point ``foto_url`` at the local copy.

    Every ``wiki_rassen`` row whose ``foto_url`` starts with ``http`` is
    processed: the image is stored as ``BREEDS_DIR/<external_id>.jpg`` and
    ``foto_url`` is rewritten to ``/media/breeds/<external_id>.jpg``. If the
    local file already exists only the database row is updated. A failure for
    a single photo is logged and does not stop the run.

    Returns:
        The number of rows that now point at a local file (0 when there was
        nothing to mirror).
    """
    os.makedirs(BREEDS_DIR, exist_ok=True)

    with db() as conn:
        rows = conn.execute(
            "SELECT id, external_id, foto_url FROM wiki_rassen WHERE foto_url LIKE 'http%' AND foto_url NOT LIKE '/media/%'"
        ).fetchall()

    if not rows:
        logger.info("Breed photos: nothing to mirror")
        return 0

    def _point_to_local(row_id, local_url):
        """Store the local URL for one breed row."""
        with db() as conn:
            conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?", (local_url, row_id))

    def _write_atomically(path, payload):
        """Write *payload* to a temp file and move it into place.

        A partially written file must never appear under the final name,
        because an existing file is treated as "already downloaded".
        """
        tmp_path = path + ".part"
        try:
            with open(tmp_path, "wb") as f:
                f.write(payload)
            os.replace(tmp_path, path)
        finally:
            # Only still present when writing or renaming failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    mirrored = 0
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        for row_id, ext_id, cdn_url in rows:
            local_path = os.path.join(BREEDS_DIR, f"{ext_id}.jpg")
            local_url = f"/media/breeds/{ext_id}.jpg"

            try:
                # Skip the download if the file is already there.
                if os.path.exists(local_path):
                    _point_to_local(row_id, local_url)
                    mirrored += 1
                    continue

                r = await client.get(cdn_url)
                if r.status_code != 200:
                    logger.warning(f"Breed photo {ext_id}: HTTP {r.status_code}")
                    continue

                content_type = r.headers.get("content-type", "")
                # A missing header is tolerated; an explicit non-image type
                # (e.g. an HTML error page served with 200) is not.
                if content_type and not content_type.startswith("image"):
                    logger.warning(f"Breed photo {ext_id}: unexpected content type {content_type!r}")
                    continue
                if not r.content:
                    logger.warning(f"Breed photo {ext_id}: empty response body")
                    continue

                _write_atomically(local_path, r.content)
                _point_to_local(row_id, local_url)
                mirrored += 1
            except Exception as e:
                # Best effort: one broken photo must not abort the whole run.
                logger.warning(f"Breed photo {ext_id} download failed: {e}")

    logger.info(f"Breed photos mirrored: {mirrored}/{len(rows)}")
    return mirrored
|
||||
|
||||
|
||||
async def fetch_and_seed_breeds():
    """Fetch all breeds from TheDogAPI and upsert them into ``wiki_rassen``.

    The API key is read from the ``THEDOGAPI_KEY`` environment variable. Size,
    activity level, required experience and the child/flat suitability flags
    are derived from the API fields with the module's ``_derive_*`` helpers.
    For a breed that already exists (same ``external_id``) only ``foto_url``
    and ``temperament`` are refreshed; a ``foto_url`` that already points at a
    local ``/media/`` copy is kept, and a missing API image never erases an
    existing URL.

    Returns:
        The number of breeds inserted or updated; 0 if the API request failed
        or returned something other than a list.
    """
    api_key = os.getenv("THEDOGAPI_KEY", "")
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            r = await client.get('https://api.thedogapi.com/v1/breeds',
                                 headers={'x-api-key': api_key})
            r.raise_for_status()
            breeds = r.json()
    except Exception as e:
        logger.error(f"TheDogAPI fetch failed: {e}")
        return 0

    if not isinstance(breeds, list):
        # An error object instead of the breed list would otherwise be
        # iterated key by key.
        logger.error(f"TheDogAPI fetch failed: unexpected payload type {type(breeds).__name__}")
        return 0

    seeded = 0
    with db() as conn:
        for b in breeds:
            if not isinstance(b, dict):
                logger.warning(f"Breed entry skipped (not an object): {b!r}")
                continue
            try:
                # 'weight' / 'image' may be absent or null; treat both alike.
                weight = b.get('weight') or {}
                image = b.get('image') or {}

                w_min, w_max = _parse_weight_kg(weight.get('metric', '') or '')
                # Breeds without a usable weight are classified like a 20 kg dog.
                groesse = _derive_groesse(w_max or 20)
                aktivitaet = _derive_aktivitaet(b.get('bred_for', ''), b.get('temperament', ''), b.get('breed_group', ''))
                erfahrung = _derive_erfahrung(b.get('temperament', ''), b.get('breed_group', ''))
                kinder = _derive_kinder(b.get('temperament', ''))
                wohnung = 1 if groesse == 'klein' and aktivitaet in ('niedrig', 'mittel') else 0
                foto_url = image.get('url') or None
                slug = _slug(b['name'])
                conn.execute("""
                    INSERT INTO wiki_rassen
                        (external_id, name, gruppe, herkunft, temperament,
                         gewicht_min_kg, gewicht_max_kg, groesse, lebensdauer,
                         foto_url, bred_for, aktivitaet, wohnung_geeignet,
                         kinder_geeignet, erfahrung, slug)
                    VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
                    ON CONFLICT(external_id) DO UPDATE SET
                        foto_url = CASE
                            WHEN wiki_rassen.foto_url LIKE '/media/%'
                                THEN wiki_rassen.foto_url
                            ELSE COALESCE(excluded.foto_url, wiki_rassen.foto_url)
                        END,
                        temperament = excluded.temperament
                """, (
                    b['id'], b['name'],
                    b.get('breed_group'), b.get('origin'), b.get('temperament'),
                    w_min, w_max, groesse, b.get('life_span'),
                    foto_url, b.get('bred_for'), aktivitaet, wohnung, kinder, erfahrung, slug
                ))
                seeded += 1
            except Exception as e:
                # Best effort: a single broken entry must not stop the seeding.
                logger.warning(f"Breed {b.get('name')} seed failed: {e}")
    logger.info(f"Breeds seeded: {seeded}")
    return seeded
|
||||
317
backend/scraper/events_vdh.py
Normal file
317
backend/scraper/events_vdh.py
Normal file
|
|
@ -0,0 +1,317 @@
|
|||
"""
|
||||
BAN YARO — VDH Veranstaltungs-Scraper
|
||||
Scrapt Hundeveranstaltungen von vdh.de.
|
||||
Bei Fehler oder 0 Ergebnissen: Fallback auf hartcodierte Events.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime
|
||||
from html.parser import HTMLParser
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
FALLBACK_EVENTS = [
|
||||
{"titel": "VDH-Europasiegershow 2026", "datum": "2026-06-14", "ort_name": "Dortmund", "typ": "ausstellung", "link": "https://www.vdh.de", "external_id": "vdh-fallback-europasieger-2026"},
|
||||
{"titel": "Internationale Hundeausstellung Frankfurt", "datum": "2026-05-03", "ort_name": "Frankfurt am Main", "typ": "ausstellung", "link": "https://www.vdh.de", "external_id": "vdh-fallback-frankfurt-2026"},
|
||||
{"titel": "VDH-Bundessiegerprüfung Agility", "datum": "2026-07-19", "ort_name": "Leipzig", "typ": "wettkampf", "link": "https://www.vdh.de", "external_id": "vdh-fallback-agility-2026"},
|
||||
{"titel": "Rassehundetreffen München", "datum": "2026-08-22", "ort_name": "München", "typ": "treffen", "link": "https://www.vdh.de", "external_id": "vdh-fallback-muenchen-2026"},
|
||||
{"titel": "Hundesport-Turnier Berlin", "datum": "2026-09-12", "ort_name": "Berlin", "typ": "wettkampf", "link": "https://www.vdh.de", "external_id": "vdh-fallback-berlin-2026"},
|
||||
]
|
||||
|
||||
# Mapping VDH-Kategorienamen → interne Typen
|
||||
_TYP_MAP = {
|
||||
"ausstellung": "ausstellung",
|
||||
"show": "ausstellung",
|
||||
"siegershow": "ausstellung",
|
||||
"agility": "wettkampf",
|
||||
"wettkampf": "wettkampf",
|
||||
"turnier": "wettkampf",
|
||||
"prüfung": "wettkampf",
|
||||
"training": "training",
|
||||
"treffen": "treffen",
|
||||
"markt": "markt",
|
||||
}
|
||||
|
||||
# Monatsnamen Deutsch → Zahl
|
||||
_MONATE = {
|
||||
"januar": 1, "februar": 2, "märz": 3, "maerz": 3,
|
||||
"april": 4, "mai": 5, "juni": 6, "juli": 7,
|
||||
"august": 8, "september": 9, "oktober": 10,
|
||||
"november": 11, "dezember": 12,
|
||||
}
|
||||
|
||||
|
||||
def _guess_typ(text: str) -> str:
    """Derive the internal event type from an event title.

    Returns the type of the first ``_TYP_MAP`` keyword (in the mapping's
    insertion order) that occurs in the lower-cased title, or "sonstiges"
    when no keyword matches.
    """
    lowered = text.lower()
    return next(
        (typ for keyword, typ in _TYP_MAP.items() if keyword in lowered),
        "sonstiges",
    )
|
||||
|
||||
|
||||
def _parse_date(raw: str) -> str | None:
|
||||
"""
|
||||
Versucht verschiedene Datumsformate zu parsen.
|
||||
Gibt YYYY-MM-DD zurück oder None.
|
||||
"""
|
||||
raw = raw.strip()
|
||||
|
||||
# ISO: 2026-05-03
|
||||
m = re.match(r'^(\d{4})-(\d{2})-(\d{2})$', raw)
|
||||
if m:
|
||||
return raw
|
||||
|
||||
# DD.MM.YYYY oder D.M.YYYY
|
||||
m = re.match(r'^(\d{1,2})\.(\d{1,2})\.(\d{4})$', raw)
|
||||
if m:
|
||||
d, mo, y = m.groups()
|
||||
return f"{y}-{int(mo):02d}-{int(d):02d}"
|
||||
|
||||
# DD. Monatsname YYYY (z.B. "14. Juni 2026")
|
||||
m = re.match(r'^(\d{1,2})\.\s*(\w+)\s+(\d{4})$', raw)
|
||||
if m:
|
||||
d, mon_str, y = m.groups()
|
||||
mon_num = _MONATE.get(mon_str.lower())
|
||||
if mon_num:
|
||||
return f"{y}-{mon_num:02d}-{int(d):02d}"
|
||||
|
||||
# Monatsname DD, YYYY (englisch, Fallback)
|
||||
try:
|
||||
dt = datetime.strptime(raw, "%B %d, %Y")
|
||||
return dt.strftime("%Y-%m-%d")
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
class _VDHParser(HTMLParser):
|
||||
"""
|
||||
Einfacher Zustandsautomat-Parser für die VDH-Veranstaltungsseite.
|
||||
Sucht nach typischen Strukturen: article, li.event, div mit Datums-/Titel-Klassen.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self._events: list[dict] = []
|
||||
self._current: dict | None = None
|
||||
self._depth = 0
|
||||
self._start_depth = 0
|
||||
self._capture = None # 'titel' | 'datum' | 'ort'
|
||||
self._buf = ""
|
||||
self._in_event = False
|
||||
|
||||
# ---------- Hilfsmethoden ----------
|
||||
|
||||
def _is_event_container(self, tag, attrs):
|
||||
"""Erkennt Start eines Event-Blocks."""
|
||||
a = dict(attrs)
|
||||
cls = a.get("class", "")
|
||||
return (
|
||||
tag == "article"
|
||||
or (tag in ("li", "div") and any(
|
||||
kw in cls for kw in ("event", "veranstaltung", "termin", "entry", "item")
|
||||
))
|
||||
)
|
||||
|
||||
def _is_title_tag(self, tag, attrs):
|
||||
a = dict(attrs)
|
||||
cls = a.get("class", "")
|
||||
return tag in ("h2", "h3", "h4") or any(
|
||||
kw in cls for kw in ("title", "titel", "name", "heading")
|
||||
)
|
||||
|
||||
def _is_date_tag(self, tag, attrs):
|
||||
a = dict(attrs)
|
||||
cls = a.get("class", "")
|
||||
it = a.get("itemprop", "")
|
||||
return (
|
||||
tag in ("time",)
|
||||
or any(kw in cls for kw in ("date", "datum", "time"))
|
||||
or it in ("startDate", "endDate")
|
||||
)
|
||||
|
||||
def _is_location_tag(self, tag, attrs):
|
||||
a = dict(attrs)
|
||||
cls = a.get("class", "")
|
||||
it = a.get("itemprop", "")
|
||||
return (
|
||||
any(kw in cls for kw in ("location", "ort", "venue", "place", "city"))
|
||||
or it in ("location", "addressLocality")
|
||||
)
|
||||
|
||||
# ---------- SAX-Events ----------
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
self._depth += 1
|
||||
a = dict(attrs)
|
||||
|
||||
if not self._in_event and self._is_event_container(tag, attrs):
|
||||
self._in_event = True
|
||||
self._start_depth = self._depth
|
||||
self._current = {"titel": "", "datum": "", "ort_name": "", "link": ""}
|
||||
# Direkter Link auf dem Container?
|
||||
if tag == "a" and "href" in a:
|
||||
self._current["link"] = a["href"]
|
||||
return
|
||||
|
||||
if self._in_event:
|
||||
# Link innerhalb des Event-Blocks
|
||||
if tag == "a" and "href" in a and not self._current.get("link"):
|
||||
href = a["href"]
|
||||
if "vdh.de" in href or href.startswith("/"):
|
||||
self._current["link"] = href
|
||||
|
||||
# <time datetime="…">
|
||||
if tag == "time":
|
||||
dt = a.get("datetime", "")
|
||||
if dt:
|
||||
parsed = _parse_date(dt)
|
||||
if parsed:
|
||||
self._current["datum"] = parsed
|
||||
|
||||
if self._is_title_tag(tag, attrs):
|
||||
self._capture = "titel"
|
||||
self._buf = ""
|
||||
elif self._is_date_tag(tag, attrs) and not self._current.get("datum"):
|
||||
self._capture = "datum"
|
||||
self._buf = ""
|
||||
elif self._is_location_tag(tag, attrs):
|
||||
self._capture = "ort"
|
||||
self._buf = ""
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if self._capture:
|
||||
val = self._buf.strip()
|
||||
if self._capture == "titel" and val:
|
||||
self._current["titel"] = val
|
||||
elif self._capture == "datum" and val and not self._current.get("datum"):
|
||||
parsed = _parse_date(val)
|
||||
if parsed:
|
||||
self._current["datum"] = parsed
|
||||
elif self._capture == "ort" and val:
|
||||
self._current["ort_name"] = val
|
||||
self._capture = None
|
||||
self._buf = ""
|
||||
|
||||
self._depth -= 1
|
||||
|
||||
if self._in_event and self._depth < self._start_depth:
|
||||
self._in_event = False
|
||||
ev = self._current
|
||||
# Nur speichern wenn wir Titel + Datum haben
|
||||
if ev and ev.get("titel") and ev.get("datum"):
|
||||
self._events.append(ev)
|
||||
self._current = None
|
||||
|
||||
def handle_data(self, data):
|
||||
if self._capture:
|
||||
self._buf += data
|
||||
|
||||
def get_events(self) -> list[dict]:
|
||||
return self._events
|
||||
|
||||
|
||||
def _build_external_id(ev: dict) -> str:
|
||||
"""Erzeugt einen stabilen Dedup-Key aus Datum + Titel."""
|
||||
raw = f"vdh-{ev['datum']}-{ev['titel']}"
|
||||
# Einfache Normalisierung: lowercase, Sonderzeichen raus
|
||||
key = re.sub(r'[^a-z0-9]+', '-', raw.lower()).strip('-')
|
||||
return key[:120]
|
||||
|
||||
|
||||
async def fetch_vdh_events() -> list[dict]:
    """
    Scrape VDH events and return them as a list of dicts with the keys
    {titel, datum, ort_name, typ, link, external_id}.

    The candidate URLs are tried in order; the first page that yields events
    wins. Events dated before today, events with a title shorter than three
    characters and duplicates (same external_id) are dropped.

    On errors, or when no events remain, a copy of FALLBACK_EVENTS is returned.
    """
    candidate_urls = (
        "https://www.vdh.de/veranstaltungen/ausstellungen/",
        "https://www.vdh.de/veranstaltungen/",
    )

    browser_user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    )
    request_headers = {
        "User-Agent": browser_user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "de-DE,de;q=0.9,en;q=0.5",
    }

    scraped: list[dict] = []

    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        for url in candidate_urls:
            try:
                resp = await client.get(url, headers=request_headers)
                resp.raise_for_status()

                page_parser = _VDHParser()
                page_parser.feed(resp.text)
                found = page_parser.get_events()
            except httpx.HTTPStatusError as e:
                logger.warning(f"VDH-Scraper HTTP-Fehler {e.response.status_code} für {url}: {e}")
                continue
            except httpx.RequestError as e:
                logger.warning(f"VDH-Scraper Netzwerkfehler für {url}: {e}")
                continue
            except Exception as e:
                logger.warning(f"VDH-Scraper unbekannter Fehler für {url}: {e}")
                continue

            if not found:
                logger.info(f"VDH-Scraper: Keine Events auf {url} gefunden, nächste URL versuchen.")
                continue

            logger.info(f"VDH-Scraper: {len(found)} Events von {url} geparst.")
            scraped = found
            break

    if not scraped:
        logger.warning("VDH-Scraper: Keine Daten erhalten — verwende Fallback-Events.")
        return list(FALLBACK_EVENTS)

    # Normalise
    today = datetime.today().strftime("%Y-%m-%d")
    upcoming = []
    known_ids: set[str] = set()

    for ev in scraped:
        datum = ev.get("datum", "")
        # Keep future events only (ISO strings compare chronologically)
        if datum < today:
            continue

        titel = ev.get("titel", "").strip()
        if len(titel) < 3:
            continue

        link = ev.get("link", "")
        if link and link.startswith("/"):
            # Site-relative link: make it absolute
            link = "https://www.vdh.de" + link

        external_id = _build_external_id(ev)
        if external_id in known_ids:
            continue
        known_ids.add(external_id)

        upcoming.append({
            "titel": titel,
            "datum": datum,
            "ort_name": ev.get("ort_name") or None,
            "typ": _guess_typ(titel),
            "link": link or "https://www.vdh.de",
            "external_id": external_id,
        })

    if not upcoming:
        logger.warning("VDH-Scraper: Nach Filterung 0 zukünftige Events — verwende Fallback-Events.")
        return list(FALLBACK_EVENTS)

    logger.info(f"VDH-Scraper: {len(upcoming)} zukünftige Events nach Normalisierung.")
    return upcoming
|
||||
196
backend/scraper/wikidata_breeds.py
Normal file
196
backend/scraper/wikidata_breeds.py
Normal file
|
|
@ -0,0 +1,196 @@
|
|||
"""Fetches missing dog breed data from Wikidata SPARQL and seeds wiki_rassen."""
|
||||
import httpx, re, logging, os
|
||||
from database import db
|
||||
|
||||
# Module-level logger for the Wikidata scraper.
logger = logging.getLogger(__name__)

# Root directory of the media storage; the MEDIA_DIR environment variable
# overrides the default.
MEDIA_DIR = os.environ.get("MEDIA_DIR", "/data/media")

# Sub-directory that receives the locally mirrored breed photos.
BREEDS_DIR = os.path.join(MEDIA_DIR, "breeds")

# Wikidata SPARQL endpoint.
SPARQL_URL = "https://query.wikidata.org/sparql"

# Selects every item that is an instance of (P31) Q39367 together with its
# German/English label, an image (P18), the German label of its country of
# origin (P495) and German/English descriptions.
# GROUP BY + SAMPLE so each breed appears once even if it has multiple images.
SPARQL_QUERY = """
SELECT ?breed
(SAMPLE(?nameDE) AS ?nameDE)
(SAMPLE(?nameEN) AS ?nameEN)
(SAMPLE(?image) AS ?image)
(SAMPLE(?countryDE) AS ?countryDE)
(SAMPLE(?descDE) AS ?descDE)
(SAMPLE(?descEN) AS ?descEN)
WHERE {
?breed wdt:P31 wd:Q39367 .
OPTIONAL { ?breed rdfs:label ?nameDE FILTER(LANG(?nameDE) = "de") }
OPTIONAL { ?breed rdfs:label ?nameEN FILTER(LANG(?nameEN) = "en") }
FILTER(BOUND(?nameDE) || BOUND(?nameEN))
OPTIONAL { ?breed wdt:P18 ?image }
OPTIONAL {
?breed wdt:P495 ?country .
?country rdfs:label ?countryDE FILTER(LANG(?countryDE) = "de")
}
OPTIONAL { ?breed schema:description ?descDE FILTER(LANG(?descDE) = "de") }
OPTIONAL { ?breed schema:description ?descEN FILTER(LANG(?descEN) = "en") }
}
GROUP BY ?breed
ORDER BY ?nameDE ?nameEN
"""
|
||||
|
||||
def _slug(name: str) -> str:
|
||||
return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
|
||||
|
||||
|
||||
def _normalise(name: str) -> str:
|
||||
"""Lowercase + remove diacritics for name deduplication."""
|
||||
import unicodedata
|
||||
nfkd = unicodedata.normalize('NFKD', name.lower())
|
||||
return re.sub(r'[^a-z0-9 ]', '', nfkd).strip()
|
||||
|
||||
|
||||
async def fetch_and_seed_wikidata_breeds():
    """Query Wikidata for dog breeds and insert only those missing from wiki_rassen.

    A breed counts as already present when its normalised name (see
    ``_normalise``) matches an existing row. New rows get the external id
    ``wd_<QID>``, a unique slug and fixed default values for the attributes
    Wikidata does not provide. A malformed result entry or a failing insert
    is logged and skipped.

    Returns:
        The number of breeds inserted (or updated through the upsert);
        0 if the SPARQL request failed.
    """
    # -- fetch from SPARQL -------------------------------------------------
    try:
        async with httpx.AsyncClient(
            timeout=60,
            headers={"Accept": "application/sparql-results+json",
                     "User-Agent": "BanYaro/1.0 (https://banyaro.app; contact@banyaro.app)"}
        ) as client:
            r = await client.get(SPARQL_URL, params={"query": SPARQL_QUERY})
            r.raise_for_status()
            data = r.json()
    except Exception as e:
        logger.error(f"Wikidata SPARQL fetch failed: {e}")
        return 0

    # Tolerate a payload that is not the expected {"results": {"bindings": [...]}}.
    results = data.get("results") if isinstance(data, dict) else None
    bindings = (results.get("bindings") if isinstance(results, dict) else None) or []
    logger.info(f"Wikidata: {len(bindings)} breed entries received")

    # -- load existing names for deduplication -----------------------------
    with db() as conn:
        existing = conn.execute("SELECT name FROM wiki_rassen").fetchall()
    existing_norm = {_normalise(row[0]) for row in existing}

    seeded = 0
    with db() as conn:
        for b in bindings:
            try:
                name = (b.get("nameDE", {}).get("value") or
                        b.get("nameEN", {}).get("value") or "").strip()
                if not name:
                    continue

                # skip if already in DB (by normalised name)
                if _normalise(name) in existing_norm:
                    continue

                qid = b["breed"]["value"].rsplit("/", 1)[-1]  # e.g. "Q312440"
                image_url = b.get("image", {}).get("value") or None
                herkunft = b.get("countryDE", {}).get("value") or None
                desc = (b.get("descDE", {}).get("value") or
                        b.get("descEN", {}).get("value") or None)
            except (AttributeError, KeyError, TypeError) as e:
                # One malformed binding must not abort the whole run.
                logger.warning(f"Wikidata binding skipped (malformed): {e}")
                continue

            ext_id = f"wd_{qid}"
            # A name without any ASCII letter or digit yields an empty slug;
            # fall back to the external id so the slug is never empty.
            slug_base = _slug(name) or _slug(ext_id)

            try:
                # make slug unique if collision exists
                slug = slug_base
                suffix = 1
                while conn.execute(
                    "SELECT 1 FROM wiki_rassen WHERE slug=? AND external_id != ?",
                    (slug, ext_id)
                ).fetchone():
                    slug = f"{slug_base}-{suffix}"
                    suffix += 1

                conn.execute("""
                    INSERT INTO wiki_rassen
                        (external_id, name, gruppe, herkunft, temperament,
                         gewicht_min_kg, gewicht_max_kg, groesse, lebensdauer,
                         foto_url, bred_for, aktivitaet, wohnung_geeignet,
                         kinder_geeignet, erfahrung, slug)
                    VALUES (?,?,?,?,?,NULL,NULL,'mittel',NULL,?,NULL,'mittel',0,1,'anfaenger',?)
                    ON CONFLICT(external_id) DO UPDATE SET
                        foto_url = CASE
                            WHEN excluded.foto_url IS NOT NULL AND wiki_rassen.foto_url IS NULL
                                THEN excluded.foto_url
                            ELSE wiki_rassen.foto_url
                        END,
                        herkunft = COALESCE(wiki_rassen.herkunft, excluded.herkunft),
                        temperament = COALESCE(wiki_rassen.temperament, excluded.temperament)
                """, (ext_id, name, None, herkunft, desc, image_url, slug))
                existing_norm.add(_normalise(name))  # avoid re-inserting within same run
                seeded += 1
            except Exception as e:
                logger.warning(f"Wikidata breed '{name}' seed failed: {e}")

    logger.info(f"Wikidata breeds seeded: {seeded}")
    return seeded
|
||||
|
||||
|
||||
async def mirror_wikidata_photos():
    """Download Wikimedia Commons photos for Wikidata breeds that still have external URLs.

    Rows with an external id starting with ``wd_`` and an ``http`` ``foto_url``
    are processed one by one, with a 300 ms pause before every request. The
    image is stored as ``BREEDS_DIR/<QID>.jpg`` and ``foto_url`` is rewritten
    to ``/media/breeds/<QID>.jpg``. A 429 response is retried once after a
    pause; every other failure is logged and the photo is skipped.

    Returns:
        The number of rows that now point at a local file (0 when there was
        nothing to mirror).
    """
    import asyncio

    os.makedirs(BREEDS_DIR, exist_ok=True)

    with db() as conn:
        rows = conn.execute(
            """SELECT id, external_id, foto_url FROM wiki_rassen
               WHERE external_id LIKE 'wd_%'
                 AND foto_url LIKE 'http%'
                 AND foto_url NOT LIKE '/media/%'"""
        ).fetchall()

    if not rows:
        logger.info("Wikidata photos: nothing to mirror")
        return 0

    def _point_to_local(row_id, local_url):
        """Store the local URL for one breed row."""
        with db() as conn:
            conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?",
                         (local_url, row_id))

    def _write_atomically(path, payload):
        """Write *payload* to a temp file and move it into place.

        A partially written file must never appear under the final name,
        because an existing file is treated as "already downloaded".
        """
        tmp_path = path + ".part"
        try:
            with open(tmp_path, "wb") as f:
                f.write(payload)
            os.replace(tmp_path, path)
        finally:
            # Only still present when writing or renaming failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    mirrored = 0
    max_attempts = 2
    async with httpx.AsyncClient(
        timeout=30,
        follow_redirects=True,
        headers={"User-Agent": "BanYaro/1.0 (https://banyaro.app)"}
    ) as client:
        for i, (row_id, ext_id, img_url) in enumerate(rows):
            # Strip only the leading "wd_" marker.
            qid = ext_id[3:] if ext_id.startswith("wd_") else ext_id
            local_path = os.path.join(BREEDS_DIR, f"{qid}.jpg")
            local_url = f"/media/breeds/{qid}.jpg"

            if os.path.exists(local_path):
                _point_to_local(row_id, local_url)
                mirrored += 1
                continue

            # Wikimedia Commons: append ?width=600 for scaled download
            fetch_url = img_url if "?" in img_url else img_url + "?width=600"
            for attempt in range(max_attempts):
                try:
                    await asyncio.sleep(0.3)  # 300 ms between requests -> ~3/s
                    r = await client.get(fetch_url)
                    content_type = r.headers.get("content-type", "")

                    if r.status_code == 200 and content_type.startswith("image"):
                        _write_atomically(local_path, r.content)
                        _point_to_local(row_id, local_url)
                        mirrored += 1
                        break

                    if r.status_code == 429:
                        if attempt + 1 >= max_attempts:
                            # No attempt left: do not sleep for nothing.
                            logger.warning(f"Wikidata photo {qid}: HTTP 429, giving up")
                            break
                        wait = 10 * (attempt + 1)
                        logger.info(f"Rate limited, warte {wait}s…")
                        await asyncio.sleep(wait)
                        continue

                    if r.status_code == 200:
                        logger.warning(
                            f"Wikidata photo {qid}: unexpected content type {content_type!r}")
                    else:
                        logger.warning(f"Wikidata photo {qid}: HTTP {r.status_code}")
                    break
                except Exception as e:
                    logger.warning(f"Wikidata photo {qid} failed: {e}")
                    break

            # Progress report every 50 rows.
            if i % 50 == 0 and i > 0:
                logger.info(f"Wikidata photos: {mirrored}/{i+1} bisher")

    logger.info(f"Wikidata photos mirrored: {mirrored}/{len(rows)}")
    return mirrored
|
||||
Loading…
Add table
Add a link
Reference in a new issue