banyaro/backend/scraper/wikidata_breeds.py
rene a7753c9cf5 Sprint 16: Chat-Fotos/Online/Read-Receipts, Gesundheit-Dokumente löschen, Bugfixes
- Chat: Foto-Versand (POST /api/chat/conversations/{id}/upload, media_url/media_type)
- Chat: Online-Indikator (last_seen Heartbeat, grüner Dot, 3min-Fenster)
- Chat: Read Receipts (read_at, Einzel-/Doppelhaken-Icons)
- Gesundheit: Dokument löschen (DELETE .../dokument, Datei + DB-Eintrag)
- Bug: events.user_id NOT NULL → nullable (Table-Recreation-Migration)
- Bug: scheduler INSERT user_id 0 → NULL
- Bug: Wikidata Rate-Limit: sleep 0.3s→1.0s, retries 2→4, exponentielles Backoff
- SW: by-v146, APP_VER 119
2026-04-17 22:38:33 +02:00

196 lines
7.8 KiB
Python

"""Fetches missing dog breed data from Wikidata SPARQL and seeds wiki_rassen."""
import httpx, re, logging, os
from database import db
logger = logging.getLogger(__name__)
MEDIA_DIR = os.getenv("MEDIA_DIR", "/data/media")
BREEDS_DIR = os.path.join(MEDIA_DIR, "breeds")
SPARQL_URL = "https://query.wikidata.org/sparql"
# GROUP BY + SAMPLE so each breed appears once even if it has multiple images
SPARQL_QUERY = """
SELECT ?breed
(SAMPLE(?nameDE) AS ?nameDE)
(SAMPLE(?nameEN) AS ?nameEN)
(SAMPLE(?image) AS ?image)
(SAMPLE(?countryDE) AS ?countryDE)
(SAMPLE(?descDE) AS ?descDE)
(SAMPLE(?descEN) AS ?descEN)
WHERE {
?breed wdt:P31 wd:Q39367 .
OPTIONAL { ?breed rdfs:label ?nameDE FILTER(LANG(?nameDE) = "de") }
OPTIONAL { ?breed rdfs:label ?nameEN FILTER(LANG(?nameEN) = "en") }
FILTER(BOUND(?nameDE) || BOUND(?nameEN))
OPTIONAL { ?breed wdt:P18 ?image }
OPTIONAL {
?breed wdt:P495 ?country .
?country rdfs:label ?countryDE FILTER(LANG(?countryDE) = "de")
}
OPTIONAL { ?breed schema:description ?descDE FILTER(LANG(?descDE) = "de") }
OPTIONAL { ?breed schema:description ?descEN FILTER(LANG(?descEN) = "en") }
}
GROUP BY ?breed
ORDER BY ?nameDE ?nameEN
"""
def _slug(name: str) -> str:
return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
def _normalise(name: str) -> str:
"""Lowercase + remove diacritics for name deduplication."""
import unicodedata
nfkd = unicodedata.normalize('NFKD', name.lower())
return re.sub(r'[^a-z0-9 ]', '', nfkd).strip()
async def fetch_and_seed_wikidata_breeds():
"""Query Wikidata for dog breeds and insert only those missing from wiki_rassen."""
# -- fetch from SPARQL -------------------------------------------------
try:
async with httpx.AsyncClient(
timeout=60,
headers={"Accept": "application/sparql-results+json",
"User-Agent": "BanYaro/1.0 (https://banyaro.app; contact@banyaro.app)"}
) as client:
r = await client.get(SPARQL_URL, params={"query": SPARQL_QUERY})
r.raise_for_status()
data = r.json()
except Exception as e:
logger.error(f"Wikidata SPARQL fetch failed: {e}")
return 0
bindings = data.get("results", {}).get("bindings", [])
logger.info(f"Wikidata: {len(bindings)} breed entries received")
# -- load existing names for deduplication -----------------------------
with db() as conn:
existing = conn.execute("SELECT name FROM wiki_rassen").fetchall()
existing_norm = {_normalise(row[0]) for row in existing}
seeded = 0
with db() as conn:
for b in bindings:
name = (b.get("nameDE", {}).get("value") or
b.get("nameEN", {}).get("value") or "").strip()
if not name:
continue
# skip if already in DB (by normalised name)
if _normalise(name) in existing_norm:
continue
qid = b["breed"]["value"].rsplit("/", 1)[-1] # e.g. "Q312440"
ext_id = f"wd_{qid}"
image_url = b.get("image", {}).get("value") or None
herkunft = b.get("countryDE", {}).get("value") or None
desc = (b.get("descDE", {}).get("value") or
b.get("descEN", {}).get("value") or None)
slug_base = _slug(name)
# make slug unique if collision exists
slug = slug_base
suffix = 1
while True:
row = conn.execute(
"SELECT 1 FROM wiki_rassen WHERE slug=? AND external_id != ?",
(slug, ext_id)
).fetchone()
if not row:
break
slug = f"{slug_base}-{suffix}"
suffix += 1
try:
conn.execute("""
INSERT INTO wiki_rassen
(external_id, name, gruppe, herkunft, temperament,
gewicht_min_kg, gewicht_max_kg, groesse, lebensdauer,
foto_url, bred_for, aktivitaet, wohnung_geeignet,
kinder_geeignet, erfahrung, slug)
VALUES (?,?,?,?,?,NULL,NULL,'mittel',NULL,?,NULL,'mittel',0,1,'anfaenger',?)
ON CONFLICT(external_id) DO UPDATE SET
foto_url = CASE
WHEN excluded.foto_url IS NOT NULL AND wiki_rassen.foto_url IS NULL
THEN excluded.foto_url
ELSE wiki_rassen.foto_url
END,
herkunft = COALESCE(wiki_rassen.herkunft, excluded.herkunft),
temperament = COALESCE(wiki_rassen.temperament, excluded.temperament)
""", (ext_id, name, None, herkunft, desc, image_url, slug))
existing_norm.add(_normalise(name)) # avoid re-inserting within same run
seeded += 1
except Exception as e:
logger.warning(f"Wikidata breed '{name}' seed failed: {e}")
logger.info(f"Wikidata breeds seeded: {seeded}")
return seeded
async def mirror_wikidata_photos():
"""Download Wikimedia Commons photos for Wikidata breeds that still have external URLs."""
os.makedirs(BREEDS_DIR, exist_ok=True)
with db() as conn:
rows = conn.execute(
"""SELECT id, external_id, foto_url FROM wiki_rassen
WHERE external_id LIKE 'wd_%'
AND foto_url LIKE 'http%'
AND foto_url NOT LIKE '/media/%'"""
).fetchall()
if not rows:
logger.info("Wikidata photos: nothing to mirror")
return 0
mirrored = 0
import asyncio
async with httpx.AsyncClient(
timeout=30,
follow_redirects=True,
headers={"User-Agent": "BanYaro/1.0 (https://banyaro.app)"}
) as client:
for i, (row_id, ext_id, img_url) in enumerate(rows):
qid = ext_id.replace("wd_", "")
local_path = os.path.join(BREEDS_DIR, f"{qid}.jpg")
local_url = f"/media/breeds/{qid}.jpg"
if os.path.exists(local_path):
with db() as conn:
conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?",
(local_url, row_id))
mirrored += 1
continue
# Wikimedia Commons: append ?width=600 for scaled download
fetch_url = img_url if "?" in img_url else img_url + "?width=600"
retries = 4
for attempt in range(retries):
try:
await asyncio.sleep(1.0 * (attempt + 1)) # exponentiell: 1s, 2s, 3s, 4s
r = await client.get(fetch_url)
if r.status_code == 200 and r.headers.get("content-type", "").startswith("image"):
with open(local_path, "wb") as f:
f.write(r.content)
with db() as conn:
conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?",
(local_url, row_id))
mirrored += 1
break
elif r.status_code == 429:
wait = 10 * (attempt + 1)
logger.info(f"Rate limited, warte {wait}s…")
await asyncio.sleep(wait)
else:
logger.warning(f"Wikidata photo {qid}: HTTP {r.status_code}")
break
except Exception as e:
logger.warning(f"Wikidata photo {qid} failed: {e}")
break
if i % 50 == 0 and i > 0:
logger.info(f"Wikidata photos: {mirrored}/{i+1} bisher")
logger.info(f"Wikidata photos mirrored: {mirrored}/{len(rows)}")
return mirrored