Sprint 11b: Wiki-Foto-Einreichungen + Wikipedia-Foto-Scraper

- User können Fotos für Rassen vorschlagen (Upload-Modal in Rassen-Detail)
- Mod/Admin-Review-Tab im Wiki mit Freischalten/Ablehnen + Push-Notification
- wikipedia_photos.py: holt Fotos über Wikidata-QID → Wikipedia-API
- Foto-Status: 578 lokal, 186 extern, 238 ohne Foto
- DB: wiki_foto_submissions Tabelle
- SW by-v90
2026-04-15 22:01:58 +02:00 · 2026-04-15 22:01:58 +02:00 · 32d630d5a1
commit 32d630d5a1
parent 097295c628
6 changed files with 598 additions and 3 deletions
--- a/backend/scraper/wikipedia_photos.py
+++ b/backend/scraper/wikipedia_photos.py
@ -0,0 +1,198 @@
+"""
+Fetch photos for Wikidata breeds without an image via the Wikipedia API.
+
+Strategy:
+1. Wikidata API: QID → Wikipedia article title (DE preferred, fallback EN)
+2. Wikipedia pageimages API: article title → image URL
+3. Wikimedia Commons: download the image and store it locally
+"""
+
+import asyncio
+import logging
+import os
+import re
+import httpx
+
+from database import db
+
+# Module-level logger, named after this module.
+logger    = logging.getLogger(__name__)
+# Root directory for media files; overridable through the MEDIA_DIR environment variable.
+MEDIA_DIR = os.getenv("MEDIA_DIR", "/data/media")
+# Directory in which the downloaded breed photos are stored.
+BREEDS_DIR = os.path.join(MEDIA_DIR, "breeds")
+
+# API endpoints queried by this module.
+WIKIDATA_API  = "https://www.wikidata.org/w/api.php"
+WP_DE_API     = "https://de.wikipedia.org/w/api.php"
+WP_EN_API     = "https://en.wikipedia.org/w/api.php"
+# Default headers sent with every request of the shared HTTP client.
+HEADERS       = {"User-Agent": "BanYaro/1.0 (https://banyaro.app; contact@banyaro.app)"}
+BATCH_SIZE    = 50   # Wikidata API allows at most 50 IDs per request
+# NOTE: despite the "_MS" suffix this value is in seconds (it is passed to asyncio.sleep).
+SLEEP_MS      = 0.35  # 350 ms pause before each download
+
+
+def _qid_from_ext(ext_id: str) -> str | None:
+    """Extract the Wikidata QID from an external id.
+
+    Example: ``'wd_Q12345'`` → ``'Q12345'``.
+
+    Args:
+        ext_id: External id as stored in the database.
+
+    Returns:
+        The QID (``Q`` followed by ASCII digits), or ``None`` if ``ext_id``
+        is not exactly of the form ``wd_Q<digits>``.
+    """
+    # fullmatch (instead of match + "$") also rejects a trailing newline, and
+    # [0-9] (instead of \d) rejects non-ASCII digits that are not valid QIDs.
+    m = re.fullmatch(r"wd_(Q[0-9]+)", ext_id)
+    return m.group(1) if m else None
+
+
+async def _fetch_sitelinks(qids: list[str], client: httpx.AsyncClient) -> dict[str, dict]:
+    """
+    Look up the German and English Wikipedia sitelinks of a batch of QIDs.
+
+    Returns {qid: {'de': 'Title_DE', 'en': 'Title_EN'}}, containing only the
+    QIDs that have at least one of the two sitelinks. A failed request is
+    logged as a warning and yields an empty dict.
+    """
+    if not qids:
+        return {}
+
+    try:
+        response = await client.get(
+            WIKIDATA_API,
+            params={
+                "action": "wbgetentities",
+                "ids": "|".join(qids),
+                "props": "sitelinks",
+                "sitefilter": "dewiki|enwiki",
+                "format": "json",
+            },
+        )
+        response.raise_for_status()
+        payload = response.json()
+    except Exception as e:
+        logger.warning(f"Wikidata sitelinks Fehler: {e}")
+        return {}
+
+    # Sitelink key in the API response → language code used by the callers.
+    site_to_lang = (("dewiki", "de"), ("enwiki", "en"))
+
+    found = {}
+    for qid, entity in payload.get("entities", {}).items():
+        links = entity.get("sitelinks", {})
+        per_lang = {
+            lang: links[site]["title"]
+            for site, lang in site_to_lang
+            if site in links
+        }
+        if per_lang:
+            found[qid] = per_lang
+    return found
+
+
+async def _fetch_wp_image(title: str, lang: str, client: httpx.AsyncClient) -> str | None:
+    """
+    Return the thumbnail URL of a Wikipedia article (requested at 600px).
+
+    The German API is used for lang == "de", the English API for any other
+    value. Returns None when no page carries a thumbnail or the request
+    fails; failures are logged at debug level.
+    """
+    endpoint = WP_EN_API if lang != "de" else WP_DE_API
+    query = {
+        "action":      "query",
+        "titles":      title,
+        "prop":        "pageimages",
+        "pithumbsize": 600,
+        "format":      "json",
+    }
+    try:
+        response = await client.get(endpoint, params=query)
+        response.raise_for_status()
+        pages = response.json().get("query", {}).get("pages", {})
+        sources = (page.get("thumbnail", {}).get("source") for page in pages.values())
+        # First non-empty thumbnail source wins.
+        return next(filter(None, sources), None)
+    except Exception as e:
+        logger.debug(f"WP pageimage Fehler ({lang}/{title}): {e}")
+    return None
+
+
+async def _download_image(url: str, path: str, client: httpx.AsyncClient) -> bool:
+    """Download an image and store it under ``path``.
+
+    Up to two attempts are made. Each attempt is preceded by a short pause
+    (``SLEEP_MS``); an HTTP 429 response triggers an additional back-off
+    before the retry.
+
+    Args:
+        url: Image URL to fetch.
+        path: Destination file path.
+        client: HTTP client used for the request.
+
+    Returns:
+        True if a non-empty image response was written to ``path``,
+        False otherwise. Errors are logged at debug level, never raised.
+    """
+    max_attempts = 2
+    # Write to a temporary sibling first so that an interrupted write never
+    # leaves a truncated file at ``path`` (which a later run would mistake
+    # for a finished download).
+    tmp_path = f"{path}.part"
+
+    for attempt in range(max_attempts):
+        try:
+            await asyncio.sleep(SLEEP_MS)
+            r = await client.get(url)
+            is_image = r.headers.get("content-type", "").startswith("image")
+            # An empty body is treated as a failure rather than stored as a
+            # zero-byte "image".
+            if r.status_code == 200 and is_image and r.content:
+                with open(tmp_path, "wb") as f:
+                    f.write(r.content)
+                os.replace(tmp_path, path)
+                return True
+            # Back off only when another attempt will actually follow.
+            if r.status_code == 429 and attempt < max_attempts - 1:
+                await asyncio.sleep(15 * (attempt + 1))
+        except Exception as e:
+            logger.debug(f"Download Fehler {url}: {e}")
+            # Best-effort cleanup of a partially written temporary file.
+            try:
+                os.remove(tmp_path)
+            except OSError:
+                pass  # nothing was written, or the file cannot be removed
+    return False
+
+
+def _link_local_photo(row_id, local_url: str) -> None:
+    """Store ``local_url`` as ``foto_url`` of the ``wiki_rassen`` row ``row_id``."""
+    with db() as conn:
+        conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?", (local_url, row_id))
+
+
+async def _resolve_image_url(titles: dict, client: httpx.AsyncClient) -> str | None:
+    """Return a thumbnail URL, trying the German article first, then the English one."""
+    for lang in ("de", "en"):
+        if lang in titles:
+            img_url = await _fetch_wp_image(titles[lang], lang, client)
+            if img_url:
+                return img_url
+    return None
+
+
+async def _store_photo(qid: str, titles: dict, row: dict, client: httpx.AsyncClient) -> bool:
+    """Make sure a local photo exists for one breed and link it in the DB.
+
+    Returns True if the breed row now points to a local photo.
+    """
+    local_path = os.path.join(BREEDS_DIR, f"{qid}.jpg")
+    local_url  = f"/media/breeds/{qid}.jpg"
+
+    # A non-empty file already on disk only needs to be linked in the DB;
+    # checking first avoids a needless Wikipedia API request.
+    if os.path.isfile(local_path) and os.path.getsize(local_path) > 0:
+        _link_local_photo(row["id"], local_url)
+        return True
+
+    img_url = await _resolve_image_url(titles, client)
+    if not img_url:
+        logger.debug(f"Kein WP-Bild für {row['name']} ({qid})")
+        return False
+
+    if not await _download_image(img_url, local_path, client):
+        logger.debug(f"Download fehlgeschlagen: {row['name']}")
+        return False
+
+    _link_local_photo(row["id"], local_url)
+    return True
+
+
+async def fetch_wikipedia_photos() -> int:
+    """
+    Main entry point: fetch Wikipedia photos for all breeds without foto_url.
+
+    Only rows whose external_id has the form 'wd_Q<digits>' are considered.
+    Returns the number of breeds that were linked to a local photo.
+    """
+    os.makedirs(BREEDS_DIR, exist_ok=True)
+
+    with db() as conn:
+        rows = conn.execute("""
+            SELECT id, external_id, name
+            FROM wiki_rassen
+            WHERE (foto_url IS NULL OR foto_url = '')
+              AND external_id LIKE 'wd_%'
+        """).fetchall()
+
+    if not rows:
+        logger.info("Wikipedia-Fotos: nichts zu tun")
+        return 0
+
+    logger.info(f"Wikipedia-Fotos: {len(rows)} Rassen ohne Foto")
+
+    # Map QID → DB row, e.g. {'Q12345': {'id': 1, 'external_id': 'wd_Q12345', 'name': '...'}}
+    qid_map = {}
+    for row in rows:
+        qid = _qid_from_ext(row["external_id"])
+        if qid:
+            qid_map[qid] = dict(row)
+
+    qids = list(qid_map)
+    saved = 0
+
+    async with httpx.AsyncClient(
+        timeout=30,
+        follow_redirects=True,
+        headers=HEADERS
+    ) as client:
+
+        # Fetch the sitelinks in batches
+        sitelinks: dict[str, dict] = {}
+        for i in range(0, len(qids), BATCH_SIZE):
+            batch = qids[i:i + BATCH_SIZE]
+            chunk = await _fetch_sitelinks(batch, client)
+            sitelinks.update(chunk)
+            await asyncio.sleep(0.5)
+            logger.info(f"Sitelinks: {i + len(batch)}/{len(qids)} abgefragt, {len(sitelinks)} mit WP-Link")
+
+        logger.info(f"Wikipedia-Links gefunden: {len(sitelinks)}/{len(qids)}")
+
+        # For every breed with a sitelink: resolve the image and store it
+        for done, (qid, titles) in enumerate(sitelinks.items(), start=1):
+            row = qid_map.get(qid)
+            # Skip ids in the response that were never requested.
+            if row is not None and await _store_photo(qid, titles, row, client):
+                saved += 1
+
+            # Progress is reported for every 50th processed breed, regardless
+            # of whether that breed succeeded, failed or was skipped.
+            if done % 50 == 0:
+                logger.info(f"Wikipedia-Fotos: {saved}/{done} bisher")
+
+    logger.info(f"Wikipedia-Fotos gespeichert: {saved}/{len(sitelinks)} (mit WP-Link)")
+    return saved