banyaro/backend/scraper/wikipedia_photos.py
rene 32d630d5a1 Sprint 11b: Wiki-Foto-Einreichungen + Wikipedia-Foto-Scraper
- User können Fotos für Rassen vorschlagen (Upload-Modal in Rassen-Detail)
- Mod/Admin-Review-Tab im Wiki mit Freischalten/Ablehnen + Push-Notification
- wikipedia_photos.py: holt Fotos über Wikidata-QID → Wikipedia-API
- Foto-Status: 578 lokal, 186 extern, 238 ohne Foto
- DB: wiki_foto_submissions Tabelle
- SW by-v90
2026-04-15 22:01:58 +02:00

198 lines
6.7 KiB
Python

"""
Holt Fotos für Wikidata-Rassen ohne Bild über die Wikipedia-API.
Strategie:
1. Wikidata-API: QID → Wikipedia-Artikel-Titel (DE bevorzugt, Fallback EN)
2. Wikipedia pageimages-API: Artikel-Titel → Bild-URL
3. Wikimedia Commons: Bild herunterladen und lokal speichern
"""
import asyncio
import logging
import os
import re
import httpx
from database import db
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Root directory for media files; overridable via the MEDIA_DIR environment variable.
MEDIA_DIR = os.getenv("MEDIA_DIR", "/data/media")
# Directory in which downloaded breed photos are stored.
BREEDS_DIR = os.path.join(MEDIA_DIR, "breeds")
# MediaWiki API endpoints: Wikidata (QID -> sitelinks) and the German/English
# Wikipedias (article title -> page image).
WIKIDATA_API = "https://www.wikidata.org/w/api.php"
WP_DE_API = "https://de.wikipedia.org/w/api.php"
WP_EN_API = "https://en.wikipedia.org/w/api.php"
# Default headers of the scraper's HTTP client (identifies the bot to the APIs).
HEADERS = {"User-Agent": "BanYaro/1.0 (https://banyaro.app; contact@banyaro.app)"}
BATCH_SIZE = 50 # Wikidata API allows at most 50 IDs per request
SLEEP_MS = 0.35 # 350 ms pause before each download attempt; NOTE: the value is in seconds (it is passed to asyncio.sleep) despite the "_MS" suffix
def _qid_from_ext(ext_id: str) -> str | None:
"""Extrahiert QID aus external_id wie 'wd_Q12345''Q12345'."""
m = re.match(r"wd_(Q\d+)$", ext_id)
return m.group(1) if m else None
async def _fetch_sitelinks(qids: list[str], client: httpx.AsyncClient) -> dict[str, dict]:
    """Look up the German/English Wikipedia article titles for Wikidata items.

    Args:
        qids: Wikidata QIDs, sent in a single ``wbgetentities`` request.
        client: HTTP client used for the request.

    Returns:
        ``{qid: {'de': title_de, 'en': title_en}}`` holding only the languages
        that exist, and only entities with at least one of the two sitelinks.
        Empty for empty input and on any request or API failure; failures are
        logged, never raised.
    """
    if not qids:
        return {}
    params = {
        "action": "wbgetentities",
        "ids": "|".join(qids),
        "props": "sitelinks",
        "sitefilter": "dewiki|enwiki",
        "format": "json",
    }
    try:
        r = await client.get(WIKIDATA_API, params=params)
        r.raise_for_status()
        data = r.json()
    except Exception as e:
        logger.warning(f"Wikidata sitelinks Fehler: {e}")
        return {}
    if not isinstance(data, dict):
        logger.warning(f"Wikidata sitelinks Fehler: unerwartete Antwort ({type(data).__name__})")
        return {}
    # The MediaWiki API can report a failed request inside the JSON body (an
    # "error" object) instead of via the HTTP status, so raise_for_status()
    # does not catch it. Log it rather than silently returning "no sitelinks".
    if "error" in data:
        logger.warning(f"Wikidata sitelinks Fehler: {data['error']}")
        return {}
    wanted = (("de", "dewiki"), ("en", "enwiki"))
    result = {}
    for qid, entity in data.get("entities", {}).items():
        sitelinks = entity.get("sitelinks", {})
        titles = {lang: sitelinks[site]["title"] for lang, site in wanted if site in sitelinks}
        if titles:
            result[qid] = titles
    return result
async def _fetch_wp_image(title: str, lang: str, client: httpx.AsyncClient) -> str | None:
    """Return the page-image thumbnail URL of a Wikipedia article.

    Args:
        title: Title of the article to look up.
        lang: ``"de"`` queries the German Wikipedia; any other value queries
            the English one.
        client: HTTP client used for the request.

    Returns:
        The first non-empty thumbnail URL in the response (requested with
        ``pithumbsize=600``), or ``None`` if there is none or the request
        fails. Failures are logged at debug level, never raised.
    """
    endpoint = WP_EN_API
    if lang == "de":
        endpoint = WP_DE_API
    query = dict(
        action="query",
        titles=title,
        prop="pageimages",
        pithumbsize=600,
        format="json",
    )
    try:
        response = await client.get(endpoint, params=query)
        response.raise_for_status()
        payload = response.json()
        page_map = payload.get("query", {}).get("pages", {})
        thumbs = (page.get("thumbnail", {}).get("source") for page in page_map.values())
        return next((url for url in thumbs if url), None)
    except Exception as e:
        logger.debug(f"WP pageimage Fehler ({lang}/{title}): {e}")
    return None
async def _download_image(url: str, path: str, client: httpx.AsyncClient) -> bool:
    """Download an image and store it at ``path``.

    Makes up to two attempts, pausing ``SLEEP_MS`` seconds before each one and
    backing off after an HTTP 429. Only a 200 response whose content type
    starts with ``image`` is accepted.

    The data is first written to a temporary sibling file and then moved into
    place with ``os.replace``. ``fetch_wikipedia_photos`` treats an existing
    file at ``path`` as a finished download, so a truncated file must never be
    left there.

    Args:
        url: Image URL to fetch.
        path: Destination file path.
        client: HTTP client used for the request.

    Returns:
        ``True`` if the image was saved, ``False`` otherwise. Errors are
        logged at debug level, never raised.
    """
    max_attempts = 2
    tmp_path = f"{path}.part"
    for attempt in range(max_attempts):
        try:
            await asyncio.sleep(SLEEP_MS)
            r = await client.get(url)
            is_image = r.headers.get("content-type", "").startswith("image")
            if r.status_code == 200 and is_image:
                with open(tmp_path, "wb") as f:
                    f.write(r.content)
                os.replace(tmp_path, path)
                return True
            # Back off only when another attempt follows; sleeping after the
            # last attempt would just delay returning False.
            if r.status_code == 429 and attempt + 1 < max_attempts:
                await asyncio.sleep(15 * (attempt + 1))
        except Exception as e:
            logger.debug(f"Download Fehler {url}: {e}")
            # Best-effort cleanup of a partially written temp file.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
    return False
def _store_foto_url(row_id, local_url: str) -> None:
    """Write ``local_url`` into ``wiki_rassen.foto_url`` for the row ``row_id``."""
    with db() as conn:
        conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?", (local_url, row_id))


async def _save_breed_photo(qid: str, titles: dict, row: dict, client: httpx.AsyncClient) -> bool:
    """Find, download and register the Wikipedia photo of one breed.

    Returns ``True`` when ``foto_url`` was set for the breed, else ``False``.
    """
    lang = "de" if "de" in titles else "en"
    img_url = await _fetch_wp_image(titles[lang], lang, client)
    # Second attempt with the English article if the German one has no image.
    if not img_url and lang == "de" and "en" in titles:
        img_url = await _fetch_wp_image(titles["en"], "en", client)
    if not img_url:
        logger.debug(f"Kein WP-Bild für {row['name']} ({qid})")
        return False

    # NOTE: the file is always named "<qid>.jpg"; the actual image format of
    # the thumbnail is not inspected here.
    local_path = os.path.join(BREEDS_DIR, f"{qid}.jpg")
    local_url = f"/media/breeds/{qid}.jpg"
    # A file that already exists is reused: only the DB row needs updating.
    if not os.path.exists(local_path):
        if not await _download_image(img_url, local_path, client):
            logger.debug(f"Download fehlgeschlagen: {row['name']}")
            return False
    _store_foto_url(row["id"], local_url)
    return True


async def fetch_wikipedia_photos() -> int:
    """Fetch Wikipedia photos for all Wikidata breeds that have no ``foto_url``.

    Steps: read the affected ``wiki_rassen`` rows, resolve their QIDs to
    Wikipedia article titles in batches of ``BATCH_SIZE``, then for each breed
    with a sitelink look up the page image, store it under ``BREEDS_DIR`` and
    write the local URL back to the database.

    Returns:
        Number of breeds whose ``foto_url`` was set.
    """
    os.makedirs(BREEDS_DIR, exist_ok=True)
    with db() as conn:
        rows = conn.execute("""
            SELECT id, external_id, name
            FROM wiki_rassen
            WHERE (foto_url IS NULL OR foto_url = '')
              AND external_id LIKE 'wd_%'
        """).fetchall()
    if not rows:
        logger.info("Wikipedia-Fotos: nichts zu tun")
        return 0
    logger.info(f"Wikipedia-Fotos: {len(rows)} Rassen ohne Foto")

    # Map QID -> DB row, e.g. {'Q12345': {'id': 1, 'external_id': 'wd_Q12345', 'name': '...'}}.
    # Rows whose external_id is not a well-formed "wd_Q<digits>" are dropped.
    qid_map = {}
    for row in rows:
        qid = _qid_from_ext(row["external_id"])
        if qid:
            qid_map[qid] = dict(row)
    qids = list(qid_map)
    saved = 0
    async with httpx.AsyncClient(
        timeout=30,
        follow_redirects=True,
        headers=HEADERS,
    ) as client:
        # Fetch the sitelinks in batches.
        sitelinks: dict[str, dict] = {}
        for i in range(0, len(qids), BATCH_SIZE):
            batch = qids[i:i + BATCH_SIZE]
            chunk = await _fetch_sitelinks(batch, client)
            sitelinks.update(chunk)
            await asyncio.sleep(0.5)
            logger.info(f"Sitelinks: {i + len(batch)}/{len(qids)} abgefragt, {len(sitelinks)} mit WP-Link")
        logger.info(f"Wikipedia-Links gefunden: {len(sitelinks)}/{len(qids)}")

        # For every breed with a sitelink: look up the image and download it.
        for idx, (qid, titles) in enumerate(sitelinks.items()):
            row = qid_map.get(qid)
            if row is None:
                # The response contained an entity key that was not requested
                # (presumably possible when Wikidata resolves a redirect;
                # unverified). Skip it instead of aborting the run.
                logger.debug(f"Unbekannte QID in Wikidata-Antwort: {qid}")
            elif await _save_breed_photo(qid, titles, row, client):
                saved += 1
            # Runs for every item, including skipped and failed ones, so the
            # progress log appears at a steady interval.
            if idx % 50 == 0 and idx > 0:
                logger.info(f"Wikipedia-Fotos: {saved}/{idx + 1} bisher")
    logger.info(f"Wikipedia-Fotos gespeichert: {saved}/{len(sitelinks)} (mit WP-Link)")
    return saved