- User können Fotos für Rassen vorschlagen (Upload-Modal in Rassen-Detail) - Mod/Admin-Review-Tab im Wiki mit Freischalten/Ablehnen + Push-Notification - wikipedia_photos.py: holt Fotos über Wikidata-QID → Wikipedia-API - Foto-Status: 578 lokal, 186 extern, 238 ohne Foto - DB: wiki_foto_submissions Tabelle - SW by-v90
198 lines
6.7 KiB
Python
198 lines
6.7 KiB
Python
"""
|
|
Holt Fotos für Wikidata-Rassen ohne Bild über die Wikipedia-API.
|
|
|
|
Strategie:
|
|
1. Wikidata-API: QID → Wikipedia-Artikel-Titel (DE bevorzugt, Fallback EN)
|
|
2. Wikipedia pageimages-API: Artikel-Titel → Bild-URL
|
|
3. Wikimedia Commons: Bild herunterladen und lokal speichern
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import re
|
|
import httpx
|
|
|
|
from database import db
|
|
|
|
logger = logging.getLogger(__name__)

# Root directory for stored media; can be overridden with the MEDIA_DIR
# environment variable.
MEDIA_DIR = os.getenv("MEDIA_DIR", "/data/media")
# Sub-directory of MEDIA_DIR that receives the downloaded breed photos.
BREEDS_DIR = os.path.join(MEDIA_DIR, "breeds")

# API endpoints: Wikidata for sitelinks, German/English Wikipedia for images.
WIKIDATA_API = "https://www.wikidata.org/w/api.php"
WP_DE_API = "https://de.wikipedia.org/w/api.php"
WP_EN_API = "https://en.wikipedia.org/w/api.php"

# Default headers for every outgoing request (identifying User-Agent).
HEADERS = {"User-Agent": "BanYaro/1.0 (https://banyaro.app; contact@banyaro.app)"}

BATCH_SIZE = 50  # Wikidata API allows at most 50 IDs per request

# Pause before each image download. NOTE: despite the "_MS" suffix the value
# is expressed in seconds (0.35 s = 350 ms) because it is passed directly to
# asyncio.sleep(); the name is kept for backward compatibility.
SLEEP_MS = 0.35
|
|
|
|
|
|
def _qid_from_ext(ext_id: str) -> str | None:
|
|
"""Extrahiert QID aus external_id wie 'wd_Q12345' → 'Q12345'."""
|
|
m = re.match(r"wd_(Q\d+)$", ext_id)
|
|
return m.group(1) if m else None
|
|
|
|
|
|
async def _fetch_sitelinks(qids: list[str], client: httpx.AsyncClient) -> dict[str, dict]:
    """Look up the German/English Wikipedia article titles for Wikidata items.

    Args:
        qids: Wikidata QIDs to query in a single request.
        client: HTTP client used for the request.

    Returns:
        ``{qid: {'de': 'Title_DE', 'en': 'Title_EN'}}`` containing only the
        entities that have at least one of the two sitelinks; each inner dict
        holds only the languages that exist. An empty dict is returned for
        empty input and on any request/decoding/API failure (best effort; the
        failure is logged as a warning).
    """
    if not qids:
        return {}
    try:
        r = await client.get(WIKIDATA_API, params={
            "action": "wbgetentities",
            "ids": "|".join(qids),
            "props": "sitelinks",
            "sitefilter": "dewiki|enwiki",
            "format": "json",
        })
        r.raise_for_status()
        data = r.json()
    except Exception as e:
        logger.warning(f"Wikidata sitelinks Fehler: {e}")
        return {}

    # A failure can arrive as HTTP 200 with an "error" object in the payload.
    # Without this check it would be indistinguishable from "no sitelinks".
    if isinstance(data, dict) and "error" in data:
        err = data["error"]
        info = err.get("info", err) if isinstance(err, dict) else err
        logger.warning(f"Wikidata sitelinks Fehler: {info}")
        return {}

    result: dict[str, dict] = {}
    for qid, entity in data.get("entities", {}).items():
        sitelinks = entity.get("sitelinks", {})
        # Insertion order (de before en) matches the lookup preference.
        titles = {
            lang: sitelinks[site]["title"]
            for lang, site in (("de", "dewiki"), ("en", "enwiki"))
            if site in sitelinks
        }
        if titles:
            result[qid] = titles
    return result
|
|
|
|
|
|
async def _fetch_wp_image(title: str, lang: str, client: httpx.AsyncClient) -> str | None:
    """Return the thumbnail URL of a Wikipedia article's page image.

    Args:
        title: Article title.
        lang: ``'de'`` selects the German Wikipedia; any other value selects
            the English one.
        client: HTTP client used for the request.

    Returns:
        URL of the thumbnail (requested at 600px), or ``None`` when the
        article has no page image or the request fails (failures are logged
        at debug level).
    """
    api = WP_DE_API if lang == "de" else WP_EN_API
    try:
        r = await client.get(api, params={
            "action": "query",
            "titles": title,
            "prop": "pageimages",
            "pithumbsize": 600,
            # Follow redirects: a sitelink may point at a redirect page,
            # which itself carries no page image.
            "redirects": 1,
            "format": "json",
        })
        r.raise_for_status()
        pages = r.json().get("query", {}).get("pages", {})
        for page in pages.values():
            thumb = page.get("thumbnail", {}).get("source")
            if thumb:
                return thumb
    except Exception as e:
        logger.debug(f"WP pageimage Fehler ({lang}/{title}): {e}")
    return None
|
|
|
|
|
|
async def _download_image(url: str, path: str, client: httpx.AsyncClient) -> bool:
    """Download an image and store it at ``path``.

    Makes up to two attempts. Each attempt is preceded by a short pause
    (``SLEEP_MS``); an HTTP 429 response triggers an additional back-off
    before the retry.

    Args:
        url: Image URL to fetch.
        path: Destination file path.
        client: HTTP client used for the request.

    Returns:
        ``True`` if a non-empty response with an ``image/*`` content type was
        written to ``path``, otherwise ``False``.
    """
    attempts = 2
    # Write to a sibling temp file and rename afterwards, so an interrupted
    # write never leaves a truncated file at ``path`` (callers treat an
    # existing file as a valid photo).
    tmp_path = f"{path}.part"
    for attempt in range(attempts):
        try:
            await asyncio.sleep(SLEEP_MS)
            r = await client.get(url)
            is_image = r.headers.get("content-type", "").startswith("image")
            if r.status_code == 200 and is_image and r.content:
                with open(tmp_path, "wb") as f:
                    f.write(r.content)
                os.replace(tmp_path, path)
                return True
            # Back off only if another attempt follows; sleeping after the
            # final attempt would just delay the caller.
            if r.status_code == 429 and attempt < attempts - 1:
                await asyncio.sleep(15 * (attempt + 1))
        except Exception as e:
            logger.debug(f"Download Fehler {url}: {e}")
            try:
                os.remove(tmp_path)
            except OSError:
                # Nothing to clean up (the temp file was never created).
                pass
    return False
|
|
|
|
|
|
async def fetch_wikipedia_photos() -> int:
    """Fetch Wikipedia photos for all breeds that have no ``foto_url`` yet.

    Selects the rows of ``wiki_rassen`` with an empty ``foto_url`` and a
    Wikidata external id, resolves their Wikipedia sitelinks in batches of
    ``BATCH_SIZE``, then tries to obtain and register a photo for each breed
    that has a sitelink.

    Returns:
        Number of breeds whose ``foto_url`` was set during this run.
    """
    os.makedirs(BREEDS_DIR, exist_ok=True)

    with db() as conn:
        rows = conn.execute("""
            SELECT id, external_id, name
            FROM wiki_rassen
            WHERE (foto_url IS NULL OR foto_url = '')
              AND external_id LIKE 'wd_%'
        """).fetchall()

    if not rows:
        logger.info("Wikipedia-Fotos: nichts zu tun")
        return 0

    logger.info(f"Wikipedia-Fotos: {len(rows)} Rassen ohne Foto")

    # Map QID -> DB row,
    # e.g. {'Q12345': {'id': 1, 'external_id': 'wd_Q12345', 'name': '...'}}
    qid_map = {}
    for row in rows:
        qid = _qid_from_ext(row["external_id"])
        if qid:
            qid_map[qid] = dict(row)

    qids = list(qid_map)
    saved = 0

    async with httpx.AsyncClient(
        timeout=30,
        follow_redirects=True,
        headers=HEADERS,
    ) as client:

        # Fetch the sitelinks in batches.
        sitelinks: dict[str, dict] = {}
        for i in range(0, len(qids), BATCH_SIZE):
            batch = qids[i:i + BATCH_SIZE]
            chunk = await _fetch_sitelinks(batch, client)
            sitelinks.update(chunk)
            await asyncio.sleep(0.5)
            logger.info(f"Sitelinks: {i + len(batch)}/{len(qids)} abgefragt, {len(sitelinks)} mit WP-Link")

        logger.info(f"Wikipedia-Links gefunden: {len(sitelinks)}/{len(qids)}")

        # For every breed with a sitelink: locate the image and store it.
        for done, (qid, titles) in enumerate(sitelinks.items(), start=1):
            # .get() instead of indexing: an entity key that was not
            # requested must not abort the whole run with a KeyError.
            row = qid_map.get(qid)
            if row is not None and await _store_breed_photo(qid, titles, row, client):
                saved += 1

            # Runs for every item (no early `continue` can skip it), so
            # progress is reported reliably every 50 breeds.
            if done % 50 == 0:
                logger.info(f"Wikipedia-Fotos: {saved}/{done} bisher")

    logger.info(f"Wikipedia-Fotos gespeichert: {saved}/{len(sitelinks)} (mit WP-Link)")
    return saved


async def _store_breed_photo(qid: str, titles: dict, row: dict, client: httpx.AsyncClient) -> bool:
    """Find, download and register the photo for a single breed.

    Args:
        qid: Wikidata QID of the breed; also used as the local file name.
        titles: Article titles keyed by ``'de'`` and/or ``'en'``.
        row: DB row of the breed (``id`` and ``name`` are read).
        client: HTTP client used for all requests.

    Returns:
        ``True`` if ``foto_url`` was written for the breed, ``False`` if no
        image was found or the download failed.
    """
    lang = "de" if "de" in titles else "en"
    img_url = await _fetch_wp_image(titles[lang], lang, client)
    # Second attempt with the English article if the German one has no image.
    if not img_url and lang == "de" and "en" in titles:
        img_url = await _fetch_wp_image(titles["en"], "en", client)

    if not img_url:
        logger.debug(f"Kein WP-Bild für {row['name']} ({qid})")
        return False

    local_path = os.path.join(BREEDS_DIR, f"{qid}.jpg")
    local_url = f"/media/breeds/{qid}.jpg"

    # If the file already exists, only the DB entry needs updating.
    if not os.path.exists(local_path):
        if not await _download_image(img_url, local_path, client):
            logger.debug(f"Download fehlgeschlagen: {row['name']}")
            return False

    with db() as conn:
        conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?", (local_url, row["id"]))
    return True
|