""" Holt Fotos für Wikidata-Rassen ohne Bild über die Wikipedia-API. Strategie: 1. Wikidata-API: QID → Wikipedia-Artikel-Titel (DE bevorzugt, Fallback EN) 2. Wikipedia pageimages-API: Artikel-Titel → Bild-URL 3. Wikimedia Commons: Bild herunterladen und lokal speichern """ import asyncio import logging import os import re import httpx from database import db logger = logging.getLogger(__name__) MEDIA_DIR = os.getenv("MEDIA_DIR", "/data/media") BREEDS_DIR = os.path.join(MEDIA_DIR, "breeds") WIKIDATA_API = "https://www.wikidata.org/w/api.php" WP_DE_API = "https://de.wikipedia.org/w/api.php" WP_EN_API = "https://en.wikipedia.org/w/api.php" HEADERS = {"User-Agent": "BanYaro/1.0 (https://banyaro.app; contact@banyaro.app)"} BATCH_SIZE = 50 # Wikidata API erlaubt max 50 IDs pro Request SLEEP_MS = 0.35 # 350ms zwischen Downloads def _qid_from_ext(ext_id: str) -> str | None: """Extrahiert QID aus external_id wie 'wd_Q12345' → 'Q12345'.""" m = re.match(r"wd_(Q\d+)$", ext_id) return m.group(1) if m else None async def _fetch_sitelinks(qids: list[str], client: httpx.AsyncClient) -> dict[str, dict]: """ Gibt {qid: {'de': 'Titel_DE', 'en': 'Titel_EN'}} zurück für alle QIDs, die mindestens einen Wikipedia-Sitelink haben. """ if not qids: return {} try: r = await client.get(WIKIDATA_API, params={ "action": "wbgetentities", "ids": "|".join(qids), "props": "sitelinks", "sitefilter": "dewiki|enwiki", "format": "json", }) r.raise_for_status() data = r.json() except Exception as e: logger.warning(f"Wikidata sitelinks Fehler: {e}") return {} result = {} for qid, entity in data.get("entities", {}).items(): sitelinks = entity.get("sitelinks", {}) titles = {} if "dewiki" in sitelinks: titles["de"] = sitelinks["dewiki"]["title"] if "enwiki" in sitelinks: titles["en"] = sitelinks["enwiki"]["title"] if titles: result[qid] = titles return result async def _fetch_wp_image(title: str, lang: str, client: httpx.AsyncClient) -> str | None: """ Gibt die Thumbnail-URL eines Wikipedia-Artikels zurück (600px-Version). """ api = WP_DE_API if lang == "de" else WP_EN_API try: r = await client.get(api, params={ "action": "query", "titles": title, "prop": "pageimages", "pithumbsize": 600, "format": "json", }) r.raise_for_status() pages = r.json().get("query", {}).get("pages", {}) for page in pages.values(): thumb = page.get("thumbnail", {}).get("source") if thumb: return thumb except Exception as e: logger.debug(f"WP pageimage Fehler ({lang}/{title}): {e}") return None async def _download_image(url: str, path: str, client: httpx.AsyncClient) -> bool: """Lädt Bild herunter, speichert unter path. True bei Erfolg.""" for attempt in range(2): try: await asyncio.sleep(SLEEP_MS) r = await client.get(url) if r.status_code == 200 and r.headers.get("content-type", "").startswith("image"): with open(path, "wb") as f: f.write(r.content) return True if r.status_code == 429: await asyncio.sleep(15 * (attempt + 1)) except Exception as e: logger.debug(f"Download Fehler {url}: {e}") return False async def fetch_wikipedia_photos() -> int: """ Haupt-Funktion: Holt Wikipedia-Fotos für alle Rassen ohne foto_url. Gibt Anzahl erfolgreich gespeicherter Fotos zurück. """ os.makedirs(BREEDS_DIR, exist_ok=True) with db() as conn: rows = conn.execute(""" SELECT id, external_id, name FROM wiki_rassen WHERE (foto_url IS NULL OR foto_url = '') AND external_id LIKE 'wd_%' """).fetchall() if not rows: logger.info("Wikipedia-Fotos: nichts zu tun") return 0 logger.info(f"Wikipedia-Fotos: {len(rows)} Rassen ohne Foto") # QID → DB-Row mappen qid_map = {} # { 'Q12345': {'id': 1, 'external_id': 'wd_Q12345', 'name': '...'} } for row in rows: qid = _qid_from_ext(row["external_id"]) if qid: qid_map[qid] = dict(row) qids = list(qid_map.keys()) saved = 0 async with httpx.AsyncClient( timeout=30, follow_redirects=True, headers=HEADERS ) as client: # Sitelinks in Batches holen sitelinks: dict[str, dict] = {} for i in range(0, len(qids), BATCH_SIZE): batch = qids[i:i + BATCH_SIZE] chunk = await _fetch_sitelinks(batch, client) sitelinks.update(chunk) await asyncio.sleep(0.5) logger.info(f"Sitelinks: {i + len(batch)}/{len(qids)} abgefragt, {len(sitelinks)} mit WP-Link") logger.info(f"Wikipedia-Links gefunden: {len(sitelinks)}/{len(qids)}") # Für jeden mit Sitelink → Bild holen + herunterladen for idx, (qid, titles) in enumerate(sitelinks.items()): row = qid_map[qid] row_id = row["id"] lang = "de" if "de" in titles else "en" title = titles[lang] img_url = await _fetch_wp_image(title, lang, client) if not img_url: # Zweiter Versuch mit EN wenn DE kein Bild hat if lang == "de" and "en" in titles: img_url = await _fetch_wp_image(titles["en"], "en", client) if not img_url: logger.debug(f"Kein WP-Bild für {row['name']} ({qid})") continue local_path = os.path.join(BREEDS_DIR, f"{qid}.jpg") local_url = f"/media/breeds/{qid}.jpg" if os.path.exists(local_path): # Datei existiert bereits → nur DB updaten with db() as conn: conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?", (local_url, row_id)) saved += 1 continue ok = await _download_image(img_url, local_path, client) if ok: with db() as conn: conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?", (local_url, row_id)) saved += 1 else: logger.debug(f"Download fehlgeschlagen: {row['name']}") if idx % 50 == 0 and idx > 0: logger.info(f"Wikipedia-Fotos: {saved}/{idx + 1} bisher") logger.info(f"Wikipedia-Fotos gespeichert: {saved}/{len(sitelinks)} (mit WP-Link)") return saved