"""
BAN YARO — fetch missing breed photos from Wikipedia/Wikimedia.

Strategy:
  1. Select all breeds without a foto_url from wiki_rassen.
  2. Per breed: Wikipedia pageimages API (German first, English fallback).
  3. Last fallback: Wikimedia Commons pageimages API.
  4. Filter out useless images (SVG, flag icons, maps, logos).
  5. Store the URL directly in wiki_rassen.foto_url.

CLI options:
  --limit N     Process only N breeds (default: 100)
  --dry-run     Only show what would be stored, do not save
  --model NAME  Claude model for possible future text tasks
                (default: claude-sonnet-4-6); parsed but currently unused
"""

import argparse
import asyncio
import logging
import os
import sys

import httpx

# Make the directory one level above this file's directory importable so
# that the `database` module below resolves when run as a script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from database import db

logger = logging.getLogger(__name__)

_WP_HEADERS = {
    "User-Agent": "Banyaro/1.0 (https://banyaro.de; mail@banyaro.de) httpx/Python"
}
_THUMB_SIZE = 600

# File-name fragments that indicate an unusable image.
_SKIP_PATTERNS = (
    ".svg",
    "flag_of_",
    "coat_of_arms",
    "emblem_of_",
    "location_map",
    "orthographic_projection",
    "locator_map",
    "blank_map",
    "wikimedia-logo",
    "commons-logo",
    "question_mark",
    "noimage",
)


def _is_usable(url: str) -> bool:
    """Return True if the image URL contains none of the skip patterns.

    The comparison is case-insensitive. URLs ending in ``.svg`` are rejected
    as well, because ``".svg"`` is itself one of the substring patterns.
    """
    low = url.lower()
    return not any(pattern in low for pattern in _SKIP_PATTERNS)


async def _query_pageimages(
    api_url: str, name: str, client: httpx.AsyncClient
) -> str | None:
    """Query a MediaWiki ``pageimages`` endpoint for the page titled `name`.

    Args:
        api_url: Full URL of the wiki's ``api.php``.
        name: Page title to look up.
        client: Shared HTTP client used for the request.

    Returns:
        The first usable thumbnail URL, or None if no page in the response
        has one.

    Raises:
        Whatever the HTTP request, status check or JSON decoding raises;
        callers decide how to handle failures.
    """
    resp = await client.get(
        api_url,
        params={
            "action": "query",
            "titles": name,
            "prop": "pageimages",
            "format": "json",
            "pithumbsize": _THUMB_SIZE,
            # Resolve title redirects on every wiki, Commons included.
            "redirects": 1,
        },
    )
    resp.raise_for_status()
    pages = resp.json().get("query", {}).get("pages", {})
    for page in pages.values():
        # Entries without a real page id are skipped.
        if page.get("pageid", -1) == -1:
            continue
        thumb = page.get("thumbnail", {}).get("source", "")
        if thumb and _is_usable(thumb):
            return thumb
    return None


async def _fetch_wp_image(name: str, lang: str, client: httpx.AsyncClient) -> str | None:
    """Ask the Wikipedia pageimages API of language `lang` for `name`.

    Returns:
        The thumbnail URL, or None if nothing usable was found or the
        request failed. Failures are logged and never raised.
    """
    try:
        return await _query_pageimages(
            f"https://{lang}.wikipedia.org/w/api.php", name, client
        )
    except Exception as exc:
        # Best-effort lookup: one failed request must not abort the run.
        # Logged at WARNING so that network/API errors stay visible at the
        # INFO level the script configures.
        logger.warning("WP pageimages (%s/%s) Fehler: %s", lang, name, exc)
        return None


async def _fetch_commons_image(name: str, client: httpx.AsyncClient) -> str | None:
    """Ask the Wikimedia Commons pageimages API for `name`.

    Used as the last fallback after the Wikipedia lookups.

    Returns:
        The thumbnail URL, or None if nothing usable was found or the
        request failed. Failures are logged and never raised.
    """
    try:
        return await _query_pageimages(
            "https://commons.wikimedia.org/w/api.php", name, client
        )
    except Exception as exc:
        # Best-effort lookup, see _fetch_wp_image.
        logger.warning("Commons pageimages (%s) Fehler: %s", name, exc)
        return None


def _search_candidates(name: str, name_de: str) -> list[tuple[str, str]]:
    """Build the ordered (title, wiki language) pairs to try on Wikipedia.

    Order: German name on de.wikipedia, `name` on en.wikipedia, then the
    German name on en.wikipedia. The last pair is omitted when it would
    repeat the second one.
    """
    candidates: list[tuple[str, str]] = []
    if name_de:
        candidates.append((name_de, "de"))
    candidates.append((name, "en"))
    if name_de and name_de != name:
        candidates.append((name_de, "en"))
    return candidates


async def fetch_wiki_images(limit: int = 100, dry_run: bool = False) -> dict:
    """Fetch Wikipedia/Commons photos for breeds that have no foto_url.

    Args:
        limit: Maximum number of rows selected from wiki_rassen.
        dry_run: If True, only log the URL that would be stored and leave
            the database untouched.

    Returns:
        {'found': int, 'saved': int, 'missing': int} — breeds for which a
        photo URL was found, rows actually updated, and breeds without a hit.
    """
    with db() as conn:
        rows = conn.execute(
            """SELECT id, name, name_de, slug
               FROM wiki_rassen
               WHERE (foto_url IS NULL OR foto_url = '')
               ORDER BY name ASC
               LIMIT ?""",
            (limit,),
        ).fetchall()

    total = len(rows)
    if total == 0:
        logger.info("Alle Rassen haben bereits ein Foto — nichts zu tun.")
        return {"found": 0, "saved": 0, "missing": 0}

    logger.info("%d Rassen ohne Foto werden verarbeitet (limit=%d).", total, limit)

    found = 0
    saved = 0

    async with httpx.AsyncClient(
        timeout=12,
        follow_redirects=True,
        headers=_WP_HEADERS,
    ) as client:
        for idx, row in enumerate(rows, start=1):
            name = row["name"]
            name_de = row["name_de"] or ""

            foto_url: str | None = None

            # Wikipedia first, in the order given by _search_candidates.
            for search_name, lang in _search_candidates(name, name_de):
                foto_url = await _fetch_wp_image(search_name, lang, client)
                if foto_url:
                    logger.info(
                        "[%d/%d] ✓ %s → WP %s (%s)",
                        idx, total, name, lang.upper(), search_name,
                    )
                    break

            # Last fallback: Wikimedia Commons, queried with `name`.
            if not foto_url:
                foto_url = await _fetch_commons_image(name, client)
                if foto_url:
                    logger.info(
                        "[%d/%d] ✓ %s → Commons", idx, total, name
                    )

            if foto_url:
                found += 1
                if dry_run:
                    logger.info("  [dry-run] würde setzen: %s", foto_url)
                else:
                    try:
                        with db() as conn:
                            conn.execute(
                                "UPDATE wiki_rassen SET foto_url=? WHERE id=?",
                                (foto_url, row["id"]),
                            )
                    except Exception as exc:
                        # A failed write for one breed must not stop the run.
                        logger.error("DB-Update fehlgeschlagen für %s: %s", name, exc)
                    else:
                        saved += 1
            else:
                logger.info("[%d/%d] ✗ %s — kein Foto gefunden", idx, total, name)

            # Rate limit: pause one second between breeds (a single breed may
            # issue several requests). No pause is needed after the last one.
            if idx < total:
                await asyncio.sleep(1.0)

    missing = total - found
    logger.info(
        "Fertig: %d/%d Fotos gefunden, %d gespeichert, %d ohne Treffer.",
        found, total, saved, missing,
    )
    return {"found": found, "saved": saved, "missing": missing}


def main() -> None:
    """Command-line entry point: parse options, run the fetch, print a summary."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s  %(levelname)s  %(message)s",
        datefmt="%H:%M:%S",
    )

    parser = argparse.ArgumentParser(
        description="Fehlende Rassen-Fotos von Wikipedia/Wikimedia holen"
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=100,
        metavar="N",
        help="Maximale Anzahl Rassen (Default: 100)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Nur anzeigen, nicht in DB speichern",
    )
    # Accepted so existing invocations keep working; the value is not used
    # anywhere in this script.
    parser.add_argument(
        "--model",
        default="claude-sonnet-4-6",
        metavar="MODEL",
        help="Claude-Modell für Text-Tasks (Default: claude-sonnet-4-6)",
    )
    args = parser.parse_args()

    if args.dry_run:
        logger.info("DRY-RUN Modus — keine DB-Änderungen.")

    result = asyncio.run(fetch_wiki_images(limit=args.limit, dry_run=args.dry_run))
    print(
        f"\nErgebnis: {result['found']} gefunden, "
        f"{result['saved']} gespeichert, "
        f"{result['missing']} ohne Treffer."
    )


if __name__ == "__main__":
    main()