_commons_search() und deren Aufruf (Stufe 4+5) entfernt. Nur WP pageimages DE/EN + Commons pageimages (exakter Treffer) bleiben. urllib.parse Import entfernt.
298 lines
9 KiB
Python
298 lines
9 KiB
Python
"""
BAN YARO — Fehlende Rassen-Fotos von Wikipedia/Wikimedia holen

Strategie (in Reihenfolge):
  1. Wikipedia pageimages DE (exakter Artikel-Treffer)
  2. Wikipedia pageimages EN
  3. Wikimedia Commons pageimages (exakter Artikel-Treffer)

Alle Bilder werden als externe URLs gespeichert (Wikimedia CDN).
Lizenz: CC-BY-SA (Wikimedia Commons) — Attribution in Wiki-Seite anzeigen.

CLI-Optionen:
  --limit N    Nur N Rassen bearbeiten (Default: 100)
  --dry-run    Nur anzeigen, nicht speichern
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import sys
|
|
|
|
import httpx
|
|
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from database import db
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Headers passed to the shared HTTP client for every API request.
_WP_HEADERS = {
    "User-Agent": "Banyaro/1.0 (https://banyaro.de; mail@banyaro.de) httpx/Python"
}

# Value sent as ``pithumbsize`` to the pageimages API.
_THUMB_SIZE = 600

# Only these image formats are accepted.
_VALID_EXTENSIONS = (".jpg", ".jpeg", ".png", ".webp")

# URL fragments that indicate an unusable image (matched against the
# lower-cased URL).
_SKIP_PATTERNS = (
    # Flags, coats of arms, maps
    "flag_of_", "coat_of_arms", "emblem_of_",
    "location_map", "orthographic_projection", "locator_map", "blank_map",
    # Logos, placeholders, icons
    "wikimedia-logo", "commons-logo",
    "question_mark", "noimage", "placeholder", "silhouette",
    "icon_", "_icon", "logo_", "_logo",
    # Historical documents / books / maps
    "_survey_", "_register_", "_magazine_", "_journal_", "_bulletin_",
    "_catalogue_", "_glossaire_", "_aarbog_", "_museum_", "_tales_",
    "_history_", "_stories_", "_inn_",
    "curiosities", "viviparous", "quadrupeds",
    # Geography / cities
    "spain.jpg", "italy.jpg", "france.jpg", "germany.jpg", "austria.jpg",
    "map_of_",
    # Military / documents
    "military_working", "mod_45", "federal_register",
    "prairie_dog",  # not a dog
)

# Suffixes that are cut off when normalising a breed name.
_BREED_SUFFIXES = (" dog", " hound", " terrier", " spaniel", " shepherd")
|
|
|
|
|
|
def _is_usable(url: str) -> bool:
    """Return True if *url* looks like a usable dog photo URL.

    A URL is accepted when all of the following hold:

    * its file name does not reveal a PDF, DjVu or SVG source file,
    * its file name, or a directory segment of its path, ends with one of
      ``_VALID_EXTENSIONS``,
    * it contains none of the ``_SKIP_PATTERNS`` fragments.

    Matching is case-insensitive. The extension is only looked for as the
    suffix of a path segment, so ".jpg" appearing in a query string or in
    the middle of a file name ("dog.jpg.gif") does not count.

    Args:
        url: Image URL to check.

    Returns:
        ``True`` if the URL passes all checks, ``False`` otherwise
        (including for an empty URL).
    """
    if not url:
        return False

    low = url.lower()
    # Query string and fragment say nothing about the file type.
    path = low.split("?", 1)[0].split("#", 1)[0]
    segments = path.split("/")
    fname = segments[-1]

    # Thumbnails rendered from documents or vector files keep the source
    # extension inside the name, e.g. "page1-500px-foo.pdf.jpg".
    if any(marker in fname for marker in (".pdf", ".djvu", ".svg")):
        return False

    # Thumbnail paths repeat the original file name as a directory
    # (".../foo.jpg/600px-foo.jpg"), so every path segment is checked,
    # but only as a suffix, never as a substring.
    valid = tuple(_VALID_EXTENSIONS)
    if not any(segment.endswith(valid) for segment in segments):
        return False

    return not any(pattern in low for pattern in _SKIP_PATTERNS)
|
|
|
|
|
|
def _name_variants(name: str, name_de: str | None) -> list[str]:
|
|
"""Gibt Suchbegriff-Varianten zurück (dedupliziert, Reihenfolge bleibt)."""
|
|
seen = set()
|
|
result = []
|
|
|
|
def _add(n: str):
|
|
n = n.strip()
|
|
if n and n not in seen:
|
|
seen.add(n)
|
|
result.append(n)
|
|
|
|
_add(name)
|
|
if name_de:
|
|
_add(name_de)
|
|
|
|
# Ohne Klammern-Zusatz: "Foo (Bar)" → "Foo"
|
|
if "(" in name:
|
|
_add(name.split("(")[0].strip())
|
|
|
|
# Bindestrich → Leerzeichen
|
|
_add(name.replace("-", " "))
|
|
|
|
# Suffix abschneiden
|
|
low = name.lower()
|
|
for suf in _BREED_SUFFIXES:
|
|
if low.endswith(suf):
|
|
_add(name[: -len(suf)].strip())
|
|
break
|
|
|
|
return result
|
|
|
|
|
|
async def _wp_pageimages(name: str, lang: str, client: httpx.AsyncClient) -> str | None:
    """Query the Wikipedia pageimages API for the article titled *name*.

    Args:
        name: Article title to look up (redirects are followed).
        lang: Wikipedia language subdomain, e.g. "de" or "en".
        client: HTTP client used for the request.

    Returns:
        The first thumbnail URL accepted by ``_is_usable``, or ``None`` if
        the page does not exist, has no usable thumbnail, or the request
        fails (failures are only logged at debug level).
    """
    try:
        response = await client.get(
            f"https://{lang}.wikipedia.org/w/api.php",
            params={
                "action": "query",
                "titles": name,
                "prop": "pageimages",
                "format": "json",
                "pithumbsize": _THUMB_SIZE,
                "redirects": 1,
            },
        )
        response.raise_for_status()
        pages = response.json().get("query", {}).get("pages", {})
        # Entries without a real page id (missing pages) are skipped.
        thumbnails = (
            entry.get("thumbnail", {}).get("source", "")
            for entry in pages.values()
            if entry.get("pageid", -1) != -1
        )
        return next((url for url in thumbnails if url and _is_usable(url)), None)
    except Exception as exc:
        # Best effort: any failure counts as "no image found".
        logger.debug("WP pageimages (%s/%s): %s", lang, name, exc)
        return None
|
|
|
|
|
|
async def _commons_pageimages(name: str, client: httpx.AsyncClient) -> str | None:
    """Query the Wikimedia Commons pageimages API for the page titled *name*.

    Only the exact title is looked up (no search). Redirects are followed,
    matching the Wikipedia lookup in this module, so a title that is a
    redirect on Commons still yields the image of its target page.

    Args:
        name: Page title to look up.
        client: HTTP client used for the request.

    Returns:
        The first thumbnail URL accepted by ``_is_usable``, or ``None`` if
        the page does not exist, has no usable thumbnail, or the request
        fails (failures are only logged at debug level).
    """
    try:
        resp = await client.get(
            "https://commons.wikimedia.org/w/api.php",
            params={
                "action": "query",
                "titles": name,
                "prop": "pageimages",
                "format": "json",
                "pithumbsize": _THUMB_SIZE,
                # Resolve redirect pages to their target.
                "redirects": 1,
            },
        )
        resp.raise_for_status()
        for page in resp.json().get("query", {}).get("pages", {}).values():
            # Entries without a real page id (missing pages) are skipped.
            if page.get("pageid", -1) == -1:
                continue
            thumb = page.get("thumbnail", {}).get("source", "")
            if thumb and _is_usable(thumb):
                return thumb
    except Exception as exc:
        # Best effort: any failure counts as "no image found".
        logger.debug("Commons pageimages (%s): %s", name, exc)
    return None
|
|
|
|
|
|
async def _find_photo(
    variants: list[str], client: httpx.AsyncClient
) -> tuple[str | None, str]:
    """Try all sources in priority order; return ``(url, source label)``.

    Returns ``(None, "")`` when no source yields a usable image.
    """
    # Stages 1+2: Wikipedia pageimages, German first, then English.
    for lang in ("de", "en"):
        for variant in variants:
            url = await _wp_pageimages(variant, lang, client)
            if url:
                return url, f"WP-{lang.upper()} ({variant})"

    # Stage 3: Commons pageimages (exact title).
    for variant in variants:
        url = await _commons_pageimages(variant, client)
        if url:
            return url, f"Commons-exact ({variant})"

    return None, ""


async def fetch_wiki_images(limit: int = 100, dry_run: bool = False) -> dict:
    """Fetch photos for breeds in ``wiki_rassen`` that have no ``foto_url``.

    For each breed, several name variants are tried against several
    sources; the first usable image URL is written back to the table.

    Args:
        limit: Maximum number of breeds to process.
        dry_run: If true, found URLs are only logged, not stored.

    Returns:
        A dict with the keys ``"found"``, ``"saved"`` and ``"missing"``.
    """
    with db() as conn:
        rows = conn.execute(
            """SELECT id, name, name_de, slug
               FROM wiki_rassen
               WHERE (foto_url IS NULL OR foto_url = '')
               ORDER BY name ASC
               LIMIT ?""",
            (limit,),
        ).fetchall()

    total = len(rows)
    if not total:
        logger.info("Alle Rassen haben bereits ein Foto — nichts zu tun.")
        return {"found": 0, "saved": 0, "missing": 0}

    logger.info("%d Rassen ohne Foto (limit=%d).", total, limit)
    found = 0
    saved = 0

    async with httpx.AsyncClient(
        timeout=15, follow_redirects=True, headers=_WP_HEADERS
    ) as client:
        for idx, record in enumerate(rows, start=1):
            breed = dict(record)
            name = breed["name"]
            variants = _name_variants(name, breed.get("name_de") or None)

            foto_url, source = await _find_photo(variants, client)

            if not foto_url:
                logger.info("[%d/%d] ✗ %s", idx, total, name)
            else:
                found += 1
                logger.info("[%d/%d] ✓ %s → %s", idx, total, name, source)
                if dry_run:
                    logger.info(" [dry-run] %s", foto_url)
                else:
                    # A failed update is logged and does not abort the run.
                    try:
                        with db() as conn:
                            conn.execute(
                                "UPDATE wiki_rassen SET foto_url=? WHERE id=?",
                                (foto_url, breed["id"]),
                            )
                        saved += 1
                    except Exception as exc:
                        logger.error("DB-Update %s: %s", name, exc)

            # Pause between breeds to throttle the API requests.
            await asyncio.sleep(0.8)

    missing = total - found
    logger.info(
        "Fertig: %d/%d gefunden, %d gespeichert, %d ohne Treffer.",
        found, total, saved, missing,
    )
    return {"found": found, "saved": saved, "missing": missing}
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: configure logging, parse the options,
    # run the fetch once and print a one-line summary.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
        level=logging.INFO,
    )

    cli = argparse.ArgumentParser(description="Rassen-Fotos von Wikimedia holen")
    cli.add_argument("--limit", type=int, default=100, metavar="N")
    cli.add_argument("--dry-run", action="store_true")
    options = cli.parse_args()

    if options.dry_run:
        logger.info("DRY-RUN — keine DB-Änderungen.")

    summary = asyncio.run(
        fetch_wiki_images(limit=options.limit, dry_run=options.dry_run)
    )
    found, saved, missing = (summary[key] for key in ("found", "saved", "missing"))
    print(
        f"\nErgebnis: {found} gefunden, "
        f"{saved} gespeichert, "
        f"{missing} ohne Treffer."
    )
|