Scraper: fetch_wiki_images mit Commons-Dateisuche (File-Namespace)
This commit is contained in:
parent
c3d33547c7
commit
93ea8a69fd
1 changed files with 177 additions and 101 deletions
|
|
@ -1,18 +1,20 @@
|
||||||
"""
|
"""
|
||||||
BAN YARO — Fehlende Rassen-Fotos von Wikipedia/Wikimedia holen
|
BAN YARO — Fehlende Rassen-Fotos von Wikipedia/Wikimedia holen
|
||||||
|
|
||||||
Strategie:
|
Strategie (in Reihenfolge):
|
||||||
1. Alle Rassen ohne foto_url aus wiki_rassen holen
|
1. Wikipedia pageimages DE (exakter Artikel-Treffer)
|
||||||
2. Pro Rasse: Wikipedia pageimages API (de → en Fallback)
|
2. Wikipedia pageimages EN
|
||||||
3. Letzter Fallback: Wikimedia Commons pageimages API
|
3. Wikimedia Commons pageimages (exakter Artikel-Treffer)
|
||||||
4. Sinnlose Bilder filtern (SVG, Flaggen-Icons, Karten, Logos)
|
4. Wikimedia Commons Datei-Suche (action=query&list=search im File-Namespace)
|
||||||
5. URL direkt in wiki_rassen.foto_url speichern
|
→ Sucht nach Bilddateien die den Rassenamen enthalten
|
||||||
|
5. Gleiche Suche mit name_de (falls vorhanden)
|
||||||
|
|
||||||
|
Alle Bilder werden als externe URLs gespeichert (Wikimedia CDN).
|
||||||
|
Lizenz: CC-BY-SA (Wikimedia Commons) — Attribution in Wiki-Seite anzeigen.
|
||||||
|
|
||||||
CLI-Optionen:
|
CLI-Optionen:
|
||||||
--limit N Nur N Rassen bearbeiten (Default: 100)
|
--limit N Nur N Rassen bearbeiten (Default: 100)
|
||||||
--dry-run Nur anzeigen, nicht speichern
|
--dry-run Nur anzeigen, nicht speichern
|
||||||
--model NAME Claude-Modell für ggf. zukünftige Text-Tasks
|
|
||||||
(Default: claude-sonnet-4-6)
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
|
@ -20,6 +22,7 @@ import asyncio
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
|
|
@ -48,11 +51,19 @@ _SKIP_PATTERNS = (
|
||||||
"commons-logo",
|
"commons-logo",
|
||||||
"question_mark",
|
"question_mark",
|
||||||
"noimage",
|
"noimage",
|
||||||
|
"placeholder",
|
||||||
|
"silhouette",
|
||||||
|
"icon_",
|
||||||
|
"_icon",
|
||||||
|
"logo_",
|
||||||
|
"_logo",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Suffixe die beim Normalisieren abgeschnitten werden
|
||||||
|
_BREED_SUFFIXES = (" dog", " hound", " terrier", " spaniel", " shepherd")
|
||||||
|
|
||||||
|
|
||||||
def _is_usable(url: str) -> bool:
|
def _is_usable(url: str) -> bool:
|
||||||
"""Gibt True zurück wenn die Bild-URL brauchbar erscheint."""
|
|
||||||
low = url.lower()
|
low = url.lower()
|
||||||
if low.endswith(".svg"):
|
if low.endswith(".svg"):
|
||||||
return False
|
return False
|
||||||
|
|
@ -62,70 +73,156 @@ def _is_usable(url: str) -> bool:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
async def _fetch_wp_image(name: str, lang: str, client: httpx.AsyncClient) -> str | None:
|
def _name_variants(name: str, name_de: str | None) -> list[str]:
    """Build the ordered, de-duplicated list of search terms for one breed.

    Candidates are produced in this order:
      1. ``name`` as given,
      2. ``name_de`` (only when it is truthy),
      3. ``name`` cut off before the first "(" — "Foo (Bar)" becomes "Foo",
      4. ``name`` with every hyphen replaced by a space,
      5. ``name`` without the first entry of ``_BREED_SUFFIXES`` it ends with
         (compared case-insensitively).

    Each candidate is whitespace-stripped; empty strings and repeats of an
    earlier entry are dropped, so the first occurrence fixes the position.
    """
    candidates = [name]
    if name_de:
        candidates.append(name_de)

    # Drop a parenthesised qualifier: "Foo (Bar)" -> "Foo"
    if "(" in name:
        candidates.append(name.split("(")[0].strip())

    # Hyphenated spelling -> spaced spelling
    candidates.append(name.replace("-", " "))

    # Only the first suffix that matches is removed.
    lowered = name.lower()
    matched_suffix = next(
        (suffix for suffix in _BREED_SUFFIXES if lowered.endswith(suffix)),
        None,
    )
    if matched_suffix is not None:
        candidates.append(name[: -len(matched_suffix)].strip())

    variants: list[str] = []
    for raw in candidates:
        term = raw.strip()
        if term and term not in variants:
            variants.append(term)
    return variants
|
||||||
|
|
||||||
|
|
||||||
|
async def _wp_pageimages(name: str, lang: str, client: httpx.AsyncClient) -> str | None:
    """Ask the Wikipedia pageimages API for the thumbnail of article ``name``.

    Args:
        name: Article title to look up (sent with ``redirects=1``).
        lang: Wikipedia language subdomain, e.g. "de" or "en".
        client: HTTP client used to perform the request.

    Returns:
        The first thumbnail URL that passes ``_is_usable``, or None when no
        page yields one. Any exception (HTTP error, bad JSON, ...) is logged
        at debug level and also results in None.
    """
    try:
        request_params = {
            "action": "query",
            "titles": name,
            "prop": "pageimages",
            "format": "json",
            "pithumbsize": _THUMB_SIZE,
            "redirects": 1,
        }
        response = await client.get(
            f"https://{lang}.wikipedia.org/w/api.php",
            params=request_params,
        )
        response.raise_for_status()
        payload = response.json()
        for entry in payload.get("query", {}).get("pages", {}).values():
            # Entries whose page id is absent or -1 are skipped.
            if entry.get("pageid", -1) != -1:
                candidate = entry.get("thumbnail", {}).get("source", "")
                if candidate and _is_usable(candidate):
                    return candidate
    except Exception as exc:
        # Best-effort lookup: a failure only means "no image from this source".
        logger.debug("WP pageimages (%s/%s): %s", lang, name, exc)
    return None
|
||||||
|
|
||||||
|
|
||||||
async def _fetch_commons_image(name: str, client: httpx.AsyncClient) -> str | None:
|
async def _commons_pageimages(name: str, client: httpx.AsyncClient) -> str | None:
    """Ask the Wikimedia Commons pageimages API for the page titled ``name``.

    Args:
        name: Exact page title to look up on Commons.
        client: HTTP client used to perform the request.

    Returns:
        The first thumbnail URL that passes ``_is_usable``, or None when no
        page yields one. Any exception is logged at debug level and also
        results in None.
    """
    try:
        request_params = {
            "action": "query",
            "titles": name,
            "prop": "pageimages",
            "format": "json",
            "pithumbsize": _THUMB_SIZE,
        }
        response = await client.get(
            "https://commons.wikimedia.org/w/api.php",
            params=request_params,
        )
        response.raise_for_status()
        payload = response.json()
        for entry in payload.get("query", {}).get("pages", {}).values():
            # Entries whose page id is absent or -1 are skipped.
            if entry.get("pageid", -1) != -1:
                candidate = entry.get("thumbnail", {}).get("source", "")
                if candidate and _is_usable(candidate):
                    return candidate
    except Exception as exc:
        # Best-effort lookup: a failure only means "no image from this source".
        logger.debug("Commons pageimages (%s): %s", name, exc)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
async def _commons_search(query: str, client: httpx.AsyncClient) -> str | None:
    """Search the Wikimedia Commons File namespace (6) and pick a thumbnail.

    Two requests are made: a ``list=search`` query for ``query`` restricted
    to namespace 6, then a ``prop=imageinfo`` query that resolves the image
    URLs of the (at most five) hits.

    Selection rule: the first usable image whose URL-decoded address contains
    the query (lower-cased, spaces replaced by underscores) wins. If no image
    matches by name, the first usable image in search-rank order is returned.

    Args:
        query: Search term, e.g. a breed name.
        client: HTTP client used for both API requests.

    Returns:
        The chosen image's ``thumburl`` (or its ``url`` when no thumbnail URL
        is present), or None when there are no hits, no usable image, or a
        request/parsing error occurred (logged at debug level).
    """
    try:
        # Step 1: find file titles.
        resp = await client.get(
            "https://commons.wikimedia.org/w/api.php",
            params={
                "action": "query",
                "list": "search",
                "srsearch": query,
                "srnamespace": "6",  # File namespace
                "srlimit": "5",
                "format": "json",
            },
        )
        resp.raise_for_status()
        hits = resp.json().get("query", {}).get("search", [])
        if not hits:
            return None

        # Step 2: fetch imageinfo for all hits in a single request.
        titles = [hit["title"] for hit in hits[:5]]
        resp2 = await client.get(
            "https://commons.wikimedia.org/w/api.php",
            params={
                "action": "query",
                "titles": "|".join(titles),
                "prop": "imageinfo",
                "iiprop": "url",
                "iiurlwidth": _THUMB_SIZE,
                "format": "json",
            },
        )
        resp2.raise_for_status()
        pages = resp2.json().get("query", {}).get("pages", {})

        # The "pages" mapping is keyed by page id and does not carry the
        # search ranking, so restore it from the order of the search hits.
        # Pages with an unknown title sort last (sorted() is stable).
        rank = {title: position for position, title in enumerate(titles)}
        ranked_pages = sorted(
            pages.values(),
            key=lambda page: rank.get(page.get("title"), len(titles)),
        )

        # Match quality: prefer images that carry the search term in their
        # file name; otherwise fall back to the first usable hit.
        query_lower = query.lower().replace(" ", "_")
        fallback: str | None = None

        for page in ranked_pages:
            if page.get("pageid", -1) == -1:
                continue
            for info in page.get("imageinfo", []):
                thumb = info.get("thumburl") or info.get("url", "")
                if not thumb or not _is_usable(thumb):
                    continue
                if query_lower in urllib.parse.unquote(thumb).lower():
                    # A name match beats any earlier non-matching candidate.
                    return thumb
                if fallback is None:
                    fallback = thumb

        return fallback

    except Exception as exc:
        # Best-effort lookup: a failure only means "no image from this source".
        logger.debug("Commons search (%s): %s", query, exc)
    return None
|
||||||
|
|
||||||
|
|
||||||
async def fetch_wiki_images(limit: int = 100, dry_run: bool = False) -> dict:
|
async def fetch_wiki_images(limit: int = 100, dry_run: bool = False) -> dict:
|
||||||
"""
|
"""
|
||||||
Holt Wikipedia-Fotos für alle Rassen ohne foto_url.
|
Holt Fotos für alle Rassen ohne foto_url.
|
||||||
|
Versucht mehrere Quellen und Namensvarianten.
|
||||||
Returns: {'found': int, 'saved': int, 'missing': int}
|
|
||||||
"""
|
"""
|
||||||
with db() as conn:
|
with db() as conn:
|
||||||
rows = conn.execute(
|
rows = conn.execute(
|
||||||
|
|
@ -142,54 +239,52 @@ async def fetch_wiki_images(limit: int = 100, dry_run: bool = False) -> dict:
|
||||||
logger.info("Alle Rassen haben bereits ein Foto — nichts zu tun.")
|
logger.info("Alle Rassen haben bereits ein Foto — nichts zu tun.")
|
||||||
return {"found": 0, "saved": 0, "missing": 0}
|
return {"found": 0, "saved": 0, "missing": 0}
|
||||||
|
|
||||||
logger.info("%d Rassen ohne Foto werden verarbeitet (limit=%d).", total, limit)
|
logger.info("%d Rassen ohne Foto (limit=%d).", total, limit)
|
||||||
|
found = saved = 0
|
||||||
found = 0
|
|
||||||
saved = 0
|
|
||||||
|
|
||||||
async with httpx.AsyncClient(
|
async with httpx.AsyncClient(
|
||||||
timeout=12,
|
timeout=15, follow_redirects=True, headers=_WP_HEADERS
|
||||||
follow_redirects=True,
|
|
||||||
headers=_WP_HEADERS,
|
|
||||||
) as client:
|
) as client:
|
||||||
|
|
||||||
for idx, row in enumerate(rows, start=1):
|
for idx, row in enumerate(rows, start=1):
|
||||||
|
row = dict(row)
|
||||||
name = row["name"]
|
name = row["name"]
|
||||||
name_de = row["name_de"] or ""
|
name_de = row.get("name_de") or ""
|
||||||
slug = row["slug"] or name
|
variants = _name_variants(name, name_de or None)
|
||||||
|
|
||||||
# Suchreihenfolge: DE-Name → EN-Name → Commons mit EN-Name
|
|
||||||
candidates: list[tuple[str, str]] = []
|
|
||||||
|
|
||||||
if name_de:
|
|
||||||
candidates.append((name_de, "de"))
|
|
||||||
candidates.append((name, "en"))
|
|
||||||
if name_de:
|
|
||||||
candidates.append((name_de, "en"))
|
|
||||||
|
|
||||||
foto_url: str | None = None
|
foto_url: str | None = None
|
||||||
|
source: str = ""
|
||||||
|
|
||||||
for search_name, lang in candidates:
|
# ── Stufe 1+2: Wikipedia pageimages DE / EN ──────────────────
|
||||||
foto_url = await _fetch_wp_image(search_name, lang, client)
|
for lang in ("de", "en"):
|
||||||
|
for variant in variants:
|
||||||
|
foto_url = await _wp_pageimages(variant, lang, client)
|
||||||
|
if foto_url:
|
||||||
|
source = f"WP-{lang.upper()} ({variant})"
|
||||||
|
break
|
||||||
if foto_url:
|
if foto_url:
|
||||||
logger.info(
|
|
||||||
"[%d/%d] ✓ %s → WP %s (%s)",
|
|
||||||
idx, total, name, lang.upper(), search_name,
|
|
||||||
)
|
|
||||||
break
|
break
|
||||||
|
|
||||||
# Letzter Fallback: Wikimedia Commons
|
# ── Stufe 3: Commons pageimages (exakter Treffer) ─────────────
|
||||||
if not foto_url:
|
if not foto_url:
|
||||||
foto_url = await _fetch_commons_image(name, client)
|
for variant in variants:
|
||||||
if foto_url:
|
foto_url = await _commons_pageimages(variant, client)
|
||||||
logger.info(
|
if foto_url:
|
||||||
"[%d/%d] ✓ %s → Commons", idx, total, name
|
source = f"Commons-exact ({variant})"
|
||||||
)
|
break
|
||||||
|
|
||||||
|
# ── Stufe 4+5: Commons Datei-Suche ───────────────────────────
|
||||||
|
if not foto_url:
|
||||||
|
for variant in variants:
|
||||||
|
foto_url = await _commons_search(variant, client)
|
||||||
|
if foto_url:
|
||||||
|
source = f"Commons-search ({variant})"
|
||||||
|
break
|
||||||
|
|
||||||
if foto_url:
|
if foto_url:
|
||||||
found += 1
|
found += 1
|
||||||
if dry_run:
|
logger.info("[%d/%d] ✓ %s → %s", idx, total, name, source)
|
||||||
logger.info(" [dry-run] würde setzen: %s", foto_url)
|
if not dry_run:
|
||||||
else:
|
|
||||||
try:
|
try:
|
||||||
with db() as conn:
|
with db() as conn:
|
||||||
conn.execute(
|
conn.execute(
|
||||||
|
|
@ -198,19 +293,19 @@ async def fetch_wiki_images(limit: int = 100, dry_run: bool = False) -> dict:
|
||||||
)
|
)
|
||||||
saved += 1
|
saved += 1
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.error("DB-Update fehlgeschlagen für %s: %s", name, exc)
|
logger.error("DB-Update %s: %s", name, exc)
|
||||||
|
else:
|
||||||
|
logger.info(" [dry-run] %s", foto_url)
|
||||||
else:
|
else:
|
||||||
logger.info("[%d/%d] ✗ %s — kein Foto gefunden", idx, total, name)
|
logger.info("[%d/%d] ✗ %s", idx, total, name)
|
||||||
|
|
||||||
# Rate-Limit: 1 Sekunde zwischen Anfragen
|
await asyncio.sleep(0.8)
|
||||||
await asyncio.sleep(1.0)
|
|
||||||
|
|
||||||
missing = total - found
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"Fertig: %d/%d Fotos gefunden, %d gespeichert, %d ohne Treffer.",
|
"Fertig: %d/%d gefunden, %d gespeichert, %d ohne Treffer.",
|
||||||
found, total, saved, missing,
|
found, total, saved, total - found,
|
||||||
)
|
)
|
||||||
return {"found": found, "saved": saved, "missing": missing}
|
return {"found": found, "saved": saved, "missing": total - found}
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
@ -219,32 +314,13 @@ if __name__ == "__main__":
|
||||||
format="%(asctime)s %(levelname)s %(message)s",
|
format="%(asctime)s %(levelname)s %(message)s",
|
||||||
datefmt="%H:%M:%S",
|
datefmt="%H:%M:%S",
|
||||||
)
|
)
|
||||||
|
parser = argparse.ArgumentParser(description="Rassen-Fotos von Wikimedia holen")
|
||||||
parser = argparse.ArgumentParser(
|
parser.add_argument("--limit", type=int, default=100, metavar="N")
|
||||||
description="Fehlende Rassen-Fotos von Wikipedia/Wikimedia holen"
|
parser.add_argument("--dry-run", action="store_true")
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--limit",
|
|
||||||
type=int,
|
|
||||||
default=100,
|
|
||||||
metavar="N",
|
|
||||||
help="Maximale Anzahl Rassen (Default: 100)",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--dry-run",
|
|
||||||
action="store_true",
|
|
||||||
help="Nur anzeigen, nicht in DB speichern",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--model",
|
|
||||||
default="claude-sonnet-4-6",
|
|
||||||
metavar="MODEL",
|
|
||||||
help="Claude-Modell für Text-Tasks (Default: claude-sonnet-4-6)",
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.dry_run:
|
if args.dry_run:
|
||||||
logger.info("DRY-RUN Modus — keine DB-Änderungen.")
|
logger.info("DRY-RUN — keine DB-Änderungen.")
|
||||||
|
|
||||||
result = asyncio.run(fetch_wiki_images(limit=args.limit, dry_run=args.dry_run))
|
result = asyncio.run(fetch_wiki_images(limit=args.limit, dry_run=args.dry_run))
|
||||||
print(
|
print(
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue