fetch_wiki_images: Commons File-Namespace-Suche entfernt
_commons_search() und ihr Aufruf (Stufe 4+5) wurden entfernt. Es bleiben nur die Wikipedia-pageimages (DE/EN) und die Commons-pageimages (exakter Treffer). Der urllib.parse-Import wurde ebenfalls entfernt.
This commit is contained in:
parent
26074a42db
commit
2fed44fbd4
1 changed files with 0 additions and 82 deletions
|
|
@ -5,9 +5,6 @@ Strategie (in Reihenfolge):
|
|||
1. Wikipedia pageimages DE (exakter Artikel-Treffer)
|
||||
2. Wikipedia pageimages EN
|
||||
3. Wikimedia Commons pageimages (exakter Artikel-Treffer)
|
||||
4. Wikimedia Commons Datei-Suche (action=query&list=search im File-Namespace)
|
||||
→ Sucht nach Bilddateien die den Rassenamen enthalten
|
||||
5. Gleiche Suche mit name_de (falls vorhanden)
|
||||
|
||||
Alle Bilder werden als externe URLs gespeichert (Wikimedia CDN).
|
||||
Lizenz: CC-BY-SA (Wikimedia Commons) — Attribution in Wiki-Seite anzeigen.
|
||||
|
|
@ -22,7 +19,6 @@ import asyncio
|
|||
import logging
|
||||
import os
|
||||
import sys
|
||||
import urllib.parse
|
||||
|
||||
import httpx
|
||||
|
||||
|
|
@ -199,76 +195,6 @@ async def _commons_pageimages(name: str, client: httpx.AsyncClient) -> str | Non
|
|||
return None
|
||||
|
||||
|
||||
async def _commons_search(query: str, client: httpx.AsyncClient) -> str | None:
    """
    Search the Wikimedia Commons File namespace (6) for an image matching *query*.

    Two API requests are made: a ``list=search`` query limited to five hits,
    followed by a single ``prop=imageinfo`` lookup for those titles to obtain
    their thumbnail URLs.

    A candidate is accepted only if its file name contains ALL significant
    query words (words longer than three characters, compared
    case-insensitively). This prevents mismatches such as "Afghan" returning
    a picture of a child from Afghanistan.

    Args:
        query: Search text (e.g. a breed name).
        client: HTTP client used for both API requests.

    Returns:
        The thumbnail URL (or the plain file URL when the API returns no
        thumbnail) of the first usable candidate whose file name contains
        every significant query word. ``None`` if there are no hits, no
        candidate qualifies, the query has no significant words, or any
        error occurs (errors are logged at debug level, never raised).
    """
    api_url = "https://commons.wikimedia.org/w/api.php"
    try:
        # Words of three characters or fewer are ignored as too unspecific.
        # Without at least one significant word no candidate can ever be
        # accepted, so bail out before spending two network round trips.
        query_words = [w for w in query.lower().split() if len(w) > 3]
        if not query_words:
            return None

        # Step 1: search for matching file pages.
        resp = await client.get(
            api_url,
            params={
                "action": "query",
                "list": "search",
                "srsearch": query,
                "srnamespace": "6",  # File namespace
                "srlimit": "5",
                "format": "json",
            },
        )
        resp.raise_for_status()
        hits = resp.json().get("query", {}).get("search", [])
        if not hits:
            return None

        # Step 2: fetch imageinfo for all hits in one request.
        titles = "|".join(h["title"] for h in hits[:5])
        resp2 = await client.get(
            api_url,
            params={
                "action": "query",
                "titles": titles,
                "prop": "imageinfo",
                "iiprop": "url",
                "iiurlwidth": _THUMB_SIZE,
                "format": "json",
            },
        )
        resp2.raise_for_status()
        pages = resp2.json().get("query", {}).get("pages", {})

        for page in pages.values():
            # Entries without a real page id carry no usable image data.
            if page.get("pageid", -1) == -1:
                continue
            for ii in page.get("imageinfo", []):
                thumb = ii.get("thumburl") or ii.get("url", "")
                if not thumb or not _is_usable(thumb):
                    continue
                # Match against the file-name segment only. Matching the
                # whole URL would let host/path words such as "wikimedia",
                # "commons" or "thumb" count as hits. Split before unquoting
                # so an encoded "/" inside the name cannot shift the segment.
                fname = urllib.parse.unquote(thumb.rsplit("/", 1)[-1]).lower()
                # Quality gate: every significant word must occur in the
                # file name; the first candidate that passes wins.
                if all(w in fname for w in query_words):
                    return thumb

        return None

    except Exception as exc:
        # Best-effort lookup: any failure simply means "no image found".
        logger.debug("Commons search (%s): %s", query, exc)
        return None
|
||||
|
||||
|
||||
async def fetch_wiki_images(limit: int = 100, dry_run: bool = False) -> dict:
|
||||
"""
|
||||
Holt Fotos für alle Rassen ohne foto_url.
|
||||
|
|
@ -323,14 +249,6 @@ async def fetch_wiki_images(limit: int = 100, dry_run: bool = False) -> dict:
|
|||
source = f"Commons-exact ({variant})"
|
||||
break
|
||||
|
||||
# ── Stufe 4+5: Commons Datei-Suche ───────────────────────────
|
||||
if not foto_url:
|
||||
for variant in variants:
|
||||
foto_url = await _commons_search(variant, client)
|
||||
if foto_url:
|
||||
source = f"Commons-search ({variant})"
|
||||
break
|
||||
|
||||
if foto_url:
|
||||
found += 1
|
||||
logger.info("[%d/%d] ✓ %s → %s", idx, total, name, source)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue