Sprint 11b: Wiki-Foto-Einreichungen + Wikipedia-Foto-Scraper
- User können Fotos für Rassen vorschlagen (Upload-Modal in Rassen-Detail)
- Mod/Admin-Review-Tab im Wiki mit Freischalten/Ablehnen + Push-Notification
- wikipedia_photos.py: holt Fotos über Wikidata-QID → Wikipedia-API
- Foto-Status: 578 lokal, 186 extern, 238 ohne Foto
- DB: wiki_foto_submissions Tabelle
- SW by-v90
This commit is contained in:
parent
097295c628
commit
32d630d5a1
6 changed files with 598 additions and 3 deletions
198
backend/scraper/wikipedia_photos.py
Normal file
198
backend/scraper/wikipedia_photos.py
Normal file
|
|
@ -0,0 +1,198 @@
|
|||
"""
|
||||
Holt Fotos für Wikidata-Rassen ohne Bild über die Wikipedia-API.
|
||||
|
||||
Strategie:
|
||||
1. Wikidata-API: QID → Wikipedia-Artikel-Titel (DE bevorzugt, Fallback EN)
|
||||
2. Wikipedia pageimages-API: Artikel-Titel → Bild-URL
|
||||
3. Wikimedia Commons: Bild herunterladen und lokal speichern
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import httpx
|
||||
|
||||
from database import db
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Root directory for stored media; overridable via the MEDIA_DIR environment variable.
MEDIA_DIR = os.getenv("MEDIA_DIR", "/data/media")
# Breed photos live in the "breeds" subdirectory of MEDIA_DIR.
BREEDS_DIR = os.path.join(MEDIA_DIR, "breeds")

# MediaWiki Action API endpoints used by this module.
WIKIDATA_API = "https://www.wikidata.org/w/api.php"
WP_DE_API = "https://de.wikipedia.org/w/api.php"
WP_EN_API = "https://en.wikipedia.org/w/api.php"

# Default headers for the shared HTTP client (identifies this scraper via User-Agent).
HEADERS = {"User-Agent": "BanYaro/1.0 (https://banyaro.app; contact@banyaro.app)"}

BATCH_SIZE = 50  # Wikidata API allows at most 50 IDs per request
# 350 ms pause before each download attempt.
# NOTE: despite the "_MS" suffix the value is in SECONDS (it is passed to asyncio.sleep).
SLEEP_MS = 0.35
|
||||
|
||||
|
||||
def _qid_from_ext(ext_id: str) -> str | None:
|
||||
"""Extrahiert QID aus external_id wie 'wd_Q12345' → 'Q12345'."""
|
||||
m = re.match(r"wd_(Q\d+)$", ext_id)
|
||||
return m.group(1) if m else None
|
||||
|
||||
|
||||
async def _fetch_sitelinks(qids: list[str], client: httpx.AsyncClient) -> dict[str, dict]:
    """Look up German/English Wikipedia article titles for Wikidata items.

    Args:
        qids: Wikidata item ids (e.g. ``'Q12345'``) to resolve in ONE request.
            Batching to the API limit is the caller's job.
        client: HTTP client used to perform the request.

    Returns:
        ``{qid: {'de': 'Title_DE', 'en': 'Title_EN'}}`` restricted to the ids
        that have at least one of the two sitelinks; each inner dict contains
        only the languages that actually exist. Returns ``{}`` for empty
        input and when the request or JSON decoding fails (best effort: the
        failure is logged as a warning, never raised).
    """
    if not qids:
        return {}

    try:
        r = await client.get(WIKIDATA_API, params={
            "action": "wbgetentities",
            "ids": "|".join(qids),
            "props": "sitelinks",
            "sitefilter": "dewiki|enwiki",
            "format": "json",
        })
        r.raise_for_status()
        data = r.json()
    except Exception as e:
        # Deliberate best-effort boundary: one failed batch must not abort the run.
        logger.warning(f"Wikidata sitelinks Fehler: {e}")
        return {}

    if not isinstance(data, dict):
        logger.warning(f"Wikidata sitelinks Fehler: {data!r}")
        return {}

    # The MediaWiki Action API can report a request-level failure inside the
    # JSON body of an otherwise successful HTTP response, so raise_for_status()
    # alone does not surface it. Log it instead of silently returning nothing.
    api_error = data.get("error")
    if api_error:
        logger.warning(f"Wikidata sitelinks Fehler: {api_error}")

    # Result language key -> Wikidata site id.
    sites = (("de", "dewiki"), ("en", "enwiki"))

    result: dict[str, dict] = {}
    entities = data.get("entities") or {}
    if not isinstance(entities, dict):
        return result

    for qid, entity in entities.items():
        sitelinks = entity.get("sitelinks") if isinstance(entity, dict) else None
        if not isinstance(sitelinks, dict):
            # Missing entity, or an empty collection serialized as a list.
            continue
        titles = {}
        for lang, site in sites:
            link = sitelinks.get(site)
            title = link.get("title") if isinstance(link, dict) else None
            if title:
                titles[lang] = title
        if titles:
            result[qid] = titles
    return result
|
||||
|
||||
|
||||
async def _fetch_wp_image(title: str, lang: str, client: httpx.AsyncClient) -> str | None:
    """Return the thumbnail URL of a Wikipedia article's page image.

    Queries the ``pageimages`` property with a requested thumbnail size of
    600. ``lang == "de"`` selects the German Wikipedia endpoint; every other
    value selects the English one.

    Returns:
        The first non-empty thumbnail source URL found in the response, or
        ``None`` when there is none or the request fails (the failure is
        logged at debug level, never raised).
    """
    endpoint = WP_EN_API
    if lang == "de":
        endpoint = WP_DE_API

    query = {
        "action": "query",
        "titles": title,
        "prop": "pageimages",
        "pithumbsize": 600,
        "format": "json",
    }

    try:
        response = await client.get(endpoint, params=query)
        response.raise_for_status()
        pages = response.json().get("query", {}).get("pages", {})
        sources = (p.get("thumbnail", {}).get("source") for p in pages.values())
        return next((src for src in sources if src), None)
    except Exception as e:
        logger.debug(f"WP pageimage Fehler ({lang}/{title}): {e}")
    return None
|
||||
|
||||
|
||||
async def _download_image(url: str, path: str, client: httpx.AsyncClient) -> bool:
    """Download an image and store it at ``path``.

    Makes up to two attempts, pausing ``SLEEP_MS`` seconds before each one.
    A response is accepted only if it has status 200, a ``content-type``
    starting with ``image`` and a non-empty body. After an HTTP 429 the
    function backs off before retrying.

    The body is first written to ``<path>.part`` and then moved into place
    with ``os.replace``. A failed or interrupted write therefore never leaves
    a truncated file at ``path`` that a later existence check could mistake
    for a valid photo.

    Args:
        url: Image URL to fetch.
        path: Destination file path.
        client: HTTP client used to perform the request.

    Returns:
        ``True`` if the image was stored at ``path``, otherwise ``False``.
        Request and file errors are logged at debug level, never raised.
    """
    max_attempts = 2
    tmp_path = f"{path}.part"

    for attempt in range(max_attempts):
        try:
            await asyncio.sleep(SLEEP_MS)
            r = await client.get(url)
            is_image = r.headers.get("content-type", "").startswith("image")
            if r.status_code == 200 and is_image and r.content:
                with open(tmp_path, "wb") as f:
                    f.write(r.content)
                os.replace(tmp_path, path)
                return True
            # Back off only when another attempt follows; sleeping after the
            # final attempt would just delay the False result.
            if r.status_code == 429 and attempt + 1 < max_attempts:
                await asyncio.sleep(15 * (attempt + 1))
        except Exception as e:
            logger.debug(f"Download Fehler {url}: {e}")
            # Do not leave a partial temp file behind.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
    return False
|
||||
|
||||
|
||||
async def fetch_wikipedia_photos() -> int:
    """Fetch Wikipedia photos for all breeds that have no ``foto_url`` yet.

    Steps:
      1. Select ``wiki_rassen`` rows with an empty ``foto_url`` whose
         ``external_id`` matches ``wd_%``.
      2. Resolve their QIDs to Wikipedia article titles in batches of
         ``BATCH_SIZE``.
      3. For every breed with a sitelink, look up the article thumbnail
         (German first, English as fallback), store it as
         ``<BREEDS_DIR>/<qid>.jpg`` and write ``/media/breeds/<qid>.jpg``
         into ``foto_url``.

    Returns:
        Number of rows whose ``foto_url`` was set.
    """
    os.makedirs(BREEDS_DIR, exist_ok=True)

    with db() as conn:
        rows = conn.execute("""
            SELECT id, external_id, name
            FROM wiki_rassen
            WHERE (foto_url IS NULL OR foto_url = '')
              AND external_id LIKE 'wd_%'
        """).fetchall()

    if not rows:
        logger.info("Wikipedia-Fotos: nichts zu tun")
        return 0

    logger.info(f"Wikipedia-Fotos: {len(rows)} Rassen ohne Foto")

    # QID -> DB row, e.g. {'Q12345': {'id': 1, 'external_id': 'wd_Q12345', 'name': '...'}}.
    # Rows whose external_id is not exactly 'wd_Q<digits>' are skipped.
    qid_map = {}
    for row in rows:
        qid = _qid_from_ext(row["external_id"])
        if qid:
            qid_map[qid] = dict(row)

    qids = list(qid_map)
    saved = 0

    async with httpx.AsyncClient(
        timeout=30,
        follow_redirects=True,
        headers=HEADERS,
    ) as client:
        sitelinks = await _collect_sitelinks(qids, client)
        logger.info(f"Wikipedia-Links gefunden: {len(sitelinks)}/{len(qids)}")

        for idx, (qid, titles) in enumerate(sitelinks.items()):
            # The key comes from the API response. Guard against an id that
            # was not requested instead of aborting the whole run with a
            # KeyError.
            row = qid_map.get(qid)
            if row is None:
                logger.debug(f"Unbekannte QID in Sitelinks: {qid}")
            elif await _store_breed_photo(qid, row, titles, client):
                saved += 1

            # Runs for every item, so the progress line is not lost when the
            # 50th item happens to be skipped.
            if idx % 50 == 0 and idx > 0:
                logger.info(f"Wikipedia-Fotos: {saved}/{idx + 1} bisher")

    logger.info(f"Wikipedia-Fotos gespeichert: {saved}/{len(sitelinks)} (mit WP-Link)")
    return saved


async def _collect_sitelinks(qids: list[str], client: httpx.AsyncClient) -> dict[str, dict]:
    """Resolve all ``qids`` to Wikipedia titles, ``BATCH_SIZE`` ids per request."""
    sitelinks: dict[str, dict] = {}
    for i in range(0, len(qids), BATCH_SIZE):
        batch = qids[i:i + BATCH_SIZE]
        sitelinks.update(await _fetch_sitelinks(batch, client))
        await asyncio.sleep(0.5)  # short pause between batch requests
        logger.info(f"Sitelinks: {i + len(batch)}/{len(qids)} abgefragt, {len(sitelinks)} mit WP-Link")
    return sitelinks


async def _resolve_image_url(titles: dict, client: httpx.AsyncClient) -> str | None:
    """Return the thumbnail URL for a breed, preferring the German article over the English one."""
    lang = "de" if "de" in titles else "en"
    title = titles.get(lang)
    if not title:
        return None
    img_url = await _fetch_wp_image(title, lang, client)
    # Second try with the English article when the German one has no image.
    if not img_url and lang == "de" and "en" in titles:
        img_url = await _fetch_wp_image(titles["en"], "en", client)
    return img_url


async def _store_breed_photo(qid: str, row: dict, titles: dict, client: httpx.AsyncClient) -> bool:
    """Ensure a local photo exists for one breed and record its URL; True if ``foto_url`` was set."""
    img_url = await _resolve_image_url(titles, client)
    if not img_url:
        logger.debug(f"Kein WP-Bild für {row['name']} ({qid})")
        return False

    local_path = os.path.join(BREEDS_DIR, f"{qid}.jpg")
    local_url = f"/media/breeds/{qid}.jpg"

    # Reuse a file that is already on disk and only update the DB. A
    # zero-byte leftover is not a usable photo, so it is downloaded again.
    already_there = os.path.isfile(local_path) and os.path.getsize(local_path) > 0
    if not already_there and not await _download_image(img_url, local_path, client):
        logger.debug(f"Download fehlgeschlagen: {row['name']}")
        return False

    _set_foto_url(row["id"], local_url)
    return True


def _set_foto_url(row_id: int, local_url: str) -> None:
    """Write ``local_url`` into ``wiki_rassen.foto_url`` for the given row id."""
    with db() as conn:
        conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?", (local_url, row_id))
|
||||
Loading…
Add table
Add a link
Reference in a new issue