- Chat: Foto-Versand (POST /api/chat/conversations/{id}/upload, media_url/media_type)
- Chat: Online-Indikator (last_seen Heartbeat, grüner Dot, 3min-Fenster)
- Chat: Read Receipts (read_at, Einzel-/Doppelhaken-Icons)
- Gesundheit: Dokument löschen (DELETE .../dokument, Datei + DB-Eintrag)
- Bug: events.user_id NOT NULL → nullable (Table-Recreation-Migration)
- Bug: scheduler INSERT user_id 0 → NULL
- Bug: Wikidata Rate-Limit: sleep 0.3s→1.0s, retries 2→4, exponentielles Backoff
- SW: by-v146, APP_VER 119
196 lines
7.8 KiB
Python
196 lines
7.8 KiB
Python
"""Fetches missing dog breed data from Wikidata SPARQL and seeds wiki_rassen."""
|
|
import httpx, re, logging, os
|
|
from database import db
|
|
|
|
logger = logging.getLogger(__name__)
|
|
MEDIA_DIR = os.getenv("MEDIA_DIR", "/data/media")
|
|
BREEDS_DIR = os.path.join(MEDIA_DIR, "breeds")
|
|
|
|
SPARQL_URL = "https://query.wikidata.org/sparql"
|
|
|
|
# GROUP BY + SAMPLE so each breed appears once even if it has multiple images
|
|
SPARQL_QUERY = """
|
|
SELECT ?breed
|
|
(SAMPLE(?nameDE) AS ?nameDE)
|
|
(SAMPLE(?nameEN) AS ?nameEN)
|
|
(SAMPLE(?image) AS ?image)
|
|
(SAMPLE(?countryDE) AS ?countryDE)
|
|
(SAMPLE(?descDE) AS ?descDE)
|
|
(SAMPLE(?descEN) AS ?descEN)
|
|
WHERE {
|
|
?breed wdt:P31 wd:Q39367 .
|
|
OPTIONAL { ?breed rdfs:label ?nameDE FILTER(LANG(?nameDE) = "de") }
|
|
OPTIONAL { ?breed rdfs:label ?nameEN FILTER(LANG(?nameEN) = "en") }
|
|
FILTER(BOUND(?nameDE) || BOUND(?nameEN))
|
|
OPTIONAL { ?breed wdt:P18 ?image }
|
|
OPTIONAL {
|
|
?breed wdt:P495 ?country .
|
|
?country rdfs:label ?countryDE FILTER(LANG(?countryDE) = "de")
|
|
}
|
|
OPTIONAL { ?breed schema:description ?descDE FILTER(LANG(?descDE) = "de") }
|
|
OPTIONAL { ?breed schema:description ?descEN FILTER(LANG(?descEN) = "en") }
|
|
}
|
|
GROUP BY ?breed
|
|
ORDER BY ?nameDE ?nameEN
|
|
"""
|
|
|
|
def _slug(name: str) -> str:
|
|
return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
|
|
|
|
|
|
def _normalise(name: str) -> str:
|
|
"""Lowercase + remove diacritics for name deduplication."""
|
|
import unicodedata
|
|
nfkd = unicodedata.normalize('NFKD', name.lower())
|
|
return re.sub(r'[^a-z0-9 ]', '', nfkd).strip()
|
|
|
|
|
|
async def fetch_and_seed_wikidata_breeds():
|
|
"""Query Wikidata for dog breeds and insert only those missing from wiki_rassen."""
|
|
# -- fetch from SPARQL -------------------------------------------------
|
|
try:
|
|
async with httpx.AsyncClient(
|
|
timeout=60,
|
|
headers={"Accept": "application/sparql-results+json",
|
|
"User-Agent": "BanYaro/1.0 (https://banyaro.app; contact@banyaro.app)"}
|
|
) as client:
|
|
r = await client.get(SPARQL_URL, params={"query": SPARQL_QUERY})
|
|
r.raise_for_status()
|
|
data = r.json()
|
|
except Exception as e:
|
|
logger.error(f"Wikidata SPARQL fetch failed: {e}")
|
|
return 0
|
|
|
|
bindings = data.get("results", {}).get("bindings", [])
|
|
logger.info(f"Wikidata: {len(bindings)} breed entries received")
|
|
|
|
# -- load existing names for deduplication -----------------------------
|
|
with db() as conn:
|
|
existing = conn.execute("SELECT name FROM wiki_rassen").fetchall()
|
|
existing_norm = {_normalise(row[0]) for row in existing}
|
|
|
|
seeded = 0
|
|
with db() as conn:
|
|
for b in bindings:
|
|
name = (b.get("nameDE", {}).get("value") or
|
|
b.get("nameEN", {}).get("value") or "").strip()
|
|
if not name:
|
|
continue
|
|
|
|
# skip if already in DB (by normalised name)
|
|
if _normalise(name) in existing_norm:
|
|
continue
|
|
|
|
qid = b["breed"]["value"].rsplit("/", 1)[-1] # e.g. "Q312440"
|
|
ext_id = f"wd_{qid}"
|
|
image_url = b.get("image", {}).get("value") or None
|
|
herkunft = b.get("countryDE", {}).get("value") or None
|
|
desc = (b.get("descDE", {}).get("value") or
|
|
b.get("descEN", {}).get("value") or None)
|
|
slug_base = _slug(name)
|
|
|
|
# make slug unique if collision exists
|
|
slug = slug_base
|
|
suffix = 1
|
|
while True:
|
|
row = conn.execute(
|
|
"SELECT 1 FROM wiki_rassen WHERE slug=? AND external_id != ?",
|
|
(slug, ext_id)
|
|
).fetchone()
|
|
if not row:
|
|
break
|
|
slug = f"{slug_base}-{suffix}"
|
|
suffix += 1
|
|
|
|
try:
|
|
conn.execute("""
|
|
INSERT INTO wiki_rassen
|
|
(external_id, name, gruppe, herkunft, temperament,
|
|
gewicht_min_kg, gewicht_max_kg, groesse, lebensdauer,
|
|
foto_url, bred_for, aktivitaet, wohnung_geeignet,
|
|
kinder_geeignet, erfahrung, slug)
|
|
VALUES (?,?,?,?,?,NULL,NULL,'mittel',NULL,?,NULL,'mittel',0,1,'anfaenger',?)
|
|
ON CONFLICT(external_id) DO UPDATE SET
|
|
foto_url = CASE
|
|
WHEN excluded.foto_url IS NOT NULL AND wiki_rassen.foto_url IS NULL
|
|
THEN excluded.foto_url
|
|
ELSE wiki_rassen.foto_url
|
|
END,
|
|
herkunft = COALESCE(wiki_rassen.herkunft, excluded.herkunft),
|
|
temperament = COALESCE(wiki_rassen.temperament, excluded.temperament)
|
|
""", (ext_id, name, None, herkunft, desc, image_url, slug))
|
|
existing_norm.add(_normalise(name)) # avoid re-inserting within same run
|
|
seeded += 1
|
|
except Exception as e:
|
|
logger.warning(f"Wikidata breed '{name}' seed failed: {e}")
|
|
|
|
logger.info(f"Wikidata breeds seeded: {seeded}")
|
|
return seeded
|
|
|
|
|
|
async def mirror_wikidata_photos():
|
|
"""Download Wikimedia Commons photos for Wikidata breeds that still have external URLs."""
|
|
os.makedirs(BREEDS_DIR, exist_ok=True)
|
|
|
|
with db() as conn:
|
|
rows = conn.execute(
|
|
"""SELECT id, external_id, foto_url FROM wiki_rassen
|
|
WHERE external_id LIKE 'wd_%'
|
|
AND foto_url LIKE 'http%'
|
|
AND foto_url NOT LIKE '/media/%'"""
|
|
).fetchall()
|
|
|
|
if not rows:
|
|
logger.info("Wikidata photos: nothing to mirror")
|
|
return 0
|
|
|
|
mirrored = 0
|
|
import asyncio
|
|
async with httpx.AsyncClient(
|
|
timeout=30,
|
|
follow_redirects=True,
|
|
headers={"User-Agent": "BanYaro/1.0 (https://banyaro.app)"}
|
|
) as client:
|
|
for i, (row_id, ext_id, img_url) in enumerate(rows):
|
|
qid = ext_id.replace("wd_", "")
|
|
local_path = os.path.join(BREEDS_DIR, f"{qid}.jpg")
|
|
local_url = f"/media/breeds/{qid}.jpg"
|
|
|
|
if os.path.exists(local_path):
|
|
with db() as conn:
|
|
conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?",
|
|
(local_url, row_id))
|
|
mirrored += 1
|
|
continue
|
|
|
|
# Wikimedia Commons: append ?width=600 for scaled download
|
|
fetch_url = img_url if "?" in img_url else img_url + "?width=600"
|
|
retries = 4
|
|
for attempt in range(retries):
|
|
try:
|
|
await asyncio.sleep(1.0 * (attempt + 1)) # exponentiell: 1s, 2s, 3s, 4s
|
|
r = await client.get(fetch_url)
|
|
if r.status_code == 200 and r.headers.get("content-type", "").startswith("image"):
|
|
with open(local_path, "wb") as f:
|
|
f.write(r.content)
|
|
with db() as conn:
|
|
conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?",
|
|
(local_url, row_id))
|
|
mirrored += 1
|
|
break
|
|
elif r.status_code == 429:
|
|
wait = 10 * (attempt + 1)
|
|
logger.info(f"Rate limited, warte {wait}s…")
|
|
await asyncio.sleep(wait)
|
|
else:
|
|
logger.warning(f"Wikidata photo {qid}: HTTP {r.status_code}")
|
|
break
|
|
except Exception as e:
|
|
logger.warning(f"Wikidata photo {qid} failed: {e}")
|
|
break
|
|
|
|
if i % 50 == 0 and i > 0:
|
|
logger.info(f"Wikidata photos: {mirrored}/{i+1} bisher")
|
|
|
|
logger.info(f"Wikidata photos mirrored: {mirrored}/{len(rows)}")
|
|
return mirrored
|