"""Fetches missing dog breed data from Wikidata SPARQL and seeds wiki_rassen.""" import httpx, re, logging, os from database import db logger = logging.getLogger(__name__) MEDIA_DIR = os.getenv("MEDIA_DIR", "/data/media") BREEDS_DIR = os.path.join(MEDIA_DIR, "breeds") SPARQL_URL = "https://query.wikidata.org/sparql" # GROUP BY + SAMPLE so each breed appears once even if it has multiple images SPARQL_QUERY = """ SELECT ?breed (SAMPLE(?nameDE) AS ?nameDE) (SAMPLE(?nameEN) AS ?nameEN) (SAMPLE(?image) AS ?image) (SAMPLE(?countryDE) AS ?countryDE) (SAMPLE(?descDE) AS ?descDE) (SAMPLE(?descEN) AS ?descEN) WHERE { ?breed wdt:P31 wd:Q39367 . OPTIONAL { ?breed rdfs:label ?nameDE FILTER(LANG(?nameDE) = "de") } OPTIONAL { ?breed rdfs:label ?nameEN FILTER(LANG(?nameEN) = "en") } FILTER(BOUND(?nameDE) || BOUND(?nameEN)) OPTIONAL { ?breed wdt:P18 ?image } OPTIONAL { ?breed wdt:P495 ?country . ?country rdfs:label ?countryDE FILTER(LANG(?countryDE) = "de") } OPTIONAL { ?breed schema:description ?descDE FILTER(LANG(?descDE) = "de") } OPTIONAL { ?breed schema:description ?descEN FILTER(LANG(?descEN) = "en") } } GROUP BY ?breed ORDER BY ?nameDE ?nameEN """ def _slug(name: str) -> str: return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-') def _normalise(name: str) -> str: """Lowercase + remove diacritics for name deduplication.""" import unicodedata nfkd = unicodedata.normalize('NFKD', name.lower()) return re.sub(r'[^a-z0-9 ]', '', nfkd).strip() async def fetch_and_seed_wikidata_breeds(): """Query Wikidata for dog breeds and insert only those missing from wiki_rassen.""" # -- fetch from SPARQL ------------------------------------------------- try: async with httpx.AsyncClient( timeout=60, headers={"Accept": "application/sparql-results+json", "User-Agent": "BanYaro/1.0 (https://banyaro.app; contact@banyaro.app)"} ) as client: r = await client.get(SPARQL_URL, params={"query": SPARQL_QUERY}) r.raise_for_status() data = r.json() except Exception as e: logger.error(f"Wikidata SPARQL fetch failed: {e}") return 0 bindings = data.get("results", {}).get("bindings", []) logger.info(f"Wikidata: {len(bindings)} breed entries received") # -- load existing names for deduplication ----------------------------- with db() as conn: existing = conn.execute("SELECT name FROM wiki_rassen").fetchall() existing_norm = {_normalise(row[0]) for row in existing} seeded = 0 with db() as conn: for b in bindings: name = (b.get("nameDE", {}).get("value") or b.get("nameEN", {}).get("value") or "").strip() if not name: continue # skip if already in DB (by normalised name) if _normalise(name) in existing_norm: continue qid = b["breed"]["value"].rsplit("/", 1)[-1] # e.g. "Q312440" ext_id = f"wd_{qid}" image_url = b.get("image", {}).get("value") or None herkunft = b.get("countryDE", {}).get("value") or None desc = (b.get("descDE", {}).get("value") or b.get("descEN", {}).get("value") or None) slug_base = _slug(name) # make slug unique if collision exists slug = slug_base suffix = 1 while True: row = conn.execute( "SELECT 1 FROM wiki_rassen WHERE slug=? AND external_id != ?", (slug, ext_id) ).fetchone() if not row: break slug = f"{slug_base}-{suffix}" suffix += 1 try: conn.execute(""" INSERT INTO wiki_rassen (external_id, name, gruppe, herkunft, temperament, gewicht_min_kg, gewicht_max_kg, groesse, lebensdauer, foto_url, bred_for, aktivitaet, wohnung_geeignet, kinder_geeignet, erfahrung, slug) VALUES (?,?,?,?,?,NULL,NULL,'mittel',NULL,?,NULL,'mittel',0,1,'anfaenger',?) ON CONFLICT(external_id) DO UPDATE SET foto_url = CASE WHEN excluded.foto_url IS NOT NULL AND wiki_rassen.foto_url IS NULL THEN excluded.foto_url ELSE wiki_rassen.foto_url END, herkunft = COALESCE(wiki_rassen.herkunft, excluded.herkunft), temperament = COALESCE(wiki_rassen.temperament, excluded.temperament) """, (ext_id, name, None, herkunft, desc, image_url, slug)) existing_norm.add(_normalise(name)) # avoid re-inserting within same run seeded += 1 except Exception as e: logger.warning(f"Wikidata breed '{name}' seed failed: {e}") logger.info(f"Wikidata breeds seeded: {seeded}") return seeded async def mirror_wikidata_photos(): """Download Wikimedia Commons photos for Wikidata breeds that still have external URLs.""" os.makedirs(BREEDS_DIR, exist_ok=True) with db() as conn: rows = conn.execute( """SELECT id, external_id, foto_url FROM wiki_rassen WHERE external_id LIKE 'wd_%' AND foto_url LIKE 'http%' AND foto_url NOT LIKE '/media/%'""" ).fetchall() if not rows: logger.info("Wikidata photos: nothing to mirror") return 0 mirrored = 0 import asyncio async with httpx.AsyncClient( timeout=30, follow_redirects=True, headers={"User-Agent": "BanYaro/1.0 (https://banyaro.app)"} ) as client: for i, (row_id, ext_id, img_url) in enumerate(rows): qid = ext_id.replace("wd_", "") local_path = os.path.join(BREEDS_DIR, f"{qid}.jpg") local_url = f"/media/breeds/{qid}.jpg" if os.path.exists(local_path): with db() as conn: conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?", (local_url, row_id)) mirrored += 1 continue # Wikimedia Commons: append ?width=600 for scaled download fetch_url = img_url if "?" in img_url else img_url + "?width=600" retries = 2 for attempt in range(retries): try: await asyncio.sleep(0.3) # 300ms zwischen Requests → ~3/s r = await client.get(fetch_url) if r.status_code == 200 and r.headers.get("content-type", "").startswith("image"): with open(local_path, "wb") as f: f.write(r.content) with db() as conn: conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?", (local_url, row_id)) mirrored += 1 break elif r.status_code == 429: wait = 10 * (attempt + 1) logger.info(f"Rate limited, warte {wait}s…") await asyncio.sleep(wait) else: logger.warning(f"Wikidata photo {qid}: HTTP {r.status_code}") break except Exception as e: logger.warning(f"Wikidata photo {qid} failed: {e}") break if i % 50 == 0 and i > 0: logger.info(f"Wikidata photos: {mirrored}/{i+1} bisher") logger.info(f"Wikidata photos mirrored: {mirrored}/{len(rows)}") return mirrored