banyaro/backend/scraper/breeds.py
rene 097295c628 Sprint 11: Freunde & Chat + Phosphor-Icon-Vollmigration
- Freundschaften (pending/accepted), Nutzersuche, Anfragen per Push
- Direktnachrichten mit Polling, iMessage-Stil, Deep-Links aus Push
- Alle Seiten (map, places, diary, health, dog-profile, sitting, knigge,
  forum, wiki, walks) vollständig auf Phosphor-Icons migriert
- Wikidata-Rassen-Scraper (~833 neue Rassen, lokal gespiegelte Fotos)
- TheDogAPI lokal gespiegelt (169 Rassen + Fotos)
- Quiz-Result-Cards horizontal (korrekte Bildproportionen)
- SW by-v89
2026-04-15 21:33:53 +02:00

138 lines
5.9 KiB
Python

"""Fetches breed data from TheDogAPI and seeds the wiki_rassen table."""
import httpx, re, logging, os
from database import db
# Root directory for stored media; the MEDIA_DIR environment variable
# overrides the default location.
MEDIA_DIR = os.environ.get("MEDIA_DIR", "/data/media")
# Sub-directory that holds the locally mirrored breed photos.
BREEDS_DIR = os.path.join(MEDIA_DIR, "breeds")
# Module-level logger shared by all functions in this file.
logger = logging.getLogger(__name__)
def _slug(name: str) -> str:
    """Return a URL-friendly slug for a breed name.

    The name is lower-cased, every run of characters outside ``a-z0-9`` is
    collapsed into a single hyphen, and leading/trailing hyphens are removed.
    """
    lowered = name.lower()
    hyphenated = re.sub(r'[^a-z0-9]+', '-', lowered)
    return hyphenated.strip('-')
def _derive_groesse(weight_max_kg: float) -> str:
    """Map a maximum weight in kilograms to a size category.

    Returns 'klein' up to 10 kg, 'mittel' up to 25 kg, 'gross' up to 40 kg
    (all bounds inclusive) and 'sehr_gross' for anything heavier.
    """
    # Ordered from lightest to heaviest; the first matching bound wins.
    size_limits = ((10, 'klein'), (25, 'mittel'), (40, 'gross'))
    for upper_kg, label in size_limits:
        if weight_max_kg <= upper_kg:
            return label
    return 'sehr_gross'
def _derive_aktivitaet(bred_for: str, temperament: str, group: str) -> str:
    """Estimate an activity level from free-text breed attributes.

    The three inputs (any of which may be empty or None) are combined into
    one lower-cased text that is searched for keyword substrings.

    Returns:
        'hoch' if any high-activity keyword occurs, otherwise 'niedrig' if
        any low-activity keyword occurs, otherwise 'mittel'.
    """
    haystack = "{} {} {}".format(
        bred_for or '', temperament or '', group or ''
    ).lower()

    # Rule order matters: high-activity keywords take precedence.
    keyword_rules = (
        ('hoch', ['herding', 'hunting', 'sporting', 'working', 'energetic', 'active', 'agile']),
        ('niedrig', ['companion', 'toy', 'lap', 'gentle', 'calm', 'quiet']),
    )
    for level, keywords in keyword_rules:
        for word in keywords:
            if word in haystack:
                return level
    return 'mittel'
def _derive_erfahrung(temperament: str, group: str) -> str:
    """Estimate the owner experience level a breed calls for.

    Temperament and breed group (either may be empty or None) are combined
    into one lower-cased text that is searched for keyword substrings.

    Returns:
        'fortgeschritten' if any demanding-breed keyword occurs, otherwise
        'anfaenger'.
    """
    haystack = "{} {}".format(temperament or '', group or '').lower()

    # NOTE(review): the two keyword tiers below both map to
    # 'fortgeschritten'; a distinct label for the first tier may have been
    # intended -- confirm against the consumers of this value.
    expert_markers = ('stubborn', 'independent', 'dominant', 'terrier', 'herding')
    advanced_markers = ('protective', 'reserved', 'working', 'guard')

    for marker in expert_markers + advanced_markers:
        if marker in haystack:
            return 'fortgeschritten'
    return 'anfaenger'
def _derive_kinder(temperament: str) -> int:
    """Return 1 if the breed is considered suitable for children, else 0.

    A missing or empty temperament counts as suitable. Otherwise the breed
    is unsuitable as soon as one of the warning phrases occurs in the
    lower-cased temperament text.
    """
    if not temperament:
        return 1

    lowered = temperament.lower()
    for warning_phrase in ('aggressive', 'aloof', 'reserved with strangers'):
        if warning_phrase in lowered:
            return 0
    return 1
def _parse_weight_kg(weight_metric: str):
    """Parse a metric weight string such as '7 - 14' or '14' into (min, max) kg.

    A decimal comma is accepted ('6,5 - 9'). The range separator may be a
    hyphen, an en dash or an em dash. Each side of the range is parsed on its
    own: a side that is not a plain non-negative decimal number (for example
    the literal 'NaN') becomes None instead of leaking a NaN/inf float or
    discarding the other, valid side.

    Args:
        weight_metric: Raw weight text; anything that is not a string is
            treated as "no data".

    Returns:
        A ``(min_kg, max_kg)`` tuple. A single number yields ``(n, n)``; an
        entry is None where the value is unknown, and ``(None, None)`` is
        returned when nothing usable is found.
    """
    if not isinstance(weight_metric, str):
        return None, None

    raw_tokens = re.split(r'[-\u2013\u2014]', weight_metric.replace(',', '.'))
    tokens = [tok.strip() for tok in raw_tokens]

    # Only plain decimals count; this deliberately rejects 'nan', 'inf' and
    # exponent notation, all of which float() would otherwise accept.
    values = [
        float(tok) if re.fullmatch(r'[0-9]+\.?[0-9]*|\.[0-9]+', tok) else None
        for tok in tokens
        if tok
    ]

    if not values:
        return None, None
    if len(values) == 1:
        return values[0], values[0]
    return values[0], values[1]
async def mirror_breed_photos():
    """Download remotely hosted breed photos and point foto_url at the local copy.

    Every wiki_rassen row whose foto_url starts with "http" is processed: the
    image is stored as ``<BREEDS_DIR>/<external_id>.jpg`` and the row's
    foto_url is rewritten to ``/media/breeds/<external_id>.jpg``. If the
    local file already exists, only the database row is updated. Failures for
    a single photo are logged as warnings and do not stop the run.

    Returns:
        The number of rows whose foto_url was switched to the local URL
        (0 when there is nothing to mirror).
    """
    os.makedirs(BREEDS_DIR, exist_ok=True)
    with db() as conn:
        rows = conn.execute(
            "SELECT id, external_id, foto_url FROM wiki_rassen WHERE foto_url LIKE 'http%' AND foto_url NOT LIKE '/media/%'"
        ).fetchall()
    if not rows:
        logger.info("Breed photos: nothing to mirror")
        return 0

    def _point_to_local(row_id, local_url):
        # One short db() context per row, as before, so progress made so far
        # is not tied to the photos that follow.
        # NOTE(review): presumably db() commits on exit -- confirm.
        with db() as conn:
            conn.execute("UPDATE wiki_rassen SET foto_url=? WHERE id=?", (local_url, row_id))

    mirrored = 0
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        for row_id, ext_id, cdn_url in rows:
            local_path = os.path.join(BREEDS_DIR, f"{ext_id}.jpg")
            local_url = f"/media/breeds/{ext_id}.jpg"

            # Skip the download if the file is already present.
            if os.path.exists(local_path):
                _point_to_local(row_id, local_url)
                mirrored += 1
                continue

            try:
                r = await client.get(cdn_url)
                if r.status_code != 200:
                    logger.warning(f"Breed photo {ext_id}: HTTP {r.status_code}")
                    continue
                if not r.content:
                    # An empty file would later be mistaken for a mirrored photo.
                    logger.warning(f"Breed photo {ext_id}: empty response body")
                    continue

                # Write to a temporary name and rename into place, so that an
                # interrupted write never leaves a truncated file at
                # local_path for the "already downloaded" shortcut to accept.
                tmp_path = f"{local_path}.part"
                try:
                    with open(tmp_path, "wb") as f:
                        f.write(r.content)
                    os.replace(tmp_path, local_path)
                finally:
                    if os.path.exists(tmp_path):
                        os.remove(tmp_path)

                _point_to_local(row_id, local_url)
                mirrored += 1
            except Exception as e:
                # Best effort: one broken photo must not abort the whole run.
                logger.warning(f"Breed photo {ext_id} download failed: {e}")

    logger.info(f"Breed photos mirrored: {mirrored}/{len(rows)}")
    return mirrored
async def fetch_and_seed_breeds():
    """Fetch all breeds from TheDogAPI and upsert them into wiki_rassen.

    The API key is read from the THEDOGAPI_KEY environment variable. Size,
    activity level, required experience, child suitability and flat
    suitability are derived from the API fields. Rows are keyed on
    external_id; on conflict only temperament and foto_url are refreshed,
    and a foto_url that already points at a local ``/media/`` file is kept.

    Returns:
        The number of breeds written. 0 is returned when the API request
        fails or the payload is not a list. A breed that cannot be processed
        is logged as a warning and skipped.
    """
    api_key = os.getenv("THEDOGAPI_KEY", "")
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            r = await client.get('https://api.thedogapi.com/v1/breeds',
                                 headers={'x-api-key': api_key})
            r.raise_for_status()
            breeds = r.json()
    except Exception as e:
        logger.error(f"TheDogAPI fetch failed: {e}")
        return 0

    if not isinstance(breeds, list):
        # e.g. an error object instead of the breed list
        logger.error(f"TheDogAPI fetch failed: unexpected payload type {type(breeds).__name__}")
        return 0

    seeded = 0
    with db() as conn:
        for b in breeds:
            try:
                temperament = b.get('temperament')
                breed_group = b.get('breed_group')
                bred_for = b.get('bred_for')

                # 'weight' / 'image' may be present but null, so fall back
                # with `or {}` instead of relying on the .get() default.
                weight = b.get('weight') or {}
                image = b.get('image') or {}

                w_min, w_max = _parse_weight_kg(weight.get('metric') or '')
                # Assume 20 kg when the maximum is missing, zero or NaN
                # (NaN is the only value not equal to itself).
                size_basis = w_max if (w_max and w_max == w_max) else 20
                groesse = _derive_groesse(size_basis)
                aktivitaet = _derive_aktivitaet(bred_for, temperament, breed_group)
                erfahrung = _derive_erfahrung(temperament, breed_group)
                kinder = _derive_kinder(temperament)
                wohnung = 1 if groesse == 'klein' and aktivitaet in ('niedrig', 'mittel') else 0
                foto_url = image.get('url') or None
                slug = _slug(b['name'])

                # On conflict, keep a foto_url that already points at a
                # locally mirrored file, and never replace an existing URL
                # with NULL when the API delivers no image.
                conn.execute("""
                    INSERT INTO wiki_rassen
                    (external_id, name, gruppe, herkunft, temperament,
                    gewicht_min_kg, gewicht_max_kg, groesse, lebensdauer,
                    foto_url, bred_for, aktivitaet, wohnung_geeignet,
                    kinder_geeignet, erfahrung, slug)
                    VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
                    ON CONFLICT(external_id) DO UPDATE SET
                    foto_url=CASE
                        WHEN wiki_rassen.foto_url LIKE '/media/%' THEN wiki_rassen.foto_url
                        ELSE COALESCE(excluded.foto_url, wiki_rassen.foto_url)
                    END,
                    temperament=excluded.temperament
                """, (
                    b['id'], b['name'],
                    breed_group, b.get('origin'), temperament,
                    w_min, w_max, groesse, b.get('life_span'),
                    foto_url, bred_for, aktivitaet, wohnung, kinder, erfahrung, slug
                ))
                seeded += 1
            except Exception as e:
                # Best effort per breed. Guard the name lookup so a non-dict
                # entry cannot raise again inside this handler.
                breed_name = b.get('name') if isinstance(b, dict) else b
                logger.warning(f"Breed {breed_name} seed failed: {e}")

    logger.info(f"Breeds seeded: {seeded}")
    return seeded