Scraper: Commons-Suche nur bei vollständigem Namensmatch im Dateinamen
This commit is contained in:
parent
fc4cfcf19b
commit
26074a42db
1 changed files with 10 additions and 7 deletions
|
|
@@ -238,10 +238,11 @@ async def _commons_search(query: str, client: httpx.AsyncClient) -> str | None:
|
||||||
resp2.raise_for_status()
|
resp2.raise_for_status()
|
||||||
pages = resp2.json().get("query", {}).get("pages", {})
|
pages = resp2.json().get("query", {}).get("pages", {})
|
||||||
|
|
||||||
# Trefferqualität: bevorzuge Bilder die den Suchbegriff im Dateinamen haben
|
# Qualitätsprüfung: Dateiname muss ALLE signifikanten Wörter enthalten
|
||||||
# und lehne Treffer ab die keine Hundbezug haben
|
# (verhindert Fehlmatches wie "Afghan" → Kind aus Afghanistan)
|
||||||
query_words = [w for w in query.lower().split() if len(w) > 3]
|
query_words = [w for w in query.lower().split() if len(w) > 3]
|
||||||
best_score = -1
|
needed_score = len(query_words) # alle Wörter müssen vorkommen
|
||||||
|
best_score = -1
|
||||||
best: str | None = None
|
best: str | None = None
|
||||||
|
|
||||||
for page in pages.values():
|
for page in pages.values():
|
||||||
|
|
@@ -252,14 +253,16 @@ async def _commons_search(query: str, client: httpx.AsyncClient) -> str | None:
|
||||||
if not thumb or not _is_usable(thumb):
|
if not thumb or not _is_usable(thumb):
|
||||||
continue
|
continue
|
||||||
fname = urllib.parse.unquote(thumb).lower()
|
fname = urllib.parse.unquote(thumb).lower()
|
||||||
# Score: wie viele Suchbegriff-Wörter sind im Dateinamen?
|
|
||||||
score = sum(1 for w in query_words if w in fname)
|
score = sum(1 for w in query_words if w in fname)
|
||||||
if score > best_score:
|
if score > best_score:
|
||||||
best_score = score
|
best_score = score
|
||||||
best = thumb
|
best = thumb
|
||||||
|
|
||||||
# Mindestens 1 Wort des Suchbegriffs muss im Dateinamen vorkommen
|
# Alle signifikanten Wörter müssen im Dateinamen vorkommen
|
||||||
return best if best_score >= 1 else None
|
# Bei 1-Wort-Suchen: Mindestens 1 Match
|
||||||
|
if needed_score == 0:
|
||||||
|
return None
|
||||||
|
return best if best_score >= needed_score else None
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.debug("Commons search (%s): %s", query, exc)
|
logger.debug("Commons search (%s): %s", query, exc)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue