Scraper: fetch_wiki_images striktere Bildfilterung (kein PDF/Stadtfoto/Dokument)
This commit is contained in:
parent
93ea8a69fd
commit
fc4cfcf19b
1 changed file with 54 additions and 7 deletions
|
|
@ -37,9 +37,11 @@ _WP_HEADERS = {
|
|||
}
|
||||
_THUMB_SIZE = 600
|
||||
|
||||
# Nur diese Bildformate akzeptieren
|
||||
_VALID_EXTENSIONS = (".jpg", ".jpeg", ".png", ".webp")
|
||||
|
||||
# Dateinamen-Fragmente, die auf unbrauchbare Bilder hindeuten
|
||||
_SKIP_PATTERNS = (
|
||||
".svg",
|
||||
"flag_of_",
|
||||
"coat_of_arms",
|
||||
"emblem_of_",
|
||||
|
|
@ -57,6 +59,35 @@ _SKIP_PATTERNS = (
|
|||
"_icon",
|
||||
"logo_",
|
||||
"_logo",
|
||||
# Historische Dokumente / Bücher / Karten
|
||||
"_survey_",
|
||||
"_register_",
|
||||
"_magazine_",
|
||||
"_journal_",
|
||||
"_bulletin_",
|
||||
"_catalogue_",
|
||||
"_glossaire_",
|
||||
"_aarbog_",
|
||||
"_museum_",
|
||||
"_tales_",
|
||||
"_history_",
|
||||
"_stories_",
|
||||
"_inn_",
|
||||
"curiosities",
|
||||
"viviparous",
|
||||
"quadrupeds",
|
||||
# Geo / Städte
|
||||
"spain.jpg",
|
||||
"italy.jpg",
|
||||
"france.jpg",
|
||||
"germany.jpg",
|
||||
"austria.jpg",
|
||||
"map_of_",
|
||||
# Militär / Dokumente
|
||||
"military_working",
|
||||
"mod_45",
|
||||
"federal_register",
|
||||
"prairie_dog", # ist kein Hund
|
||||
)
|
||||
|
||||
# Suffixe, die beim Normalisieren abgeschnitten werden
|
||||
|
|
@ -64,9 +95,21 @@ _BREED_SUFFIXES = (" dog", " hound", " terrier", " spaniel", " shepherd")
|
|||
|
||||
|
||||
def _is_usable(url: str) -> bool:
|
||||
"""Gibt True zurück wenn die URL eine brauchbare Hundebild-URL ist."""
|
||||
low = url.lower()
|
||||
if low.endswith(".svg"):
|
||||
|
||||
# Nur echte Bildformate — keine PDFs, DjVu, Karten etc.
|
||||
# Prüfe gegen den Dateinamen (nach dem letzten /)
|
||||
fname = low.split("/")[-1].split("?")[0]
|
||||
# Bei Thumbnail-URLs: Originaldatei-Endung extrahieren
|
||||
# z.B. "960px-foo.jpg" → ".jpg" | "page1-500px-foo.pdf.jpg" → ablehnen
|
||||
if ".pdf" in fname or ".djvu" in fname or ".svg" in fname:
|
||||
return False
|
||||
if not any(fname.endswith(ext) or (ext + "/") in low for ext in _VALID_EXTENSIONS):
|
||||
# Manchmal ist die Extension mitten in der URL (Thumbnail-Pfad)
|
||||
if not any(ext in low for ext in _VALID_EXTENSIONS):
|
||||
return False
|
||||
|
||||
for pattern in _SKIP_PATTERNS:
|
||||
if pattern in low:
|
||||
return False
|
||||
|
|
@ -196,7 +239,9 @@ async def _commons_search(query: str, client: httpx.AsyncClient) -> str | None:
|
|||
pages = resp2.json().get("query", {}).get("pages", {})
|
||||
|
||||
# Trefferqualität: bevorzuge Bilder, die den Suchbegriff im Dateinamen haben
|
||||
query_lower = query.lower().replace(" ", "_")
|
||||
# und lehne Treffer ab, die keinen Hundebezug haben
|
||||
query_words = [w for w in query.lower().split() if len(w) > 3]
|
||||
best_score = -1
|
||||
best: str | None = None
|
||||
|
||||
for page in pages.values():
|
||||
|
|
@ -207,12 +252,14 @@ async def _commons_search(query: str, client: httpx.AsyncClient) -> str | None:
|
|||
if not thumb or not _is_usable(thumb):
|
||||
continue
|
||||
fname = urllib.parse.unquote(thumb).lower()
|
||||
if query_lower in fname and best is None:
|
||||
# Score: wie viele Suchbegriff-Wörter sind im Dateinamen?
|
||||
score = sum(1 for w in query_words if w in fname)
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best = thumb
|
||||
elif best is None:
|
||||
best = thumb # Fallback: erster brauchbarer Treffer
|
||||
|
||||
return best
|
||||
# Mindestens 1 Wort des Suchbegriffs muss im Dateinamen vorkommen
|
||||
return best if best_score >= 1 else None
|
||||
|
||||
except Exception as exc:
|
||||
logger.debug("Commons search (%s): %s", query, exc)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue