Scraper: fetch_wiki_images striktere Bildfilterung (kein PDF/Stadtfoto/Dokument)
This commit is contained in:
parent
93ea8a69fd
commit
fc4cfcf19b
1 changed files with 54 additions and 7 deletions
|
|
@ -37,9 +37,11 @@ _WP_HEADERS = {
|
||||||
}
|
}
|
||||||
_THUMB_SIZE = 600
|
_THUMB_SIZE = 600
|
||||||
|
|
||||||
|
# Nur diese Bildformate akzeptieren
|
||||||
|
_VALID_EXTENSIONS = (".jpg", ".jpeg", ".png", ".webp")
|
||||||
|
|
||||||
# Dateinamen-Fragmente, die auf unbrauchbare Bilder hindeuten
|
# Dateinamen-Fragmente, die auf unbrauchbare Bilder hindeuten
|
||||||
_SKIP_PATTERNS = (
|
_SKIP_PATTERNS = (
|
||||||
".svg",
|
|
||||||
"flag_of_",
|
"flag_of_",
|
||||||
"coat_of_arms",
|
"coat_of_arms",
|
||||||
"emblem_of_",
|
"emblem_of_",
|
||||||
|
|
@ -57,6 +59,35 @@ _SKIP_PATTERNS = (
|
||||||
"_icon",
|
"_icon",
|
||||||
"logo_",
|
"logo_",
|
||||||
"_logo",
|
"_logo",
|
||||||
|
# Historische Dokumente / Bücher / Karten
|
||||||
|
"_survey_",
|
||||||
|
"_register_",
|
||||||
|
"_magazine_",
|
||||||
|
"_journal_",
|
||||||
|
"_bulletin_",
|
||||||
|
"_catalogue_",
|
||||||
|
"_glossaire_",
|
||||||
|
"_aarbog_",
|
||||||
|
"_museum_",
|
||||||
|
"_tales_",
|
||||||
|
"_history_",
|
||||||
|
"_stories_",
|
||||||
|
"_inn_",
|
||||||
|
"curiosities",
|
||||||
|
"viviparous",
|
||||||
|
"quadrupeds",
|
||||||
|
# Geo / Städte
|
||||||
|
"spain.jpg",
|
||||||
|
"italy.jpg",
|
||||||
|
"france.jpg",
|
||||||
|
"germany.jpg",
|
||||||
|
"austria.jpg",
|
||||||
|
"map_of_",
|
||||||
|
# Militär / Dokumente
|
||||||
|
"military_working",
|
||||||
|
"mod_45",
|
||||||
|
"federal_register",
|
||||||
|
"prairie_dog", # ist kein Hund
|
||||||
)
|
)
|
||||||
|
|
||||||
# Suffixe die beim Normalisieren abgeschnitten werden
|
# Suffixe die beim Normalisieren abgeschnitten werden
|
||||||
|
|
@ -64,9 +95,21 @@ _BREED_SUFFIXES = (" dog", " hound", " terrier", " spaniel", " shepherd")
|
||||||
|
|
||||||
|
|
||||||
def _is_usable(url: str) -> bool:
|
def _is_usable(url: str) -> bool:
|
||||||
|
"""Gibt True zurück wenn die URL eine brauchbare Hundebild-URL ist."""
|
||||||
low = url.lower()
|
low = url.lower()
|
||||||
if low.endswith(".svg"):
|
|
||||||
|
# Nur echte Bildformate — keine PDFs, DjVu, Karten etc.
|
||||||
|
# Prüfe gegen den Dateinamen (nach dem letzten /)
|
||||||
|
fname = low.split("/")[-1].split("?")[0]
|
||||||
|
# Bei Thumbnail-URLs: Originaldatei-Endung extrahieren
|
||||||
|
# z.B. "960px-foo.jpg" → ".jpg" | "page1-500px-foo.pdf.jpg" → ablehnen
|
||||||
|
if ".pdf" in fname or ".djvu" in fname or ".svg" in fname:
|
||||||
return False
|
return False
|
||||||
|
if not any(fname.endswith(ext) or (ext + "/") in low for ext in _VALID_EXTENSIONS):
|
||||||
|
# Manchmal ist die Extension mitten in der URL (Thumbnail-Pfad)
|
||||||
|
if not any(ext in low for ext in _VALID_EXTENSIONS):
|
||||||
|
return False
|
||||||
|
|
||||||
for pattern in _SKIP_PATTERNS:
|
for pattern in _SKIP_PATTERNS:
|
||||||
if pattern in low:
|
if pattern in low:
|
||||||
return False
|
return False
|
||||||
|
|
@ -196,7 +239,9 @@ async def _commons_search(query: str, client: httpx.AsyncClient) -> str | None:
|
||||||
pages = resp2.json().get("query", {}).get("pages", {})
|
pages = resp2.json().get("query", {}).get("pages", {})
|
||||||
|
|
||||||
# Trefferqualität: bevorzuge Bilder die den Suchbegriff im Dateinamen haben
|
# Trefferqualität: bevorzuge Bilder die den Suchbegriff im Dateinamen haben
|
||||||
query_lower = query.lower().replace(" ", "_")
|
# und lehne Treffer ab, die keinen Hundebezug haben
|
||||||
|
query_words = [w for w in query.lower().split() if len(w) > 3]
|
||||||
|
best_score = -1
|
||||||
best: str | None = None
|
best: str | None = None
|
||||||
|
|
||||||
for page in pages.values():
|
for page in pages.values():
|
||||||
|
|
@ -207,12 +252,14 @@ async def _commons_search(query: str, client: httpx.AsyncClient) -> str | None:
|
||||||
if not thumb or not _is_usable(thumb):
|
if not thumb or not _is_usable(thumb):
|
||||||
continue
|
continue
|
||||||
fname = urllib.parse.unquote(thumb).lower()
|
fname = urllib.parse.unquote(thumb).lower()
|
||||||
if query_lower in fname and best is None:
|
# Score: wie viele Suchbegriff-Wörter sind im Dateinamen?
|
||||||
|
score = sum(1 for w in query_words if w in fname)
|
||||||
|
if score > best_score:
|
||||||
|
best_score = score
|
||||||
best = thumb
|
best = thumb
|
||||||
elif best is None:
|
|
||||||
best = thumb # Fallback: erster brauchbarer Treffer
|
|
||||||
|
|
||||||
return best
|
# Mindestens 1 Wort des Suchbegriffs muss im Dateinamen vorkommen
|
||||||
|
return best if best_score >= 1 else None
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.debug("Commons search (%s): %s", query, exc)
|
logger.debug("Commons search (%s): %s", query, exc)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue