09-image-choices

This commit is contained in:
2026-02-13 01:04:49 -05:00
parent 88a5540b7d
commit c8f98c54c9
11 changed files with 798 additions and 32 deletions

View File

@@ -13,6 +13,12 @@ UMAMI_WEBSITE_ID = os.getenv("UMAMI_WEBSITE_ID", "")
# Optional MCP endpoint; when set it is tried before the direct providers.
ROYALTY_IMAGE_MCP_ENDPOINT = os.getenv("ROYALTY_IMAGE_MCP_ENDPOINT", "")
ROYALTY_IMAGE_API_KEY = os.getenv("ROYALTY_IMAGE_API_KEY", "")
# Single-provider selector (default: picsum); still read by older code paths.
ROYALTY_IMAGE_PROVIDER = os.getenv("ROYALTY_IMAGE_PROVIDER", "picsum")
# Ordered, comma-separated chain of providers to try in sequence.
ROYALTY_IMAGE_PROVIDERS = os.getenv(
    "ROYALTY_IMAGE_PROVIDERS", "pixabay,unsplash,pexels,wikimedia,picsum"
)
# Per-provider credentials; a provider is skipped when its key is empty.
PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY", "")
UNSPLASH_ACCESS_KEY = os.getenv("UNSPLASH_ACCESS_KEY", "")
PEXELS_API_KEY = os.getenv("PEXELS_API_KEY", "")
# Summary verbosity, clamped to the range 1..5.
# NOTE(review): int() raises ValueError at import time if the env var is
# set to a non-integer — confirm that is the intended failure mode.
_summary_length_raw = int(os.getenv("SUMMARY_LENGTH_SCALE", "3"))
SUMMARY_LENGTH_SCALE = max(1, min(5, _summary_length_raw))

View File

@@ -3,6 +3,7 @@ import hashlib
import json
import logging
import os
import re
import time
from io import BytesIO
from urllib.parse import quote_plus
@@ -294,7 +295,299 @@ def build_fallback_summary(summary: str, source_url: str | None) -> dict:
}
# Stop words to remove from image search queries
_STOP_WORDS = frozenset(
[
"a",
"an",
"the",
"and",
"or",
"but",
"in",
"on",
"at",
"to",
"for",
"of",
"with",
"by",
"from",
"as",
"is",
"was",
"are",
"were",
"been",
"be",
"have",
"has",
"had",
"do",
"does",
"did",
"will",
"would",
"could",
"should",
"may",
"might",
"must",
"shall",
"can",
"need",
"it",
"its",
"this",
"that",
"these",
"those",
"i",
"you",
"he",
"she",
"we",
"they",
"what",
"which",
"who",
"whom",
"how",
"when",
"where",
"why",
"all",
"each",
"every",
"both",
"few",
"more",
"most",
"other",
"some",
"such",
"no",
"nor",
"not",
"only",
"own",
"same",
"so",
"than",
"too",
"very",
"just",
"also",
"now",
"here",
"there",
"about",
"after",
"before",
"above",
"below",
"between",
"into",
"through",
"during",
"under",
"again",
"further",
"then",
"once",
"announces",
"announced",
"says",
"said",
"reports",
"reported",
"reveals",
"revealed",
"launches",
"launched",
"introduces",
"introduced",
]
)
def extract_image_keywords(headline: str) -> str:
"""Extract relevant keywords from headline for image search.
- Removes stop words (articles, prepositions, common verbs)
- Limits to max 5 significant words
- Handles edge cases (empty, only stop words, special characters)
"""
if not headline or not headline.strip():
return "news technology"
# Normalize: remove special characters, keep alphanumeric and spaces
cleaned = re.sub(r"[^\w\s]", " ", headline)
# Split into words and lowercase
words = cleaned.lower().split()
# Filter out stop words and very short words
keywords = [w for w in words if w not in _STOP_WORDS and len(w) > 2]
# Limit to first 5 significant keywords
keywords = keywords[:5]
if not keywords:
return "news technology"
return " ".join(keywords)
async def fetch_pixabay_image(query: str) -> tuple[str | None, str | None]:
    """Search Pixabay for *query*.

    Returns (image_url, credit_line) on success, or (None, None) when no
    API key is configured, nothing matched, or the request failed.
    """
    if not config.PIXABAY_API_KEY:
        return None, None
    try:
        url = (
            f"https://pixabay.com/api/"
            f"?key={config.PIXABAY_API_KEY}"
            f"&q={quote_plus(query)}"
            f"&image_type=photo&per_page=3&safesearch=true"
        )
        async with httpx.AsyncClient(timeout=15.0) as client:
            resp = await client.get(url)
            resp.raise_for_status()
        hits = resp.json().get("hits", [])
        if hits:
            top = hits[0]
            image_url = top.get("webformatURL")
            if image_url:
                return str(image_url), f"Photo by {top.get('user', 'Unknown')} on Pixabay"
    except Exception:
        # Best effort: any failure just means "no image from this provider".
        logger.exception("Pixabay image retrieval failed")
    return None, None
async def fetch_unsplash_image(query: str) -> tuple[str | None, str | None]:
    """Search Unsplash for *query*.

    Returns (image_url, credit_line) on success, or (None, None) when no
    access key is configured, nothing matched, or the request failed.
    """
    if not config.UNSPLASH_ACCESS_KEY:
        return None, None
    try:
        url = f"https://api.unsplash.com/search/photos?query={quote_plus(query)}&per_page=3"
        auth = {"Authorization": f"Client-ID {config.UNSPLASH_ACCESS_KEY}"}
        async with httpx.AsyncClient(timeout=15.0) as client:
            resp = await client.get(url, headers=auth)
            resp.raise_for_status()
        results = resp.json().get("results", [])
        if results:
            first = results[0]
            image_url = first.get("urls", {}).get("regular")
            if image_url:
                name = first.get("user", {}).get("name", "Unknown")
                return str(image_url), f"Photo by {name} on Unsplash"
    except Exception:
        # Best effort: any failure just means "no image from this provider".
        logger.exception("Unsplash image retrieval failed")
    return None, None
async def fetch_pexels_image(query: str) -> tuple[str | None, str | None]:
    """Search Pexels for *query*.

    Returns (image_url, credit_line) on success, or (None, None) when no
    API key is configured, nothing matched, or the request failed.
    """
    if not config.PEXELS_API_KEY:
        return None, None
    try:
        url = f"https://api.pexels.com/v1/search?query={quote_plus(query)}&per_page=3"
        auth = {"Authorization": config.PEXELS_API_KEY}
        async with httpx.AsyncClient(timeout=15.0) as client:
            resp = await client.get(url, headers=auth)
            resp.raise_for_status()
        photos = resp.json().get("photos", [])
        if photos:
            top = photos[0]
            image_url = top.get("src", {}).get("large")
            if image_url:
                credit = f"Photo by {top.get('photographer', 'Unknown')} on Pexels"
                return str(image_url), credit
    except Exception:
        # Best effort: any failure just means "no image from this provider".
        logger.exception("Pexels image retrieval failed")
    return None, None
async def fetch_wikimedia_image(query: str) -> tuple[str | None, str | None]:
    """Search Wikimedia Commons (File namespace) for *query*.

    No API key needed.  Returns (image_url, "Wikimedia Commons") on
    success, or (None, None) when nothing matched or the request failed.
    """
    try:
        # gsrnamespace=6 restricts the generator to File: pages; gsrlimit=1
        # means at most one page comes back.
        search_url = (
            "https://commons.wikimedia.org/w/api.php"
            "?action=query&format=json&generator=search&gsrnamespace=6&gsrlimit=1"
            f"&gsrsearch={quote_plus(query[:120])}&prop=imageinfo&iiprop=url"
        )
        ua = {"User-Agent": "ClawFortBot/1.0 (news image enrichment)"}
        async with httpx.AsyncClient(timeout=15.0, headers=ua) as client:
            resp = await client.get(search_url)
            resp.raise_for_status()
        pages = resp.json().get("query", {}).get("pages", {})
        if pages:
            first_page = next(iter(pages.values()))
            infos = first_page.get("imageinfo", [])
            if infos and (file_url := infos[0].get("url")):
                return str(file_url), "Wikimedia Commons"
    except Exception:
        # Best effort: any failure just means "no image from this provider".
        logger.exception("Wikimedia image retrieval failed")
    return None, None
async def fetch_picsum_image(query: str) -> tuple[str | None, str | None]:
"""Generate deterministic Picsum image URL (always succeeds)."""
seed = hashlib.md5(query.encode("utf-8")).hexdigest()[:12]
return f"https://picsum.photos/seed/{seed}/1200/630", "Picsum Photos"
# Provider registry: maps provider names to (fetch_function, requires_api_key)
# Each value is (async fetch coroutine, zero-arg availability predicate).
# The predicate returns True when the provider is usable — i.e. its API key
# is configured, or it needs no key at all.
_PROVIDER_REGISTRY: dict[str, tuple] = {
    "pixabay": (fetch_pixabay_image, lambda: bool(config.PIXABAY_API_KEY)),
    "unsplash": (fetch_unsplash_image, lambda: bool(config.UNSPLASH_ACCESS_KEY)),
    "pexels": (fetch_pexels_image, lambda: bool(config.PEXELS_API_KEY)),
    "wikimedia": (fetch_wikimedia_image, lambda: True),  # No API key required
    "picsum": (fetch_picsum_image, lambda: True),  # Always available
}
def get_enabled_providers() -> list[tuple[str, callable]]:
    """Resolve the configured provider chain to usable fetch functions.

    Reads config.ROYALTY_IMAGE_PROVIDERS (ordered, comma-separated),
    keeps only names present in the registry whose availability check
    passes, and returns (name, fetch_function) pairs in config order.
    """
    enabled: list[tuple[str, callable]] = []
    for raw_name in config.ROYALTY_IMAGE_PROVIDERS.split(","):
        name = raw_name.strip().lower()
        if not name:
            continue  # tolerate stray commas / blank entries
        entry = _PROVIDER_REGISTRY.get(name)
        if entry is None:
            continue  # unknown provider names are silently ignored
        fetch_fn, available = entry
        if available():
            enabled.append((name, fetch_fn))
    return enabled
async def fetch_royalty_free_image(query: str) -> tuple[str | None, str | None]:
"""Fetch royalty-free image using provider chain with fallback."""
# MCP endpoint takes highest priority if configured
if config.ROYALTY_IMAGE_MCP_ENDPOINT:
try:
async with httpx.AsyncClient(timeout=15.0) as client:
@@ -311,35 +604,17 @@ async def fetch_royalty_free_image(query: str) -> tuple[str | None, str | None]:
except Exception:
logger.exception("MCP image retrieval failed")
if config.ROYALTY_IMAGE_PROVIDER.lower() == "wikimedia":
try:
encoded_query = quote_plus(query[:120])
search_url = (
"https://commons.wikimedia.org/w/api.php"
"?action=query&format=json&generator=search&gsrnamespace=6&gsrlimit=1"
f"&gsrsearch={encoded_query}&prop=imageinfo&iiprop=url"
)
async with httpx.AsyncClient(
timeout=15.0,
headers={"User-Agent": "ClawFortBot/1.0 (news image enrichment)"},
) as client:
response = await client.get(search_url)
response.raise_for_status()
data = response.json()
pages = data.get("query", {}).get("pages", {})
if pages:
first_page = next(iter(pages.values()))
infos = first_page.get("imageinfo", [])
if infos:
url = infos[0].get("url")
if url:
return str(url), "Wikimedia Commons"
except Exception:
logger.exception("Wikimedia image retrieval failed")
# Extract keywords for better image search
refined_query = extract_image_keywords(query)
if config.ROYALTY_IMAGE_PROVIDER.lower() == "picsum":
seed = hashlib.md5(query.encode("utf-8")).hexdigest()[:12]
return f"https://picsum.photos/seed/{seed}/1200/630", "Picsum Photos"
# Try each enabled provider in order
for provider_name, fetch_fn in get_enabled_providers():
try:
image_url, credit = await fetch_fn(refined_query)
if image_url:
return image_url, credit
except Exception:
logger.exception("%s image retrieval failed", provider_name.capitalize())
return None, None