09-image-choices

This commit is contained in:
2026-02-13 01:04:49 -05:00
parent 88a5540b7d
commit c8f98c54c9
11 changed files with 798 additions and 32 deletions

View File

@@ -13,6 +13,12 @@ UMAMI_WEBSITE_ID = os.getenv("UMAMI_WEBSITE_ID", "")
# Optional MCP endpoint; when set it is tried before the direct providers.
ROYALTY_IMAGE_MCP_ENDPOINT = os.getenv("ROYALTY_IMAGE_MCP_ENDPOINT", "")
ROYALTY_IMAGE_API_KEY = os.getenv("ROYALTY_IMAGE_API_KEY", "")
# Single-provider selector (default: picsum); still read by older code paths.
ROYALTY_IMAGE_PROVIDER = os.getenv("ROYALTY_IMAGE_PROVIDER", "picsum")
# Ordered, comma-separated chain of providers to try in sequence.
ROYALTY_IMAGE_PROVIDERS = os.getenv(
    "ROYALTY_IMAGE_PROVIDERS", "pixabay,unsplash,pexels,wikimedia,picsum"
)
# Per-provider credentials; a provider is skipped when its key is empty.
PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY", "")
UNSPLASH_ACCESS_KEY = os.getenv("UNSPLASH_ACCESS_KEY", "")
PEXELS_API_KEY = os.getenv("PEXELS_API_KEY", "")
# Summary verbosity, clamped to the range 1..5.
# NOTE(review): int() raises ValueError at import time if the env var is
# set to a non-integer — confirm that is the intended failure mode.
_summary_length_raw = int(os.getenv("SUMMARY_LENGTH_SCALE", "3"))
SUMMARY_LENGTH_SCALE = max(1, min(5, _summary_length_raw))

View File

@@ -3,6 +3,7 @@ import hashlib
import json
import logging
import os
import re
import time
from io import BytesIO
from urllib.parse import quote_plus
@@ -294,7 +295,299 @@ def build_fallback_summary(summary: str, source_url: str | None) -> dict:
}
# Stop words to remove from image search queries
_STOP_WORDS = frozenset(
[
"a",
"an",
"the",
"and",
"or",
"but",
"in",
"on",
"at",
"to",
"for",
"of",
"with",
"by",
"from",
"as",
"is",
"was",
"are",
"were",
"been",
"be",
"have",
"has",
"had",
"do",
"does",
"did",
"will",
"would",
"could",
"should",
"may",
"might",
"must",
"shall",
"can",
"need",
"it",
"its",
"this",
"that",
"these",
"those",
"i",
"you",
"he",
"she",
"we",
"they",
"what",
"which",
"who",
"whom",
"how",
"when",
"where",
"why",
"all",
"each",
"every",
"both",
"few",
"more",
"most",
"other",
"some",
"such",
"no",
"nor",
"not",
"only",
"own",
"same",
"so",
"than",
"too",
"very",
"just",
"also",
"now",
"here",
"there",
"about",
"after",
"before",
"above",
"below",
"between",
"into",
"through",
"during",
"under",
"again",
"further",
"then",
"once",
"announces",
"announced",
"says",
"said",
"reports",
"reported",
"reveals",
"revealed",
"launches",
"launched",
"introduces",
"introduced",
]
)
def extract_image_keywords(headline: str) -> str:
"""Extract relevant keywords from headline for image search.
- Removes stop words (articles, prepositions, common verbs)
- Limits to max 5 significant words
- Handles edge cases (empty, only stop words, special characters)
"""
if not headline or not headline.strip():
return "news technology"
# Normalize: remove special characters, keep alphanumeric and spaces
cleaned = re.sub(r"[^\w\s]", " ", headline)
# Split into words and lowercase
words = cleaned.lower().split()
# Filter out stop words and very short words
keywords = [w for w in words if w not in _STOP_WORDS and len(w) > 2]
# Limit to first 5 significant keywords
keywords = keywords[:5]
if not keywords:
return "news technology"
return " ".join(keywords)
async def fetch_pixabay_image(query: str) -> tuple[str | None, str | None]:
    """Search Pixabay for *query*.

    Returns (image_url, credit_line) on success, or (None, None) when no
    API key is configured, nothing matched, or the request failed.
    """
    if not config.PIXABAY_API_KEY:
        return None, None
    try:
        url = (
            f"https://pixabay.com/api/"
            f"?key={config.PIXABAY_API_KEY}"
            f"&q={quote_plus(query)}"
            f"&image_type=photo&per_page=3&safesearch=true"
        )
        async with httpx.AsyncClient(timeout=15.0) as client:
            resp = await client.get(url)
            resp.raise_for_status()
        hits = resp.json().get("hits", [])
        if hits:
            top = hits[0]
            image_url = top.get("webformatURL")
            if image_url:
                return str(image_url), f"Photo by {top.get('user', 'Unknown')} on Pixabay"
    except Exception:
        # Best effort: any failure just means "no image from this provider".
        logger.exception("Pixabay image retrieval failed")
    return None, None
async def fetch_unsplash_image(query: str) -> tuple[str | None, str | None]:
    """Search Unsplash for *query*.

    Returns (image_url, credit_line) on success, or (None, None) when no
    access key is configured, nothing matched, or the request failed.
    """
    if not config.UNSPLASH_ACCESS_KEY:
        return None, None
    try:
        url = f"https://api.unsplash.com/search/photos?query={quote_plus(query)}&per_page=3"
        auth = {"Authorization": f"Client-ID {config.UNSPLASH_ACCESS_KEY}"}
        async with httpx.AsyncClient(timeout=15.0) as client:
            resp = await client.get(url, headers=auth)
            resp.raise_for_status()
        results = resp.json().get("results", [])
        if results:
            first = results[0]
            image_url = first.get("urls", {}).get("regular")
            if image_url:
                name = first.get("user", {}).get("name", "Unknown")
                return str(image_url), f"Photo by {name} on Unsplash"
    except Exception:
        # Best effort: any failure just means "no image from this provider".
        logger.exception("Unsplash image retrieval failed")
    return None, None
async def fetch_pexels_image(query: str) -> tuple[str | None, str | None]:
    """Search Pexels for *query*.

    Returns (image_url, credit_line) on success, or (None, None) when no
    API key is configured, nothing matched, or the request failed.
    """
    if not config.PEXELS_API_KEY:
        return None, None
    try:
        url = f"https://api.pexels.com/v1/search?query={quote_plus(query)}&per_page=3"
        auth = {"Authorization": config.PEXELS_API_KEY}
        async with httpx.AsyncClient(timeout=15.0) as client:
            resp = await client.get(url, headers=auth)
            resp.raise_for_status()
        photos = resp.json().get("photos", [])
        if photos:
            top = photos[0]
            image_url = top.get("src", {}).get("large")
            if image_url:
                credit = f"Photo by {top.get('photographer', 'Unknown')} on Pexels"
                return str(image_url), credit
    except Exception:
        # Best effort: any failure just means "no image from this provider".
        logger.exception("Pexels image retrieval failed")
    return None, None
async def fetch_wikimedia_image(query: str) -> tuple[str | None, str | None]:
    """Search Wikimedia Commons (File namespace) for *query*.

    No API key needed.  Returns (image_url, "Wikimedia Commons") on
    success, or (None, None) when nothing matched or the request failed.
    """
    try:
        # gsrnamespace=6 restricts the generator to File: pages; gsrlimit=1
        # means at most one page comes back.
        search_url = (
            "https://commons.wikimedia.org/w/api.php"
            "?action=query&format=json&generator=search&gsrnamespace=6&gsrlimit=1"
            f"&gsrsearch={quote_plus(query[:120])}&prop=imageinfo&iiprop=url"
        )
        ua = {"User-Agent": "ClawFortBot/1.0 (news image enrichment)"}
        async with httpx.AsyncClient(timeout=15.0, headers=ua) as client:
            resp = await client.get(search_url)
            resp.raise_for_status()
        pages = resp.json().get("query", {}).get("pages", {})
        if pages:
            first_page = next(iter(pages.values()))
            infos = first_page.get("imageinfo", [])
            if infos and (file_url := infos[0].get("url")):
                return str(file_url), "Wikimedia Commons"
    except Exception:
        # Best effort: any failure just means "no image from this provider".
        logger.exception("Wikimedia image retrieval failed")
    return None, None
async def fetch_picsum_image(query: str) -> tuple[str | None, str | None]:
"""Generate deterministic Picsum image URL (always succeeds)."""
seed = hashlib.md5(query.encode("utf-8")).hexdigest()[:12]
return f"https://picsum.photos/seed/{seed}/1200/630", "Picsum Photos"
# Provider registry: maps provider names to (fetch_function, requires_api_key)
# Each value is (async fetch coroutine, zero-arg availability predicate).
# The predicate returns True when the provider is usable — i.e. its API key
# is configured, or it needs no key at all.
_PROVIDER_REGISTRY: dict[str, tuple] = {
    "pixabay": (fetch_pixabay_image, lambda: bool(config.PIXABAY_API_KEY)),
    "unsplash": (fetch_unsplash_image, lambda: bool(config.UNSPLASH_ACCESS_KEY)),
    "pexels": (fetch_pexels_image, lambda: bool(config.PEXELS_API_KEY)),
    "wikimedia": (fetch_wikimedia_image, lambda: True),  # No API key required
    "picsum": (fetch_picsum_image, lambda: True),  # Always available
}
def get_enabled_providers() -> list[tuple[str, callable]]:
    """Resolve the configured provider chain to usable fetch functions.

    Reads config.ROYALTY_IMAGE_PROVIDERS (ordered, comma-separated),
    keeps only names present in the registry whose availability check
    passes, and returns (name, fetch_function) pairs in config order.
    """
    enabled: list[tuple[str, callable]] = []
    for raw_name in config.ROYALTY_IMAGE_PROVIDERS.split(","):
        name = raw_name.strip().lower()
        if not name:
            continue  # tolerate stray commas / blank entries
        entry = _PROVIDER_REGISTRY.get(name)
        if entry is None:
            continue  # unknown provider names are silently ignored
        fetch_fn, available = entry
        if available():
            enabled.append((name, fetch_fn))
    return enabled
async def fetch_royalty_free_image(query: str) -> tuple[str | None, str | None]:
"""Fetch royalty-free image using provider chain with fallback."""
# MCP endpoint takes highest priority if configured
if config.ROYALTY_IMAGE_MCP_ENDPOINT:
try:
async with httpx.AsyncClient(timeout=15.0) as client:
@@ -311,35 +604,17 @@ async def fetch_royalty_free_image(query: str) -> tuple[str | None, str | None]:
except Exception:
logger.exception("MCP image retrieval failed")
if config.ROYALTY_IMAGE_PROVIDER.lower() == "wikimedia":
try:
encoded_query = quote_plus(query[:120])
search_url = (
"https://commons.wikimedia.org/w/api.php"
"?action=query&format=json&generator=search&gsrnamespace=6&gsrlimit=1"
f"&gsrsearch={encoded_query}&prop=imageinfo&iiprop=url"
)
async with httpx.AsyncClient(
timeout=15.0,
headers={"User-Agent": "ClawFortBot/1.0 (news image enrichment)"},
) as client:
response = await client.get(search_url)
response.raise_for_status()
data = response.json()
pages = data.get("query", {}).get("pages", {})
if pages:
first_page = next(iter(pages.values()))
infos = first_page.get("imageinfo", [])
if infos:
url = infos[0].get("url")
if url:
return str(url), "Wikimedia Commons"
except Exception:
logger.exception("Wikimedia image retrieval failed")
# Extract keywords for better image search
refined_query = extract_image_keywords(query)
if config.ROYALTY_IMAGE_PROVIDER.lower() == "picsum":
seed = hashlib.md5(query.encode("utf-8")).hexdigest()[:12]
return f"https://picsum.photos/seed/{seed}/1200/630", "Picsum Photos"
# Try each enabled provider in order
for provider_name, fetch_fn in get_enabled_providers():
try:
image_url, credit = await fetch_fn(refined_query)
if image_url:
return image_url, credit
except Exception:
logger.exception("%s image retrieval failed", provider_name.capitalize())
return None, None