09-image-choices
This commit is contained in:
@@ -13,6 +13,12 @@ UMAMI_WEBSITE_ID = os.getenv("UMAMI_WEBSITE_ID", "")
|
||||
# --- Royalty-free image sourcing configuration ---

# Optional MCP endpoint; when set it is tried before the provider chain.
ROYALTY_IMAGE_MCP_ENDPOINT = os.getenv("ROYALTY_IMAGE_MCP_ENDPOINT", "")
# Credential for the MCP endpoint, if it requires one.
ROYALTY_IMAGE_API_KEY = os.getenv("ROYALTY_IMAGE_API_KEY", "")
# Single-provider selector; default "picsum" needs no API key.
ROYALTY_IMAGE_PROVIDER = os.getenv("ROYALTY_IMAGE_PROVIDER", "picsum")
# Ordered, comma-separated provider fallback chain (tried left to right).
ROYALTY_IMAGE_PROVIDERS = os.getenv(
    "ROYALTY_IMAGE_PROVIDERS", "pixabay,unsplash,pexels,wikimedia,picsum"
)
# Per-provider credentials; an empty value disables that provider.
PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY", "")
UNSPLASH_ACCESS_KEY = os.getenv("UNSPLASH_ACCESS_KEY", "")
PEXELS_API_KEY = os.getenv("PEXELS_API_KEY", "")

# Summary length scale, clamped to the supported 1..5 range.
_summary_length_raw = int(os.getenv("SUMMARY_LENGTH_SCALE", "3"))
SUMMARY_LENGTH_SCALE = max(1, min(5, _summary_length_raw))
|
||||
|
||||
@@ -3,6 +3,7 @@ import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from io import BytesIO
|
||||
from urllib.parse import quote_plus
|
||||
@@ -294,7 +295,299 @@ def build_fallback_summary(summary: str, source_url: str | None) -> dict:
|
||||
}
|
||||
|
||||
|
||||
# Stop words to remove from image search queries
|
||||
_STOP_WORDS = frozenset(
|
||||
[
|
||||
"a",
|
||||
"an",
|
||||
"the",
|
||||
"and",
|
||||
"or",
|
||||
"but",
|
||||
"in",
|
||||
"on",
|
||||
"at",
|
||||
"to",
|
||||
"for",
|
||||
"of",
|
||||
"with",
|
||||
"by",
|
||||
"from",
|
||||
"as",
|
||||
"is",
|
||||
"was",
|
||||
"are",
|
||||
"were",
|
||||
"been",
|
||||
"be",
|
||||
"have",
|
||||
"has",
|
||||
"had",
|
||||
"do",
|
||||
"does",
|
||||
"did",
|
||||
"will",
|
||||
"would",
|
||||
"could",
|
||||
"should",
|
||||
"may",
|
||||
"might",
|
||||
"must",
|
||||
"shall",
|
||||
"can",
|
||||
"need",
|
||||
"it",
|
||||
"its",
|
||||
"this",
|
||||
"that",
|
||||
"these",
|
||||
"those",
|
||||
"i",
|
||||
"you",
|
||||
"he",
|
||||
"she",
|
||||
"we",
|
||||
"they",
|
||||
"what",
|
||||
"which",
|
||||
"who",
|
||||
"whom",
|
||||
"how",
|
||||
"when",
|
||||
"where",
|
||||
"why",
|
||||
"all",
|
||||
"each",
|
||||
"every",
|
||||
"both",
|
||||
"few",
|
||||
"more",
|
||||
"most",
|
||||
"other",
|
||||
"some",
|
||||
"such",
|
||||
"no",
|
||||
"nor",
|
||||
"not",
|
||||
"only",
|
||||
"own",
|
||||
"same",
|
||||
"so",
|
||||
"than",
|
||||
"too",
|
||||
"very",
|
||||
"just",
|
||||
"also",
|
||||
"now",
|
||||
"here",
|
||||
"there",
|
||||
"about",
|
||||
"after",
|
||||
"before",
|
||||
"above",
|
||||
"below",
|
||||
"between",
|
||||
"into",
|
||||
"through",
|
||||
"during",
|
||||
"under",
|
||||
"again",
|
||||
"further",
|
||||
"then",
|
||||
"once",
|
||||
"announces",
|
||||
"announced",
|
||||
"says",
|
||||
"said",
|
||||
"reports",
|
||||
"reported",
|
||||
"reveals",
|
||||
"revealed",
|
||||
"launches",
|
||||
"launched",
|
||||
"introduces",
|
||||
"introduced",
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def extract_image_keywords(headline: str) -> str:
|
||||
"""Extract relevant keywords from headline for image search.
|
||||
|
||||
- Removes stop words (articles, prepositions, common verbs)
|
||||
- Limits to max 5 significant words
|
||||
- Handles edge cases (empty, only stop words, special characters)
|
||||
"""
|
||||
if not headline or not headline.strip():
|
||||
return "news technology"
|
||||
|
||||
# Normalize: remove special characters, keep alphanumeric and spaces
|
||||
cleaned = re.sub(r"[^\w\s]", " ", headline)
|
||||
# Split into words and lowercase
|
||||
words = cleaned.lower().split()
|
||||
|
||||
# Filter out stop words and very short words
|
||||
keywords = [w for w in words if w not in _STOP_WORDS and len(w) > 2]
|
||||
|
||||
# Limit to first 5 significant keywords
|
||||
keywords = keywords[:5]
|
||||
|
||||
if not keywords:
|
||||
return "news technology"
|
||||
|
||||
return " ".join(keywords)
|
||||
|
||||
|
||||
async def fetch_pixabay_image(query: str) -> tuple[str | None, str | None]:
    """Look up a royalty-free photo on Pixabay for *query*.

    Returns ``(image_url, attribution)`` on success, or ``(None, None)``
    when the API key is missing, no hit is found, or the request fails.
    """
    if not config.PIXABAY_API_KEY:
        # Provider is effectively disabled without credentials.
        return None, None

    try:
        request_url = (
            "https://pixabay.com/api/"
            f"?key={config.PIXABAY_API_KEY}"
            f"&q={quote_plus(query)}"
            "&image_type=photo&per_page=3&safesearch=true"
        )
        async with httpx.AsyncClient(timeout=15.0) as client:
            resp = await client.get(request_url)
            resp.raise_for_status()
            payload = resp.json()

        # Only the top hit is considered.
        for hit in payload.get("hits", [])[:1]:
            picture = hit.get("webformatURL")
            if picture:
                author = hit.get("user", "Unknown")
                return str(picture), f"Photo by {author} on Pixabay"
    except Exception:
        # Best-effort: any failure just yields (None, None) to the caller.
        logger.exception("Pixabay image retrieval failed")

    return None, None
|
||||
|
||||
|
||||
async def fetch_unsplash_image(query: str) -> tuple[str | None, str | None]:
    """Look up a royalty-free photo on Unsplash for *query*.

    Returns ``(image_url, attribution)`` or ``(None, None)`` when no
    access key is configured, nothing matches, or the request errors out.
    """
    if not config.UNSPLASH_ACCESS_KEY:
        return None, None

    try:
        search_url = (
            "https://api.unsplash.com/search/photos"
            f"?query={quote_plus(query)}&per_page=3"
        )
        auth_headers = {"Authorization": f"Client-ID {config.UNSPLASH_ACCESS_KEY}"}

        async with httpx.AsyncClient(timeout=15.0) as client:
            resp = await client.get(search_url, headers=auth_headers)
            resp.raise_for_status()
            payload = resp.json()

        matches = payload.get("results", [])
        if matches:
            top = matches[0]
            picture = top.get("urls", {}).get("regular")
            if picture:
                author = top.get("user", {}).get("name", "Unknown")
                return str(picture), f"Photo by {author} on Unsplash"
    except Exception:
        logger.exception("Unsplash image retrieval failed")

    return None, None
|
||||
|
||||
|
||||
async def fetch_pexels_image(query: str) -> tuple[str | None, str | None]:
    """Look up a royalty-free photo on Pexels for *query*.

    Returns ``(image_url, attribution)`` or ``(None, None)`` when no API
    key is configured, nothing matches, or the request fails.
    """
    if not config.PEXELS_API_KEY:
        return None, None

    try:
        search_url = (
            f"https://api.pexels.com/v1/search?query={quote_plus(query)}&per_page=3"
        )
        auth_headers = {"Authorization": config.PEXELS_API_KEY}

        async with httpx.AsyncClient(timeout=15.0) as client:
            resp = await client.get(search_url, headers=auth_headers)
            resp.raise_for_status()
            payload = resp.json()

        # Only the top photo is considered.
        for photo in payload.get("photos", [])[:1]:
            picture = photo.get("src", {}).get("large")
            if picture:
                author = photo.get("photographer", "Unknown")
                return str(picture), f"Photo by {author} on Pexels"
    except Exception:
        logger.exception("Pexels image retrieval failed")

    return None, None
|
||||
|
||||
|
||||
async def fetch_wikimedia_image(query: str) -> tuple[str | None, str | None]:
    """Search Wikimedia Commons for an image matching *query*.

    No API key is required. Returns ``(image_url, attribution)`` or
    ``(None, None)`` on miss or failure.
    """
    try:
        # gsrnamespace=6 restricts the search to File: pages; the query is
        # capped at 120 chars to keep the request URL reasonable.
        api_url = (
            "https://commons.wikimedia.org/w/api.php"
            "?action=query&format=json&generator=search&gsrnamespace=6&gsrlimit=1"
            f"&gsrsearch={quote_plus(query[:120])}&prop=imageinfo&iiprop=url"
        )
        async with httpx.AsyncClient(
            timeout=15.0,
            headers={"User-Agent": "ClawFortBot/1.0 (news image enrichment)"},
        ) as client:
            resp = await client.get(api_url)
            resp.raise_for_status()
            payload = resp.json()

        # gsrlimit=1 means at most one page; take its first imageinfo URL.
        page = next(iter(payload.get("query", {}).get("pages", {}).values()), None)
        if page:
            image_info = page.get("imageinfo", [])
            if image_info and image_info[0].get("url"):
                return str(image_info[0]["url"]), "Wikimedia Commons"
    except Exception:
        logger.exception("Wikimedia image retrieval failed")

    return None, None
|
||||
|
||||
|
||||
async def fetch_picsum_image(query: str) -> tuple[str | None, str | None]:
|
||||
"""Generate deterministic Picsum image URL (always succeeds)."""
|
||||
seed = hashlib.md5(query.encode("utf-8")).hexdigest()[:12]
|
||||
return f"https://picsum.photos/seed/{seed}/1200/630", "Picsum Photos"
|
||||
|
||||
|
||||
# Provider registry: maps provider name -> (fetch coroutine, availability check).
# The second element is a zero-arg callable so API-key presence is evaluated
# lazily at lookup time rather than frozen at import time.
_PROVIDER_REGISTRY: dict[str, tuple] = {
    "pixabay": (fetch_pixabay_image, lambda: bool(config.PIXABAY_API_KEY)),
    "unsplash": (fetch_unsplash_image, lambda: bool(config.UNSPLASH_ACCESS_KEY)),
    "pexels": (fetch_pexels_image, lambda: bool(config.PEXELS_API_KEY)),
    "wikimedia": (fetch_wikimedia_image, lambda: True),  # No API key required
    "picsum": (fetch_picsum_image, lambda: True),  # Always available
}
|
||||
|
||||
|
||||
def get_enabled_providers() -> list[tuple[str, callable]]:
    """Return ``(name, fetch_fn)`` pairs for providers that are usable now.

    Order follows ``config.ROYALTY_IMAGE_PROVIDERS``; unknown provider
    names and providers whose availability check fails (e.g. missing API
    key) are silently skipped.
    """
    requested = (
        part.strip().lower()
        for part in config.ROYALTY_IMAGE_PROVIDERS.split(",")
        if part.strip()
    )

    providers: list[tuple[str, callable]] = []
    for name in requested:
        entry = _PROVIDER_REGISTRY.get(name)
        if entry is None:
            continue  # unknown provider name in config; ignore it
        fetch_fn, is_available = entry
        if is_available():
            providers.append((name, fetch_fn))

    return providers
|
||||
|
||||
|
||||
async def fetch_royalty_free_image(query: str) -> tuple[str | None, str | None]:
|
||||
"""Fetch royalty-free image using provider chain with fallback."""
|
||||
# MCP endpoint takes highest priority if configured
|
||||
if config.ROYALTY_IMAGE_MCP_ENDPOINT:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=15.0) as client:
|
||||
@@ -311,35 +604,17 @@ async def fetch_royalty_free_image(query: str) -> tuple[str | None, str | None]:
|
||||
except Exception:
|
||||
logger.exception("MCP image retrieval failed")
|
||||
|
||||
if config.ROYALTY_IMAGE_PROVIDER.lower() == "wikimedia":
|
||||
try:
|
||||
encoded_query = quote_plus(query[:120])
|
||||
search_url = (
|
||||
"https://commons.wikimedia.org/w/api.php"
|
||||
"?action=query&format=json&generator=search&gsrnamespace=6&gsrlimit=1"
|
||||
f"&gsrsearch={encoded_query}&prop=imageinfo&iiprop=url"
|
||||
)
|
||||
async with httpx.AsyncClient(
|
||||
timeout=15.0,
|
||||
headers={"User-Agent": "ClawFortBot/1.0 (news image enrichment)"},
|
||||
) as client:
|
||||
response = await client.get(search_url)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
pages = data.get("query", {}).get("pages", {})
|
||||
if pages:
|
||||
first_page = next(iter(pages.values()))
|
||||
infos = first_page.get("imageinfo", [])
|
||||
if infos:
|
||||
url = infos[0].get("url")
|
||||
if url:
|
||||
return str(url), "Wikimedia Commons"
|
||||
except Exception:
|
||||
logger.exception("Wikimedia image retrieval failed")
|
||||
# Extract keywords for better image search
|
||||
refined_query = extract_image_keywords(query)
|
||||
|
||||
if config.ROYALTY_IMAGE_PROVIDER.lower() == "picsum":
|
||||
seed = hashlib.md5(query.encode("utf-8")).hexdigest()[:12]
|
||||
return f"https://picsum.photos/seed/{seed}/1200/630", "Picsum Photos"
|
||||
# Try each enabled provider in order
|
||||
for provider_name, fetch_fn in get_enabled_providers():
|
||||
try:
|
||||
image_url, credit = await fetch_fn(refined_query)
|
||||
if image_url:
|
||||
return image_url, credit
|
||||
except Exception:
|
||||
logger.exception("%s image retrieval failed", provider_name.capitalize())
|
||||
|
||||
return None, None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user