"""Fetch, summarize, translate, and store AI news items.

Pipeline per cycle:
  1. Ask Perplexity (OpenRouter as fallback) for recent news as a JSON array.
  2. For each item: dedupe by headline, download/optimize the article image,
     generate a structured summary (TL;DR bullets + body + citation),
     fetch a royalty-free summary image, and persist via the repository layer.
  3. Generate Tamil/Malayalam translations and persist those as well.
"""

import asyncio
import hashlib
import json
import logging
import os
import time
from io import BytesIO
from urllib.parse import quote_plus

import httpx
from PIL import Image

from backend import config
from backend.database import SessionLocal
from backend.repository import (
    create_news,
    create_translation,
    headline_exists_within_24h,
    translation_exists,
)

logger = logging.getLogger(__name__)

# Served for articles whose image could not be downloaded/optimized.
PLACEHOLDER_IMAGE_PATH = "/static/images/placeholder.png"


def _log_api_cost(provider: str, model: str, response: httpx.Response) -> None:
    """Log token usage for an LLM API call.

    Parses the body defensively: an HTML/plain-text error body must not raise
    here and mask the HTTP status error raised by the caller afterwards.
    """
    try:
        usage = response.json().get("usage", {})
    except ValueError:
        usage = {}
    cost_info = {"model": model, "status": response.status_code, "usage": usage}
    logger.info("%s API cost: %s", provider, json.dumps(cost_info))


_NEWS_SYSTEM_PROMPT = (
    "You are a news aggregator. Return a JSON array of news items. "
    "Each item must have: headline, summary (2-3 sentences), source_url, "
    "image_url (a relevant image URL if available), image_credit. "
    "Return between 3 and 5 items. Respond ONLY with valid JSON array, no markdown."
)


async def call_perplexity_api(query: str) -> dict | None:
    """Query Perplexity for news and return the raw response payload.

    Raises httpx.HTTPStatusError on a non-2xx response (after cost logging).
    """
    headers = {
        "Authorization": f"Bearer {config.PERPLEXITY_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": config.PERPLEXITY_MODEL,
        "messages": [
            {"role": "system", "content": _NEWS_SYSTEM_PROMPT},
            {"role": "user", "content": query},
        ],
        "temperature": 0.3,
    }
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.post(config.PERPLEXITY_API_URL, headers=headers, json=payload)
        _log_api_cost("Perplexity", config.PERPLEXITY_MODEL, response)
        response.raise_for_status()
        return response.json()


async def call_openrouter_api(query: str) -> dict | None:
    """Query OpenRouter (fallback provider); returns None when not configured.

    Raises httpx.HTTPStatusError on a non-2xx response (after cost logging).
    """
    if not config.OPENROUTER_API_KEY:
        return None
    headers = {
        "Authorization": f"Bearer {config.OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": config.OPENROUTER_MODEL,
        "messages": [
            {"role": "system", "content": _NEWS_SYSTEM_PROMPT},
            {"role": "user", "content": query},
        ],
        "temperature": 0.3,
    }
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.post(config.OPENROUTER_API_URL, headers=headers, json=payload)
        _log_api_cost("OpenRouter", config.OPENROUTER_MODEL, response)
        response.raise_for_status()
        return response.json()


async def call_perplexity_translation_api(
    headline: str,
    summary: str,
    language: str,
    tldr_points: list[str] | None = None,
    summary_body: str | None = None,
    source_citation: str | None = None,
) -> dict | None:
    """Ask Perplexity to translate an article's text fields to `language`.

    The article fields are passed as a JSON user message; the model is
    instructed to return a JSON object with `headline` and `summary` keys.
    Raises httpx.HTTPStatusError on a non-2xx response.
    """
    headers = {
        "Authorization": f"Bearer {config.PERPLEXITY_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": config.PERPLEXITY_MODEL,
        "messages": [
            {
                "role": "system",
                "content": (
                    "Translate the given headline and summary to the target language. "
                    "Return only valid JSON object with keys: headline, summary. "
                    "No markdown, no extra text."
                ),
            },
            {
                "role": "user",
                "content": json.dumps(
                    {
                        "target_language": language,
                        "headline": headline,
                        "summary": summary,
                        "tldr_points": tldr_points or [],
                        "summary_body": summary_body or "",
                        "source_citation": source_citation or "",
                    }
                ),
            },
        ],
        # Low temperature: translation should be deterministic, not creative.
        "temperature": 0.1,
    }
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.post(config.PERPLEXITY_API_URL, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()


def parse_translation_response(response: dict) -> dict | None:
    """Extract a translated article dict from a raw chat-completion response.

    Returns None if the content is not valid JSON or lacks a non-empty
    headline and summary. Optional keys (tldr_points, summary_body,
    source_citation) are normalized: blanks become None / dropped.
    """
    content = response.get("choices", [{}])[0].get("message", {}).get("content", "")
    content = content.strip()
    # Models sometimes wrap JSON in a ```json fence despite instructions.
    if content.startswith("```"):
        content = content.split("\n", 1)[-1].rsplit("```", 1)[0]
    try:
        parsed = json.loads(content)
        if isinstance(parsed, dict):
            headline = str(parsed.get("headline", "")).strip()
            summary = str(parsed.get("summary", "")).strip()
            if headline and summary:
                tldr_points = parsed.get("tldr_points", [])
                if not isinstance(tldr_points, list):
                    tldr_points = []
                cleaned_points = [str(p).strip() for p in tldr_points if str(p).strip()]
                summary_body = str(parsed.get("summary_body", "")).strip() or None
                source_citation = str(parsed.get("source_citation", "")).strip() or None
                return {
                    "headline": headline,
                    "summary": summary,
                    "tldr_points": cleaned_points,
                    "summary_body": summary_body,
                    "source_citation": source_citation,
                }
    except json.JSONDecodeError:
        logger.error("Failed to parse translation response: %s", content[:200])
    return None


async def generate_translations(
    headline: str,
    summary: str,
    tldr_points: list[str] | None = None,
    summary_body: str | None = None,
    source_citation: str | None = None,
) -> dict[str, dict]:
    """Generate Tamil and Malayalam translations for one article.

    Best-effort: a failure for one language is logged and skipped, so the
    returned mapping (language code -> parsed translation dict) may be
    partial or empty. Returns empty immediately if no API key is configured.
    """
    translations: dict[str, dict] = {}
    language_names = {"ta": "Tamil", "ml": "Malayalam"}
    if not config.PERPLEXITY_API_KEY:
        return translations
    for language_code, language_name in language_names.items():
        try:
            response = await call_perplexity_translation_api(
                headline=headline,
                summary=summary,
                language=language_name,
                tldr_points=tldr_points,
                summary_body=summary_body,
                source_citation=source_citation,
            )
            if response:
                parsed = parse_translation_response(response)
                if parsed:
                    translations[language_code] = parsed
        except Exception:
            logger.exception("Translation generation failed for %s", language_code)
    return translations


async def call_perplexity_summary_api(
    headline: str, summary: str, source_url: str | None
) -> dict | None:
    """Ask Perplexity for structured modal content (TL;DR bullets, body, citation).

    Raises httpx.HTTPStatusError on a non-2xx response.
    """
    headers = {
        "Authorization": f"Bearer {config.PERPLEXITY_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": config.PERPLEXITY_MODEL,
        "messages": [
            {
                "role": "system",
                "content": (
                    "Generate concise structured JSON for UI modal. Return only JSON object with keys: "
                    "tldr_points (array of 3 short bullets), summary_body (detailed summary), "
                    "source_citation (concise source/citation text). "
                    "Always summarize from provided text only. No markdown."
                ),
            },
            {
                "role": "user",
                "content": json.dumps(
                    {
                        "headline": headline,
                        "summary": summary,
                        "source_citation": source_url or "Original source",
                        "summary_length_scale": config.SUMMARY_LENGTH_SCALE,
                        "summary_length_rule": (
                            "1=very short, 2=short, 3=medium, 4=long, 5=very long. "
                            "Use more detail as scale increases."
                        ),
                    }
                ),
            },
        ],
        "temperature": 0.2,
    }
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.post(config.PERPLEXITY_API_URL, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()


def parse_summary_response(response: dict) -> dict | None:
    """Extract the structured summary dict from a raw chat-completion response.

    Returns None when the content is invalid JSON, not an object, or contains
    neither bullets nor a body. TL;DR bullets are capped at 5.
    """
    content = response.get("choices", [{}])[0].get("message", {}).get("content", "")
    content = content.strip()
    # Strip an accidental markdown code fence around the JSON.
    if content.startswith("```"):
        content = content.split("\n", 1)[-1].rsplit("```", 1)[0]
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        logger.error("Failed to parse summary response: %s", content[:200])
        return None
    if not isinstance(parsed, dict):
        return None
    tldr_points = parsed.get("tldr_points", [])
    if not isinstance(tldr_points, list):
        tldr_points = []
    cleaned_points = [str(point).strip() for point in tldr_points if str(point).strip()]
    summary_body = str(parsed.get("summary_body", "")).strip()
    source_citation = str(parsed.get("source_citation", "")).strip()
    if not cleaned_points and not summary_body:
        return None
    return {
        "tldr_points": cleaned_points[:5],
        "summary_body": summary_body or None,
        "source_citation": source_citation or None,
    }


def build_fallback_summary(summary: str, source_url: str | None) -> dict:
    """Build a structured summary locally when the LLM summary is unavailable.

    Splits the plain summary into sentences (treating '!' and '?' as '.') and
    uses up to three as TL;DR bullets; falls back to a 180-char excerpt.
    """
    segments = [
        s.strip() for s in summary.replace("!", ".").replace("?", ".").split(".") if s.strip()
    ]
    points = segments[:3]
    if not points and summary.strip():
        points = [summary.strip()[:180]]
    return {
        "tldr_points": points,
        "summary_body": summary,
        "source_citation": source_url or "Original source",
    }


async def fetch_royalty_free_image(query: str) -> tuple[str | None, str | None]:
    """Find a royalty-free image for `query`; return (image_url, credit).

    Providers are tried in order, each best-effort:
      1. A configured MCP endpoint, if any.
      2. Wikimedia Commons search (when ROYALTY_IMAGE_PROVIDER == "wikimedia").
      3. Picsum seeded placeholder (when ROYALTY_IMAGE_PROVIDER == "picsum").
    Returns (None, None) when nothing is found.
    """
    if config.ROYALTY_IMAGE_MCP_ENDPOINT:
        try:
            async with httpx.AsyncClient(timeout=15.0) as client:
                response = await client.post(
                    config.ROYALTY_IMAGE_MCP_ENDPOINT,
                    json={"query": query},
                )
                response.raise_for_status()
                payload = response.json()
                image_url = payload.get("image_url") or payload.get("url")
                image_credit = payload.get("image_credit") or payload.get("credit")
                if image_url:
                    return str(image_url), str(image_credit or "Royalty-free")
        except Exception:
            logger.exception("MCP image retrieval failed")
    if config.ROYALTY_IMAGE_PROVIDER.lower() == "wikimedia":
        try:
            # Query is truncated to keep the search URL a reasonable length.
            encoded_query = quote_plus(query[:120])
            search_url = (
                "https://commons.wikimedia.org/w/api.php"
                "?action=query&format=json&generator=search&gsrnamespace=6&gsrlimit=1"
                f"&gsrsearch={encoded_query}&prop=imageinfo&iiprop=url"
            )
            async with httpx.AsyncClient(
                timeout=15.0,
                headers={"User-Agent": "ClawFortBot/1.0 (news image enrichment)"},
            ) as client:
                response = await client.get(search_url)
                response.raise_for_status()
                data = response.json()
                pages = data.get("query", {}).get("pages", {})
                if pages:
                    first_page = next(iter(pages.values()))
                    infos = first_page.get("imageinfo", [])
                    if infos:
                        url = infos[0].get("url")
                        if url:
                            return str(url), "Wikimedia Commons"
        except Exception:
            logger.exception("Wikimedia image retrieval failed")
    if config.ROYALTY_IMAGE_PROVIDER.lower() == "picsum":
        # Deterministic seed so repeated queries reuse the same placeholder.
        seed = hashlib.md5(query.encode("utf-8")).hexdigest()[:12]
        return f"https://picsum.photos/seed/{seed}/1200/630", "Picsum Photos"
    return None, None


def parse_news_response(response: dict) -> list[dict]:
    """Extract the list of news-item dicts from a raw chat-completion response.

    Returns an empty list on invalid JSON or a non-list payload.
    """
    content = response.get("choices", [{}])[0].get("message", {}).get("content", "")
    content = content.strip()
    # Strip an accidental markdown code fence around the JSON array.
    if content.startswith("```"):
        content = content.split("\n", 1)[-1].rsplit("```", 1)[0]
    try:
        items = json.loads(content)
        if isinstance(items, list):
            return items
    except json.JSONDecodeError:
        logger.error("Failed to parse news response: %s", content[:200])
    return []


async def download_and_optimize_image(image_url: str) -> str | None:
    """Download an image, downscale/re-encode it, and save it locally.

    Returns the public "/static/images/<md5>.jpg" path of the saved file,
    or None when the URL is empty or any step fails (best-effort).
    """
    if not image_url:
        return None
    try:
        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
            response = await client.get(image_url)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            # Cap width at 1200px, preserving aspect ratio.
            if img.width > 1200:
                ratio = 1200 / img.width
                new_height = int(img.height * ratio)
                img = img.resize((1200, new_height), Image.Resampling.LANCZOS)
            # JPEG cannot store alpha/palette modes.
            if img.mode in ("RGBA", "P"):
                img = img.convert("RGB")
            # Content-address the file by URL hash so re-downloads overwrite.
            filename = hashlib.md5(image_url.encode()).hexdigest() + ".jpg"
            filepath = os.path.join(config.STATIC_IMAGES_DIR, filename)
            img.save(filepath, "JPEG", quality=config.IMAGE_QUALITY, optimize=True)
            # BUGFIX: previously returned a corrupted literal path instead of
            # the saved file's name.
            return f"/static/images/{filename}"
    except Exception:
        logger.exception("Failed to download/optimize image: %s", image_url)
        return None


async def fetch_news_with_retry(max_attempts: int = 3) -> list[dict]:
    """Fetch news items from Perplexity with exponential backoff.

    Falls back to OpenRouter after `max_attempts` failures; returns [] when
    both providers fail or return nothing usable.
    """
    query = "What are the latest AI news from the last hour? Include source URLs and image URLs."
    for attempt in range(max_attempts):
        try:
            response = await call_perplexity_api(query)
            if response:
                return parse_news_response(response)
        except Exception:
            wait = 2**attempt
            logger.warning("Perplexity API attempt %d failed, retrying in %ds", attempt + 1, wait)
            # No point delaying the fallback after the final attempt.
            if attempt + 1 < max_attempts:
                await asyncio.sleep(wait)
    logger.warning("Perplexity API exhausted, trying OpenRouter fallback")
    try:
        response = await call_openrouter_api(query)
        if response:
            return parse_news_response(response)
    except Exception:
        logger.exception("OpenRouter fallback also failed")
    return []


async def process_and_store_news() -> int:
    """Run one full ingestion cycle; return the number of items stored.

    For each fetched item: skip blanks and 24h-duplicate headlines, enrich
    with images / structured summary / translations, then persist.
    NOTE(review): an unexpected error on one item (e.g. a DB failure in
    create_news) aborts the remainder of the batch — confirm this is the
    intended transactional behavior.
    """
    items = await fetch_news_with_retry()
    if not items:
        logger.warning("No news items fetched this cycle")
        return 0
    db = SessionLocal()
    stored = 0
    try:
        for item in items:
            headline = item.get("headline", "").strip()
            summary = item.get("summary", "").strip()
            if not headline or not summary:
                continue
            if headline_exists_within_24h(db, headline):
                logger.debug("Duplicate headline skipped: %s", headline[:80])
                continue
            # Card image: the article's own image, or the placeholder.
            local_image = await download_and_optimize_image(item.get("image_url", ""))
            image_url = local_image or PLACEHOLDER_IMAGE_PATH
            # Structured modal summary: LLM-generated, else local fallback.
            summary_artifact: dict | None = None
            if config.PERPLEXITY_API_KEY:
                try:
                    summary_response = await call_perplexity_summary_api(
                        headline=headline,
                        summary=summary,
                        source_url=item.get("source_url"),
                    )
                    if summary_response:
                        summary_artifact = parse_summary_response(summary_response)
                except Exception:
                    logger.exception("Summary generation failed for article: %s", headline[:80])
            if summary_artifact is None:
                summary_artifact = build_fallback_summary(summary, item.get("source_url"))
            # Modal image: royalty-free lookup, localized when possible,
            # falling back to the card image and the article's own credit.
            summary_image_url, summary_image_credit = await fetch_royalty_free_image(headline)
            summary_local_image = None
            if summary_image_url:
                summary_local_image = await download_and_optimize_image(summary_image_url)
                if summary_local_image:
                    summary_image_url = summary_local_image
            if not summary_image_url:
                summary_image_url = image_url
            if not summary_image_credit:
                summary_image_credit = item.get("image_credit")
            tldr_points = summary_artifact.get("tldr_points") if summary_artifact else None
            summary_body = summary_artifact.get("summary_body") if summary_artifact else None
            source_citation = summary_artifact.get("source_citation") if summary_artifact else None
            created_news_item = create_news(
                db=db,
                headline=headline,
                summary=summary,
                source_url=item.get("source_url"),
                image_url=image_url,
                image_credit=item.get("image_credit"),
                tldr_points=tldr_points,
                summary_body=summary_body,
                source_citation=source_citation,
                summary_image_url=summary_image_url,
                summary_image_credit=summary_image_credit,
            )
            translations = await generate_translations(
                headline=headline,
                summary=summary,
                tldr_points=tldr_points,
                summary_body=summary_body,
                source_citation=source_citation,
            )
            for language_code, payload in translations.items():
                if translation_exists(db, created_news_item.id, language_code):
                    continue
                create_translation(
                    db=db,
                    news_item_id=created_news_item.id,
                    language=language_code,
                    headline=payload["headline"],
                    summary=payload["summary"],
                    tldr_points=payload.get("tldr_points"),
                    summary_body=payload.get("summary_body"),
                    source_citation=payload.get("source_citation"),
                )
            stored += 1
        logger.info("Stored %d new news items", stored)
    finally:
        db.close()
    return stored


def scheduled_news_fetch() -> None:
    """Synchronous scheduler entry point: run one ingestion cycle and time it."""
    start = time.monotonic()
    logger.info("Starting scheduled news fetch")
    count = asyncio.run(process_and_store_news())
    elapsed = time.monotonic() - start
    logger.info("Scheduled news fetch complete: %d items in %.1fs", count, elapsed)