p08-seo-tweaks

This commit is contained in:
2026-02-13 00:49:22 -05:00
parent a1da041f14
commit 88a5540b7d
63 changed files with 2228 additions and 37 deletions

View File

@@ -10,6 +10,12 @@ IMAGE_QUALITY = int(os.getenv("IMAGE_QUALITY", "85"))
RETENTION_DAYS = int(os.getenv("RETENTION_DAYS", "30"))
UMAMI_SCRIPT_URL = os.getenv("UMAMI_SCRIPT_URL", "")
UMAMI_WEBSITE_ID = os.getenv("UMAMI_WEBSITE_ID", "")
ROYALTY_IMAGE_MCP_ENDPOINT = os.getenv("ROYALTY_IMAGE_MCP_ENDPOINT", "")
ROYALTY_IMAGE_API_KEY = os.getenv("ROYALTY_IMAGE_API_KEY", "")
ROYALTY_IMAGE_PROVIDER = os.getenv("ROYALTY_IMAGE_PROVIDER", "picsum")
_summary_length_raw = int(os.getenv("SUMMARY_LENGTH_SCALE", "3"))
SUMMARY_LENGTH_SCALE = max(1, min(5, _summary_length_raw))
PERPLEXITY_API_URL = "https://api.perplexity.ai/chat/completions"
PERPLEXITY_MODEL = "sonar"

View File

@@ -1,6 +1,6 @@
from collections.abc import Generator
from sqlalchemy import create_engine
from sqlalchemy import create_engine, text
from sqlalchemy.orm import DeclarativeBase, Session, sessionmaker
DATABASE_URL = "sqlite:///./data/clawfort.db"
@@ -25,3 +25,27 @@ def init_db() -> None:
from backend.models import NewsItem, NewsTranslation # noqa: F401
Base.metadata.create_all(bind=engine)
# Additive-only, idempotent SQLite migrations for the structured-summary
# feature: table name -> {column name -> ADD COLUMN DDL}. Columns are only
# added when missing, so this is safe to run on every startup.
migration_sql = {
    "news_items": {
        "tldr_points": "ALTER TABLE news_items ADD COLUMN tldr_points TEXT",
        "summary_body": "ALTER TABLE news_items ADD COLUMN summary_body TEXT",
        "source_citation": "ALTER TABLE news_items ADD COLUMN source_citation TEXT",
        "summary_image_url": "ALTER TABLE news_items ADD COLUMN summary_image_url VARCHAR(2000)",
        "summary_image_credit": "ALTER TABLE news_items ADD COLUMN summary_image_credit VARCHAR(500)",
    },
    "news_translations": {
        "tldr_points": "ALTER TABLE news_translations ADD COLUMN tldr_points TEXT",
        "summary_body": "ALTER TABLE news_translations ADD COLUMN summary_body TEXT",
        "source_citation": "ALTER TABLE news_translations ADD COLUMN source_citation TEXT",
    },
}
with engine.begin() as conn:
    for table, columns in migration_sql.items():
        # PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk);
        # row[1] is the column name.
        existing_cols = {
            row[1] for row in conn.execute(text(f"PRAGMA table_info({table})")).fetchall()
        }
        for col_name, ddl in columns.items():
            # Only ALTER when the column does not already exist.
            if col_name not in existing_cols:
                conn.execute(text(ddl))

View File

@@ -2,8 +2,9 @@ import logging
import os
from apscheduler.schedulers.background import BackgroundScheduler
from fastapi import Depends, FastAPI, Query
from fastapi import Depends, FastAPI, Query, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.gzip import GZipMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
@@ -20,6 +21,7 @@ from backend.repository import (
get_translation,
normalize_language,
resolve_news_content,
resolve_summary_fields,
)
from backend.schemas import HealthResponse, NewsItemResponse, PaginatedNewsResponse
@@ -39,6 +41,30 @@ app.add_middleware(
allow_headers=["*"],
)
app.add_middleware(GZipMiddleware, minimum_size=500)
@app.middleware("http")
async def add_cache_headers(request: Request, call_next):
    """Attach caching and basic security headers to every outgoing response.

    Static assets get a long immutable cache; API responses a short
    stale-while-revalidate window; the top-level HTML pages a medium one.
    Existing header values set by handlers are never overwritten.
    """
    response = await call_next(request)
    path = request.url.path
    headers = response.headers

    if path.startswith("/static/"):
        headers.setdefault("Cache-Control", "public, max-age=604800, immutable")
    elif path.startswith("/api/"):
        headers.setdefault("Cache-Control", "public, max-age=60, stale-while-revalidate=120")
        # Compressed and uncompressed variants must be cached separately.
        headers.setdefault("Vary", "Accept-Encoding")
    elif path in ("/", "/terms", "/attribution"):
        headers.setdefault("Cache-Control", "public, max-age=300, stale-while-revalidate=600")

    headers.setdefault("X-Content-Type-Options", "nosniff")
    return response
static_dir = os.path.join(os.path.dirname(__file__), "static")
app.mount("/static", StaticFiles(directory=static_dir), name="static")
@@ -101,6 +127,7 @@ def api_get_news(
if lang != "en":
translation = get_translation(db, item.id, lang)
headline, summary = resolve_news_content(item, translation)
tldr_points, summary_body, source_citation = resolve_summary_fields(item, translation)
response_items.append(
NewsItemResponse(
id=item.id,
@@ -109,6 +136,11 @@ def api_get_news(
source_url=item.source_url,
image_url=item.image_url,
image_credit=item.image_credit,
tldr_points=tldr_points,
summary_body=summary_body,
source_citation=source_citation,
summary_image_url=item.summary_image_url,
summary_image_credit=item.summary_image_credit,
published_at=item.published_at,
created_at=item.created_at,
language=lang if translation is not None else "en",
@@ -136,6 +168,7 @@ def api_get_latest_news(
if lang != "en":
translation = get_translation(db, item.id, lang)
headline, summary = resolve_news_content(item, translation)
tldr_points, summary_body, source_citation = resolve_summary_fields(item, translation)
return NewsItemResponse(
id=item.id,
headline=headline,
@@ -143,6 +176,11 @@ def api_get_latest_news(
source_url=item.source_url,
image_url=item.image_url,
image_credit=item.image_credit,
tldr_points=tldr_points,
summary_body=summary_body,
source_citation=source_citation,
summary_image_url=item.summary_image_url,
summary_image_credit=item.summary_image_credit,
published_at=item.published_at,
created_at=item.created_at,
language=lang if translation is not None else "en",
@@ -163,6 +201,16 @@ async def serve_frontend() -> FileResponse:
return FileResponse(os.path.join(frontend_dir, "index.html"))
@app.get("/terms")
async def serve_terms() -> FileResponse:
    """Serve the static terms-of-service page."""
    terms_path = os.path.join(frontend_dir, "terms.html")
    return FileResponse(terms_path)
@app.get("/attribution")
async def serve_attribution() -> FileResponse:
    """Serve the static image/content attribution page."""
    attribution_path = os.path.join(frontend_dir, "attribution.html")
    return FileResponse(attribution_path)
@app.get("/config")
async def serve_config() -> dict:
return {

View File

@@ -12,6 +12,11 @@ class NewsItem(Base):
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
headline: Mapped[str] = mapped_column(String(500), nullable=False, index=True)
summary: Mapped[str] = mapped_column(Text, nullable=False)
tldr_points: Mapped[str | None] = mapped_column(Text, nullable=True)
summary_body: Mapped[str | None] = mapped_column(Text, nullable=True)
source_citation: Mapped[str | None] = mapped_column(Text, nullable=True)
summary_image_url: Mapped[str | None] = mapped_column(String(2000), nullable=True)
summary_image_credit: Mapped[str | None] = mapped_column(String(500), nullable=True)
source_url: Mapped[str | None] = mapped_column(String(2000), nullable=True)
image_url: Mapped[str | None] = mapped_column(String(2000), nullable=True)
image_credit: Mapped[str | None] = mapped_column(String(500), nullable=True)
@@ -38,6 +43,9 @@ class NewsTranslation(Base):
language: Mapped[str] = mapped_column(String(5), nullable=False, index=True)
headline: Mapped[str] = mapped_column(String(500), nullable=False)
summary: Mapped[str] = mapped_column(Text, nullable=False)
tldr_points: Mapped[str | None] = mapped_column(Text, nullable=True)
summary_body: Mapped[str | None] = mapped_column(Text, nullable=True)
source_citation: Mapped[str | None] = mapped_column(Text, nullable=True)
created_at: Mapped[datetime.datetime] = mapped_column(
DateTime, nullable=False, default=datetime.datetime.utcnow
)

View File

@@ -5,6 +5,7 @@ import logging
import os
import time
from io import BytesIO
from urllib.parse import quote_plus
import httpx
from PIL import Image
@@ -95,7 +96,12 @@ async def call_openrouter_api(query: str) -> dict | None:
async def call_perplexity_translation_api(
headline: str, summary: str, language: str
headline: str,
summary: str,
language: str,
tldr_points: list[str] | None = None,
summary_body: str | None = None,
source_citation: str | None = None,
) -> dict | None:
headers = {
"Authorization": f"Bearer {config.PERPLEXITY_API_KEY}",
@@ -119,6 +125,9 @@ async def call_perplexity_translation_api(
"target_language": language,
"headline": headline,
"summary": summary,
"tldr_points": tldr_points or [],
"summary_body": summary_body or "",
"source_citation": source_citation or "",
}
),
},
@@ -144,13 +153,31 @@ def parse_translation_response(response: dict) -> dict | None:
headline = str(parsed.get("headline", "")).strip()
summary = str(parsed.get("summary", "")).strip()
if headline and summary:
return {"headline": headline, "summary": summary}
tldr_points = parsed.get("tldr_points", [])
if not isinstance(tldr_points, list):
tldr_points = []
cleaned_points = [str(p).strip() for p in tldr_points if str(p).strip()]
summary_body = str(parsed.get("summary_body", "")).strip() or None
source_citation = str(parsed.get("source_citation", "")).strip() or None
return {
"headline": headline,
"summary": summary,
"tldr_points": cleaned_points,
"summary_body": summary_body,
"source_citation": source_citation,
}
except json.JSONDecodeError:
logger.error("Failed to parse translation response: %s", content[:200])
return None
async def generate_translations(headline: str, summary: str) -> dict[str, dict]:
async def generate_translations(
headline: str,
summary: str,
tldr_points: list[str] | None = None,
summary_body: str | None = None,
source_citation: str | None = None,
) -> dict[str, dict]:
translations: dict[str, dict] = {}
language_names = {"ta": "Tamil", "ml": "Malayalam"}
@@ -159,7 +186,14 @@ async def generate_translations(headline: str, summary: str) -> dict[str, dict]:
for language_code, language_name in language_names.items():
try:
response = await call_perplexity_translation_api(headline, summary, language_name)
response = await call_perplexity_translation_api(
headline=headline,
summary=summary,
language=language_name,
tldr_points=tldr_points,
summary_body=summary_body,
source_citation=source_citation,
)
if response:
parsed = parse_translation_response(response)
if parsed:
@@ -170,6 +204,146 @@ async def generate_translations(headline: str, summary: str) -> dict[str, dict]:
return translations
async def call_perplexity_summary_api(
    headline: str, summary: str, source_url: str | None
) -> dict | None:
    """Request structured modal content (TLDR bullets, body, citation) from Perplexity.

    Returns the raw chat-completion response as a dict. HTTP errors propagate
    to the caller (which is expected to catch and fall back).
    """
    system_prompt = (
        "Generate concise structured JSON for UI modal. Return only JSON object with keys: "
        "tldr_points (array of 3 short bullets), summary_body (detailed summary), "
        "source_citation (concise source/citation text). "
        "Always summarize from provided text only. No markdown."
    )
    # The article plus length guidance is passed to the model as a JSON string.
    user_payload = json.dumps(
        {
            "headline": headline,
            "summary": summary,
            "source_citation": source_url or "Original source",
            "summary_length_scale": config.SUMMARY_LENGTH_SCALE,
            "summary_length_rule": (
                "1=very short, 2=short, 3=medium, 4=long, 5=very long. "
                "Use more detail as scale increases."
            ),
        }
    )
    request_body = {
        "model": config.PERPLEXITY_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_payload},
        ],
        "temperature": 0.2,
    }
    auth_headers = {
        "Authorization": f"Bearer {config.PERPLEXITY_API_KEY}",
        "Content-Type": "application/json",
    }
    async with httpx.AsyncClient(timeout=30.0) as client:
        api_response = await client.post(
            config.PERPLEXITY_API_URL, headers=auth_headers, json=request_body
        )
        api_response.raise_for_status()
        return api_response.json()
def parse_summary_response(response: dict) -> dict | None:
content = response.get("choices", [{}])[0].get("message", {}).get("content", "")
content = content.strip()
if content.startswith("```"):
content = content.split("\n", 1)[-1].rsplit("```", 1)[0]
try:
parsed = json.loads(content)
except json.JSONDecodeError:
logger.error("Failed to parse summary response: %s", content[:200])
return None
if not isinstance(parsed, dict):
return None
tldr_points = parsed.get("tldr_points", [])
if not isinstance(tldr_points, list):
tldr_points = []
cleaned_points = [str(point).strip() for point in tldr_points if str(point).strip()]
summary_body = str(parsed.get("summary_body", "")).strip()
source_citation = str(parsed.get("source_citation", "")).strip()
if not cleaned_points and not summary_body:
return None
return {
"tldr_points": cleaned_points[:5],
"summary_body": summary_body or None,
"source_citation": source_citation or None,
}
def build_fallback_summary(summary: str, source_url: str | None) -> dict:
segments = [
s.strip() for s in summary.replace("!", ".").replace("?", ".").split(".") if s.strip()
]
points = segments[:3]
if not points and summary.strip():
points = [summary.strip()[:180]]
return {
"tldr_points": points,
"summary_body": summary,
"source_citation": source_url or "Original source",
}
async def fetch_royalty_free_image(query: str) -> tuple[str | None, str | None]:
    """Find a royalty-free image URL and credit string for *query*.

    Order of attempts: a configured MCP endpoint (best-effort), then —
    depending on ROYALTY_IMAGE_PROVIDER — a Wikimedia Commons file search
    or a deterministic Picsum placeholder. Returns (None, None) when no
    source produced a URL.
    """
    # 1) Optional MCP endpoint, tried first whenever configured.
    if config.ROYALTY_IMAGE_MCP_ENDPOINT:
        try:
            async with httpx.AsyncClient(timeout=15.0) as client:
                mcp_resp = await client.post(
                    config.ROYALTY_IMAGE_MCP_ENDPOINT,
                    json={"query": query},
                )
                mcp_resp.raise_for_status()
                body = mcp_resp.json()
            found_url = body.get("image_url") or body.get("url")
            found_credit = body.get("image_credit") or body.get("credit")
            if found_url:
                return str(found_url), str(found_credit or "Royalty-free")
        except Exception:
            logger.exception("MCP image retrieval failed")

    provider = config.ROYALTY_IMAGE_PROVIDER.lower()

    # 2) Wikimedia Commons full-text file search (namespace 6 = File:).
    if provider == "wikimedia":
        try:
            search_url = (
                "https://commons.wikimedia.org/w/api.php"
                "?action=query&format=json&generator=search&gsrnamespace=6&gsrlimit=1"
                f"&gsrsearch={quote_plus(query[:120])}&prop=imageinfo&iiprop=url"
            )
            async with httpx.AsyncClient(
                timeout=15.0,
                headers={"User-Agent": "ClawFortBot/1.0 (news image enrichment)"},
            ) as client:
                wiki_resp = await client.get(search_url)
                wiki_resp.raise_for_status()
                data = wiki_resp.json()
            pages = data.get("query", {}).get("pages", {})
            if pages:
                first_page = next(iter(pages.values()))
                infos = first_page.get("imageinfo", [])
                if infos and infos[0].get("url"):
                    return str(infos[0].get("url")), "Wikimedia Commons"
        except Exception:
            logger.exception("Wikimedia image retrieval failed")

    # 3) Deterministic Picsum placeholder seeded by a hash of the query.
    if provider == "picsum":
        seed = hashlib.md5(query.encode("utf-8")).hexdigest()[:12]
        return f"https://picsum.photos/seed/{seed}/1200/630", "Picsum Photos"

    return None, None
def parse_news_response(response: dict) -> list[dict]:
content = response.get("choices", [{}])[0].get("message", {}).get("content", "")
content = content.strip()
@@ -261,6 +435,37 @@ async def process_and_store_news() -> int:
local_image = await download_and_optimize_image(item.get("image_url", ""))
image_url = local_image or PLACEHOLDER_IMAGE_PATH
summary_artifact: dict | None = None
if config.PERPLEXITY_API_KEY:
try:
summary_response = await call_perplexity_summary_api(
headline=headline,
summary=summary,
source_url=item.get("source_url"),
)
if summary_response:
summary_artifact = parse_summary_response(summary_response)
except Exception:
logger.exception("Summary generation failed for article: %s", headline[:80])
if summary_artifact is None:
summary_artifact = build_fallback_summary(summary, item.get("source_url"))
summary_image_url, summary_image_credit = await fetch_royalty_free_image(headline)
summary_local_image = None
if summary_image_url:
summary_local_image = await download_and_optimize_image(summary_image_url)
if summary_local_image:
summary_image_url = summary_local_image
if not summary_image_url:
summary_image_url = image_url
if not summary_image_credit:
summary_image_credit = item.get("image_credit")
tldr_points = summary_artifact.get("tldr_points") if summary_artifact else None
summary_body = summary_artifact.get("summary_body") if summary_artifact else None
source_citation = summary_artifact.get("source_citation") if summary_artifact else None
created_news_item = create_news(
db=db,
headline=headline,
@@ -268,9 +473,20 @@ async def process_and_store_news() -> int:
source_url=item.get("source_url"),
image_url=image_url,
image_credit=item.get("image_credit"),
tldr_points=tldr_points,
summary_body=summary_body,
source_citation=source_citation,
summary_image_url=summary_image_url,
summary_image_credit=summary_image_credit,
)
translations = await generate_translations(headline, summary)
translations = await generate_translations(
headline=headline,
summary=summary,
tldr_points=tldr_points,
summary_body=summary_body,
source_citation=source_citation,
)
for language_code, payload in translations.items():
if translation_exists(db, created_news_item.id, language_code):
continue
@@ -280,6 +496,9 @@ async def process_and_store_news() -> int:
language=language_code,
headline=payload["headline"],
summary=payload["summary"],
tldr_points=payload.get("tldr_points"),
summary_body=payload.get("summary_body"),
source_citation=payload.get("source_citation"),
)
stored += 1

View File

@@ -1,4 +1,5 @@
import datetime
import json
from sqlalchemy import and_, desc
from sqlalchemy.orm import Session
@@ -15,6 +16,11 @@ def create_news(
source_url: str | None = None,
image_url: str | None = None,
image_credit: str | None = None,
tldr_points: list[str] | None = None,
summary_body: str | None = None,
source_citation: str | None = None,
summary_image_url: str | None = None,
summary_image_credit: str | None = None,
published_at: datetime.datetime | None = None,
) -> NewsItem:
item = NewsItem(
@@ -23,6 +29,11 @@ def create_news(
source_url=source_url,
image_url=image_url,
image_credit=image_credit,
tldr_points=json.dumps(tldr_points) if tldr_points else None,
summary_body=summary_body,
source_citation=source_citation,
summary_image_url=summary_image_url,
summary_image_credit=summary_image_credit,
published_at=published_at or datetime.datetime.utcnow(),
)
db.add(item)
@@ -56,12 +67,18 @@ def create_translation(
language: str,
headline: str,
summary: str,
tldr_points: list[str] | None = None,
summary_body: str | None = None,
source_citation: str | None = None,
) -> NewsTranslation:
translation = NewsTranslation(
news_item_id=news_item_id,
language=language,
headline=headline,
summary=summary,
tldr_points=json.dumps(tldr_points) if tldr_points else None,
summary_body=summary_body,
source_citation=source_citation,
)
db.add(translation)
db.commit()
@@ -101,6 +118,28 @@ def resolve_news_content(item: NewsItem, translation: NewsTranslation | None) ->
return translation.headline, translation.summary
def resolve_tldr_points(item: NewsItem, translation: NewsTranslation | None) -> list[str] | None:
    """Decode the JSON-encoded TLDR bullet list, preferring the translation's copy.

    Returns None when the stored value is empty, corrupt, or not a JSON list.
    """
    record = item if translation is None else translation
    raw = record.tldr_points
    if not raw:
        return None
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        # Corrupt stored JSON — treat as absent rather than erroring out.
        return None
    if not isinstance(decoded, list):
        return None
    return [str(entry) for entry in decoded if str(entry).strip()]
def resolve_summary_fields(
    item: NewsItem, translation: NewsTranslation | None
) -> tuple[list[str] | None, str | None, str | None]:
    """Return (tldr_points, summary_body, source_citation) in the chosen language.

    Falls back to the English item's fields when no translation is given.
    """
    points = resolve_tldr_points(item, translation)
    record = item if translation is None else translation
    return points, record.summary_body, record.source_citation
def normalize_language(language: str | None) -> str:
if not language:
return "en"

View File

@@ -10,6 +10,11 @@ class NewsItemResponse(BaseModel):
source_url: str | None = None
image_url: str | None = None
image_credit: str | None = None
tldr_points: list[str] | None = None
summary_body: str | None = None
source_citation: str | None = None
summary_image_url: str | None = None
summary_image_credit: str | None = None
published_at: datetime.datetime
created_at: datetime.datetime
language: str
@@ -29,6 +34,9 @@ class NewsTranslationResponse(BaseModel):
language: str
headline: str
summary: str
tldr_points: list[str] | None = None
summary_body: str | None = None
source_citation: str | None = None
created_at: datetime.datetime
model_config = {"from_attributes": True}

Binary file not shown.

After

Width:  |  Height:  |  Size: 122 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 182 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 44 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 66 KiB