p08-seo-tweaks

This commit is contained in:
2026-02-13 00:49:22 -05:00
parent a1da041f14
commit 88a5540b7d
63 changed files with 2228 additions and 37 deletions

View File

@@ -10,6 +10,12 @@ IMAGE_QUALITY = int(os.getenv("IMAGE_QUALITY", "85"))
RETENTION_DAYS = int(os.getenv("RETENTION_DAYS", "30"))
UMAMI_SCRIPT_URL = os.getenv("UMAMI_SCRIPT_URL", "")
UMAMI_WEBSITE_ID = os.getenv("UMAMI_WEBSITE_ID", "")
ROYALTY_IMAGE_MCP_ENDPOINT = os.getenv("ROYALTY_IMAGE_MCP_ENDPOINT", "")
ROYALTY_IMAGE_API_KEY = os.getenv("ROYALTY_IMAGE_API_KEY", "")
ROYALTY_IMAGE_PROVIDER = os.getenv("ROYALTY_IMAGE_PROVIDER", "picsum")
_summary_length_raw = int(os.getenv("SUMMARY_LENGTH_SCALE", "3"))
SUMMARY_LENGTH_SCALE = max(1, min(5, _summary_length_raw))
PERPLEXITY_API_URL = "https://api.perplexity.ai/chat/completions"
PERPLEXITY_MODEL = "sonar"

View File

@@ -1,6 +1,6 @@
from collections.abc import Generator
from sqlalchemy import create_engine
from sqlalchemy import create_engine, text
from sqlalchemy.orm import DeclarativeBase, Session, sessionmaker
DATABASE_URL = "sqlite:///./data/clawfort.db"
@@ -25,3 +25,27 @@ def init_db() -> None:
from backend.models import NewsItem, NewsTranslation # noqa: F401
Base.metadata.create_all(bind=engine)
# Additive-only, idempotent SQLite migrations for the structured-summary
# feature: table name -> {column name -> ADD COLUMN DDL}. Columns are only
# added when missing, so this is safe to run on every startup.
migration_sql = {
    "news_items": {
        "tldr_points": "ALTER TABLE news_items ADD COLUMN tldr_points TEXT",
        "summary_body": "ALTER TABLE news_items ADD COLUMN summary_body TEXT",
        "source_citation": "ALTER TABLE news_items ADD COLUMN source_citation TEXT",
        "summary_image_url": "ALTER TABLE news_items ADD COLUMN summary_image_url VARCHAR(2000)",
        "summary_image_credit": "ALTER TABLE news_items ADD COLUMN summary_image_credit VARCHAR(500)",
    },
    "news_translations": {
        "tldr_points": "ALTER TABLE news_translations ADD COLUMN tldr_points TEXT",
        "summary_body": "ALTER TABLE news_translations ADD COLUMN summary_body TEXT",
        "source_citation": "ALTER TABLE news_translations ADD COLUMN source_citation TEXT",
    },
}
with engine.begin() as conn:
    for table, columns in migration_sql.items():
        # PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk);
        # row[1] is the column name.
        existing_cols = {
            row[1] for row in conn.execute(text(f"PRAGMA table_info({table})")).fetchall()
        }
        for col_name, ddl in columns.items():
            # Only ALTER when the column does not already exist.
            if col_name not in existing_cols:
                conn.execute(text(ddl))

View File

@@ -2,8 +2,9 @@ import logging
import os
from apscheduler.schedulers.background import BackgroundScheduler
from fastapi import Depends, FastAPI, Query
from fastapi import Depends, FastAPI, Query, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.gzip import GZipMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
@@ -20,6 +21,7 @@ from backend.repository import (
get_translation,
normalize_language,
resolve_news_content,
resolve_summary_fields,
)
from backend.schemas import HealthResponse, NewsItemResponse, PaginatedNewsResponse
@@ -39,6 +41,30 @@ app.add_middleware(
allow_headers=["*"],
)
app.add_middleware(GZipMiddleware, minimum_size=500)
@app.middleware("http")
async def add_cache_headers(request: Request, call_next):
    """Attach caching and basic security headers to every outgoing response.

    Static assets get a long immutable cache; API responses a short
    stale-while-revalidate window; the top-level HTML pages a medium one.
    Existing header values set by handlers are never overwritten.
    """
    response = await call_next(request)
    path = request.url.path
    headers = response.headers

    if path.startswith("/static/"):
        headers.setdefault("Cache-Control", "public, max-age=604800, immutable")
    elif path.startswith("/api/"):
        headers.setdefault("Cache-Control", "public, max-age=60, stale-while-revalidate=120")
        # Compressed and uncompressed variants must be cached separately.
        headers.setdefault("Vary", "Accept-Encoding")
    elif path in ("/", "/terms", "/attribution"):
        headers.setdefault("Cache-Control", "public, max-age=300, stale-while-revalidate=600")

    headers.setdefault("X-Content-Type-Options", "nosniff")
    return response
static_dir = os.path.join(os.path.dirname(__file__), "static")
app.mount("/static", StaticFiles(directory=static_dir), name="static")
@@ -101,6 +127,7 @@ def api_get_news(
if lang != "en":
translation = get_translation(db, item.id, lang)
headline, summary = resolve_news_content(item, translation)
tldr_points, summary_body, source_citation = resolve_summary_fields(item, translation)
response_items.append(
NewsItemResponse(
id=item.id,
@@ -109,6 +136,11 @@ def api_get_news(
source_url=item.source_url,
image_url=item.image_url,
image_credit=item.image_credit,
tldr_points=tldr_points,
summary_body=summary_body,
source_citation=source_citation,
summary_image_url=item.summary_image_url,
summary_image_credit=item.summary_image_credit,
published_at=item.published_at,
created_at=item.created_at,
language=lang if translation is not None else "en",
@@ -136,6 +168,7 @@ def api_get_latest_news(
if lang != "en":
translation = get_translation(db, item.id, lang)
headline, summary = resolve_news_content(item, translation)
tldr_points, summary_body, source_citation = resolve_summary_fields(item, translation)
return NewsItemResponse(
id=item.id,
headline=headline,
@@ -143,6 +176,11 @@ def api_get_latest_news(
source_url=item.source_url,
image_url=item.image_url,
image_credit=item.image_credit,
tldr_points=tldr_points,
summary_body=summary_body,
source_citation=source_citation,
summary_image_url=item.summary_image_url,
summary_image_credit=item.summary_image_credit,
published_at=item.published_at,
created_at=item.created_at,
language=lang if translation is not None else "en",
@@ -163,6 +201,16 @@ async def serve_frontend() -> FileResponse:
return FileResponse(os.path.join(frontend_dir, "index.html"))
@app.get("/terms")
async def serve_terms() -> FileResponse:
    """Serve the static terms-of-service page."""
    terms_path = os.path.join(frontend_dir, "terms.html")
    return FileResponse(terms_path)
@app.get("/attribution")
async def serve_attribution() -> FileResponse:
    """Serve the static image/content attribution page."""
    attribution_path = os.path.join(frontend_dir, "attribution.html")
    return FileResponse(attribution_path)
@app.get("/config")
async def serve_config() -> dict:
return {

View File

@@ -12,6 +12,11 @@ class NewsItem(Base):
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
headline: Mapped[str] = mapped_column(String(500), nullable=False, index=True)
summary: Mapped[str] = mapped_column(Text, nullable=False)
tldr_points: Mapped[str | None] = mapped_column(Text, nullable=True)
summary_body: Mapped[str | None] = mapped_column(Text, nullable=True)
source_citation: Mapped[str | None] = mapped_column(Text, nullable=True)
summary_image_url: Mapped[str | None] = mapped_column(String(2000), nullable=True)
summary_image_credit: Mapped[str | None] = mapped_column(String(500), nullable=True)
source_url: Mapped[str | None] = mapped_column(String(2000), nullable=True)
image_url: Mapped[str | None] = mapped_column(String(2000), nullable=True)
image_credit: Mapped[str | None] = mapped_column(String(500), nullable=True)
@@ -38,6 +43,9 @@ class NewsTranslation(Base):
language: Mapped[str] = mapped_column(String(5), nullable=False, index=True)
headline: Mapped[str] = mapped_column(String(500), nullable=False)
summary: Mapped[str] = mapped_column(Text, nullable=False)
tldr_points: Mapped[str | None] = mapped_column(Text, nullable=True)
summary_body: Mapped[str | None] = mapped_column(Text, nullable=True)
source_citation: Mapped[str | None] = mapped_column(Text, nullable=True)
created_at: Mapped[datetime.datetime] = mapped_column(
DateTime, nullable=False, default=datetime.datetime.utcnow
)

View File

@@ -5,6 +5,7 @@ import logging
import os
import time
from io import BytesIO
from urllib.parse import quote_plus
import httpx
from PIL import Image
@@ -95,7 +96,12 @@ async def call_openrouter_api(query: str) -> dict | None:
async def call_perplexity_translation_api(
headline: str, summary: str, language: str
headline: str,
summary: str,
language: str,
tldr_points: list[str] | None = None,
summary_body: str | None = None,
source_citation: str | None = None,
) -> dict | None:
headers = {
"Authorization": f"Bearer {config.PERPLEXITY_API_KEY}",
@@ -119,6 +125,9 @@ async def call_perplexity_translation_api(
"target_language": language,
"headline": headline,
"summary": summary,
"tldr_points": tldr_points or [],
"summary_body": summary_body or "",
"source_citation": source_citation or "",
}
),
},
@@ -144,13 +153,31 @@ def parse_translation_response(response: dict) -> dict | None:
headline = str(parsed.get("headline", "")).strip()
summary = str(parsed.get("summary", "")).strip()
if headline and summary:
return {"headline": headline, "summary": summary}
tldr_points = parsed.get("tldr_points", [])
if not isinstance(tldr_points, list):
tldr_points = []
cleaned_points = [str(p).strip() for p in tldr_points if str(p).strip()]
summary_body = str(parsed.get("summary_body", "")).strip() or None
source_citation = str(parsed.get("source_citation", "")).strip() or None
return {
"headline": headline,
"summary": summary,
"tldr_points": cleaned_points,
"summary_body": summary_body,
"source_citation": source_citation,
}
except json.JSONDecodeError:
logger.error("Failed to parse translation response: %s", content[:200])
return None
async def generate_translations(headline: str, summary: str) -> dict[str, dict]:
async def generate_translations(
headline: str,
summary: str,
tldr_points: list[str] | None = None,
summary_body: str | None = None,
source_citation: str | None = None,
) -> dict[str, dict]:
translations: dict[str, dict] = {}
language_names = {"ta": "Tamil", "ml": "Malayalam"}
@@ -159,7 +186,14 @@ async def generate_translations(headline: str, summary: str) -> dict[str, dict]:
for language_code, language_name in language_names.items():
try:
response = await call_perplexity_translation_api(headline, summary, language_name)
response = await call_perplexity_translation_api(
headline=headline,
summary=summary,
language=language_name,
tldr_points=tldr_points,
summary_body=summary_body,
source_citation=source_citation,
)
if response:
parsed = parse_translation_response(response)
if parsed:
@@ -170,6 +204,146 @@ async def generate_translations(headline: str, summary: str) -> dict[str, dict]:
return translations
async def call_perplexity_summary_api(
    headline: str, summary: str, source_url: str | None
) -> dict | None:
    """Request structured modal content (TLDR bullets, body, citation) from Perplexity.

    Returns the raw chat-completion response as a dict. HTTP errors propagate
    to the caller (which is expected to catch and fall back).
    """
    system_prompt = (
        "Generate concise structured JSON for UI modal. Return only JSON object with keys: "
        "tldr_points (array of 3 short bullets), summary_body (detailed summary), "
        "source_citation (concise source/citation text). "
        "Always summarize from provided text only. No markdown."
    )
    # The article plus length guidance is passed to the model as a JSON string.
    user_payload = json.dumps(
        {
            "headline": headline,
            "summary": summary,
            "source_citation": source_url or "Original source",
            "summary_length_scale": config.SUMMARY_LENGTH_SCALE,
            "summary_length_rule": (
                "1=very short, 2=short, 3=medium, 4=long, 5=very long. "
                "Use more detail as scale increases."
            ),
        }
    )
    request_body = {
        "model": config.PERPLEXITY_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_payload},
        ],
        "temperature": 0.2,
    }
    auth_headers = {
        "Authorization": f"Bearer {config.PERPLEXITY_API_KEY}",
        "Content-Type": "application/json",
    }
    async with httpx.AsyncClient(timeout=30.0) as client:
        api_response = await client.post(
            config.PERPLEXITY_API_URL, headers=auth_headers, json=request_body
        )
        api_response.raise_for_status()
        return api_response.json()
def parse_summary_response(response: dict) -> dict | None:
content = response.get("choices", [{}])[0].get("message", {}).get("content", "")
content = content.strip()
if content.startswith("```"):
content = content.split("\n", 1)[-1].rsplit("```", 1)[0]
try:
parsed = json.loads(content)
except json.JSONDecodeError:
logger.error("Failed to parse summary response: %s", content[:200])
return None
if not isinstance(parsed, dict):
return None
tldr_points = parsed.get("tldr_points", [])
if not isinstance(tldr_points, list):
tldr_points = []
cleaned_points = [str(point).strip() for point in tldr_points if str(point).strip()]
summary_body = str(parsed.get("summary_body", "")).strip()
source_citation = str(parsed.get("source_citation", "")).strip()
if not cleaned_points and not summary_body:
return None
return {
"tldr_points": cleaned_points[:5],
"summary_body": summary_body or None,
"source_citation": source_citation or None,
}
def build_fallback_summary(summary: str, source_url: str | None) -> dict:
segments = [
s.strip() for s in summary.replace("!", ".").replace("?", ".").split(".") if s.strip()
]
points = segments[:3]
if not points and summary.strip():
points = [summary.strip()[:180]]
return {
"tldr_points": points,
"summary_body": summary,
"source_citation": source_url or "Original source",
}
async def fetch_royalty_free_image(query: str) -> tuple[str | None, str | None]:
    """Find a royalty-free image URL and credit string for *query*.

    Order of attempts: a configured MCP endpoint (best-effort), then —
    depending on ROYALTY_IMAGE_PROVIDER — a Wikimedia Commons file search
    or a deterministic Picsum placeholder. Returns (None, None) when no
    source produced a URL.
    """
    # 1) Optional MCP endpoint, tried first whenever configured.
    if config.ROYALTY_IMAGE_MCP_ENDPOINT:
        try:
            async with httpx.AsyncClient(timeout=15.0) as client:
                mcp_resp = await client.post(
                    config.ROYALTY_IMAGE_MCP_ENDPOINT,
                    json={"query": query},
                )
                mcp_resp.raise_for_status()
                body = mcp_resp.json()
            found_url = body.get("image_url") or body.get("url")
            found_credit = body.get("image_credit") or body.get("credit")
            if found_url:
                return str(found_url), str(found_credit or "Royalty-free")
        except Exception:
            logger.exception("MCP image retrieval failed")

    provider = config.ROYALTY_IMAGE_PROVIDER.lower()

    # 2) Wikimedia Commons full-text file search (namespace 6 = File:).
    if provider == "wikimedia":
        try:
            search_url = (
                "https://commons.wikimedia.org/w/api.php"
                "?action=query&format=json&generator=search&gsrnamespace=6&gsrlimit=1"
                f"&gsrsearch={quote_plus(query[:120])}&prop=imageinfo&iiprop=url"
            )
            async with httpx.AsyncClient(
                timeout=15.0,
                headers={"User-Agent": "ClawFortBot/1.0 (news image enrichment)"},
            ) as client:
                wiki_resp = await client.get(search_url)
                wiki_resp.raise_for_status()
                data = wiki_resp.json()
            pages = data.get("query", {}).get("pages", {})
            if pages:
                first_page = next(iter(pages.values()))
                infos = first_page.get("imageinfo", [])
                if infos and infos[0].get("url"):
                    return str(infos[0].get("url")), "Wikimedia Commons"
        except Exception:
            logger.exception("Wikimedia image retrieval failed")

    # 3) Deterministic Picsum placeholder seeded by a hash of the query.
    if provider == "picsum":
        seed = hashlib.md5(query.encode("utf-8")).hexdigest()[:12]
        return f"https://picsum.photos/seed/{seed}/1200/630", "Picsum Photos"

    return None, None
def parse_news_response(response: dict) -> list[dict]:
content = response.get("choices", [{}])[0].get("message", {}).get("content", "")
content = content.strip()
@@ -261,6 +435,37 @@ async def process_and_store_news() -> int:
local_image = await download_and_optimize_image(item.get("image_url", ""))
image_url = local_image or PLACEHOLDER_IMAGE_PATH
summary_artifact: dict | None = None
if config.PERPLEXITY_API_KEY:
try:
summary_response = await call_perplexity_summary_api(
headline=headline,
summary=summary,
source_url=item.get("source_url"),
)
if summary_response:
summary_artifact = parse_summary_response(summary_response)
except Exception:
logger.exception("Summary generation failed for article: %s", headline[:80])
if summary_artifact is None:
summary_artifact = build_fallback_summary(summary, item.get("source_url"))
summary_image_url, summary_image_credit = await fetch_royalty_free_image(headline)
summary_local_image = None
if summary_image_url:
summary_local_image = await download_and_optimize_image(summary_image_url)
if summary_local_image:
summary_image_url = summary_local_image
if not summary_image_url:
summary_image_url = image_url
if not summary_image_credit:
summary_image_credit = item.get("image_credit")
tldr_points = summary_artifact.get("tldr_points") if summary_artifact else None
summary_body = summary_artifact.get("summary_body") if summary_artifact else None
source_citation = summary_artifact.get("source_citation") if summary_artifact else None
created_news_item = create_news(
db=db,
headline=headline,
@@ -268,9 +473,20 @@ async def process_and_store_news() -> int:
source_url=item.get("source_url"),
image_url=image_url,
image_credit=item.get("image_credit"),
tldr_points=tldr_points,
summary_body=summary_body,
source_citation=source_citation,
summary_image_url=summary_image_url,
summary_image_credit=summary_image_credit,
)
translations = await generate_translations(headline, summary)
translations = await generate_translations(
headline=headline,
summary=summary,
tldr_points=tldr_points,
summary_body=summary_body,
source_citation=source_citation,
)
for language_code, payload in translations.items():
if translation_exists(db, created_news_item.id, language_code):
continue
@@ -280,6 +496,9 @@ async def process_and_store_news() -> int:
language=language_code,
headline=payload["headline"],
summary=payload["summary"],
tldr_points=payload.get("tldr_points"),
summary_body=payload.get("summary_body"),
source_citation=payload.get("source_citation"),
)
stored += 1

View File

@@ -1,4 +1,5 @@
import datetime
import json
from sqlalchemy import and_, desc
from sqlalchemy.orm import Session
@@ -15,6 +16,11 @@ def create_news(
source_url: str | None = None,
image_url: str | None = None,
image_credit: str | None = None,
tldr_points: list[str] | None = None,
summary_body: str | None = None,
source_citation: str | None = None,
summary_image_url: str | None = None,
summary_image_credit: str | None = None,
published_at: datetime.datetime | None = None,
) -> NewsItem:
item = NewsItem(
@@ -23,6 +29,11 @@ def create_news(
source_url=source_url,
image_url=image_url,
image_credit=image_credit,
tldr_points=json.dumps(tldr_points) if tldr_points else None,
summary_body=summary_body,
source_citation=source_citation,
summary_image_url=summary_image_url,
summary_image_credit=summary_image_credit,
published_at=published_at or datetime.datetime.utcnow(),
)
db.add(item)
@@ -56,12 +67,18 @@ def create_translation(
language: str,
headline: str,
summary: str,
tldr_points: list[str] | None = None,
summary_body: str | None = None,
source_citation: str | None = None,
) -> NewsTranslation:
translation = NewsTranslation(
news_item_id=news_item_id,
language=language,
headline=headline,
summary=summary,
tldr_points=json.dumps(tldr_points) if tldr_points else None,
summary_body=summary_body,
source_citation=source_citation,
)
db.add(translation)
db.commit()
@@ -101,6 +118,28 @@ def resolve_news_content(item: NewsItem, translation: NewsTranslation | None) ->
return translation.headline, translation.summary
def resolve_tldr_points(item: NewsItem, translation: NewsTranslation | None) -> list[str] | None:
    """Decode the JSON-encoded TLDR bullet list, preferring the translation's copy.

    Returns None when the stored value is empty, corrupt, or not a JSON list.
    """
    record = item if translation is None else translation
    raw = record.tldr_points
    if not raw:
        return None
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        # Corrupt stored JSON — treat as absent rather than erroring out.
        return None
    if not isinstance(decoded, list):
        return None
    return [str(entry) for entry in decoded if str(entry).strip()]
def resolve_summary_fields(
    item: NewsItem, translation: NewsTranslation | None
) -> tuple[list[str] | None, str | None, str | None]:
    """Return (tldr_points, summary_body, source_citation) in the chosen language.

    Falls back to the English item's fields when no translation is given.
    """
    points = resolve_tldr_points(item, translation)
    record = item if translation is None else translation
    return points, record.summary_body, record.source_citation
def normalize_language(language: str | None) -> str:
if not language:
return "en"

View File

@@ -10,6 +10,11 @@ class NewsItemResponse(BaseModel):
source_url: str | None = None
image_url: str | None = None
image_credit: str | None = None
tldr_points: list[str] | None = None
summary_body: str | None = None
source_citation: str | None = None
summary_image_url: str | None = None
summary_image_credit: str | None = None
published_at: datetime.datetime
created_at: datetime.datetime
language: str
@@ -29,6 +34,9 @@ class NewsTranslationResponse(BaseModel):
language: str
headline: str
summary: str
tldr_points: list[str] | None = None
summary_body: str | None = None
source_citation: str | None = None
created_at: datetime.datetime
model_config = {"from_attributes": True}

Binary file not shown.

After

Width:  |  Height:  |  Size: 122 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 182 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 44 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 66 KiB