Initial Commit

This commit is contained in:
2026-02-12 16:50:29 -05:00
commit a1da041f14
74 changed files with 6140 additions and 0 deletions

0
backend/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

71
backend/cli.py Normal file
View File

@@ -0,0 +1,71 @@
import argparse
import asyncio
import logging
import os
import sys
import time
from backend import config
from backend.database import init_db
from backend.news_service import process_and_store_news
# Configure process-wide logging once at import time so every CLI run emits
# timestamped, leveled messages.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)
def build_parser() -> argparse.ArgumentParser:
    """Construct the top-level argument parser for the clawfort CLI."""
    root = argparse.ArgumentParser(prog="clawfort", description="ClawFort operations CLI")
    commands = root.add_subparsers(dest="command", required=True)

    # "force-fetch": run a single fetch cycle right now, bypassing the scheduler.
    fetch_cmd = commands.add_parser(
        "force-fetch",
        help="Run one immediate news fetch cycle",
        description="Trigger one immediate news fetch run outside scheduler cadence.",
    )
    fetch_cmd.set_defaults(handler=handle_force_fetch)
    return root
def validate_runtime() -> None:
    """Raise RuntimeError unless at least one provider API key is configured."""
    has_any_key = bool(config.PERPLEXITY_API_KEY) or bool(config.OPENROUTER_API_KEY)
    if not has_any_key:
        raise RuntimeError(
            "No provider API key configured. Set PERPLEXITY_API_KEY or OPENROUTER_API_KEY in the environment."
        )
def handle_force_fetch(_: argparse.Namespace) -> int:
    """Run one immediate fetch cycle.

    Returns 0 on success, 1 on any failure (with guidance printed to stderr).
    """
    start = time.monotonic()
    try:
        validate_runtime()
        os.makedirs("data", exist_ok=True)
        init_db()
        stored_count = asyncio.run(process_and_store_news())
        duration = time.monotonic() - start
        print(f"force-fetch succeeded: stored={stored_count} elapsed={duration:.1f}s")
        return 0
    except Exception as exc:
        logger.exception("force-fetch failed")
        print(f"force-fetch failed: {exc}", file=sys.stderr)
        print(
            "Check API keys, network connectivity, and provider status, then retry the command.",
            file=sys.stderr,
        )
        return 1
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments and dispatch to the selected handler."""
    namespace = build_parser().parse_args(argv)
    return namespace.handler(namespace)


if __name__ == "__main__":
    raise SystemExit(main())

23
backend/config.py Normal file
View File

@@ -0,0 +1,23 @@
import os
from dotenv import load_dotenv

# Load variables from a local .env file (no-op when the file is absent).
load_dotenv()

# Provider credentials; empty string means "not configured".
PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY", "")
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")

# JPEG quality used when re-encoding downloaded images.
IMAGE_QUALITY = int(os.getenv("IMAGE_QUALITY", "85"))
# Days a news item stays live before being archived.
RETENTION_DAYS = int(os.getenv("RETENTION_DAYS", "30"))

# Optional Umami analytics settings, forwarded to the frontend via /config.
UMAMI_SCRIPT_URL = os.getenv("UMAMI_SCRIPT_URL", "")
UMAMI_WEBSITE_ID = os.getenv("UMAMI_WEBSITE_ID", "")

# Provider endpoints and model identifiers.
PERPLEXITY_API_URL = "https://api.perplexity.ai/chat/completions"
PERPLEXITY_MODEL = "sonar"
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"
OPENROUTER_MODEL = "google/gemini-2.0-flash-001"

# Languages the API can serve; "en" is the canonical source language.
SUPPORTED_LANGUAGES = ["en", "ta", "ml"]

# Directory for locally cached/optimized images; created eagerly at import.
STATIC_IMAGES_DIR = os.path.join(os.path.dirname(__file__), "static", "images")
os.makedirs(STATIC_IMAGES_DIR, exist_ok=True)

27
backend/database.py Normal file
View File

@@ -0,0 +1,27 @@
from collections.abc import Generator
from sqlalchemy import create_engine
from sqlalchemy.orm import DeclarativeBase, Session, sessionmaker
# SQLite file lives under ./data relative to the process working directory.
DATABASE_URL = "sqlite:///./data/clawfort.db"
# check_same_thread=False allows sessions to be used from worker threads.
engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})
SessionLocal = sessionmaker(bind=engine, autocommit=False, autoflush=False)


class Base(DeclarativeBase):
    """Declarative base class shared by all ORM models."""
    pass
def get_db() -> Generator[Session, None, None]:
    """Dependency generator: yield a session and always close it afterwards."""
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
def init_db() -> None:
    """Create all tables.

    Imports the model classes so they register themselves on Base.metadata
    before create_all runs.
    """
    from backend.models import NewsItem, NewsTranslation  # noqa: F401

    Base.metadata.create_all(bind=engine)

16
backend/init_db.py Normal file
View File

@@ -0,0 +1,16 @@
import os
import sys

# Allow running this file directly (python backend/init_db.py) by putting the
# repository root on sys.path so the "backend" package is importable.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from backend.database import init_db


def main() -> None:
    """Create the data directory and initialize the database schema."""
    os.makedirs("data", exist_ok=True)
    init_db()
    print("Database initialized successfully.")


if __name__ == "__main__":
    main()

173
backend/main.py Normal file
View File

@@ -0,0 +1,173 @@
import logging
import os
from apscheduler.schedulers.background import BackgroundScheduler
from fastapi import Depends, FastAPI, Query
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
from backend import config
from backend.database import get_db, init_db
from backend.models import NewsItem
from backend.news_service import scheduled_news_fetch
from backend.repository import (
archive_old_news,
delete_archived_news,
get_latest_news,
get_news_paginated,
get_translation,
normalize_language,
resolve_news_content,
)
from backend.schemas import HealthResponse, NewsItemResponse, PaginatedNewsResponse
# Process-wide logging configuration for the API server.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)

app = FastAPI(title="ClawFort News API", version="0.1.0")

# NOTE(review): wide-open CORS (all origins, with credentials) — acceptable
# for a public read-only API, but revisit before adding authenticated routes.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Serve optimized images and other assets from backend/static.
static_dir = os.path.join(os.path.dirname(__file__), "static")
app.mount("/static", StaticFiles(directory=static_dir), name="static")

# Background scheduler for the hourly fetch and nightly cleanup jobs.
scheduler = BackgroundScheduler()
def nightly_cleanup() -> None:
    """Scheduled job: archive stale news, then purge long-archived rows."""
    from backend.database import SessionLocal

    session = SessionLocal()
    try:
        archived_count = archive_old_news(session, config.RETENTION_DAYS)
        deleted_count = delete_archived_news(session, days_after_archive=60)
        logger.info("Nightly cleanup: archived=%d, deleted=%d", archived_count, deleted_count)
    finally:
        session.close()
@app.on_event("startup")
async def startup_event() -> None:
if not config.PERPLEXITY_API_KEY:
logger.error("PERPLEXITY_API_KEY is not set — news fetching will fail")
os.makedirs("data", exist_ok=True)
init_db()
logger.info("Database initialized")
scheduler.add_job(scheduled_news_fetch, "interval", hours=1, id="news_fetch")
scheduler.add_job(nightly_cleanup, "cron", hour=3, minute=0, id="nightly_cleanup")
scheduler.start()
logger.info("Scheduler started: hourly news fetch + nightly cleanup")
@app.on_event("shutdown")
async def shutdown_event() -> None:
scheduler.shutdown(wait=False)
logger.info("Scheduler shut down")
@app.get("/api/news", response_model=PaginatedNewsResponse)
def api_get_news(
cursor: int | None = Query(None, description="Cursor for pagination (last item ID)"),
limit: int = Query(10, ge=1, le=50),
exclude_hero: int | None = Query(None, description="Hero item ID to exclude from feed"),
language: str = Query("en", description="Language code: en, ta, ml"),
db: Session = Depends(get_db),
) -> PaginatedNewsResponse:
lang = normalize_language(language)
items = get_news_paginated(db, cursor=cursor, limit=limit + 1, exclude_id=exclude_hero)
has_more = len(items) > limit
if has_more:
items = items[:limit]
next_cursor = items[-1].id if items and has_more else None
response_items: list[NewsItemResponse] = []
for item in items:
translation = None
if lang != "en":
translation = get_translation(db, item.id, lang)
headline, summary = resolve_news_content(item, translation)
response_items.append(
NewsItemResponse(
id=item.id,
headline=headline,
summary=summary,
source_url=item.source_url,
image_url=item.image_url,
image_credit=item.image_credit,
published_at=item.published_at,
created_at=item.created_at,
language=lang if translation is not None else "en",
)
)
return PaginatedNewsResponse(
items=response_items,
next_cursor=next_cursor,
has_more=has_more,
)
@app.get("/api/news/latest", response_model=NewsItemResponse | None)
def api_get_latest_news(
language: str = Query("en", description="Language code: en, ta, ml"),
db: Session = Depends(get_db),
) -> NewsItemResponse | None:
lang = normalize_language(language)
item = get_latest_news(db)
if not item:
return None
translation = None
if lang != "en":
translation = get_translation(db, item.id, lang)
headline, summary = resolve_news_content(item, translation)
return NewsItemResponse(
id=item.id,
headline=headline,
summary=summary,
source_url=item.source_url,
image_url=item.image_url,
image_credit=item.image_credit,
published_at=item.published_at,
created_at=item.created_at,
language=lang if translation is not None else "en",
)
@app.get("/api/health", response_model=HealthResponse)
def api_health(db: Session = Depends(get_db)) -> HealthResponse:
count = db.query(NewsItem).filter(NewsItem.archived.is_(False)).count()
return HealthResponse(status="ok", version="0.1.0", news_count=count)
# Repo-relative path to the static frontend bundle (../frontend from this file).
frontend_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "frontend")


@app.get("/")
async def serve_frontend() -> FileResponse:
    """Serve the single-page frontend's entry HTML."""
    return FileResponse(os.path.join(frontend_dir, "index.html"))


@app.get("/config")
async def serve_config() -> dict:
    """Expose non-secret runtime configuration to the frontend."""
    return {
        "umami_script_url": config.UMAMI_SCRIPT_URL,
        "umami_website_id": config.UMAMI_WEBSITE_ID,
        "supported_languages": config.SUPPORTED_LANGUAGES,
        "default_language": "en",
    }

45
backend/models.py Normal file
View File

@@ -0,0 +1,45 @@
import datetime
from sqlalchemy import Boolean, DateTime, ForeignKey, Integer, String, Text, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column, relationship
from backend.database import Base
class NewsItem(Base):
    """A single aggregated news story, stored in its original (English) form."""

    __tablename__ = "news_items"

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    # Indexed: headline equality is used for the 24h duplicate check.
    headline: Mapped[str] = mapped_column(String(500), nullable=False, index=True)
    summary: Mapped[str] = mapped_column(Text, nullable=False)
    source_url: Mapped[str | None] = mapped_column(String(2000), nullable=True)
    # Site-relative path of the locally optimized image (or the placeholder).
    image_url: Mapped[str | None] = mapped_column(String(2000), nullable=True)
    image_credit: Mapped[str | None] = mapped_column(String(500), nullable=True)
    # NOTE(review): naive UTC timestamps via utcnow(), which is deprecated in
    # Python 3.12 — consider migrating to timezone-aware datetimes.
    published_at: Mapped[datetime.datetime] = mapped_column(
        DateTime, nullable=False, default=datetime.datetime.utcnow
    )
    created_at: Mapped[datetime.datetime] = mapped_column(
        DateTime, nullable=False, default=datetime.datetime.utcnow
    )
    # Soft-delete flag: archived rows are hidden from the feed, purged later.
    archived: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
    # Deleting an item cascades to its translations.
    translations: Mapped[list["NewsTranslation"]] = relationship(
        back_populates="news_item", cascade="all, delete-orphan"
    )
class NewsTranslation(Base):
    """A translated headline/summary pair; at most one per (item, language)."""

    __tablename__ = "news_translations"
    # Enforce one translation per language per article.
    __table_args__ = (UniqueConstraint("news_item_id", "language", name="uq_news_item_language"),)

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    news_item_id: Mapped[int] = mapped_column(
        ForeignKey("news_items.id"), nullable=False, index=True
    )
    # Short language code, e.g. "ta" or "ml".
    language: Mapped[str] = mapped_column(String(5), nullable=False, index=True)
    headline: Mapped[str] = mapped_column(String(500), nullable=False)
    summary: Mapped[str] = mapped_column(Text, nullable=False)
    created_at: Mapped[datetime.datetime] = mapped_column(
        DateTime, nullable=False, default=datetime.datetime.utcnow
    )
    news_item: Mapped[NewsItem] = relationship(back_populates="translations")

299
backend/news_service.py Normal file
View File

@@ -0,0 +1,299 @@
import asyncio
import hashlib
import json
import logging
import os
import time
from io import BytesIO
import httpx
from PIL import Image
from backend import config
from backend.database import SessionLocal
from backend.repository import (
create_news,
create_translation,
headline_exists_within_24h,
translation_exists,
)
logger = logging.getLogger(__name__)

# Served when an article has no usable image of its own.
PLACEHOLDER_IMAGE_PATH = "/static/images/placeholder.png"
async def call_perplexity_api(query: str) -> dict | None:
    """POST `query` to the Perplexity chat-completions API.

    Returns the decoded JSON response body. Raises httpx.HTTPStatusError on
    non-2xx responses and httpx transport exceptions on network failures.
    """
    headers = {
        "Authorization": f"Bearer {config.PERPLEXITY_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": config.PERPLEXITY_MODEL,
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a news aggregator. Return a JSON array of news items. "
                    "Each item must have: headline, summary (2-3 sentences), source_url, "
                    "image_url (a relevant image URL if available), image_credit. "
                    "Return between 3 and 5 items. Respond ONLY with valid JSON array, no markdown."
                ),
            },
            {"role": "user", "content": query},
        ],
        "temperature": 0.3,
    }
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.post(config.PERPLEXITY_API_URL, headers=headers, json=payload)
        # Bug fix: check status BEFORE decoding. The previous order called
        # response.json() on error responses, so a non-JSON error body raised
        # JSONDecodeError and masked the real HTTP failure. Decode once, reuse.
        response.raise_for_status()
        body = response.json()
        cost_info = {
            "model": config.PERPLEXITY_MODEL,
            "status": response.status_code,
            "usage": body.get("usage", {}),
        }
        logger.info("Perplexity API cost: %s", json.dumps(cost_info))
        return body
async def call_openrouter_api(query: str) -> dict | None:
    """POST `query` to OpenRouter (fallback provider).

    Returns None immediately when no OpenRouter key is configured; otherwise
    returns the decoded JSON body. Raises on HTTP/transport errors.
    """
    if not config.OPENROUTER_API_KEY:
        return None
    headers = {
        "Authorization": f"Bearer {config.OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": config.OPENROUTER_MODEL,
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a news aggregator. Return a JSON array of news items. "
                    "Each item must have: headline, summary (2-3 sentences), source_url, "
                    "image_url (a relevant image URL if available), image_credit. "
                    "Return between 3 and 5 items. Respond ONLY with valid JSON array, no markdown."
                ),
            },
            {"role": "user", "content": query},
        ],
        "temperature": 0.3,
    }
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.post(config.OPENROUTER_API_URL, headers=headers, json=payload)
        # Bug fix: status check before decoding (same masking issue as the
        # Perplexity call), and the body is decoded exactly once.
        response.raise_for_status()
        body = response.json()
        cost_info = {
            "model": config.OPENROUTER_MODEL,
            "status": response.status_code,
            "usage": body.get("usage", {}),
        }
        logger.info("OpenRouter API cost: %s", json.dumps(cost_info))
        return body
async def call_perplexity_translation_api(
    headline: str, summary: str, language: str
) -> dict | None:
    """Ask Perplexity to translate a headline/summary into `language`.

    `language` is a human-readable name (e.g. "Tamil"). Returns the decoded
    chat-completion response; raises on HTTP/transport errors.
    """
    headers = {
        "Authorization": f"Bearer {config.PERPLEXITY_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": config.PERPLEXITY_MODEL,
        "messages": [
            {
                "role": "system",
                "content": (
                    "Translate the given headline and summary to the target language. "
                    "Return only valid JSON object with keys: headline, summary. "
                    "No markdown, no extra text."
                ),
            },
            {
                "role": "user",
                # Encode the user payload as JSON so quotes/newlines in the
                # article text cannot break the prompt structure.
                "content": json.dumps(
                    {
                        "target_language": language,
                        "headline": headline,
                        "summary": summary,
                    }
                ),
            },
        ],
        # Low temperature for stable, deterministic translations.
        "temperature": 0.1,
    }
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.post(config.PERPLEXITY_API_URL, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()
def parse_translation_response(response: dict) -> dict | None:
content = response.get("choices", [{}])[0].get("message", {}).get("content", "")
content = content.strip()
if content.startswith("```"):
content = content.split("\n", 1)[-1].rsplit("```", 1)[0]
try:
parsed = json.loads(content)
if isinstance(parsed, dict):
headline = str(parsed.get("headline", "")).strip()
summary = str(parsed.get("summary", "")).strip()
if headline and summary:
return {"headline": headline, "summary": summary}
except json.JSONDecodeError:
logger.error("Failed to parse translation response: %s", content[:200])
return None
async def generate_translations(headline: str, summary: str) -> dict[str, dict]:
    """Translate a headline/summary into each supported non-English language.

    Returns a mapping of language code ("ta", "ml") to
    {"headline": ..., "summary": ...}. Languages that fail to translate are
    omitted; an empty dict is returned when no Perplexity key is configured.
    """
    translations: dict[str, dict] = {}
    # The translation prompt expects human-readable language names.
    language_names = {"ta": "Tamil", "ml": "Malayalam"}
    if not config.PERPLEXITY_API_KEY:
        return translations
    for language_code, language_name in language_names.items():
        try:
            response = await call_perplexity_translation_api(headline, summary, language_name)
            if response:
                parsed = parse_translation_response(response)
                if parsed:
                    translations[language_code] = parsed
        except Exception:
            # Best-effort: a failure for one language must not block the others.
            logger.exception("Translation generation failed for %s", language_code)
    return translations
def parse_news_response(response: dict) -> list[dict]:
    """Extract the JSON array of news items from a chat-completion response.

    Strips an optional markdown code fence. Returns an empty list when the
    content is missing, unparseable, or not a JSON array.
    """
    message = response.get("choices", [{}])[0].get("message", {})
    text = message.get("content", "").strip()
    # Providers sometimes wrap the JSON in a code fence despite instructions.
    if text.startswith("```"):
        text = text.split("\n", 1)[-1].rsplit("```", 1)[0]
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        logger.error("Failed to parse news response: %s", text[:200])
        return []
    return parsed if isinstance(parsed, list) else []
async def download_and_optimize_image(image_url: str) -> str | None:
    """Download an image, resize/re-encode it as JPEG, and store it locally.

    Returns the site-relative URL of the optimized image, or None when
    `image_url` is empty or any download/processing step fails.
    """
    if not image_url:
        return None
    try:
        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
            response = await client.get(image_url)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            # Cap width at 1200px to bound file size; preserve aspect ratio.
            if img.width > 1200:
                ratio = 1200 / img.width
                new_height = int(img.height * ratio)
                img = img.resize((1200, new_height), Image.Resampling.LANCZOS)
            # JPEG has no alpha/palette support, so normalize to RGB first.
            if img.mode in ("RGBA", "P"):
                img = img.convert("RGB")
            # Content-addressed name keeps repeat downloads of one URL stable.
            filename = hashlib.md5(image_url.encode()).hexdigest() + ".jpg"
            filepath = os.path.join(config.STATIC_IMAGES_DIR, filename)
            img.save(filepath, "JPEG", quality=config.IMAGE_QUALITY, optimize=True)
            # Bug fix: previously returned the literal string
            # "/static/images/(unknown)" instead of interpolating the
            # generated filename, so every stored image URL was broken.
            return f"/static/images/{filename}"
    except Exception:
        logger.exception("Failed to download/optimize image: %s", image_url)
        return None
async def fetch_news_with_retry(max_attempts: int = 3) -> list[dict]:
    """Fetch news from Perplexity with exponential backoff; fall back to OpenRouter.

    Returns the parsed list of raw news item dicts, or [] when every provider
    attempt fails.
    """
    query = "What are the latest AI news from the last hour? Include source URLs and image URLs."
    for attempt in range(max_attempts):
        try:
            response = await call_perplexity_api(query)
            if response:
                return parse_news_response(response)
        except Exception:
            # Fix: don't sleep after the final attempt — the previous version
            # waited a full backoff period before moving to the fallback.
            if attempt + 1 < max_attempts:
                wait = 2**attempt
                logger.warning(
                    "Perplexity API attempt %d failed, retrying in %ds", attempt + 1, wait
                )
                await asyncio.sleep(wait)
            else:
                logger.warning("Perplexity API attempt %d failed", attempt + 1)
    logger.warning("Perplexity API exhausted, trying OpenRouter fallback")
    try:
        response = await call_openrouter_api(query)
        if response:
            return parse_news_response(response)
    except Exception:
        logger.exception("OpenRouter fallback also failed")
    return []
async def process_and_store_news() -> int:
    """Run one fetch cycle: fetch, dedupe, localize images, store, translate.

    Returns the number of newly stored items. The DB session is always closed,
    even when storage of a later item raises.
    """
    items = await fetch_news_with_retry()
    if not items:
        logger.warning("No news items fetched this cycle")
        return 0
    db = SessionLocal()
    stored = 0
    try:
        for item in items:
            # Robustness fix: providers occasionally emit non-object entries
            # (e.g. bare strings) inside the array; skip them instead of
            # crashing the whole cycle on .get().
            if not isinstance(item, dict):
                logger.debug("Skipping non-dict news entry: %r", item)
                continue
            headline = item.get("headline", "").strip()
            summary = item.get("summary", "").strip()
            # Skip malformed entries missing required text.
            if not headline or not summary:
                continue
            # Dedupe: identical headline already stored in the last 24 hours.
            if headline_exists_within_24h(db, headline):
                logger.debug("Duplicate headline skipped: %s", headline[:80])
                continue
            # Mirror the remote image into local static storage; fall back to
            # the bundled placeholder on any failure.
            local_image = await download_and_optimize_image(item.get("image_url", ""))
            image_url = local_image or PLACEHOLDER_IMAGE_PATH
            created_news_item = create_news(
                db=db,
                headline=headline,
                summary=summary,
                source_url=item.get("source_url"),
                image_url=image_url,
                image_credit=item.get("image_credit"),
            )
            # Best-effort translations; failures leave the item English-only.
            translations = await generate_translations(headline, summary)
            for language_code, payload in translations.items():
                if translation_exists(db, created_news_item.id, language_code):
                    continue
                create_translation(
                    db=db,
                    news_item_id=created_news_item.id,
                    language=language_code,
                    headline=payload["headline"],
                    summary=payload["summary"],
                )
            stored += 1
        logger.info("Stored %d new news items", stored)
    finally:
        db.close()
    return stored
def scheduled_news_fetch() -> None:
    """Synchronous scheduler entry point for the hourly fetch job."""
    started_at = time.monotonic()
    logger.info("Starting scheduled news fetch")
    # Bridge into the async pipeline with a dedicated event loop per run.
    stored = asyncio.run(process_and_store_news())
    duration = time.monotonic() - started_at
    logger.info("Scheduled news fetch complete: %d items in %.1fs", stored, duration)

171
backend/repository.py Normal file
View File

@@ -0,0 +1,171 @@
import datetime
from sqlalchemy import and_, desc
from sqlalchemy.orm import Session
from backend.models import NewsItem, NewsTranslation
# Language codes the API accepts; anything else falls back to English.
SUPPORTED_LANGUAGES = {"en", "ta", "ml"}
def create_news(
    db: Session,
    headline: str,
    summary: str,
    source_url: str | None = None,
    image_url: str | None = None,
    image_credit: str | None = None,
    published_at: datetime.datetime | None = None,
) -> NewsItem:
    """Insert and return a NewsItem; `published_at` defaults to the current UTC time."""
    record = NewsItem(
        headline=headline,
        summary=summary,
        source_url=source_url,
        image_url=image_url,
        image_credit=image_credit,
        published_at=published_at or datetime.datetime.utcnow(),
    )
    db.add(record)
    db.commit()
    # Refresh to populate server-assigned fields (id, defaults) on the instance.
    db.refresh(record)
    return record
def get_recent_news(db: Session, limit: int = 10) -> list[NewsItem]:
    """Return up to `limit` live (non-archived) items, newest published first."""
    query = db.query(NewsItem).filter(NewsItem.archived.is_(False))
    query = query.order_by(desc(NewsItem.published_at)).limit(limit)
    return query.all()
def get_latest_news(db: Session) -> NewsItem | None:
    """Return the most recently published live item, or None when none exist."""
    query = db.query(NewsItem).filter(NewsItem.archived.is_(False))
    return query.order_by(desc(NewsItem.published_at)).first()
def create_translation(
    db: Session,
    news_item_id: int,
    language: str,
    headline: str,
    summary: str,
) -> NewsTranslation:
    """Insert and return a translation row for the given news item."""
    record = NewsTranslation(
        news_item_id=news_item_id,
        language=language,
        headline=headline,
        summary=summary,
    )
    db.add(record)
    db.commit()
    # Refresh to populate the generated id and default created_at.
    db.refresh(record)
    return record
def get_translation(db: Session, news_item_id: int, language: str) -> NewsTranslation | None:
    """Fetch the translation for (news_item_id, language), or None if absent."""
    return (
        db.query(NewsTranslation)
        .filter(NewsTranslation.news_item_id == news_item_id)
        .filter(NewsTranslation.language == language)
        .first()
    )
def translation_exists(db: Session, news_item_id: int, language: str) -> bool:
    """True when a translation row already exists for this item/language pair."""
    existing = get_translation(db, news_item_id, language)
    return existing is not None
def get_translations_by_article(db: Session, news_item_id: int) -> list[NewsTranslation]:
    """Return all translations for one news item, ordered by language code."""
    query = db.query(NewsTranslation).filter(NewsTranslation.news_item_id == news_item_id)
    return query.order_by(NewsTranslation.language.asc()).all()
def resolve_news_content(item: NewsItem, translation: NewsTranslation | None) -> tuple[str, str]:
    """Return (headline, summary), preferring the translation when present."""
    source = item if translation is None else translation
    return source.headline, source.summary
def normalize_language(language: str | None) -> str:
    """Map a raw language parameter to a supported code, defaulting to "en"."""
    candidate = (language or "").lower()
    return candidate if candidate in SUPPORTED_LANGUAGES else "en"
def get_news_paginated(
    db: Session, cursor: int | None = None, limit: int = 10, exclude_id: int | None = None
) -> list[NewsItem]:
    """Keyset pagination over live items, newest id first.

    `cursor` is the id of the last item from the previous page; only rows with
    a strictly smaller id are returned. `exclude_id` removes the hero item so
    it is not duplicated in the feed.
    """
    query = db.query(NewsItem).filter(NewsItem.archived.is_(False))
    if exclude_id is not None:
        query = query.filter(NewsItem.id != exclude_id)
    if cursor is not None:
        query = query.filter(NewsItem.id < cursor)
    return query.order_by(desc(NewsItem.id)).limit(limit).all()
def headline_exists_within_24h(db: Session, headline: str) -> bool:
    """True when an identical headline was stored within the last 24 hours."""
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=24)
    match = (
        db.query(NewsItem)
        .filter(NewsItem.headline == headline)
        .filter(NewsItem.created_at >= cutoff)
        .first()
    )
    return match is not None
def archive_old_news(db: Session, retention_days: int = 30) -> int:
    """Mark live items older than `retention_days` (by created_at) as archived.

    Returns the number of rows updated.
    """
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
    count = (
        db.query(NewsItem)
        .filter(
            and_(
                NewsItem.created_at < cutoff,
                NewsItem.archived.is_(False),
            )
        )
        .update({"archived": True})
    )
    db.commit()
    return count
def delete_archived_news(db: Session, days_after_archive: int = 60) -> int:
    """Hard-delete archived items whose created_at is older than the cutoff.

    Returns the number of rows deleted.

    NOTE(review): despite the parameter name, the cutoff is measured from
    created_at, not from the moment of archiving (there is no archived_at
    column). With the 30-day retention in archive_old_news, rows are purged
    roughly 30 days after archiving, not 60 — confirm intended semantics.
    """
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=days_after_archive)
    count = (
        db.query(NewsItem)
        .filter(
            and_(
                NewsItem.archived.is_(True),
                NewsItem.created_at < cutoff,
            )
        )
        .delete()
    )
    db.commit()
    return count

40
backend/schemas.py Normal file
View File

@@ -0,0 +1,40 @@
import datetime
from pydantic import BaseModel
class NewsItemResponse(BaseModel):
    """API representation of one news item, resolved to a single language."""

    id: int
    headline: str
    summary: str
    source_url: str | None = None
    image_url: str | None = None
    image_credit: str | None = None
    published_at: datetime.datetime
    created_at: datetime.datetime
    # Language actually served ("en" when translation lookup fell back).
    language: str

    # Allow construction directly from ORM attribute access.
    model_config = {"from_attributes": True}
class PaginatedNewsResponse(BaseModel):
    """One page of the news feed plus keyset-pagination metadata."""

    items: list[NewsItemResponse]
    # ID to pass as `cursor` for the next page; None on the last page.
    next_cursor: int | None = None
    has_more: bool = False
class NewsTranslationResponse(BaseModel):
    """API representation of a stored translation row."""

    id: int
    news_item_id: int
    language: str
    headline: str
    summary: str
    created_at: datetime.datetime

    # Allow construction directly from ORM attribute access.
    model_config = {"from_attributes": True}
class HealthResponse(BaseModel):
    """Payload for /api/health: status, app version, and live item count."""

    status: str
    version: str
    news_count: int

Binary file not shown.

After

Width:  |  Height:  |  Size: 182 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.5 KiB