Add admin CLI commands (fetch, refetch-images, clean-archive, clear-cache, clear-news, rebuild-site, regenerate-translations) and news-service image/query improvements

This commit is contained in:
2026-02-13 02:32:06 -05:00
parent c8f98c54c9
commit bf4a40f533
152 changed files with 2210 additions and 19 deletions

View File

@@ -1,13 +1,31 @@
import argparse
import asyncio
import datetime
import json
import logging
import os
import re
import sys
import time
from sqlalchemy import and_, desc
from backend import config
from backend.database import init_db
from backend.news_service import process_and_store_news
from backend.database import SessionLocal, init_db
from backend.models import NewsItem
from backend.news_service import (
download_and_optimize_image,
extract_image_keywords,
fetch_royalty_free_image,
generate_translations,
process_and_store_news,
)
from backend.repository import (
create_translation,
delete_archived_news,
get_translation,
resolve_tldr_points,
)
logging.basicConfig(
level=logging.INFO,
@@ -16,6 +34,131 @@ logging.basicConfig(
logger = logging.getLogger(__name__)
def positive_int(value: str) -> int:
    """Argparse type check: accept only integers strictly greater than zero."""
    try:
        number = int(value)
    except ValueError as exc:
        raise argparse.ArgumentTypeError("must be an integer") from exc
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("must be greater than 0")
def bounded_count(value: str) -> int:
    """Argparse type check: a positive integer no larger than 50."""
    count = positive_int(value)
    if count <= 50:
        return count
    raise argparse.ArgumentTypeError("must be <= 50")
def print_result(command: str, status: str, **fields: object) -> None:
    """Emit one machine-readable status line for an admin command.

    Format: ``admin:<command> status=<status> key=value ...``
    """
    tokens = [f"admin:{command}", f"status={status}"]
    for key, value in fields.items():
        tokens.append(f"{key}={value}")
    print(" ".join(tokens))
def require_confirm(args: argparse.Namespace, action: str) -> bool:
    """Gate for destructive commands: True only when --confirm was passed.

    When the flag is absent, prints a blocked-status line with a hint and
    returns False so the caller can bail out.
    """
    if not getattr(args, "confirm", False):
        print_result(
            action,
            "blocked",
            reason="missing-confirm",
            hint="rerun with --confirm",
        )
        return False
    return True
def build_contextual_query(headline: str, summary: str | None) -> str:
    """Build an image-search query from headline/summary keywords plus mood terms.

    Mood terms are appended when the text suggests a positive ("innovation",
    "future") or negative ("serious", "technology") tone. Falls back to a
    generic AI query when no usable keywords survive cleanup.
    """
    lowered = f"{headline} {summary or ''}".lower()
    mood: list[str] = []
    if any(term in lowered for term in ("breakthrough", "launch", "record", "surge", "growth")):
        mood += ["innovation", "future"]
    if any(term in lowered for term in ("risk", "lawsuit", "ban", "decline", "drop", "crash")):
        mood += ["serious", "technology"]
    parts = [
        extract_image_keywords(headline),
        extract_image_keywords(summary or ""),
        " ".join(mood),
    ]
    merged = re.sub(r"\s+", " ", " ".join(parts).strip()).strip()
    return merged or "ai machine learning deep learning"
async def refetch_images_for_latest(
    limit: int,
    max_attempts: int,
    dry_run: bool,
) -> tuple[int, int]:
    """Refetch, optimize, and store images for the newest non-archived articles.

    Args:
        limit: Maximum number of articles to process (newest first).
        max_attempts: Per-article retry budget for the fetch+download pipeline.
        dry_run: When True, count successes but write nothing to the database.

    Returns:
        (processed, refreshed): articles examined vs. images actually replaced.
    """
    db = SessionLocal()
    processed = 0
    refreshed = 0
    try:
        # Newest non-archived items first; `limit` bounds the batch size.
        items = (
            db.query(NewsItem)
            .filter(NewsItem.archived.is_(False))
            .order_by(desc(NewsItem.published_at))
            .limit(limit)
            .all()
        )
        total = len(items)
        for idx, item in enumerate(items, start=1):
            processed += 1
            query = build_contextual_query(item.headline, item.summary)
            image_url: str | None = None
            image_credit: str | None = None
            local_image: str | None = None
            # Retry the whole fetch+download pipeline with exponential backoff.
            # Internal failures are raised as RuntimeError so one except arm
            # handles both provider and download errors uniformly.
            for attempt in range(max_attempts):
                try:
                    image_url, image_credit = await fetch_royalty_free_image(query)
                    if not image_url:
                        raise RuntimeError("no-image-url")
                    local_image = await download_and_optimize_image(image_url)
                    if not local_image:
                        raise RuntimeError("image-download-or-optimize-failed")
                    break
                except Exception:
                    if attempt == max_attempts - 1:
                        # Retry budget exhausted: log and leave this article unchanged.
                        logger.exception("Image refetch failed for item=%s after retries", item.id)
                        image_url = None
                        local_image = None
                        break
                    delay = 2**attempt  # exponential backoff: 1s, 2s, 4s, ...
                    logger.warning(
                        "Refetch retry item=%s attempt=%d delay=%ds",
                        item.id,
                        attempt + 1,
                        delay,
                    )
                    await asyncio.sleep(delay)
            if local_image:
                refreshed += 1
                if not dry_run:
                    item.summary_image_url = local_image
                    # Keep the previous credit when the provider returned none.
                    item.summary_image_credit = image_credit or item.summary_image_credit
                    db.commit()  # commit per item so partial progress survives a crash
            # One progress line per article for operator feedback.
            print_result(
                "refetch-images",
                "progress",
                current=idx,
                total=total,
                refreshed=refreshed,
                article_id=item.id,
            )
        return processed, refreshed
    finally:
        db.close()
def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(prog="clawfort", description="ClawFort operations CLI")
subparsers = parser.add_subparsers(dest="command", required=True)
@@ -27,6 +170,64 @@ def build_parser() -> argparse.ArgumentParser:
)
force_fetch_parser.set_defaults(handler=handle_force_fetch)
admin_parser = subparsers.add_parser(
"admin",
help="Administrative maintenance commands",
description="Run admin maintenance and recovery operations.",
)
admin_subparsers = admin_parser.add_subparsers(dest="admin_command", required=True)
fetch_parser = admin_subparsers.add_parser("fetch", help="Fetch n number of articles")
fetch_parser.add_argument("--count", type=bounded_count, default=5)
fetch_parser.set_defaults(handler=handle_admin_fetch)
refetch_parser = admin_subparsers.add_parser(
"refetch-images",
help="Refetch and optimize latest article images",
)
refetch_parser.add_argument("--limit", type=positive_int, default=30)
refetch_parser.add_argument("--max-attempts", type=positive_int, default=4)
refetch_parser.add_argument("--dry-run", action="store_true")
refetch_parser.set_defaults(handler=handle_admin_refetch_images)
clean_archive_parser = admin_subparsers.add_parser(
"clean-archive",
help="Delete archived items older than retention window",
)
clean_archive_parser.add_argument("--days", type=positive_int, default=60)
clean_archive_parser.add_argument("--confirm", action="store_true")
clean_archive_parser.add_argument("--dry-run", action="store_true")
clean_archive_parser.set_defaults(handler=handle_admin_clean_archive)
clear_cache_parser = admin_subparsers.add_parser(
"clear-cache", help="Clear optimized image cache"
)
clear_cache_parser.add_argument("--confirm", action="store_true")
clear_cache_parser.add_argument("--dry-run", action="store_true")
clear_cache_parser.set_defaults(handler=handle_admin_clear_cache)
clear_news_parser = admin_subparsers.add_parser("clear-news", help="Clear existing news items")
clear_news_parser.add_argument("--include-archived", action="store_true")
clear_news_parser.add_argument("--confirm", action="store_true")
clear_news_parser.add_argument("--dry-run", action="store_true")
clear_news_parser.set_defaults(handler=handle_admin_clear_news)
rebuild_parser = admin_subparsers.add_parser(
"rebuild-site", help="Clear and rebuild site content"
)
rebuild_parser.add_argument("--count", type=bounded_count, default=5)
rebuild_parser.add_argument("--confirm", action="store_true")
rebuild_parser.add_argument("--dry-run", action="store_true")
rebuild_parser.set_defaults(handler=handle_admin_rebuild_site)
regen_parser = admin_subparsers.add_parser(
"regenerate-translations",
help="Regenerate translations for existing articles",
)
regen_parser.add_argument("--limit", type=positive_int, default=0)
regen_parser.add_argument("--dry-run", action="store_true")
regen_parser.set_defaults(handler=handle_admin_regenerate_translations)
return parser
@@ -60,6 +261,221 @@ def handle_force_fetch(_: argparse.Namespace) -> int:
return 1
def handle_admin_fetch(args: argparse.Namespace) -> int:
    """CLI handler: fetch and store up to ``args.count`` fresh articles.

    Returns 0 on success, 1 on any failure (details go to the log).
    """
    started = time.monotonic()
    try:
        validate_runtime()
        init_db()
        stored = asyncio.run(process_and_store_news(article_count=args.count))
        took = time.monotonic() - started
        print_result("fetch", "ok", requested=args.count, stored=stored, elapsed=f"{took:.1f}s")
        return 0
    except Exception:
        logger.exception("admin fetch failed")
        print_result("fetch", "error")
        return 1
def handle_admin_refetch_images(args: argparse.Namespace) -> int:
    """CLI handler: re-download and optimize images for the latest articles.

    Returns 0 on success, 1 on any failure.
    """
    started = time.monotonic()
    try:
        init_db()
        # NOTE(review): the effective limit is silently capped at 30 even
        # though the parser accepts any positive --limit; confirm intent.
        processed, refreshed = asyncio.run(
            refetch_images_for_latest(
                limit=min(args.limit, 30),
                max_attempts=args.max_attempts,
                dry_run=args.dry_run,
            )
        )
        took = time.monotonic() - started
        print_result(
            "refetch-images",
            "ok",
            processed=processed,
            refreshed=refreshed,
            dry_run=args.dry_run,
            elapsed=f"{took:.1f}s",
        )
        return 0
    except Exception:
        logger.exception("admin refetch-images failed")
        print_result("refetch-images", "error")
        return 1
def handle_admin_clean_archive(args: argparse.Namespace) -> int:
    """CLI handler: delete archived news items older than ``--days``.

    Returns 0 on success, 1 on error, 2 when --confirm is missing.
    """
    if not require_confirm(args, "clean-archive"):
        return 2
    db = SessionLocal()
    try:
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=args.days)
        # Dry-run preview counts rows by created_at, but the real deletion
        # delegates to delete_archived_news(days_after_archive=...), which may
        # filter on a different timestamp column.
        # NOTE(review): verify both paths use the same criterion, otherwise
        # would_delete can differ from the actual deleted count.
        query = db.query(NewsItem).filter(
            and_(NewsItem.archived.is_(True), NewsItem.created_at < cutoff)
        )
        count = query.count()
        if args.dry_run:
            print_result("clean-archive", "ok", dry_run=True, would_delete=count)
            return 0
        deleted = delete_archived_news(db, days_after_archive=args.days)
        print_result("clean-archive", "ok", deleted=deleted)
        return 0
    except Exception:
        logger.exception("admin clean-archive failed")
        print_result("clean-archive", "error")
        return 1
    finally:
        db.close()
def handle_admin_clear_cache(args: argparse.Namespace) -> int:
    """CLI handler: remove optimized image files from the static images dir.

    Returns 0 on success, 1 on error, 2 when --confirm is missing.
    """
    if not require_confirm(args, "clear-cache"):
        return 2
    try:
        os.makedirs(config.STATIC_IMAGES_DIR, exist_ok=True)
        image_suffixes = (".jpg", ".jpeg", ".png", ".webp")
        targets = []
        for name in os.listdir(config.STATIC_IMAGES_DIR):
            if name.lower().endswith(image_suffixes):
                targets.append(os.path.join(config.STATIC_IMAGES_DIR, name))
        if args.dry_run:
            print_result("clear-cache", "ok", dry_run=True, would_delete=len(targets))
            return 0
        removed = 0
        for path in targets:
            try:
                os.remove(path)
            except OSError:
                # Best effort: skip files we cannot delete and keep going.
                logger.warning("Failed to remove cache file: %s", path)
            else:
                removed += 1
        print_result("clear-cache", "ok", deleted=removed)
        return 0
    except Exception:
        logger.exception("admin clear-cache failed")
        print_result("clear-cache", "error")
        return 1
def handle_admin_clear_news(args: argparse.Namespace) -> int:
    """CLI handler: delete news items (active only, unless --include-archived).

    Returns 0 on success, 1 on error, 2 when --confirm is missing.
    """
    if not require_confirm(args, "clear-news"):
        return 2
    db = SessionLocal()
    try:
        query = db.query(NewsItem)
        if not args.include_archived:
            query = query.filter(NewsItem.archived.is_(False))
        targets = query.all()
        if args.dry_run:
            print_result("clear-news", "ok", dry_run=True, would_delete=len(targets))
            return 0
        # Deletes row-by-row through the session (not a bulk delete), so any
        # ORM-level cascade/event behavior applies to each record.
        removed = 0
        for record in targets:
            db.delete(record)
            removed += 1
        db.commit()
        print_result("clear-news", "ok", deleted=removed)
        return 0
    except Exception:
        db.rollback()
        logger.exception("admin clear-news failed")
        print_result("clear-news", "error")
        return 1
    finally:
        db.close()
def handle_admin_rebuild_site(args: argparse.Namespace) -> int:
    """CLI handler: wipe all news (including archived) then fetch a fresh batch.

    Returns 0 on success, the failing step's exit code on error, 2 when
    --confirm is missing.
    """
    if not require_confirm(args, "rebuild-site"):
        return 2
    if args.dry_run:
        print_result("rebuild-site", "ok", dry_run=True, steps="clear-news,fetch")
        return 0
    # Step 1: reuse the clear-news handler with confirmation forced on.
    clear_rc = handle_admin_clear_news(
        argparse.Namespace(include_archived=True, confirm=True, dry_run=False)
    )
    if clear_rc != 0:
        print_result("rebuild-site", "error", step="clear-news")
        return clear_rc
    # Step 2: fetch a fresh batch of articles.
    fetch_rc = handle_admin_fetch(argparse.Namespace(count=args.count))
    if fetch_rc != 0:
        print_result("rebuild-site", "error", step="fetch")
        return fetch_rc
    print_result("rebuild-site", "ok", count=args.count)
    return 0
def handle_admin_regenerate_translations(args: argparse.Namespace) -> int:
    """CLI handler: rebuild translations for non-archived articles.

    Walks the newest articles first (optionally capped by --limit),
    regenerates every language's translation payload, and upserts it.
    With --dry-run the work is counted but nothing is written or committed.

    Returns 0 on success, 1 on error.
    """
    db = SessionLocal()
    try:
        query = db.query(NewsItem).filter(NewsItem.archived.is_(False)).order_by(desc(NewsItem.id))
        # --limit 0 (the default) means "no limit".
        if args.limit and args.limit > 0:
            query = query.limit(args.limit)
        items = query.all()
        regenerated = 0
        for item in items:
            tldr_points = resolve_tldr_points(item, None)
            # NOTE(review): asyncio.run() creates a fresh event loop per
            # article; fine for a CLI batch, slow for very large backfills.
            translations = asyncio.run(
                generate_translations(
                    headline=item.headline,
                    summary=item.summary,
                    tldr_points=tldr_points,
                    summary_body=item.summary_body,
                    source_citation=item.source_citation,
                )
            )
            for language_code, payload in translations.items():
                # In dry-run mode, count what would be written and skip I/O.
                if args.dry_run:
                    regenerated += 1
                    continue
                existing = get_translation(db, item.id, language_code)
                if existing is None:
                    # New language row: delegate creation to the repository,
                    # passing tldr_points through unserialized.
                    create_translation(
                        db=db,
                        news_item_id=item.id,
                        language=language_code,
                        headline=payload["headline"],
                        summary=payload["summary"],
                        tldr_points=payload.get("tldr_points"),
                        summary_body=payload.get("summary_body"),
                        source_citation=payload.get("source_citation"),
                    )
                else:
                    # Update in place. tldr_points is JSON-encoded here but
                    # passed raw to create_translation above.
                    # NOTE(review): confirm create_translation serializes
                    # tldr_points itself, otherwise the create and update
                    # paths store different representations.
                    existing.headline = payload["headline"]
                    existing.summary = payload["summary"]
                    existing.tldr_points = (
                        json.dumps(payload.get("tldr_points"))
                        if payload.get("tldr_points")
                        else None
                    )
                    existing.summary_body = payload.get("summary_body")
                    existing.source_citation = payload.get("source_citation")
                regenerated += 1
        if not args.dry_run:
            # Single commit at the end: the whole batch is all-or-nothing.
            db.commit()
        print_result(
            "regenerate-translations",
            "ok",
            articles=len(items),
            regenerated=regenerated,
            dry_run=args.dry_run,
        )
        return 0
    except Exception:
        db.rollback()
        logger.exception("admin regenerate-translations failed")
        print_result("regenerate-translations", "error")
        return 1
    finally:
        db.close()
def main(argv: list[str] | None = None) -> int:
parser = build_parser()
args = parser.parse_args(argv)

View File

@@ -5,6 +5,7 @@ import logging
import os
import re
import time
from collections.abc import Awaitable, Callable
from io import BytesIO
from urllib.parse import quote_plus
@@ -23,6 +24,7 @@ from backend.repository import (
logger = logging.getLogger(__name__)
PLACEHOLDER_IMAGE_PATH = "/static/images/placeholder.png"
GENERIC_AI_FALLBACK_URL = "https://placehold.co/1200x630/0f172a/e2e8f0/png?text=AI+News"
async def call_perplexity_api(query: str) -> dict | None:
@@ -419,7 +421,7 @@ def extract_image_keywords(headline: str) -> str:
- Handles edge cases (empty, only stop words, special characters)
"""
if not headline or not headline.strip():
return "news technology"
return "ai machine learning deep learning"
# Normalize: remove special characters, keep alphanumeric and spaces
cleaned = re.sub(r"[^\w\s]", " ", headline)
@@ -433,7 +435,7 @@ def extract_image_keywords(headline: str) -> str:
keywords = keywords[:5]
if not keywords:
return "news technology"
return "ai machine learning deep learning"
return " ".join(keywords)
@@ -465,7 +467,7 @@ async def fetch_pixabay_image(query: str) -> tuple[str | None, str | None]:
except Exception:
logger.exception("Pixabay image retrieval failed")
return None, None
return GENERIC_AI_FALLBACK_URL, "Generic AI fallback"
async def fetch_unsplash_image(query: str) -> tuple[str | None, str | None]:
@@ -569,7 +571,9 @@ _PROVIDER_REGISTRY: dict[str, tuple] = {
}
def get_enabled_providers() -> list[tuple[str, callable]]:
def get_enabled_providers() -> list[
tuple[str, Callable[[str], Awaitable[tuple[str | None, str | None]]]]
]:
"""Get ordered list of enabled providers based on config and available API keys."""
provider_names = [
p.strip().lower() for p in config.ROYALTY_IMAGE_PROVIDERS.split(",") if p.strip()
@@ -663,8 +667,16 @@ async def download_and_optimize_image(image_url: str) -> str | None:
return None
async def fetch_news_with_retry(max_attempts: int = 3) -> list[dict]:
async def fetch_news_with_retry(
max_attempts: int = 3, article_count: int | None = None
) -> list[dict]:
query = "What are the latest AI news from the last hour? Include source URLs and image URLs."
if article_count is not None:
bounded = max(1, min(50, int(article_count)))
query = (
f"What are the latest AI news from the last hour? Return exactly {bounded} items. "
"Include source URLs and image URLs."
)
for attempt in range(max_attempts):
try:
@@ -687,8 +699,8 @@ async def fetch_news_with_retry(max_attempts: int = 3) -> list[dict]:
return []
async def process_and_store_news() -> int:
items = await fetch_news_with_retry()
async def process_and_store_news(article_count: int | None = None) -> int:
items = await fetch_news_with_retry(article_count=article_count)
if not items:
logger.warning("No news items fetched this cycle")
return 0

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 68 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 43 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 40 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 63 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 63 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 44 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 65 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 113 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 67 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 105 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 44 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 135 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 81 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 50 KiB