First deployment
Some checks failed
quality-gates / lint-and-test (push) Has been cancelled
quality-gates / security-scan (push) Has been cancelled

This commit is contained in:
2026-02-13 09:14:04 -05:00
parent 0e21e035f5
commit 679561bcdb
128 changed files with 3479 additions and 120 deletions

View File

@@ -14,6 +14,7 @@ from backend import config
from backend.database import SessionLocal, init_db
from backend.models import NewsItem
from backend.news_service import (
GENERIC_AI_FALLBACK_URL,
download_and_optimize_image,
extract_image_keywords,
fetch_royalty_free_image,
@@ -87,56 +88,105 @@ def build_contextual_query(headline: str, summary: str | None) -> str:
return cleaned
def resolve_article_id_from_permalink(value: str | None) -> int | None:
if not value:
return None
if value.isdigit():
return int(value)
match = re.search(r"(?:\?|&)article=(\d+)", value)
if match:
return int(match.group(1))
return None
def is_unrelated_image_candidate(image_url: str | None, image_credit: str | None) -> bool:
text = f"{image_url or ''} {image_credit or ''}".lower()
blocked = (
"cat",
"dog",
"pet",
"animal",
"wildlife",
"lion",
"tiger",
"bird",
"horse",
)
return any(term in text for term in blocked)
async def refetch_image_for_item(
    item: NewsItem,
    max_attempts: int,
) -> tuple[str | None, str | None, str]:
    """Try to fetch a fresh, relevant, non-duplicate image for one article.

    Walks a list of query variants (most specific first), retrying each up to
    ``max_attempts`` times with exponential backoff. Candidates that look
    unrelated, fail to download/optimize, or duplicate the article's current
    image are skipped. Returns ``(local_image_path, credit, decision)`` where
    decision is ``"provider"``, ``"fallback"``, or ``"none"``.
    """
    base_query = build_contextual_query(item.headline, item.summary)
    existing_image = item.summary_image_url
    # Specific variants first; the bare contextual query is the last resort.
    variants = [
        f"{base_query} alternative angle",
        f"{base_query} concept illustration",
        base_query,
    ]
    for variant in variants:
        for attempt in range(max_attempts):
            try:
                candidate_url, candidate_credit = await fetch_royalty_free_image(variant)
                if not candidate_url:
                    raise RuntimeError("no-image-url")
                if is_unrelated_image_candidate(candidate_url, candidate_credit):
                    logger.info("Rejected unrelated image candidate: %s", candidate_url)
                    continue
                optimized = await download_and_optimize_image(candidate_url)
                if not optimized:
                    raise RuntimeError("image-download-or-optimize-failed")
                if existing_image and optimized == existing_image:
                    logger.info("Rejected duplicate image candidate for article=%s", item.id)
                    continue
                return optimized, candidate_credit, "provider"
            except Exception:
                # Exponential backoff between retries; no sleep after the
                # final attempt for a variant.
                if attempt < max_attempts - 1:
                    await asyncio.sleep(2**attempt)
    # Provider chain exhausted: fall back to the generic AI placeholder,
    # unless it would just repeat the image the article already has.
    fallback_image = await download_and_optimize_image(GENERIC_AI_FALLBACK_URL)
    if fallback_image and fallback_image != existing_image:
        return fallback_image, "AI-themed fallback", "fallback"
    return None, None, "none"
async def refetch_images_for_latest(
limit: int,
max_attempts: int,
dry_run: bool,
target_article_id: int | None = None,
) -> tuple[int, int]:
db = SessionLocal()
processed = 0
refreshed = 0
try:
items = (
db.query(NewsItem)
.filter(NewsItem.archived.is_(False))
.order_by(desc(NewsItem.published_at))
.limit(limit)
.all()
)
if target_article_id is not None:
items = (
db.query(NewsItem)
.filter(NewsItem.archived.is_(False), NewsItem.id == target_article_id)
.all()
)
else:
items = (
db.query(NewsItem)
.filter(NewsItem.archived.is_(False))
.order_by(desc(NewsItem.published_at))
.limit(limit)
.all()
)
total = len(items)
for idx, item in enumerate(items, start=1):
processed += 1
query = build_contextual_query(item.headline, item.summary)
image_url: str | None = None
image_credit: str | None = None
local_image: str | None = None
for attempt in range(max_attempts):
try:
image_url, image_credit = await fetch_royalty_free_image(query)
if not image_url:
raise RuntimeError("no-image-url")
local_image = await download_and_optimize_image(image_url)
if not local_image:
raise RuntimeError("image-download-or-optimize-failed")
break
except Exception:
if attempt == max_attempts - 1:
logger.exception("Image refetch failed for item=%s after retries", item.id)
image_url = None
local_image = None
break
delay = 2**attempt
logger.warning(
"Refetch retry item=%s attempt=%d delay=%ds",
item.id,
attempt + 1,
delay,
)
await asyncio.sleep(delay)
local_image, image_credit, decision = await refetch_image_for_item(
item=item,
max_attempts=max_attempts,
)
if local_image:
refreshed += 1
@@ -152,6 +202,7 @@ async def refetch_images_for_latest(
total=total,
refreshed=refreshed,
article_id=item.id,
decision=decision,
)
return processed, refreshed
@@ -186,6 +237,12 @@ def build_parser() -> argparse.ArgumentParser:
help="Refetch and optimize latest article images",
)
refetch_parser.add_argument("--limit", type=positive_int, default=30)
refetch_parser.add_argument(
"--permalink",
type=str,
default="",
help="Target one article by permalink (for example '/?article=123' or '123')",
)
refetch_parser.add_argument("--max-attempts", type=positive_int, default=4)
refetch_parser.add_argument("--dry-run", action="store_true")
refetch_parser.set_defaults(handler=handle_admin_refetch_images)
@@ -280,11 +337,22 @@ def handle_admin_refetch_images(args: argparse.Namespace) -> int:
start = time.monotonic()
try:
init_db()
target_article_id = resolve_article_id_from_permalink(args.permalink)
if args.permalink and target_article_id is None:
print_result(
"refetch-images",
"blocked",
reason="invalid-permalink",
hint="use '/?article=<id>' or raw numeric id",
)
return 2
processed, refreshed = asyncio.run(
refetch_images_for_latest(
limit=min(args.limit, 30),
max_attempts=args.max_attempts,
dry_run=args.dry_run,
target_article_id=target_article_id,
)
)
elapsed = time.monotonic() - start
@@ -293,6 +361,7 @@ def handle_admin_refetch_images(args: argparse.Namespace) -> int:
"ok",
processed=processed,
refreshed=refreshed,
target_article_id=target_article_id,
dry_run=args.dry_run,
elapsed=f"{elapsed:.1f}s",
)

View File

@@ -37,18 +37,18 @@ app = FastAPI(title="ClawFort News API", version="0.1.0")
_ERROR_MESSAGES = {
404: [
"Oh no! This page wandered off to train a tiny model.",
"Oh no! We looked everywhere, even in the latent space.",
"Oh no! The link took a creative detour.",
"Oh no! This route is currently off doing research.",
"Oh no! The page you asked for is not in this timeline.",
"This page wandered off to train a tiny model.",
"We looked everywhere, even in the latent space.",
"The link took a creative detour.",
"This route is currently off doing research.",
"The page you asked for is not in this timeline.",
],
500: [
"Oh no! The server hit a logic knot and needs a quick reset.",
"Oh no! Our robots dropped a semicolon somewhere important.",
"Oh no! A background process got stage fright.",
"Oh no! The AI took an unexpected coffee break.",
"Oh no! Something internal blinked at the wrong moment.",
"The server hit a logic knot and needs a quick reset.",
"Our robots dropped a semicolon somewhere important.",
"A background process got stage fright.",
"The AI took an unexpected coffee break.",
"Something internal blinked at the wrong moment.",
],
}

View File

@@ -25,6 +25,49 @@ logger = logging.getLogger(__name__)
# Local placeholder served when an article has no usable image at all.
PLACEHOLDER_IMAGE_PATH = "/static/images/placeholder.png"
# Remote last-resort placeholder images (placehold.co), one generic AI-themed
# and one finance-themed variant.
GENERIC_AI_FALLBACK_URL = "https://placehold.co/1200x630/0f172a/e2e8f0/png?text=AI+News"
GENERIC_FINANCE_FALLBACK_URL = "https://placehold.co/1200x630/0f172a/e2e8f0/png?text=Market+News"

# Lowercase keywords that classify a story as finance-related.
# NOTE(review): matched via substring containment by callers — terms like
# "dow" or "ipo" may hit inside longer words; confirm acceptable.
_FINANCE_TOPIC_TERMS = frozenset(
    {
        "finance",
        "financial",
        "market",
        "markets",
        "stock",
        "stocks",
        "share",
        "shares",
        "earnings",
        "investor",
        "investors",
        "nasdaq",
        "nyse",
        "dow",
        "s&p",
        "bank",
        "banking",
        "revenue",
        "profit",
        "trading",
        "ipo",
        "valuation",
    }
)
# Substrings in an image URL/credit that disqualify it for finance stories
# (animals, people shots, food — anything off-topic for market news).
_FINANCE_IMAGE_BLOCKLIST = (
    "cat",
    "dog",
    "pet",
    "lion",
    "tiger",
    "bird",
    "horse",
    "portrait",
    "selfie",
    "wedding",
    "food",
    "nature-only",
)
async def call_perplexity_api(query: str) -> dict | None:
@@ -174,6 +217,43 @@ def parse_translation_response(response: dict) -> dict | None:
return None
def validate_translation_quality(
headline: str, summary: str, language_code: str
) -> tuple[bool, str | None]:
text = f"{headline} {summary}".strip()
if not headline or not summary:
return False, "empty-content"
if len(text) < 20:
return False, "too-short"
repeated_runs = re.search(r"(.)\1{6,}", text)
if repeated_runs:
return False, "repeated-sequence"
lines = [segment.strip() for segment in re.split(r"[.!?]\s+", text) if segment.strip()]
if lines:
unique_ratio = len(set(lines)) / len(lines)
if unique_ratio < 0.4:
return False, "low-unique-content"
if language_code == "ta":
script_hits = sum(1 for char in text if "\u0b80" <= char <= "\u0bff")
elif language_code == "ml":
script_hits = sum(1 for char in text if "\u0d00" <= char <= "\u0d7f")
else:
return True, None
alpha_hits = sum(1 for char in text if char.isalpha())
if alpha_hits == 0:
return False, "no-alpha-content"
script_ratio = script_hits / alpha_hits
if script_ratio < 0.35:
return False, "script-mismatch"
return True, None
async def generate_translations(
headline: str,
summary: str,
@@ -200,7 +280,20 @@ async def generate_translations(
if response:
parsed = parse_translation_response(response)
if parsed:
translations[language_code] = parsed
is_valid, reason = validate_translation_quality(
parsed["headline"],
parsed["summary"],
language_code,
)
if is_valid:
logger.info("Translation accepted for %s", language_code)
translations[language_code] = parsed
else:
logger.warning(
"Translation rejected for %s: %s",
language_code,
reason,
)
except Exception:
logger.exception("Translation generation failed for %s", language_code)
@@ -467,7 +560,7 @@ async def fetch_pixabay_image(query: str) -> tuple[str | None, str | None]:
except Exception:
logger.exception("Pixabay image retrieval failed")
return GENERIC_AI_FALLBACK_URL, "Generic AI fallback"
return None, None
async def fetch_unsplash_image(query: str) -> tuple[str | None, str | None]:
@@ -591,6 +684,15 @@ def get_enabled_providers() -> list[
async def fetch_royalty_free_image(query: str) -> tuple[str | None, str | None]:
"""Fetch royalty-free image using provider chain with fallback."""
def is_finance_story(text: str) -> bool:
lowered = (text or "").lower()
return any(term in lowered for term in _FINANCE_TOPIC_TERMS)
def is_finance_safe_image(image_url: str, credit: str | None) -> bool:
haystack = f"{image_url or ''} {credit or ''}".lower()
return not any(term in haystack for term in _FINANCE_IMAGE_BLOCKLIST)
# MCP endpoint takes highest priority if configured
if config.ROYALTY_IMAGE_MCP_ENDPOINT:
try:
@@ -610,15 +712,35 @@ async def fetch_royalty_free_image(query: str) -> tuple[str | None, str | None]:
# Extract keywords for better image search
refined_query = extract_image_keywords(query)
finance_story = is_finance_story(query)
query_variants = [refined_query]
if finance_story:
query_variants = [
f"{refined_query} stock market trading chart finance business",
refined_query,
]
# Try each enabled provider in order
for provider_name, fetch_fn in get_enabled_providers():
try:
image_url, credit = await fetch_fn(refined_query)
if image_url:
for query_variant in query_variants:
for provider_name, fetch_fn in get_enabled_providers():
try:
image_url, credit = await fetch_fn(query_variant)
if not image_url:
continue
if finance_story and not is_finance_safe_image(image_url, credit):
logger.info(
"Rejected non-finance-safe image from %s for query '%s': %s",
provider_name,
query_variant,
image_url,
)
continue
return image_url, credit
except Exception:
logger.exception("%s image retrieval failed", provider_name.capitalize())
except Exception:
logger.exception("%s image retrieval failed", provider_name.capitalize())
if finance_story:
return GENERIC_FINANCE_FALLBACK_URL, "Finance-safe fallback"
return None, None