First deployment
Some checks failed
quality-gates / lint-and-test (push) Has been cancelled
quality-gates / security-scan (push) Has been cancelled

This commit is contained in:
2026-02-13 09:14:04 -05:00
parent 0e21e035f5
commit 679561bcdb
128 changed files with 3479 additions and 120 deletions

View File

@@ -14,6 +14,7 @@ from backend import config
from backend.database import SessionLocal, init_db
from backend.models import NewsItem
from backend.news_service import (
GENERIC_AI_FALLBACK_URL,
download_and_optimize_image,
extract_image_keywords,
fetch_royalty_free_image,
@@ -87,56 +88,105 @@ def build_contextual_query(headline: str, summary: str | None) -> str:
return cleaned
def resolve_article_id_from_permalink(value: str | None) -> int | None:
if not value:
return None
if value.isdigit():
return int(value)
match = re.search(r"(?:\?|&)article=(\d+)", value)
if match:
return int(match.group(1))
return None
def is_unrelated_image_candidate(image_url: str | None, image_credit: str | None) -> bool:
text = f"{image_url or ''} {image_credit or ''}".lower()
blocked = (
"cat",
"dog",
"pet",
"animal",
"wildlife",
"lion",
"tiger",
"bird",
"horse",
)
return any(term in text for term in blocked)
async def refetch_image_for_item(
    item: NewsItem,
    max_attempts: int,
) -> tuple[str | None, str | None, str]:
    """Try to fetch a fresh, relevant, non-duplicate image for one article.

    Walks a list of query variants (most specific first), retrying each up to
    ``max_attempts`` times with exponential backoff. Candidates that look
    unrelated, fail to download/optimize, or duplicate the article's current
    image are skipped. Returns ``(local_image_path, credit, decision)`` where
    decision is ``"provider"``, ``"fallback"``, or ``"none"``.
    """
    base_query = build_contextual_query(item.headline, item.summary)
    existing_image = item.summary_image_url
    # Specific variants first; the bare contextual query is the last resort.
    variants = [
        f"{base_query} alternative angle",
        f"{base_query} concept illustration",
        base_query,
    ]
    for variant in variants:
        for attempt in range(max_attempts):
            try:
                candidate_url, candidate_credit = await fetch_royalty_free_image(variant)
                if not candidate_url:
                    raise RuntimeError("no-image-url")
                if is_unrelated_image_candidate(candidate_url, candidate_credit):
                    logger.info("Rejected unrelated image candidate: %s", candidate_url)
                    continue
                optimized = await download_and_optimize_image(candidate_url)
                if not optimized:
                    raise RuntimeError("image-download-or-optimize-failed")
                if existing_image and optimized == existing_image:
                    logger.info("Rejected duplicate image candidate for article=%s", item.id)
                    continue
                return optimized, candidate_credit, "provider"
            except Exception:
                # Exponential backoff between retries; no sleep after the
                # final attempt for a variant.
                if attempt < max_attempts - 1:
                    await asyncio.sleep(2**attempt)
    # Provider chain exhausted: fall back to the generic AI placeholder,
    # unless it would just repeat the image the article already has.
    fallback_image = await download_and_optimize_image(GENERIC_AI_FALLBACK_URL)
    if fallback_image and fallback_image != existing_image:
        return fallback_image, "AI-themed fallback", "fallback"
    return None, None, "none"
async def refetch_images_for_latest(
limit: int,
max_attempts: int,
dry_run: bool,
target_article_id: int | None = None,
) -> tuple[int, int]:
db = SessionLocal()
processed = 0
refreshed = 0
try:
items = (
db.query(NewsItem)
.filter(NewsItem.archived.is_(False))
.order_by(desc(NewsItem.published_at))
.limit(limit)
.all()
)
if target_article_id is not None:
items = (
db.query(NewsItem)
.filter(NewsItem.archived.is_(False), NewsItem.id == target_article_id)
.all()
)
else:
items = (
db.query(NewsItem)
.filter(NewsItem.archived.is_(False))
.order_by(desc(NewsItem.published_at))
.limit(limit)
.all()
)
total = len(items)
for idx, item in enumerate(items, start=1):
processed += 1
query = build_contextual_query(item.headline, item.summary)
image_url: str | None = None
image_credit: str | None = None
local_image: str | None = None
for attempt in range(max_attempts):
try:
image_url, image_credit = await fetch_royalty_free_image(query)
if not image_url:
raise RuntimeError("no-image-url")
local_image = await download_and_optimize_image(image_url)
if not local_image:
raise RuntimeError("image-download-or-optimize-failed")
break
except Exception:
if attempt == max_attempts - 1:
logger.exception("Image refetch failed for item=%s after retries", item.id)
image_url = None
local_image = None
break
delay = 2**attempt
logger.warning(
"Refetch retry item=%s attempt=%d delay=%ds",
item.id,
attempt + 1,
delay,
)
await asyncio.sleep(delay)
local_image, image_credit, decision = await refetch_image_for_item(
item=item,
max_attempts=max_attempts,
)
if local_image:
refreshed += 1
@@ -152,6 +202,7 @@ async def refetch_images_for_latest(
total=total,
refreshed=refreshed,
article_id=item.id,
decision=decision,
)
return processed, refreshed
@@ -186,6 +237,12 @@ def build_parser() -> argparse.ArgumentParser:
help="Refetch and optimize latest article images",
)
refetch_parser.add_argument("--limit", type=positive_int, default=30)
refetch_parser.add_argument(
"--permalink",
type=str,
default="",
help="Target one article by permalink (for example '/?article=123' or '123')",
)
refetch_parser.add_argument("--max-attempts", type=positive_int, default=4)
refetch_parser.add_argument("--dry-run", action="store_true")
refetch_parser.set_defaults(handler=handle_admin_refetch_images)
@@ -280,11 +337,22 @@ def handle_admin_refetch_images(args: argparse.Namespace) -> int:
start = time.monotonic()
try:
init_db()
target_article_id = resolve_article_id_from_permalink(args.permalink)
if args.permalink and target_article_id is None:
print_result(
"refetch-images",
"blocked",
reason="invalid-permalink",
hint="use '/?article=<id>' or raw numeric id",
)
return 2
processed, refreshed = asyncio.run(
refetch_images_for_latest(
limit=min(args.limit, 30),
max_attempts=args.max_attempts,
dry_run=args.dry_run,
target_article_id=target_article_id,
)
)
elapsed = time.monotonic() - start
@@ -293,6 +361,7 @@ def handle_admin_refetch_images(args: argparse.Namespace) -> int:
"ok",
processed=processed,
refreshed=refreshed,
target_article_id=target_article_id,
dry_run=args.dry_run,
elapsed=f"{elapsed:.1f}s",
)

View File

@@ -37,18 +37,18 @@ app = FastAPI(title="ClawFort News API", version="0.1.0")
_ERROR_MESSAGES = {
404: [
"Oh no! This page wandered off to train a tiny model.",
"Oh no! We looked everywhere, even in the latent space.",
"Oh no! The link took a creative detour.",
"Oh no! This route is currently off doing research.",
"Oh no! The page you asked for is not in this timeline.",
"This page wandered off to train a tiny model.",
"We looked everywhere, even in the latent space.",
"The link took a creative detour.",
"This route is currently off doing research.",
"The page you asked for is not in this timeline.",
],
500: [
"Oh no! The server hit a logic knot and needs a quick reset.",
"Oh no! Our robots dropped a semicolon somewhere important.",
"Oh no! A background process got stage fright.",
"Oh no! The AI took an unexpected coffee break.",
"Oh no! Something internal blinked at the wrong moment.",
"The server hit a logic knot and needs a quick reset.",
"Our robots dropped a semicolon somewhere important.",
"A background process got stage fright.",
"The AI took an unexpected coffee break.",
"Something internal blinked at the wrong moment.",
],
}

View File

@@ -25,6 +25,49 @@ logger = logging.getLogger(__name__)
# Local placeholder served when an article has no usable image at all.
PLACEHOLDER_IMAGE_PATH = "/static/images/placeholder.png"
# Remote last-resort placeholder images (placehold.co), one generic AI-themed
# and one finance-themed variant.
GENERIC_AI_FALLBACK_URL = "https://placehold.co/1200x630/0f172a/e2e8f0/png?text=AI+News"
GENERIC_FINANCE_FALLBACK_URL = "https://placehold.co/1200x630/0f172a/e2e8f0/png?text=Market+News"

# Lowercase keywords that classify a story as finance-related.
# NOTE(review): matched via substring containment by callers — terms like
# "dow" or "ipo" may hit inside longer words; confirm acceptable.
_FINANCE_TOPIC_TERMS = frozenset(
    {
        "finance",
        "financial",
        "market",
        "markets",
        "stock",
        "stocks",
        "share",
        "shares",
        "earnings",
        "investor",
        "investors",
        "nasdaq",
        "nyse",
        "dow",
        "s&p",
        "bank",
        "banking",
        "revenue",
        "profit",
        "trading",
        "ipo",
        "valuation",
    }
)
# Substrings in an image URL/credit that disqualify it for finance stories
# (animals, people shots, food — anything off-topic for market news).
_FINANCE_IMAGE_BLOCKLIST = (
    "cat",
    "dog",
    "pet",
    "lion",
    "tiger",
    "bird",
    "horse",
    "portrait",
    "selfie",
    "wedding",
    "food",
    "nature-only",
)
async def call_perplexity_api(query: str) -> dict | None:
@@ -174,6 +217,43 @@ def parse_translation_response(response: dict) -> dict | None:
return None
def validate_translation_quality(
headline: str, summary: str, language_code: str
) -> tuple[bool, str | None]:
text = f"{headline} {summary}".strip()
if not headline or not summary:
return False, "empty-content"
if len(text) < 20:
return False, "too-short"
repeated_runs = re.search(r"(.)\1{6,}", text)
if repeated_runs:
return False, "repeated-sequence"
lines = [segment.strip() for segment in re.split(r"[.!?]\s+", text) if segment.strip()]
if lines:
unique_ratio = len(set(lines)) / len(lines)
if unique_ratio < 0.4:
return False, "low-unique-content"
if language_code == "ta":
script_hits = sum(1 for char in text if "\u0b80" <= char <= "\u0bff")
elif language_code == "ml":
script_hits = sum(1 for char in text if "\u0d00" <= char <= "\u0d7f")
else:
return True, None
alpha_hits = sum(1 for char in text if char.isalpha())
if alpha_hits == 0:
return False, "no-alpha-content"
script_ratio = script_hits / alpha_hits
if script_ratio < 0.35:
return False, "script-mismatch"
return True, None
async def generate_translations(
headline: str,
summary: str,
@@ -200,7 +280,20 @@ async def generate_translations(
if response:
parsed = parse_translation_response(response)
if parsed:
translations[language_code] = parsed
is_valid, reason = validate_translation_quality(
parsed["headline"],
parsed["summary"],
language_code,
)
if is_valid:
logger.info("Translation accepted for %s", language_code)
translations[language_code] = parsed
else:
logger.warning(
"Translation rejected for %s: %s",
language_code,
reason,
)
except Exception:
logger.exception("Translation generation failed for %s", language_code)
@@ -467,7 +560,7 @@ async def fetch_pixabay_image(query: str) -> tuple[str | None, str | None]:
except Exception:
logger.exception("Pixabay image retrieval failed")
return GENERIC_AI_FALLBACK_URL, "Generic AI fallback"
return None, None
async def fetch_unsplash_image(query: str) -> tuple[str | None, str | None]:
@@ -591,6 +684,15 @@ def get_enabled_providers() -> list[
async def fetch_royalty_free_image(query: str) -> tuple[str | None, str | None]:
"""Fetch royalty-free image using provider chain with fallback."""
def is_finance_story(text: str) -> bool:
lowered = (text or "").lower()
return any(term in lowered for term in _FINANCE_TOPIC_TERMS)
def is_finance_safe_image(image_url: str, credit: str | None) -> bool:
haystack = f"{image_url or ''} {credit or ''}".lower()
return not any(term in haystack for term in _FINANCE_IMAGE_BLOCKLIST)
# MCP endpoint takes highest priority if configured
if config.ROYALTY_IMAGE_MCP_ENDPOINT:
try:
@@ -610,15 +712,35 @@ async def fetch_royalty_free_image(query: str) -> tuple[str | None, str | None]:
# Extract keywords for better image search
refined_query = extract_image_keywords(query)
finance_story = is_finance_story(query)
query_variants = [refined_query]
if finance_story:
query_variants = [
f"{refined_query} stock market trading chart finance business",
refined_query,
]
# Try each enabled provider in order
for provider_name, fetch_fn in get_enabled_providers():
try:
image_url, credit = await fetch_fn(refined_query)
if image_url:
for query_variant in query_variants:
for provider_name, fetch_fn in get_enabled_providers():
try:
image_url, credit = await fetch_fn(query_variant)
if not image_url:
continue
if finance_story and not is_finance_safe_image(image_url, credit):
logger.info(
"Rejected non-finance-safe image from %s for query '%s': %s",
provider_name,
query_variant,
image_url,
)
continue
return image_url, credit
except Exception:
logger.exception("%s image retrieval failed", provider_name.capitalize())
except Exception:
logger.exception("%s image retrieval failed", provider_name.capitalize())
if finance_story:
return GENERIC_FINANCE_FALLBACK_URL, "Finance-safe fallback"
return None, None