First deployment
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
141
backend/cli.py
141
backend/cli.py
@@ -14,6 +14,7 @@ from backend import config
|
||||
from backend.database import SessionLocal, init_db
|
||||
from backend.models import NewsItem
|
||||
from backend.news_service import (
|
||||
GENERIC_AI_FALLBACK_URL,
|
||||
download_and_optimize_image,
|
||||
extract_image_keywords,
|
||||
fetch_royalty_free_image,
|
||||
@@ -87,56 +88,105 @@ def build_contextual_query(headline: str, summary: str | None) -> str:
|
||||
return cleaned
|
||||
|
||||
|
||||
def resolve_article_id_from_permalink(value: str | None) -> int | None:
|
||||
if not value:
|
||||
return None
|
||||
if value.isdigit():
|
||||
return int(value)
|
||||
match = re.search(r"(?:\?|&)article=(\d+)", value)
|
||||
if match:
|
||||
return int(match.group(1))
|
||||
return None
|
||||
|
||||
|
||||
def is_unrelated_image_candidate(image_url: str | None, image_credit: str | None) -> bool:
|
||||
text = f"{image_url or ''} {image_credit or ''}".lower()
|
||||
blocked = (
|
||||
"cat",
|
||||
"dog",
|
||||
"pet",
|
||||
"animal",
|
||||
"wildlife",
|
||||
"lion",
|
||||
"tiger",
|
||||
"bird",
|
||||
"horse",
|
||||
)
|
||||
return any(term in text for term in blocked)
|
||||
|
||||
|
||||
async def refetch_image_for_item(
    item: NewsItem,
    max_attempts: int,
) -> tuple[str | None, str | None, str]:
    """Find a replacement image for one article.

    Three query variants derived from the article's headline and summary are
    tried in order, each up to ``max_attempts`` times. A candidate is
    rejected when ``is_unrelated_image_candidate`` flags it or when the
    downloaded result equals the article's current ``summary_image_url``.
    If no variant yields an image, the generic AI fallback URL is downloaded.

    Args:
        item: Article whose image should be replaced.
        max_attempts: Maximum tries per query variant.

    Returns:
        A ``(local_image, image_credit, decision)`` tuple. ``decision`` is
        ``"provider"`` for a provider image, ``"fallback"`` for the generic
        fallback, or ``"none"`` (with both other fields ``None``) when
        nothing usable was obtained.
    """
    query = build_contextual_query(item.headline, item.summary)
    current_summary_image = item.summary_image_url
    query_variants = [
        f"{query} alternative angle",
        f"{query} concept illustration",
        query,
    ]

    for query_variant in query_variants:
        for attempt in range(max_attempts):
            try:
                image_url, image_credit = await fetch_royalty_free_image(query_variant)
                if not image_url:
                    raise RuntimeError("no-image-url")
                if is_unrelated_image_candidate(image_url, image_credit):
                    logger.info("Rejected unrelated image candidate: %s", image_url)
                    continue
                local_image = await download_and_optimize_image(image_url)
                if not local_image:
                    raise RuntimeError("image-download-or-optimize-failed")
                if current_summary_image and local_image == current_summary_image:
                    logger.info("Rejected duplicate image candidate for article=%s", item.id)
                    continue
                return local_image, image_credit, "provider"
            except Exception as exc:
                # Best-effort: the failure is recorded rather than raised so
                # the remaining attempts and variants still run.
                logger.warning(
                    "Image refetch attempt failed for article=%s query=%r attempt=%d/%d: %s",
                    item.id,
                    query_variant,
                    attempt + 1,
                    max_attempts,
                    exc,
                )
                # Exponential backoff between attempts; no sleep after the
                # last attempt of a variant.
                if attempt < max_attempts - 1:
                    delay = 2**attempt
                    await asyncio.sleep(delay)

    fallback_local = await download_and_optimize_image(GENERIC_AI_FALLBACK_URL)
    # The fallback only counts as a refresh if it differs from the current image.
    if fallback_local and fallback_local != current_summary_image:
        return fallback_local, "AI-themed fallback", "fallback"
    return None, None, "none"
|
||||
|
||||
|
||||
async def refetch_images_for_latest(
|
||||
limit: int,
|
||||
max_attempts: int,
|
||||
dry_run: bool,
|
||||
target_article_id: int | None = None,
|
||||
) -> tuple[int, int]:
|
||||
db = SessionLocal()
|
||||
processed = 0
|
||||
refreshed = 0
|
||||
|
||||
try:
|
||||
items = (
|
||||
db.query(NewsItem)
|
||||
.filter(NewsItem.archived.is_(False))
|
||||
.order_by(desc(NewsItem.published_at))
|
||||
.limit(limit)
|
||||
.all()
|
||||
)
|
||||
if target_article_id is not None:
|
||||
items = (
|
||||
db.query(NewsItem)
|
||||
.filter(NewsItem.archived.is_(False), NewsItem.id == target_article_id)
|
||||
.all()
|
||||
)
|
||||
else:
|
||||
items = (
|
||||
db.query(NewsItem)
|
||||
.filter(NewsItem.archived.is_(False))
|
||||
.order_by(desc(NewsItem.published_at))
|
||||
.limit(limit)
|
||||
.all()
|
||||
)
|
||||
|
||||
total = len(items)
|
||||
for idx, item in enumerate(items, start=1):
|
||||
processed += 1
|
||||
query = build_contextual_query(item.headline, item.summary)
|
||||
|
||||
image_url: str | None = None
|
||||
image_credit: str | None = None
|
||||
local_image: str | None = None
|
||||
|
||||
for attempt in range(max_attempts):
|
||||
try:
|
||||
image_url, image_credit = await fetch_royalty_free_image(query)
|
||||
if not image_url:
|
||||
raise RuntimeError("no-image-url")
|
||||
local_image = await download_and_optimize_image(image_url)
|
||||
if not local_image:
|
||||
raise RuntimeError("image-download-or-optimize-failed")
|
||||
break
|
||||
except Exception:
|
||||
if attempt == max_attempts - 1:
|
||||
logger.exception("Image refetch failed for item=%s after retries", item.id)
|
||||
image_url = None
|
||||
local_image = None
|
||||
break
|
||||
delay = 2**attempt
|
||||
logger.warning(
|
||||
"Refetch retry item=%s attempt=%d delay=%ds",
|
||||
item.id,
|
||||
attempt + 1,
|
||||
delay,
|
||||
)
|
||||
await asyncio.sleep(delay)
|
||||
local_image, image_credit, decision = await refetch_image_for_item(
|
||||
item=item,
|
||||
max_attempts=max_attempts,
|
||||
)
|
||||
|
||||
if local_image:
|
||||
refreshed += 1
|
||||
@@ -152,6 +202,7 @@ async def refetch_images_for_latest(
|
||||
total=total,
|
||||
refreshed=refreshed,
|
||||
article_id=item.id,
|
||||
decision=decision,
|
||||
)
|
||||
|
||||
return processed, refreshed
|
||||
@@ -186,6 +237,12 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
help="Refetch and optimize latest article images",
|
||||
)
|
||||
refetch_parser.add_argument("--limit", type=positive_int, default=30)
|
||||
refetch_parser.add_argument(
|
||||
"--permalink",
|
||||
type=str,
|
||||
default="",
|
||||
help="Target one article by permalink (for example '/?article=123' or '123')",
|
||||
)
|
||||
refetch_parser.add_argument("--max-attempts", type=positive_int, default=4)
|
||||
refetch_parser.add_argument("--dry-run", action="store_true")
|
||||
refetch_parser.set_defaults(handler=handle_admin_refetch_images)
|
||||
@@ -280,11 +337,22 @@ def handle_admin_refetch_images(args: argparse.Namespace) -> int:
|
||||
start = time.monotonic()
|
||||
try:
|
||||
init_db()
|
||||
target_article_id = resolve_article_id_from_permalink(args.permalink)
|
||||
if args.permalink and target_article_id is None:
|
||||
print_result(
|
||||
"refetch-images",
|
||||
"blocked",
|
||||
reason="invalid-permalink",
|
||||
hint="use '/?article=<id>' or raw numeric id",
|
||||
)
|
||||
return 2
|
||||
|
||||
processed, refreshed = asyncio.run(
|
||||
refetch_images_for_latest(
|
||||
limit=min(args.limit, 30),
|
||||
max_attempts=args.max_attempts,
|
||||
dry_run=args.dry_run,
|
||||
target_article_id=target_article_id,
|
||||
)
|
||||
)
|
||||
elapsed = time.monotonic() - start
|
||||
@@ -293,6 +361,7 @@ def handle_admin_refetch_images(args: argparse.Namespace) -> int:
|
||||
"ok",
|
||||
processed=processed,
|
||||
refreshed=refreshed,
|
||||
target_article_id=target_article_id,
|
||||
dry_run=args.dry_run,
|
||||
elapsed=f"{elapsed:.1f}s",
|
||||
)
|
||||
|
||||
@@ -37,18 +37,18 @@ app = FastAPI(title="ClawFort News API", version="0.1.0")
|
||||
|
||||
_ERROR_MESSAGES = {
|
||||
404: [
|
||||
"Oh no! This page wandered off to train a tiny model.",
|
||||
"Oh no! We looked everywhere, even in the latent space.",
|
||||
"Oh no! The link took a creative detour.",
|
||||
"Oh no! This route is currently off doing research.",
|
||||
"Oh no! The page you asked for is not in this timeline.",
|
||||
"This page wandered off to train a tiny model.",
|
||||
"We looked everywhere, even in the latent space.",
|
||||
"The link took a creative detour.",
|
||||
"This route is currently off doing research.",
|
||||
"The page you asked for is not in this timeline.",
|
||||
],
|
||||
500: [
|
||||
"Oh no! The server hit a logic knot and needs a quick reset.",
|
||||
"Oh no! Our robots dropped a semicolon somewhere important.",
|
||||
"Oh no! A background process got stage fright.",
|
||||
"Oh no! The AI took an unexpected coffee break.",
|
||||
"Oh no! Something internal blinked at the wrong moment.",
|
||||
"The server hit a logic knot and needs a quick reset.",
|
||||
"Our robots dropped a semicolon somewhere important.",
|
||||
"A background process got stage fright.",
|
||||
"The AI took an unexpected coffee break.",
|
||||
"Something internal blinked at the wrong moment.",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@@ -25,6 +25,49 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
PLACEHOLDER_IMAGE_PATH = "/static/images/placeholder.png"
|
||||
GENERIC_AI_FALLBACK_URL = "https://placehold.co/1200x630/0f172a/e2e8f0/png?text=AI+News"
|
||||
GENERIC_FINANCE_FALLBACK_URL = "https://placehold.co/1200x630/0f172a/e2e8f0/png?text=Market+News"
|
||||
|
||||
_FINANCE_TOPIC_TERMS = frozenset(
|
||||
{
|
||||
"finance",
|
||||
"financial",
|
||||
"market",
|
||||
"markets",
|
||||
"stock",
|
||||
"stocks",
|
||||
"share",
|
||||
"shares",
|
||||
"earnings",
|
||||
"investor",
|
||||
"investors",
|
||||
"nasdaq",
|
||||
"nyse",
|
||||
"dow",
|
||||
"s&p",
|
||||
"bank",
|
||||
"banking",
|
||||
"revenue",
|
||||
"profit",
|
||||
"trading",
|
||||
"ipo",
|
||||
"valuation",
|
||||
}
|
||||
)
|
||||
|
||||
_FINANCE_IMAGE_BLOCKLIST = (
|
||||
"cat",
|
||||
"dog",
|
||||
"pet",
|
||||
"lion",
|
||||
"tiger",
|
||||
"bird",
|
||||
"horse",
|
||||
"portrait",
|
||||
"selfie",
|
||||
"wedding",
|
||||
"food",
|
||||
"nature-only",
|
||||
)
|
||||
|
||||
|
||||
async def call_perplexity_api(query: str) -> dict | None:
|
||||
@@ -174,6 +217,43 @@ def parse_translation_response(response: dict) -> dict | None:
|
||||
return None
|
||||
|
||||
|
||||
def validate_translation_quality(
|
||||
headline: str, summary: str, language_code: str
|
||||
) -> tuple[bool, str | None]:
|
||||
text = f"{headline} {summary}".strip()
|
||||
if not headline or not summary:
|
||||
return False, "empty-content"
|
||||
if len(text) < 20:
|
||||
return False, "too-short"
|
||||
|
||||
repeated_runs = re.search(r"(.)\1{6,}", text)
|
||||
if repeated_runs:
|
||||
return False, "repeated-sequence"
|
||||
|
||||
lines = [segment.strip() for segment in re.split(r"[.!?]\s+", text) if segment.strip()]
|
||||
if lines:
|
||||
unique_ratio = len(set(lines)) / len(lines)
|
||||
if unique_ratio < 0.4:
|
||||
return False, "low-unique-content"
|
||||
|
||||
if language_code == "ta":
|
||||
script_hits = sum(1 for char in text if "\u0b80" <= char <= "\u0bff")
|
||||
elif language_code == "ml":
|
||||
script_hits = sum(1 for char in text if "\u0d00" <= char <= "\u0d7f")
|
||||
else:
|
||||
return True, None
|
||||
|
||||
alpha_hits = sum(1 for char in text if char.isalpha())
|
||||
if alpha_hits == 0:
|
||||
return False, "no-alpha-content"
|
||||
|
||||
script_ratio = script_hits / alpha_hits
|
||||
if script_ratio < 0.35:
|
||||
return False, "script-mismatch"
|
||||
|
||||
return True, None
|
||||
|
||||
|
||||
async def generate_translations(
|
||||
headline: str,
|
||||
summary: str,
|
||||
@@ -200,7 +280,20 @@ async def generate_translations(
|
||||
if response:
|
||||
parsed = parse_translation_response(response)
|
||||
if parsed:
|
||||
translations[language_code] = parsed
|
||||
is_valid, reason = validate_translation_quality(
|
||||
parsed["headline"],
|
||||
parsed["summary"],
|
||||
language_code,
|
||||
)
|
||||
if is_valid:
|
||||
logger.info("Translation accepted for %s", language_code)
|
||||
translations[language_code] = parsed
|
||||
else:
|
||||
logger.warning(
|
||||
"Translation rejected for %s: %s",
|
||||
language_code,
|
||||
reason,
|
||||
)
|
||||
except Exception:
|
||||
logger.exception("Translation generation failed for %s", language_code)
|
||||
|
||||
@@ -467,7 +560,7 @@ async def fetch_pixabay_image(query: str) -> tuple[str | None, str | None]:
|
||||
except Exception:
|
||||
logger.exception("Pixabay image retrieval failed")
|
||||
|
||||
return GENERIC_AI_FALLBACK_URL, "Generic AI fallback"
|
||||
return None, None
|
||||
|
||||
|
||||
async def fetch_unsplash_image(query: str) -> tuple[str | None, str | None]:
|
||||
@@ -591,6 +684,15 @@ def get_enabled_providers() -> list[
|
||||
|
||||
async def fetch_royalty_free_image(query: str) -> tuple[str | None, str | None]:
|
||||
"""Fetch royalty-free image using provider chain with fallback."""
|
||||
|
||||
def is_finance_story(text: str) -> bool:
    """Return True when ``text`` contains a finance topic term as a whole word.

    Whole-token matching keeps unrelated words that merely contain a term
    ("windows"/"dow", "shared"/"share", "tripod"/"ipo") from classifying a
    story as financial. A trailing "s" is tolerated so simple plurals such
    as "banks" or "profits" still count.
    """
    # "&" is kept inside tokens so the "s&p" term can match.
    tokens = re.findall(r"[a-z0-9&]+", (text or "").lower())
    return any(
        token in _FINANCE_TOPIC_TERMS or token.removesuffix("s") in _FINANCE_TOPIC_TERMS
        for token in tokens
    )
|
||||
|
||||
def is_finance_safe_image(image_url: str, credit: str | None) -> bool:
    """Return True when neither the URL nor the credit names a blocked subject.

    Blocked terms must stand as their own word (optionally plural), so text
    like "education" or "million" does not trigger the "cat" or "lion"
    entries.
    """
    haystack = f"{image_url or ''} {credit or ''}".lower()
    blocked = "|".join(re.escape(term) for term in _FINANCE_IMAGE_BLOCKLIST)
    # Letter-boundary lookarounds: digits, punctuation and underscores all
    # count as separators around a term.
    return re.search(rf"(?<![a-z])(?:{blocked})s?(?![a-z])", haystack) is None
|
||||
|
||||
# MCP endpoint takes highest priority if configured
|
||||
if config.ROYALTY_IMAGE_MCP_ENDPOINT:
|
||||
try:
|
||||
@@ -610,15 +712,35 @@ async def fetch_royalty_free_image(query: str) -> tuple[str | None, str | None]:
|
||||
|
||||
# Extract keywords for better image search
|
||||
refined_query = extract_image_keywords(query)
|
||||
finance_story = is_finance_story(query)
|
||||
query_variants = [refined_query]
|
||||
if finance_story:
|
||||
query_variants = [
|
||||
f"{refined_query} stock market trading chart finance business",
|
||||
refined_query,
|
||||
]
|
||||
|
||||
# Try each enabled provider in order
|
||||
for provider_name, fetch_fn in get_enabled_providers():
|
||||
try:
|
||||
image_url, credit = await fetch_fn(refined_query)
|
||||
if image_url:
|
||||
for query_variant in query_variants:
|
||||
for provider_name, fetch_fn in get_enabled_providers():
|
||||
try:
|
||||
image_url, credit = await fetch_fn(query_variant)
|
||||
if not image_url:
|
||||
continue
|
||||
if finance_story and not is_finance_safe_image(image_url, credit):
|
||||
logger.info(
|
||||
"Rejected non-finance-safe image from %s for query '%s': %s",
|
||||
provider_name,
|
||||
query_variant,
|
||||
image_url,
|
||||
)
|
||||
continue
|
||||
return image_url, credit
|
||||
except Exception:
|
||||
logger.exception("%s image retrieval failed", provider_name.capitalize())
|
||||
except Exception:
|
||||
logger.exception("%s image retrieval failed", provider_name.capitalize())
|
||||
|
||||
if finance_story:
|
||||
return GENERIC_FINANCE_FALLBACK_URL, "Finance-safe fallback"
|
||||
|
||||
return None, None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user