First deployment
Some checks failed
quality-gates / lint-and-test (push) Has been cancelled
quality-gates / security-scan (push) Has been cancelled

This commit is contained in:
2026-02-13 09:14:04 -05:00
parent 0e21e035f5
commit 679561bcdb
128 changed files with 3479 additions and 120 deletions

View File

@@ -14,6 +14,7 @@ from backend import config
from backend.database import SessionLocal, init_db
from backend.models import NewsItem
from backend.news_service import (
GENERIC_AI_FALLBACK_URL,
download_and_optimize_image,
extract_image_keywords,
fetch_royalty_free_image,
@@ -87,56 +88,105 @@ def build_contextual_query(headline: str, summary: str | None) -> str:
return cleaned
def resolve_article_id_from_permalink(value: str | None) -> int | None:
if not value:
return None
if value.isdigit():
return int(value)
match = re.search(r"(?:\?|&)article=(\d+)", value)
if match:
return int(match.group(1))
return None
def is_unrelated_image_candidate(image_url: str | None, image_credit: str | None) -> bool:
text = f"{image_url or ''} {image_credit or ''}".lower()
blocked = (
"cat",
"dog",
"pet",
"animal",
"wildlife",
"lion",
"tiger",
"bird",
"horse",
)
return any(term in text for term in blocked)
async def refetch_image_for_item(
    item: NewsItem,
    max_attempts: int,
) -> tuple[str | None, str | None, str]:
    """Fetch a replacement image for *item*, trying several query variants.

    For each variant the provider is queried up to ``max_attempts`` times with
    exponential backoff. Candidates are rejected when they look unrelated
    (animal-themed) or duplicate the article's current summary image.

    Returns ``(local_image_path, image_credit, decision)`` where decision is
    ``"provider"`` on success, ``"fallback"`` when the generic AI-themed image
    was used, and ``"none"`` when nothing usable was found.
    """
    query = build_contextual_query(item.headline, item.summary)
    current_summary_image = item.summary_image_url
    # Most-specific variants first; the bare query is the last resort.
    query_variants = [
        f"{query} alternative angle",
        f"{query} concept illustration",
        query,
    ]
    for query_variant in query_variants:
        for attempt in range(max_attempts):
            try:
                image_url, image_credit = await fetch_royalty_free_image(query_variant)
                if not image_url:
                    raise RuntimeError("no-image-url")
                if is_unrelated_image_candidate(image_url, image_credit):
                    logger.info("Rejected unrelated image candidate: %s", image_url)
                    continue
                local_image = await download_and_optimize_image(image_url)
                if not local_image:
                    raise RuntimeError("image-download-or-optimize-failed")
                if current_summary_image and local_image == current_summary_image:
                    logger.info("Rejected duplicate image candidate for article=%s", item.id)
                    continue
                return local_image, image_credit, "provider"
            except Exception:
                # Previously this swallowed the error silently; log it so
                # provider/download failures are diagnosable, then back off
                # exponentially before the next attempt.
                logger.warning(
                    "Image fetch failed article=%s variant=%r attempt=%d/%d",
                    item.id,
                    query_variant,
                    attempt + 1,
                    max_attempts,
                    exc_info=True,
                )
                if attempt < max_attempts - 1:
                    await asyncio.sleep(2**attempt)
    # All variants exhausted: fall back to the generic AI-themed image,
    # unless it would duplicate the image the article already has.
    fallback_local = await download_and_optimize_image(GENERIC_AI_FALLBACK_URL)
    if fallback_local and fallback_local != current_summary_image:
        return fallback_local, "AI-themed fallback", "fallback"
    return None, None, "none"
async def refetch_images_for_latest(
limit: int,
max_attempts: int,
dry_run: bool,
target_article_id: int | None = None,
) -> tuple[int, int]:
db = SessionLocal()
processed = 0
refreshed = 0
try:
items = (
db.query(NewsItem)
.filter(NewsItem.archived.is_(False))
.order_by(desc(NewsItem.published_at))
.limit(limit)
.all()
)
if target_article_id is not None:
items = (
db.query(NewsItem)
.filter(NewsItem.archived.is_(False), NewsItem.id == target_article_id)
.all()
)
else:
items = (
db.query(NewsItem)
.filter(NewsItem.archived.is_(False))
.order_by(desc(NewsItem.published_at))
.limit(limit)
.all()
)
total = len(items)
for idx, item in enumerate(items, start=1):
processed += 1
query = build_contextual_query(item.headline, item.summary)
image_url: str | None = None
image_credit: str | None = None
local_image: str | None = None
for attempt in range(max_attempts):
try:
image_url, image_credit = await fetch_royalty_free_image(query)
if not image_url:
raise RuntimeError("no-image-url")
local_image = await download_and_optimize_image(image_url)
if not local_image:
raise RuntimeError("image-download-or-optimize-failed")
break
except Exception:
if attempt == max_attempts - 1:
logger.exception("Image refetch failed for item=%s after retries", item.id)
image_url = None
local_image = None
break
delay = 2**attempt
logger.warning(
"Refetch retry item=%s attempt=%d delay=%ds",
item.id,
attempt + 1,
delay,
)
await asyncio.sleep(delay)
local_image, image_credit, decision = await refetch_image_for_item(
item=item,
max_attempts=max_attempts,
)
if local_image:
refreshed += 1
@@ -152,6 +202,7 @@ async def refetch_images_for_latest(
total=total,
refreshed=refreshed,
article_id=item.id,
decision=decision,
)
return processed, refreshed
@@ -186,6 +237,12 @@ def build_parser() -> argparse.ArgumentParser:
help="Refetch and optimize latest article images",
)
refetch_parser.add_argument("--limit", type=positive_int, default=30)
refetch_parser.add_argument(
"--permalink",
type=str,
default="",
help="Target one article by permalink (for example '/?article=123' or '123')",
)
refetch_parser.add_argument("--max-attempts", type=positive_int, default=4)
refetch_parser.add_argument("--dry-run", action="store_true")
refetch_parser.set_defaults(handler=handle_admin_refetch_images)
@@ -280,11 +337,22 @@ def handle_admin_refetch_images(args: argparse.Namespace) -> int:
start = time.monotonic()
try:
init_db()
target_article_id = resolve_article_id_from_permalink(args.permalink)
if args.permalink and target_article_id is None:
print_result(
"refetch-images",
"blocked",
reason="invalid-permalink",
hint="use '/?article=<id>' or raw numeric id",
)
return 2
processed, refreshed = asyncio.run(
refetch_images_for_latest(
limit=min(args.limit, 30),
max_attempts=args.max_attempts,
dry_run=args.dry_run,
target_article_id=target_article_id,
)
)
elapsed = time.monotonic() - start
@@ -293,6 +361,7 @@ def handle_admin_refetch_images(args: argparse.Namespace) -> int:
"ok",
processed=processed,
refreshed=refreshed,
target_article_id=target_article_id,
dry_run=args.dry_run,
elapsed=f"{elapsed:.1f}s",
)