From 2041dd941220608323733285f38028ab496e18f7 Mon Sep 17 00:00:00 2001 From: Santhosh Janardhanan Date: Mon, 13 Apr 2026 17:09:17 -0400 Subject: [PATCH] fix: add /api prefix to all backend routes Frontend expects /api/chat but backend had /chat. Added APIRouter with prefix=/api to fix route mismatch. --- docs/gpu-compatibility.md | 97 +++++++++++++++++++++++++++++++++++++++ src/companion/api.py | 29 +++++++----- 2 files changed, 115 insertions(+), 11 deletions(-) create mode 100644 docs/gpu-compatibility.md diff --git a/docs/gpu-compatibility.md b/docs/gpu-compatibility.md new file mode 100644 index 0000000..25f8191 --- /dev/null +++ b/docs/gpu-compatibility.md @@ -0,0 +1,97 @@ +# GPU Compatibility Guide + +## RTX 50-Series (Blackwell) Compatibility Notice + +### Issue +NVIDIA RTX 50-series GPUs (RTX 5070, 5080, 5090) use CUDA capability `sm_120` (Blackwell architecture). PyTorch stable releases (up to 2.5.1) only officially support up to `sm_90` (Hopper/Ada). + +**Warning you'll see:** +``` +NVIDIA GeForce RTX 5070 with CUDA capability sm_120 is not compatible with the current PyTorch installation. +The current PyTorch install supports CUDA capabilities sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_90. +``` + +### Current Status +- ✅ PyTorch detects the GPU +- ✅ CUDA operations generally work +- ⚠️ Some operations may fail or fall back to CPU +- ⚠️ Performance may not be optimal + +### Workarounds + +#### Option 1: Use PyTorch Nightly (Recommended for RTX 50-series) +```bash +pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 +``` + +#### Option 2: Use Current Stable with Known Limitations +Many workloads work fine despite the warning. Test your specific use case. + +#### Option 3: Wait for PyTorch 2.7 +Full sm_120 support is expected in the next stable release. + +### Installation Steps for KV-RAG with GPU + +1. 
**Install CUDA-enabled PyTorch:** + ```bash + pip install torch==2.5.1+cu121 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 + ``` + +2. **Install unsloth without dependencies:** + ```bash + pip install unsloth --no-deps + pip install unsloth_zoo + ``` + +3. **Install remaining training dependencies:** + ```bash + pip install bitsandbytes accelerate peft transformers datasets trl + ``` + Note: Skip `xformers` as it may overwrite torch. Unsloth works without it. + +### Verify GPU is Working + +```python +import torch +print(f"CUDA available: {torch.cuda.is_available()}") +print(f"GPU: {torch.cuda.get_device_name(0)}") +print(f"CUDA version: {torch.version.cuda}") +``` + +### Ollama GPU Status + +Ollama runs **natively on Windows** and uses GPU automatically when available: +- Check with: `nvidia-smi` (look for `ollama.exe` processes) +- Embedding model (`mxbai-embed-large:335m`) runs on GPU +- Chat models also use GPU when loaded + +### Forge Training GPU Status + +The training script uses `unsloth` + `trl` for QLoRA fine-tuning: +- Requires CUDA-enabled PyTorch +- Optimized for 12GB VRAM (RTX 5070) +- Uses 4-bit quantization + LoRA adapters +- See `src/companion/forge/train.py` for implementation + +### Troubleshooting + +**Issue:** `CUDA available: False` after installation +**Fix:** PyTorch was overwritten by a package dependency. 
Reinstall: 
```bash
pip install torch==2.5.1+cu121 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 --force-reinstall
```

**Issue:** `xformers` overwrites torch
**Fix:** Skip xformers or install matching wheel:
```bash
# Skip for now - unsloth works without it
# Or install specific version matching your torch
pip install xformers==0.0.28.post3 --index-url https://download.pytorch.org/whl/cu121
```

### References

- [PyTorch CUDA Compatibility](https://pytorch.org/get-started/locally/)
- [NVIDIA CUDA Capability Matrix](https://developer.nvidia.com/cuda-gpus)
- [Unsloth Documentation](https://github.com/unslothai/unsloth)
- [RTX 50-Series Architecture](https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/) diff --git a/src/companion/api.py b/src/companion/api.py index 54d557c..30523d3 100644 --- a/src/companion/api.py +++ b/src/companion/api.py @@ -8,7 +8,7 @@ from contextlib import asynccontextmanager from typing import AsyncGenerator import httpx -from fastapi import FastAPI, HTTPException +from fastapi import FastAPI, HTTPException, APIRouter from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel from sse_starlette.sse import EventSourceResponse @@ -37,6 +37,12 @@ class ChatResponse(BaseModel): sources: list[dict] | None = None +class ReloadModelRequest(BaseModel): + """Model reload request.""" + + model_path: str + + # Global instances config: Config vector_store: VectorStore @@ -98,8 +104,11 @@ app.add_middleware( allow_headers=["*"], ) +# Create API router with /api prefix +api_router = APIRouter(prefix="/api") -@app.get("/health") + +@api_router.get("/health") async def health_check() -> dict: """Health check endpoint.""" return { @@ -109,7 +118,7 @@ } -@app.post("/chat") +@api_router.post("/chat") async def chat(request: ChatRequest) -> EventSourceResponse: """Chat endpoint with SSE streaming.""" if not request.message.strip(): @@ -167,7 +176,7 @@ 
async def chat(request: ChatRequest) -> EventSourceResponse: ) -@app.get("/sessions/{session_id}/history") +@api_router.get("/sessions/{session_id}/history") async def get_session_history(session_id: str) -> dict: """Get conversation history for a session.""" history = memory.get_history(session_id) @@ -184,13 +193,7 @@ async def get_session_history(session_id: str) -> dict: } -class ReloadModelRequest(BaseModel): - """Model reload request.""" - - model_path: str - - -@app.post("/admin/reload-model") +@api_router.post("/admin/reload-model") async def reload_model_endpoint(request: ReloadModelRequest) -> dict: """Reload the model with a new fine-tuned version (admin only).""" from pathlib import Path @@ -213,6 +216,10 @@ async def reload_model_endpoint(request: ReloadModelRequest) -> dict: raise HTTPException(status_code=500, detail=f"Failed to reload model: {e}") +# Include the API router +app.include_router(api_router) + + if __name__ == "__main__": import uvicorn