From 2041dd941220608323733285f38028ab496e18f7 Mon Sep 17 00:00:00 2001 From: Santhosh Janardhanan Date: Mon, 13 Apr 2026 17:09:17 -0400 Subject: [PATCH] fix: add /api prefix to all backend routes Frontend expects /api/chat but backend had /chat. Added APIRouter with prefix=/api to fix route mismatch. --- docs/gpu-compatibility.md | 97 +++++++++++++++++++++++++++++++++++++++ src/companion/api.py | 29 +++++++----- 2 files changed, 115 insertions(+), 11 deletions(-) create mode 100644 docs/gpu-compatibility.md diff --git a/docs/gpu-compatibility.md b/docs/gpu-compatibility.md new file mode 100644 index 0000000..25f8191 --- /dev/null +++ b/docs/gpu-compatibility.md @@ -0,0 +1,97 @@ +# GPU Compatibility Guide + +## RTX 50-Series (Blackwell) Compatibility Notice + +### Issue +NVIDIA RTX 50-series GPUs (RTX 5070, 5080, 5090) use CUDA capability `sm_120` (Blackwell architecture). PyTorch stable releases (up to 2.5.1) only officially support up to `sm_90` (Hopper/Ada). + +**Warning you'll see:** +``` +NVIDIA GeForce RTX 5070 with CUDA capability sm_120 is not compatible with the current PyTorch installation. +The current PyTorch install supports CUDA capabilities sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_90. +``` + +### Current Status +- ✅ PyTorch detects the GPU +- ✅ CUDA operations generally work +- ⚠️ Some operations may fail or fall back to CPU +- ⚠️ Performance may not be optimal + +### Workarounds + +#### Option 1: Use PyTorch Nightly (Recommended for RTX 50-series) +```bash +pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 +``` + +#### Option 2: Use Current Stable with Known Limitations +Many workloads work fine despite the warning. Test your specific use case. + +#### Option 3: Wait for PyTorch 2.7 +Full sm_120 support is expected in the next stable release. + +### Installation Steps for KV-RAG with GPU + +1. 
**Install CUDA-enabled PyTorch:** + ```bash + pip install torch==2.5.1+cu121 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 + ``` + +2. **Install unsloth without dependencies:** + ```bash + pip install unsloth --no-deps + pip install unsloth_zoo + ``` + +3. **Install remaining training dependencies:** + ```bash + pip install bitsandbytes accelerate peft transformers datasets trl + ``` + Note: Skip `xformers` as it may overwrite torch. Unsloth works without it. + +### Verify GPU is Working + +```python +import torch +print(f"CUDA available: {torch.cuda.is_available()}") +print(f"GPU: {torch.cuda.get_device_name(0)}") +print(f"CUDA version: {torch.version.cuda}") +``` + +### Ollama GPU Status + +Ollama runs **natively on Windows** and uses GPU automatically when available: +- Check with: `nvidia-smi` (look for `ollama.exe` processes) +- Embedding model (`mxbai-embed-large:335m`) runs on GPU +- Chat models also use GPU when loaded + +### Forge Training GPU Status + +The training script uses `unsloth` + `trl` for QLoRA fine-tuning: +- Requires CUDA-enabled PyTorch +- Optimized for 12GB VRAM (RTX 5070) +- Uses 4-bit quantization + LoRA adapters +- See `src/companion/forge/train.py` for implementation + +### Troubleshooting + +**Issue:** `CUDA available: False` after installation +**Fix:** PyTorch was overwritten by a package dependency. 
Reinstall: 
```bash
pip install torch==2.5.1+cu121 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 --force-reinstall
```

**Issue:** `xformers` overwrites torch
**Fix:** Skip xformers or install matching wheel:
```bash
# Skip for now - unsloth works without it
# Or install specific version matching your torch
pip install xformers==0.0.28.post3 --index-url https://download.pytorch.org/whl/cu121
```

### References

- [PyTorch CUDA Compatibility](https://pytorch.org/get-started/locally/)
- [NVIDIA CUDA Capability Matrix](https://developer.nvidia.com/cuda-gpus)
- [Unsloth Documentation](https://github.com/unslothai/unsloth)
- [RTX 50-Series Architecture](https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/) diff --git a/src/companion/api.py b/src/companion/api.py index 54d557c..30523d3 100644 --- a/src/companion/api.py +++ b/src/companion/api.py @@ -8,7 +8,7 @@ from contextlib import asynccontextmanager from typing import AsyncGenerator import httpx -from fastapi import FastAPI, HTTPException +from fastapi import FastAPI, HTTPException, APIRouter from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel from sse_starlette.sse import EventSourceResponse @@ -37,6 +37,12 @@ class ChatResponse(BaseModel): sources: list[dict] | None = None +class ReloadModelRequest(BaseModel): + """Model reload request.""" + + model_path: str + + # Global instances config: Config vector_store: VectorStore @@ -98,8 +104,11 @@ app.add_middleware( allow_headers=["*"], ) +# Create API router with /api prefix +api_router = APIRouter(prefix="/api") -@app.get("/health") + +@api_router.get("/health") async def health_check() -> dict: """Health check endpoint.""" return { @@ -109,7 +118,7 @@ } -@app.post("/chat") +@api_router.post("/chat") async def chat(request: ChatRequest) -> EventSourceResponse: """Chat endpoint with SSE streaming.""" if not request.message.strip(): @@ -167,7 +176,7 @@ 
async def chat(request: ChatRequest) -> EventSourceResponse: ) -@app.get("/sessions/{session_id}/history") +@api_router.get("/sessions/{session_id}/history") async def get_session_history(session_id: str) -> dict: """Get conversation history for a session.""" history = memory.get_history(session_id) @@ -184,13 +193,7 @@ async def get_session_history(session_id: str) -> dict: } -class ReloadModelRequest(BaseModel): - """Model reload request.""" - - model_path: str - - -@app.post("/admin/reload-model") +@api_router.post("/admin/reload-model") async def reload_model_endpoint(request: ReloadModelRequest) -> dict: """Reload the model with a new fine-tuned version (admin only).""" from pathlib import Path @@ -213,6 +216,10 @@ async def reload_model_endpoint(request: ReloadModelRequest) -> dict: raise HTTPException(status_code=500, detail=f"Failed to reload model: {e}") +# Include the API router +app.include_router(api_router) + + if __name__ == "__main__": import uvicorn