fix: add /api prefix to all backend routes

Frontend expects /api/chat but backend had /chat.
Added APIRouter with prefix=/api to fix route mismatch.
This commit is contained in:
2026-04-13 17:09:17 -04:00
parent f000f13672
commit 2041dd9412
2 changed files with 115 additions and 11 deletions

97
docs/gpu-compatibility.md Normal file
View File

@@ -0,0 +1,97 @@
# GPU Compatibility Guide
## RTX 50-Series (Blackwell) Compatibility Notice
### Issue
NVIDIA RTX 50-series GPUs (RTX 5070, 5080, 5090) use CUDA capability `sm_120` (Blackwell architecture). PyTorch stable releases (up to 2.5.1) only officially support up to `sm_90` (Hopper), with `sm_89` covering Ada Lovelace.
**Warning you'll see:**
```
NVIDIA GeForce RTX 5070 with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_90.
```
### Current Status
- ✅ PyTorch detects the GPU
- ✅ CUDA operations generally work
- ⚠️ Some operations may fail or fall back to CPU
- ⚠️ Performance may not be optimal
### Workarounds
#### Option 1: Use PyTorch Nightly (Recommended for RTX 50-series)
```bash
pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124
```
#### Option 2: Use Current Stable with Known Limitations
Many workloads work fine despite the warning. Test your specific use case.
#### Option 3: Wait for PyTorch 2.7
Full sm_120 support is expected in the next stable release.
### Installation Steps for KV-RAG with GPU
1. **Install CUDA-enabled PyTorch:**
```bash
pip install torch==2.5.1+cu121 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
```
2. **Install unsloth without dependencies:**
```bash
pip install unsloth --no-deps
pip install unsloth_zoo
```
3. **Install remaining training dependencies:**
```bash
pip install bitsandbytes accelerate peft transformers datasets trl
```
Note: Skip `xformers` as it may overwrite torch. Unsloth works without it.
### Verify GPU is Working
```python
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"CUDA version: {torch.version.cuda}")
```
### Ollama GPU Status
Ollama runs **natively on Windows** and uses GPU automatically when available:
- Check with: `nvidia-smi` (look for `ollama.exe` processes)
- Embedding model (`mxbai-embed-large:335m`) runs on GPU
- Chat models also use GPU when loaded
### Forge Training GPU Status
The training script uses `unsloth` + `trl` for QLoRA fine-tuning:
- Requires CUDA-enabled PyTorch
- Optimized for 12GB VRAM (RTX 5070)
- Uses 4-bit quantization + LoRA adapters
- See `src/companion/forge/train.py` for implementation
### Troubleshooting
**Issue:** `CUDA available: False` after installation
**Fix:** PyTorch was overwritten by a package dependency. Reinstall:
```bash
pip install torch==2.5.1+cu121 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 --force-reinstall
```
**Issue:** `xformers` overwrites torch
**Fix:** Skip xformers or install matching wheel:
```bash
# Skip for now - unsloth works without it
# Or install specific version matching your torch
pip install xformers==0.0.28.post3 --index-url https://download.pytorch.org/whl/cu121
```
### References
- [PyTorch CUDA Compatibility](https://pytorch.org/get-started/locally/)
- [NVIDIA CUDA Capability Matrix](https://developer.nvidia.com/cuda-gpus)
- [Unsloth Documentation](https://github.com/unslothai/unsloth)
- [RTX 50-Series Architecture](https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/)

View File

@@ -8,7 +8,7 @@ from contextlib import asynccontextmanager
from typing import AsyncGenerator from typing import AsyncGenerator
import httpx import httpx
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, HTTPException, APIRouter
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel from pydantic import BaseModel
from sse_starlette.sse import EventSourceResponse from sse_starlette.sse import EventSourceResponse
@@ -37,6 +37,12 @@ class ChatResponse(BaseModel):
sources: list[dict] | None = None sources: list[dict] | None = None
class ReloadModelRequest(BaseModel):
"""Model reload request."""
model_path: str
# Global instances # Global instances
config: Config config: Config
vector_store: VectorStore vector_store: VectorStore
@@ -98,8 +104,11 @@ app.add_middleware(
allow_headers=["*"], allow_headers=["*"],
) )
# Create API router with /api prefix
api_router = APIRouter(prefix="/api")
@app.get("/health")
@api_router.get("/health")
async def health_check() -> dict: async def health_check() -> dict:
"""Health check endpoint.""" """Health check endpoint."""
return { return {
@@ -109,7 +118,7 @@ async def health_check() -> dict:
} }
@app.post("/chat") @api_router.post("/chat")
async def chat(request: ChatRequest) -> EventSourceResponse: async def chat(request: ChatRequest) -> EventSourceResponse:
"""Chat endpoint with SSE streaming.""" """Chat endpoint with SSE streaming."""
if not request.message.strip(): if not request.message.strip():
@@ -167,7 +176,7 @@ async def chat(request: ChatRequest) -> EventSourceResponse:
) )
@app.get("/sessions/{session_id}/history") @api_router.get("/sessions/{session_id}/history")
async def get_session_history(session_id: str) -> dict: async def get_session_history(session_id: str) -> dict:
"""Get conversation history for a session.""" """Get conversation history for a session."""
history = memory.get_history(session_id) history = memory.get_history(session_id)
@@ -184,13 +193,7 @@ async def get_session_history(session_id: str) -> dict:
} }
class ReloadModelRequest(BaseModel): @api_router.post("/admin/reload-model")
"""Model reload request."""
model_path: str
@app.post("/admin/reload-model")
async def reload_model_endpoint(request: ReloadModelRequest) -> dict: async def reload_model_endpoint(request: ReloadModelRequest) -> dict:
"""Reload the model with a new fine-tuned version (admin only).""" """Reload the model with a new fine-tuned version (admin only)."""
from pathlib import Path from pathlib import Path
@@ -213,6 +216,10 @@ async def reload_model_endpoint(request: ReloadModelRequest) -> dict:
raise HTTPException(status_code=500, detail=f"Failed to reload model: {e}") raise HTTPException(status_code=500, detail=f"Failed to reload model: {e}")
# Include the API router
app.include_router(api_router)
if __name__ == "__main__": if __name__ == "__main__":
import uvicorn import uvicorn