fix: Docker paths, chunker word→char split, and ChatOrchestrator args
Fixed multiple issues preventing the indexer from running:
- Docker COPY paths: companion/ → src/companion/ to match project structure
- pyproject.toml: [tool.hatchling] → [tool.hatch.build.targets.wheel]
- api.py: ChatOrchestrator init params (session_memory instead of http_client)
- chunker.py: Fixed character-based chunking (was word-based, causing 400 errors from Ollama embedding API due to exceeding token limits)
- config.json: Use exact model tag mxbai-embed-large:335m
- docker-compose.yml: Fixed vault mount path
This commit is contained in:
@@ -28,10 +28,10 @@ COPY --from=builder /app/wheels /wheels
|
||||
RUN pip install --no-cache-dir /wheels/*
|
||||
|
||||
# Copy application code
|
||||
COPY companion/ ./companion/
|
||||
COPY companion/forge/ ./companion/forge/
|
||||
COPY companion/indexer_daemon/ ./companion/indexer_daemon/
|
||||
COPY companion/rag/ ./companion/rag/
|
||||
COPY src/companion/ ./companion/
|
||||
COPY src/companion/forge/ ./companion/forge/
|
||||
COPY src/companion/indexer_daemon/ ./companion/indexer_daemon/
|
||||
COPY src/companion/rag/ ./companion/rag/
|
||||
|
||||
# Create directories for data
|
||||
RUN mkdir -p /data/vectors /data/memory /models
|
||||
|
||||
@@ -13,9 +13,9 @@ RUN pip install --no-cache-dir \
|
||||
pydantic lancedb pyarrow requests watchdog typer rich numpy httpx
|
||||
|
||||
# Copy application code
|
||||
COPY companion/ ./companion/
|
||||
COPY companion/indexer_daemon/ ./companion/indexer_daemon/
|
||||
COPY companion/rag/ ./companion/rag/
|
||||
COPY src/companion/ ./companion/
|
||||
COPY src/companion/indexer_daemon/ ./companion/indexer_daemon/
|
||||
COPY src/companion/rag/ ./companion/rag/
|
||||
|
||||
# Create directories for data
|
||||
RUN mkdir -p /data/vectors
|
||||
|
||||
@@ -55,7 +55,7 @@
|
||||
"rag": {
|
||||
"embedding": {
|
||||
"provider": "ollama",
|
||||
"model": "mxbai-embed-large",
|
||||
"model": "mxbai-embed-large:335m",
|
||||
"base_url": "http://localhost:11434",
|
||||
"dimensions": 1024,
|
||||
"batch_size": 32
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
version: '3.8'
|
||||
version: "3.8"
|
||||
|
||||
services:
|
||||
companion-api:
|
||||
@@ -20,7 +20,13 @@ services:
|
||||
- companion-network
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD", "python", "-c", "import requests; requests.get('http://localhost:7373/health')"]
|
||||
test:
|
||||
[
|
||||
"CMD",
|
||||
"python",
|
||||
"-c",
|
||||
"import requests; requests.get('http://localhost:7373/health')",
|
||||
]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
@@ -34,7 +40,7 @@ services:
|
||||
volumes:
|
||||
- ./config.json:/app/config.json:ro
|
||||
- companion-data:/data
|
||||
- /home/san/KnowledgeVault:/vault:ro # Mount Obsidian vault as read-only
|
||||
- ./sample-data/Default:/app/sample-data/Default:ro # Mount Obsidian vault as read-only
|
||||
environment:
|
||||
- COMPANION_CONFIG=/app/config.json
|
||||
- COMPANION_DATA_DIR=/data
|
||||
|
||||
@@ -37,7 +37,7 @@ train = [
|
||||
"trl>=0.7.0",
|
||||
]
|
||||
|
||||
[tool.hatchling]
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/companion"]
|
||||
|
||||
[build-system]
|
||||
|
||||
@@ -70,8 +70,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
orchestrator = ChatOrchestrator(
|
||||
config=config,
|
||||
search_engine=search_engine,
|
||||
memory=memory,
|
||||
http_client=http_client,
|
||||
session_memory=memory,
|
||||
)
|
||||
|
||||
yield
|
||||
|
||||
@@ -28,16 +28,27 @@ class ChunkingRule:
|
||||
|
||||
|
||||
def sliding_window_chunks(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
    """Split *text* into overlapping chunks of at most ``chunk_size`` characters.

    Chunking is character-based (not word-based) so chunk sizes map predictably
    onto embedding-API token limits. Each chunk tries to end on a whitespace
    boundary; if a window contains no whitespace at all, it is force-cut at
    ``chunk_size``. Consecutive chunks overlap by roughly ``chunk_overlap``
    characters.

    Args:
        text: The input text; an empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Desired character overlap between consecutive chunks.

    Returns:
        A list of non-empty, stripped chunk strings covering all of *text*.
    """
    if not text:
        return []

    chunks: List[str] = []
    start = 0
    text_len = len(text)

    while start < text_len:
        end = min(start + chunk_size, text_len)
        if end < text_len:
            # Walk back to a whitespace boundary so words aren't split.
            boundary = end
            while boundary > start and not text[boundary].isspace():
                boundary -= 1
            # No whitespace in the window: force-cut at chunk_size.
            end = boundary if boundary > start else min(start + chunk_size, text_len)

        piece = text[start:end].strip()
        if piece:  # Skip whitespace-only windows instead of emitting "".
            chunks.append(piece)

        if end >= text_len:
            break
        # Advance from the actual cut point (not the window start) so text
        # between a pulled-back boundary and the next window is never skipped;
        # max(..., start + 1) guarantees forward progress even when
        # chunk_overlap >= chunk_size (which would otherwise loop forever).
        start = max(end - chunk_overlap, start + 1)

    return chunks
|
||||
|
||||
Reference in New Issue
Block a user