fix: Docker paths, chunker word→char split, and ChatOrchestrator args
Fixed multiple issues preventing the indexer from running:
- Docker COPY paths: companion/ → src/companion/ to match project structure
- pyproject.toml: [tool.hatchling] → [tool.hatch.build.targets.wheel]
- api.py: ChatOrchestrator init params (pass session_memory instead of memory; drop http_client)
- chunker.py: Switched to character-based chunking (was word-based, causing 400 errors from the Ollama embedding API due to exceeding token limits)
- config.json: Use exact model tag mxbai-embed-large:335m
- docker-compose.yml: Fixed vault mount path
This commit is contained in:
@@ -28,10 +28,10 @@ COPY --from=builder /app/wheels /wheels
|
|||||||
RUN pip install --no-cache-dir /wheels/*
|
RUN pip install --no-cache-dir /wheels/*
|
||||||
|
|
||||||
# Copy application code
|
# Copy application code
|
||||||
COPY companion/ ./companion/
|
COPY src/companion/ ./companion/
|
||||||
COPY companion/forge/ ./companion/forge/
|
COPY src/companion/forge/ ./companion/forge/
|
||||||
COPY companion/indexer_daemon/ ./companion/indexer_daemon/
|
COPY src/companion/indexer_daemon/ ./companion/indexer_daemon/
|
||||||
COPY companion/rag/ ./companion/rag/
|
COPY src/companion/rag/ ./companion/rag/
|
||||||
|
|
||||||
# Create directories for data
|
# Create directories for data
|
||||||
RUN mkdir -p /data/vectors /data/memory /models
|
RUN mkdir -p /data/vectors /data/memory /models
|
||||||
|
|||||||
@@ -13,9 +13,9 @@ RUN pip install --no-cache-dir \
|
|||||||
pydantic lancedb pyarrow requests watchdog typer rich numpy httpx
|
pydantic lancedb pyarrow requests watchdog typer rich numpy httpx
|
||||||
|
|
||||||
# Copy application code
|
# Copy application code
|
||||||
COPY companion/ ./companion/
|
COPY src/companion/ ./companion/
|
||||||
COPY companion/indexer_daemon/ ./companion/indexer_daemon/
|
COPY src/companion/indexer_daemon/ ./companion/indexer_daemon/
|
||||||
COPY companion/rag/ ./companion/rag/
|
COPY src/companion/rag/ ./companion/rag/
|
||||||
|
|
||||||
# Create directories for data
|
# Create directories for data
|
||||||
RUN mkdir -p /data/vectors
|
RUN mkdir -p /data/vectors
|
||||||
|
|||||||
@@ -55,7 +55,7 @@
|
|||||||
"rag": {
|
"rag": {
|
||||||
"embedding": {
|
"embedding": {
|
||||||
"provider": "ollama",
|
"provider": "ollama",
|
||||||
"model": "mxbai-embed-large",
|
"model": "mxbai-embed-large:335m",
|
||||||
"base_url": "http://localhost:11434",
|
"base_url": "http://localhost:11434",
|
||||||
"dimensions": 1024,
|
"dimensions": 1024,
|
||||||
"batch_size": 32
|
"batch_size": 32
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
version: '3.8'
|
version: "3.8"
|
||||||
|
|
||||||
services:
|
services:
|
||||||
companion-api:
|
companion-api:
|
||||||
@@ -20,7 +20,13 @@ services:
|
|||||||
- companion-network
|
- companion-network
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "python", "-c", "import requests; requests.get('http://localhost:7373/health')"]
|
test:
|
||||||
|
[
|
||||||
|
"CMD",
|
||||||
|
"python",
|
||||||
|
"-c",
|
||||||
|
"import requests; requests.get('http://localhost:7373/health')",
|
||||||
|
]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
timeout: 10s
|
timeout: 10s
|
||||||
retries: 3
|
retries: 3
|
||||||
@@ -34,7 +40,7 @@ services:
|
|||||||
volumes:
|
volumes:
|
||||||
- ./config.json:/app/config.json:ro
|
- ./config.json:/app/config.json:ro
|
||||||
- companion-data:/data
|
- companion-data:/data
|
||||||
- /home/san/KnowledgeVault:/vault:ro # Mount Obsidian vault as read-only
|
- ./sample-data/Default:/app/sample-data/Default:ro # Mount Obsidian vault as read-only
|
||||||
environment:
|
environment:
|
||||||
- COMPANION_CONFIG=/app/config.json
|
- COMPANION_CONFIG=/app/config.json
|
||||||
- COMPANION_DATA_DIR=/data
|
- COMPANION_DATA_DIR=/data
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ train = [
|
|||||||
"trl>=0.7.0",
|
"trl>=0.7.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.hatchling]
|
[tool.hatch.build.targets.wheel]
|
||||||
packages = ["src/companion"]
|
packages = ["src/companion"]
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
|
|||||||
@@ -70,8 +70,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
|||||||
orchestrator = ChatOrchestrator(
|
orchestrator = ChatOrchestrator(
|
||||||
config=config,
|
config=config,
|
||||||
search_engine=search_engine,
|
search_engine=search_engine,
|
||||||
memory=memory,
|
session_memory=memory,
|
||||||
http_client=http_client,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
yield
|
yield
|
||||||
|
|||||||
@@ -28,16 +28,27 @@ class ChunkingRule:
|
|||||||
|
|
||||||
|
|
||||||
def sliding_window_chunks(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
|
def sliding_window_chunks(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
|
||||||
words = text.split()
|
"""Split text into chunks based on character count (not word count)."""
|
||||||
if not words:
|
if not text:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
chunks = []
|
chunks = []
|
||||||
step = chunk_size - chunk_overlap
|
step = chunk_size - chunk_overlap
|
||||||
for i in range(0, len(words), step):
|
start = 0
|
||||||
chunk_words = words[i : i + chunk_size]
|
text_len = len(text)
|
||||||
chunks.append(" ".join(chunk_words))
|
|
||||||
if i + chunk_size >= len(words):
|
while start < text_len:
|
||||||
|
end = min(start + chunk_size, text_len)
|
||||||
|
# Try to break at word boundary
|
||||||
|
if end < text_len:
|
||||||
|
# Look for whitespace to break at
|
||||||
|
while end > start and not text[end].isspace():
|
||||||
|
end -= 1
|
||||||
|
if end == start: # No good break found, force cut
|
||||||
|
end = min(start + chunk_size, text_len)
|
||||||
|
chunks.append(text[start:end].strip())
|
||||||
|
start += step
|
||||||
|
if end >= text_len:
|
||||||
break
|
break
|
||||||
|
|
||||||
return chunks
|
return chunks
|
||||||
|
|||||||
Reference in New Issue
Block a user