From f000f13672071f652735e90bc09e100e05c2c379 Mon Sep 17 00:00:00 2001
From: Santhosh Janardhanan <santhoshj@gmail.com>
Date: Mon, 13 Apr 2026 16:45:50 -0400
Subject: [PATCH] =?UTF-8?q?fix:=20Docker=20paths,=20chunker=20word?=
 =?UTF-8?q?=E2=86=92char=20split,=20and=20ChatOrchestrator=20args?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixed multiple issues preventing the indexer from running:
- Docker COPY paths: companion/ → src/companion/ to match project structure
- pyproject.toml: [tool.hatchling] → [tool.hatch.build.targets.wheel]
- api.py: ChatOrchestrator init params (pass memory as session_memory instead of memory; drop http_client)
- chunker.py: Switched to character-based chunking (was word-based, causing 400 errors from the Ollama embedding API because chunks exceeded its token limit)
- config.json: Use exact model tag mxbai-embed-large:335m
- docker-compose.yml: Fixed vault mount path (mount ./sample-data/Default instead of a host-specific absolute path)
---
 Dockerfile                   |  8 ++++----
 Dockerfile.indexer           |  6 +++---
 config.json                  |  2 +-
 docker-compose.yml           | 12 +++++++++---
 pyproject.toml               |  2 +-
 src/companion/api.py         |  3 +--
 src/companion/rag/chunker.py | 23 +++++++++++++++++------
 7 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 0a7b336..df4dc8a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -28,10 +28,10 @@ COPY --from=builder /app/wheels /wheels
 RUN pip install --no-cache-dir /wheels/*
 
 # Copy application code
-COPY companion/ ./companion/
-COPY companion/forge/ ./companion/forge/
-COPY companion/indexer_daemon/ ./companion/indexer_daemon/
-COPY companion/rag/ ./companion/rag/
+COPY src/companion/ ./companion/
+COPY src/companion/forge/ ./companion/forge/
+COPY src/companion/indexer_daemon/ ./companion/indexer_daemon/
+COPY src/companion/rag/ ./companion/rag/
 
 # Create directories for data
 RUN mkdir -p /data/vectors /data/memory /models
diff --git a/Dockerfile.indexer b/Dockerfile.indexer
index 6c81024..9ead366 100644
--- a/Dockerfile.indexer
+++ b/Dockerfile.indexer
@@ -13,9 +13,9 @@ RUN pip install --no-cache-dir \
     pydantic lancedb pyarrow requests watchdog typer rich numpy httpx
 
 # Copy application code
-COPY companion/ ./companion/
-COPY companion/indexer_daemon/ ./companion/indexer_daemon/
-COPY companion/rag/ ./companion/rag/
+COPY src/companion/ ./companion/
+COPY src/companion/indexer_daemon/ ./companion/indexer_daemon/
+COPY src/companion/rag/ ./companion/rag/
 
 # Create directories for data
 RUN mkdir -p /data/vectors
diff --git a/config.json b/config.json
index 9062ee0..06bf052 100644
--- a/config.json
+++ b/config.json
@@ -55,7 +55,7 @@
   "rag": {
     "embedding": {
       "provider": "ollama",
-      "model": "mxbai-embed-large",
+      "model": "mxbai-embed-large:335m",
       "base_url": "http://localhost:11434",
       "dimensions": 1024,
       "batch_size": 32
diff --git a/docker-compose.yml b/docker-compose.yml
index 6e1f18d..d94c309 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,4 +1,4 @@
-version: '3.8'
+version: "3.8"
 
 services:
   companion-api:
@@ -20,7 +20,13 @@ services:
       - companion-network
     restart: unless-stopped
     healthcheck:
-      test: ["CMD", "python", "-c", "import requests; requests.get('http://localhost:7373/health')"]
+      test:
+        [
+          "CMD",
+          "python",
+          "-c",
+          "import requests; requests.get('http://localhost:7373/health')",
+        ]
       interval: 30s
       timeout: 10s
       retries: 3
@@ -34,7 +40,7 @@ services:
     volumes:
       - ./config.json:/app/config.json:ro
       - companion-data:/data
-      - /home/san/KnowledgeVault:/vault:ro  # Mount Obsidian vault as read-only
+      - ./sample-data/Default:/app/sample-data/Default:ro # Mount Obsidian vault as read-only
     environment:
       - COMPANION_CONFIG=/app/config.json
       - COMPANION_DATA_DIR=/data
diff --git a/pyproject.toml b/pyproject.toml
index 8002fe9..3f7ea84 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ train = [
     "trl>=0.7.0",
 ]
 
-[tool.hatchling]
+[tool.hatch.build.targets.wheel]
 packages = ["src/companion"]
 
 [build-system]
diff --git a/src/companion/api.py b/src/companion/api.py
index 2dc6015..54d557c 100644
--- a/src/companion/api.py
+++ b/src/companion/api.py
@@ -70,8 +70,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
     orchestrator = ChatOrchestrator(
         config=config,
         search_engine=search_engine,
-        memory=memory,
-        http_client=http_client,
+        session_memory=memory,
     )
 
     yield
diff --git a/src/companion/rag/chunker.py b/src/companion/rag/chunker.py
index 4c9c9f6..e66359d 100644
--- a/src/companion/rag/chunker.py
+++ b/src/companion/rag/chunker.py
@@ -28,16 +28,27 @@ class ChunkingRule:
 
 
 def sliding_window_chunks(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
-    words = text.split()
-    if not words:
+    """Split text into chunks based on character count (not word count)."""
+    if not text:
         return []
 
     chunks = []
     step = chunk_size - chunk_overlap
-    for i in range(0, len(words), step):
-        chunk_words = words[i : i + chunk_size]
-        chunks.append(" ".join(chunk_words))
-        if i + chunk_size >= len(words):
+    start = 0
+    text_len = len(text)
+
+    while start < text_len:
+        end = min(start + chunk_size, text_len)
+        # Try to break at word boundary
+        if end < text_len:
+            # Look for whitespace to break at
+            while end > start and not text[end].isspace():
+                end -= 1
+            if end == start:  # No good break found, force cut
+                end = min(start + chunk_size, text_len)
+        chunks.append(text[start:end].strip())
+        start += step
+        if end >= text_len:
             break
 
     return chunks