feat: add indexer orchestrator with full index, sync, and status

2026-04-13 14:24:33 -04:00
parent ec69aa6f8d
commit ce0414a5ce
2 changed files with 317 additions and 0 deletions
--- a/src/companion/rag/indexer.py
+++ b/src/companion/rag/indexer.py
@@ -0,0 +1,162 @@
+import fnmatch
+import os
+from pathlib import Path
+from typing import Dict, Iterator, List
+
+from companion.config import Config
+from companion.rag.chunker import Chunk, ChunkingRule, chunk_file
+from companion.rag.embedder import OllamaEmbedder
+from companion.rag.vector_store import VectorStore
+
+
+class Indexer:
+    """Orchestrate chunking, embedding and storage of vault files."""
+
+    def __init__(self, config: Config, vector_store: VectorStore):
+        """Bind the vault path, indexing settings, chunking rules and embedder.
+
+        Args:
+            config: Application configuration; its ``vault`` and
+                ``rag.embedding`` sections are read here.
+            vector_store: Store that receives chunks and is queried by ``sync``.
+        """
+        self.config = config
+        self.vector_store = vector_store
+        self.vault_path = Path(config.vault.path).resolve()
+        self.embedding_config = config.rag.embedding
+        self.indexing_config = config.vault.indexing
+        self.chunking_rules = self._load_chunking_rules()
+        self.embedder = OllamaEmbedder(
+            base_url=self.embedding_config.base_url,
+            model=self.embedding_config.model,
+            batch_size=self.embedding_config.batch_size,
+        )
+
+    def _load_chunking_rules(self) -> Dict[str, ChunkingRule]:
+        """Build one ``ChunkingRule`` per configured pattern, keyed by pattern."""
+        return {
+            pattern: ChunkingRule(
+                strategy=rule.strategy,
+                chunk_size=rule.chunk_size,
+                chunk_overlap=rule.chunk_overlap,
+                # Falsy section tags (e.g. an empty list) are passed as None.
+                section_tags=rule.section_tags or None,
+            )
+            for pattern, rule in self.config.vault.chunking_rules.items()
+        }
+
+    def _should_index(self, relative_path: str) -> bool:
+        """Return True when ``relative_path`` is not denied and matches a pattern.
+
+        Deny directories are compared with every path component, deny patterns
+        with both the whole relative path and the bare file name, and file
+        patterns with the bare file name only.
+        """
+        path = Path(relative_path)
+        settings = self.indexing_config
+        matches = fnmatch.fnmatch
+        if any(deny_dir in path.parts for deny_dir in settings.deny_dirs):
+            return False
+        for pattern in settings.deny_patterns:
+            if matches(relative_path, pattern) or matches(path.name, pattern):
+                return False
+        return any(matches(path.name, p) for p in settings.file_patterns)
+
+    def _list_files(self) -> Iterator[Path]:
+        """Yield every file under the vault that ``_should_index`` accepts."""
+        denied = set(self.indexing_config.deny_dirs)
+        for root, dirs, files in os.walk(self.vault_path):
+            # Prune denied directories in place so os.walk never descends into
+            # them; _should_index would reject every file beneath them anyway.
+            dirs[:] = [name for name in dirs if name not in denied]
+            for file_name in files:
+                file_path = Path(root) / file_name
+                relative = file_path.relative_to(self.vault_path).as_posix()
+                if self._should_index(relative):
+                    yield file_path
+
+    def _index_files(self, file_paths: List[Path]) -> None:
+        """Chunk ``file_paths``, embed every chunk text in one call and upsert.
+
+        Returns before calling the embedder or the store when no chunks result.
+        """
+        all_chunks: List[Chunk] = []
+        for file_path in file_paths:
+            all_chunks.extend(
+                chunk_file(
+                    file_path=file_path,
+                    vault_root=self.vault_path,
+                    rules=self.chunking_rules,
+                    modified_at=file_path.stat().st_mtime,
+                )
+            )
+        if not all_chunks:
+            return
+
+        texts = [chunk.text for chunk in all_chunks]
+        embeddings = self.embedder.embed(texts)
+        fields = (
+            "source_file", "source_directory", "section", "date", "tags",
+            "chunk_index", "total_chunks", "modified_at", "rule_applied",
+        )
+        metadatas = [{key: getattr(c, key) for key in fields} for c in all_chunks]
+        self.vector_store.upsert(
+            # Ids take the form "<source_file>::<chunk_index>".
+            ids=[f"{c.source_file}::{c.chunk_index}" for c in all_chunks],
+            texts=texts,
+            embeddings=embeddings,
+            metadatas=metadatas,
+        )
+
+    def full_index(self) -> None:
+        """Drop the store's table, recreate it and index every eligible file."""
+        import logging  # local import: the module header does not provide it
+
+        try:
+            self.vector_store.table.drop()
+        except Exception:
+            # Still best effort, but no longer silent: a failed drop presumably
+            # leaves old rows behind. NOTE(review): confirm table has drop().
+            logging.getLogger(__name__).warning("table drop failed", exc_info=True)
+        self.vector_store.table = self.vector_store._get_or_create_table()
+        self._index_files(list(self._list_files()))
+
+    def sync(self) -> None:
+        """Re-index vault files that are new or newer than their stored rows.
+
+        A file is skipped when a stored row for it has ``modified_at`` at or
+        after the file's current mtime; otherwise its rows are deleted and the
+        file is re-indexed. NOTE(review): rows of files that no longer exist in
+        the vault are not purged here.
+        """
+        stale: List[Path] = []
+        for file_path in self._list_files():
+            relative_path = file_path.relative_to(self.vault_path).as_posix()
+            modified_at = file_path.stat().st_mtime
+            # Double single quotes so a name like "it's.md" cannot break the filter.
+            quoted = relative_path.replace("'", "''")
+            rows = (
+                self.vector_store.table.search()
+                .limit(1)
+                .where(f"source_file = '{quoted}'")
+                .to_list()
+            )
+            stored = rows[0].get("modified_at") if rows else None
+            if stored is None or stored < modified_at:
+                stale.append(file_path)
+                self.vector_store.delete_by_source_file(relative_path)
+        self._index_files(stale)
+
+    def status(self) -> Dict[str, int]:
+        """Return the store count plus the numbers of indexed and unindexed files."""
+        total_chunks = self.vector_store.count()
+        # Project to the source_file column instead of materialising every column.
+        table = self.vector_store.table.to_lance().to_table(columns=["source_file"])
+        indexed_files = set(table.to_pydict()["source_file"])
+        vault = self.vault_path
+        all_files = {p.relative_to(vault).as_posix() for p in self._list_files()}
+        return {
+            "total_chunks": total_chunks,
+            "indexed_files": len(indexed_files),
+            "unindexed_files": len(all_files - indexed_files),
+        }
--- a/tests/test_indexer.py
+++ b/tests/test_indexer.py
@@ -0,0 +1,155 @@
+import tempfile
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+from companion.config import (
+    Config,
+    VaultConfig,
+    IndexingConfig,
+    RagConfig,
+    EmbeddingConfig,
+    VectorStoreConfig,
+    SearchConfig,
+    HybridSearchConfig,
+    FiltersConfig,
+    CompanionConfig,
+    PersonaConfig,
+    MemoryConfig,
+    ChatConfig,
+    ModelConfig,
+    InferenceConfig,
+    FineTuningConfig,
+    RetrainScheduleConfig,
+    ApiConfig,
+    AuthConfig,
+    UiConfig,
+    WebConfig,
+    WebFeaturesConfig,
+    CliConfig,
+    LoggingConfig,
+    SecurityConfig,
+)
+from companion.rag.indexer import Indexer
+from companion.rag.vector_store import VectorStore
+
+
+def _make_config(vault_path: Path, vector_store_path: Path) -> Config:
+    """Build a complete ``Config`` for tests around the two given paths.
+
+    Only ``vault.path`` and ``rag.vector_store.path`` vary; the rest is fixed.
+    """
+    persona = PersonaConfig(
+        role="companion", tone="reflective", style="questioning", boundaries=[]
+    )
+    memory = MemoryConfig(session_turns=20, persistent_store="", summarize_after=10)
+    chat = ChatConfig(
+        streaming=True,
+        max_response_tokens=2048,
+        default_temperature=0.7,
+        allow_temperature_override=True,
+    )
+    companion = CompanionConfig(name="SAN", persona=persona, memory=memory, chat=chat)
+    indexing = IndexingConfig(
+        auto_sync=False,
+        auto_sync_interval_minutes=1440,
+        watch_fs_events=False,
+        file_patterns=["*.md"],
+        deny_dirs=[".git"],
+        deny_patterns=[".*"],
+    )
+    vault = VaultConfig(path=str(vault_path), indexing=indexing, chunking_rules={})
+    embedding = EmbeddingConfig(
+        provider="ollama",
+        model="dummy",
+        base_url="http://localhost:11434",
+        dimensions=4,
+        batch_size=2,
+    )
+    vector_store = VectorStoreConfig(type="lancedb", path=str(vector_store_path))
+    hybrid = HybridSearchConfig(enabled=False, keyword_weight=0.3, semantic_weight=0.7)
+    filters = FiltersConfig(
+        date_range_enabled=True,
+        tag_filter_enabled=True,
+        directory_filter_enabled=True,
+    )
+    search = SearchConfig(
+        default_top_k=8,
+        max_top_k=20,
+        similarity_threshold=0.75,
+        hybrid_search=hybrid,
+        filters=filters,
+    )
+    rag = RagConfig(embedding=embedding, vector_store=vector_store, search=search)
+    inference = InferenceConfig(
+        backend="llama.cpp",
+        model_path="",
+        context_length=8192,
+        gpu_layers=35,
+        batch_size=512,
+        threads=8,
+    )
+    fine_tuning = FineTuningConfig(
+        base_model="",
+        output_dir="",
+        lora_rank=16,
+        lora_alpha=32,
+        learning_rate=0.0002,
+        batch_size=4,
+        gradient_accumulation_steps=4,
+        num_epochs=3,
+        warmup_steps=100,
+        save_steps=500,
+        eval_steps=250,
+        training_data_path="",
+        validation_split=0.1,
+    )
+    retrain_schedule = RetrainScheduleConfig(
+        auto_reminder=True, default_interval_days=90, reminder_channels=[]
+    )
+    model = ModelConfig(
+        inference=inference,
+        fine_tuning=fine_tuning,
+        retrain_schedule=retrain_schedule,
+    )
+    api = ApiConfig(
+        host="127.0.0.1", port=7373, cors_origins=[], auth=AuthConfig(enabled=False)
+    )
+    features = WebFeaturesConfig(streaming=True, citations=True, source_preview=True)
+    web = WebConfig(enabled=True, theme="obsidian", features=features)
+    ui = UiConfig(web=web, cli=CliConfig(enabled=True, rich_output=True))
+    log_config = LoggingConfig(level="INFO", file="", max_size_mb=100, backup_count=5)
+    security = SecurityConfig(
+        local_only=True,
+        vault_path_traversal_check=True,
+        sensitive_content_detection=True,
+        sensitive_patterns=[],
+        require_confirmation_for_external_apis=True,
+    )
+    return Config(
+        companion=companion,
+        vault=vault,
+        rag=rag,
+        model=model,
+        api=api,
+        ui=ui,
+        logging=log_config,
+        security=security,
+    )
+
+
+@patch("companion.rag.indexer.OllamaEmbedder")
+def test_full_index_creates_vectors(mock_embedder_cls):
+    """Index a one-note vault with a stubbed embedder; the store must count 1."""
+    # Whatever the patched class constructs returns one 4-d vector from embed().
+    mock_embedder_cls.return_value.embed.return_value = [[1.0, 0.0, 0.0, 0.0]]
+
+    with tempfile.TemporaryDirectory() as scratch:
+        base_dir = Path(scratch)
+        vault_dir = base_dir / "vault"
+        vault_dir.mkdir()
+        (vault_dir / "hello.md").write_text("hello world", encoding="utf-8")
+        db_path = base_dir / "vectors"
+        config = _make_config(vault_dir, db_path)
+        vector_store = VectorStore(uri=db_path, dimensions=4)
+        Indexer(config, vector_store).full_index()
+        assert vector_store.count() == 1