From ce0414a5ce5b0fa79c5494e507fa85ab9d560222 Mon Sep 17 00:00:00 2001
From: Santhosh Janardhanan
Date: Mon, 13 Apr 2026 14:24:33 -0400
Subject: [PATCH] feat: add indexer orchestrator with full index, sync, and status

---
Reviewer revision: restored the patch's line breaks. indexer.py now prunes
denied directories during the walk, escapes single quotes in the sync filter,
logs a failed table drop instead of discarding it, and carries docstrings.
The commit id above and the blob id on the "index" line predate this revision.

 src/companion/rag/indexer.py | 201 +++++++++++++++++++++++++++++++++++
 tests/test_indexer.py        | 155 +++++++++++++++++++++++++++
 2 files changed, 356 insertions(+)
 create mode 100644 src/companion/rag/indexer.py
 create mode 100644 tests/test_indexer.py

diff --git a/src/companion/rag/indexer.py b/src/companion/rag/indexer.py
new file mode 100644
index 0000000..098c529
--- /dev/null
+++ b/src/companion/rag/indexer.py
@@ -0,0 +1,201 @@
+import fnmatch
+import logging
+import os
+from pathlib import Path
+from typing import Dict, Iterator, List
+
+from companion.config import Config
+from companion.rag.chunker import Chunk, ChunkingRule, chunk_file
+from companion.rag.embedder import OllamaEmbedder
+from companion.rag.vector_store import VectorStore
+
+logger = logging.getLogger(__name__)
+
+
+class Indexer:
+    """Chunk vault files, embed the chunks, and store them in a vector store."""
+
+    def __init__(self, config: Config, vector_store: VectorStore):
+        """Bind the indexer to *config* and *vector_store*.
+
+        The vault path is resolved once, and the embedder is built from
+        ``config.rag.embedding``.
+        """
+        self.config = config
+        self.vector_store = vector_store
+        self.vault_path = Path(config.vault.path).resolve()
+        self.embedding_config = config.rag.embedding
+        self.indexing_config = config.vault.indexing
+        self.chunking_rules = self._load_chunking_rules()
+        self.embedder = OllamaEmbedder(
+            base_url=self.embedding_config.base_url,
+            model=self.embedding_config.model,
+            batch_size=self.embedding_config.batch_size,
+        )
+
+    def _load_chunking_rules(self) -> Dict[str, ChunkingRule]:
+        """Convert the configured chunking rules into ``ChunkingRule`` objects.
+
+        Keys are the patterns from ``config.vault.chunking_rules``; an empty
+        ``section_tags`` value is normalised to ``None``.
+        """
+        return {
+            pattern: ChunkingRule(
+                strategy=rule.strategy,
+                chunk_size=rule.chunk_size,
+                chunk_overlap=rule.chunk_overlap,
+                section_tags=rule.section_tags or None,
+            )
+            for pattern, rule in self.config.vault.chunking_rules.items()
+        }
+
+    def _should_index(self, relative_path: str) -> bool:
+        """Return True when *relative_path* passes the deny and allow filters.
+
+        A path is rejected if any of its components equals a denied directory,
+        or if the path or its file name matches a deny pattern. Otherwise it is
+        accepted only if its file name matches one of ``file_patterns``.
+        """
+        path = Path(relative_path)
+        name = path.name
+        if any(deny_dir in path.parts for deny_dir in self.indexing_config.deny_dirs):
+            return False
+        for pattern in self.indexing_config.deny_patterns:
+            if fnmatch.fnmatch(relative_path, pattern) or fnmatch.fnmatch(
+                name, pattern
+            ):
+                return False
+        return any(
+            fnmatch.fnmatch(name, pattern)
+            for pattern in self.indexing_config.file_patterns
+        )
+
+    def _list_files(self) -> Iterator[Path]:
+        """Yield the absolute path of every vault file that should be indexed.
+
+        Denied directories are pruned from the walk so their contents (for
+        example a large ``.git`` tree) are never traversed; ``_should_index``
+        would reject everything beneath them anyway.
+        """
+        deny_dirs = set(self.indexing_config.deny_dirs)
+        for root, dirs, files in os.walk(self.vault_path):
+            # In-place edit: os.walk only honours pruning done on this list.
+            dirs[:] = [name for name in dirs if name not in deny_dirs]
+            for file_name in files:
+                file_path = Path(root) / file_name
+                try:
+                    relative_path = file_path.relative_to(self.vault_path).as_posix()
+                except ValueError:
+                    continue
+                if self._should_index(relative_path):
+                    yield file_path
+
+    def _index_files(self, file_paths: List[Path]) -> None:
+        """Chunk, embed, and upsert every file in *file_paths*.
+
+        All chunks are embedded in a single ``embed`` call. Chunk ids have the
+        form ``"{source_file}::{chunk_index}"``. Does nothing when no chunks
+        are produced.
+        """
+        all_chunks: List[Chunk] = []
+        for file_path in file_paths:
+            all_chunks.extend(
+                chunk_file(
+                    file_path=file_path,
+                    vault_root=self.vault_path,
+                    rules=self.chunking_rules,
+                    modified_at=file_path.stat().st_mtime,
+                )
+            )
+
+        if not all_chunks:
+            return
+
+        texts = [chunk.text for chunk in all_chunks]
+        embeddings = self.embedder.embed(texts)
+        ids = [f"{chunk.source_file}::{chunk.chunk_index}" for chunk in all_chunks]
+        metadatas = [
+            {
+                "source_file": chunk.source_file,
+                "source_directory": chunk.source_directory,
+                "section": chunk.section,
+                "date": chunk.date,
+                "tags": chunk.tags,
+                "chunk_index": chunk.chunk_index,
+                "total_chunks": chunk.total_chunks,
+                "modified_at": chunk.modified_at,
+                "rule_applied": chunk.rule_applied,
+            }
+            for chunk in all_chunks
+        ]
+
+        self.vector_store.upsert(
+            ids=ids,
+            texts=texts,
+            embeddings=embeddings,
+            metadatas=metadatas,
+        )
+
+    def full_index(self) -> None:
+        """Drop and re-acquire the store's table, then index every vault file."""
+        try:
+            self.vector_store.table.drop()
+        except Exception:
+            # Best effort: _get_or_create_table() runs below either way, but
+            # log the failure instead of discarding it silently.
+            logger.warning("Could not drop the existing table", exc_info=True)
+        self.vector_store.table = self.vector_store._get_or_create_table()
+
+        self._index_files(list(self._list_files()))
+
+    def sync(self) -> None:
+        """Re-index files that are new or modified since they were last indexed.
+
+        A file is re-indexed when the store has no row for it, or when the
+        stored ``modified_at`` is missing or older than the file's mtime. Its
+        existing rows are deleted first. Rows belonging to files that no
+        longer exist in the vault are not removed here.
+        """
+        file_paths_to_index = []
+        for file_path in self._list_files():
+            relative_path = file_path.relative_to(self.vault_path).as_posix()
+            modified_at = file_path.stat().st_mtime
+
+            # Double any single quote so a name like "Bob's notes.md" stays
+            # inside the SQL string literal instead of breaking the filter.
+            quoted_path = relative_path.replace("'", "''")
+            results = (
+                self.vector_store.table.search()
+                .limit(1)
+                .where(f"source_file = '{quoted_path}'")
+                .to_list()
+            )
+
+            stored_modified_at = results[0].get("modified_at") if results else None
+            if stored_modified_at is not None and stored_modified_at >= modified_at:
+                continue
+
+            file_paths_to_index.append(file_path)
+            self.vector_store.delete_by_source_file(relative_path)
+
+        self._index_files(file_paths_to_index)
+
+    def status(self) -> Dict[str, int]:
+        """Return chunk and file counts describing the state of the index.
+
+        Keys: ``total_chunks`` (``vector_store.count()``), ``indexed_files``
+        (distinct ``source_file`` values in the table) and ``unindexed_files``
+        (indexable vault files whose path is absent from the table).
+        """
+        total_chunks = self.vector_store.count()
+        table = self.vector_store.table.to_lance().to_table()
+        indexed_files = set(table.to_pydict()["source_file"])
+        all_files = {
+            file_path.relative_to(self.vault_path).as_posix()
+            for file_path in self._list_files()
+        }
+        return {
+            "total_chunks": total_chunks,
+            "indexed_files": len(indexed_files),
+            "unindexed_files": len(all_files - indexed_files),
+        }
diff --git a/tests/test_indexer.py b/tests/test_indexer.py
new file mode 100644
index 0000000..1ba4f26
--- /dev/null
+++ b/tests/test_indexer.py
@@ -0,0 +1,155 @@
+import tempfile
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+from companion.config import (
+    Config,
+    VaultConfig,
+    IndexingConfig,
+    RagConfig,
+    EmbeddingConfig,
+    VectorStoreConfig,
+    SearchConfig,
+    HybridSearchConfig,
+    FiltersConfig,
+    CompanionConfig,
+    PersonaConfig,
+    MemoryConfig,
+    ChatConfig,
+    ModelConfig,
+    InferenceConfig,
+    FineTuningConfig,
+    RetrainScheduleConfig,
+    ApiConfig,
+    AuthConfig,
+    UiConfig,
+    WebConfig,
+    WebFeaturesConfig,
+    CliConfig,
+    LoggingConfig,
+    SecurityConfig,
+)
+from companion.rag.indexer import Indexer
+from companion.rag.vector_store import VectorStore
+
+
+def _make_config(vault_path: Path, vector_store_path: Path) -> Config:
+    return Config(
+        companion=CompanionConfig(
+            name="SAN",
+            persona=PersonaConfig(
+                role="companion", tone="reflective", style="questioning", boundaries=[]
+            ),
+            memory=MemoryConfig(
+                session_turns=20, persistent_store="", summarize_after=10
+            ),
+            chat=ChatConfig(
+                streaming=True,
+                max_response_tokens=2048,
+                default_temperature=0.7,
+                allow_temperature_override=True,
+            ),
+        ),
+        vault=VaultConfig(
+            path=str(vault_path),
+            indexing=IndexingConfig(
+                auto_sync=False,
+                auto_sync_interval_minutes=1440,
+                watch_fs_events=False,
+                file_patterns=["*.md"],
+                deny_dirs=[".git"],
+                deny_patterns=[".*"],
+            ),
+            chunking_rules={},
+        ),
+        rag=RagConfig(
+            embedding=EmbeddingConfig(
+                provider="ollama",
+                model="dummy",
+                base_url="http://localhost:11434",
+                dimensions=4,
+                batch_size=2,
+            ),
+            vector_store=VectorStoreConfig(type="lancedb", path=str(vector_store_path)),
+            search=SearchConfig(
+                default_top_k=8,
+                max_top_k=20,
+                similarity_threshold=0.75,
+                hybrid_search=HybridSearchConfig(
+                    enabled=False, keyword_weight=0.3, semantic_weight=0.7
+                ),
+                filters=FiltersConfig(
+                    date_range_enabled=True,
+                    tag_filter_enabled=True,
+                    directory_filter_enabled=True,
+                ),
+            ),
+        ),
+        model=ModelConfig(
+            inference=InferenceConfig(
+                backend="llama.cpp",
+                model_path="",
+                context_length=8192,
+                gpu_layers=35,
+                batch_size=512,
+                threads=8,
+            ),
+            fine_tuning=FineTuningConfig(
+                base_model="",
+                output_dir="",
+                lora_rank=16,
+                lora_alpha=32,
+                learning_rate=0.0002,
+                batch_size=4,
+                gradient_accumulation_steps=4,
+                num_epochs=3,
+                warmup_steps=100,
+                save_steps=500,
+                eval_steps=250,
+                training_data_path="",
+                validation_split=0.1,
+            ),
+            retrain_schedule=RetrainScheduleConfig(
+                auto_reminder=True, default_interval_days=90, reminder_channels=[]
+            ),
+        ),
+        api=ApiConfig(
+            host="127.0.0.1", port=7373, cors_origins=[], auth=AuthConfig(enabled=False)
+        ),
+        ui=UiConfig(
+            web=WebConfig(
+                enabled=True,
+                theme="obsidian",
+                features=WebFeaturesConfig(
+                    streaming=True, citations=True, source_preview=True
+                ),
+            ),
+            cli=CliConfig(enabled=True, rich_output=True),
+        ),
+        logging=LoggingConfig(level="INFO", file="", max_size_mb=100, backup_count=5),
+        security=SecurityConfig(
+            local_only=True,
+            vault_path_traversal_check=True,
+            sensitive_content_detection=True,
+            sensitive_patterns=[],
+            require_confirmation_for_external_apis=True,
+        ),
+    )
+
+
+@patch("companion.rag.indexer.OllamaEmbedder")
+def test_full_index_creates_vectors(mock_embedder_cls):
+    mock_embedder = MagicMock()
+    mock_embedder.embed.return_value = [[1.0, 0.0, 0.0, 0.0]]
+    mock_embedder_cls.return_value = mock_embedder
+
+    with tempfile.TemporaryDirectory() as tmp:
+        vault = Path(tmp) / "vault"
+        vault.mkdir()
+        (vault / "hello.md").write_text("hello world", encoding="utf-8")
+        vs_path = Path(tmp) / "vectors"
+        config = _make_config(vault, vs_path)
+        store = VectorStore(uri=vs_path, dimensions=4)
+        indexer = Indexer(config, store)
+        indexer.full_index()
+        assert store.count() == 1