feat: add indexer orchestrator with full index, sync, and status
This commit is contained in:
162
src/companion/rag/indexer.py
Normal file
162
src/companion/rag/indexer.py
Normal file
@@ -0,0 +1,162 @@
|
||||
import fnmatch
import logging
import os
from pathlib import Path
from typing import Dict, Iterator, List

from companion.config import Config
from companion.rag.chunker import Chunk, ChunkingRule, chunk_file
from companion.rag.embedder import OllamaEmbedder
from companion.rag.vector_store import VectorStore
|
||||
|
||||
|
||||
class Indexer:
    """Orchestrate indexing of vault files into the vector store.

    The indexer walks the vault directory, selects files according to the
    indexing configuration, chunks them, embeds the chunk texts and writes
    the result to the vector store.  Three entry points are offered:
    ``full_index`` (rebuild), ``sync`` (re-index new or modified files) and
    ``status`` (summary counts).
    """

    _logger = logging.getLogger(__name__)

    def __init__(self, config: "Config", vector_store: "VectorStore"):
        """Store the configuration and build the embedder.

        Args:
            config: Application configuration; ``config.vault`` and
                ``config.rag.embedding`` are read here.
            vector_store: Store that receives the indexed chunks.
        """
        self.config = config
        self.vector_store = vector_store
        self.vault_path = Path(config.vault.path).resolve()
        self.embedding_config = config.rag.embedding
        self.indexing_config = config.vault.indexing
        self.chunking_rules = self._load_chunking_rules()
        self.embedder = OllamaEmbedder(
            base_url=self.embedding_config.base_url,
            model=self.embedding_config.model,
            batch_size=self.embedding_config.batch_size,
        )

    def _load_chunking_rules(self) -> Dict[str, "ChunkingRule"]:
        """Convert the configured chunking rules into ``ChunkingRule`` objects.

        Returns:
            A mapping from the configured pattern to its ``ChunkingRule``.
            Falsy ``section_tags`` values are normalised to ``None``.
        """
        return {
            pattern: ChunkingRule(
                strategy=rule.strategy,
                chunk_size=rule.chunk_size,
                chunk_overlap=rule.chunk_overlap,
                section_tags=rule.section_tags or None,
            )
            for pattern, rule in self.config.vault.chunking_rules.items()
        }

    def _should_index(self, relative_path: str) -> bool:
        """Decide whether a vault-relative path is eligible for indexing.

        A path is rejected when any of its components equals a denied
        directory name, or when the full relative path or the file name
        matches a deny pattern.  Otherwise it is accepted only if the file
        name matches one of the configured file patterns.

        Args:
            relative_path: POSIX-style path relative to the vault root.

        Returns:
            ``True`` if the file should be indexed, ``False`` otherwise.
        """
        path = Path(relative_path)
        name = path.name
        settings = self.indexing_config

        if any(deny_dir in path.parts for deny_dir in settings.deny_dirs):
            return False
        if any(
            fnmatch.fnmatch(relative_path, pattern) or fnmatch.fnmatch(name, pattern)
            for pattern in settings.deny_patterns
        ):
            return False
        return any(fnmatch.fnmatch(name, pattern) for pattern in settings.file_patterns)

    def _list_files(self) -> Iterator[Path]:
        """Yield every indexable file under the vault, in sorted order.

        Yields:
            Absolute paths of files accepted by ``_should_index``.
        """
        denied = set(self.indexing_config.deny_dirs)
        for root, dirs, files in os.walk(self.vault_path):
            # Prune denied directories in place so os.walk never descends
            # into them (e.g. a large ``.git``); every file below them would
            # be rejected by _should_index anyway.  Sorting makes the
            # traversal order deterministic across platforms.
            dirs[:] = sorted(d for d in dirs if d not in denied)
            for file_name in sorted(files):
                file_path = Path(root) / file_name
                # os.walk roots are always below vault_path, so relative_to
                # cannot fail here.
                relative_path = file_path.relative_to(self.vault_path).as_posix()
                if self._should_index(relative_path):
                    yield file_path

    def _build_records(self, file_paths: List[Path]) -> Dict[str, list]:
        """Chunk and embed the given files without touching the store.

        Args:
            file_paths: Absolute paths of the files to process.

        Returns:
            Keyword arguments for ``vector_store.upsert`` (``ids``,
            ``texts``, ``embeddings``, ``metadatas``), or an empty dict when
            the files produced no chunks.
        """
        all_chunks: List[Chunk] = []
        for file_path in file_paths:
            try:
                modified_at = file_path.stat().st_mtime
            except FileNotFoundError:
                # The file was removed after it was listed; skip it rather
                # than abort the whole run.
                self._logger.warning(
                    "Skipping %s: file disappeared before indexing", file_path
                )
                continue
            all_chunks.extend(
                chunk_file(
                    file_path=file_path,
                    vault_root=self.vault_path,
                    rules=self.chunking_rules,
                    modified_at=modified_at,
                )
            )

        if not all_chunks:
            return {}

        texts = [chunk.text for chunk in all_chunks]
        embeddings = self.embedder.embed(texts)
        ids = [f"{chunk.source_file}::{chunk.chunk_index}" for chunk in all_chunks]
        metadatas = [
            {
                "source_file": chunk.source_file,
                "source_directory": chunk.source_directory,
                "section": chunk.section,
                "date": chunk.date,
                "tags": chunk.tags,
                "chunk_index": chunk.chunk_index,
                "total_chunks": chunk.total_chunks,
                "modified_at": chunk.modified_at,
                "rule_applied": chunk.rule_applied,
            }
            for chunk in all_chunks
        ]
        return {
            "ids": ids,
            "texts": texts,
            "embeddings": embeddings,
            "metadatas": metadatas,
        }

    def _index_files(self, file_paths: List[Path]) -> None:
        """Chunk, embed and upsert the given files into the vector store.

        Args:
            file_paths: Absolute paths of the files to index.  Nothing is
                written when they produce no chunks.
        """
        records = self._build_records(file_paths)
        if records:
            self.vector_store.upsert(**records)

    def full_index(self) -> None:
        """Rebuild the index from scratch.

        All chunking and embedding happens before the existing table is
        dropped, so a failure while embedding leaves the previous index
        intact.

        Raises:
            FileNotFoundError: If the vault directory does not exist.  This
                guards against wiping the index when the vault path is
                misconfigured or unavailable.
        """
        if not self.vault_path.is_dir():
            raise FileNotFoundError(f"Vault directory not found: {self.vault_path}")

        records = self._build_records(list(self._list_files()))

        try:
            self.vector_store.table.drop()
        except Exception:
            # Dropping stays best-effort, but the failure is no longer
            # silent.  NOTE(review): confirm the table object really exposes
            # drop(); if it does not, this branch runs on every rebuild.
            self._logger.warning(
                "Could not drop the existing table; continuing with rebuild",
                exc_info=True,
            )
        self.vector_store.table = self.vector_store._get_or_create_table()

        if records:
            self.vector_store.upsert(**records)

    def sync(self) -> None:
        """Re-index files that are new or modified since they were indexed.

        A file is considered up to date when the store holds a row for it
        whose ``modified_at`` is not older than the file's current mtime.
        Existing chunks of stale files are deleted only after the
        replacement chunks have been embedded.

        NOTE(review): rows whose source file no longer exists in the vault
        are not removed here.
        """
        stale = []
        for file_path in self._list_files():
            relative_path = file_path.relative_to(self.vault_path).as_posix()
            try:
                modified_at = file_path.stat().st_mtime
            except FileNotFoundError:
                continue

            # Double single quotes so names such as "it's.md" stay inside
            # the SQL string literal of the filter.
            escaped = relative_path.replace("'", "''")
            results = (
                self.vector_store.table.search()
                .limit(1)
                .where(f"source_file = '{escaped}'")
                .to_list()
            )
            if results:
                existing_modified_at = results[0].get("modified_at")
                if (
                    existing_modified_at is not None
                    and existing_modified_at >= modified_at
                ):
                    continue
            stale.append((file_path, relative_path))

        records = self._build_records([file_path for file_path, _ in stale])

        for _, relative_path in stale:
            self.vector_store.delete_by_source_file(relative_path)
        if records:
            self.vector_store.upsert(**records)

    def status(self) -> Dict[str, int]:
        """Summarise the state of the index.

        Returns:
            A dict with ``total_chunks`` (rows in the store),
            ``indexed_files`` (distinct ``source_file`` values in the store)
            and ``unindexed_files`` (indexable vault files with no row).
        """
        total_chunks = self.vector_store.count()

        # Read only the source_file column; loading every column would also
        # pull all embedding vectors into memory.
        source_column = (
            self.vector_store.table.to_lance()
            .to_table(columns=["source_file"])
            .to_pydict()["source_file"]
        )
        indexed_files = set(source_column)

        all_files = {
            file_path.relative_to(self.vault_path).as_posix()
            for file_path in self._list_files()
        }

        return {
            "total_chunks": total_chunks,
            "indexed_files": len(indexed_files),
            "unindexed_files": len(all_files - indexed_files),
        }
|
||||
155
tests/test_indexer.py
Normal file
155
tests/test_indexer.py
Normal file
@@ -0,0 +1,155 @@
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from companion.config import (
|
||||
Config,
|
||||
VaultConfig,
|
||||
IndexingConfig,
|
||||
RagConfig,
|
||||
EmbeddingConfig,
|
||||
VectorStoreConfig,
|
||||
SearchConfig,
|
||||
HybridSearchConfig,
|
||||
FiltersConfig,
|
||||
CompanionConfig,
|
||||
PersonaConfig,
|
||||
MemoryConfig,
|
||||
ChatConfig,
|
||||
ModelConfig,
|
||||
InferenceConfig,
|
||||
FineTuningConfig,
|
||||
RetrainScheduleConfig,
|
||||
ApiConfig,
|
||||
AuthConfig,
|
||||
UiConfig,
|
||||
WebConfig,
|
||||
WebFeaturesConfig,
|
||||
CliConfig,
|
||||
LoggingConfig,
|
||||
SecurityConfig,
|
||||
)
|
||||
from companion.rag.indexer import Indexer
|
||||
from companion.rag.vector_store import VectorStore
|
||||
|
||||
|
||||
def _make_config(vault_path: Path, vector_store_path: Path) -> Config:
    """Assemble a complete ``Config`` for the indexer tests.

    Only the vault path and the vector-store path vary; every other
    setting is a fixed placeholder.  The vault section indexes ``*.md``
    files, denies the ``.git`` directory and dot-files, and defines no
    chunking rules.  Embeddings are declared as 4-dimensional.
    """
    companion = CompanionConfig(
        name="SAN",
        persona=PersonaConfig(
            role="companion", tone="reflective", style="questioning", boundaries=[]
        ),
        memory=MemoryConfig(session_turns=20, persistent_store="", summarize_after=10),
        chat=ChatConfig(
            streaming=True,
            max_response_tokens=2048,
            default_temperature=0.7,
            allow_temperature_override=True,
        ),
    )

    indexing = IndexingConfig(
        auto_sync=False,
        auto_sync_interval_minutes=1440,
        watch_fs_events=False,
        file_patterns=["*.md"],
        deny_dirs=[".git"],
        deny_patterns=[".*"],
    )
    vault = VaultConfig(path=str(vault_path), indexing=indexing, chunking_rules={})

    embedding = EmbeddingConfig(
        provider="ollama",
        model="dummy",
        base_url="http://localhost:11434",
        dimensions=4,
        batch_size=2,
    )
    search = SearchConfig(
        default_top_k=8,
        max_top_k=20,
        similarity_threshold=0.75,
        hybrid_search=HybridSearchConfig(
            enabled=False, keyword_weight=0.3, semantic_weight=0.7
        ),
        filters=FiltersConfig(
            date_range_enabled=True,
            tag_filter_enabled=True,
            directory_filter_enabled=True,
        ),
    )
    rag = RagConfig(
        embedding=embedding,
        vector_store=VectorStoreConfig(type="lancedb", path=str(vector_store_path)),
        search=search,
    )

    inference = InferenceConfig(
        backend="llama.cpp",
        model_path="",
        context_length=8192,
        gpu_layers=35,
        batch_size=512,
        threads=8,
    )
    fine_tuning = FineTuningConfig(
        base_model="",
        output_dir="",
        lora_rank=16,
        lora_alpha=32,
        learning_rate=0.0002,
        batch_size=4,
        gradient_accumulation_steps=4,
        num_epochs=3,
        warmup_steps=100,
        save_steps=500,
        eval_steps=250,
        training_data_path="",
        validation_split=0.1,
    )
    model = ModelConfig(
        inference=inference,
        fine_tuning=fine_tuning,
        retrain_schedule=RetrainScheduleConfig(
            auto_reminder=True, default_interval_days=90, reminder_channels=[]
        ),
    )

    api = ApiConfig(
        host="127.0.0.1", port=7373, cors_origins=[], auth=AuthConfig(enabled=False)
    )

    ui = UiConfig(
        web=WebConfig(
            enabled=True,
            theme="obsidian",
            features=WebFeaturesConfig(
                streaming=True, citations=True, source_preview=True
            ),
        ),
        cli=CliConfig(enabled=True, rich_output=True),
    )

    log_settings = LoggingConfig(level="INFO", file="", max_size_mb=100, backup_count=5)

    security = SecurityConfig(
        local_only=True,
        vault_path_traversal_check=True,
        sensitive_content_detection=True,
        sensitive_patterns=[],
        require_confirmation_for_external_apis=True,
    )

    return Config(
        companion=companion,
        vault=vault,
        rag=rag,
        model=model,
        api=api,
        ui=ui,
        logging=log_settings,
        security=security,
    )
|
||||
|
||||
|
||||
@patch("companion.rag.indexer.OllamaEmbedder")
def test_full_index_creates_vectors(mock_embedder_cls):
    """A full index of a one-file vault stores exactly one vector.

    The embedder class is patched so no Ollama server is needed; the fake
    embedder always returns a single 4-dimensional vector.
    """
    fake_embedder = MagicMock(**{"embed.return_value": [[1.0, 0.0, 0.0, 0.0]]})
    mock_embedder_cls.return_value = fake_embedder

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        vault_dir = workdir / "vault"
        vectors_dir = workdir / "vectors"

        vault_dir.mkdir()
        (vault_dir / "hello.md").write_text("hello world", encoding="utf-8")

        store = VectorStore(uri=vectors_dir, dimensions=4)
        Indexer(_make_config(vault_dir, vectors_dir), store).full_index()

        assert store.count() == 1
|
||||
Reference in New Issue
Block a user