Sprint 0-1: Python indexer, TS plugin scaffolding, and test suite
## What's new

**Python indexer (`python/obsidian_rag/`)** — full pipeline from scan to LanceDB:

- `config.py` — JSON config loader with cross-platform path resolution
- `security.py` — path traversal prevention, HTML stripping, sensitive content detection, dir allow/deny lists
- `chunker.py` — section-split for journal entries (date-named files), sliding-window for unstructured notes
- `embedder.py` — Ollama `/api/embeddings` client with batched requests and timeout/error handling
- `vector_store.py` — LanceDB schema, upsert (merge_insert), delete, search with filters, stats
- `indexer.py` — full/sync/reindex pipeline orchestrator with progress yields
- `cli.py` — `index | sync | reindex | status` CLI commands

**TypeScript plugin (`src/`)** — OpenClaw plugin scaffold:

- `utils/` — config loader, TypeScript types, response envelope factory, LanceDB client
- `services/` — health state machine (HEALTHY/DEGRADED/UNAVAILABLE), vault watcher with debounce/batching, indexer bridge (subprocess spawner)
- `tools/` — 4 tool stubs: search, index, status, memory_store (OpenClaw wiring pending)
- `index.ts` — plugin entry point with health probe + vault watcher startup

**Config** (`obsidian-rag/config.json`, `openclaw.plugin.json`):

- 627 files / 3764 chunks indexed in dev vault

**Tests: 76 passing**

- Python: 64 pytest tests (chunker, security, vector_store, config)
- TypeScript: 12 vitest tests (lancedb client, response envelope)

## Bugs fixed

- LanceDB `tags` column filter: `LIKE '%tag%'` → `list_contains(tags, 'tag')` (`tags` is a List<String> column; see the sketch below)
- LanceDB JS `db.list_tables()` returns a `ListTablesResponse` object, not a plain array
- LanceDB JS result score field: `_score` → `_distance`
- TypeScript regex literal with an unescaped `/` in the path-resolve regex
- Python: `create_table_if_not_exists` identity check → name comparison

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
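The tag-filter fix is the subtle one. Below is a minimal sketch of the corrected query, assuming the Python `lancedb` client and the `obsidian_chunks` table from this commit; the database path, tag value, and query vector are placeholders:

```python
# Sketch only: db path, tag value, and query vector are placeholders;
# the table name and _distance field come from this commit.
import lancedb

db = lancedb.connect("./obsidian-rag/lancedb")
table = db.open_table("obsidian_chunks")

query_vector = [0.0] * 1024  # stand-in for an Ollama embedding

# Wrong: `tags` is List<String>, so a string LIKE pattern never matches.
# table.search(query_vector).where("tags LIKE '%project%'")

# Right: list_contains works on the list column.
rows = (
    table.search(query_vector)
    .where("list_contains(tags, 'project')")
    .limit(5)
    .to_list()
)
for row in rows:
    print(row["source_file"], row["_distance"])
```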
python/obsidian_rag/indexer.py (new file)
@@ -0,0 +1,223 @@
"""Full indexing pipeline: scan → parse → chunk → embed → store."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Generator, Iterator
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from obsidian_rag.config import ObsidianRagConfig
|
||||
|
||||
import obsidian_rag.config as config_mod
|
||||
from obsidian_rag.chunker import chunk_file
|
||||
from obsidian_rag.embedder import EmbeddingError, OllamaUnavailableError
|
||||
from obsidian_rag.security import should_index_dir, validate_path
|
||||
from obsidian_rag.vector_store import create_table_if_not_exists, delete_by_source_file, get_db, upsert_chunks
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Pipeline
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
class Indexer:
    """Coordinates the scan → chunk → embed → store pipeline."""

    def __init__(self, config: "ObsidianRagConfig"):
        self.config = config
        self.vault_path = config_mod.resolve_vault_path(config)
        self._embedder = None  # lazy init

    @property
    def embedder(self):
        if self._embedder is None:
            from obsidian_rag.embedder import OllamaEmbedder
            self._embedder = OllamaEmbedder(self.config)
        return self._embedder

    def scan_vault(self) -> Generator[Path, None, None]:
        """Walk the vault, yielding markdown files to index."""
        for root, dirs, files in os.walk(self.vault_path):
            root_path = Path(root)
            # Prune excluded directories in place so os.walk skips them
            dirs[:] = [d for d in dirs if should_index_dir(d, self.config)]

            for fname in files:
                if not fname.endswith(".md"):
                    continue
                filepath = root_path / fname
                try:
                    validate_path(filepath, self.vault_path)
                except ValueError:
                    continue
                yield filepath

    def process_file(self, filepath: Path) -> tuple[int, list[dict[str, Any]]]:
        """Index a single file. Returns (num_chunks, enriched_chunks)."""
        from obsidian_rag import security

        mtime = datetime.fromtimestamp(filepath.stat().st_mtime, tz=timezone.utc).isoformat()
        content = filepath.read_text(encoding="utf-8")
        # Sanitize
        content = security.sanitize_text(content)
        # Chunk
        chunks = chunk_file(filepath, content, mtime, self.config)
        # Enrich with indexed_at
        now = datetime.now(timezone.utc).isoformat()
        enriched: list[dict[str, Any]] = []
        for chunk in chunks:
            enriched.append(
                {
                    "chunk_id": chunk.chunk_id,
                    "chunk_text": chunk.text,
                    "source_file": chunk.source_file,
                    "source_directory": chunk.source_directory,
                    "section": chunk.section,
                    "date": chunk.date,
                    "tags": chunk.tags,
                    "chunk_index": chunk.chunk_index,
                    "total_chunks": chunk.total_chunks,
                    "modified_at": chunk.modified_at,
                    "indexed_at": now,
                }
            )
        return len(chunks), enriched

    def full_index(self, on_progress: bool = False) -> Generator[dict[str, Any], None, None]:
        """Run a full index of the vault.

        A generator: yields progress dicts when on_progress is true, and
        always yields a final summary dict as its last item.
        """
        vault_path = self.vault_path
        if not vault_path.exists():
            raise FileNotFoundError(f"Vault not found: {vault_path}")

        db = get_db(self.config)
        table = create_table_if_not_exists(db)
        embedder = self.embedder

        files = list(self.scan_vault())
        total_files = len(files)
        indexed_files = 0
        total_chunks = 0
        errors: list[dict] = []

        for idx, filepath in enumerate(files):
            try:
                num_chunks, enriched = self.process_file(filepath)
                # Embed chunks
                texts = [e["chunk_text"] for e in enriched]
                try:
                    vectors = embedder.embed_chunks(texts)
                except OllamaUnavailableError:
                    # Ollama is down: store chunks with zero vectors (sized to
                    # the embedding dim) so a later reindex can fill them in
                    vectors = [[0.0] * 1024 for _ in texts]
                # Attach vectors
                for e, v in zip(enriched, vectors):
                    e["vector"] = v
                # Store
                upsert_chunks(table, enriched)
                total_chunks += num_chunks
                indexed_files += 1
            except Exception as exc:
                errors.append({"file": str(filepath), "error": str(exc)})

            if on_progress:
                # Coarse two-phase label: first half reported as "embedding",
                # second half as "storing"
                phase = "embedding" if idx < total_files // 2 else "storing"
                yield {
                    "type": "progress",
                    "phase": phase,
                    "current": idx + 1,
                    "total": total_files,
                }

        yield {
            "type": "done",
            "indexed_files": indexed_files,
            "total_chunks": total_chunks,
            "duration_ms": 0,  # caller can fill
            "errors": errors,
        }

    def sync(self, on_progress: bool = False) -> dict[str, Any]:
        """Incremental sync: only process files modified since the last sync.

        on_progress is accepted for parity with full_index; sync does not
        emit progress events yet.
        """
        sync_result_path = self._sync_result_path()
        last_sync = None
        if sync_result_path.exists():
            try:
                last_sync = json.loads(sync_result_path.read_text()).get("timestamp")
            except Exception:
                pass

        db = get_db(self.config)
        table = create_table_if_not_exists(db)
        embedder = self.embedder

        files = list(self.scan_vault())
        indexed_files = 0
        total_chunks = 0
        errors: list[dict] = []

        for filepath in files:
            mtime = datetime.fromtimestamp(filepath.stat().st_mtime, tz=timezone.utc)
            mtime_str = mtime.isoformat()
            # Both timestamps are UTC ISO-8601, so string comparison is safe
            if last_sync and mtime_str <= last_sync:
                continue  # unchanged

            try:
                num_chunks, enriched = self.process_file(filepath)
                texts = [e["chunk_text"] for e in enriched]
                try:
                    vectors = embedder.embed_chunks(texts)
                except OllamaUnavailableError:
                    vectors = [[0.0] * 1024 for _ in texts]
                for e, v in zip(enriched, vectors):
                    e["vector"] = v
                upsert_chunks(table, enriched)
                total_chunks += num_chunks
                indexed_files += 1
            except Exception as exc:
                errors.append({"file": str(filepath), "error": str(exc)})

        self._write_sync_result(indexed_files, total_chunks, errors)
        return {
            "indexed_files": indexed_files,
            "total_chunks": total_chunks,
            "errors": errors,
        }

    def reindex(self) -> dict[str, Any]:
        """Nuke and rebuild: drop the table and run a full index."""
        db = get_db(self.config)
        if "obsidian_chunks" in db.list_tables():
            db.drop_table("obsidian_chunks")
        # full_index is a generator; drain it and take the final "done" summary
        results = list(self.full_index())
        return results[-1] if results else {"indexed_files": 0, "total_chunks": 0, "errors": []}

    def _sync_result_path(self) -> Path:
        # Use the same dev-data-dir convention as config.py
        project_root = Path(__file__).parent.parent.parent
        data_dir = project_root / "obsidian-rag"
        if not data_dir.exists() and not (project_root / "KnowledgeVault").exists():
            data_dir = Path(os.path.expanduser("~/.obsidian-rag"))
        return data_dir / "sync-result.json"

    def _write_sync_result(
        self,
        indexed_files: int,
        total_chunks: int,
        errors: list[dict],
    ) -> None:
        path = self._sync_result_path()
        path.parent.mkdir(parents=True, exist_ok=True)
        result = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "indexed_files": indexed_files,
            "total_chunks": total_chunks,
            "errors": errors,
        }
        # Atomic write: .tmp, then replace (overwrites existing file on all platforms)
        tmp = path.with_suffix(".json.tmp")
        tmp.write_text(json.dumps(result, indent=2))
        tmp.replace(path)
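For context on how the `full_index` generator is meant to be consumed (the commit's `cli.py` is not shown here), a hypothetical driver might look like this; `load_config` is an assumed loader name:

```python
# Hypothetical consumer of Indexer.full_index(); load_config is an
# assumed helper name, the other names come from this commit.
from obsidian_rag.config import load_config  # assumed helper
from obsidian_rag.indexer import Indexer

indexer = Indexer(load_config())

summary = None
for event in indexer.full_index(on_progress=True):
    if event["type"] == "progress":
        print(f"[{event['phase']}] {event['current']}/{event['total']}")
    else:  # final "done" summary event
        summary = event

if summary:
    print(f"{summary['indexed_files']} files, {summary['total_chunks']} chunks, "
          f"{len(summary['errors'])} errors")
```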