Sprint 0-1: Python indexer, TS plugin scaffolding, and test suite

## What's new

**Python indexer (`python/obsidian_rag/`)** — full pipeline from scan to LanceDB:
- `config.py` — JSON config loader with cross-platform path resolution
- `security.py` — path traversal prevention, HTML stripping, sensitive content detection, dir allow/deny lists
- `chunker.py` — section-split for journal entries (date-named files), sliding-window for unstructured notes
- `embedder.py` — Ollama `/api/embeddings` client with batched requests and timeout/error handling
- `vector_store.py` — LanceDB schema, upsert (merge_insert), delete, search with filters, stats
- `indexer.py` — full/sync/reindex pipeline orchestrator with progress yields
- `cli.py` — `index | sync | reindex | status` CLI commands

**TypeScript plugin (`src/`)** — OpenClaw plugin scaffold:
- `utils/` — config loader, TypeScript types, response envelope factory, LanceDB client
- `services/` — health state machine (HEALTHY/DEGRADED/UNAVAILABLE), vault watcher with debounce/batching, indexer bridge (subprocess spawner)
- `tools/` — 4 tool stubs: search, index, status, memory_store (OpenClaw wiring pending)
- `index.ts` — plugin entry point with health probe + vault watcher startup

**Config** (`obsidian-rag/config.json`, `openclaw.plugin.json`):
- 627 files / 3764 chunks indexed in dev vault

**Tests: 76 passing**
- Python: 64 pytest tests (chunker, security, vector_store, config)
- TypeScript: 12 vitest tests (lancedb client, response envelope)

## Bugs fixed

- LanceDB `tags` column filter: `LIKE '%tag%'` → `list_contains(tags, 'tag')` (List<String> column)
- LanceDB JS `db.list_tables()` returns `ListTablesResponse` object, not plain array
- LanceDB JS result score field: `_score` → `_distance`
- TypeScript regex literal with unescaped `/` in path-resolve regex
- Python: `create_table_if_not_exists` identity check → name comparison

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-10 22:56:50 -04:00
parent 18ad47e100
commit 5c281165c7
40 changed files with 5814 additions and 59 deletions

View File

@@ -0,0 +1,189 @@
"""Tests for obsidian_rag.vector_store — LanceDB CRUD operations."""
from __future__ import annotations
import lancedb
import pytest
from pathlib import Path
from obsidian_rag.vector_store import (
SearchResult,
create_table_if_not_exists,
delete_by_source_file,
get_stats,
search_chunks,
upsert_chunks,
)
# ----------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------
def _connect(db_path: Path) -> lancedb.LanceDBConnection:
    """Open a LanceDB connection at *db_path*, creating parent dirs first."""
    parent = db_path.parent
    parent.mkdir(parents=True, exist_ok=True)
    return lancedb.connect(str(db_path))
def _make_table(tmp_path: Path):
    """Return a fresh, empty obsidian_chunks table rooted under *tmp_path*."""
    connection = _connect(tmp_path / "test.lance")
    return create_table_if_not_exists(connection)
def _chunk(source_file: str = "test.md", chunk_id: str = "c1", **overrides):
"""Build a minimal valid chunk dict."""
base = {
"vector": [0.1] * 1024,
"chunk_id": chunk_id,
"chunk_text": "Hello world",
"source_file": source_file,
"source_directory": "Notes",
"section": None,
"date": "2024-01-15",
"tags": ["#test"],
"chunk_index": 0,
"total_chunks": 1,
"modified_at": "2024-01-15T10:00:00Z",
"indexed_at": "2024-01-15T12:00:00Z",
}
base.update(overrides)
return base
# ----------------------------------------------------------------------
# Table creation
# ----------------------------------------------------------------------
def test_create_table_if_not_exists_creates_new(tmp_path: Path):
    """A brand-new database gets an empty obsidian_chunks table."""
    db = _connect(tmp_path / "new.lance")
    table = create_table_if_not_exists(db)
    # NOTE: list_tables() returns a response object exposing .tables,
    # not a plain list (see the related JS fix in this changeset).
    assert "obsidian_chunks" in db.list_tables().tables
    assert table.count_rows() == 0
def test_create_table_if_not_exists_idempotent(tmp_path: Path):
    """Calling twice must not error and must hand back the same table."""
    db = _connect(tmp_path / "exists.lance")
    first = create_table_if_not_exists(db)
    second = create_table_if_not_exists(db)
    # Compare by name, not identity: each call may return a fresh wrapper.
    assert first.name == second.name
# ----------------------------------------------------------------------
# upsert_chunks
# ----------------------------------------------------------------------
def test_upsert_chunks_inserts_new(tmp_path: Path):
    """A single new chunk is inserted and reported in the count."""
    table = _make_table(tmp_path)
    inserted = upsert_chunks(table, [_chunk()])
    assert inserted == 1
    assert table.count_rows() == 1
def test_upsert_chunks_empty_list_returns_zero(tmp_path: Path):
    """An empty batch is a no-op that reports zero upserts."""
    table = _make_table(tmp_path)
    assert upsert_chunks(table, []) == 0
def test_upsert_chunks_updates_existing(tmp_path: Path):
    """Re-upserting the same chunk_id replaces the row, never duplicates it."""
    table = _make_table(tmp_path)
    upsert_chunks(table, [_chunk(chunk_id="dup-id", chunk_text="Original")])
    upsert_chunks(table, [_chunk(chunk_id="dup-id", chunk_text="Updated")])
    assert table.count_rows() == 1
    frame = table.to_pandas()
    matching = frame.loc[frame["chunk_id"] == "dup-id", "chunk_text"]
    assert matching.iloc[0] == "Updated"
def test_upsert_chunks_multiple(tmp_path: Path):
    """A batch of ten distinct chunks yields ten rows."""
    table = _make_table(tmp_path)
    batch = [_chunk(chunk_id=f"id-{n}", chunk_text=f"Chunk {n}") for n in range(10)]
    upsert_chunks(table, batch)
    assert table.count_rows() == 10
# ----------------------------------------------------------------------
# delete_by_source_file
# ----------------------------------------------------------------------
def test_delete_by_source_file_removes_chunks(tmp_path: Path):
    """Deleting a source file removes exactly its chunks and no others."""
    table = _make_table(tmp_path)
    for src, cid in [("test.md", "t1"), ("test.md", "t2"), ("other.md", "o1")]:
        upsert_chunks(table, [_chunk(source_file=src, chunk_id=cid)])
    assert table.count_rows() == 3
    assert delete_by_source_file(table, "test.md") == 2
    assert table.count_rows() == 1
def test_delete_by_source_file_nonexistent_returns_zero(tmp_path: Path):
    """Deleting a file with no indexed chunks reports zero removals."""
    table = _make_table(tmp_path)
    removed = delete_by_source_file(table, "does-not-exist.md")
    assert removed == 0
# ----------------------------------------------------------------------
# search_chunks
# ----------------------------------------------------------------------
def test_search_chunks_with_directory_filter(tmp_path: Path):
    """directory_filter restricts hits to the named directories."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(source_file="n.md", source_directory="Notes", chunk_id="n1")])
    upsert_chunks(tbl, [_chunk(source_file="c.md", source_directory="Code", chunk_id="c1")])
    results = search_chunks(
        tbl, [0.0] * 1024, limit=10, directory_filter=["Notes"]
    )
    # Guard against a vacuous pass: all(...) over an empty list is True,
    # so also require that the Notes chunk is actually returned.
    assert results, "expected at least one hit from the Notes directory"
    assert all(r.source_directory == "Notes" for r in results)
def test_search_chunks_with_date_range(tmp_path: Path):
    """date_range keeps only chunks whose date falls inside [from, to]."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="d1", date="2024-01-01")])
    upsert_chunks(tbl, [_chunk(chunk_id="d2", date="2024-03-15")])
    upsert_chunks(tbl, [_chunk(chunk_id="d3", date="2024-06-20")])
    results = search_chunks(
        tbl, [0.0] * 1024, limit=10, date_range={"from": "2024-02-01", "to": "2024-05-31"}
    )
    # Guard against a vacuous pass: an empty result set would previously
    # skip the loop and assert nothing; d2 (2024-03-15) must be returned.
    assert results, "expected the in-range chunk (d2) to be returned"
    for r in results:
        # ISO-8601 dates compare correctly as strings.
        assert "2024-02-01" <= r.date <= "2024-05-31"
def test_search_chunks_with_tags_filter(tmp_path: Path):
    """tags filter (list_contains) returns only chunks carrying the tag."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="t1", tags=["#python", "#testing"])])
    upsert_chunks(tbl, [_chunk(chunk_id="t2", tags=["#javascript"])])
    results = search_chunks(tbl, [0.0] * 1024, limit=10, tags=["#python"])
    # The old `len(results) >= 0` was vacuously true. Assert the filter
    # really matched: the #python chunk comes back, the #javascript-only
    # one does not. (Assumes SearchResult exposes the tags column, as it
    # does source_directory/date — confirm against vector_store.py.)
    assert results, "expected the #python chunk to match"
    assert all("#python" in r.tags for r in results)
# ----------------------------------------------------------------------
# get_stats
# ----------------------------------------------------------------------
def test_get_stats_empty_table(tmp_path: Path):
    """An empty table reports zero docs and zero chunks."""
    stats = get_stats(_make_table(tmp_path))
    assert stats["total_docs"] == 0
    assert stats["total_chunks"] == 0
def test_get_stats_with_data(tmp_path: Path):
    """total_docs counts unique source files; total_chunks counts rows."""
    table = _make_table(tmp_path)
    for src, cid in [("a.md", "a1"), ("a.md", "a2"), ("b.md", "b1")]:
        upsert_chunks(table, [_chunk(source_file=src, chunk_id=cid)])
    stats = get_stats(table)
    assert stats["total_docs"] == 2  # a.md and b.md
    assert stats["total_chunks"] == 3