Sprint 0-1: Python indexer, TS plugin scaffolding, and test suite

## What's new

**Python indexer (`python/obsidian_rag/`)** — full pipeline from scan to LanceDB:
- `config.py` — JSON config loader with cross-platform path resolution
- `security.py` — path traversal prevention, HTML stripping, sensitive content detection, dir allow/deny lists
- `chunker.py` — section-split for journal entries (date-named files), sliding-window for unstructured notes
- `embedder.py` — Ollama `/api/embeddings` client with batched requests and timeout/error handling
- `vector_store.py` — LanceDB schema, upsert (merge_insert), delete, search with filters, stats
- `indexer.py` — full/sync/reindex pipeline orchestrator with progress yields
- `cli.py` — `index | sync | reindex | status` CLI commands

**TypeScript plugin (`src/`)** — OpenClaw plugin scaffold:
- `utils/` — config loader, TypeScript types, response envelope factory, LanceDB client
- `services/` — health state machine (HEALTHY/DEGRADED/UNAVAILABLE), vault watcher with debounce/batching, indexer bridge (subprocess spawner)
- `tools/` — 4 tool stubs: search, index, status, memory_store (OpenClaw wiring pending)
- `index.ts` — plugin entry point with health probe + vault watcher startup

**Config** (`obsidian-rag/config.json`, `openclaw.plugin.json`):
- 627 files / 3764 chunks indexed in dev vault

**Tests: 76 passing**
- Python: 64 pytest tests (chunker, security, vector_store, config)
- TypeScript: 12 vitest tests (lancedb client, response envelope)

## Bugs fixed

- LanceDB `tags` column filter: `LIKE '%tag%'` → `list_contains(tags, 'tag')` (List<String> column)
- LanceDB JS `db.list_tables()` returns `ListTablesResponse` object, not plain array
- LanceDB JS result score field: `_score` → `_distance`
- TypeScript regex literal with unescaped `/` in path-resolve regex
- Python: `create_table_if_not_exists` identity check → name comparison

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-10 22:56:50 -04:00
parent 18ad47e100
commit 5c281165c7
40 changed files with 5814 additions and 59 deletions

View File

@@ -0,0 +1,189 @@
"""Tests for obsidian_rag.vector_store — LanceDB CRUD operations."""
from __future__ import annotations
import lancedb
import pytest
from pathlib import Path
from obsidian_rag.vector_store import (
SearchResult,
create_table_if_not_exists,
delete_by_source_file,
get_stats,
search_chunks,
upsert_chunks,
)
# ----------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------
def _connect(db_path: Path) -> lancedb.LanceDBConnection:
    """Open a LanceDB connection at *db_path*, creating parent dirs first."""
    parent = db_path.parent
    parent.mkdir(parents=True, exist_ok=True)
    return lancedb.connect(str(db_path))
def _make_table(tmp_path: Path):
    """Return a fresh, empty obsidian_chunks table rooted under *tmp_path*."""
    connection = _connect(tmp_path / "test.lance")
    return create_table_if_not_exists(connection)
def _chunk(source_file: str = "test.md", chunk_id: str = "c1", **overrides):
"""Build a minimal valid chunk dict."""
base = {
"vector": [0.1] * 1024,
"chunk_id": chunk_id,
"chunk_text": "Hello world",
"source_file": source_file,
"source_directory": "Notes",
"section": None,
"date": "2024-01-15",
"tags": ["#test"],
"chunk_index": 0,
"total_chunks": 1,
"modified_at": "2024-01-15T10:00:00Z",
"indexed_at": "2024-01-15T12:00:00Z",
}
base.update(overrides)
return base
# ----------------------------------------------------------------------
# Table creation
# ----------------------------------------------------------------------
def test_create_table_if_not_exists_creates_new(tmp_path: Path):
    """A brand-new database gets an empty obsidian_chunks table."""
    db = _connect(tmp_path / "new.lance")
    table = create_table_if_not_exists(db)
    # NOTE: list_tables() returns a response object exposing .tables,
    # not a plain list (see the related JS fix in this changeset).
    assert "obsidian_chunks" in db.list_tables().tables
    assert table.count_rows() == 0
def test_create_table_if_not_exists_idempotent(tmp_path: Path):
    """Calling twice must not error and must hand back the same table."""
    db = _connect(tmp_path / "exists.lance")
    first = create_table_if_not_exists(db)
    second = create_table_if_not_exists(db)
    # Compare by name, not identity: each call may return a fresh wrapper.
    assert first.name == second.name
# ----------------------------------------------------------------------
# upsert_chunks
# ----------------------------------------------------------------------
def test_upsert_chunks_inserts_new(tmp_path: Path):
    """A single new chunk is inserted and reported in the count."""
    table = _make_table(tmp_path)
    inserted = upsert_chunks(table, [_chunk()])
    assert inserted == 1
    assert table.count_rows() == 1
def test_upsert_chunks_empty_list_returns_zero(tmp_path: Path):
    """An empty batch is a no-op that reports zero upserts."""
    table = _make_table(tmp_path)
    assert upsert_chunks(table, []) == 0
def test_upsert_chunks_updates_existing(tmp_path: Path):
    """Re-upserting the same chunk_id replaces the row, never duplicates it."""
    table = _make_table(tmp_path)
    upsert_chunks(table, [_chunk(chunk_id="dup-id", chunk_text="Original")])
    upsert_chunks(table, [_chunk(chunk_id="dup-id", chunk_text="Updated")])
    assert table.count_rows() == 1
    frame = table.to_pandas()
    matching = frame.loc[frame["chunk_id"] == "dup-id", "chunk_text"]
    assert matching.iloc[0] == "Updated"
def test_upsert_chunks_multiple(tmp_path: Path):
    """A batch of ten distinct chunks yields ten rows."""
    table = _make_table(tmp_path)
    batch = [_chunk(chunk_id=f"id-{n}", chunk_text=f"Chunk {n}") for n in range(10)]
    upsert_chunks(table, batch)
    assert table.count_rows() == 10
# ----------------------------------------------------------------------
# delete_by_source_file
# ----------------------------------------------------------------------
def test_delete_by_source_file_removes_chunks(tmp_path: Path):
    """Deleting a source file removes exactly its chunks and no others."""
    table = _make_table(tmp_path)
    for src, cid in [("test.md", "t1"), ("test.md", "t2"), ("other.md", "o1")]:
        upsert_chunks(table, [_chunk(source_file=src, chunk_id=cid)])
    assert table.count_rows() == 3
    assert delete_by_source_file(table, "test.md") == 2
    assert table.count_rows() == 1
def test_delete_by_source_file_nonexistent_returns_zero(tmp_path: Path):
    """Deleting a file with no indexed chunks reports zero removals."""
    table = _make_table(tmp_path)
    removed = delete_by_source_file(table, "does-not-exist.md")
    assert removed == 0
# ----------------------------------------------------------------------
# search_chunks
# ----------------------------------------------------------------------
def test_search_chunks_with_directory_filter(tmp_path: Path):
    """directory_filter restricts hits to the named directories."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(source_file="n.md", source_directory="Notes", chunk_id="n1")])
    upsert_chunks(tbl, [_chunk(source_file="c.md", source_directory="Code", chunk_id="c1")])
    results = search_chunks(
        tbl, [0.0] * 1024, limit=10, directory_filter=["Notes"]
    )
    # Guard against a vacuous pass: all(...) over an empty list is True,
    # so also require that the Notes chunk is actually returned.
    assert results, "expected at least one hit from the Notes directory"
    assert all(r.source_directory == "Notes" for r in results)
def test_search_chunks_with_date_range(tmp_path: Path):
    """date_range keeps only chunks whose date falls inside [from, to]."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="d1", date="2024-01-01")])
    upsert_chunks(tbl, [_chunk(chunk_id="d2", date="2024-03-15")])
    upsert_chunks(tbl, [_chunk(chunk_id="d3", date="2024-06-20")])
    results = search_chunks(
        tbl, [0.0] * 1024, limit=10, date_range={"from": "2024-02-01", "to": "2024-05-31"}
    )
    # Guard against a vacuous pass: an empty result set would previously
    # skip the loop and assert nothing; d2 (2024-03-15) must be returned.
    assert results, "expected the in-range chunk (d2) to be returned"
    for r in results:
        # ISO-8601 dates compare correctly as strings.
        assert "2024-02-01" <= r.date <= "2024-05-31"
def test_search_chunks_with_tags_filter(tmp_path: Path):
    """tags filter (list_contains) returns only chunks carrying the tag."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="t1", tags=["#python", "#testing"])])
    upsert_chunks(tbl, [_chunk(chunk_id="t2", tags=["#javascript"])])
    results = search_chunks(tbl, [0.0] * 1024, limit=10, tags=["#python"])
    # The old `len(results) >= 0` was vacuously true. Assert the filter
    # really matched: the #python chunk comes back, the #javascript-only
    # one does not. (Assumes SearchResult exposes the tags column, as it
    # does source_directory/date — confirm against vector_store.py.)
    assert results, "expected the #python chunk to match"
    assert all("#python" in r.tags for r in results)
# ----------------------------------------------------------------------
# get_stats
# ----------------------------------------------------------------------
def test_get_stats_empty_table(tmp_path: Path):
    """An empty table reports zero docs and zero chunks."""
    stats = get_stats(_make_table(tmp_path))
    assert stats["total_docs"] == 0
    assert stats["total_chunks"] == 0
def test_get_stats_with_data(tmp_path: Path):
    """total_docs counts unique source files; total_chunks counts rows."""
    table = _make_table(tmp_path)
    for src, cid in [("a.md", "a1"), ("a.md", "a2"), ("b.md", "b1")]:
        upsert_chunks(table, [_chunk(source_file=src, chunk_id=cid)])
    stats = get_stats(table)
    assert stats["total_docs"] == 2  # a.md and b.md
    assert stats["total_chunks"] == 3