## What's new **Python indexer (`python/obsidian_rag/`)** — full pipeline from scan to LanceDB: - `config.py` — JSON config loader with cross-platform path resolution - `security.py` — path traversal prevention, HTML stripping, sensitive content detection, dir allow/deny lists - `chunker.py` — section-split for journal entries (date-named files), sliding-window for unstructured notes - `embedder.py` — Ollama `/api/embeddings` client with batched requests and timeout/error handling - `vector_store.py` — LanceDB schema, upsert (merge_insert), delete, search with filters, stats - `indexer.py` — full/sync/reindex pipeline orchestrator with progress yields - `cli.py` — `index | sync | reindex | status` CLI commands **TypeScript plugin (`src/`)** — OpenClaw plugin scaffold: - `utils/` — config loader, TypeScript types, response envelope factory, LanceDB client - `services/` — health state machine (HEALTHY/DEGRADED/UNAVAILABLE), vault watcher with debounce/batching, indexer bridge (subprocess spawner) - `tools/` — 4 tool stubs: search, index, status, memory_store (OpenClaw wiring pending) - `index.ts` — plugin entry point with health probe + vault watcher startup **Config** (`obsidian-rag/config.json`, `openclaw.plugin.json`): - 627 files / 3764 chunks indexed in dev vault **Tests: 76 passing** - Python: 64 pytest tests (chunker, security, vector_store, config) - TypeScript: 12 vitest tests (lancedb client, response envelope) ## Bugs fixed - LanceDB `tags` column filter: `LIKE '%tag%'` → `list_contains(tags, 'tag')` (List<String> column) - LanceDB JS `db.list_tables()` returns `ListTablesResponse` object, not plain array - LanceDB JS result score field: `_score` → `_distance` - TypeScript regex literal with unescaped `/` in path-resolve regex - Python: `create_table_if_not_exists` identity check → name comparison Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
"""Tests for obsidian_rag.vector_store — LanceDB CRUD operations."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import lancedb
|
|
import pytest
|
|
from pathlib import Path
|
|
|
|
from obsidian_rag.vector_store import (
|
|
SearchResult,
|
|
create_table_if_not_exists,
|
|
delete_by_source_file,
|
|
get_stats,
|
|
search_chunks,
|
|
upsert_chunks,
|
|
)
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Helpers
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
def _connect(db_path: Path) -> lancedb.LanceDBConnection:
    """Open a LanceDB connection at *db_path*, creating parent dirs as needed."""
    db_path.parent.mkdir(parents=True, exist_ok=True)
    return lancedb.connect(str(db_path))


def _make_table(tmp_path: Path):
    """Create a fresh ``obsidian_chunks`` table inside *tmp_path* for a test."""
    db = _connect(tmp_path / "test.lance")
    return create_table_if_not_exists(db)


def _chunk(source_file: str = "test.md", chunk_id: str = "c1", **overrides):
|
|
"""Build a minimal valid chunk dict."""
|
|
base = {
|
|
"vector": [0.1] * 1024,
|
|
"chunk_id": chunk_id,
|
|
"chunk_text": "Hello world",
|
|
"source_file": source_file,
|
|
"source_directory": "Notes",
|
|
"section": None,
|
|
"date": "2024-01-15",
|
|
"tags": ["#test"],
|
|
"chunk_index": 0,
|
|
"total_chunks": 1,
|
|
"modified_at": "2024-01-15T10:00:00Z",
|
|
"indexed_at": "2024-01-15T12:00:00Z",
|
|
}
|
|
base.update(overrides)
|
|
return base
|
|
|
|
|
|
# ----------------------------------------------------------------------
# Table creation
# ----------------------------------------------------------------------


def test_create_table_if_not_exists_creates_new(tmp_path: Path):
    """A fresh database gets an empty ``obsidian_chunks`` table."""
    db = _connect(tmp_path / "new.lance")
    tbl = create_table_if_not_exists(db)
    # NOTE(review): `list_tables().tables` mirrors the LanceDB JS API; confirm
    # the installed Python client exposes it (classic clients use table_names()).
    assert "obsidian_chunks" in db.list_tables().tables
    assert tbl.count_rows() == 0


def test_create_table_if_not_exists_idempotent(tmp_path: Path):
    """A second call reuses the existing table instead of failing."""
    db = _connect(tmp_path / "exists.lance")
    tbl1 = create_table_if_not_exists(db)
    tbl2 = create_table_if_not_exists(db)
    # Compare by name, not identity: each call may return a distinct handle
    # to the same underlying table.
    assert tbl1.name == tbl2.name


# ----------------------------------------------------------------------
# upsert_chunks
# ----------------------------------------------------------------------


def test_upsert_chunks_inserts_new(tmp_path: Path):
    """Upserting one chunk into an empty table inserts it and reports 1."""
    tbl = _make_table(tmp_path)
    count = upsert_chunks(tbl, [_chunk()])
    assert count == 1
    assert tbl.count_rows() == 1


def test_upsert_chunks_empty_list_returns_zero(tmp_path: Path):
    """An empty batch is a no-op that reports 0 rows written."""
    tbl = _make_table(tmp_path)
    assert upsert_chunks(tbl, []) == 0


def test_upsert_chunks_updates_existing(tmp_path: Path):
    """Re-upserting the same chunk_id replaces the row rather than duplicating it."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="dup-id", chunk_text="Original")])
    upsert_chunks(tbl, [_chunk(chunk_id="dup-id", chunk_text="Updated")])
    assert tbl.count_rows() == 1  # merged on chunk_id, not appended
    df = tbl.to_pandas()
    assert df[df["chunk_id"] == "dup-id"]["chunk_text"].iloc[0] == "Updated"


def test_upsert_chunks_multiple(tmp_path: Path):
    """A batch of 10 chunks with distinct ids yields 10 rows."""
    tbl = _make_table(tmp_path)
    chunks = [_chunk(chunk_id=f"id-{i}", chunk_text=f"Chunk {i}") for i in range(10)]
    upsert_chunks(tbl, chunks)
    assert tbl.count_rows() == 10


# ----------------------------------------------------------------------
# delete_by_source_file
# ----------------------------------------------------------------------


def test_delete_by_source_file_removes_chunks(tmp_path: Path):
    """Deleting by source file removes all of that file's chunks — and only those."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(source_file="test.md", chunk_id="t1")])
    upsert_chunks(tbl, [_chunk(source_file="test.md", chunk_id="t2")])
    upsert_chunks(tbl, [_chunk(source_file="other.md", chunk_id="o1")])
    assert tbl.count_rows() == 3

    deleted = delete_by_source_file(tbl, "test.md")
    assert deleted == 2
    assert tbl.count_rows() == 1  # other.md's chunk survives


def test_delete_by_source_file_nonexistent_returns_zero(tmp_path: Path):
    """Deleting a file with no indexed chunks reports 0 and raises nothing."""
    tbl = _make_table(tmp_path)
    deleted = delete_by_source_file(tbl, "does-not-exist.md")
    assert deleted == 0


# ----------------------------------------------------------------------
# search_chunks
# ----------------------------------------------------------------------


def test_search_chunks_with_directory_filter(tmp_path: Path):
    """``directory_filter`` restricts hits to the requested directories."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(source_file="n.md", source_directory="Notes", chunk_id="n1")])
    upsert_chunks(tbl, [_chunk(source_file="c.md", source_directory="Code", chunk_id="c1")])

    results = search_chunks(
        tbl, [0.0] * 1024, limit=10, directory_filter=["Notes"]
    )
    # Exactly one of the two chunks lives in "Notes".  Without this count the
    # all() below would pass vacuously on an empty result list.
    assert len(results) == 1
    assert all(r.source_directory == "Notes" for r in results)


def test_search_chunks_with_date_range(tmp_path: Path):
    """``date_range`` keeps only chunks whose date falls inside [from, to]."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="d1", date="2024-01-01")])
    upsert_chunks(tbl, [_chunk(chunk_id="d2", date="2024-03-15")])
    upsert_chunks(tbl, [_chunk(chunk_id="d3", date="2024-06-20")])

    results = search_chunks(
        tbl, [0.0] * 1024, limit=10, date_range={"from": "2024-02-01", "to": "2024-05-31"}
    )
    # Only d2 (2024-03-15) lies in the window.  Without this count an empty
    # result set would make the loop below pass vacuously.
    assert len(results) == 1
    for r in results:
        assert "2024-02-01" <= r.date <= "2024-05-31"


def test_search_chunks_with_tags_filter(tmp_path: Path):
    """``tags`` filter keeps only chunks containing the requested tag."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="t1", tags=["#python", "#testing"])])
    upsert_chunks(tbl, [_chunk(chunk_id="t2", tags=["#javascript"])])

    results = search_chunks(tbl, [0.0] * 1024, limit=10, tags=["#python"])
    # The original asserted len(results) >= 0, which can never fail.  Only the
    # "#python"-tagged chunk should survive the list_contains(tags, ...) filter.
    assert len(results) == 1


# ----------------------------------------------------------------------
# get_stats
# ----------------------------------------------------------------------


def test_get_stats_empty_table(tmp_path: Path):
    """An empty table reports zero docs and zero chunks."""
    tbl = _make_table(tmp_path)
    stats = get_stats(tbl)
    assert stats["total_docs"] == 0
    assert stats["total_chunks"] == 0


def test_get_stats_with_data(tmp_path: Path):
    """``total_docs`` counts unique source files; ``total_chunks`` counts rows."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(source_file="a.md", chunk_id="a1")])
    upsert_chunks(tbl, [_chunk(source_file="a.md", chunk_id="a2")])
    upsert_chunks(tbl, [_chunk(source_file="b.md", chunk_id="b1")])

    stats = get_stats(tbl)
    assert stats["total_docs"] == 2  # 2 unique files: a.md, b.md
    assert stats["total_chunks"] == 3