"""Tests for obsidian_rag.vector_store — LanceDB CRUD operations.""" from __future__ import annotations import lancedb import pytest from pathlib import Path from obsidian_rag.vector_store import ( SearchResult, create_table_if_not_exists, delete_by_source_file, get_stats, search_chunks, upsert_chunks, ) # ---------------------------------------------------------------------- # Helpers # ---------------------------------------------------------------------- def _connect(db_path: Path) -> lancedb.LanceDBConnection: """Create a LanceDB connection for testing.""" db_path.parent.mkdir(parents=True, exist_ok=True) return lancedb.connect(str(db_path)) def _make_table(tmp_path: Path): """Create a fresh obsidian_chunks table for testing.""" db = _connect(tmp_path / "test.lance") tbl = create_table_if_not_exists(db) return tbl def _chunk(source_file: str = "test.md", chunk_id: str = "c1", **overrides): """Build a minimal valid chunk dict.""" base = { "vector": [0.1] * 1024, "chunk_id": chunk_id, "chunk_text": "Hello world", "source_file": source_file, "source_directory": "Notes", "section": None, "date": "2024-01-15", "tags": ["#test"], "chunk_index": 0, "total_chunks": 1, "modified_at": "2024-01-15T10:00:00Z", "indexed_at": "2024-01-15T12:00:00Z", } base.update(overrides) return base # ---------------------------------------------------------------------- # Table creation # ---------------------------------------------------------------------- def test_create_table_if_not_exists_creates_new(tmp_path: Path): db = _connect(tmp_path / "new.lance") tbl = create_table_if_not_exists(db) assert "obsidian_chunks" in db.list_tables().tables assert tbl.count_rows() == 0 def test_create_table_if_not_exists_idempotent(tmp_path: Path): db = _connect(tmp_path / "exists.lance") tbl1 = create_table_if_not_exists(db) tbl2 = create_table_if_not_exists(db) assert tbl1.name == tbl2.name # same underlying table # ---------------------------------------------------------------------- # 
# upsert_chunks
# ----------------------------------------------------------------------


def test_upsert_chunks_inserts_new(tmp_path: Path):
    """Upserting one chunk reports one row written and stores one row."""
    tbl = _make_table(tmp_path)
    count = upsert_chunks(tbl, [_chunk()])
    assert count == 1
    assert tbl.count_rows() == 1


def test_upsert_chunks_empty_list_returns_zero(tmp_path: Path):
    """An empty batch is accepted and reported as zero rows."""
    tbl = _make_table(tmp_path)
    assert upsert_chunks(tbl, []) == 0


def test_upsert_chunks_updates_existing(tmp_path: Path):
    """Re-upserting a known ``chunk_id`` replaces the row instead of adding one."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="dup-id", chunk_text="Original")])
    upsert_chunks(tbl, [_chunk(chunk_id="dup-id", chunk_text="Updated")])

    assert tbl.count_rows() == 1
    df = tbl.to_pandas()
    assert df[df["chunk_id"] == "dup-id"]["chunk_text"].iloc[0] == "Updated"


def test_upsert_chunks_multiple(tmp_path: Path):
    """A batch of ten distinct chunk ids produces ten rows."""
    tbl = _make_table(tmp_path)
    chunks = [_chunk(chunk_id=f"id-{i}", chunk_text=f"Chunk {i}") for i in range(10)]
    upsert_chunks(tbl, chunks)
    assert tbl.count_rows() == 10


# ----------------------------------------------------------------------
# delete_by_source_file
# ----------------------------------------------------------------------


def test_delete_by_source_file_removes_chunks(tmp_path: Path):
    """Deleting by source file removes exactly that file's chunks."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(source_file="test.md", chunk_id="t1")])
    upsert_chunks(tbl, [_chunk(source_file="test.md", chunk_id="t2")])
    upsert_chunks(tbl, [_chunk(source_file="other.md", chunk_id="o1")])
    assert tbl.count_rows() == 3

    deleted = delete_by_source_file(tbl, "test.md")

    assert deleted == 2
    assert tbl.count_rows() == 1
    # Row counts alone cannot tell *which* rows went away; confirm the
    # survivor is the chunk that belongs to the other file.
    assert tbl.to_pandas()["source_file"].tolist() == ["other.md"]


def test_delete_by_source_file_nonexistent_returns_zero(tmp_path: Path):
    """Deleting a file with no indexed chunks reports zero deletions."""
    tbl = _make_table(tmp_path)
    deleted = delete_by_source_file(tbl, "does-not-exist.md")
    assert deleted == 0


# ----------------------------------------------------------------------
# search_chunks
# ----------------------------------------------------------------------

# Query vector shared by the search tests.  It is deliberately non-zero:
# an all-zero vector has no direction, so how it ranks against stored rows
# depends on the distance metric.  NOTE(review): the metric search_chunks
# uses is not visible from this file — confirm if the value matters.
_QUERY_VECTOR = [0.1] * 1024


def test_search_chunks_with_directory_filter(tmp_path: Path):
    """Only chunks from the requested directory are returned."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(source_file="n.md", source_directory="Notes", chunk_id="n1")])
    upsert_chunks(tbl, [_chunk(source_file="c.md", source_directory="Code", chunk_id="c1")])

    results = search_chunks(
        tbl, _QUERY_VECTOR, limit=10, directory_filter=["Notes"]
    )

    # Exactly one stored chunk lives in "Notes".  Checking the count first
    # keeps the all() below from passing vacuously on an empty result.
    assert len(results) == 1
    assert all(r.source_directory == "Notes" for r in results)


def test_search_chunks_with_date_range(tmp_path: Path):
    """Only chunks dated inside the inclusive range are returned."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="d1", date="2024-01-01")])
    upsert_chunks(tbl, [_chunk(chunk_id="d2", date="2024-03-15")])
    upsert_chunks(tbl, [_chunk(chunk_id="d3", date="2024-06-20")])

    results = search_chunks(
        tbl,
        _QUERY_VECTOR,
        limit=10,
        date_range={"from": "2024-02-01", "to": "2024-05-31"},
    )

    # Only the 2024-03-15 chunk falls inside the window; an empty result
    # would otherwise skip the loop and pass without checking anything.
    assert len(results) == 1
    for r in results:
        # ISO-8601 date strings order lexicographically, so plain string
        # comparison is a valid range check.
        assert "2024-02-01" <= r.date <= "2024-05-31"


def test_search_chunks_with_tags_filter(tmp_path: Path):
    """Only chunks carrying a requested tag are returned."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="t1", tags=["#python", "#testing"])])
    upsert_chunks(tbl, [_chunk(chunk_id="t2", tags=["#javascript"])])

    results = search_chunks(tbl, _QUERY_VECTOR, limit=10, tags=["#python"])

    # One of the two stored chunks is tagged "#python", so a working filter
    # yields exactly one hit.  (The previous `len(results) >= 0` check could
    # never fail.)  The fields of SearchResult beyond those used elsewhere in
    # this module are not visible here, so only the count is asserted.
    assert len(results) == 1


# ----------------------------------------------------------------------
# get_stats
# ----------------------------------------------------------------------


def test_get_stats_empty_table(tmp_path: Path):
    """An empty table reports zero documents and zero chunks."""
    tbl = _make_table(tmp_path)
    stats = get_stats(tbl)
    assert stats["total_docs"] == 0
    assert stats["total_chunks"] == 0


def test_get_stats_with_data(tmp_path: Path):
    """Documents are counted per unique source file, chunks per row."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(source_file="a.md", chunk_id="a1")])
    upsert_chunks(tbl, [_chunk(source_file="a.md", chunk_id="a2")])
    upsert_chunks(tbl, [_chunk(source_file="b.md", chunk_id="b1")])

    stats = get_stats(tbl)

    assert stats["total_docs"] == 2  # 2 unique files
    assert stats["total_chunks"] == 3