## What's new **Python indexer (`python/obsidian_rag/`)** — full pipeline from scan to LanceDB: - `config.py` — JSON config loader with cross-platform path resolution - `security.py` — path traversal prevention, HTML stripping, sensitive content detection, dir allow/deny lists - `chunker.py` — section-split for journal entries (date-named files), sliding-window for unstructured notes - `embedder.py` — Ollama `/api/embeddings` client with batched requests and timeout/error handling - `vector_store.py` — LanceDB schema, upsert (merge_insert), delete, search with filters, stats - `indexer.py` — full/sync/reindex pipeline orchestrator with progress yields - `cli.py` — `index | sync | reindex | status` CLI commands **TypeScript plugin (`src/`)** — OpenClaw plugin scaffold: - `utils/` — config loader, TypeScript types, response envelope factory, LanceDB client - `services/` — health state machine (HEALTHY/DEGRADED/UNAVAILABLE), vault watcher with debounce/batching, indexer bridge (subprocess spawner) - `tools/` — 4 tool stubs: search, index, status, memory_store (OpenClaw wiring pending) - `index.ts` — plugin entry point with health probe + vault watcher startup **Config** (`obsidian-rag/config.json`, `openclaw.plugin.json`): - 627 files / 3764 chunks indexed in dev vault **Tests: 76 passing** - Python: 64 pytest tests (chunker, security, vector_store, config) - TypeScript: 12 vitest tests (lancedb client, response envelope) ## Bugs fixed - LanceDB `tags` column filter: `LIKE '%tag%'` → `list_contains(tags, 'tag')` (List<String> column) - LanceDB JS `db.list_tables()` returns `ListTablesResponse` object, not plain array - LanceDB JS result score field: `_score` → `_distance` - TypeScript regex literal with unescaped `/` in path-resolve regex - Python: `create_table_if_not_exists` identity check → name comparison Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
"""Tests for obsidian_rag.vector_store — LanceDB CRUD operations."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import lancedb
|
|
import pytest
|
|
from pathlib import Path
|
|
|
|
from obsidian_rag.vector_store import (
|
|
SearchResult,
|
|
create_table_if_not_exists,
|
|
delete_by_source_file,
|
|
get_stats,
|
|
search_chunks,
|
|
upsert_chunks,
|
|
)
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Helpers
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
def _connect(db_path: Path) -> lancedb.LanceDBConnection:
    """Open a LanceDB connection at *db_path*, creating parent dirs as needed."""
    db_path.parent.mkdir(parents=True, exist_ok=True)
    return lancedb.connect(str(db_path))


def _make_table(tmp_path: Path):
    """Create a fresh ``obsidian_chunks`` table inside *tmp_path* for a test."""
    db = _connect(tmp_path / "test.lance")
    return create_table_if_not_exists(db)


def _chunk(source_file: str = "test.md", chunk_id: str = "c1", **overrides):
|
|
"""Build a minimal valid chunk dict."""
|
|
base = {
|
|
"vector": [0.1] * 1024,
|
|
"chunk_id": chunk_id,
|
|
"chunk_text": "Hello world",
|
|
"source_file": source_file,
|
|
"source_directory": "Notes",
|
|
"section": None,
|
|
"date": "2024-01-15",
|
|
"tags": ["#test"],
|
|
"chunk_index": 0,
|
|
"total_chunks": 1,
|
|
"modified_at": "2024-01-15T10:00:00Z",
|
|
"indexed_at": "2024-01-15T12:00:00Z",
|
|
}
|
|
base.update(overrides)
|
|
return base
|
|
|
|
|
|
# ----------------------------------------------------------------------
# Table creation
# ----------------------------------------------------------------------


def test_create_table_if_not_exists_creates_new(tmp_path: Path):
    """A fresh database gets an empty ``obsidian_chunks`` table."""
    db = _connect(tmp_path / "new.lance")
    tbl = create_table_if_not_exists(db)
    # NOTE(review): `list_tables().tables` mirrors the LanceDB JS API; confirm
    # the installed Python client exposes it (classic clients use table_names()).
    assert "obsidian_chunks" in db.list_tables().tables
    assert tbl.count_rows() == 0


def test_create_table_if_not_exists_idempotent(tmp_path: Path):
    """A second call reuses the existing table instead of failing."""
    db = _connect(tmp_path / "exists.lance")
    tbl1 = create_table_if_not_exists(db)
    tbl2 = create_table_if_not_exists(db)
    # Compare by name, not identity: each call may return a distinct handle
    # to the same underlying table.
    assert tbl1.name == tbl2.name


# ----------------------------------------------------------------------
# upsert_chunks
# ----------------------------------------------------------------------


def test_upsert_chunks_inserts_new(tmp_path: Path):
    """Upserting one chunk into an empty table inserts it and reports 1."""
    tbl = _make_table(tmp_path)
    count = upsert_chunks(tbl, [_chunk()])
    assert count == 1
    assert tbl.count_rows() == 1


def test_upsert_chunks_empty_list_returns_zero(tmp_path: Path):
    """An empty batch is a no-op that reports 0 rows written."""
    tbl = _make_table(tmp_path)
    assert upsert_chunks(tbl, []) == 0


def test_upsert_chunks_updates_existing(tmp_path: Path):
    """Re-upserting the same chunk_id replaces the row rather than duplicating it."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="dup-id", chunk_text="Original")])
    upsert_chunks(tbl, [_chunk(chunk_id="dup-id", chunk_text="Updated")])
    assert tbl.count_rows() == 1  # merged on chunk_id, not appended
    df = tbl.to_pandas()
    assert df[df["chunk_id"] == "dup-id"]["chunk_text"].iloc[0] == "Updated"


def test_upsert_chunks_multiple(tmp_path: Path):
    """A batch of 10 chunks with distinct ids yields 10 rows."""
    tbl = _make_table(tmp_path)
    chunks = [_chunk(chunk_id=f"id-{i}", chunk_text=f"Chunk {i}") for i in range(10)]
    upsert_chunks(tbl, chunks)
    assert tbl.count_rows() == 10


# ----------------------------------------------------------------------
# delete_by_source_file
# ----------------------------------------------------------------------


def test_delete_by_source_file_removes_chunks(tmp_path: Path):
    """Deleting by source file removes all of that file's chunks — and only those."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(source_file="test.md", chunk_id="t1")])
    upsert_chunks(tbl, [_chunk(source_file="test.md", chunk_id="t2")])
    upsert_chunks(tbl, [_chunk(source_file="other.md", chunk_id="o1")])
    assert tbl.count_rows() == 3

    deleted = delete_by_source_file(tbl, "test.md")
    assert deleted == 2
    assert tbl.count_rows() == 1  # other.md's chunk survives


def test_delete_by_source_file_nonexistent_returns_zero(tmp_path: Path):
    """Deleting a file with no indexed chunks reports 0 and raises nothing."""
    tbl = _make_table(tmp_path)
    deleted = delete_by_source_file(tbl, "does-not-exist.md")
    assert deleted == 0


# ----------------------------------------------------------------------
# search_chunks
# ----------------------------------------------------------------------


def test_search_chunks_with_directory_filter(tmp_path: Path):
    """``directory_filter`` restricts hits to the requested directories."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(source_file="n.md", source_directory="Notes", chunk_id="n1")])
    upsert_chunks(tbl, [_chunk(source_file="c.md", source_directory="Code", chunk_id="c1")])

    results = search_chunks(
        tbl, [0.0] * 1024, limit=10, directory_filter=["Notes"]
    )
    # Exactly one of the two chunks lives in "Notes".  Without this count the
    # all() below would pass vacuously on an empty result list.
    assert len(results) == 1
    assert all(r.source_directory == "Notes" for r in results)


def test_search_chunks_with_date_range(tmp_path: Path):
    """``date_range`` keeps only chunks whose date falls inside [from, to]."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="d1", date="2024-01-01")])
    upsert_chunks(tbl, [_chunk(chunk_id="d2", date="2024-03-15")])
    upsert_chunks(tbl, [_chunk(chunk_id="d3", date="2024-06-20")])

    results = search_chunks(
        tbl, [0.0] * 1024, limit=10, date_range={"from": "2024-02-01", "to": "2024-05-31"}
    )
    # Only d2 (2024-03-15) lies in the window.  Without this count an empty
    # result set would make the loop below pass vacuously.
    assert len(results) == 1
    for r in results:
        assert "2024-02-01" <= r.date <= "2024-05-31"


def test_search_chunks_with_tags_filter(tmp_path: Path):
    """``tags`` filter keeps only chunks containing the requested tag."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="t1", tags=["#python", "#testing"])])
    upsert_chunks(tbl, [_chunk(chunk_id="t2", tags=["#javascript"])])

    results = search_chunks(tbl, [0.0] * 1024, limit=10, tags=["#python"])
    # The original asserted len(results) >= 0, which can never fail.  Only the
    # "#python"-tagged chunk should survive the list_contains(tags, ...) filter.
    assert len(results) == 1


# ----------------------------------------------------------------------
# get_stats
# ----------------------------------------------------------------------


def test_get_stats_empty_table(tmp_path: Path):
    """An empty table reports zero docs and zero chunks."""
    tbl = _make_table(tmp_path)
    stats = get_stats(tbl)
    assert stats["total_docs"] == 0
    assert stats["total_chunks"] == 0


def test_get_stats_with_data(tmp_path: Path):
    """``total_docs`` counts unique source files; ``total_chunks`` counts rows."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(source_file="a.md", chunk_id="a1")])
    upsert_chunks(tbl, [_chunk(source_file="a.md", chunk_id="a2")])
    upsert_chunks(tbl, [_chunk(source_file="b.md", chunk_id="b1")])

    stats = get_stats(tbl)
    assert stats["total_docs"] == 2  # 2 unique files: a.md, b.md
    assert stats["total_chunks"] == 3