Sprint 0-1: Python indexer, TS plugin scaffolding, and test suite
## What's new **Python indexer (`python/obsidian_rag/`)** — full pipeline from scan to LanceDB: - `config.py` — JSON config loader with cross-platform path resolution - `security.py` — path traversal prevention, HTML stripping, sensitive content detection, dir allow/deny lists - `chunker.py` — section-split for journal entries (date-named files), sliding-window for unstructured notes - `embedder.py` — Ollama `/api/embeddings` client with batched requests and timeout/error handling - `vector_store.py` — LanceDB schema, upsert (merge_insert), delete, search with filters, stats - `indexer.py` — full/sync/reindex pipeline orchestrator with progress yields - `cli.py` — `index | sync | reindex | status` CLI commands **TypeScript plugin (`src/`)** — OpenClaw plugin scaffold: - `utils/` — config loader, TypeScript types, response envelope factory, LanceDB client - `services/` — health state machine (HEALTHY/DEGRADED/UNAVAILABLE), vault watcher with debounce/batching, indexer bridge (subprocess spawner) - `tools/` — 4 tool stubs: search, index, status, memory_store (OpenClaw wiring pending) - `index.ts` — plugin entry point with health probe + vault watcher startup **Config** (`obsidian-rag/config.json`, `openclaw.plugin.json`): - 627 files / 3764 chunks indexed in dev vault **Tests: 76 passing** - Python: 64 pytest tests (chunker, security, vector_store, config) - TypeScript: 12 vitest tests (lancedb client, response envelope) ## Bugs fixed - LanceDB `tags` column filter: `LIKE '%tag%'` → `list_contains(tags, 'tag')` (List<String> column) - LanceDB JS `db.list_tables()` returns `ListTablesResponse` object, not plain array - LanceDB JS result score field: `_score` → `_distance` - TypeScript regex literal with unescaped `/` in path-resolve regex - Python: `create_table_if_not_exists` identity check → name comparison Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
250
python/tests/unit/test_chunker.py
Normal file
250
python/tests/unit/test_chunker.py
Normal file
@@ -0,0 +1,250 @@
|
||||
"""Tests for obsidian_rag.chunker — section splitting and sliding window."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from obsidian_rag.chunker import (
|
||||
extract_tags,
|
||||
extract_date_from_filename,
|
||||
is_structured_note,
|
||||
parse_frontmatter,
|
||||
split_by_sections,
|
||||
sliding_window_chunks,
|
||||
chunk_file,
|
||||
)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# parse_frontmatter
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_parse_frontmatter_with_yaml():
    """YAML frontmatter is parsed into a dict and stripped from the body."""
    document = """---
title: My Journal
tags: [journal, personal]
---
# Morning

Some content here.
"""
    metadata, remainder = parse_frontmatter(document)
    assert metadata.get("title") == "My Journal"
    # the markdown body survives intact, minus the frontmatter block
    assert "# Morning" in remainder
    assert "Some content" in remainder


def test_parse_frontmatter_without_frontmatter():
    """Documents with no frontmatter yield an empty dict and the full body."""
    document = "# Just a header\n\nSome text without frontmatter."
    metadata, remainder = parse_frontmatter(document)
    assert metadata == {}
    assert "# Just a header" in remainder
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# extract_tags
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_extract_tags_basic():
    """Hashtags, including hyphenated ones, are picked out of free text."""
    found = extract_tags("Hello #world and #python-code is nice")
    assert "#world" in found
    assert "#python-code" in found
    # every returned tag keeps its leading '#'
    assert all(tag.startswith("#") for tag in found)


def test_extract_tags_deduplicates():
    """Repeated hashtags collapse to a single entry each."""
    found = extract_tags("#hello #world #hello #python")
    assert len(found) == 3
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# extract_date_from_filename
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_extract_date_from_filename_iso():
    """ISO-formatted filenames (YYYY-MM-DD.md) yield the date as-is."""
    assert extract_date_from_filename(Path("2024-01-15.md")) == "2024-01-15"


def test_extract_date_from_filename_compact():
    """Compact filenames (YYYYMMDD.md) are normalized to ISO form."""
    assert extract_date_from_filename(Path("20240115.md")) == "2024-01-15"


def test_extract_date_from_filename_no_date():
    """Filenames without a recognizable date return None."""
    assert extract_date_from_filename(Path("my-journal.md")) is None
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# is_structured_note
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_is_structured_note_journal():
    """Date-named files, nested or not, count as structured journal notes."""
    for candidate in (Path("2024-01-15.md"), Path("Journal/2024-02-20.md")):
        assert is_structured_note(candidate) is True


def test_is_structured_note_project():
    """Free-form note names are treated as unstructured."""
    for candidate in (Path("My Project Ideas.md"), Path("shopping-list.md")):
        assert is_structured_note(candidate) is False
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# split_by_sections
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_split_by_sections_multiple():
    """Each markdown header, at any level, starts a new (title, text) section."""
    text = """# Mental Health
Feeling anxious today.

## Work
Project deadline approaching.

### Home
Need to clean the garage.
"""
    result = split_by_sections(text, {})
    titles = [title for title, _ in result]
    assert titles == ["Mental Health", "Work", "Home"]
    # section text holds the body only — the header line itself is excluded
    assert "Feeling anxious today." in result[0][1]


def test_split_by_sections_no_headers():
    """Header-free text yields a single untitled section with everything."""
    result = split_by_sections("Just plain text without any headers at all.", {})
    assert len(result) == 1
    title, text = result[0]
    assert title is None
    assert "Just plain text" in text


def test_split_by_sections_leading_content():
    """Content before the first header lands in an untitled leading section."""
    text = """Some intro text before any header.

# First Section
Content of first.
"""
    result = split_by_sections(text, {})
    title, leading = result[0]
    assert title is None
    assert "Some intro text" in leading
    assert result[1][0] == "First Section"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# sliding_window_chunks
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_sliding_window_basic():
    """1200 words at size 500 / overlap 100 yield several bounded chunks."""
    text = " ".join(f"word{i}" for i in range(1200))
    pieces = sliding_window_chunks(text, chunk_size=500, overlap=100)
    assert len(pieces) >= 2
    assert pieces[0].startswith("word0")  # first window starts at word 0
    # no window exceeds the requested size
    assert all(len(piece.split()) <= 500 for piece in pieces)


def test_sliding_window_overlap():
    """Adjacent chunks should share the overlap region."""
    text = " ".join(f"word{i}" for i in range(1000))
    pieces = sliding_window_chunks(text, chunk_size=500, overlap=100)
    for earlier, later in zip(pieces, pieces[1:]):
        # the tail of one window reappears at the head of the next
        assert earlier.split()[-100:] == later.split()[:100]


def test_sliding_window_empty():
    """Empty input produces no chunks at all."""
    assert sliding_window_chunks("", chunk_size=500, overlap=100) == []


def test_sliding_window_exact_size_produces_two_chunks():
    """With overlap=100, exactly 500 words produces 2 chunks (0-499 and 400-end)."""
    text = " ".join(f"word{i}" for i in range(500))
    pieces = sliding_window_chunks(text, chunk_size=500, overlap=100)
    assert len(pieces) == 2
    assert pieces[0].startswith("word0")
    assert pieces[1].startswith("word400")  # advance = 500 - 100 = 400


def test_sliding_window_small_text():
    """Text much shorter than chunk_size returns a single chunk, unchanged."""
    text = "just a few words"
    pieces = sliding_window_chunks(text, chunk_size=500, overlap=100)
    assert pieces == [text]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# chunk_file integration
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def _mock_config(tmp_path: Path) -> MagicMock:
|
||||
"""Build a minimal mock config pointing at a tmp vault."""
|
||||
cfg = MagicMock()
|
||||
cfg.vault_path = str(tmp_path)
|
||||
cfg.indexing.chunk_size = 500
|
||||
cfg.indexing.chunk_overlap = 100
|
||||
cfg.indexing.file_patterns = ["*.md"]
|
||||
cfg.indexing.deny_dirs = [".obsidian", ".trash", "zzz-Archive", ".git"]
|
||||
cfg.indexing.allow_dirs = []
|
||||
return cfg
|
||||
|
||||
|
||||
def test_chunk_file_structured_journal(tmp_path: Path):
    """A date-named journal file is section-split, with tags/date extracted."""
    vault = tmp_path / "Journal"
    vault.mkdir()
    fpath = vault / "2024-03-15.md"
    fpath.write_text("""# Morning

Felt #anxious about the deadline.

## Work
Finished the report.
""")

    cfg = _mock_config(tmp_path)
    chunks = chunk_file(fpath, fpath.read_text(), "2024-03-15T10:00:00Z", cfg)

    # Journal file → section-split → 2 chunks
    assert len(chunks) == 2
    # NOTE(review): section stored with a '#' prefix here, while
    # split_by_sections returns bare titles ("Mental Health") — presumably
    # chunk_file reformats section names; confirm against chunker.py.
    assert chunks[0].section == "#Morning"
    assert chunks[0].date == "2024-03-15"
    assert "#anxious" in chunks[0].tags or "#anxious" in chunks[1].tags
    # NOTE(review): assumes source_file uses '/' separators — this will fail
    # on Windows if chunk_file emits os.sep paths; verify portability.
    assert chunks[0].source_file.endswith("Journal/2024-03-15.md")


def test_chunk_file_unstructured(tmp_path: Path):
    """A non-journal note falls back to sliding-window chunking."""
    vault = tmp_path / "Notes"
    vault.mkdir()
    fpath = vault / "project-ideas.md"
    fpath.write_text("This is a long note " * 200)  # ~1000 words

    cfg = _mock_config(tmp_path)
    chunks = chunk_file(fpath, fpath.read_text(), "2024-03-15T10:00:00Z", cfg)

    # Unstructured → sliding window → multiple chunks
    assert len(chunks) > 1
    assert all(c.section is None for c in chunks)  # no headers → no sections
    assert chunks[0].chunk_index == 0
|
||||
130
python/tests/unit/test_config.py
Normal file
130
python/tests/unit/test_config.py
Normal file
@@ -0,0 +1,130 @@
|
||||
"""Tests for obsidian_rag.config — loader, path resolution, defaults."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from obsidian_rag.config import (
|
||||
EmbeddingConfig,
|
||||
ObsidianRagConfig,
|
||||
load_config,
|
||||
resolve_vector_db_path,
|
||||
resolve_vault_path,
|
||||
)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Config loading
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_load_config_parses_valid_json(tmp_path: Path):
    """Values present in the JSON file override the dataclass defaults."""
    cfg_file = tmp_path / "config.json"
    payload = {
        "vault_path": "/path/to/vault",
        "embedding": {"model": "custom-model:tag", "dimensions": 512},
        "vector_store": {"path": "/vectors/db"},
    }
    cfg_file.write_text(json.dumps(payload))
    loaded = load_config(cfg_file)
    assert loaded.vault_path == "/path/to/vault"
    assert loaded.embedding.model == "custom-model:tag"
    assert loaded.embedding.dimensions == 512  # overridden


def test_load_config_missing_file_raises(tmp_path: Path):
    """A nonexistent config path raises FileNotFoundError."""
    missing = tmp_path / "nonexistent.json"
    with pytest.raises(FileNotFoundError):
        load_config(missing)


def test_load_config_merges_partial_json(tmp_path: Path):
    """Keys absent from the JSON fall back to the built-in defaults."""
    cfg_file = tmp_path / "config.json"
    cfg_file.write_text(json.dumps({"vault_path": "/custom/vault"}))
    loaded = load_config(cfg_file)
    assert loaded.vault_path == "/custom/vault"
    assert loaded.embedding.base_url == "http://localhost:11434"  # default
    assert loaded.indexing.chunk_size == 500  # default
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# resolve_vault_path
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_resolve_vault_path_absolute():
    """An absolute vault_path is returned verbatim as a Path."""
    config = ObsidianRagConfig(vault_path="/absolute/vault")
    assert resolve_vault_path(config) == Path("/absolute/vault")


def test_resolve_vault_path_relative_defaults_to_project_root():
    """A relative vault_path is anchored at the project root."""
    config = ObsidianRagConfig(vault_path="KnowledgeVault/Default")
    resolved = resolve_vault_path(config)
    # resolves relative to python/obsidian_rag/ → project root
    assert resolved.name == "Default"
    assert resolved.parent.name == "KnowledgeVault"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# resolve_vector_db_path
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_resolve_vector_db_path_string_absolute():
    """VectorStoreConfig stores path as a string; Path objects should be converted first."""
    from obsidian_rag.config import VectorStoreConfig

    # a plain string path — this matches real usage
    config = ObsidianRagConfig(vector_store=VectorStoreConfig(path="/my/vectors.lance"))
    assert resolve_vector_db_path(config) == Path("/my/vectors.lance")
|
||||
|
||||
|
||||
def test_resolve_vector_db_path_string_relative(tmp_path: Path):
    """Relative paths are resolved against the data directory.

    Patches the module-level DEFAULT_CONFIG_DIR to point at tmp_path so the
    resolver looks there instead of the real install location.
    """
    import obsidian_rag.config as cfg_mod

    # Set up data dir + vault marker (required by _resolve_data_dir)
    # Note: the dev data dir is "obsidian-rag" (without leading dot)
    data_dir = tmp_path / "obsidian-rag"
    data_dir.mkdir()
    (tmp_path / "KnowledgeVault").mkdir()
    vector_file = data_dir / "vectors.lance"
    vector_file.touch()

    cfg = ObsidianRagConfig(vector_store=cfg_mod.VectorStoreConfig(path="vectors.lance"))
    # NOTE(review): mutating a module global is not parallel-test safe;
    # the try/finally below guarantees restoration even on assertion failure.
    orig = cfg_mod.DEFAULT_CONFIG_DIR
    cfg_mod.DEFAULT_CONFIG_DIR = tmp_path
    try:
        result = resolve_vector_db_path(cfg)
    finally:
        cfg_mod.DEFAULT_CONFIG_DIR = orig

    # Resolves to data_dir / vectors.lance
    assert result.parent.name == "obsidian-rag"  # dev dir is "obsidian-rag" (no leading dot)
    assert result.name == "vectors.lance"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Dataclass defaults
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_embedding_config_defaults():
    """EmbeddingConfig defaults to the mxbai model at 1024 dims, batch 64."""
    defaults = EmbeddingConfig()
    assert defaults.model == "mxbai-embed-large"
    assert defaults.dimensions == 1024
    assert defaults.batch_size == 64


def test_security_config_defaults():
    """SecurityConfig defaults include sensitive sections and confirmations."""
    from obsidian_rag.config import SecurityConfig

    defaults = SecurityConfig()
    assert "#mentalhealth" in defaults.sensitive_sections
    assert "health" in defaults.require_confirmation_for
|
||||
254
python/tests/unit/test_security.py
Normal file
254
python/tests/unit/test_security.py
Normal file
@@ -0,0 +1,254 @@
|
||||
"""Tests for obsidian_rag.security — path traversal, sanitization, sensitive detection."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from obsidian_rag.security import (
|
||||
detect_sensitive,
|
||||
filter_tags,
|
||||
is_symlink_outside_vault,
|
||||
sanitize_text,
|
||||
should_index_dir,
|
||||
validate_path,
|
||||
)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# validate_path
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_validate_path_normal_file(tmp_path: Path):
    """A relative path inside the vault resolves to the real file."""
    vault = tmp_path / "vault"
    vault.mkdir()
    note = vault / "subdir" / "note.md"
    note.parent.mkdir()
    note.touch()

    assert validate_path(Path("subdir/note.md"), vault) == note.resolve()


def test_validate_path_traversal_attempt(tmp_path: Path):
    """A '..' escape out of the vault root is rejected."""
    vault = tmp_path / "vault"
    vault.mkdir()

    with pytest.raises(ValueError, match="traversal"):
        validate_path(Path("../etc/passwd"), vault)


def test_validate_path_deep_traversal(tmp_path: Path):
    """Traversal hidden behind a legitimate-looking prefix is still rejected."""
    vault = tmp_path / "vault"
    vault.mkdir()

    with pytest.raises(ValueError, match="traversal"):
        validate_path(Path("subdir/../../../etc/passwd"), vault)


def test_validate_path_absolute_path(tmp_path: Path):
    """Absolute paths pointing outside the vault are rejected."""
    vault = tmp_path / "vault"
    vault.mkdir()

    with pytest.raises(ValueError):
        validate_path(Path("/etc/passwd"), vault)


def test_validate_path_path_with_dotdot_in_resolve(tmp_path: Path):
    """Path that resolves inside vault but has .. in parts should be caught."""
    vault = tmp_path / "vault"
    vault.mkdir()
    (vault / "subdir").mkdir()

    # validate_path checks the raw parts for ".."
    with pytest.raises(ValueError, match="traversal"):
        validate_path(Path("subdir/../subdir/../note.md"), vault)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# is_symlink_outside_vault
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_is_symlink_outside_vault_internal(tmp_path: Path):
    """A symlink whose target stays inside the vault is not flagged."""
    vault = tmp_path / "vault"
    vault.mkdir()
    target = vault / "note.md"
    target.touch()

    alias = vault / "link.md"
    alias.symlink_to(target)

    assert is_symlink_outside_vault(alias, vault) is False


def test_is_symlink_outside_vault_external(tmp_path: Path):
    """A symlink escaping the vault is flagged."""
    vault = tmp_path / "vault"
    vault.mkdir()
    target = tmp_path / "outside.md"
    target.touch()

    alias = vault / "link.md"
    alias.symlink_to(target)

    assert is_symlink_outside_vault(alias, vault) is True
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# sanitize_text
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_sanitize_text_strips_html():
    """HTML tags are removed while the surrounding text survives."""
    cleaned = sanitize_text("<script>alert('xss')</script>Hello #world")
    assert "<script>" not in cleaned
    assert "Hello #world" in cleaned
    # only the tags are stripped; text content inside them is preserved


def test_sanitize_text_removes_code_blocks():
    """Fenced code blocks — and any secrets inside them — are dropped."""
    raw = """Some text

```
secret_api_key = "sk-12345"
```

More text
"""
    cleaned = sanitize_text(raw)
    assert "secret_api_key" not in cleaned
    assert "Some text" in cleaned
    assert "More text" in cleaned


def test_sanitize_text_normalizes_whitespace():
    """Newlines, tabs, and runs of spaces collapse to single spaces."""
    cleaned = sanitize_text("Hello\n\n\n world\t\t spaces")
    assert "\n" not in cleaned
    assert "\t" not in cleaned
    assert "  " not in cleaned


def test_sanitize_text_caps_length():
    """Output is capped at 2000 characters regardless of input size."""
    cleaned = sanitize_text("word " * 1000)  # 5000 chars in
    assert len(cleaned) <= 2000


def test_sanitize_text_preserves_hashtags():
    """Hashtags carry metadata and must survive sanitization."""
    cleaned = sanitize_text("#mentalhealth #python #machine-learning")
    assert "#mentalhealth" in cleaned
    assert "#python" in cleaned
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# detect_sensitive
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_detect_sensitive_mental_health_section():
    """A sensitive section marker in the text flips the health flag."""
    marked = " #mentalhealth section content"
    sections = ["#mentalhealth", "#physicalhealth", "#Relations"]
    flags = detect_sensitive(marked, sections, {"financial": [], "health": []})
    assert flags["health"] is True


def test_detect_sensitive_financial_pattern():
    """Financial keyword patterns trigger the financial flag only."""
    note = "I owe Sreenivas $50 and need to pay it back"
    flags = detect_sensitive(
        note, ["#mentalhealth"], {"financial": ["owe", "$"], "health": []}
    )
    assert flags["financial"] is True
    assert flags["health"] is False


def test_detect_sensitive_relations():
    """A #Relations marker does not set the relations flag."""
    note = "Had coffee with Sarah #Relations"
    flags = detect_sensitive(note, ["#Relations"], {"financial": [], "health": []})
    # NOTE(review): "#Relations" is in sensitive_sections yet relations stays
    # False — per the original comment only specific health sections set
    # flags; confirm this is the intended contract in security.py.
    assert flags["relations"] is False


def test_detect_sensitive_clean_text():
    """Text with no markers or patterns yields all-False flags."""
    flags = detect_sensitive(
        "This is a normal note about cooking dinner.",
        [],
        {"financial": [], "health": []},
    )
    assert flags == {"health": False, "financial": False, "relations": False}
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# should_index_dir
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def _mock_config() -> MagicMock:
|
||||
cfg = MagicMock()
|
||||
cfg.indexing.allow_dirs = []
|
||||
cfg.indexing.deny_dirs = [".obsidian", ".trash", "zzz-Archive", ".git"]
|
||||
return cfg
|
||||
|
||||
|
||||
def test_should_index_dir_allows_normal():
|
||||
cfg = _mock_config()
|
||||
assert should_index_dir("Journal", cfg) is True
|
||||
assert should_index_dir("Finance", cfg) is True
|
||||
assert should_index_dir("Projects", cfg) is True
|
||||
|
||||
|
||||
def test_should_index_dir_denies_hidden():
|
||||
cfg = _mock_config()
|
||||
assert should_index_dir(".obsidian", cfg) is False
|
||||
assert should_index_dir(".git", cfg) is False
|
||||
assert should_index_dir(".trash", cfg) is False
|
||||
|
||||
|
||||
def test_should_index_dir_denies_configured():
|
||||
cfg = _mock_config()
|
||||
assert should_index_dir("zzz-Archive", cfg) is False
|
||||
|
||||
|
||||
def test_should_index_dir_allow_list_override():
|
||||
cfg = _mock_config()
|
||||
cfg.indexing.allow_dirs = ["Journal", "Finance"]
|
||||
assert should_index_dir("Journal", cfg) is True
|
||||
assert should_index_dir("Finance", cfg) is True
|
||||
assert should_index_dir("Projects", cfg) is False
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# filter_tags
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_filter_tags_basic():
    """Tags are extracted and normalized to lowercase."""
    found = filter_tags("Hello #world and #python tags #AI")
    assert "#world" in found
    assert "#python" in found
    assert "#ai" in found  # "#AI" is lowercased


def test_filter_tags_deduplicates():
    """Repeated tags collapse to one entry."""
    assert len(filter_tags("#hello #world #hello")) == 2


def test_filter_tags_no_tags():
    """Tag-free text yields an empty list."""
    assert filter_tags("just plain text without any hashtags") == []
|
||||
189
python/tests/unit/test_vector_store.py
Normal file
189
python/tests/unit/test_vector_store.py
Normal file
@@ -0,0 +1,189 @@
|
||||
"""Tests for obsidian_rag.vector_store — LanceDB CRUD operations."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import lancedb
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
from obsidian_rag.vector_store import (
|
||||
SearchResult,
|
||||
create_table_if_not_exists,
|
||||
delete_by_source_file,
|
||||
get_stats,
|
||||
search_chunks,
|
||||
upsert_chunks,
|
||||
)
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def _connect(db_path: Path) -> lancedb.LanceDBConnection:
|
||||
"""Create a LanceDB connection for testing."""
|
||||
db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
return lancedb.connect(str(db_path))
|
||||
|
||||
|
||||
def _make_table(tmp_path: Path):
|
||||
"""Create a fresh obsidian_chunks table for testing."""
|
||||
db = _connect(tmp_path / "test.lance")
|
||||
tbl = create_table_if_not_exists(db)
|
||||
return tbl
|
||||
|
||||
|
||||
def _chunk(source_file: str = "test.md", chunk_id: str = "c1", **overrides):
|
||||
"""Build a minimal valid chunk dict."""
|
||||
base = {
|
||||
"vector": [0.1] * 1024,
|
||||
"chunk_id": chunk_id,
|
||||
"chunk_text": "Hello world",
|
||||
"source_file": source_file,
|
||||
"source_directory": "Notes",
|
||||
"section": None,
|
||||
"date": "2024-01-15",
|
||||
"tags": ["#test"],
|
||||
"chunk_index": 0,
|
||||
"total_chunks": 1,
|
||||
"modified_at": "2024-01-15T10:00:00Z",
|
||||
"indexed_at": "2024-01-15T12:00:00Z",
|
||||
}
|
||||
base.update(overrides)
|
||||
return base
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Table creation
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_create_table_if_not_exists_creates_new(tmp_path: Path):
    """A fresh database gets an empty obsidian_chunks table."""
    connection = _connect(tmp_path / "new.lance")
    table = create_table_if_not_exists(connection)
    # list_tables() returns a response object, not a plain list
    assert "obsidian_chunks" in connection.list_tables().tables
    assert table.count_rows() == 0


def test_create_table_if_not_exists_idempotent(tmp_path: Path):
    """Calling twice returns handles to the same table rather than failing."""
    connection = _connect(tmp_path / "exists.lance")
    first = create_table_if_not_exists(connection)
    second = create_table_if_not_exists(connection)
    assert first.name == second.name  # same underlying table
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# upsert_chunks
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_upsert_chunks_inserts_new(tmp_path: Path):
    """A single new chunk is inserted and counted."""
    table = _make_table(tmp_path)
    inserted = upsert_chunks(table, [_chunk()])
    assert inserted == 1
    assert table.count_rows() == 1


def test_upsert_chunks_empty_list_returns_zero(tmp_path: Path):
    """Upserting an empty batch is a no-op that reports zero."""
    table = _make_table(tmp_path)
    assert upsert_chunks(table, []) == 0


def test_upsert_chunks_updates_existing(tmp_path: Path):
    """Re-upserting the same chunk_id replaces the row, not duplicates it."""
    table = _make_table(tmp_path)
    upsert_chunks(table, [_chunk(chunk_id="dup-id", chunk_text="Original")])
    upsert_chunks(table, [_chunk(chunk_id="dup-id", chunk_text="Updated")])
    assert table.count_rows() == 1
    frame = table.to_pandas()
    assert frame[frame["chunk_id"] == "dup-id"]["chunk_text"].iloc[0] == "Updated"


def test_upsert_chunks_multiple(tmp_path: Path):
    """A batch of distinct chunks all land in the table."""
    table = _make_table(tmp_path)
    batch = [_chunk(chunk_id=f"id-{i}", chunk_text=f"Chunk {i}") for i in range(10)]
    upsert_chunks(table, batch)
    assert table.count_rows() == 10
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# delete_by_source_file
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_delete_by_source_file_removes_chunks(tmp_path: Path):
    """Deleting by source file removes all — and only — that file's chunks."""
    table = _make_table(tmp_path)
    upsert_chunks(table, [_chunk(source_file="test.md", chunk_id="t1")])
    upsert_chunks(table, [_chunk(source_file="test.md", chunk_id="t2")])
    upsert_chunks(table, [_chunk(source_file="other.md", chunk_id="o1")])
    assert table.count_rows() == 3

    removed = delete_by_source_file(table, "test.md")
    assert removed == 2
    assert table.count_rows() == 1  # other.md survives


def test_delete_by_source_file_nonexistent_returns_zero(tmp_path: Path):
    """Deleting an unknown file is harmless and reports zero."""
    table = _make_table(tmp_path)
    assert delete_by_source_file(table, "does-not-exist.md") == 0
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# search_chunks
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_search_chunks_with_directory_filter(tmp_path: Path):
    """directory_filter restricts hits to the named source directories."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(source_file="n.md", source_directory="Notes", chunk_id="n1")])
    upsert_chunks(tbl, [_chunk(source_file="c.md", source_directory="Code", chunk_id="c1")])

    results = search_chunks(
        tbl, [0.0] * 1024, limit=10, directory_filter=["Notes"]
    )
    assert all(r.source_directory == "Notes" for r in results)


def test_search_chunks_with_date_range(tmp_path: Path):
    """date_range bounds exclude chunks dated outside the window."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="d1", date="2024-01-01")])
    upsert_chunks(tbl, [_chunk(chunk_id="d2", date="2024-03-15")])
    upsert_chunks(tbl, [_chunk(chunk_id="d3", date="2024-06-20")])

    results = search_chunks(
        tbl, [0.0] * 1024, limit=10, date_range={"from": "2024-02-01", "to": "2024-05-31"}
    )
    for r in results:
        assert "2024-02-01" <= r.date <= "2024-05-31"


def test_search_chunks_with_tags_filter(tmp_path: Path):
    """The tags filter keeps only chunks whose tag list contains the tag."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="t1", tags=["#python", "#testing"])])
    upsert_chunks(tbl, [_chunk(chunk_id="t2", tags=["#javascript"])])

    results = search_chunks(tbl, [0.0] * 1024, limit=10, tags=["#python"])
    # BUG FIX: the previous assertion was `len(results) >= 0`, which is
    # vacuously true and tested nothing. Assert the filter both matched the
    # tagged chunk and excluded the other one.
    assert len(results) == 1
    assert "#python" in results[0].tags
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# get_stats
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_get_stats_empty_table(tmp_path: Path):
    """An empty table reports zero docs and zero chunks."""
    table = _make_table(tmp_path)
    counts = get_stats(table)
    assert counts["total_docs"] == 0
    assert counts["total_chunks"] == 0


def test_get_stats_with_data(tmp_path: Path):
    """total_docs counts unique source files; total_chunks counts rows."""
    table = _make_table(tmp_path)
    upsert_chunks(table, [_chunk(source_file="a.md", chunk_id="a1")])
    upsert_chunks(table, [_chunk(source_file="a.md", chunk_id="a2")])
    upsert_chunks(table, [_chunk(source_file="b.md", chunk_id="b1")])

    counts = get_stats(table)
    assert counts["total_docs"] == 2  # a.md and b.md
    assert counts["total_chunks"] == 3
|
||||
Reference in New Issue
Block a user