Sprint 0-1: Python indexer, TS plugin scaffolding, and test suite
## What's new **Python indexer (`python/obsidian_rag/`)** — full pipeline from scan to LanceDB: - `config.py` — JSON config loader with cross-platform path resolution - `security.py` — path traversal prevention, HTML stripping, sensitive content detection, dir allow/deny lists - `chunker.py` — section-split for journal entries (date-named files), sliding-window for unstructured notes - `embedder.py` — Ollama `/api/embeddings` client with batched requests and timeout/error handling - `vector_store.py` — LanceDB schema, upsert (merge_insert), delete, search with filters, stats - `indexer.py` — full/sync/reindex pipeline orchestrator with progress yields - `cli.py` — `index | sync | reindex | status` CLI commands **TypeScript plugin (`src/`)** — OpenClaw plugin scaffold: - `utils/` — config loader, TypeScript types, response envelope factory, LanceDB client - `services/` — health state machine (HEALTHY/DEGRADED/UNAVAILABLE), vault watcher with debounce/batching, indexer bridge (subprocess spawner) - `tools/` — 4 tool stubs: search, index, status, memory_store (OpenClaw wiring pending) - `index.ts` — plugin entry point with health probe + vault watcher startup **Config** (`obsidian-rag/config.json`, `openclaw.plugin.json`): - 627 files / 3764 chunks indexed in dev vault **Tests: 76 passing** - Python: 64 pytest tests (chunker, security, vector_store, config) - TypeScript: 12 vitest tests (lancedb client, response envelope) ## Bugs fixed - LanceDB `tags` column filter: `LIKE '%tag%'` → `list_contains(tags, 'tag')` (List<String> column) - LanceDB JS `db.list_tables()` returns `ListTablesResponse` object, not plain array - LanceDB JS result score field: `_score` → `_distance` - TypeScript regex literal with unescaped `/` in path-resolve regex - Python: `create_table_if_not_exists` identity check → name comparison Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
250
python/tests/unit/test_chunker.py
Normal file
250
python/tests/unit/test_chunker.py
Normal file
@@ -0,0 +1,250 @@
|
||||
"""Tests for obsidian_rag.chunker — section splitting and sliding window."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from obsidian_rag.chunker import (
|
||||
extract_tags,
|
||||
extract_date_from_filename,
|
||||
is_structured_note,
|
||||
parse_frontmatter,
|
||||
split_by_sections,
|
||||
sliding_window_chunks,
|
||||
chunk_file,
|
||||
)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# parse_frontmatter
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_parse_frontmatter_with_yaml():
    """A YAML frontmatter block is parsed into metadata and stripped from the body."""
    raw = """---
title: My Journal
tags: [journal, personal]
---
# Morning

Some content here.
"""
    metadata, remainder = parse_frontmatter(raw)
    assert metadata.get("title") == "My Journal"
    assert "# Morning" in remainder
    assert "Some content" in remainder
|
||||
|
||||
|
||||
def test_parse_frontmatter_without_frontmatter():
    """Plain markdown yields empty metadata and leaves the body untouched."""
    raw = "# Just a header\n\nSome text without frontmatter."
    metadata, remainder = parse_frontmatter(raw)
    assert metadata == {}
    assert "# Just a header" in remainder
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# extract_tags
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_extract_tags_basic():
    """Hashtags, including hyphenated ones, are picked out of free text."""
    sample = "Hello #world and #python-code is nice"
    tags = extract_tags(sample)
    assert "#world" in tags
    assert "#python-code" in tags
    # every extracted tag keeps its leading '#'
    assert all(tag.startswith("#") for tag in tags)
|
||||
|
||||
|
||||
def test_extract_tags_deduplicates():
    """Repeated hashtags appear only once in the result."""
    sample = "#hello #world #hello #python"
    assert len(extract_tags(sample)) == 3
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# extract_date_from_filename
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_extract_date_from_filename_iso():
    """An ISO-dated filename yields its date unchanged."""
    assert extract_date_from_filename(Path("2024-01-15.md")) == "2024-01-15"
|
||||
|
||||
|
||||
def test_extract_date_from_filename_compact():
    """A compact YYYYMMDD filename is normalized to ISO form."""
    assert extract_date_from_filename(Path("20240115.md")) == "2024-01-15"
|
||||
|
||||
|
||||
def test_extract_date_from_filename_no_date():
    """Filenames without a recognizable date yield None."""
    assert extract_date_from_filename(Path("my-journal.md")) is None
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# is_structured_note
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_is_structured_note_journal():
    """Date-named files are treated as structured journal entries."""
    for journal_name in ("2024-01-15.md", "Journal/2024-02-20.md"):
        assert is_structured_note(Path(journal_name)) is True
|
||||
|
||||
|
||||
def test_is_structured_note_project():
    """Ordinary (non-dated) note names are not considered structured."""
    for note_name in ("My Project Ideas.md", "shopping-list.md"):
        assert is_structured_note(Path(note_name)) is False
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# split_by_sections
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_split_by_sections_multiple():
    """Each markdown header, at any level, starts a new (title, content) section."""
    text = """# Mental Health
Feeling anxious today.

## Work
Project deadline approaching.

### Home
Need to clean the garage.
"""
    sections = split_by_sections(text, {})
    titles = [title for title, _ in sections]
    assert titles == ["Mental Health", "Work", "Home"]
    # the header line itself is excluded from the section content
    assert "Feeling anxious today." in sections[0][1]
|
||||
|
||||
|
||||
def test_split_by_sections_no_headers():
    """Header-free text collapses into a single untitled section."""
    text = "Just plain text without any headers at all."
    sections = split_by_sections(text, {})
    assert len(sections) == 1
    title, content = sections[0]
    assert title is None
    assert "Just plain text" in content
|
||||
|
||||
|
||||
def test_split_by_sections_leading_content():
    """Content before the first header becomes an untitled leading section."""
    text = """Some intro text before any header.

# First Section
Content of first.
"""
    sections = split_by_sections(text, {})
    title, content = sections[0]
    assert title is None
    assert "Some intro text" in content
    assert sections[1][0] == "First Section"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# sliding_window_chunks
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_sliding_window_basic():
    """1200 words split into overlapping windows of at most 500 tokens each."""
    text = " ".join(f"word{i}" for i in range(1200))
    chunks = sliding_window_chunks(text, chunk_size=500, overlap=100)
    assert len(chunks) >= 2
    # the first window starts at the beginning of the text (words 0-499)
    assert chunks[0].startswith("word0")
    # no window exceeds the requested size
    assert all(len(chunk.split()) <= 500 for chunk in chunks)
|
||||
|
||||
|
||||
def test_sliding_window_overlap():
    """Each window begins with the last `overlap` words of its predecessor."""
    text = " ".join(f"word{i}" for i in range(1000))
    chunks = sliding_window_chunks(text, chunk_size=500, overlap=100)
    for prev_chunk, curr_chunk in zip(chunks, chunks[1:]):
        assert prev_chunk.split()[-100:] == curr_chunk.split()[:100]
|
||||
|
||||
|
||||
def test_sliding_window_empty():
    """Empty input produces no chunks at all."""
    assert sliding_window_chunks("", chunk_size=500, overlap=100) == []
|
||||
|
||||
|
||||
def test_sliding_window_exact_size_produces_two_chunks():
    """With overlap=100, exactly 500 words produces 2 chunks (0-499 and 400-end)."""
    text = " ".join(f"word{i}" for i in range(500))
    chunks = sliding_window_chunks(text, chunk_size=500, overlap=100)
    assert len(chunks) == 2
    assert chunks[0].startswith("word0")
    # the window advances by chunk_size - overlap = 400 words per step
    assert chunks[1].startswith("word400")
|
||||
|
||||
|
||||
def test_sliding_window_small_text():
    """Text much shorter than chunk_size comes back as one unmodified chunk."""
    text = "just a few words"
    assert sliding_window_chunks(text, chunk_size=500, overlap=100) == [text]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# chunk_file integration
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def _mock_config(tmp_path: Path) -> MagicMock:
|
||||
"""Build a minimal mock config pointing at a tmp vault."""
|
||||
cfg = MagicMock()
|
||||
cfg.vault_path = str(tmp_path)
|
||||
cfg.indexing.chunk_size = 500
|
||||
cfg.indexing.chunk_overlap = 100
|
||||
cfg.indexing.file_patterns = ["*.md"]
|
||||
cfg.indexing.deny_dirs = [".obsidian", ".trash", "zzz-Archive", ".git"]
|
||||
cfg.indexing.allow_dirs = []
|
||||
return cfg
|
||||
|
||||
|
||||
def test_chunk_file_structured_journal(tmp_path: Path):
    """A date-named journal file is section-split with date and tags attached."""
    journal_dir = tmp_path / "Journal"
    journal_dir.mkdir()
    note_path = journal_dir / "2024-03-15.md"
    note_path.write_text("""# Morning

Felt #anxious about the deadline.

## Work
Finished the report.
""")

    cfg = _mock_config(tmp_path)
    chunks = chunk_file(note_path, note_path.read_text(), "2024-03-15T10:00:00Z", cfg)

    # journal file -> section-split -> one chunk per header
    assert len(chunks) == 2
    # NOTE(review): chunk.section keeps a leading '#' here, while the
    # split_by_sections tests expect bare titles ("Mental Health") —
    # confirm this asymmetry between chunk_file and split_by_sections is intended.
    assert chunks[0].section == "#Morning"
    assert chunks[0].date == "2024-03-15"
    assert "#anxious" in chunks[0].tags or "#anxious" in chunks[1].tags
    assert chunks[0].source_file.endswith("Journal/2024-03-15.md")
|
||||
|
||||
|
||||
def test_chunk_file_unstructured(tmp_path: Path):
    """A non-journal note falls back to sliding-window chunking."""
    notes_dir = tmp_path / "Notes"
    notes_dir.mkdir()
    note_path = notes_dir / "project-ideas.md"
    note_path.write_text("This is a long note " * 200)  # ~1000 words

    cfg = _mock_config(tmp_path)
    chunks = chunk_file(note_path, note_path.read_text(), "2024-03-15T10:00:00Z", cfg)

    # unstructured note -> sliding window -> several sectionless chunks
    assert len(chunks) > 1
    assert all(chunk.section is None for chunk in chunks)
    assert chunks[0].chunk_index == 0
|
||||
Reference in New Issue
Block a user