"""Tests for obsidian_rag.chunker — section splitting and sliding window.""" from __future__ import annotations from pathlib import Path import tempfile from unittest.mock import MagicMock import pytest from obsidian_rag.chunker import ( extract_tags, extract_date_from_filename, is_structured_note, parse_frontmatter, split_by_sections, sliding_window_chunks, chunk_file, ) # ---------------------------------------------------------------------- # parse_frontmatter # ---------------------------------------------------------------------- def test_parse_frontmatter_with_yaml(): content = """--- title: My Journal tags: [journal, personal] --- # Morning Some content here. """ meta, body = parse_frontmatter(content) assert meta.get("title") == "My Journal" assert "# Morning" in body assert "Some content" in body def test_parse_frontmatter_without_frontmatter(): content = "# Just a header\n\nSome text without frontmatter." meta, body = parse_frontmatter(content) assert meta == {} assert "# Just a header" in body # ---------------------------------------------------------------------- # extract_tags # ---------------------------------------------------------------------- def test_extract_tags_basic(): text = "Hello #world and #python-code is nice" tags = extract_tags(text) assert "#world" in tags assert "#python-code" in tags # lowercased assert all(t.startswith("#") for t in tags) def test_extract_tags_deduplicates(): text = "#hello #world #hello #python" tags = extract_tags(text) assert len(tags) == 3 # ---------------------------------------------------------------------- # extract_date_from_filename # ---------------------------------------------------------------------- def test_extract_date_from_filename_iso(): p = Path("2024-01-15.md") assert extract_date_from_filename(p) == "2024-01-15" def test_extract_date_from_filename_compact(): p = Path("20240115.md") assert extract_date_from_filename(p) == "2024-01-15" def test_extract_date_from_filename_no_date(): p = Path("my-journal.md") assert extract_date_from_filename(p) is None # ---------------------------------------------------------------------- # is_structured_note # ---------------------------------------------------------------------- def test_is_structured_note_journal(): assert is_structured_note(Path("2024-01-15.md")) is True assert is_structured_note(Path("Journal/2024-02-20.md")) is True def test_is_structured_note_project(): assert is_structured_note(Path("My Project Ideas.md")) is False assert is_structured_note(Path("shopping-list.md")) is False # ---------------------------------------------------------------------- # split_by_sections # ---------------------------------------------------------------------- def test_split_by_sections_multiple(): body = """# Mental Health Feeling anxious today. ## Work Project deadline approaching. ### Home Need to clean the garage. """ sections = split_by_sections(body, {}) assert len(sections) == 3 assert sections[0][0] == "Mental Health" # Section content excludes the header line itself assert "Feeling anxious today." in sections[0][1] assert sections[1][0] == "Work" assert sections[2][0] == "Home" def test_split_by_sections_no_headers(): body = "Just plain text without any headers at all." sections = split_by_sections(body, {}) assert len(sections) == 1 assert sections[0][0] is None assert "Just plain text" in sections[0][1] def test_split_by_sections_leading_content(): """Content before the first header belongs to the first section.""" body = """Some intro text before any header. # First Section Content of first. """ sections = split_by_sections(body, {}) assert sections[0][0] is None assert "Some intro text" in sections[0][1] assert sections[1][0] == "First Section" # ---------------------------------------------------------------------- # sliding_window_chunks # ---------------------------------------------------------------------- def test_sliding_window_basic(): words = " ".join([f"word{i}" for i in range(1200)]) chunks = sliding_window_chunks(words, chunk_size=500, overlap=100) assert len(chunks) >= 2 # First chunk: words 0-499 assert chunks[0].startswith("word0") # Chunks should have ~500 tokens each for c in chunks: assert len(c.split()) <= 500 def test_sliding_window_overlap(): """Adjacent chunks should share the overlap region.""" text = " ".join([f"word{i}" for i in range(1000)]) chunks = sliding_window_chunks(text, chunk_size=500, overlap=100) # Every chunk after the first should start with words from the previous chunk for i in range(1, len(chunks)): prev_words = chunks[i - 1].split() curr_words = chunks[i].split() # Overlap should be evident assert prev_words[-100:] == curr_words[:100] def test_sliding_window_empty(): assert sliding_window_chunks("", chunk_size=500, overlap=100) == [] def test_sliding_window_exact_size_produces_two_chunks(): """With overlap=100, exactly 500 words produces 2 chunks (0-499 and 400-end).""" words = " ".join([f"word{i}" for i in range(500)]) chunks = sliding_window_chunks(words, chunk_size=500, overlap=100) assert len(chunks) == 2 assert chunks[0].startswith("word0") assert chunks[1].startswith("word400") # advance = 500-100 = 400 def test_sliding_window_small_text(): """Text much shorter than chunk_size returns single chunk.""" text = "just a few words" chunks = sliding_window_chunks(text, chunk_size=500, overlap=100) assert len(chunks) == 1 assert chunks[0] == text # ---------------------------------------------------------------------- # chunk_file integration # ---------------------------------------------------------------------- def _mock_config(tmp_path: Path) -> MagicMock: """Build a minimal mock config pointing at a tmp vault.""" cfg = MagicMock() cfg.vault_path = str(tmp_path) cfg.indexing.chunk_size = 500 cfg.indexing.chunk_overlap = 100 cfg.indexing.max_section_chars = 4000 cfg.indexing.file_patterns = ["*.md"] cfg.indexing.deny_dirs = [".obsidian", ".trash", "zzz-Archive", ".git"] cfg.indexing.allow_dirs = [] return cfg def test_chunk_file_structured_journal(tmp_path: Path): vault = tmp_path / "Journal" vault.mkdir() fpath = vault / "2024-03-15.md" fpath.write_text("""# Morning Felt #anxious about the deadline. ## Work Finished the report. """) cfg = _mock_config(tmp_path) chunks = chunk_file(fpath, fpath.read_text(), "2024-03-15T10:00:00Z", cfg) # Journal file → section-split → 2 chunks assert len(chunks) == 2 assert chunks[0].section == "#Morning" assert chunks[0].date == "2024-03-15" assert "#anxious" in chunks[0].tags or "#anxious" in chunks[1].tags assert chunks[0].source_file.endswith("Journal/2024-03-15.md") def test_chunk_file_unstructured(tmp_path: Path): vault = tmp_path / "Notes" vault.mkdir() fpath = vault / "project-ideas.md" fpath.write_text("This is a long note " * 200) # ~1000 words cfg = _mock_config(tmp_path) chunks = chunk_file(fpath, fpath.read_text(), "2024-03-15T10:00:00Z", cfg) # Unstructured → sliding window → multiple chunks assert len(chunks) > 1 assert all(c.section is None for c in chunks) assert chunks[0].chunk_index == 0 def test_large_section_split_into_sub_chunks(tmp_path: Path): """Large section (exceeding max_section_chars) is split via sliding window.""" vault = tmp_path / "Notes" vault.mkdir() fpath = vault / "2024-03-15-Podcast.md" large_content = "word " * 3000 # ~15000 chars, exceeds MAX_SECTION_CHARS fpath.write_text(f"# Episode Notes\n\n{large_content}") cfg = _mock_config(tmp_path) cfg.indexing.max_section_chars = 4000 chunks = chunk_file(fpath, fpath.read_text(), "2024-03-15T10:00:00Z", cfg) # Large section should be split into multiple sub-chunks assert len(chunks) > 1 # Each sub-chunk should preserve the section heading for chunk in chunks: assert chunk.section == "#Episode Notes", ( f"Expected #Episode Notes, got {chunk.section}" ) def test_small_section_kept_intact(tmp_path: Path): """Small section (under max_section_chars) remains a single chunk.""" vault = tmp_path / "Notes" vault.mkdir() fpath = vault / "2024-03-15-Short.md" fpath.write_text("# Notes\n\nShort content here.") cfg = _mock_config(tmp_path) cfg.indexing.max_section_chars = 4000 chunks = chunk_file(fpath, fpath.read_text(), "2024-03-15T10:00:00Z", cfg) # Small section → single chunk assert len(chunks) == 1 assert chunks[0].section == "#Notes" assert chunks[0].text.strip().endswith("Short content here.")