# Chunker tests — hierarchical chunking (290 lines, 8.8 KiB, Python):
# - Section-split first for structured notes
# - Large sections (>max_section_chars) broken via sliding-window
# - Small sections stay intact with heading preserved
# - Adds max_section_chars config (default 4000)
# - 2 new TDD tests for hierarchical chunking
"""Tests for obsidian_rag.chunker — section splitting and sliding window."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
import tempfile
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
|
|
from obsidian_rag.chunker import (
|
|
extract_tags,
|
|
extract_date_from_filename,
|
|
is_structured_note,
|
|
parse_frontmatter,
|
|
split_by_sections,
|
|
sliding_window_chunks,
|
|
chunk_file,
|
|
)
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# parse_frontmatter
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
def test_parse_frontmatter_with_yaml():
    """YAML frontmatter is parsed into metadata and stripped from the body."""
    raw = (
        "---\n"
        "title: My Journal\n"
        "tags: [journal, personal]\n"
        "---\n"
        "# Morning\n"
        "\n"
        "Some content here.\n"
    )
    meta, body = parse_frontmatter(raw)
    assert meta.get("title") == "My Journal"
    assert "# Morning" in body
    assert "Some content" in body
|
|
|
|
|
|
def test_parse_frontmatter_without_frontmatter():
    """A note with no YAML block yields empty metadata and an untouched body."""
    meta, body = parse_frontmatter("# Just a header\n\nSome text without frontmatter.")
    assert meta == {}
    assert "# Just a header" in body
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# extract_tags
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
def test_extract_tags_basic():
    """Hashtags (including hyphenated ones) are extracted from prose."""
    found = extract_tags("Hello #world and #python-code is nice")
    assert "#world" in found
    assert "#python-code" in found
    # every returned tag keeps its leading '#'
    for tag in found:
        assert tag.startswith("#")
|
|
|
|
|
|
def test_extract_tags_deduplicates():
    """Repeated tags appear only once in the result."""
    result = extract_tags("#hello #world #hello #python")
    assert len(result) == 3
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# extract_date_from_filename
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
def test_extract_date_from_filename_iso():
    """An ISO-formatted stem (YYYY-MM-DD) is returned as-is."""
    assert extract_date_from_filename(Path("2024-01-15.md")) == "2024-01-15"
|
|
|
|
|
|
def test_extract_date_from_filename_compact():
    """A compact YYYYMMDD stem is normalised to ISO form."""
    assert extract_date_from_filename(Path("20240115.md")) == "2024-01-15"
|
|
|
|
|
|
def test_extract_date_from_filename_no_date():
    """Filenames without a recognisable date yield None."""
    assert extract_date_from_filename(Path("my-journal.md")) is None
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# is_structured_note
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
def test_is_structured_note_journal():
    """Date-named files count as structured journal notes."""
    for candidate in ("2024-01-15.md", "Journal/2024-02-20.md"):
        assert is_structured_note(Path(candidate)) is True
|
|
|
|
|
|
def test_is_structured_note_project():
    """Free-form note names are not considered structured."""
    for candidate in ("My Project Ideas.md", "shopping-list.md"):
        assert is_structured_note(Path(candidate)) is False
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# split_by_sections
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
def test_split_by_sections_multiple():
    """Each markdown header (any level) starts a new (title, content) section."""
    note = (
        "# Mental Health\n"
        "Feeling anxious today.\n"
        "\n"
        "## Work\n"
        "Project deadline approaching.\n"
        "\n"
        "### Home\n"
        "Need to clean the garage.\n"
    )
    sections = split_by_sections(note, {})
    assert len(sections) == 3
    titles = [title for title, _ in sections]
    assert titles == ["Mental Health", "Work", "Home"]
    # Section content excludes the header line itself
    assert "Feeling anxious today." in sections[0][1]
|
|
|
|
|
|
def test_split_by_sections_no_headers():
    """Header-free text collapses into a single untitled section."""
    sections = split_by_sections("Just plain text without any headers at all.", {})
    assert len(sections) == 1
    title, content = sections[0]
    assert title is None
    assert "Just plain text" in content
|
|
|
|
|
|
def test_split_by_sections_leading_content():
    """Content before the first header belongs to the first section."""
    note = (
        "Some intro text before any header.\n"
        "\n"
        "# First Section\n"
        "Content of first.\n"
    )
    sections = split_by_sections(note, {})
    first_title, first_content = sections[0]
    assert first_title is None
    assert "Some intro text" in first_content
    assert sections[1][0] == "First Section"
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# sliding_window_chunks
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
def test_sliding_window_basic():
    """1200 words at size 500 / overlap 100 produce several bounded chunks."""
    text = " ".join(f"word{i}" for i in range(1200))
    chunks = sliding_window_chunks(text, chunk_size=500, overlap=100)
    assert len(chunks) >= 2
    # First chunk begins at word 0
    assert chunks[0].startswith("word0")
    # No chunk exceeds the configured size (in whitespace tokens)
    assert all(len(chunk.split()) <= 500 for chunk in chunks)
|
|
|
|
|
|
def test_sliding_window_overlap():
    """Adjacent chunks should share the overlap region."""
    source = " ".join(f"word{i}" for i in range(1000))
    chunks = sliding_window_chunks(source, chunk_size=500, overlap=100)
    # The 100-word tail of each chunk reappears as the head of the next one
    for prev, curr in zip(chunks, chunks[1:]):
        assert prev.split()[-100:] == curr.split()[:100]
|
|
|
|
|
|
def test_sliding_window_empty():
    """Empty input yields no chunks at all."""
    chunks = sliding_window_chunks("", chunk_size=500, overlap=100)
    assert chunks == []
|
|
|
|
|
|
def test_sliding_window_exact_size_produces_two_chunks():
    """With overlap=100, exactly 500 words produces 2 chunks (0-499 and 400-end)."""
    text = " ".join(f"word{i}" for i in range(500))
    chunks = sliding_window_chunks(text, chunk_size=500, overlap=100)
    assert len(chunks) == 2
    assert chunks[0].startswith("word0")
    assert chunks[1].startswith("word400")  # advance = 500-100 = 400
|
|
|
|
|
|
def test_sliding_window_small_text():
    """Text much shorter than chunk_size returns single chunk."""
    short = "just a few words"
    chunks = sliding_window_chunks(short, chunk_size=500, overlap=100)
    assert chunks == [short]
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
# chunk_file integration
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
def _mock_config(tmp_path: Path) -> MagicMock:
|
|
"""Build a minimal mock config pointing at a tmp vault."""
|
|
cfg = MagicMock()
|
|
cfg.vault_path = str(tmp_path)
|
|
cfg.indexing.chunk_size = 500
|
|
cfg.indexing.chunk_overlap = 100
|
|
cfg.indexing.max_section_chars = 4000
|
|
cfg.indexing.file_patterns = ["*.md"]
|
|
cfg.indexing.deny_dirs = [".obsidian", ".trash", "zzz-Archive", ".git"]
|
|
cfg.indexing.allow_dirs = []
|
|
return cfg
|
|
|
|
|
|
def test_chunk_file_structured_journal(tmp_path: Path):
    """A date-named journal note is section-split, with date/tags attached."""
    journal_dir = tmp_path / "Journal"
    journal_dir.mkdir()
    note = journal_dir / "2024-03-15.md"
    note.write_text(
        "# Morning\n"
        "\n"
        "Felt #anxious about the deadline.\n"
        "\n"
        "## Work\n"
        "Finished the report.\n"
    )

    cfg = _mock_config(tmp_path)
    chunks = chunk_file(note, note.read_text(), "2024-03-15T10:00:00Z", cfg)

    # Journal file → section-split → 2 chunks
    assert len(chunks) == 2
    first = chunks[0]
    assert first.section == "#Morning"
    assert first.date == "2024-03-15"
    assert "#anxious" in first.tags or "#anxious" in chunks[1].tags
    assert first.source_file.endswith("Journal/2024-03-15.md")
|
|
|
|
|
|
def test_chunk_file_unstructured(tmp_path: Path):
    """A free-form note falls back to sliding-window chunking."""
    notes_dir = tmp_path / "Notes"
    notes_dir.mkdir()
    note = notes_dir / "project-ideas.md"
    note.write_text("This is a long note " * 200)  # ~1000 words

    cfg = _mock_config(tmp_path)
    chunks = chunk_file(note, note.read_text(), "2024-03-15T10:00:00Z", cfg)

    # Unstructured → sliding window → multiple chunks, none carrying a section
    assert len(chunks) > 1
    assert chunks[0].chunk_index == 0
    for chunk in chunks:
        assert chunk.section is None
|
|
|
|
|
|
def test_large_section_split_into_sub_chunks(tmp_path: Path):
    """Large section (exceeding max_section_chars) is split via sliding window."""
    notes_dir = tmp_path / "Notes"
    notes_dir.mkdir()
    note = notes_dir / "2024-03-15-Podcast.md"
    body = "word " * 3000  # ~15000 chars, exceeds MAX_SECTION_CHARS
    note.write_text(f"# Episode Notes\n\n{body}")

    cfg = _mock_config(tmp_path)
    cfg.indexing.max_section_chars = 4000
    chunks = chunk_file(note, note.read_text(), "2024-03-15T10:00:00Z", cfg)

    # Large section should be split into multiple sub-chunks
    assert len(chunks) > 1
    # Each sub-chunk should preserve the section heading
    for chunk in chunks:
        assert chunk.section == "#Episode Notes", (
            f"Expected #Episode Notes, got {chunk.section}"
        )
|
|
|
|
|
|
def test_small_section_kept_intact(tmp_path: Path):
    """Small section (under max_section_chars) remains a single chunk."""
    notes_dir = tmp_path / "Notes"
    notes_dir.mkdir()
    note = notes_dir / "2024-03-15-Short.md"
    note.write_text("# Notes\n\nShort content here.")

    cfg = _mock_config(tmp_path)
    cfg.indexing.max_section_chars = 4000
    chunks = chunk_file(note, note.read_text(), "2024-03-15T10:00:00Z", cfg)

    # Small section → single chunk, heading preserved, content intact
    assert len(chunks) == 1
    only = chunks[0]
    assert only.section == "#Notes"
    assert only.text.strip().endswith("Short content here.")
|