feat(indexer): hierarchical chunking for large sections
- Split structured notes by section first
- Large sections (> max_section_chars) are broken up via a sliding window
- Small sections stay intact, with the heading preserved
- Adds the max_section_chars config option (default 4000)
- Adds 2 new TDD tests for hierarchical chunking
This commit is contained in:
@@ -206,6 +206,7 @@ def _mock_config(tmp_path: Path) -> MagicMock:
|
||||
cfg.vault_path = str(tmp_path)
|
||||
cfg.indexing.chunk_size = 500
|
||||
cfg.indexing.chunk_overlap = 100
|
||||
cfg.indexing.max_section_chars = 4000
|
||||
cfg.indexing.file_patterns = ["*.md"]
|
||||
cfg.indexing.deny_dirs = [".obsidian", ".trash", "zzz-Archive", ".git"]
|
||||
cfg.indexing.allow_dirs = []
|
||||
@@ -248,3 +249,41 @@ def test_chunk_file_unstructured(tmp_path: Path):
|
||||
assert len(chunks) > 1
|
||||
assert all(c.section is None for c in chunks)
|
||||
assert chunks[0].chunk_index == 0
|
||||
|
||||
|
||||
def test_large_section_split_into_sub_chunks(tmp_path: Path):
    """A section longer than max_section_chars yields several sub-chunks.

    Every resulting sub-chunk must still carry the heading of the section
    it was cut from.
    """
    notes_dir = tmp_path / "Notes"
    notes_dir.mkdir()
    note_path = notes_dir / "2024-03-15-Podcast.md"

    # "word " is 5 characters, so 3000 repetitions give 15000 characters,
    # well above the max_section_chars limit of 4000 configured below.
    oversized_body = "word " * 3000
    note_path.write_text(f"# Episode Notes\n\n{oversized_body}")

    cfg = _mock_config(tmp_path)
    cfg.indexing.max_section_chars = 4000
    chunks = chunk_file(
        note_path,
        note_path.read_text(),
        "2024-03-15T10:00:00Z",
        cfg,
    )

    # The oversized section must not come back as a single chunk.
    assert len(chunks) > 1

    # The heading has to survive on each individual sub-chunk.
    for chunk in chunks:
        assert chunk.section == "#Episode Notes", (
            f"Expected #Episode Notes, got {chunk.section}"
        )
|
||||
|
||||
|
||||
def test_small_section_kept_intact(tmp_path: Path):
    """A section shorter than max_section_chars comes back as one chunk."""
    notes_dir = tmp_path / "Notes"
    notes_dir.mkdir()
    note_path = notes_dir / "2024-03-15-Short.md"
    note_path.write_text("# Notes\n\nShort content here.")

    cfg = _mock_config(tmp_path)
    cfg.indexing.max_section_chars = 4000
    chunks = chunk_file(
        note_path,
        note_path.read_text(),
        "2024-03-15T10:00:00Z",
        cfg,
    )

    # A short section must not be split: exactly one chunk is expected.
    assert len(chunks) == 1

    only_chunk = chunks[0]
    assert only_chunk.section == "#Notes"
    assert only_chunk.text.strip().endswith("Short content here.")
|
||||
|
||||
Reference in New Issue
Block a user