feat(indexer): hierarchical chunking for large sections

- Section-split first for structured notes
- Large sections (> max_section_chars) are broken up via sliding window
- Small sections stay intact with their heading preserved
- Adds max_section_chars config option (default 4000)
- 2 new TDD tests for hierarchical chunking
2026-04-11 23:58:05 -04:00
parent a744c0c566
commit 34f3ce97f7
5 changed files with 88 additions and 21 deletions
--- a/python/obsidian_rag/chunker.py
+++ b/python/obsidian_rag/chunker.py
@@ -3,7 +3,6 @@
 from __future__ import annotations

 import re
-import unicodedata
 import hashlib
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -181,9 +180,7 @@ def chunk_file(
    Uses section-split for structured notes (journal entries with date filenames),
    sliding window for everything else.
    """
-    import uuid

-    vault_path = Path(config.vault_path)
    rel_path = filepath if filepath.is_absolute() else filepath
    source_file = str(rel_path)
    source_directory = rel_path.parts[0] if rel_path.parts else ""
@@ -201,7 +198,6 @@ def chunk_file(
    chunks: list[Chunk] = []

    if is_structured_note(filepath):
-        # Section-split for journal/daily notes
        sections = split_by_sections(body, metadata)
        total = len(sections)

@@ -211,20 +207,38 @@ def chunk_file(
            section_tags = extract_tags(section_text)
            combined_tags = list(dict.fromkeys([*tags, *section_tags]))

-            chunk_text = section_text
-            chunk = Chunk(
-                chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
-                text=chunk_text,
-                source_file=source_file,
-                source_directory=source_directory,
-                section=f"#{section}" if section else None,
-                date=date,
-                tags=combined_tags,
-                chunk_index=idx,
-                total_chunks=total,
-                modified_at=modified_at,
-            )
-            chunks.append(chunk)
+            section_heading = f"#{section}" if section else None
+            if len(section_text) > config.indexing.max_section_chars:
+                sub_chunks = sliding_window_chunks(section_text, chunk_size, overlap)
+                sub_total = len(sub_chunks)
+                for sub_idx, sub_text in enumerate(sub_chunks):
+                    chunk = Chunk(
+                        chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}_{sub_idx}",
+                        text=sub_text,
+                        source_file=source_file,
+                        source_directory=source_directory,
+                        section=section_heading,
+                        date=date,
+                        tags=combined_tags,
+                        chunk_index=sub_idx,
+                        total_chunks=sub_total,
+                        modified_at=modified_at,
+                    )
+                    chunks.append(chunk)
+            else:
+                chunk = Chunk(
+                    chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
+                    text=section_text,
+                    source_file=source_file,
+                    source_directory=source_directory,
+                    section=section_heading,
+                    date=date,
+                    tags=combined_tags,
+                    chunk_index=idx,
+                    total_chunks=total,
+                    modified_at=modified_at,
+                )
+                chunks.append(chunk)
    else:
        # Sliding window for unstructured notes
        text_chunks = sliding_window_chunks(body, chunk_size, overlap)
@@ -247,4 +261,4 @@ def chunk_file(
            )
            chunks.append(chunk)

-    return chunks
+    return chunks
--- a/python/obsidian_rag/config.py
+++ b/python/obsidian_rag/config.py
@@ -3,7 +3,6 @@
 from __future__ import annotations

 import json
-import os
 from enum import Enum
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -32,6 +31,7 @@ class VectorStoreConfig:
 class IndexingConfig:
    chunk_size: int = 500
    chunk_overlap: int = 100
+    max_section_chars: int = 4000
    file_patterns: list[str] = field(default_factory=lambda: ["*.md"])
    deny_dirs: list[str] = field(
        default_factory=lambda: [
--- a/python/tests/unit/test_chunker.py
+++ b/python/tests/unit/test_chunker.py
@@ -206,6 +206,7 @@ def _mock_config(tmp_path: Path) -> MagicMock:
    cfg.vault_path = str(tmp_path)
    cfg.indexing.chunk_size = 500
    cfg.indexing.chunk_overlap = 100
+    cfg.indexing.max_section_chars = 4000
    cfg.indexing.file_patterns = ["*.md"]
    cfg.indexing.deny_dirs = [".obsidian", ".trash", "zzz-Archive", ".git"]
    cfg.indexing.allow_dirs = []
@@ -248,3 +249,41 @@ def test_chunk_file_unstructured(tmp_path: Path):
    assert len(chunks) > 1
    assert all(c.section is None for c in chunks)
    assert chunks[0].chunk_index == 0
+
+
+def test_large_section_split_into_sub_chunks(tmp_path: Path):
+    """Large section (exceeding max_section_chars) is split via sliding window."""
+    vault = tmp_path / "Notes"
+    vault.mkdir()
+    fpath = vault / "2024-03-15-Podcast.md"
+    large_content = "word " * 3000  # 15000 chars, exceeds max_section_chars (4000)
+    fpath.write_text(f"# Episode Notes\n\n{large_content}")
+
+    cfg = _mock_config(tmp_path)
+    cfg.indexing.max_section_chars = 4000
+    chunks = chunk_file(fpath, fpath.read_text(), "2024-03-15T10:00:00Z", cfg)
+
+    # Large section should be split into multiple sub-chunks
+    assert len(chunks) > 1
+    # Each sub-chunk should preserve the section heading
+    for chunk in chunks:
+        assert chunk.section == "#Episode Notes", (
+            f"Expected #Episode Notes, got {chunk.section}"
+        )
+
+
+def test_small_section_kept_intact(tmp_path: Path):
+    """Small section (under max_section_chars) remains a single chunk."""
+    vault = tmp_path / "Notes"
+    vault.mkdir()
+    fpath = vault / "2024-03-15-Short.md"
+    fpath.write_text("# Notes\n\nShort content here.")
+
+    cfg = _mock_config(tmp_path)
+    cfg.indexing.max_section_chars = 4000
+    chunks = chunk_file(fpath, fpath.read_text(), "2024-03-15T10:00:00Z", cfg)
+
+    # Small section → single chunk
+    assert len(chunks) == 1
+    assert chunks[0].section == "#Notes"
+    assert chunks[0].text.strip().endswith("Short content here.")