feat(indexer): hierarchical chunking for large sections

- Section-split first for structured notes
- Large sections (>max_section_chars) broken via sliding-window
- Small sections stay intact with heading preserved
- Adds max_section_chars config (default 4000)
- 2 new TDD tests for hierarchical chunking
This commit is contained in:
2026-04-11 23:58:05 -04:00
parent a744c0c566
commit 34f3ce97f7
5 changed files with 88 additions and 21 deletions

View File

@@ -42,6 +42,11 @@ Plugin `package.json` MUST have:
User config at `~/.obsidian-rag/config.json` or `./obsidian-rag/` dev config.
Key indexing fields:
- `indexing.chunk_size` — sliding window chunk size (default 500)
- `indexing.chunk_overlap` — overlap between chunks (default 100)
- `indexing.max_section_chars` — max chars per section before hierarchical split (default 4000)
Key security fields:
- `security.require_confirmation_for` — list of categories (e.g. `["health", "financial_debt"]`). Empty list disables guard.
- `security.auto_approve_sensitive` — `true` bypasses sensitive content prompts.
@@ -49,7 +54,11 @@ Key security fields:
## Ollama Context Length
`python/obsidian_rag/embedder.py` truncates chunks at `MAX_CHUNK_CHARS = 8000` before embedding. If Ollama 500 error returns, reduce `max_section_chars` (to reduce section sizes) or reduce `chunk_size` in config.
## Hierarchical Chunking
Structured notes (date-named files) use section-split first, then sliding-window within sections that exceed `max_section_chars`. Small sections stay intact; large sections are broken into sub-chunks with the parent section heading preserved.
## Sensitive Content Guard

View File

@@ -65,6 +65,11 @@
"type": "integer", "type": "integer",
"minimum": 0 "minimum": 0
}, },
"max_section_chars": {
"type": "integer",
"minimum": 1,
"description": "Max chars per section before splitting into sub-chunks. Default 4000."
},
"file_patterns": { "file_patterns": {
"type": "array", "type": "array",
"items": { "items": {

View File

@@ -3,7 +3,6 @@
from __future__ import annotations from __future__ import annotations
import re import re
import unicodedata
import hashlib import hashlib
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path from pathlib import Path
@@ -181,9 +180,7 @@ def chunk_file(
Uses section-split for structured notes (journal entries with date filenames), Uses section-split for structured notes (journal entries with date filenames),
sliding window for everything else. sliding window for everything else.
""" """
import uuid
vault_path = Path(config.vault_path)
rel_path = filepath if filepath.is_absolute() else filepath rel_path = filepath if filepath.is_absolute() else filepath
source_file = str(rel_path) source_file = str(rel_path)
source_directory = rel_path.parts[0] if rel_path.parts else "" source_directory = rel_path.parts[0] if rel_path.parts else ""
@@ -201,7 +198,6 @@ def chunk_file(
chunks: list[Chunk] = [] chunks: list[Chunk] = []
if is_structured_note(filepath): if is_structured_note(filepath):
# Section-split for journal/daily notes
sections = split_by_sections(body, metadata) sections = split_by_sections(body, metadata)
total = len(sections) total = len(sections)
@@ -211,20 +207,38 @@ def chunk_file(
section_tags = extract_tags(section_text) section_tags = extract_tags(section_text)
combined_tags = list(dict.fromkeys([*tags, *section_tags])) combined_tags = list(dict.fromkeys([*tags, *section_tags]))
chunk_text = section_text section_heading = f"#{section}" if section else None
chunk = Chunk( if len(section_text) > config.indexing.max_section_chars:
chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}", sub_chunks = sliding_window_chunks(section_text, chunk_size, overlap)
text=chunk_text, sub_total = len(sub_chunks)
source_file=source_file, for sub_idx, sub_text in enumerate(sub_chunks):
source_directory=source_directory, chunk = Chunk(
section=f"#{section}" if section else None, chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}_{sub_idx}",
date=date, text=sub_text,
tags=combined_tags, source_file=source_file,
chunk_index=idx, source_directory=source_directory,
total_chunks=total, section=section_heading,
modified_at=modified_at, date=date,
) tags=combined_tags,
chunks.append(chunk) chunk_index=sub_idx,
total_chunks=sub_total,
modified_at=modified_at,
)
chunks.append(chunk)
else:
chunk = Chunk(
chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
text=section_text,
source_file=source_file,
source_directory=source_directory,
section=section_heading,
date=date,
tags=combined_tags,
chunk_index=idx,
total_chunks=total,
modified_at=modified_at,
)
chunks.append(chunk)
else: else:
# Sliding window for unstructured notes # Sliding window for unstructured notes
text_chunks = sliding_window_chunks(body, chunk_size, overlap) text_chunks = sliding_window_chunks(body, chunk_size, overlap)

View File

@@ -3,7 +3,6 @@
from __future__ import annotations from __future__ import annotations
import json import json
import os
from enum import Enum from enum import Enum
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path from pathlib import Path
@@ -32,6 +31,7 @@ class VectorStoreConfig:
class IndexingConfig: class IndexingConfig:
chunk_size: int = 500 chunk_size: int = 500
chunk_overlap: int = 100 chunk_overlap: int = 100
max_section_chars: int = 4000
file_patterns: list[str] = field(default_factory=lambda: ["*.md"]) file_patterns: list[str] = field(default_factory=lambda: ["*.md"])
deny_dirs: list[str] = field( deny_dirs: list[str] = field(
default_factory=lambda: [ default_factory=lambda: [

View File

@@ -206,6 +206,7 @@ def _mock_config(tmp_path: Path) -> MagicMock:
cfg.vault_path = str(tmp_path) cfg.vault_path = str(tmp_path)
cfg.indexing.chunk_size = 500 cfg.indexing.chunk_size = 500
cfg.indexing.chunk_overlap = 100 cfg.indexing.chunk_overlap = 100
cfg.indexing.max_section_chars = 4000
cfg.indexing.file_patterns = ["*.md"] cfg.indexing.file_patterns = ["*.md"]
cfg.indexing.deny_dirs = [".obsidian", ".trash", "zzz-Archive", ".git"] cfg.indexing.deny_dirs = [".obsidian", ".trash", "zzz-Archive", ".git"]
cfg.indexing.allow_dirs = [] cfg.indexing.allow_dirs = []
@@ -248,3 +249,41 @@ def test_chunk_file_unstructured(tmp_path: Path):
assert len(chunks) > 1 assert len(chunks) > 1
assert all(c.section is None for c in chunks) assert all(c.section is None for c in chunks)
assert chunks[0].chunk_index == 0 assert chunks[0].chunk_index == 0
def test_large_section_split_into_sub_chunks(tmp_path: Path):
    """A section exceeding max_section_chars is broken into sliding-window sub-chunks."""
    notes_dir = tmp_path / "Notes"
    notes_dir.mkdir()
    note = notes_dir / "2024-03-15-Podcast.md"
    # ~15000 chars of filler — comfortably past the 4000-char section limit.
    body = "word " * 3000
    note.write_text(f"# Episode Notes\n\n{body}")

    config = _mock_config(tmp_path)
    config.indexing.max_section_chars = 4000

    result = chunk_file(note, note.read_text(), "2024-03-15T10:00:00Z", config)

    # An oversized section must yield multiple sub-chunks...
    assert len(result) > 1
    # ...and every sub-chunk carries the parent section heading.
    for piece in result:
        assert piece.section == "#Episode Notes", (
            f"Expected #Episode Notes, got {piece.section}"
        )
def test_small_section_kept_intact(tmp_path: Path):
    """A section under max_section_chars is emitted as exactly one chunk."""
    notes_dir = tmp_path / "Notes"
    notes_dir.mkdir()
    note = notes_dir / "2024-03-15-Short.md"
    note.write_text("# Notes\n\nShort content here.")

    config = _mock_config(tmp_path)
    config.indexing.max_section_chars = 4000

    result = chunk_file(note, note.read_text(), "2024-03-15T10:00:00Z", config)

    # No split: the whole section stays as a single chunk with its heading.
    assert len(result) == 1
    only = result[0]
    assert only.section == "#Notes"
    assert only.text.strip().endswith("Short content here.")