feat(indexer): hierarchical chunking for large sections
- Section-split first for structured notes
- Large sections (>max_section_chars) broken via sliding-window
- Small sections stay intact with heading preserved
- Adds max_section_chars config (default 4000)
- 2 new TDD tests for hierarchical chunking
This commit is contained in:
11
AGENTS.md
11
AGENTS.md
@@ -42,6 +42,11 @@ Plugin `package.json` MUST have:
|
||||
|
||||
User config at `~/.obsidian-rag/config.json` or `./obsidian-rag/` dev config.
|
||||
|
||||
Key indexing fields:
|
||||
- `indexing.chunk_size` — sliding window chunk size (default 500)
|
||||
- `indexing.chunk_overlap` — overlap between chunks (default 100)
|
||||
- `indexing.max_section_chars` — max chars per section before hierarchical split (default 4000)
|
||||
|
||||
Key security fields:
|
||||
- `security.require_confirmation_for` — list of categories (e.g. `["health", "financial_debt"]`). Empty list disables guard.
|
||||
- `security.auto_approve_sensitive` — `true` bypasses sensitive content prompts.
|
||||
@@ -49,7 +54,11 @@ Key security fields:
|
||||
|
||||
## Ollama Context Length
|
||||
|
||||
`python/obsidian_rag/embedder.py` truncates chunks at `MAX_CHUNK_CHARS = 8000` before embedding. If Ollama returns a 500 error, increase this value or reduce `indexing.chunk_size` in config.
|
||||
`python/obsidian_rag/embedder.py` truncates chunks at `MAX_CHUNK_CHARS = 8000` before embedding. If Ollama returns a 500 error, increase `max_section_chars` (to reduce section sizes) or reduce `chunk_size` in config.
|
||||
|
||||
## Hierarchical Chunking
|
||||
|
||||
Structured notes (date-named files) use section-split first, then sliding-window within sections that exceed `max_section_chars`. Small sections stay intact; large sections are broken into sub-chunks with the parent section heading preserved.
|
||||
|
||||
## Sensitive Content Guard
|
||||
|
||||
|
||||
@@ -65,6 +65,11 @@
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"max_section_chars": {
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"description": "Max chars per section before splitting into sub-chunks. Default 4000."
|
||||
},
|
||||
"file_patterns": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
import hashlib
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
@@ -181,9 +180,7 @@ def chunk_file(
|
||||
Uses section-split for structured notes (journal entries with date filenames),
|
||||
sliding window for everything else.
|
||||
"""
|
||||
import uuid
|
||||
|
||||
vault_path = Path(config.vault_path)
|
||||
rel_path = filepath if filepath.is_absolute() else filepath
|
||||
source_file = str(rel_path)
|
||||
source_directory = rel_path.parts[0] if rel_path.parts else ""
|
||||
@@ -201,7 +198,6 @@ def chunk_file(
|
||||
chunks: list[Chunk] = []
|
||||
|
||||
if is_structured_note(filepath):
|
||||
# Section-split for journal/daily notes
|
||||
sections = split_by_sections(body, metadata)
|
||||
total = len(sections)
|
||||
|
||||
@@ -211,20 +207,38 @@ def chunk_file(
|
||||
section_tags = extract_tags(section_text)
|
||||
combined_tags = list(dict.fromkeys([*tags, *section_tags]))
|
||||
|
||||
chunk_text = section_text
|
||||
chunk = Chunk(
|
||||
chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
|
||||
text=chunk_text,
|
||||
source_file=source_file,
|
||||
source_directory=source_directory,
|
||||
section=f"#{section}" if section else None,
|
||||
date=date,
|
||||
tags=combined_tags,
|
||||
chunk_index=idx,
|
||||
total_chunks=total,
|
||||
modified_at=modified_at,
|
||||
)
|
||||
chunks.append(chunk)
|
||||
section_heading = f"#{section}" if section else None
|
||||
if len(section_text) > config.indexing.max_section_chars:
|
||||
sub_chunks = sliding_window_chunks(section_text, chunk_size, overlap)
|
||||
sub_total = len(sub_chunks)
|
||||
for sub_idx, sub_text in enumerate(sub_chunks):
|
||||
chunk = Chunk(
|
||||
chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}_{sub_idx}",
|
||||
text=sub_text,
|
||||
source_file=source_file,
|
||||
source_directory=source_directory,
|
||||
section=section_heading,
|
||||
date=date,
|
||||
tags=combined_tags,
|
||||
chunk_index=sub_idx,
|
||||
total_chunks=sub_total,
|
||||
modified_at=modified_at,
|
||||
)
|
||||
chunks.append(chunk)
|
||||
else:
|
||||
chunk = Chunk(
|
||||
chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
|
||||
text=section_text,
|
||||
source_file=source_file,
|
||||
source_directory=source_directory,
|
||||
section=section_heading,
|
||||
date=date,
|
||||
tags=combined_tags,
|
||||
chunk_index=idx,
|
||||
total_chunks=total,
|
||||
modified_at=modified_at,
|
||||
)
|
||||
chunks.append(chunk)
|
||||
else:
|
||||
# Sliding window for unstructured notes
|
||||
text_chunks = sliding_window_chunks(body, chunk_size, overlap)
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
@@ -32,6 +31,7 @@ class VectorStoreConfig:
|
||||
class IndexingConfig:
|
||||
chunk_size: int = 500
|
||||
chunk_overlap: int = 100
|
||||
max_section_chars: int = 4000
|
||||
file_patterns: list[str] = field(default_factory=lambda: ["*.md"])
|
||||
deny_dirs: list[str] = field(
|
||||
default_factory=lambda: [
|
||||
|
||||
@@ -206,6 +206,7 @@ def _mock_config(tmp_path: Path) -> MagicMock:
|
||||
cfg.vault_path = str(tmp_path)
|
||||
cfg.indexing.chunk_size = 500
|
||||
cfg.indexing.chunk_overlap = 100
|
||||
cfg.indexing.max_section_chars = 4000
|
||||
cfg.indexing.file_patterns = ["*.md"]
|
||||
cfg.indexing.deny_dirs = [".obsidian", ".trash", "zzz-Archive", ".git"]
|
||||
cfg.indexing.allow_dirs = []
|
||||
@@ -248,3 +249,41 @@ def test_chunk_file_unstructured(tmp_path: Path):
|
||||
assert len(chunks) > 1
|
||||
assert all(c.section is None for c in chunks)
|
||||
assert chunks[0].chunk_index == 0
|
||||
|
||||
|
||||
def test_large_section_split_into_sub_chunks(tmp_path: Path):
    """A section exceeding max_section_chars is broken into sliding-window sub-chunks."""
    vault = tmp_path / "Notes"
    vault.mkdir()
    fpath = vault / "2024-03-15-Podcast.md"
    # ~15000 chars of content, well past the 4000-char max_section_chars limit.
    oversized_body = "word " * 3000
    fpath.write_text(f"# Episode Notes\n\n{oversized_body}")

    cfg = _mock_config(tmp_path)
    cfg.indexing.max_section_chars = 4000
    chunks = chunk_file(fpath, fpath.read_text(), "2024-03-15T10:00:00Z", cfg)

    # The oversized section must yield several sub-chunks ...
    assert len(chunks) > 1
    # ... and every sub-chunk keeps the parent section heading.
    for chunk in chunks:
        assert chunk.section == "#Episode Notes", (
            f"Expected #Episode Notes, got {chunk.section}"
        )
|
||||
|
||||
|
||||
def test_small_section_kept_intact(tmp_path: Path):
    """A section below max_section_chars is emitted as one intact chunk."""
    vault = tmp_path / "Notes"
    vault.mkdir()
    fpath = vault / "2024-03-15-Short.md"
    fpath.write_text("# Notes\n\nShort content here.")

    cfg = _mock_config(tmp_path)
    cfg.indexing.max_section_chars = 4000
    chunks = chunk_file(fpath, fpath.read_text(), "2024-03-15T10:00:00Z", cfg)

    # Under the limit: exactly one chunk, with heading preserved and text intact.
    assert len(chunks) == 1
    only = chunks[0]
    assert only.section == "#Notes"
    assert only.text.strip().endswith("Short content here.")
|
||||
|
||||
Reference in New Issue
Block a user