diff --git a/AGENTS.md b/AGENTS.md index 9ad2197..7926e25 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -42,6 +42,11 @@ Plugin `package.json` MUST have: User config at `~/.obsidian-rag/config.json` or `./obsidian-rag/` dev config. +Key indexing fields: +- `indexing.chunk_size` — sliding window chunk size (default 500) +- `indexing.chunk_overlap` — overlap between chunks (default 100) +- `indexing.max_section_chars` — max chars per section before hierarchical split (default 4000) + Key security fields: - `security.require_confirmation_for` — list of categories (e.g. `["health", "financial_debt"]`). Empty list disables guard. - `security.auto_approve_sensitive` — `true` bypasses sensitive content prompts. @@ -49,7 +54,11 @@ Key security fields: ## Ollama Context Length -`python/obsidian_rag/embedder.py` truncates chunks at `MAX_CHUNK_CHARS = 8000` before embedding. If Ollama 500 error returns, increase this value or reduce `indexing.chunk_size` in config. +`python/obsidian_rag/embedder.py` truncates chunks at `MAX_CHUNK_CHARS = 8000` before embedding. If Ollama returns a 500 error, decrease `indexing.max_section_chars` (so oversized sections are split into smaller sub-chunks) or reduce `indexing.chunk_size` in config. + +## Hierarchical Chunking + +Structured notes (date-named files) use section-split first, then sliding-window within sections that exceed `max_section_chars`. Small sections stay intact; large sections are broken into sub-chunks with the parent section heading preserved. ## Sensitive Content Guard diff --git a/openclaw.plugin.json b/openclaw.plugin.json index e04fa77..464f160 100644 --- a/openclaw.plugin.json +++ b/openclaw.plugin.json @@ -65,6 +65,11 @@ "type": "integer", "minimum": 0 }, + "max_section_chars": { + "type": "integer", + "minimum": 1, + "description": "Max chars per section before splitting into sub-chunks. Default 4000."
+ }, "file_patterns": { "type": "array", "items": { diff --git a/python/obsidian_rag/chunker.py b/python/obsidian_rag/chunker.py index 7b3f948..08ad797 100644 --- a/python/obsidian_rag/chunker.py +++ b/python/obsidian_rag/chunker.py @@ -3,7 +3,6 @@ from __future__ import annotations import re -import unicodedata import hashlib from dataclasses import dataclass, field from pathlib import Path @@ -181,9 +180,7 @@ def chunk_file( Uses section-split for structured notes (journal entries with date filenames), sliding window for everything else. """ - import uuid - vault_path = Path(config.vault_path) rel_path = filepath if filepath.is_absolute() else filepath source_file = str(rel_path) source_directory = rel_path.parts[0] if rel_path.parts else "" @@ -201,7 +198,6 @@ def chunk_file( chunks: list[Chunk] = [] if is_structured_note(filepath): - # Section-split for journal/daily notes sections = split_by_sections(body, metadata) total = len(sections) @@ -211,20 +207,38 @@ def chunk_file( section_tags = extract_tags(section_text) combined_tags = list(dict.fromkeys([*tags, *section_tags])) - chunk_text = section_text - chunk = Chunk( - chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}", - text=chunk_text, - source_file=source_file, - source_directory=source_directory, - section=f"#{section}" if section else None, - date=date, - tags=combined_tags, - chunk_index=idx, - total_chunks=total, - modified_at=modified_at, - ) - chunks.append(chunk) + section_heading = f"#{section}" if section else None + if len(section_text) > config.indexing.max_section_chars: + sub_chunks = sliding_window_chunks(section_text, chunk_size, overlap) + sub_total = len(sub_chunks) + for sub_idx, sub_text in enumerate(sub_chunks): + chunk = Chunk( + chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}_{sub_idx}", + text=sub_text, + source_file=source_file, + source_directory=source_directory, + section=section_heading, + date=date, + tags=combined_tags, + 
chunk_index=sub_idx, + total_chunks=sub_total, + modified_at=modified_at, + ) + chunks.append(chunk) + else: + chunk = Chunk( + chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}", + text=section_text, + source_file=source_file, + source_directory=source_directory, + section=section_heading, + date=date, + tags=combined_tags, + chunk_index=idx, + total_chunks=total, + modified_at=modified_at, + ) + chunks.append(chunk) else: # Sliding window for unstructured notes text_chunks = sliding_window_chunks(body, chunk_size, overlap) @@ -247,4 +261,4 @@ def chunk_file( ) chunks.append(chunk) - return chunks \ No newline at end of file + return chunks diff --git a/python/obsidian_rag/config.py b/python/obsidian_rag/config.py index 01395de..d0aab6a 100644 --- a/python/obsidian_rag/config.py +++ b/python/obsidian_rag/config.py @@ -3,7 +3,6 @@ from __future__ import annotations import json -import os from enum import Enum from dataclasses import dataclass, field from pathlib import Path @@ -32,6 +31,7 @@ class VectorStoreConfig: class IndexingConfig: chunk_size: int = 500 chunk_overlap: int = 100 + max_section_chars: int = 4000 file_patterns: list[str] = field(default_factory=lambda: ["*.md"]) deny_dirs: list[str] = field( default_factory=lambda: [ diff --git a/python/tests/unit/test_chunker.py b/python/tests/unit/test_chunker.py index e1bf1ae..e7d3848 100644 --- a/python/tests/unit/test_chunker.py +++ b/python/tests/unit/test_chunker.py @@ -206,6 +206,7 @@ def _mock_config(tmp_path: Path) -> MagicMock: cfg.vault_path = str(tmp_path) cfg.indexing.chunk_size = 500 cfg.indexing.chunk_overlap = 100 + cfg.indexing.max_section_chars = 4000 cfg.indexing.file_patterns = ["*.md"] cfg.indexing.deny_dirs = [".obsidian", ".trash", "zzz-Archive", ".git"] cfg.indexing.allow_dirs = [] @@ -248,3 +249,41 @@ def test_chunk_file_unstructured(tmp_path: Path): assert len(chunks) > 1 assert all(c.section is None for c in chunks) assert chunks[0].chunk_index == 0 + + +def 
test_large_section_split_into_sub_chunks(tmp_path: Path): + """Large section (exceeding max_section_chars) is split via sliding window.""" + vault = tmp_path / "Notes" + vault.mkdir() + fpath = vault / "2024-03-15-Podcast.md" + large_content = "word " * 3000 # ~15000 chars, exceeds MAX_SECTION_CHARS + fpath.write_text(f"# Episode Notes\n\n{large_content}") + + cfg = _mock_config(tmp_path) + cfg.indexing.max_section_chars = 4000 + chunks = chunk_file(fpath, fpath.read_text(), "2024-03-15T10:00:00Z", cfg) + + # Large section should be split into multiple sub-chunks + assert len(chunks) > 1 + # Each sub-chunk should preserve the section heading + for chunk in chunks: + assert chunk.section == "#Episode Notes", ( + f"Expected #Episode Notes, got {chunk.section}" + ) + + +def test_small_section_kept_intact(tmp_path: Path): + """Small section (under max_section_chars) remains a single chunk.""" + vault = tmp_path / "Notes" + vault.mkdir() + fpath = vault / "2024-03-15-Short.md" + fpath.write_text("# Notes\n\nShort content here.") + + cfg = _mock_config(tmp_path) + cfg.indexing.max_section_chars = 4000 + chunks = chunk_file(fpath, fpath.read_text(), "2024-03-15T10:00:00Z", cfg) + + # Small section → single chunk + assert len(chunks) == 1 + assert chunks[0].section == "#Notes" + assert chunks[0].text.strip().endswith("Short content here.")