feat(indexer): hierarchical chunking for large sections

- Section-split first for structured notes
- Large sections (longer than max_section_chars) are split further via sliding window
- Small sections stay intact, with their heading preserved
- Adds max_section_chars config (default 4000)
- 2 new TDD tests for hierarchical chunking
This commit is contained in:
2026-04-11 23:58:05 -04:00
parent a744c0c566
commit 34f3ce97f7
5 changed files with 88 additions and 21 deletions

View File

@@ -3,7 +3,6 @@
from __future__ import annotations
import re
import unicodedata
import hashlib
from dataclasses import dataclass, field
from pathlib import Path
@@ -181,9 +180,7 @@ def chunk_file(
Uses section-split for structured notes (journal entries with date filenames),
sliding window for everything else.
"""
import uuid
vault_path = Path(config.vault_path)
rel_path = filepath if filepath.is_absolute() else filepath
source_file = str(rel_path)
source_directory = rel_path.parts[0] if rel_path.parts else ""
@@ -201,7 +198,6 @@ def chunk_file(
chunks: list[Chunk] = []
if is_structured_note(filepath):
# Section-split for journal/daily notes
sections = split_by_sections(body, metadata)
total = len(sections)
@@ -211,20 +207,38 @@ def chunk_file(
section_tags = extract_tags(section_text)
combined_tags = list(dict.fromkeys([*tags, *section_tags]))
chunk_text = section_text
chunk = Chunk(
chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
text=chunk_text,
source_file=source_file,
source_directory=source_directory,
section=f"#{section}" if section else None,
date=date,
tags=combined_tags,
chunk_index=idx,
total_chunks=total,
modified_at=modified_at,
)
chunks.append(chunk)
section_heading = f"#{section}" if section else None
if len(section_text) > config.indexing.max_section_chars:
sub_chunks = sliding_window_chunks(section_text, chunk_size, overlap)
sub_total = len(sub_chunks)
for sub_idx, sub_text in enumerate(sub_chunks):
chunk = Chunk(
chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}_{sub_idx}",
text=sub_text,
source_file=source_file,
source_directory=source_directory,
section=section_heading,
date=date,
tags=combined_tags,
chunk_index=sub_idx,
total_chunks=sub_total,
modified_at=modified_at,
)
chunks.append(chunk)
else:
chunk = Chunk(
chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
text=section_text,
source_file=source_file,
source_directory=source_directory,
section=section_heading,
date=date,
tags=combined_tags,
chunk_index=idx,
total_chunks=total,
modified_at=modified_at,
)
chunks.append(chunk)
else:
# Sliding window for unstructured notes
text_chunks = sliding_window_chunks(body, chunk_size, overlap)
@@ -247,4 +261,4 @@ def chunk_file(
)
chunks.append(chunk)
return chunks
return chunks

View File

@@ -3,7 +3,6 @@
from __future__ import annotations
import json
import os
from enum import Enum
from dataclasses import dataclass, field
from pathlib import Path
@@ -32,6 +31,7 @@ class VectorStoreConfig:
class IndexingConfig:
chunk_size: int = 500
chunk_overlap: int = 100
max_section_chars: int = 4000
file_patterns: list[str] = field(default_factory=lambda: ["*.md"])
deny_dirs: list[str] = field(
default_factory=lambda: [