feat: add markdown chunker with sliding window and section strategies
This commit is contained in:
41
tests/test_chunker.py
Normal file
41
tests/test_chunker.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from companion.rag.chunker import sliding_window_chunks
|
||||
|
||||
|
||||
def test_sliding_window_basic():
    """Check chunk size and overlap of ``sliding_window_chunks`` on 100 words.

    The input uses 100 *distinct* tokens. With identical tokens the overlap
    comparison below would hold for any overlap (even none), so it could not
    detect an incorrect window step.
    """
    # Same "token + space" layout as a repeated word, but every token is unique.
    text = "".join(f"word{i} " for i in range(100))
    chunks = sliding_window_chunks(text, chunk_size=20, chunk_overlap=5)

    assert len(chunks) > 1
    assert len(chunks[0].split()) == 20

    # Overlap check: the last 5 words of chunk 0 must open chunk 1.
    last_five = chunks[0].split()[-5:]
    first_chunk1 = chunks[1].split()[:5]
    assert last_five == first_chunk1
|
||||
from companion.rag.chunker import section_based_chunks, chunk_file, ChunkingRule
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def test_section_based_chunks_splits_on_tags():
    """Expect three chunks, each carrying its section tag at index 1."""
    expected_tags = ("#DayInShort", "#mentalhealth", "#work")
    text = "#DayInShort: good day\n#mentalhealth: stressed\n#work: busy"

    chunks = section_based_chunks(
        text,
        list(expected_tags),
        chunk_size=10,
        chunk_overlap=2,
    )

    assert len(chunks) == 3
    # Tags must come back in the same order as they appear in the text.
    for position, tag in enumerate(expected_tags):
        assert chunks[position][1] == tag
|
||||
|
||||
|
||||
def test_chunk_file_extracts_metadata():
    """Chunk a two-section journal note and check the metadata on the chunks.

    Writes a note under ``Journal/2026/04`` in a temporary vault, runs
    ``chunk_file`` with a "section" rule for ``Journal/**``, and asserts the
    chunk count, source directory, date, and the tag taken from the wiki-link.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        vault_root = Path(tmp_dir)
        note_path = vault_root.joinpath("Journal", "2026", "04", "2026-04-12.md")
        note_path.parent.mkdir(parents=True)
        note_path.write_text(
            "#DayInShort: good day\n#Relations: [[Person/Vinay]] visited.",
            encoding="utf-8",
        )

        fallback_rule = ChunkingRule(
            strategy="sliding_window",
            chunk_size=500,
            chunk_overlap=100,
        )
        journal_rule = ChunkingRule(
            strategy="section",
            chunk_size=300,
            chunk_overlap=50,
            section_tags=["#DayInShort", "#Relations"],
        )
        rules = {"default": fallback_rule, "Journal/**": journal_rule}

        chunks = chunk_file(note_path, vault_root, rules, modified_at=1234567890.0)

        assert len(chunks) == 2

        opening_chunk = chunks[0]
        assert opening_chunk.source_directory == "Journal"
        assert opening_chunk.date == "2026-04-12"

        relations_chunk = chunks[1]
        # ``tags`` may be None, so fall back to an empty list before the check.
        assert "Person/Vinay" in (relations_chunk.tags or [])
|
||||
Reference in New Issue
Block a user