fix: address chunker review feedback

This commit is contained in:
2026-04-13 14:09:56 -04:00
parent fc9f975fff
commit 95687fad2e
2 changed files with 82 additions and 14 deletions

View File

@@ -1,4 +1,12 @@
from companion.rag.chunker import sliding_window_chunks
import tempfile
from pathlib import Path
from companion.rag.chunker import (
sliding_window_chunks,
section_based_chunks,
chunk_file,
ChunkingRule,
)
def test_sliding_window_basic():
@@ -10,14 +18,13 @@ def test_sliding_window_basic():
last_five = chunks[0].split()[-5:]
first_chunk1 = chunks[1].split()[:5]
assert last_five == first_chunk1
from companion.rag.chunker import section_based_chunks, chunk_file, ChunkingRule
import tempfile
from pathlib import Path
def test_section_based_chunks_splits_on_tags():
text = "#DayInShort: good day\n#mentalhealth: stressed\n#work: busy"
chunks = section_based_chunks(text, ["#DayInShort", "#mentalhealth", "#work"], chunk_size=10, chunk_overlap=2)
chunks = section_based_chunks(
text, ["#DayInShort", "#mentalhealth", "#work"], chunk_size=10, chunk_overlap=2
)
assert len(chunks) == 3
assert chunks[0][1] == "#DayInShort"
assert chunks[1][1] == "#mentalhealth"
@@ -29,13 +36,72 @@ def test_chunk_file_extracts_metadata():
vault = Path(tmp)
journal = vault / "Journal" / "2026" / "04" / "2026-04-12.md"
journal.parent.mkdir(parents=True)
journal.write_text("#DayInShort: good day\n#Relations: [[Person/Vinay]] visited.", encoding="utf-8")
journal.write_text(
"#DayInShort: good day\n#Relations: [[Person/Vinay]] visited.",
encoding="utf-8",
)
rules = {
"default": ChunkingRule(strategy="sliding_window", chunk_size=500, chunk_overlap=100),
"Journal/**": ChunkingRule(strategy="section", chunk_size=300, chunk_overlap=50, section_tags=["#DayInShort", "#Relations"]),
"default": ChunkingRule(
strategy="sliding_window", chunk_size=500, chunk_overlap=100
),
"Journal/**": ChunkingRule(
strategy="section",
chunk_size=300,
chunk_overlap=50,
section_tags=["#DayInShort", "#Relations"],
),
}
chunks = chunk_file(journal, vault, rules, modified_at=1234567890.0)
assert len(chunks) == 2
assert chunks[0].source_directory == "Journal"
assert chunks[0].date == "2026-04-12"
assert "Person/Vinay" in (chunks[1].tags or [])
def test_sliding_window_empty_input():
    """Chunking an empty string with the sliding-window chunker yields no chunks."""
    assert sliding_window_chunks("", chunk_size=20, chunk_overlap=5) == []
def test_chunk_file_root_level_file():
    """A note stored directly in the vault root reports an empty source directory."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        vault_root = Path(tmp_dir)
        note_path = vault_root / "note.md"
        note_path.write_text("hello world", encoding="utf-8")

        # Only the fallback rule is configured, so it is the one applied here.
        default_rule = ChunkingRule(
            strategy="sliding_window", chunk_size=500, chunk_overlap=100
        )
        produced = chunk_file(
            note_path, vault_root, {"default": default_rule}, modified_at=1.0
        )

        assert len(produced) == 1
        only_chunk = produced[0]
        assert only_chunk.source_directory == ""
        assert only_chunk.source_file == "note.md"
def test_chunk_file_preserves_tag_order():
    """Hashtags come back in first-seen order with the repeated tag listed once."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        vault_root = Path(tmp_dir)
        note_path = vault_root / "tags.md"
        # "#first" appears twice; the expected list below contains it once.
        note_path.write_text("#first #second #first #third", encoding="utf-8")

        default_rule = ChunkingRule(
            strategy="sliding_window", chunk_size=500, chunk_overlap=100
        )
        produced = chunk_file(
            note_path, vault_root, {"default": default_rule}, modified_at=1.0
        )

        expected_tags = ["#first", "#second", "#third"]
        assert produced[0].tags == expected_tags
def test_chunk_file_hyphenated_hashtags():
    """A hashtag containing a hyphen is kept whole in the chunk's tags."""
    with tempfile.TemporaryDirectory() as tmp:
        vault = Path(tmp)
        note = vault / "hyphen.md"
        note.write_text("#mental-health check-in", encoding="utf-8")
        rules = {
            "default": ChunkingRule(
                strategy="sliding_window", chunk_size=500, chunk_overlap=100
            )
        }
        chunks = chunk_file(note, vault, rules, modified_at=1.0)
        # Guard against tags being None (same pattern as the metadata test) so
        # a missing tag list fails the assertion instead of raising TypeError.
        assert "#mental-health" in (chunks[0].tags or [])