import tempfile
from pathlib import Path

from companion.rag.chunker import (
    sliding_window_chunks,
    section_based_chunks,
    chunk_file,
    ChunkingRule,
)


def _default_only_rules():
    """Return a rule set holding only the sliding-window ``default`` rule."""
    return {
        "default": ChunkingRule(
            strategy="sliding_window", chunk_size=500, chunk_overlap=100
        )
    }


def test_sliding_window_basic():
    """Chunks hold ``chunk_size`` words and neighbours share the overlap."""
    # Distinct words are required here: with a single repeated word the
    # overlap comparison below holds for any chunk boundaries and could
    # never fail.
    text = " ".join(f"w{i}" for i in range(100))
    chunks = sliding_window_chunks(text, chunk_size=20, chunk_overlap=5)
    assert len(chunks) > 1
    assert len(chunks[0].split()) == 20
    # overlap check: last 5 words of chunk 0 should appear in chunk 1
    last_five = chunks[0].split()[-5:]
    first_chunk1 = chunks[1].split()[:5]
    assert last_five == first_chunk1


def test_section_based_chunks_splits_on_tags():
    """One chunk is produced per section tag, labelled with that tag."""
    text = "#DayInShort: good day\n#mentalhealth: stressed\n#work: busy"
    chunks = section_based_chunks(
        text,
        ["#DayInShort", "#mentalhealth", "#work"],
        chunk_size=10,
        chunk_overlap=2,
    )
    assert len(chunks) == 3
    # Index 1 of each returned item is compared against the section tag.
    assert chunks[0][1] == "#DayInShort"
    assert chunks[1][1] == "#mentalhealth"
    assert chunks[2][1] == "#work"


def test_chunk_file_extracts_metadata():
    """A journal file matched by ``Journal/**`` yields directory, date and tags."""
    with tempfile.TemporaryDirectory() as tmp:
        vault = Path(tmp)
        journal = vault / "Journal" / "2026" / "04" / "2026-04-12.md"
        journal.parent.mkdir(parents=True)
        journal.write_text(
            "#DayInShort: good day\n#Relations: [[Person/Vinay]] visited.",
            encoding="utf-8",
        )
        rules = {
            "default": ChunkingRule(
                strategy="sliding_window", chunk_size=500, chunk_overlap=100
            ),
            "Journal/**": ChunkingRule(
                strategy="section",
                chunk_size=300,
                chunk_overlap=50,
                section_tags=["#DayInShort", "#Relations"],
            ),
        }
        chunks = chunk_file(journal, vault, rules, modified_at=1234567890.0)
        assert len(chunks) == 2
        assert chunks[0].source_directory == "Journal"
        assert chunks[0].date == "2026-04-12"
        assert "Person/Vinay" in (chunks[1].tags or [])


def test_sliding_window_empty_input():
    """Empty text produces an empty list of chunks."""
    chunks = sliding_window_chunks("", chunk_size=20, chunk_overlap=5)
    assert chunks == []


def test_chunk_file_root_level_file():
    """A file directly in the vault root has an empty ``source_directory``."""
    with tempfile.TemporaryDirectory() as tmp:
        vault = Path(tmp)
        note = vault / "note.md"
        note.write_text("hello world", encoding="utf-8")
        chunks = chunk_file(note, vault, _default_only_rules(), modified_at=1.0)
        assert len(chunks) == 1
        assert chunks[0].source_directory == ""
        assert chunks[0].source_file == "note.md"


def test_chunk_file_preserves_tag_order():
    """Hashtags are de-duplicated and kept in first-seen order."""
    with tempfile.TemporaryDirectory() as tmp:
        vault = Path(tmp)
        note = vault / "tags.md"
        note.write_text("#first #second #first #third", encoding="utf-8")
        chunks = chunk_file(note, vault, _default_only_rules(), modified_at=1.0)
        assert chunks[0].tags == ["#first", "#second", "#third"]


def test_chunk_file_hyphenated_hashtags():
    """A hashtag containing a hyphen is captured as a single tag."""
    with tempfile.TemporaryDirectory() as tmp:
        vault = Path(tmp)
        note = vault / "hyphen.md"
        note.write_text("#mental-health check-in", encoding="utf-8")
        chunks = chunk_file(note, vault, _default_only_rules(), modified_at=1.0)
        # Guard against ``tags`` being None (as the metadata test does) so a
        # missing tag list fails as an assertion rather than a TypeError.
        assert "#mental-health" in (chunks[0].tags or [])