From 95687fad2ee61225079026b1af496837330d9d53 Mon Sep 17 00:00:00 2001 From: Santhosh Janardhanan Date: Mon, 13 Apr 2026 14:09:56 -0400 Subject: [PATCH] fix: address chunker review feedback --- src/companion/rag/chunker.py | 14 +++--- tests/test_chunker.py | 82 ++++++++++++++++++++++++++++++++---- 2 files changed, 82 insertions(+), 14 deletions(-) diff --git a/src/companion/rag/chunker.py b/src/companion/rag/chunker.py index 0a98436..4c9c9f6 100644 --- a/src/companion/rag/chunker.py +++ b/src/companion/rag/chunker.py @@ -45,7 +45,7 @@ def sliding_window_chunks(text: str, chunk_size: int, chunk_overlap: int) -> Lis def section_based_chunks( text: str, section_tags: List[str], chunk_size: int, chunk_overlap: int -) -> List[tuple]: +) -> List[tuple[str, str]]: lines = text.split("\n") sections = [] current_tag = None @@ -100,7 +100,9 @@ def match_chunking_rule( continue if fnmatch.fnmatch(relative_path, pattern): return rule - return rules.get("default") + return rules.get("default") or ChunkingRule( + strategy="sliding_window", chunk_size=500, chunk_overlap=100 + ) def chunk_file( @@ -126,9 +128,9 @@ def chunk_file( ) chunks = [] for i, (chunk_text, section) in enumerate(raw_chunks): - section_hashtags = re.findall(r"#\w+", chunk_text) + section_hashtags = re.findall(r"#[\w\-]+", chunk_text) section_wikilinks = re.findall(r"\[\[(.*?)\]\]", chunk_text) - section_tags = list(set(section_hashtags + section_wikilinks)) + section_tags = list(dict.fromkeys(section_hashtags + section_wikilinks)) chunk = Chunk( text=chunk_text, @@ -147,9 +149,9 @@ def chunk_file( raw_chunks = sliding_window_chunks(text, rule.chunk_size, rule.chunk_overlap) chunks = [] for i, chunk_text in enumerate(raw_chunks): - chunk_hashtags = re.findall(r"#\w+", chunk_text) + chunk_hashtags = re.findall(r"#[\w\-]+", chunk_text) chunk_wikilinks = re.findall(r"\[\[(.*?)\]\]", chunk_text) - chunk_tags = list(set(chunk_hashtags + chunk_wikilinks)) + chunk_tags = list(dict.fromkeys(chunk_hashtags + chunk_wikilinks)) chunk = Chunk( text=chunk_text, diff --git a/tests/test_chunker.py b/tests/test_chunker.py index 131829c..9f41a02 100644 --- a/tests/test_chunker.py +++ b/tests/test_chunker.py @@ -1,4 +1,12 @@ -from companion.rag.chunker import sliding_window_chunks +import tempfile +from pathlib import Path + +from companion.rag.chunker import ( + sliding_window_chunks, + section_based_chunks, + chunk_file, + ChunkingRule, +) def test_sliding_window_basic(): @@ -10,14 +18,13 @@ def test_sliding_window_basic(): last_five = chunks[0].split()[-5:] first_chunk1 = chunks[1].split()[:5] assert last_five == first_chunk1 -from companion.rag.chunker import section_based_chunks, chunk_file, ChunkingRule -import tempfile -from pathlib import Path def test_section_based_chunks_splits_on_tags(): text = "#DayInShort: good day\n#mentalhealth: stressed\n#work: busy" - chunks = section_based_chunks(text, ["#DayInShort", "#mentalhealth", "#work"], chunk_size=10, chunk_overlap=2) + chunks = section_based_chunks( + text, ["#DayInShort", "#mentalhealth", "#work"], chunk_size=10, chunk_overlap=2 + ) assert len(chunks) == 3 assert chunks[0][1] == "#DayInShort" assert chunks[1][1] == "#mentalhealth" @@ -29,13 +36,72 @@ def test_chunk_file_extracts_metadata(): vault = Path(tmp) journal = vault / "Journal" / "2026" / "04" / "2026-04-12.md" journal.parent.mkdir(parents=True) - journal.write_text("#DayInShort: good day\n#Relations: [[Person/Vinay]] visited.", encoding="utf-8") + journal.write_text( + "#DayInShort: good day\n#Relations: [[Person/Vinay]] visited.", + encoding="utf-8", + ) rules = { - "default": ChunkingRule(strategy="sliding_window", chunk_size=500, chunk_overlap=100), - "Journal/**": ChunkingRule(strategy="section", chunk_size=300, chunk_overlap=50, section_tags=["#DayInShort", "#Relations"]), + "default": ChunkingRule( + strategy="sliding_window", chunk_size=500, chunk_overlap=100 + ), + "Journal/**": ChunkingRule( + strategy="section", + chunk_size=300, + chunk_overlap=50, + section_tags=["#DayInShort", "#Relations"], + ), } chunks = chunk_file(journal, vault, rules, modified_at=1234567890.0) assert len(chunks) == 2 assert chunks[0].source_directory == "Journal" assert chunks[0].date == "2026-04-12" assert "Person/Vinay" in (chunks[1].tags or []) + + +def test_sliding_window_empty_input(): + chunks = sliding_window_chunks("", chunk_size=20, chunk_overlap=5) + assert chunks == [] + + +def test_chunk_file_root_level_file(): + with tempfile.TemporaryDirectory() as tmp: + vault = Path(tmp) + note = vault / "note.md" + note.write_text("hello world", encoding="utf-8") + rules = { + "default": ChunkingRule( + strategy="sliding_window", chunk_size=500, chunk_overlap=100 + ) + } + chunks = chunk_file(note, vault, rules, modified_at=1.0) + assert len(chunks) == 1 + assert chunks[0].source_directory == "" + assert chunks[0].source_file == "note.md" + + +def test_chunk_file_preserves_tag_order(): + with tempfile.TemporaryDirectory() as tmp: + vault = Path(tmp) + note = vault / "tags.md" + note.write_text("#first #second #first #third", encoding="utf-8") + rules = { + "default": ChunkingRule( + strategy="sliding_window", chunk_size=500, chunk_overlap=100 + ) + } + chunks = chunk_file(note, vault, rules, modified_at=1.0) + assert chunks[0].tags == ["#first", "#second", "#third"] + + +def test_chunk_file_hyphenated_hashtags(): + with tempfile.TemporaryDirectory() as tmp: + vault = Path(tmp) + note = vault / "hyphen.md" + note.write_text("#mental-health check-in", encoding="utf-8") + rules = { + "default": ChunkingRule( + strategy="sliding_window", chunk_size=500, chunk_overlap=100 + ) + } + chunks = chunk_file(note, vault, rules, modified_at=1.0) + assert "#mental-health" in chunks[0].tags