fix: address chunker review feedback
This commit is contained in:
@@ -45,7 +45,7 @@ def sliding_window_chunks(text: str, chunk_size: int, chunk_overlap: int) -> Lis
|
|||||||
|
|
||||||
def section_based_chunks(
|
def section_based_chunks(
|
||||||
text: str, section_tags: List[str], chunk_size: int, chunk_overlap: int
|
text: str, section_tags: List[str], chunk_size: int, chunk_overlap: int
|
||||||
) -> List[tuple]:
|
) -> List[tuple[str, str]]:
|
||||||
lines = text.split("\n")
|
lines = text.split("\n")
|
||||||
sections = []
|
sections = []
|
||||||
current_tag = None
|
current_tag = None
|
||||||
@@ -100,7 +100,9 @@ def match_chunking_rule(
|
|||||||
continue
|
continue
|
||||||
if fnmatch.fnmatch(relative_path, pattern):
|
if fnmatch.fnmatch(relative_path, pattern):
|
||||||
return rule
|
return rule
|
||||||
return rules.get("default")
|
return rules.get("default") or ChunkingRule(
|
||||||
|
strategy="sliding_window", chunk_size=500, chunk_overlap=100
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def chunk_file(
|
def chunk_file(
|
||||||
@@ -126,9 +128,9 @@ def chunk_file(
|
|||||||
)
|
)
|
||||||
chunks = []
|
chunks = []
|
||||||
for i, (chunk_text, section) in enumerate(raw_chunks):
|
for i, (chunk_text, section) in enumerate(raw_chunks):
|
||||||
section_hashtags = re.findall(r"#\w+", chunk_text)
|
section_hashtags = re.findall(r"#[\w\-]+", chunk_text)
|
||||||
section_wikilinks = re.findall(r"\[\[(.*?)\]\]", chunk_text)
|
section_wikilinks = re.findall(r"\[\[(.*?)\]\]", chunk_text)
|
||||||
section_tags = list(set(section_hashtags + section_wikilinks))
|
section_tags = list(dict.fromkeys(section_hashtags + section_wikilinks))
|
||||||
|
|
||||||
chunk = Chunk(
|
chunk = Chunk(
|
||||||
text=chunk_text,
|
text=chunk_text,
|
||||||
@@ -147,9 +149,9 @@ def chunk_file(
|
|||||||
raw_chunks = sliding_window_chunks(text, rule.chunk_size, rule.chunk_overlap)
|
raw_chunks = sliding_window_chunks(text, rule.chunk_size, rule.chunk_overlap)
|
||||||
chunks = []
|
chunks = []
|
||||||
for i, chunk_text in enumerate(raw_chunks):
|
for i, chunk_text in enumerate(raw_chunks):
|
||||||
chunk_hashtags = re.findall(r"#\w+", chunk_text)
|
chunk_hashtags = re.findall(r"#[\w\-]+", chunk_text)
|
||||||
chunk_wikilinks = re.findall(r"\[\[(.*?)\]\]", chunk_text)
|
chunk_wikilinks = re.findall(r"\[\[(.*?)\]\]", chunk_text)
|
||||||
chunk_tags = list(set(chunk_hashtags + chunk_wikilinks))
|
chunk_tags = list(dict.fromkeys(chunk_hashtags + chunk_wikilinks))
|
||||||
|
|
||||||
chunk = Chunk(
|
chunk = Chunk(
|
||||||
text=chunk_text,
|
text=chunk_text,
|
||||||
|
|||||||
@@ -1,4 +1,12 @@
|
|||||||
from companion.rag.chunker import sliding_window_chunks
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from companion.rag.chunker import (
|
||||||
|
sliding_window_chunks,
|
||||||
|
section_based_chunks,
|
||||||
|
chunk_file,
|
||||||
|
ChunkingRule,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_sliding_window_basic():
|
def test_sliding_window_basic():
|
||||||
@@ -10,14 +18,13 @@ def test_sliding_window_basic():
|
|||||||
last_five = chunks[0].split()[-5:]
|
last_five = chunks[0].split()[-5:]
|
||||||
first_chunk1 = chunks[1].split()[:5]
|
first_chunk1 = chunks[1].split()[:5]
|
||||||
assert last_five == first_chunk1
|
assert last_five == first_chunk1
|
||||||
from companion.rag.chunker import section_based_chunks, chunk_file, ChunkingRule
|
|
||||||
import tempfile
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
def test_section_based_chunks_splits_on_tags():
|
def test_section_based_chunks_splits_on_tags():
|
||||||
text = "#DayInShort: good day\n#mentalhealth: stressed\n#work: busy"
|
text = "#DayInShort: good day\n#mentalhealth: stressed\n#work: busy"
|
||||||
chunks = section_based_chunks(text, ["#DayInShort", "#mentalhealth", "#work"], chunk_size=10, chunk_overlap=2)
|
chunks = section_based_chunks(
|
||||||
|
text, ["#DayInShort", "#mentalhealth", "#work"], chunk_size=10, chunk_overlap=2
|
||||||
|
)
|
||||||
assert len(chunks) == 3
|
assert len(chunks) == 3
|
||||||
assert chunks[0][1] == "#DayInShort"
|
assert chunks[0][1] == "#DayInShort"
|
||||||
assert chunks[1][1] == "#mentalhealth"
|
assert chunks[1][1] == "#mentalhealth"
|
||||||
@@ -29,13 +36,72 @@ def test_chunk_file_extracts_metadata():
|
|||||||
vault = Path(tmp)
|
vault = Path(tmp)
|
||||||
journal = vault / "Journal" / "2026" / "04" / "2026-04-12.md"
|
journal = vault / "Journal" / "2026" / "04" / "2026-04-12.md"
|
||||||
journal.parent.mkdir(parents=True)
|
journal.parent.mkdir(parents=True)
|
||||||
journal.write_text("#DayInShort: good day\n#Relations: [[Person/Vinay]] visited.", encoding="utf-8")
|
journal.write_text(
|
||||||
|
"#DayInShort: good day\n#Relations: [[Person/Vinay]] visited.",
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
rules = {
|
rules = {
|
||||||
"default": ChunkingRule(strategy="sliding_window", chunk_size=500, chunk_overlap=100),
|
"default": ChunkingRule(
|
||||||
"Journal/**": ChunkingRule(strategy="section", chunk_size=300, chunk_overlap=50, section_tags=["#DayInShort", "#Relations"]),
|
strategy="sliding_window", chunk_size=500, chunk_overlap=100
|
||||||
|
),
|
||||||
|
"Journal/**": ChunkingRule(
|
||||||
|
strategy="section",
|
||||||
|
chunk_size=300,
|
||||||
|
chunk_overlap=50,
|
||||||
|
section_tags=["#DayInShort", "#Relations"],
|
||||||
|
),
|
||||||
}
|
}
|
||||||
chunks = chunk_file(journal, vault, rules, modified_at=1234567890.0)
|
chunks = chunk_file(journal, vault, rules, modified_at=1234567890.0)
|
||||||
assert len(chunks) == 2
|
assert len(chunks) == 2
|
||||||
assert chunks[0].source_directory == "Journal"
|
assert chunks[0].source_directory == "Journal"
|
||||||
assert chunks[0].date == "2026-04-12"
|
assert chunks[0].date == "2026-04-12"
|
||||||
assert "Person/Vinay" in (chunks[1].tags or [])
|
assert "Person/Vinay" in (chunks[1].tags or [])
|
||||||
|
|
||||||
|
|
||||||
|
def test_sliding_window_empty_input():
    """Check that chunking an empty string gives back an empty list."""
    result = sliding_window_chunks("", chunk_size=20, chunk_overlap=5)
    assert result == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_file_root_level_file():
    """Check the metadata of a note stored directly in the vault root.

    The note is expected to yield exactly one chunk whose source directory
    is the empty string and whose source file is the bare file name.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        vault_root = Path(tmp_dir)
        note_path = vault_root / "note.md"
        note_path.write_text("hello world", encoding="utf-8")

        # Only a default rule is supplied, so it is the one that applies.
        default_rule = ChunkingRule(
            strategy="sliding_window", chunk_size=500, chunk_overlap=100
        )
        result = chunk_file(
            note_path, vault_root, {"default": default_rule}, modified_at=1.0
        )

        assert len(result) == 1
        only_chunk = result[0]
        assert only_chunk.source_directory == ""
        assert only_chunk.source_file == "note.md"
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_file_preserves_tag_order():
    """Check that extracted tags are de-duplicated in first-seen order."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        vault_root = Path(tmp_dir)
        note_path = vault_root / "tags.md"
        # "#first" appears twice; only its first occurrence should be kept.
        note_path.write_text("#first #second #first #third", encoding="utf-8")

        default_rule = ChunkingRule(
            strategy="sliding_window", chunk_size=500, chunk_overlap=100
        )
        result = chunk_file(
            note_path, vault_root, {"default": default_rule}, modified_at=1.0
        )

        expected_tags = ["#first", "#second", "#third"]
        assert result[0].tags == expected_tags
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_file_hyphenated_hashtags():
    """Check that a hashtag containing a hyphen is captured as a single tag."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        vault_root = Path(tmp_dir)
        note_path = vault_root / "hyphen.md"
        note_path.write_text("#mental-health check-in", encoding="utf-8")

        default_rule = ChunkingRule(
            strategy="sliding_window", chunk_size=500, chunk_overlap=100
        )
        result = chunk_file(
            note_path, vault_root, {"default": default_rule}, modified_at=1.0
        )

        first_chunk_tags = result[0].tags
        assert "#mental-health" in first_chunk_tags
|
||||||
|
|||||||
Reference in New Issue
Block a user