feat: add markdown chunker with sliding window and section strategies
This commit is contained in:
0
src/companion/rag/__init__.py
Normal file
0
src/companion/rag/__init__.py
Normal file
168
src/companion/rag/chunker.py
Normal file
168
src/companion/rag/chunker.py
Normal file
@@ -0,0 +1,168 @@
|
|||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional, Dict
|
||||||
|
import re
|
||||||
|
import fnmatch
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Chunk:
    """One indexed unit of vault text together with its retrieval metadata."""

    # The chunk's text content (whitespace-normalized, space-joined words).
    text: str
    # Vault-relative POSIX path of the originating file.
    source_file: str = ""
    # Top-level directory of source_file ("" for files at the vault root).
    source_directory: str = ""
    # Section tag this chunk came from ("" for sliding-window chunks).
    section: str = ""
    # ISO date (YYYY-MM-DD) parsed from the file name, "" when absent.
    date: str = ""
    # Inline #hashtags and [[wikilink]] targets found in the chunk text.
    tags: List[str] = field(default_factory=list)
    # 0-based position of this chunk among its file's chunks.
    chunk_index: int = 0
    # Total number of chunks produced from the same file.
    total_chunks: int = 0
    # File modification timestamp supplied by the caller of chunk_file.
    modified_at: float = 0.0
    # Strategy string of the ChunkingRule that produced this chunk.
    rule_applied: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ChunkingRule:
    """Per-glob-pattern configuration for how matching files are chunked."""

    # "section" (requires section_tags) — any other value means sliding window.
    strategy: str
    # Window size, in words.
    chunk_size: int
    # Words shared between consecutive windows.
    chunk_overlap: int
    # Tags that open sections when strategy == "section"; None otherwise.
    section_tags: Optional[List[str]] = None
|
||||||
|
|
||||||
|
|
||||||
|
def sliding_window_chunks(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
    """Split *text* into overlapping windows of words.

    Args:
        text: Raw text; split on any whitespace into words.
        chunk_size: Maximum number of words per chunk.
        chunk_overlap: Number of words shared between consecutive chunks.

    Returns:
        List of chunk strings (words re-joined with single spaces).
        Empty list for empty or whitespace-only input.

    Raises:
        ValueError: If chunk_size <= 0, or chunk_overlap >= chunk_size —
            previously these produced a cryptic ``range() arg 3 must not be
            zero`` error or silently returned nothing, because the window
            step ``chunk_size - chunk_overlap`` was <= 0.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if chunk_overlap >= chunk_size:
        raise ValueError(
            f"chunk_overlap ({chunk_overlap}) must be smaller than "
            f"chunk_size ({chunk_size})"
        )

    words = text.split()
    if not words:
        return []

    chunks = []
    step = chunk_size - chunk_overlap
    for i in range(0, len(words), step):
        chunks.append(" ".join(words[i : i + chunk_size]))
        # Stop once the window has reached the end of the text; otherwise the
        # next iteration would emit a chunk that is a pure suffix of this one.
        if i + chunk_size >= len(words):
            break

    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def section_based_chunks(
    text: str, section_tags: List[str], chunk_size: int, chunk_overlap: int
) -> List[tuple]:
    """Split text into per-section chunks keyed by the tag that opened them.

    A line whose stripped form starts with one of *section_tags* — followed by
    end-of-line, a colon, or whitespace — opens a new section; subsequent
    untagged lines are appended to the current section. Text appearing before
    the first tag is discarded. Each section body is then sub-chunked with
    sliding_window_chunks, preserving document order.

    Bug fixed vs. the original: the prefix test used a bare ``startswith(tag)``
    with no boundary check, so a tag like ``#work`` also captured lines such
    as ``#workout: gym``. A tag now only matches when it is followed by a
    real boundary.

    Args:
        text: Full document text.
        section_tags: Tag prefixes (e.g. "#DayInShort") that open sections.
        chunk_size: Word window size passed to sliding_window_chunks.
        chunk_overlap: Word overlap passed to sliding_window_chunks.

    Returns:
        List of (chunk_text, tag) tuples; [] when no tag matched any line.
    """
    sections: List[tuple] = []
    current_tag = None
    current_text: List[str] = []

    for line in text.split("\n"):
        stripped = line.strip()
        matched_tag = None
        content = ""
        for tag in section_tags:
            if not stripped.startswith(tag):
                continue
            remainder = stripped[len(tag):]
            # Boundary check: the tag must be the whole token, i.e. followed
            # by nothing, a ":", or whitespace — not more word characters.
            if remainder and remainder[0] != ":" and not remainder[0].isspace():
                continue
            matched_tag = tag
            remainder = remainder.strip()
            if remainder.startswith(":"):
                remainder = remainder[1:].strip()
            content = remainder
            break

        if matched_tag:
            # Close the running section (if any) before opening the new one.
            if current_tag is not None:
                sections.append((current_tag, " ".join(current_text)))
            current_tag = matched_tag
            current_text = [content]
        elif current_tag is not None:
            current_text.append(stripped)

    if current_tag is not None:
        sections.append((current_tag, " ".join(current_text)))

    if not sections:
        return []

    result = []
    for tag, section_text in sections:
        for sub_chunk in sliding_window_chunks(section_text, chunk_size, chunk_overlap):
            result.append((sub_chunk, tag))
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def match_chunking_rule(
    relative_path: str, rules: Dict[str, "ChunkingRule"]
) -> Optional["ChunkingRule"]:
    """Pick the chunking rule whose glob pattern matches *relative_path*.

    Patterns are tried in dict insertion order and the first fnmatch hit
    wins, so more specific patterns should be listed before broader ones.
    The reserved key "default" is never pattern-matched; it is only the
    fallback.

    Note: fnmatch's ``*`` matches across ``/`` as well, so ``Journal/**``
    and ``Journal/*`` behave identically here.

    Returns:
        The first matching rule, else rules["default"], else None — the
        original annotation claimed a non-optional ChunkingRule, but
        ``rules.get("default")`` is None when no default entry exists.
    """
    for pattern, rule in rules.items():
        if pattern == "default":
            continue
        if fnmatch.fnmatch(relative_path, pattern):
            return rule
    return rules.get("default")
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_file(
    file_path: Path,
    vault_root: Path,
    rules: Dict[str, "ChunkingRule"],
    modified_at: float,
) -> List["Chunk"]:
    """Read one vault file and chunk it according to the first matching rule.

    Args:
        file_path: Absolute path of the file to chunk (read as UTF-8).
        vault_root: Vault root; file_path must live under it.
        rules: Glob-pattern → ChunkingRule map (see match_chunking_rule).
        modified_at: File modification timestamp to stamp on every chunk.

    Returns:
        List of Chunk objects in document order, with shared file-level
        metadata (source path, directory, date) and per-chunk tags.

    Raises:
        ValueError: If no rule pattern matches and rules has no "default"
            entry (previously this surfaced as an AttributeError on None).
    """
    relative_path = file_path.relative_to(vault_root).as_posix()
    rule = match_chunking_rule(relative_path, rules)
    if rule is None:
        raise ValueError(
            f"no chunking rule (and no 'default') matches {relative_path!r}"
        )

    text = file_path.read_text(encoding="utf-8")

    # A YYYY-MM-DD anywhere in the file name stem becomes the chunk date.
    date_match = re.search(r"(\d{4}-\d{2}-\d{2})", file_path.stem)
    date = date_match.group(1) if date_match else ""

    source_directory = relative_path.split("/")[0] if "/" in relative_path else ""

    # Normalize both strategies to (chunk_text, section) pairs so the Chunk
    # construction below — previously duplicated in full across the two
    # branches — is written exactly once.
    if rule.strategy == "section" and rule.section_tags:
        pairs = section_based_chunks(
            text, rule.section_tags, rule.chunk_size, rule.chunk_overlap
        )
    else:
        pairs = [
            (chunk_text, "")
            for chunk_text in sliding_window_chunks(
                text, rule.chunk_size, rule.chunk_overlap
            )
        ]

    chunks = []
    for i, (chunk_text, section) in enumerate(pairs):
        # Per-chunk tags: inline #hashtags plus [[wikilink]] targets, deduped.
        hashtags = re.findall(r"#\w+", chunk_text)
        wikilinks = re.findall(r"\[\[(.*?)\]\]", chunk_text)
        chunks.append(
            Chunk(
                text=chunk_text,
                source_file=relative_path,
                source_directory=source_directory,
                section=section,
                date=date,
                tags=list(set(hashtags + wikilinks)),
                chunk_index=i,
                total_chunks=len(pairs),
                modified_at=modified_at,
                rule_applied=rule.strategy,
            )
        )

    return chunks
|
||||||
41
tests/test_chunker.py
Normal file
41
tests/test_chunker.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
from companion.rag.chunker import sliding_window_chunks
|
||||||
|
|
||||||
|
|
||||||
|
def test_sliding_window_basic():
    """A 100-word text at size 20 / overlap 5 yields overlapping windows."""
    words = ["word"] * 100
    chunks = sliding_window_chunks(" ".join(words), chunk_size=20, chunk_overlap=5)

    assert len(chunks) > 1
    assert len(chunks[0].split()) == 20

    # Overlap: the trailing 5 words of a window are the leading 5 of the next.
    tail_of_first = chunks[0].split()[-5:]
    head_of_second = chunks[1].split()[:5]
    assert tail_of_first == head_of_second
|
||||||
|
# NOTE(review): these imports land mid-file, after the first test — move
# them into the import block at the top of the module per PEP 8.
from companion.rag.chunker import section_based_chunks, chunk_file, ChunkingRule
import tempfile
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def test_section_based_chunks_splits_on_tags():
    """Each tagged line opens its own section, producing one chunk per tag."""
    tags = ["#DayInShort", "#mentalhealth", "#work"]
    text = "#DayInShort: good day\n#mentalhealth: stressed\n#work: busy"

    chunks = section_based_chunks(text, tags, chunk_size=10, chunk_overlap=2)

    assert len(chunks) == 3
    # Sections come back in document order, keyed by the opening tag.
    assert [section for _, section in chunks] == tags
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_file_extracts_metadata():
    """chunk_file applies the Journal rule and fills date/directory/tags."""
    with tempfile.TemporaryDirectory() as tmp:
        vault = Path(tmp)
        note = vault / "Journal" / "2026" / "04" / "2026-04-12.md"
        note.parent.mkdir(parents=True)
        note.write_text(
            "#DayInShort: good day\n#Relations: [[Person/Vinay]] visited.",
            encoding="utf-8",
        )

        rules = {
            "default": ChunkingRule(
                strategy="sliding_window", chunk_size=500, chunk_overlap=100
            ),
            "Journal/**": ChunkingRule(
                strategy="section",
                chunk_size=300,
                chunk_overlap=50,
                section_tags=["#DayInShort", "#Relations"],
            ),
        }

        produced = chunk_file(note, vault, rules, modified_at=1234567890.0)

        assert len(produced) == 2
        assert produced[0].source_directory == "Journal"
        assert produced[0].date == "2026-04-12"
        assert "Person/Vinay" in (produced[1].tags or [])
|
||||||
Reference in New Issue
Block a user