fix: address chunker review feedback
This commit is contained in:
@@ -45,7 +45,7 @@ def sliding_window_chunks(text: str, chunk_size: int, chunk_overlap: int) -> Lis
|
||||
|
||||
def section_based_chunks(
|
||||
text: str, section_tags: List[str], chunk_size: int, chunk_overlap: int
|
||||
) -> List[tuple]:
|
||||
) -> List[tuple[str, str]]:
|
||||
lines = text.split("\n")
|
||||
sections = []
|
||||
current_tag = None
|
||||
@@ -100,7 +100,9 @@ def match_chunking_rule(
|
||||
continue
|
||||
if fnmatch.fnmatch(relative_path, pattern):
|
||||
return rule
|
||||
return rules.get("default")
|
||||
return rules.get("default") or ChunkingRule(
|
||||
strategy="sliding_window", chunk_size=500, chunk_overlap=100
|
||||
)
|
||||
|
||||
|
||||
def chunk_file(
|
||||
@@ -126,9 +128,9 @@ def chunk_file(
|
||||
)
|
||||
chunks = []
|
||||
for i, (chunk_text, section) in enumerate(raw_chunks):
|
||||
section_hashtags = re.findall(r"#\w+", chunk_text)
|
||||
section_hashtags = re.findall(r"#[\w\-]+", chunk_text)
|
||||
section_wikilinks = re.findall(r"\[\[(.*?)\]\]", chunk_text)
|
||||
section_tags = list(set(section_hashtags + section_wikilinks))
|
||||
section_tags = list(dict.fromkeys(section_hashtags + section_wikilinks))
|
||||
|
||||
chunk = Chunk(
|
||||
text=chunk_text,
|
||||
@@ -147,9 +149,9 @@ def chunk_file(
|
||||
raw_chunks = sliding_window_chunks(text, rule.chunk_size, rule.chunk_overlap)
|
||||
chunks = []
|
||||
for i, chunk_text in enumerate(raw_chunks):
|
||||
chunk_hashtags = re.findall(r"#\w+", chunk_text)
|
||||
chunk_hashtags = re.findall(r"#[\w\-]+", chunk_text)
|
||||
chunk_wikilinks = re.findall(r"\[\[(.*?)\]\]", chunk_text)
|
||||
chunk_tags = list(set(chunk_hashtags + chunk_wikilinks))
|
||||
chunk_tags = list(dict.fromkeys(chunk_hashtags + chunk_wikilinks))
|
||||
|
||||
chunk = Chunk(
|
||||
text=chunk_text,
|
||||
|
||||
Reference in New Issue
Block a user