From 83a54b2af6e14ff8437d651fbb973389038c9ba9 Mon Sep 17 00:00:00 2001 From: Santhosh Janardhanan Date: Sat, 11 Apr 2026 13:23:17 -0400 Subject: [PATCH] fix: stable chunk_id eliminates duplicate rows on re-index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UUID-based chunk_ids caused merge_insert to treat same content as new rows on each re-index run. Now uses SHA1(content_hash + index) for deterministic chunk_ids — same section/text always produces same chunk_id. Co-Authored-By: Claude Opus 4.6 --- python/obsidian_rag/chunker.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/python/obsidian_rag/chunker.py b/python/obsidian_rag/chunker.py index b9fec57..7b3f948 100644 --- a/python/obsidian_rag/chunker.py +++ b/python/obsidian_rag/chunker.py @@ -4,6 +4,7 @@ from __future__ import annotations import re import unicodedata +import hashlib from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING @@ -162,6 +163,12 @@ def sliding_window_chunks( # ---------------------------------------------------------------------- +def _stable_chunk_id(content_hash: str, chunk_index: int) -> str: + """Generate a stable chunk_id from content hash and index.""" + raw = f"{content_hash}:{chunk_index}" + return hashlib.sha1(raw.encode()).hexdigest()[:12] + + def chunk_file( filepath: Path, content: str, @@ -188,6 +195,9 @@ def chunk_file( chunk_size = config.indexing.chunk_size overlap = config.indexing.chunk_overlap + # Compute content hash for stable, content-addressable chunk_ids + content_hash = hashlib.sha1(body.encode()).hexdigest()[:12] + chunks: list[Chunk] = [] if is_structured_note(filepath): @@ -203,7 +213,7 @@ def chunk_file( chunk_text = section_text chunk = Chunk( - chunk_id=f"{chunk_id_prefix}{uuid.uuid4().hex[:8]}", + chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}", text=chunk_text, source_file=source_file, source_directory=source_directory, @@ -224,7 +234,7 @@ def chunk_file( if not text_chunk.strip(): continue chunk = Chunk( - chunk_id=f"{chunk_id_prefix}{uuid.uuid4().hex[:8]}", + chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}", text=text_chunk, source_file=source_file, source_directory=source_directory,