From 83a54b2af6e14ff8437d651fbb973389038c9ba9 Mon Sep 17 00:00:00 2001
From: Santhosh Janardhanan <santhoshj@gmail.com>
Date: Sat, 11 Apr 2026 13:23:17 -0400
Subject: [PATCH] fix: stable chunk_id eliminates duplicate rows on re-index
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

UUID-based chunk_ids caused merge_insert to treat the same content as new rows
on each re-index run. Now uses SHA1("<content_hash>:<index>")[:12], where
content_hash covers the whole body — same body and index give the same chunk_id.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 python/obsidian_rag/chunker.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/python/obsidian_rag/chunker.py b/python/obsidian_rag/chunker.py
index b9fec57..7b3f948 100644
--- a/python/obsidian_rag/chunker.py
+++ b/python/obsidian_rag/chunker.py
@@ -4,6 +4,7 @@ from __future__ import annotations
 
 import re
 import unicodedata
+import hashlib
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING
@@ -162,6 +163,12 @@ def sliding_window_chunks(
 # ----------------------------------------------------------------------
 
 
+def _stable_chunk_id(content_hash: str, chunk_index: int) -> str:
+    """Return the first 12 hex chars of SHA-1 over ``content_hash:chunk_index``."""
+    raw = f"{content_hash}:{chunk_index}"
+    return hashlib.sha1(raw.encode()).hexdigest()[:12]
+
+
 def chunk_file(
     filepath: Path,
     content: str,
@@ -188,6 +195,9 @@ def chunk_file(
     chunk_size = config.indexing.chunk_size
     overlap = config.indexing.chunk_overlap
 
+    # Compute content hash for stable, content-addressable chunk_ids
+    content_hash = hashlib.sha1(body.encode()).hexdigest()[:12]
+
     chunks: list[Chunk] = []
 
     if is_structured_note(filepath):
@@ -203,7 +213,7 @@ def chunk_file(
 
             chunk_text = section_text
             chunk = Chunk(
-                chunk_id=f"{chunk_id_prefix}{uuid.uuid4().hex[:8]}",
+                chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
                 text=chunk_text,
                 source_file=source_file,
                 source_directory=source_directory,
@@ -224,7 +234,7 @@ def chunk_file(
             if not text_chunk.strip():
                 continue
             chunk = Chunk(
-                chunk_id=f"{chunk_id_prefix}{uuid.uuid4().hex[:8]}",
+                chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
                 text=text_chunk,
                 source_file=source_file,
                 source_directory=source_directory,