fix: stable chunk_id eliminates duplicate rows on re-index

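UUID-based chunk_ids caused merge_insert to treat the same content as new rows
on each re-index run. chunk_ids are now derived deterministically as the first
12 hex characters of SHA1("<content_hash>:<chunk_index>"), where content_hash is
itself a truncated SHA1 of the body — so the same body and chunk index always
produce the same chunk_id.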

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-11 13:23:17 -04:00
parent 5c281165c7
commit 83a54b2af6

View File

@@ -4,6 +4,7 @@ from __future__ import annotations
import re import re
import unicodedata import unicodedata
import hashlib
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
@@ -162,6 +163,12 @@ def sliding_window_chunks(
# ---------------------------------------------------------------------- # ----------------------------------------------------------------------
def _stable_chunk_id(content_hash: str, chunk_index: int) -> str:
    """Derive a deterministic chunk identifier.

    The content hash and the chunk index are joined with a colon, the
    resulting string is hashed with SHA-1, and the first 12 hexadecimal
    characters of the digest are returned. Identical arguments therefore
    always yield the identical identifier.
    """
    seed = "{}:{}".format(content_hash, chunk_index)
    digest = hashlib.sha1(seed.encode("utf-8")).hexdigest()
    return digest[:12]
def chunk_file( def chunk_file(
filepath: Path, filepath: Path,
content: str, content: str,
@@ -188,6 +195,9 @@ def chunk_file(
chunk_size = config.indexing.chunk_size chunk_size = config.indexing.chunk_size
overlap = config.indexing.chunk_overlap overlap = config.indexing.chunk_overlap
# Compute content hash for stable, content-addressable chunk_ids
content_hash = hashlib.sha1(body.encode()).hexdigest()[:12]
chunks: list[Chunk] = [] chunks: list[Chunk] = []
if is_structured_note(filepath): if is_structured_note(filepath):
@@ -203,7 +213,7 @@ def chunk_file(
chunk_text = section_text chunk_text = section_text
chunk = Chunk( chunk = Chunk(
chunk_id=f"{chunk_id_prefix}{uuid.uuid4().hex[:8]}", chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
text=chunk_text, text=chunk_text,
source_file=source_file, source_file=source_file,
source_directory=source_directory, source_directory=source_directory,
@@ -224,7 +234,7 @@ def chunk_file(
if not text_chunk.strip(): if not text_chunk.strip():
continue continue
chunk = Chunk( chunk = Chunk(
chunk_id=f"{chunk_id_prefix}{uuid.uuid4().hex[:8]}", chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
text=text_chunk, text=text_chunk,
source_file=source_file, source_file=source_file,
source_directory=source_directory, source_directory=source_directory,