fix: stable chunk_id eliminates duplicate rows on re-index
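UUID-based chunk_ids caused merge_insert to treat the same content as new rows on every re-index run. chunk_ids are now deterministic: the existing chunk_id_prefix is followed by the first 12 hex characters of SHA1("{content_hash}:{chunk_index}"), where content_hash is the truncated SHA1 of the file body — so the same section/text at the same index always produces the same chunk_id. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>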
This commit is contained in:
@@ -4,6 +4,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
import hashlib
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
@@ -162,6 +163,12 @@ def sliding_window_chunks(
|
|||||||
# ----------------------------------------------------------------------
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _stable_chunk_id(content_hash: str, chunk_index: int) -> str:
    """Return a deterministic 12-character id for one chunk.

    The id is the first 12 hex digits of the SHA-1 digest of the string
    ``"<content_hash>:<chunk_index>"``, so identical arguments always
    yield the same id.

    Args:
        content_hash: Hash string identifying the content being chunked.
        chunk_index: Position of the chunk within that content.

    Returns:
        A 12-character lowercase hexadecimal string.
    """
    digest = hashlib.sha1()
    digest.update(f"{content_hash}:{chunk_index}".encode())
    return digest.hexdigest()[:12]
|
||||||
|
|
||||||
|
|
||||||
def chunk_file(
|
def chunk_file(
|
||||||
filepath: Path,
|
filepath: Path,
|
||||||
content: str,
|
content: str,
|
||||||
@@ -188,6 +195,9 @@ def chunk_file(
|
|||||||
chunk_size = config.indexing.chunk_size
|
chunk_size = config.indexing.chunk_size
|
||||||
overlap = config.indexing.chunk_overlap
|
overlap = config.indexing.chunk_overlap
|
||||||
|
|
||||||
|
# Compute content hash for stable, content-addressable chunk_ids
|
||||||
|
content_hash = hashlib.sha1(body.encode()).hexdigest()[:12]
|
||||||
|
|
||||||
chunks: list[Chunk] = []
|
chunks: list[Chunk] = []
|
||||||
|
|
||||||
if is_structured_note(filepath):
|
if is_structured_note(filepath):
|
||||||
@@ -203,7 +213,7 @@ def chunk_file(
|
|||||||
|
|
||||||
chunk_text = section_text
|
chunk_text = section_text
|
||||||
chunk = Chunk(
|
chunk = Chunk(
|
||||||
chunk_id=f"{chunk_id_prefix}{uuid.uuid4().hex[:8]}",
|
chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
|
||||||
text=chunk_text,
|
text=chunk_text,
|
||||||
source_file=source_file,
|
source_file=source_file,
|
||||||
source_directory=source_directory,
|
source_directory=source_directory,
|
||||||
@@ -224,7 +234,7 @@ def chunk_file(
|
|||||||
if not text_chunk.strip():
|
if not text_chunk.strip():
|
||||||
continue
|
continue
|
||||||
chunk = Chunk(
|
chunk = Chunk(
|
||||||
chunk_id=f"{chunk_id_prefix}{uuid.uuid4().hex[:8]}",
|
chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
|
||||||
text=text_chunk,
|
text=text_chunk,
|
||||||
source_file=source_file,
|
source_file=source_file,
|
||||||
source_directory=source_directory,
|
source_directory=source_directory,
|
||||||
|
|||||||
Reference in New Issue
Block a user