fix(indexer): write sync-result.json in reindex() not just sync()

reindex() was consuming the full_index() generator but never calling
_write_sync_result(), leaving sync-result.json stale while CLI output
showed correct indexed_files/total_chunks.
2026-04-12 01:49:43 -04:00
parent 9d919dc237
commit 4ab504e87c


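The core of the change, as a minimal standalone sketch (the table reset at the start of reindex() and the rest of the Indexer class are elided; the method names and result-dict keys are taken from the diff below):

    def reindex(self) -> dict:
        # full_index() yields progress dicts; the last one carries the final totals.
        results = list(self.full_index())
        final = (
            results[-1]
            if results
            else {"indexed_files": 0, "total_chunks": 0, "errors": []}
        )
        # The previously missing step: persist the totals so sync-result.json
        # matches the numbers reported by the CLI.
        self._write_sync_result(
            final["indexed_files"], final["total_chunks"], final["errors"]
        )
        return final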
@@ -4,8 +4,6 @@ from __future__ import annotations
 import json
 import os
-import time
-import uuid
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Iterator
@@ -16,10 +14,13 @@ if TYPE_CHECKING:
 import obsidian_rag.config as config_mod
 from obsidian_rag.config import _resolve_data_dir
 from obsidian_rag.chunker import chunk_file
-from obsidian_rag.embedder import EmbeddingError, OllamaUnavailableError, SecurityError
+from obsidian_rag.embedder import OllamaUnavailableError
 from obsidian_rag.security import should_index_dir, validate_path
 from obsidian_rag.audit_logger import AuditLogger
-from obsidian_rag.vector_store import create_table_if_not_exists, delete_by_source_file, get_db, upsert_chunks
+from obsidian_rag.vector_store import (
+    create_table_if_not_exists,
+    get_db,
+    upsert_chunks,
+)
 
 # ----------------------------------------------------------------------
 # Pipeline
@@ -43,6 +44,7 @@ class Indexer:
     def embedder(self):
         if self._embedder is None:
             from obsidian_rag.embedder import OllamaEmbedder
+
             self._embedder = OllamaEmbedder(self.config)
         return self._embedder
@@ -50,6 +52,7 @@ class Indexer:
     def audit_logger(self):
         if self._audit_logger is None:
             from obsidian_rag.audit_logger import AuditLogger
+
             log_dir = _resolve_data_dir() / "audit"
             self._audit_logger = AuditLogger(log_dir / "audit.log")
         return self._audit_logger
@@ -57,18 +60,18 @@ class Indexer:
     def _check_sensitive_content_approval(self, chunks: list[dict[str, Any]]) -> None:
         """Enforce user approval for sensitive content before indexing."""
         from obsidian_rag import security
 
         sensitive_categories = self.config.security.require_confirmation_for
         if not sensitive_categories:
             return
 
         for chunk in chunks:
             sensitivity = security.detect_sensitive(
-                chunk['chunk_text'],
+                chunk["chunk_text"],
                 self.config.security.sensitive_sections,
-                self.config.memory.patterns
+                self.config.memory.patterns,
             )
             for category in sensitive_categories:
                 if sensitivity.get(category, False):
                     if not self.config.security.auto_approve_sensitive:
@@ -99,7 +102,11 @@ class Indexer:
         """Index a single file. Returns (num_chunks, enriched_chunks)."""
         from obsidian_rag import security
 
-        mtime = str(datetime.fromtimestamp(filepath.stat().st_mtime, tz=timezone.utc).isoformat())
+        mtime = str(
+            datetime.fromtimestamp(
+                filepath.stat().st_mtime, tz=timezone.utc
+            ).isoformat()
+        )
         content = filepath.read_text(encoding="utf-8")
 
         # Sanitize
         content = security.sanitize_text(content)
@@ -147,24 +154,25 @@ class Indexer:
             num_chunks, enriched = self.process_file(filepath)
 
             # Enforce sensitive content policies
             self._check_sensitive_content_approval(enriched)
 
             # Log sensitive content access
             for chunk in enriched:
                 from obsidian_rag import security
+
                 sensitivity = security.detect_sensitive(
-                    chunk['chunk_text'],
+                    chunk["chunk_text"],
                     self.config.security.sensitive_sections,
-                    self.config.memory.patterns
+                    self.config.memory.patterns,
                 )
-                for category in ['health', 'financial', 'relations']:
+                for category in ["health", "financial", "relations"]:
                     if sensitivity.get(category, False):
                         self.audit_logger.log_sensitive_access(
-                            str(chunk['source_file']),
+                            str(chunk["source_file"]),
                             category,
-                            'index',
-                            {'chunk_id': chunk['chunk_id']}
+                            "index",
+                            {"chunk_id": chunk["chunk_id"]},
                         )
 
             # Embed chunks
             texts = [e["chunk_text"] for e in enriched]
             try:
@@ -249,9 +257,16 @@ class Indexer:
         db = get_db(self.config)
         if "obsidian_chunks" in db.list_tables():
             db.drop_table("obsidian_chunks")
 
         # full_index is a generator — materialize it to get the final dict
         results = list(self.full_index())
-        return results[-1] if results else {"indexed_files": 0, "total_chunks": 0, "errors": []}
+        final = (
+            results[-1]
+            if results
+            else {"indexed_files": 0, "total_chunks": 0, "errors": []}
+        )
+        self._write_sync_result(
+            final["indexed_files"], final["total_chunks"], final["errors"]
+        )
+        return final
 
     def _sync_result_path(self) -> Path:
         # Use the same dev-data-dir convention as config.py
@@ -309,24 +324,25 @@ class Indexer:
             num_chunks, enriched = self.process_file(filepath)
 
             # Enforce sensitive content policies
             self._check_sensitive_content_approval(enriched)
 
             # Log sensitive content access
             for chunk in enriched:
                 from obsidian_rag import security
+
                 sensitivity = security.detect_sensitive(
-                    chunk['chunk_text'],
+                    chunk["chunk_text"],
                     self.config.security.sensitive_sections,
-                    self.config.memory.patterns
+                    self.config.memory.patterns,
                 )
-                for category in ['health', 'financial', 'relations']:
+                for category in ["health", "financial", "relations"]:
                     if sensitivity.get(category, False):
                         self.audit_logger.log_sensitive_access(
-                            str(chunk['source_file']),
+                            str(chunk["source_file"]),
                             category,
-                            'index',
-                            {'chunk_id': chunk['chunk_id']}
+                            "index",
+                            {"chunk_id": chunk["chunk_id"]},
                         )
 
             # Embed chunks
             texts = [e["chunk_text"] for e in enriched]
             try:
@@ -353,4 +369,3 @@ class Indexer:
         if not data_dir.exists() and not (project_root / "KnowledgeVault").exists():
             data_dir = Path(os.path.expanduser("~/.obsidian-rag"))
         return data_dir / "sync-result.json"
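
A quick end-to-end check of the new behavior (a sketch only: the module paths, the Indexer constructor, load_config(), and the exact JSON keys written by _write_sync_result() are assumptions, not shown in this diff):

    import json

    from obsidian_rag.config import load_config   # hypothetical config loader
    from obsidian_rag.indexer import Indexer      # module path assumed

    config = load_config()
    indexer = Indexer(config)

    result = indexer.reindex()

    # After the fix, the on-disk sync result should mirror what reindex() returned,
    # assuming _write_sync_result() stores the totals under these key names.
    on_disk = json.loads(indexer._sync_result_path().read_text())
    assert on_disk["indexed_files"] == result["indexed_files"]
    assert on_disk["total_chunks"] == result["total_chunks"]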