fix(indexer): write sync-result.json in reindex(), not just sync()
reindex() consumed the full_index() generator but never called _write_sync_result(), leaving sync-result.json stale even though the CLI output showed the correct indexed_files/total_chunks values.
This commit is contained in:
@@ -4,8 +4,6 @@ from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Generator, Iterator
|
||||
@@ -16,10 +14,13 @@ if TYPE_CHECKING:
|
||||
import obsidian_rag.config as config_mod
|
||||
from obsidian_rag.config import _resolve_data_dir
|
||||
from obsidian_rag.chunker import chunk_file
|
||||
from obsidian_rag.embedder import EmbeddingError, OllamaUnavailableError, SecurityError
|
||||
from obsidian_rag.embedder import OllamaUnavailableError
|
||||
from obsidian_rag.security import should_index_dir, validate_path
|
||||
from obsidian_rag.audit_logger import AuditLogger
|
||||
from obsidian_rag.vector_store import create_table_if_not_exists, delete_by_source_file, get_db, upsert_chunks
|
||||
from obsidian_rag.vector_store import (
|
||||
create_table_if_not_exists,
|
||||
get_db,
|
||||
upsert_chunks,
|
||||
)
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Pipeline
|
||||
@@ -43,6 +44,7 @@ class Indexer:
|
||||
def embedder(self):
|
||||
if self._embedder is None:
|
||||
from obsidian_rag.embedder import OllamaEmbedder
|
||||
|
||||
self._embedder = OllamaEmbedder(self.config)
|
||||
return self._embedder
|
||||
|
||||
@@ -50,6 +52,7 @@ class Indexer:
|
||||
def audit_logger(self):
|
||||
if self._audit_logger is None:
|
||||
from obsidian_rag.audit_logger import AuditLogger
|
||||
|
||||
log_dir = _resolve_data_dir() / "audit"
|
||||
self._audit_logger = AuditLogger(log_dir / "audit.log")
|
||||
return self._audit_logger
|
||||
@@ -57,18 +60,18 @@ class Indexer:
|
||||
def _check_sensitive_content_approval(self, chunks: list[dict[str, Any]]) -> None:
|
||||
"""Enforce user approval for sensitive content before indexing."""
|
||||
from obsidian_rag import security
|
||||
|
||||
|
||||
sensitive_categories = self.config.security.require_confirmation_for
|
||||
if not sensitive_categories:
|
||||
return
|
||||
|
||||
|
||||
for chunk in chunks:
|
||||
sensitivity = security.detect_sensitive(
|
||||
chunk['chunk_text'],
|
||||
chunk["chunk_text"],
|
||||
self.config.security.sensitive_sections,
|
||||
self.config.memory.patterns
|
||||
self.config.memory.patterns,
|
||||
)
|
||||
|
||||
|
||||
for category in sensitive_categories:
|
||||
if sensitivity.get(category, False):
|
||||
if not self.config.security.auto_approve_sensitive:
|
||||
@@ -99,7 +102,11 @@ class Indexer:
|
||||
"""Index a single file. Returns (num_chunks, enriched_chunks)."""
|
||||
from obsidian_rag import security
|
||||
|
||||
mtime = str(datetime.fromtimestamp(filepath.stat().st_mtime, tz=timezone.utc).isoformat())
|
||||
mtime = str(
|
||||
datetime.fromtimestamp(
|
||||
filepath.stat().st_mtime, tz=timezone.utc
|
||||
).isoformat()
|
||||
)
|
||||
content = filepath.read_text(encoding="utf-8")
|
||||
# Sanitize
|
||||
content = security.sanitize_text(content)
|
||||
@@ -147,24 +154,25 @@ class Indexer:
|
||||
num_chunks, enriched = self.process_file(filepath)
|
||||
# Enforce sensitive content policies
|
||||
self._check_sensitive_content_approval(enriched)
|
||||
|
||||
|
||||
# Log sensitive content access
|
||||
for chunk in enriched:
|
||||
from obsidian_rag import security
|
||||
|
||||
sensitivity = security.detect_sensitive(
|
||||
chunk['chunk_text'],
|
||||
chunk["chunk_text"],
|
||||
self.config.security.sensitive_sections,
|
||||
self.config.memory.patterns
|
||||
self.config.memory.patterns,
|
||||
)
|
||||
for category in ['health', 'financial', 'relations']:
|
||||
for category in ["health", "financial", "relations"]:
|
||||
if sensitivity.get(category, False):
|
||||
self.audit_logger.log_sensitive_access(
|
||||
str(chunk['source_file']),
|
||||
str(chunk["source_file"]),
|
||||
category,
|
||||
'index',
|
||||
{'chunk_id': chunk['chunk_id']}
|
||||
"index",
|
||||
{"chunk_id": chunk["chunk_id"]},
|
||||
)
|
||||
|
||||
|
||||
# Embed chunks
|
||||
texts = [e["chunk_text"] for e in enriched]
|
||||
try:
|
||||
@@ -249,9 +257,16 @@ class Indexer:
|
||||
db = get_db(self.config)
|
||||
if "obsidian_chunks" in db.list_tables():
|
||||
db.drop_table("obsidian_chunks")
|
||||
# full_index is a generator — materialize it to get the final dict
|
||||
results = list(self.full_index())
|
||||
return results[-1] if results else {"indexed_files": 0, "total_chunks": 0, "errors": []}
|
||||
final = (
|
||||
results[-1]
|
||||
if results
|
||||
else {"indexed_files": 0, "total_chunks": 0, "errors": []}
|
||||
)
|
||||
self._write_sync_result(
|
||||
final["indexed_files"], final["total_chunks"], final["errors"]
|
||||
)
|
||||
return final
|
||||
|
||||
def _sync_result_path(self) -> Path:
|
||||
# Use the same dev-data-dir convention as config.py
|
||||
@@ -309,24 +324,25 @@ class Indexer:
|
||||
num_chunks, enriched = self.process_file(filepath)
|
||||
# Enforce sensitive content policies
|
||||
self._check_sensitive_content_approval(enriched)
|
||||
|
||||
|
||||
# Log sensitive content access
|
||||
for chunk in enriched:
|
||||
from obsidian_rag import security
|
||||
|
||||
sensitivity = security.detect_sensitive(
|
||||
chunk['chunk_text'],
|
||||
chunk["chunk_text"],
|
||||
self.config.security.sensitive_sections,
|
||||
self.config.memory.patterns
|
||||
self.config.memory.patterns,
|
||||
)
|
||||
for category in ['health', 'financial', 'relations']:
|
||||
for category in ["health", "financial", "relations"]:
|
||||
if sensitivity.get(category, False):
|
||||
self.audit_logger.log_sensitive_access(
|
||||
str(chunk['source_file']),
|
||||
str(chunk["source_file"]),
|
||||
category,
|
||||
'index',
|
||||
{'chunk_id': chunk['chunk_id']}
|
||||
"index",
|
||||
{"chunk_id": chunk["chunk_id"]},
|
||||
)
|
||||
|
||||
|
||||
# Embed chunks
|
||||
texts = [e["chunk_text"] for e in enriched]
|
||||
try:
|
||||
@@ -353,4 +369,3 @@ class Indexer:
|
||||
if not data_dir.exists() and not (project_root / "KnowledgeVault").exists():
|
||||
data_dir = Path(os.path.expanduser("~/.obsidian-rag"))
|
||||
return data_dir / "sync-result.json"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user