fix(indexer): write sync-result.json in reindex() not just sync()
reindex() consumed the full_index() generator but never called _write_sync_result(), so sync-result.json was left stale even though the CLI output showed the correct indexed_files/total_chunks.
This commit is contained in:
@@ -4,8 +4,6 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import time
|
|
||||||
import uuid
|
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Any, Generator, Iterator
|
from typing import TYPE_CHECKING, Any, Generator, Iterator
|
||||||
@@ -16,10 +14,13 @@ if TYPE_CHECKING:
|
|||||||
import obsidian_rag.config as config_mod
|
import obsidian_rag.config as config_mod
|
||||||
from obsidian_rag.config import _resolve_data_dir
|
from obsidian_rag.config import _resolve_data_dir
|
||||||
from obsidian_rag.chunker import chunk_file
|
from obsidian_rag.chunker import chunk_file
|
||||||
from obsidian_rag.embedder import EmbeddingError, OllamaUnavailableError, SecurityError
|
from obsidian_rag.embedder import OllamaUnavailableError
|
||||||
from obsidian_rag.security import should_index_dir, validate_path
|
from obsidian_rag.security import should_index_dir, validate_path
|
||||||
from obsidian_rag.audit_logger import AuditLogger
|
from obsidian_rag.vector_store import (
|
||||||
from obsidian_rag.vector_store import create_table_if_not_exists, delete_by_source_file, get_db, upsert_chunks
|
create_table_if_not_exists,
|
||||||
|
get_db,
|
||||||
|
upsert_chunks,
|
||||||
|
)
|
||||||
|
|
||||||
# ----------------------------------------------------------------------
|
# ----------------------------------------------------------------------
|
||||||
# Pipeline
|
# Pipeline
|
||||||
@@ -43,6 +44,7 @@ class Indexer:
|
|||||||
def embedder(self):
|
def embedder(self):
|
||||||
if self._embedder is None:
|
if self._embedder is None:
|
||||||
from obsidian_rag.embedder import OllamaEmbedder
|
from obsidian_rag.embedder import OllamaEmbedder
|
||||||
|
|
||||||
self._embedder = OllamaEmbedder(self.config)
|
self._embedder = OllamaEmbedder(self.config)
|
||||||
return self._embedder
|
return self._embedder
|
||||||
|
|
||||||
@@ -50,6 +52,7 @@ class Indexer:
|
|||||||
def audit_logger(self):
|
def audit_logger(self):
|
||||||
if self._audit_logger is None:
|
if self._audit_logger is None:
|
||||||
from obsidian_rag.audit_logger import AuditLogger
|
from obsidian_rag.audit_logger import AuditLogger
|
||||||
|
|
||||||
log_dir = _resolve_data_dir() / "audit"
|
log_dir = _resolve_data_dir() / "audit"
|
||||||
self._audit_logger = AuditLogger(log_dir / "audit.log")
|
self._audit_logger = AuditLogger(log_dir / "audit.log")
|
||||||
return self._audit_logger
|
return self._audit_logger
|
||||||
@@ -64,9 +67,9 @@ class Indexer:
|
|||||||
|
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
sensitivity = security.detect_sensitive(
|
sensitivity = security.detect_sensitive(
|
||||||
chunk['chunk_text'],
|
chunk["chunk_text"],
|
||||||
self.config.security.sensitive_sections,
|
self.config.security.sensitive_sections,
|
||||||
self.config.memory.patterns
|
self.config.memory.patterns,
|
||||||
)
|
)
|
||||||
|
|
||||||
for category in sensitive_categories:
|
for category in sensitive_categories:
|
||||||
@@ -99,7 +102,11 @@ class Indexer:
|
|||||||
"""Index a single file. Returns (num_chunks, enriched_chunks)."""
|
"""Index a single file. Returns (num_chunks, enriched_chunks)."""
|
||||||
from obsidian_rag import security
|
from obsidian_rag import security
|
||||||
|
|
||||||
mtime = str(datetime.fromtimestamp(filepath.stat().st_mtime, tz=timezone.utc).isoformat())
|
mtime = str(
|
||||||
|
datetime.fromtimestamp(
|
||||||
|
filepath.stat().st_mtime, tz=timezone.utc
|
||||||
|
).isoformat()
|
||||||
|
)
|
||||||
content = filepath.read_text(encoding="utf-8")
|
content = filepath.read_text(encoding="utf-8")
|
||||||
# Sanitize
|
# Sanitize
|
||||||
content = security.sanitize_text(content)
|
content = security.sanitize_text(content)
|
||||||
@@ -151,18 +158,19 @@ class Indexer:
|
|||||||
# Log sensitive content access
|
# Log sensitive content access
|
||||||
for chunk in enriched:
|
for chunk in enriched:
|
||||||
from obsidian_rag import security
|
from obsidian_rag import security
|
||||||
|
|
||||||
sensitivity = security.detect_sensitive(
|
sensitivity = security.detect_sensitive(
|
||||||
chunk['chunk_text'],
|
chunk["chunk_text"],
|
||||||
self.config.security.sensitive_sections,
|
self.config.security.sensitive_sections,
|
||||||
self.config.memory.patterns
|
self.config.memory.patterns,
|
||||||
)
|
)
|
||||||
for category in ['health', 'financial', 'relations']:
|
for category in ["health", "financial", "relations"]:
|
||||||
if sensitivity.get(category, False):
|
if sensitivity.get(category, False):
|
||||||
self.audit_logger.log_sensitive_access(
|
self.audit_logger.log_sensitive_access(
|
||||||
str(chunk['source_file']),
|
str(chunk["source_file"]),
|
||||||
category,
|
category,
|
||||||
'index',
|
"index",
|
||||||
{'chunk_id': chunk['chunk_id']}
|
{"chunk_id": chunk["chunk_id"]},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Embed chunks
|
# Embed chunks
|
||||||
@@ -249,9 +257,16 @@ class Indexer:
|
|||||||
db = get_db(self.config)
|
db = get_db(self.config)
|
||||||
if "obsidian_chunks" in db.list_tables():
|
if "obsidian_chunks" in db.list_tables():
|
||||||
db.drop_table("obsidian_chunks")
|
db.drop_table("obsidian_chunks")
|
||||||
# full_index is a generator — materialize it to get the final dict
|
|
||||||
results = list(self.full_index())
|
results = list(self.full_index())
|
||||||
return results[-1] if results else {"indexed_files": 0, "total_chunks": 0, "errors": []}
|
final = (
|
||||||
|
results[-1]
|
||||||
|
if results
|
||||||
|
else {"indexed_files": 0, "total_chunks": 0, "errors": []}
|
||||||
|
)
|
||||||
|
self._write_sync_result(
|
||||||
|
final["indexed_files"], final["total_chunks"], final["errors"]
|
||||||
|
)
|
||||||
|
return final
|
||||||
|
|
||||||
def _sync_result_path(self) -> Path:
|
def _sync_result_path(self) -> Path:
|
||||||
# Use the same dev-data-dir convention as config.py
|
# Use the same dev-data-dir convention as config.py
|
||||||
@@ -313,18 +328,19 @@ class Indexer:
|
|||||||
# Log sensitive content access
|
# Log sensitive content access
|
||||||
for chunk in enriched:
|
for chunk in enriched:
|
||||||
from obsidian_rag import security
|
from obsidian_rag import security
|
||||||
|
|
||||||
sensitivity = security.detect_sensitive(
|
sensitivity = security.detect_sensitive(
|
||||||
chunk['chunk_text'],
|
chunk["chunk_text"],
|
||||||
self.config.security.sensitive_sections,
|
self.config.security.sensitive_sections,
|
||||||
self.config.memory.patterns
|
self.config.memory.patterns,
|
||||||
)
|
)
|
||||||
for category in ['health', 'financial', 'relations']:
|
for category in ["health", "financial", "relations"]:
|
||||||
if sensitivity.get(category, False):
|
if sensitivity.get(category, False):
|
||||||
self.audit_logger.log_sensitive_access(
|
self.audit_logger.log_sensitive_access(
|
||||||
str(chunk['source_file']),
|
str(chunk["source_file"]),
|
||||||
category,
|
category,
|
||||||
'index',
|
"index",
|
||||||
{'chunk_id': chunk['chunk_id']}
|
{"chunk_id": chunk["chunk_id"]},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Embed chunks
|
# Embed chunks
|
||||||
@@ -353,4 +369,3 @@ class Indexer:
|
|||||||
if not data_dir.exists() and not (project_root / "KnowledgeVault").exists():
|
if not data_dir.exists() and not (project_root / "KnowledgeVault").exists():
|
||||||
data_dir = Path(os.path.expanduser("~/.obsidian-rag"))
|
data_dir = Path(os.path.expanduser("~/.obsidian-rag"))
|
||||||
return data_dir / "sync-result.json"
|
return data_dir / "sync-result.json"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user