diff --git a/python/obsidian_rag/indexer.py b/python/obsidian_rag/indexer.py index 2da45db..11b0377 100644 --- a/python/obsidian_rag/indexer.py +++ b/python/obsidian_rag/indexer.py @@ -4,8 +4,6 @@ from __future__ import annotations import json import os -import time -import uuid from datetime import datetime, timezone from pathlib import Path from typing import TYPE_CHECKING, Any, Generator, Iterator @@ -16,10 +14,13 @@ if TYPE_CHECKING: import obsidian_rag.config as config_mod from obsidian_rag.config import _resolve_data_dir from obsidian_rag.chunker import chunk_file -from obsidian_rag.embedder import EmbeddingError, OllamaUnavailableError, SecurityError +from obsidian_rag.embedder import OllamaUnavailableError from obsidian_rag.security import should_index_dir, validate_path -from obsidian_rag.audit_logger import AuditLogger -from obsidian_rag.vector_store import create_table_if_not_exists, delete_by_source_file, get_db, upsert_chunks +from obsidian_rag.vector_store import ( + create_table_if_not_exists, + get_db, + upsert_chunks, +) # ---------------------------------------------------------------------- # Pipeline @@ -43,6 +44,7 @@ class Indexer: def embedder(self): if self._embedder is None: from obsidian_rag.embedder import OllamaEmbedder + self._embedder = OllamaEmbedder(self.config) return self._embedder @@ -50,6 +52,7 @@ class Indexer: def audit_logger(self): if self._audit_logger is None: from obsidian_rag.audit_logger import AuditLogger + log_dir = _resolve_data_dir() / "audit" self._audit_logger = AuditLogger(log_dir / "audit.log") return self._audit_logger @@ -57,18 +60,18 @@ class Indexer: def _check_sensitive_content_approval(self, chunks: list[dict[str, Any]]) -> None: """Enforce user approval for sensitive content before indexing.""" from obsidian_rag import security - + sensitive_categories = self.config.security.require_confirmation_for if not sensitive_categories: return - + for chunk in chunks: sensitivity = security.detect_sensitive( - chunk['chunk_text'], + chunk["chunk_text"], self.config.security.sensitive_sections, - self.config.memory.patterns + self.config.memory.patterns, ) - + for category in sensitive_categories: if sensitivity.get(category, False): if not self.config.security.auto_approve_sensitive: @@ -99,7 +102,11 @@ class Indexer: """Index a single file. Returns (num_chunks, enriched_chunks).""" from obsidian_rag import security - mtime = str(datetime.fromtimestamp(filepath.stat().st_mtime, tz=timezone.utc).isoformat()) + mtime = str( + datetime.fromtimestamp( + filepath.stat().st_mtime, tz=timezone.utc + ).isoformat() + ) content = filepath.read_text(encoding="utf-8") # Sanitize content = security.sanitize_text(content) @@ -147,24 +154,25 @@ class Indexer: num_chunks, enriched = self.process_file(filepath) # Enforce sensitive content policies self._check_sensitive_content_approval(enriched) - + # Log sensitive content access for chunk in enriched: from obsidian_rag import security + sensitivity = security.detect_sensitive( - chunk['chunk_text'], + chunk["chunk_text"], self.config.security.sensitive_sections, - self.config.memory.patterns + self.config.memory.patterns, ) - for category in ['health', 'financial', 'relations']: + for category in ["health", "financial", "relations"]: if sensitivity.get(category, False): self.audit_logger.log_sensitive_access( - str(chunk['source_file']), + str(chunk["source_file"]), category, - 'index', - {'chunk_id': chunk['chunk_id']} + "index", + {"chunk_id": chunk["chunk_id"]}, ) - + # Embed chunks texts = [e["chunk_text"] for e in enriched] try: @@ -249,9 +257,16 @@ class Indexer: db = get_db(self.config) if "obsidian_chunks" in db.list_tables(): db.drop_table("obsidian_chunks") - # full_index is a generator — materialize it to get the final dict results = list(self.full_index()) - return results[-1] if results else {"indexed_files": 0, "total_chunks": 0, "errors": []} + final = ( + results[-1] + if results + else {"indexed_files": 0, "total_chunks": 0, "errors": []} + ) + self._write_sync_result( + final["indexed_files"], final["total_chunks"], final["errors"] + ) + return final def _sync_result_path(self) -> Path: # Use the same dev-data-dir convention as config.py @@ -309,24 +324,25 @@ class Indexer: num_chunks, enriched = self.process_file(filepath) # Enforce sensitive content policies self._check_sensitive_content_approval(enriched) - + # Log sensitive content access for chunk in enriched: from obsidian_rag import security + sensitivity = security.detect_sensitive( - chunk['chunk_text'], + chunk["chunk_text"], self.config.security.sensitive_sections, - self.config.memory.patterns + self.config.memory.patterns, ) - for category in ['health', 'financial', 'relations']: + for category in ["health", "financial", "relations"]: if sensitivity.get(category, False): self.audit_logger.log_sensitive_access( - str(chunk['source_file']), + str(chunk["source_file"]), category, - 'index', - {'chunk_id': chunk['chunk_id']} + "index", + {"chunk_id": chunk["chunk_id"]}, ) - + # Embed chunks texts = [e["chunk_text"] for e in enriched] try: @@ -353,4 +369,3 @@ class Indexer: if not data_dir.exists() and not (project_root / "KnowledgeVault").exists(): data_dir = Path(os.path.expanduser("~/.obsidian-rag")) return data_dir / "sync-result.json" -