Security review fixes
This commit is contained in:
@@ -14,9 +14,11 @@ if TYPE_CHECKING:
|
||||
from obsidian_rag.config import ObsidianRagConfig
|
||||
|
||||
import obsidian_rag.config as config_mod
|
||||
from obsidian_rag.config import _resolve_data_dir
|
||||
from obsidian_rag.chunker import chunk_file
|
||||
from obsidian_rag.embedder import EmbeddingError, OllamaUnavailableError
|
||||
from obsidian_rag.embedder import EmbeddingError, OllamaUnavailableError, SecurityError
|
||||
from obsidian_rag.security import should_index_dir, validate_path
|
||||
from obsidian_rag.audit_logger import AuditLogger
|
||||
from obsidian_rag.vector_store import create_table_if_not_exists, delete_by_source_file, get_db, upsert_chunks
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
@@ -24,6 +26,10 @@ from obsidian_rag.vector_store import create_table_if_not_exists, delete_by_sour
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
class SensitiveContentError(Exception):
    """Signals sensitive content that lacks the approval required for indexing.

    The indexer raises this when a chunk matches a category listed in
    ``config.security.require_confirmation_for`` and automatic approval
    (``auto_approve_sensitive``) is disabled.
    """
|
||||
|
||||
|
||||
class Indexer:
|
||||
"""Coordinates the scan → chunk → embed → store pipeline."""
|
||||
|
||||
@@ -31,6 +37,7 @@ class Indexer:
|
||||
self.config = config
|
||||
self.vault_path = config_mod.resolve_vault_path(config)
|
||||
self._embedder = None # lazy init
|
||||
self._audit_logger = None # lazy init
|
||||
|
||||
@property
|
||||
def embedder(self):
|
||||
@@ -39,6 +46,38 @@ class Indexer:
|
||||
self._embedder = OllamaEmbedder(self.config)
|
||||
return self._embedder
|
||||
|
||||
@property
def audit_logger(self):
    """Lazily constructed AuditLogger writing to <data-dir>/audit/audit.log."""
    if self._audit_logger is not None:
        return self._audit_logger
    # Deferred import keeps module load light; the log lives under the
    # same data directory that config.py resolves.
    from obsidian_rag.audit_logger import AuditLogger
    self._audit_logger = AuditLogger(_resolve_data_dir() / "audit" / "audit.log")
    return self._audit_logger
|
||||
|
||||
def _check_sensitive_content_approval(self, chunks: list[dict[str, Any]]) -> None:
|
||||
"""Enforce user approval for sensitive content before indexing."""
|
||||
from obsidian_rag import security
|
||||
|
||||
sensitive_categories = self.config.security.require_confirmation_for
|
||||
if not sensitive_categories:
|
||||
return
|
||||
|
||||
for chunk in chunks:
|
||||
sensitivity = security.detect_sensitive(
|
||||
chunk['chunk_text'],
|
||||
self.config.security.sensitive_sections,
|
||||
self.config.memory.patterns
|
||||
)
|
||||
|
||||
for category in sensitive_categories:
|
||||
if sensitivity.get(category, False):
|
||||
if not self.config.security.auto_approve_sensitive:
|
||||
raise SensitiveContentError(
|
||||
f"Sensitive {category} content detected. "
|
||||
f"Requires explicit approval before indexing. "
|
||||
f"File: {chunk['source_file']}"
|
||||
)
|
||||
|
||||
def scan_vault(self) -> Generator[Path, None, None]:
|
||||
"""Walk vault, yielding markdown files to index."""
|
||||
for root, dirs, files in os.walk(self.vault_path):
|
||||
@@ -106,6 +145,26 @@ class Indexer:
|
||||
for idx, filepath in enumerate(files):
|
||||
try:
|
||||
num_chunks, enriched = self.process_file(filepath)
|
||||
# Enforce sensitive content policies
|
||||
self._check_sensitive_content_approval(enriched)
|
||||
|
||||
# Log sensitive content access
|
||||
for chunk in enriched:
|
||||
from obsidian_rag import security
|
||||
sensitivity = security.detect_sensitive(
|
||||
chunk['chunk_text'],
|
||||
self.config.security.sensitive_sections,
|
||||
self.config.memory.patterns
|
||||
)
|
||||
for category in ['health', 'financial', 'relations']:
|
||||
if sensitivity.get(category, False):
|
||||
self.audit_logger.log_sensitive_access(
|
||||
str(chunk['source_file']),
|
||||
category,
|
||||
'index',
|
||||
{'chunk_id': chunk['chunk_id']}
|
||||
)
|
||||
|
||||
# Embed chunks
|
||||
texts = [e["chunk_text"] for e in enriched]
|
||||
try:
|
||||
@@ -132,14 +191,13 @@ class Indexer:
|
||||
"total": total_files,
|
||||
}
|
||||
|
||||
return {
|
||||
# Yield final result
|
||||
yield {
|
||||
"indexed_files": indexed_files,
|
||||
"total_chunks": total_chunks,
|
||||
"duration_ms": 0, # caller can fill
|
||||
"errors": errors,
|
||||
}
|
||||
|
||||
def sync(self, on_progress: Iterator[dict] | None = None) -> dict[str, Any]:
|
||||
"""Incremental sync: only process files modified since last sync."""
|
||||
sync_result_path = self._sync_result_path()
|
||||
last_sync = None
|
||||
@@ -221,3 +279,78 @@ class Indexer:
|
||||
tmp = path.with_suffix(".json.tmp")
|
||||
tmp.write_text(json.dumps(result, indent=2))
|
||||
tmp.rename(path)
|
||||
|
||||
def sync(self, on_progress: Iterator[dict] | None = None) -> dict[str, Any]:
    """Incremental sync: only process files modified since last sync.

    Reads the previous run's timestamp from the sync-result file and
    re-indexes only vault files whose mtime is newer. Sensitive-content
    policy is enforced per file, access to sensitive chunks is
    audit-logged, and chunks are embedded and upserted into the vector
    store. Per-file failures are collected rather than aborting the run.

    Args:
        on_progress: Accepted for interface compatibility; currently
            unused in this method.

    Returns:
        Summary dict with ``indexed_files``, ``total_chunks`` and
        ``errors`` keys.
    """
    # Fix: the import used to be re-executed inside the per-chunk loop.
    from obsidian_rag import security

    sync_result_path = self._sync_result_path()
    last_sync = None
    if sync_result_path.exists():
        try:
            last_sync = json.loads(sync_result_path.read_text()).get("timestamp")
        except Exception:
            # Corrupt or unreadable sync state: fall back to a full re-index.
            pass

    db = get_db(self.config)
    table = create_table_if_not_exists(db)
    embedder = self.embedder

    files = list(self.scan_vault())
    indexed_files = 0
    total_chunks = 0
    errors: list[dict] = []

    for filepath in files:
        try:
            # Fix: stat() ran outside the error handler, so a file deleted
            # mid-scan aborted the whole sync instead of being recorded as
            # a per-file error.
            mtime = datetime.fromtimestamp(filepath.stat().st_mtime, tz=timezone.utc)
        except OSError as exc:
            errors.append({"file": str(filepath), "error": str(exc)})
            continue
        # ISO-8601 strings with the same UTC offset compare correctly as text.
        if last_sync and mtime.isoformat() <= last_sync:
            continue  # unchanged since last sync

        try:
            num_chunks, enriched = self.process_file(filepath)
            # Enforce sensitive content policies before anything is stored.
            self._check_sensitive_content_approval(enriched)

            # Audit-log every sensitive chunk we are about to index.
            for chunk in enriched:
                sensitivity = security.detect_sensitive(
                    chunk['chunk_text'],
                    self.config.security.sensitive_sections,
                    self.config.memory.patterns,
                )
                for category in ['health', 'financial', 'relations']:
                    if sensitivity.get(category, False):
                        self.audit_logger.log_sensitive_access(
                            str(chunk['source_file']),
                            category,
                            'index',
                            {'chunk_id': chunk['chunk_id']},
                        )

            # Embed chunks; degrade to zero vectors when Ollama is down so
            # chunks are still stored and can be re-embedded later.
            texts = [e["chunk_text"] for e in enriched]
            try:
                vectors = embedder.embed_chunks(texts)
            except OllamaUnavailableError:
                # NOTE(review): 1024 looks like the embedding dimension —
                # confirm it matches the configured model.
                vectors = [[0.0] * 1024 for _ in texts]
            for e, v in zip(enriched, vectors):
                e["vector"] = v
            upsert_chunks(table, enriched)
            total_chunks += num_chunks
            indexed_files += 1
        except Exception as exc:
            # Per-file failures (including SensitiveContentError) are
            # collected so one bad file cannot abort the sync.
            errors.append({"file": str(filepath), "error": str(exc)})

    self._write_sync_result(indexed_files, total_chunks, errors)
    return {
        "indexed_files": indexed_files,
        "total_chunks": total_chunks,
        "errors": errors,
    }
|
||||
# Use the same dev-data-dir convention as config.py
|
||||
project_root = Path(__file__).parent.parent.parent
|
||||
data_dir = project_root / "obsidian-rag"
|
||||
if not data_dir.exists() and not (project_root / "KnowledgeVault").exists():
|
||||
data_dir = Path(os.path.expanduser("~/.obsidian-rag"))
|
||||
return data_dir / "sync-result.json"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user