Security review fixes

This commit is contained in:
2026-04-11 15:11:07 -04:00
parent 0510df067d
commit 4e991c329e
7 changed files with 864 additions and 4 deletions

View File

@@ -14,9 +14,11 @@ if TYPE_CHECKING:
from obsidian_rag.config import ObsidianRagConfig
import obsidian_rag.config as config_mod
from obsidian_rag.config import _resolve_data_dir
from obsidian_rag.chunker import chunk_file
from obsidian_rag.embedder import EmbeddingError, OllamaUnavailableError
from obsidian_rag.embedder import EmbeddingError, OllamaUnavailableError, SecurityError
from obsidian_rag.security import should_index_dir, validate_path
from obsidian_rag.audit_logger import AuditLogger
from obsidian_rag.vector_store import create_table_if_not_exists, delete_by_source_file, get_db, upsert_chunks
# ----------------------------------------------------------------------
@@ -24,6 +26,10 @@ from obsidian_rag.vector_store import create_table_if_not_exists, delete_by_sour
# ----------------------------------------------------------------------
class SensitiveContentError(Exception):
    """Signal that sensitive content was detected and indexing it was not approved."""
class Indexer:
"""Coordinates the scan → chunk → embed → store pipeline."""
@@ -31,6 +37,7 @@ class Indexer:
self.config = config
self.vault_path = config_mod.resolve_vault_path(config)
self._embedder = None # lazy init
self._audit_logger = None # lazy init
@property
def embedder(self):
@@ -39,6 +46,38 @@ class Indexer:
self._embedder = OllamaEmbedder(self.config)
return self._embedder
@property
def audit_logger(self):
if self._audit_logger is None:
from obsidian_rag.audit_logger import AuditLogger
log_dir = _resolve_data_dir() / "audit"
self._audit_logger = AuditLogger(log_dir / "audit.log")
return self._audit_logger
def _check_sensitive_content_approval(self, chunks: list[dict[str, Any]]) -> None:
"""Enforce user approval for sensitive content before indexing."""
from obsidian_rag import security
sensitive_categories = self.config.security.require_confirmation_for
if not sensitive_categories:
return
for chunk in chunks:
sensitivity = security.detect_sensitive(
chunk['chunk_text'],
self.config.security.sensitive_sections,
self.config.memory.patterns
)
for category in sensitive_categories:
if sensitivity.get(category, False):
if not self.config.security.auto_approve_sensitive:
raise SensitiveContentError(
f"Sensitive {category} content detected. "
f"Requires explicit approval before indexing. "
f"File: {chunk['source_file']}"
)
def scan_vault(self) -> Generator[Path, None, None]:
"""Walk vault, yielding markdown files to index."""
for root, dirs, files in os.walk(self.vault_path):
@@ -106,6 +145,26 @@ class Indexer:
for idx, filepath in enumerate(files):
try:
num_chunks, enriched = self.process_file(filepath)
# Enforce sensitive content policies
self._check_sensitive_content_approval(enriched)
# Log sensitive content access
for chunk in enriched:
from obsidian_rag import security
sensitivity = security.detect_sensitive(
chunk['chunk_text'],
self.config.security.sensitive_sections,
self.config.memory.patterns
)
for category in ['health', 'financial', 'relations']:
if sensitivity.get(category, False):
self.audit_logger.log_sensitive_access(
str(chunk['source_file']),
category,
'index',
{'chunk_id': chunk['chunk_id']}
)
# Embed chunks
texts = [e["chunk_text"] for e in enriched]
try:
@@ -132,14 +191,13 @@ class Indexer:
"total": total_files,
}
return {
# Yield final result
yield {
"indexed_files": indexed_files,
"total_chunks": total_chunks,
"duration_ms": 0, # caller can fill
"errors": errors,
}
def sync(self, on_progress: Iterator[dict] | None = None) -> dict[str, Any]:
"""Incremental sync: only process files modified since last sync."""
sync_result_path = self._sync_result_path()
last_sync = None
@@ -221,3 +279,78 @@ class Indexer:
tmp = path.with_suffix(".json.tmp")
tmp.write_text(json.dumps(result, indent=2))
tmp.rename(path)
def sync(self, on_progress: Iterator[dict] | None = None) -> dict[str, Any]:
    """Incrementally index vault files modified since the last recorded sync.

    The previous sync time is read from the ``"timestamp"`` field of the
    JSON file at ``self._sync_result_path()``. A file is skipped when its
    modification time (UTC, ISO-8601 string) is not greater than that
    value; when no usable timestamp is available every scanned file is
    processed. Each processed file is chunked, checked against the
    sensitive-content policy, audit-logged, embedded and upserted.

    Args:
        on_progress: Accepted for interface compatibility; this
            implementation does not use it.

    Returns:
        A dict with ``"indexed_files"``, ``"total_chunks"`` and ``"errors"``
        (one ``{"file": ..., "error": ...}`` entry per file that failed).
    """
    last_sync = None
    sync_result_path = self._sync_result_path()
    if sync_result_path.exists():
        # Best effort: an unreadable or malformed result file simply means
        # "no previous sync", which triggers a full pass.
        try:
            previous = json.loads(sync_result_path.read_text())
        except (OSError, ValueError):
            previous = None
        if isinstance(previous, dict):
            last_sync = previous.get("timestamp")
        if not isinstance(last_sync, str):
            # Only a string can be compared with the ISO mtime below.
            last_sync = None

    db = get_db(self.config)
    table = create_table_if_not_exists(db)
    embedder = self.embedder
    files = list(self.scan_vault())

    indexed_files = 0
    total_chunks = 0
    errors: list[dict] = []

    for filepath in files:
        try:
            # stat() is inside the try so a file that disappears between
            # the scan and this point is recorded as an error instead of
            # aborting the whole sync.
            mtime = datetime.fromtimestamp(filepath.stat().st_mtime, tz=timezone.utc)
            # NOTE(review): ISO-8601 strings are compared lexicographically;
            # this assumes the stored timestamp uses the same UTC isoformat
            # layout - confirm against _write_sync_result.
            if last_sync and mtime.isoformat() <= last_sync:
                continue  # unchanged

            num_chunks, enriched = self.process_file(filepath)

            # Enforce sensitive content policies, then record what was found.
            self._check_sensitive_content_approval(enriched)
            self._audit_sensitive_chunks(enriched)

            texts = [e["chunk_text"] for e in enriched]
            try:
                vectors = embedder.embed_chunks(texts)
            except OllamaUnavailableError:
                # NOTE(review): chunks are stored with all-zero 1024-element
                # placeholder vectors and the file still counts as indexed -
                # confirm this is intended, since such a file presumably is
                # not re-embedded by a later sync unless it is modified.
                vectors = [[0.0] * 1024 for _ in texts]

            for e, v in zip(enriched, vectors):
                e["vector"] = v
            upsert_chunks(table, enriched)

            total_chunks += num_chunks
            indexed_files += 1
        except Exception as exc:
            # Per-file boundary: one bad file must not stop the others.
            errors.append({"file": str(filepath), "error": str(exc)})

    self._write_sync_result(indexed_files, total_chunks, errors)
    return {
        "indexed_files": indexed_files,
        "total_chunks": total_chunks,
        "errors": errors,
    }

def _audit_sensitive_chunks(self, chunks: list[dict[str, Any]]) -> None:
    """Write one audit-log entry per sensitive category detected in each chunk."""
    from obsidian_rag import security

    for chunk in chunks:
        sensitivity = security.detect_sensitive(
            chunk['chunk_text'],
            self.config.security.sensitive_sections,
            self.config.memory.patterns,
        )
        for category in ('health', 'financial', 'relations'):
            if sensitivity.get(category, False):
                self.audit_logger.log_sensitive_access(
                    str(chunk['source_file']),
                    category,
                    'index',
                    {'chunk_id': chunk['chunk_id']},
                )