Security review fixes
This commit is contained in:
@@ -14,9 +14,11 @@ if TYPE_CHECKING:
|
||||
from obsidian_rag.config import ObsidianRagConfig
|
||||
|
||||
import obsidian_rag.config as config_mod
|
||||
from obsidian_rag.config import _resolve_data_dir
|
||||
from obsidian_rag.chunker import chunk_file
|
||||
from obsidian_rag.embedder import EmbeddingError, OllamaUnavailableError
|
||||
from obsidian_rag.embedder import EmbeddingError, OllamaUnavailableError, SecurityError
|
||||
from obsidian_rag.security import should_index_dir, validate_path
|
||||
from obsidian_rag.audit_logger import AuditLogger
|
||||
from obsidian_rag.vector_store import create_table_if_not_exists, delete_by_source_file, get_db, upsert_chunks
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
@@ -24,6 +26,10 @@ from obsidian_rag.vector_store import create_table_if_not_exists, delete_by_sour
|
||||
# ----------------------------------------------------------------------
|
||||
|
||||
|
||||
class SensitiveContentError(Exception):
    """Signals sensitive content that lacks the approval required for indexing.

    The indexer raises this when a chunk matches a category listed in
    ``config.security.require_confirmation_for`` and automatic approval
    (``auto_approve_sensitive``) is disabled.
    """
|
||||
|
||||
|
||||
class Indexer:
|
||||
"""Coordinates the scan → chunk → embed → store pipeline."""
|
||||
|
||||
@@ -31,6 +37,7 @@ class Indexer:
|
||||
self.config = config
|
||||
self.vault_path = config_mod.resolve_vault_path(config)
|
||||
self._embedder = None # lazy init
|
||||
self._audit_logger = None # lazy init
|
||||
|
||||
@property
|
||||
def embedder(self):
|
||||
@@ -39,6 +46,38 @@ class Indexer:
|
||||
self._embedder = OllamaEmbedder(self.config)
|
||||
return self._embedder
|
||||
|
||||
@property
def audit_logger(self):
    """Lazily constructed AuditLogger writing to <data-dir>/audit/audit.log."""
    if self._audit_logger is not None:
        return self._audit_logger
    # Deferred import keeps module load light; the log lives under the
    # same data directory that config.py resolves.
    from obsidian_rag.audit_logger import AuditLogger
    self._audit_logger = AuditLogger(_resolve_data_dir() / "audit" / "audit.log")
    return self._audit_logger
|
||||
|
||||
def _check_sensitive_content_approval(self, chunks: list[dict[str, Any]]) -> None:
|
||||
"""Enforce user approval for sensitive content before indexing."""
|
||||
from obsidian_rag import security
|
||||
|
||||
sensitive_categories = self.config.security.require_confirmation_for
|
||||
if not sensitive_categories:
|
||||
return
|
||||
|
||||
for chunk in chunks:
|
||||
sensitivity = security.detect_sensitive(
|
||||
chunk['chunk_text'],
|
||||
self.config.security.sensitive_sections,
|
||||
self.config.memory.patterns
|
||||
)
|
||||
|
||||
for category in sensitive_categories:
|
||||
if sensitivity.get(category, False):
|
||||
if not self.config.security.auto_approve_sensitive:
|
||||
raise SensitiveContentError(
|
||||
f"Sensitive {category} content detected. "
|
||||
f"Requires explicit approval before indexing. "
|
||||
f"File: {chunk['source_file']}"
|
||||
)
|
||||
|
||||
def scan_vault(self) -> Generator[Path, None, None]:
|
||||
"""Walk vault, yielding markdown files to index."""
|
||||
for root, dirs, files in os.walk(self.vault_path):
|
||||
@@ -106,6 +145,26 @@ class Indexer:
|
||||
for idx, filepath in enumerate(files):
|
||||
try:
|
||||
num_chunks, enriched = self.process_file(filepath)
|
||||
# Enforce sensitive content policies
|
||||
self._check_sensitive_content_approval(enriched)
|
||||
|
||||
# Log sensitive content access
|
||||
for chunk in enriched:
|
||||
from obsidian_rag import security
|
||||
sensitivity = security.detect_sensitive(
|
||||
chunk['chunk_text'],
|
||||
self.config.security.sensitive_sections,
|
||||
self.config.memory.patterns
|
||||
)
|
||||
for category in ['health', 'financial', 'relations']:
|
||||
if sensitivity.get(category, False):
|
||||
self.audit_logger.log_sensitive_access(
|
||||
str(chunk['source_file']),
|
||||
category,
|
||||
'index',
|
||||
{'chunk_id': chunk['chunk_id']}
|
||||
)
|
||||
|
||||
# Embed chunks
|
||||
texts = [e["chunk_text"] for e in enriched]
|
||||
try:
|
||||
@@ -132,14 +191,13 @@ class Indexer:
|
||||
"total": total_files,
|
||||
}
|
||||
|
||||
return {
|
||||
# Yield final result
|
||||
yield {
|
||||
"indexed_files": indexed_files,
|
||||
"total_chunks": total_chunks,
|
||||
"duration_ms": 0, # caller can fill
|
||||
"errors": errors,
|
||||
}
|
||||
|
||||
def sync(self, on_progress: Iterator[dict] | None = None) -> dict[str, Any]:
|
||||
"""Incremental sync: only process files modified since last sync."""
|
||||
sync_result_path = self._sync_result_path()
|
||||
last_sync = None
|
||||
@@ -221,3 +279,78 @@ class Indexer:
|
||||
tmp = path.with_suffix(".json.tmp")
|
||||
tmp.write_text(json.dumps(result, indent=2))
|
||||
tmp.rename(path)
|
||||
|
||||
def sync(self, on_progress: Iterator[dict] | None = None) -> dict[str, Any]:
    """Incremental sync: only process files modified since last sync.

    Reads the previous run's timestamp from the sync-result file and
    re-indexes only vault files whose mtime is newer. Sensitive-content
    policy is enforced per file, access to sensitive chunks is
    audit-logged, and chunks are embedded and upserted into the vector
    store. Per-file failures are collected rather than aborting the run.

    Args:
        on_progress: Accepted for interface compatibility; currently
            unused in this method.

    Returns:
        Summary dict with ``indexed_files``, ``total_chunks`` and
        ``errors`` keys.
    """
    # Fix: the import used to be re-executed inside the per-chunk loop.
    from obsidian_rag import security

    sync_result_path = self._sync_result_path()
    last_sync = None
    if sync_result_path.exists():
        try:
            last_sync = json.loads(sync_result_path.read_text()).get("timestamp")
        except Exception:
            # Corrupt or unreadable sync state: fall back to a full re-index.
            pass

    db = get_db(self.config)
    table = create_table_if_not_exists(db)
    embedder = self.embedder

    files = list(self.scan_vault())
    indexed_files = 0
    total_chunks = 0
    errors: list[dict] = []

    for filepath in files:
        try:
            # Fix: stat() ran outside the error handler, so a file deleted
            # mid-scan aborted the whole sync instead of being recorded as
            # a per-file error.
            mtime = datetime.fromtimestamp(filepath.stat().st_mtime, tz=timezone.utc)
        except OSError as exc:
            errors.append({"file": str(filepath), "error": str(exc)})
            continue
        # ISO-8601 strings with the same UTC offset compare correctly as text.
        if last_sync and mtime.isoformat() <= last_sync:
            continue  # unchanged since last sync

        try:
            num_chunks, enriched = self.process_file(filepath)
            # Enforce sensitive content policies before anything is stored.
            self._check_sensitive_content_approval(enriched)

            # Audit-log every sensitive chunk we are about to index.
            for chunk in enriched:
                sensitivity = security.detect_sensitive(
                    chunk['chunk_text'],
                    self.config.security.sensitive_sections,
                    self.config.memory.patterns,
                )
                for category in ['health', 'financial', 'relations']:
                    if sensitivity.get(category, False):
                        self.audit_logger.log_sensitive_access(
                            str(chunk['source_file']),
                            category,
                            'index',
                            {'chunk_id': chunk['chunk_id']},
                        )

            # Embed chunks; degrade to zero vectors when Ollama is down so
            # chunks are still stored and can be re-embedded later.
            texts = [e["chunk_text"] for e in enriched]
            try:
                vectors = embedder.embed_chunks(texts)
            except OllamaUnavailableError:
                # NOTE(review): 1024 looks like the embedding dimension —
                # confirm it matches the configured model.
                vectors = [[0.0] * 1024 for _ in texts]
            for e, v in zip(enriched, vectors):
                e["vector"] = v
            upsert_chunks(table, enriched)
            total_chunks += num_chunks
            indexed_files += 1
        except Exception as exc:
            # Per-file failures (including SensitiveContentError) are
            # collected so one bad file cannot abort the sync.
            errors.append({"file": str(filepath), "error": str(exc)})

    self._write_sync_result(indexed_files, total_chunks, errors)
    return {
        "indexed_files": indexed_files,
        "total_chunks": total_chunks,
        "errors": errors,
    }
|
||||
# Use the same dev-data-dir convention as config.py
|
||||
project_root = Path(__file__).parent.parent.parent
|
||||
data_dir = project_root / "obsidian-rag"
|
||||
if not data_dir.exists() and not (project_root / "KnowledgeVault").exists():
|
||||
data_dir = Path(os.path.expanduser("~/.obsidian-rag"))
|
||||
return data_dir / "sync-result.json"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user