Files
obsidian-rag/python/obsidian_rag/config.py
Santhosh Janardhanan 5c281165c7 Sprint 0-1: Python indexer, TS plugin scaffolding, and test suite
## What's new

**Python indexer (`python/obsidian_rag/`)** — full pipeline from scan to LanceDB:
- `config.py` — JSON config loader with cross-platform path resolution
- `security.py` — path traversal prevention, HTML stripping, sensitive content detection, dir allow/deny lists
- `chunker.py` — section-split for journal entries (date-named files), sliding-window for unstructured notes
- `embedder.py` — Ollama `/api/embeddings` client with batched requests and timeout/error handling
- `vector_store.py` — LanceDB schema, upsert (merge_insert), delete, search with filters, stats
- `indexer.py` — full/sync/reindex pipeline orchestrator with progress yields
- `cli.py` — `index | sync | reindex | status` CLI commands

**TypeScript plugin (`src/`)** — OpenClaw plugin scaffold:
- `utils/` — config loader, TypeScript types, response envelope factory, LanceDB client
- `services/` — health state machine (HEALTHY/DEGRADED/UNAVAILABLE), vault watcher with debounce/batching, indexer bridge (subprocess spawner)
- `tools/` — 4 tool stubs: search, index, status, memory_store (OpenClaw wiring pending)
- `index.ts` — plugin entry point with health probe + vault watcher startup

**Config** (`obsidian-rag/config.json`, `openclaw.plugin.json`):
- 627 files / 3764 chunks indexed in dev vault

**Tests: 76 passing**
- Python: 64 pytest tests (chunker, security, vector_store, config)
- TypeScript: 12 vitest tests (lancedb client, response envelope)

## Bugs fixed

- LanceDB `tags` column filter: `LIKE '%tag%'` → `list_contains(tags, 'tag')` (List<String> column)
- LanceDB JS `db.list_tables()` returns `ListTablesResponse` object, not plain array
- LanceDB JS result score field: `_score` → `_distance`
- TypeScript regex literal with unescaped `/` in path-resolve regex
- Python: `create_table_if_not_exists` identity check → name comparison

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 22:56:50 -04:00

145 lines
5.1 KiB
Python

"""Configuration loader — reads ~/.obsidian-rag/config.json (or ./obsidian-rag/ for dev)."""
from __future__ import annotations
import json
import os
from enum import Enum
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
# Directory three levels above this module file. Relative vault paths and the
# dev data directory elsewhere in this module are resolved against it.
DEFAULT_CONFIG_DIR = Path(__file__).parent.parent.parent  # python/ → project root
@dataclass
class EmbeddingConfig:
    """Defaults for the ``embedding`` section of the config file."""

    # Backend identifier and the model name requested from it.
    provider: str = "ollama"
    model: str = "mxbai-embed-large"

    # Base URL of the embedding service.
    base_url: str = "http://localhost:11434"

    # Presumably the length of the vectors produced by ``model`` and the
    # number of texts sent per request — confirm against the embedder.
    dimensions: int = 1024
    batch_size: int = 64
@dataclass
class VectorStoreConfig:
    """Defaults for the ``vector_store`` section of the config file."""

    # Storage backend identifier.
    type: str = "lancedb"

    # Location of the store; a relative value is resolved against the data
    # directory by ``resolve_vector_db_path``.
    path: str = ""
@dataclass
class IndexingConfig:
    """Defaults for the ``indexing`` section of the config file."""

    # Chunking parameters. Units (characters vs. tokens) are not visible
    # here — confirm against the chunker.
    chunk_size: int = 500
    chunk_overlap: int = 100

    # Glob patterns of files to index.
    file_patterns: list[str] = field(default_factory=lambda: ["*.md"])

    # Directory names on the deny list; each instance gets its own list.
    deny_dirs: list[str] = field(
        default_factory=lambda: [
            ".obsidian",
            ".trash",
            "zzz-Archive",
            ".git",
            ".logseq",
        ]
    )

    # Directory allow list; empty by default.
    allow_dirs: list[str] = field(default_factory=list)
@dataclass
class SecurityConfig:
    """Defaults for the ``security`` section of the config file."""

    # Category names that require confirmation (how they are enforced is
    # not visible in this module).
    require_confirmation_for: list[str] = field(
        default_factory=lambda: ["health", "financial_debt"]
    )

    # Tags that mark a section as sensitive.
    sensitive_sections: list[str] = field(
        default_factory=lambda: [
            "#mentalhealth",
            "#physicalhealth",
            "#Relations",
        ]
    )

    local_only: bool = True
@dataclass
class MemoryConfig:
    """Defaults for the ``memory`` section of the config file."""

    auto_suggest: bool = True

    # Maps a category name to the keywords/tags associated with it. A fresh
    # dict is built for every instance.
    patterns: dict[str, list[str]] = field(
        default_factory=lambda: dict(
            financial=["owe", "owed", "debt", "paid", "$", "spent", "spend"],
            health=["#mentalhealth", "#physicalhealth", "medication", "therapy"],
            commitments=["shopping list", "costco", "amazon", "grocery"],
        )
    )
@dataclass
class ObsidianRagConfig:
    """Top-level configuration: the vault location plus one object per section.

    Every section attribute defaults to a freshly constructed instance of its
    section dataclass, so ``ObsidianRagConfig()`` is usable without a file.
    """

    # Location of the vault; a relative value is resolved against the
    # project root by ``resolve_vault_path``.
    vault_path: str = ""

    # Per-section settings.
    embedding: EmbeddingConfig = field(default_factory=EmbeddingConfig)
    vector_store: VectorStoreConfig = field(default_factory=VectorStoreConfig)
    indexing: IndexingConfig = field(default_factory=IndexingConfig)
    security: SecurityConfig = field(default_factory=SecurityConfig)
    memory: MemoryConfig = field(default_factory=MemoryConfig)
def _resolve_data_dir() -> Path:
    """Pick the data directory for the current installation.

    Returns:
        ``<project root>/obsidian-rag`` when that directory, or a
        ``KnowledgeVault`` directory next to it, exists (development
        layout); otherwise ``~/.obsidian-rag`` with ``~`` expanded.
    """
    dev_dir = DEFAULT_CONFIG_DIR / "obsidian-rag"
    vault_marker = DEFAULT_CONFIG_DIR / "KnowledgeVault"

    # Either marker is enough to treat this as a development checkout.
    in_dev_checkout = dev_dir.exists() or vault_marker.exists()
    if not in_dev_checkout:
        # Production: ~/.obsidian-rag/
        return Path(os.path.expanduser("~/.obsidian-rag"))
    return dev_dir
def load_config(config_path: str | Path | None = None) -> ObsidianRagConfig:
    """Load the configuration from a JSON file.

    Args:
        config_path: Path to the JSON file. When ``None``, ``config.json``
            inside the directory returned by ``_resolve_data_dir()`` is used.

    Returns:
        An ``ObsidianRagConfig`` in which every section starts from its
        dataclass defaults and is overridden by the matching JSON object.
        Sections (and ``vault_path``) that are absent or ``null`` keep
        their defaults.

    Raises:
        FileNotFoundError: If the config file does not exist.
        ValueError: If the file is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError``) or its top-level value is not an object.
    """
    if config_path is None:
        path = _resolve_data_dir() / "config.json"
    else:
        path = Path(config_path)

    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (which is not UTF-8 on many Windows setups).
    with open(path, encoding="utf-8") as f:
        raw: Any = json.load(f)

    if not isinstance(raw, dict):
        raise ValueError(
            f"Config file must contain a JSON object at the top level: {path}"
        )

    def section(name: str) -> Any:
        # An absent or explicitly-null section means "use the defaults".
        value = raw.get(name)
        return {} if value is None else value

    vault_path = raw.get("vault_path")
    return ObsidianRagConfig(
        vault_path="" if vault_path is None else vault_path,
        embedding=_merge(EmbeddingConfig(), section("embedding")),
        vector_store=_merge(VectorStoreConfig(), section("vector_store")),
        indexing=_merge(IndexingConfig(), section("indexing")),
        security=_merge(SecurityConfig(), section("security")),
        memory=_merge(MemoryConfig(), section("memory")),
    )
def _merge(default: Any, overrides: dict[str, Any] | None) -> Any:
    """Merge ``overrides`` on top of ``default`` and return a new value.

    Args:
        default: A dataclass instance, a dict, or any other value.
        overrides: Values that take precedence over ``default``. ``None``
            means "no overrides".

    Returns:
        * ``default`` itself when ``overrides`` is ``None``.
        * For a dataclass instance: a new instance of the same class with
          the overridden fields applied. When an override value is a dict,
          it is merged into the field's *current* value (so nested dicts
          and nested dataclasses keep the keys/fields that are not
          overridden).
        * For two dicts: a new dict, ``overrides`` winning on key clashes.
        * Otherwise: ``overrides``.

    Raises:
        TypeError: If ``default`` is a dataclass instance and ``overrides``
            is not a dict, or contains a key that is not a field of the
            dataclass (raised by the dataclass constructor).
    """
    # A missing section keeps the defaults untouched.
    if overrides is None:
        return default

    # Dataclass *instances* are merged field by field; dataclass types are not.
    if hasattr(default, "__dataclass_fields__") and not isinstance(default, type):
        if not isinstance(overrides, dict):
            raise TypeError(
                f"Overrides for {type(default).__name__} must be a dict, "
                f"got {type(overrides).__name__}"
            )

        known_fields = default.__dataclass_fields__
        updates: dict[str, Any] = {}
        for key, val in overrides.items():
            if key in known_fields and isinstance(val, dict):
                # Merge against the instance's current value. The field's
                # declared ``default`` is ``dataclasses.MISSING`` whenever a
                # ``default_factory`` is used, which is always the case for
                # dict and nested-dataclass fields.
                updates[key] = _merge(getattr(default, key), val)
            else:
                # Scalars and lists replace the old value wholesale. Unknown
                # keys are passed through so the constructor rejects them.
                updates[key] = val
        return default.__class__(**{**default.__dict__, **updates})

    if isinstance(overrides, dict) and isinstance(default, dict):
        return {**default, **overrides}

    return overrides
def resolve_vault_path(config: ObsidianRagConfig) -> Path:
    """Return the vault location described by ``config.vault_path``.

    Args:
        config: Configuration whose ``vault_path`` is read.

    Returns:
        The path with a leading ``~`` expanded to the user's home directory.
        An absolute path is returned as is; a relative path is joined onto
        the project root (``DEFAULT_CONFIG_DIR``) and resolved.

    Raises:
        RuntimeError: If the path starts with ``~`` and the home directory
            cannot be determined.
    """
    # Expand "~" first: otherwise "~/Vault" counts as relative and would be
    # resolved to "<project root>/~/Vault".
    vault = Path(config.vault_path).expanduser()
    if vault.is_absolute():
        return vault
    # Resolve relative to project root
    return (DEFAULT_CONFIG_DIR / vault).resolve()
def resolve_vector_db_path(config: ObsidianRagConfig) -> Path:
    """Return the vector store location described by ``config.vector_store.path``.

    Args:
        config: Configuration whose ``vector_store.path`` is read.

    Returns:
        The path with a leading ``~`` expanded to the user's home directory.
        An absolute path is returned as is; a relative path is joined onto
        the directory returned by ``_resolve_data_dir()`` and resolved.

    Raises:
        RuntimeError: If the path starts with ``~`` and the home directory
            cannot be determined.
    """
    # Expand "~" first: otherwise "~/db" counts as relative and would end up
    # inside the data directory as a literal "~" folder.
    store = Path(config.vector_store.path).expanduser()
    if store.is_absolute():
        return store
    # The data directory is only needed (and only probed) for relative paths.
    return (_resolve_data_dir() / store).resolve()