Sprint 0-1: Python indexer, TS plugin scaffolding, and test suite
## What's new

**Python indexer (`python/obsidian_rag/`)** — full pipeline from scan to LanceDB:
- `config.py` — JSON config loader with cross-platform path resolution
- `security.py` — path traversal prevention, HTML stripping, sensitive content detection, dir allow/deny lists
- `chunker.py` — section-split for journal entries (date-named files), sliding-window for unstructured notes
- `embedder.py` — Ollama `/api/embeddings` client with batched requests and timeout/error handling
- `vector_store.py` — LanceDB schema, upsert (merge_insert), delete, search with filters, stats
- `indexer.py` — full/sync/reindex pipeline orchestrator with progress yields
- `cli.py` — `index | sync | reindex | status` CLI commands

**TypeScript plugin (`src/`)** — OpenClaw plugin scaffold:
- `utils/` — config loader, TypeScript types, response envelope factory, LanceDB client
- `services/` — health state machine (HEALTHY/DEGRADED/UNAVAILABLE), vault watcher with debounce/batching, indexer bridge (subprocess spawner)
- `tools/` — 4 tool stubs: search, index, status, memory_store (OpenClaw wiring pending)
- `index.ts` — plugin entry point with health probe + vault watcher startup

**Config** (`obsidian-rag/config.json`, `openclaw.plugin.json`):
- 627 files / 3764 chunks indexed in dev vault

**Tests: 76 passing**
- Python: 64 pytest tests (chunker, security, vector_store, config)
- TypeScript: 12 vitest tests (lancedb client, response envelope)

## Bugs fixed

- LanceDB `tags` column filter: `LIKE '%tag%'` → `list_contains(tags, 'tag')` (List<String> column)
- LanceDB JS `db.list_tables()` returns `ListTablesResponse` object, not plain array
- LanceDB JS result score field: `_score` → `_distance`
- TypeScript regex literal with unescaped `/` in path-resolve regex
- Python: `create_table_if_not_exists` identity check → name comparison

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
145
python/obsidian_rag/config.py
Normal file
145
python/obsidian_rag/config.py
Normal file
@@ -0,0 +1,145 @@
|
||||
"""Configuration loader — reads ~/.obsidian-rag/config.json (or ./obsidian-rag/ for dev)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Three directory levels above this file; relative vault paths and the dev data
# directory ("obsidian-rag/") are resolved against it by the helpers below.
DEFAULT_CONFIG_DIR = Path(__file__).parent.parent.parent # python/ → project root
|
||||
|
||||
|
||||
@dataclass
class EmbeddingConfig:
    """Settings for the embedding backend (the ``embedding`` config section).

    The defaults name the ``ollama`` provider with the ``mxbai-embed-large``
    model at a localhost URL.
    """

    # Identifier of the embedding backend.
    provider: str = "ollama"
    # Model name passed to the provider.
    model: str = "mxbai-embed-large"
    # Base URL of the provider's HTTP endpoint.
    base_url: str = "http://localhost:11434"
    # Presumably the length of each embedding vector — confirm against the
    # vector-store schema, which is not visible from this module.
    dimensions: int = 1024
    # Presumably the number of texts sent per embedding request — confirm
    # against the embedder client.
    batch_size: int = 64
|
||||
|
||||
|
||||
@dataclass
class VectorStoreConfig:
    """Settings for the vector store (the ``vector_store`` config section)."""

    # Identifier of the vector-store backend.
    type: str = "lancedb"
    # Location of the store; a relative value is resolved relative to the
    # data directory, an absolute one is used as given.
    path: str = ""
|
||||
|
||||
|
||||
@dataclass
class IndexingConfig:
    """Settings controlling which files are indexed and how they are chunked.

    List-valued fields use ``default_factory`` so every instance gets its own
    list rather than sharing one mutable default.
    """

    # Target chunk size; the unit (characters vs. tokens) is defined by the
    # chunker, which is not visible here — confirm there.
    chunk_size: int = 500
    # Overlap between consecutive chunks, in the same unit as ``chunk_size``
    # (assumed — confirm against the chunker).
    chunk_overlap: int = 100
    # Glob-style patterns selecting files to index.
    file_patterns: list[str] = field(default_factory=lambda: ["*.md"])
    # Directory names excluded from indexing.
    deny_dirs: list[str] = field(
        default_factory=lambda: [".obsidian", ".trash", "zzz-Archive", ".git", ".logseq"]
    )
    # Directory allow-list; empty by default. How an empty list is interpreted
    # is decided by the consumer — NOTE(review): confirm in the security module.
    allow_dirs: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class SecurityConfig:
    """Privacy and safety settings (the ``security`` config section).

    How each value is enforced is up to the consumers of this config, which
    are not visible from this module.
    """

    # Category names that should require confirmation (enforcement lives in
    # the consumer — confirm there).
    require_confirmation_for: list[str] = field(default_factory=lambda: ["health", "financial_debt"])
    # Tag-style markers identifying sensitive sections.
    sensitive_sections: list[str] = field(
        default_factory=lambda: ["#mentalhealth", "#physicalhealth", "#Relations"]
    )
    # Presumably restricts processing to local services — confirm in consumer.
    local_only: bool = True
|
||||
|
||||
|
||||
@dataclass
class MemoryConfig:
    """Settings for memory suggestions (the ``memory`` config section)."""

    # Whether suggestions are offered automatically (behaviour is implemented
    # by the consumer — confirm there).
    auto_suggest: bool = True
    # Maps a category name to the keywords/tags associated with it. Built by a
    # factory so instances never share the same dict.
    patterns: dict[str, list[str]] = field(
        default_factory=lambda: {
            "financial": ["owe", "owed", "debt", "paid", "$", "spent", "spend"],
            "health": ["#mentalhealth", "#physicalhealth", "medication", "therapy"],
            "commitments": ["shopping list", "costco", "amazon", "grocery"],
        }
    )
|
||||
|
||||
|
||||
@dataclass
class ObsidianRagConfig:
    """Top-level configuration: the vault location plus one object per section.

    Each section defaults to a freshly constructed instance of its config
    class, so a bare ``ObsidianRagConfig()`` is fully populated with defaults.
    """

    # Path to the vault; absolute, or relative to the project root (see
    # ``resolve_vault_path``). Empty by default.
    vault_path: str = ""
    embedding: EmbeddingConfig = field(default_factory=EmbeddingConfig)
    vector_store: VectorStoreConfig = field(default_factory=VectorStoreConfig)
    indexing: IndexingConfig = field(default_factory=IndexingConfig)
    security: SecurityConfig = field(default_factory=SecurityConfig)
    memory: MemoryConfig = field(default_factory=MemoryConfig)
|
||||
|
||||
|
||||
def _resolve_data_dir() -> Path:
    """Return the directory that holds ``config.json`` and related data.

    A development checkout is detected by the presence of either an
    ``obsidian-rag/`` or a ``KnowledgeVault/`` directory under the project
    root; in that case ``<project root>/obsidian-rag`` is returned (it may not
    exist yet when only ``KnowledgeVault/`` is present). Otherwise the
    production location ``~/.obsidian-rag`` is returned.
    """
    dev_data_dir = DEFAULT_CONFIG_DIR / "obsidian-rag"
    dev_vault_dir = DEFAULT_CONFIG_DIR / "KnowledgeVault"
    if dev_data_dir.exists() or dev_vault_dir.exists():
        return dev_data_dir
    # Production: ~/.obsidian-rag/
    return Path(os.path.expanduser("~/.obsidian-rag"))
|
||||
|
||||
|
||||
def load_config(config_path: str | Path | None = None) -> ObsidianRagConfig:
    """Load the JSON config file and overlay it on the built-in defaults.

    Args:
        config_path: Explicit path to a config file. When ``None``, the file
            ``config.json`` inside the directory returned by
            ``_resolve_data_dir()`` is used.

    Returns:
        An ``ObsidianRagConfig`` in which every section starts from its
        defaults and is overridden by the matching section of the file.
        Top-level keys other than ``vault_path`` and the five section names
        are ignored.

    Raises:
        FileNotFoundError: If the config file does not exist.
        ValueError: If the file is not valid JSON (``json.JSONDecodeError`` is
            a ``ValueError``) or its top level is not a JSON object.
    """
    if config_path is None:
        path = _resolve_data_dir() / "config.json"
    else:
        path = Path(config_path)

    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")

    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # locale encoding (e.g. cp1252 on Windows).
    with open(path, encoding="utf-8") as f:
        raw: Any = json.load(f)

    if not isinstance(raw, dict):
        raise ValueError(
            f"Config file must contain a JSON object at the top level: {path}"
        )

    def section(name: str) -> Any:
        # A missing or null section means "no overrides".
        value = raw.get(name)
        return {} if value is None else value

    return ObsidianRagConfig(
        # A null vault_path is treated like a missing one.
        vault_path=raw.get("vault_path") or "",
        embedding=_merge(EmbeddingConfig(), section("embedding")),
        vector_store=_merge(VectorStoreConfig(), section("vector_store")),
        indexing=_merge(IndexingConfig(), section("indexing")),
        security=_merge(SecurityConfig(), section("security")),
        memory=_merge(MemoryConfig(), section("memory")),
    )
|
||||
|
||||
|
||||
def _merge(default: Any, overrides: dict[str, Any]) -> Any:
    """Overlay ``overrides`` on ``default`` and return the result.

    Behaviour by type of ``default``:

    * Dataclass instance: a new instance of the same class is built from the
      current field values with matching keys of ``overrides`` applied. Keys
      that are not fields of the dataclass are ignored. When a field currently
      holds a dataclass instance and its override is a dict, the two are
      merged recursively; any other value replaces the field wholesale
      (including dict- and list-valued fields).
    * Dict (with dict ``overrides``): a new dict with ``overrides`` taking
      precedence.
    * Anything else: ``overrides`` is returned.

    ``overrides`` of ``None`` always means "no overrides" and returns
    ``default`` unchanged. ``default`` itself is never mutated.

    Raises:
        TypeError: If ``default`` is a dataclass instance and ``overrides`` is
            neither ``None`` nor a dict.
    """
    if overrides is None:
        return default

    def is_dataclass_instance(obj: Any) -> bool:
        # The class object also carries __dataclass_fields__; exclude it.
        return hasattr(obj, "__dataclass_fields__") and not isinstance(obj, type)

    if is_dataclass_instance(default):
        if not isinstance(overrides, dict):
            raise TypeError(
                f"Overrides for {type(default).__name__} must be a dict, "
                f"got {type(overrides).__name__}"
            )
        known_fields = default.__dataclass_fields__
        updates: dict[str, Any] = {}
        for key, val in overrides.items():
            if key not in known_fields:
                # Passing an unknown key to the constructor would raise an
                # opaque "unexpected keyword argument" TypeError.
                continue
            # Use the live value, not field.default: the latter is MISSING
            # for every field declared with default_factory.
            current = getattr(default, key)
            if isinstance(val, dict) and is_dataclass_instance(current):
                updates[key] = _merge(current, val)
            else:
                updates[key] = val
        return type(default)(**{**vars(default), **updates})

    if isinstance(default, dict) and isinstance(overrides, dict):
        return {**default, **overrides}
    return overrides
|
||||
|
||||
|
||||
def resolve_vault_path(config: ObsidianRagConfig) -> Path:
    """Return the vault location described by ``config.vault_path``.

    A leading ``~`` is expanded to the user's home directory first. An
    absolute path is then returned as given; a relative path is joined onto
    the project root (``DEFAULT_CONFIG_DIR``) and resolved.

    Note that an empty ``vault_path`` is a relative path and therefore
    resolves to the project root itself.
    """
    # Without expanduser(), "~/Vault" would count as relative and end up
    # under the project root.
    vault = Path(config.vault_path).expanduser()
    if vault.is_absolute():
        return vault
    # Resolve relative to project root
    return (DEFAULT_CONFIG_DIR / vault).resolve()
|
||||
|
||||
|
||||
def resolve_vector_db_path(config: ObsidianRagConfig) -> Path:
    """Return the vector-store location described by ``config.vector_store.path``.

    A leading ``~`` is expanded to the user's home directory first. An
    absolute path is then returned as given; a relative path is joined onto
    the data directory from ``_resolve_data_dir()`` and resolved.

    Note that the default empty path is relative and therefore resolves to
    the data directory itself.
    """
    store_path = Path(config.vector_store.path).expanduser()
    if store_path.is_absolute():
        return store_path
    # The data directory is looked up only here: finding it probes the
    # filesystem, which is wasted work when the path is already absolute.
    return (_resolve_data_dir() / store_path).resolve()
|
||||
Reference in New Issue
Block a user