Sprint 0-1: Python indexer, TS plugin scaffolding, and test suite

## What's new

**Python indexer (`python/obsidian_rag/`)** — full pipeline from scan to LanceDB:
- `config.py` — JSON config loader with cross-platform path resolution
- `security.py` — path traversal prevention, HTML stripping, sensitive content detection, dir allow/deny lists
- `chunker.py` — section-split for journal entries (date-named files), sliding-window for unstructured notes
- `embedder.py` — Ollama `/api/embeddings` client with batched requests and timeout/error handling
- `vector_store.py` — LanceDB schema, upsert (merge_insert), delete, search with filters, stats
- `indexer.py` — full/sync/reindex pipeline orchestrator with progress yields
- `cli.py` — `index | sync | reindex | status` CLI commands

**TypeScript plugin (`src/`)** — OpenClaw plugin scaffold:
- `utils/` — config loader, TypeScript types, response envelope factory, LanceDB client
- `services/` — health state machine (HEALTHY/DEGRADED/UNAVAILABLE), vault watcher with debounce/batching, indexer bridge (subprocess spawner)
- `tools/` — 4 tool stubs: search, index, status, memory_store (OpenClaw wiring pending)
- `index.ts` — plugin entry point with health probe + vault watcher startup

**Config** (`obsidian-rag/config.json`, `openclaw.plugin.json`):
- 627 files / 3764 chunks indexed in dev vault

**Tests: 76 passing**
- Python: 64 pytest tests (chunker, security, vector_store, config)
- TypeScript: 12 vitest tests (lancedb client, response envelope)

## Bugs fixed

- LanceDB `tags` column filter: `LIKE '%tag%'` → `list_contains(tags, 'tag')` (List<String> column)
- LanceDB JS: handle `db.list_tables()` returning a `ListTablesResponse` object rather than a plain array
- LanceDB JS result score field: `_score` → `_distance`
- TypeScript regex literal with unescaped `/` in path-resolve regex
- Python: `create_table_if_not_exists` identity check → name comparison

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-10 22:56:50 -04:00
parent 18ad47e100
commit 5c281165c7
40 changed files with 5814 additions and 59 deletions

156
python/obsidian_rag/cli.py Normal file
View File

@@ -0,0 +1,156 @@
"""CLI: obsidian-rag index | sync | reindex | status."""
from __future__ import annotations
import json
import sys
import time
from pathlib import Path
import obsidian_rag.config as config_mod
from obsidian_rag.vector_store import get_db, get_stats
from obsidian_rag.indexer import Indexer
def main(argv: list[str] | None = None) -> int:
    """Run the ``obsidian-rag`` command-line interface.

    Args:
        argv: Command-line arguments without the program name. When ``None``
            the arguments are taken from ``sys.argv[1:]``. An explicitly
            passed empty list is honoured and prints the usage text.

    Returns:
        ``0`` when the usage text was requested (or no command was given),
        ``1`` when ``load_config()`` raises ``FileNotFoundError`` or the
        command is unknown, otherwise the exit code returned by the selected
        sub-command handler.
    """
    # Fall back to the process arguments only when the caller passed nothing.
    # ``argv or sys.argv[1:]`` would also throw away an explicit empty list
    # and silently pick up whatever the host process was started with.
    if argv is None:
        argv = sys.argv[1:]

    if not argv or argv[0] in ("--help", "-h"):
        print(_usage())
        return 0

    cmd = argv[0]
    try:
        config = config_mod.load_config()
    except FileNotFoundError as e:
        print(f"ERROR: {e}", file=sys.stderr)
        return 1

    handlers = {
        "index": _index,
        "sync": _sync,
        "reindex": _reindex,
        "status": _status,
    }
    handler = handlers.get(cmd)
    if handler is None:
        print(f"Unknown command: {cmd}\n{_usage()}", file=sys.stderr)
        return 1
    return handler(config)
def _index(config) -> int:
    """Run a full index and print a JSON summary to stdout.

    ``Indexer.full_index()`` is iterated to completion. The summary is taken
    from the generator's ``return`` value when that is a dict, otherwise from
    the last item it yielded, otherwise from an all-zero default.

    Args:
        config: Loaded configuration object, passed through to ``Indexer``.

    Returns:
        ``0`` when the summary's ``errors`` entry is empty, ``1`` when it is
        not, and ``2`` when an exception was raised (the error is written to
        stderr as a JSON object).
    """
    indexer = Indexer(config)
    t0 = time.monotonic()
    try:
        # Default summary used when nothing is yielded or returned.
        result: dict = {"indexed_files": 0, "total_chunks": 0, "errors": []}
        progress = iter(indexer.full_index())
        while True:
            try:
                # Progress items are expected to be dicts (per the original
                # comment); the most recent one is kept as a fallback summary.
                result = next(progress)
            except StopIteration as stop:
                # A plain ``for`` loop discards a generator's ``return``
                # value; read it here so a final summary delivered via
                # ``return`` is not lost.
                if isinstance(stop.value, dict):
                    result = stop.value
                break
        duration_ms = int((time.monotonic() - t0) * 1000)
        print(
            json.dumps(
                {
                    "type": "complete",
                    "indexed_files": result["indexed_files"],
                    "total_chunks": result["total_chunks"],
                    "duration_ms": duration_ms,
                    "errors": result["errors"],
                },
                indent=2,
            )
        )
        return 0 if not result["errors"] else 1
    except Exception as e:
        print(json.dumps({"type": "error", "error": str(e)}), file=sys.stderr)
        return 2
def _sync(config) -> int:
    """Run an incremental sync and print its result as JSON on stdout.

    Args:
        config: Loaded configuration object, passed through to ``Indexer``.

    Returns:
        ``0`` when the result's ``errors`` entry is empty, ``1`` when it is
        not, and ``2`` when an exception was raised (the error is written to
        stderr as a JSON object).
    """
    syncer = Indexer(config)
    try:
        outcome = syncer.sync()
        payload = {"type": "complete", **outcome}
        print(json.dumps(payload, indent=2))
        if outcome["errors"]:
            return 1
        return 0
    except Exception as exc:
        failure = json.dumps({"type": "error", "error": str(exc)})
        print(failure, file=sys.stderr)
        return 2
def _reindex(config) -> int:
    """Force a full rebuild of the index and print a JSON summary to stdout.

    Args:
        config: Loaded configuration object, passed through to ``Indexer``.

    Returns:
        ``0`` when the result's ``errors`` entry is empty, ``1`` when it is
        not (the same convention the ``index`` and ``sync`` commands use),
        and ``2`` when an exception was raised (the error is written to
        stderr as a JSON object).
    """
    indexer = Indexer(config)
    t0 = time.monotonic()
    try:
        result = indexer.reindex()
        duration_ms = int((time.monotonic() - t0) * 1000)
        print(
            json.dumps(
                {
                    "type": "complete",
                    "indexed_files": result["indexed_files"],
                    "total_chunks": result["total_chunks"],
                    "duration_ms": duration_ms,
                    "errors": result["errors"],
                },
                indent=2,
            )
        )
        # Report per-file errors through the exit code instead of always
        # claiming success, so callers can detect a partially failed rebuild.
        return 0 if not result["errors"] else 1
    except Exception as e:
        print(json.dumps({"type": "error", "error": str(e)}), file=sys.stderr)
        return 2
def _status(config) -> int:
    """Print index statistics and the last sync timestamp as JSON.

    Opens the ``obsidian_chunks`` table, collects its stats, and looks for a
    ``sync-result.json`` file to report its ``timestamp`` value. A missing or
    unreadable sync file is not an error; ``last_sync`` is then ``null``.

    Args:
        config: Loaded configuration object, passed through to ``get_db``.

    Returns:
        ``0`` on success; ``1`` when a ``FileNotFoundError`` is raised (an
        "Index not found" JSON message is printed to stdout) or when any
        other exception is raised (the error is written to stderr as JSON).
    """
    # Local import: ``os`` is only needed here for ``expanduser``.
    import os

    try:
        db = get_db(config)
        table = db.open_table("obsidian_chunks")
        stats = get_stats(table)

        # Resolve sync-result.json path (same convention as indexer)
        project_root = Path(__file__).parent.parent.parent
        data_dir = project_root / "obsidian-rag"
        if not data_dir.exists() and not (project_root / "KnowledgeVault").exists():
            data_dir = Path(os.path.expanduser("~/.obsidian-rag"))
        sync_path = data_dir / "sync-result.json"

        last_sync = None
        if sync_path.exists():
            try:
                # Read as UTF-8 explicitly so the result does not depend on
                # the platform's default text encoding.
                payload = json.loads(sync_path.read_text(encoding="utf-8"))
                last_sync = payload.get("timestamp")
            except (OSError, ValueError, AttributeError):
                # Best effort: an unreadable file, malformed JSON, or a
                # non-object JSON document just leaves last_sync as None.
                pass

        print(
            json.dumps(
                {
                    "total_docs": stats["total_docs"],
                    "total_chunks": stats["total_chunks"],
                    "last_sync": last_sync,
                },
                indent=2,
            )
        )
        return 0
    except FileNotFoundError:
        print(json.dumps({"error": "Index not found. Run 'obsidian-rag index' first."}, indent=2))
        return 1
    except Exception as e:
        print(json.dumps({"error": str(e)}), file=sys.stderr)
        return 1
def _usage() -> str:
return """obsidian-rag - Obsidian vault RAG indexer
Usage:
obsidian-rag index Full index of the vault
obsidian-rag sync Incremental sync (changed files only)
obsidian-rag reindex Force full reindex (nuke + rebuild)
obsidian-rag status Show index health and statistics
"""
if __name__ == "__main__":
    # Script entry point: terminate the process with main()'s exit code.
    raise SystemExit(main())