Compare commits
4 Commits
main
...
208531d28d
| Author | SHA1 | Date | |
|---|---|---|---|
| 208531d28d | |||
| 83a54b2af6 | |||
| 5c281165c7 | |||
| 18ad47e100 |
@@ -8,7 +8,46 @@
|
|||||||
"Bash(git add:*)",
|
"Bash(git add:*)",
|
||||||
"Bash(git commit -m ':*)",
|
"Bash(git commit -m ':*)",
|
||||||
"WebFetch(domain:www.ollama.com)",
|
"WebFetch(domain:www.ollama.com)",
|
||||||
"mcp__web-reader__webReader"
|
"mcp__web-reader__webReader",
|
||||||
|
"Bash(ollama list:*)",
|
||||||
|
"Bash(python3:*)",
|
||||||
|
"Bash(pip install:*)",
|
||||||
|
"Bash(npm install:*)",
|
||||||
|
"Bash(obsidian-rag --help)",
|
||||||
|
"Bash(obsidian-rag status:*)",
|
||||||
|
"Bash(npm run:*)",
|
||||||
|
"Bash(obsidian-rag index:*)",
|
||||||
|
"Bash(curl -s http://localhost:11434/api/tags)",
|
||||||
|
"Bash(curl -s -X POST http://localhost:11434/api/embeddings -d '{\"model\":\"mxbai-embed-large\",\"prompt\":\"hello world\"}')",
|
||||||
|
"Bash(curl -s -X POST http://localhost:11434/api/embeddings -d '{\"model\":\"mxbai-embed-large:335m\",\"prompt\":\"hello world\"}')",
|
||||||
|
"Bash(curl:*)",
|
||||||
|
"Bash(find /Users/santhoshj/dev/obsidian-rag/python -name \"*.pyc\" -delete)",
|
||||||
|
"Bash(find /Users/santhoshj/dev/obsidian-rag/python -name \"__pycache__\" -exec rm -rf {} +)",
|
||||||
|
"Bash(npm test:*)",
|
||||||
|
"Bash(python -m pytest --collect-only)",
|
||||||
|
"Bash(python -m pytest tests/unit/test_chunker.py tests/unit/test_security.py -v)",
|
||||||
|
"Bash(python -m pytest tests/unit/test_chunker.py -v --tb=short)",
|
||||||
|
"mcp__plugin_ecc_context7__resolve-library-id",
|
||||||
|
"mcp__plugin_ecc_context7__query-docs",
|
||||||
|
"Bash(python -m pytest tests/unit/test_vector_store.py -v)",
|
||||||
|
"Bash(python -m pytest tests/unit/test_vector_store.py::test_search_chunks_with_tags_filter -v)",
|
||||||
|
"Bash(python:*)",
|
||||||
|
"Bash(npx tsx:*)",
|
||||||
|
"Bash(node test_lancedb_client.mjs)",
|
||||||
|
"Bash(node -e ':*)",
|
||||||
|
"Bash(node:*)",
|
||||||
|
"Bash(ls /Users/santhoshj/dev/obsidian-rag/*.config.*)",
|
||||||
|
"Bash(npx vitest:*)",
|
||||||
|
"Bash(git commit:*)",
|
||||||
|
"mcp__plugin_ecc_memory__add_observations",
|
||||||
|
"WebSearch",
|
||||||
|
"WebFetch(domain:docs.openclaw.ai)",
|
||||||
|
"Bash(ls node_modules/openclaw/dist/plugin-sdk/zod*)",
|
||||||
|
"Bash(ls:*)",
|
||||||
|
"Bash(npx ts-node:*)",
|
||||||
|
"Bash(pkill -f \"ollama serve\")"
|
||||||
]
|
]
|
||||||
}
|
},
|
||||||
|
"outputStyle": "default",
|
||||||
|
"spinnerTipsEnabled": false
|
||||||
}
|
}
|
||||||
|
|||||||
470
README.md
Normal file
470
README.md
Normal file
@@ -0,0 +1,470 @@
|
|||||||
|
# Obsidian RAG — Manual Testing Guide
|
||||||
|
|
||||||
|
**What it does:** Indexes an Obsidian vault → LanceDB → semantic search via Ollama embeddings. Powers OpenClaw agent tools for natural-language queries over 677+ personal notes.
|
||||||
|
|
||||||
|
**Stack:** Python indexer (CLI) → LanceDB → TypeScript plugin (OpenClaw)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
| Component | Version | Verify |
|
||||||
|
|---|---|---|
|
||||||
|
| Python | ≥3.11 | `python --version` |
|
||||||
|
| Node.js | ≥18 | `node --version` |
|
||||||
|
| Ollama | running | `curl http://localhost:11434/api/tags` |
|
||||||
|
| Ollama model | `mxbai-embed-large:335m` | `ollama list` |
|
||||||
|
|
||||||
|
**Install Ollama + model (if needed):**
|
||||||
|
```bash
|
||||||
|
# macOS/Linux
|
||||||
|
curl -fsSL https://ollama.com/install.sh | sh
|
||||||
|
|
||||||
|
# Pull embedding model
|
||||||
|
ollama pull mxbai-embed-large:335m
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
### 1. Python CLI (indexer)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /Users/santhoshj/dev/obsidian-rag
|
||||||
|
|
||||||
|
# Create virtual environment (optional but recommended)
|
||||||
|
python -m venv .venv
|
||||||
|
source .venv/bin/activate # macOS/Linux
|
||||||
|
# .\.venv\Scripts\Activate.ps1 # Windows PowerShell
|
||||||
|
# .venv\Scripts\activate.bat # Windows CMD
|
||||||
|
|
||||||
|
# Install in editable mode
|
||||||
|
pip install -e python/
|
||||||
|
```
|
||||||
|
|
||||||
|
**Verify:**
|
||||||
|
```bash
|
||||||
|
obsidian-rag --help
|
||||||
|
# → obsidian-rag index | sync | reindex | status
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. TypeScript Plugin (for OpenClaw integration)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm install
|
||||||
|
npm run build # → dist/index.js (131kb)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. (Optional) Ollama running
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ollama serve &
|
||||||
|
curl http://localhost:11434/api/tags
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
Edit `obsidian-rag/config.json` at the project root:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"vault_path": "./KnowledgeVault/Default",
|
||||||
|
"embedding": {
|
||||||
|
"provider": "ollama",
|
||||||
|
"model": "mxbai-embed-large:335m",
|
||||||
|
"base_url": "http://localhost:11434",
|
||||||
|
"dimensions": 1024,
|
||||||
|
"batch_size": 64
|
||||||
|
},
|
||||||
|
"vector_store": {
|
||||||
|
"type": "lancedb",
|
||||||
|
"path": "./obsidian-rag/vectors.lance"
|
||||||
|
},
|
||||||
|
"indexing": {
|
||||||
|
"chunk_size": 500,
|
||||||
|
"chunk_overlap": 100,
|
||||||
|
"file_patterns": ["*.md"],
|
||||||
|
"deny_dirs": [".obsidian", ".trash", "zzz-Archive", ".git", ".logseq"],
|
||||||
|
"allow_dirs": []
|
||||||
|
},
|
||||||
|
"security": {
|
||||||
|
"require_confirmation_for": ["health", "financial_debt"],
|
||||||
|
"sensitive_sections": ["#mentalhealth", "#physicalhealth", "#Relations"],
|
||||||
|
"local_only": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
| Field | What it does |
|
||||||
|
|---|---|
|
||||||
|
| `vault_path` | Root of Obsidian vault (relative or absolute) |
|
||||||
|
| `embedding.model` | Ollama embedding model to use (e.g. `mxbai-embed-large:335m`) |
|
||||||
|
| `vector_store.path` | Where LanceDB data lives |
|
||||||
|
| `deny_dirs` | Always-skipped directories |
|
||||||
|
| `allow_dirs` | If non-empty, **only** these directories are indexed |
|
||||||
|
|
||||||
|
**Windows users:** Use `".\\KnowledgeVault\\Default"` or an absolute path like `"C:\\Users\\you\\KnowledgeVault\\Default"`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## CLI Commands
|
||||||
|
|
||||||
|
All commands run from the project root (`/Users/santhoshj/dev/obsidian-rag`).
|
||||||
|
|
||||||
|
### `obsidian-rag index` — Full Index
|
||||||
|
|
||||||
|
First-time indexing. Scans all `.md` files → chunks → embeds → stores in LanceDB.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
obsidian-rag index
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"type": "complete",
|
||||||
|
"indexed_files": 627,
|
||||||
|
"total_chunks": 3764,
|
||||||
|
"duration_ms": 45230,
|
||||||
|
"errors": []
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**What happens:**
|
||||||
|
1. Walk vault (respects `deny_dirs` / `allow_dirs`)
|
||||||
|
2. Parse markdown: frontmatter, headings, tags, dates
|
||||||
|
3. Chunk: structured notes (journal) split by `# heading`; unstructured use 500-token sliding window
|
||||||
|
4. Embed: batch of 64 chunks → Ollama `/api/embeddings`
|
||||||
|
5. Upsert: write to LanceDB
|
||||||
|
6. Write `obsidian-rag/sync-result.json` atomically
|
||||||
|
|
||||||
|
**Time:** ~45s for 627 files on first run.
|
||||||
|
|
||||||
|
### `obsidian-rag sync` — Incremental Sync
|
||||||
|
|
||||||
|
Only re-indexes files changed since last sync (by `mtime`).
|
||||||
|
|
||||||
|
```bash
|
||||||
|
obsidian-rag sync
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"type": "complete",
|
||||||
|
"indexed_files": 3,
|
||||||
|
"total_chunks": 12,
|
||||||
|
"duration_ms": 1200,
|
||||||
|
"errors": []
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Use when:** You edited/added a few notes and want to update the index without a full rebuild.
|
||||||
|
|
||||||
|
### `obsidian-rag reindex` — Force Rebuild
|
||||||
|
|
||||||
|
Nukes the existing LanceDB table and rebuilds from scratch.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
obsidian-rag reindex
|
||||||
|
```
|
||||||
|
|
||||||
|
**Use when:**
|
||||||
|
- LanceDB schema changed
|
||||||
|
- Chunking strategy changed
|
||||||
|
- Index corrupted
|
||||||
|
- First run after upgrading (to pick up FTS index)
|
||||||
|
|
||||||
|
### `obsidian-rag status` — Index Health
|
||||||
|
|
||||||
|
```bash
|
||||||
|
obsidian-rag status
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"total_docs": 627,
|
||||||
|
"total_chunks": 3764,
|
||||||
|
"last_sync": "2026-04-11T00:30:00Z"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Re-index after schema upgrade (important!)
|
||||||
|
|
||||||
|
If you pulled a new version that changed the FTS index setup, you **must** reindex:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
obsidian-rag reindex
|
||||||
|
```
|
||||||
|
|
||||||
|
This drops and recreates the LanceDB table, rebuilding the FTS index on `chunk_text`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Manual Testing Walkthrough
|
||||||
|
|
||||||
|
### Step 1 — Verify prerequisites
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Ollama up?
|
||||||
|
curl http://localhost:11434/api/tags
|
||||||
|
|
||||||
|
# Python CLI working?
|
||||||
|
obsidian-rag --help
|
||||||
|
|
||||||
|
# Vault accessible?
|
||||||
|
ls ./KnowledgeVault/Default | head -5
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 2 — Do a full index
|
||||||
|
|
||||||
|
```bash
|
||||||
|
obsidian-rag index
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: ~30-60s. JSON output with `indexed_files` and `total_chunks`.
|
||||||
|
|
||||||
|
### Step 3 — Check status
|
||||||
|
|
||||||
|
```bash
|
||||||
|
obsidian-rag status
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 4 — Test search via Python
|
||||||
|
|
||||||
|
The Python indexer doesn't have an interactive search CLI, but you can test via the LanceDB Python API directly:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 -c "
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, 'python')
|
||||||
|
from obsidian_rag.vector_store import get_db, search_chunks
|
||||||
|
from obsidian_rag.embedder import embed_texts
|
||||||
|
from obsidian_rag.config import load_config
|
||||||
|
|
||||||
|
config = load_config()
|
||||||
|
db = get_db(config)
|
||||||
|
table = db.open_table('obsidian_chunks')
|
||||||
|
|
||||||
|
# Embed a query
|
||||||
|
query_vec = embed_texts(['how was my mental health in 2024'], config)[0]
|
||||||
|
|
||||||
|
# Search
|
||||||
|
results = search_chunks(table, query_vec, limit=3)
|
||||||
|
for r in results:
|
||||||
|
print(f'[{r.score:.3f}] {r.source_file} | {r.section or \"(no section)\"}')
|
||||||
|
print(f' {r.chunk_text[:200]}...')
|
||||||
|
print()
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 5 — Test TypeScript search (via Node)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
node --input-type=module -e "
|
||||||
|
import { loadConfig } from './src/utils/config.js';
|
||||||
|
import { searchVectorDb } from './src/utils/lancedb.js';
|
||||||
|
|
||||||
|
const config = loadConfig();
|
||||||
|
const results = await searchVectorDb(config, 'how was my mental health in 2024', { max_results: 3 });
|
||||||
|
for (const r of results) {
|
||||||
|
console.log(\`[\${r.score}] \${r.source_file} | \${r.section || '(no section)'}\`);
|
||||||
|
console.log(\` \${r.chunk_text.slice(0, 180)}...\`);
|
||||||
|
console.log();
|
||||||
|
}
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 6 — Test DEGRADED mode (Ollama down)
|
||||||
|
|
||||||
|
Stop Ollama, then run the same search:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Stop Ollama
|
||||||
|
pkill -f ollama # macOS/Linux
|
||||||
|
|
||||||
|
# Now run search — should fall back to FTS
|
||||||
|
node --input-type=module -e "
|
||||||
|
...same as above...
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: results come back using BM25 full-text search instead of vector similarity. You'll see lower `_score` values (BM25 scores are smaller floats).
|
||||||
|
|
||||||
|
### Step 7 — Test sync
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Edit a note
|
||||||
|
echo "# Test edit
|
||||||
|
This is a test note about Ollama being down." >> ./KnowledgeVault/Default/test-note.md
|
||||||
|
|
||||||
|
# Sync
|
||||||
|
obsidian-rag sync
|
||||||
|
|
||||||
|
# Check it was indexed
|
||||||
|
obsidian-rag status
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 8 — Test indexer health check
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Stop Ollama
|
||||||
|
pkill -f ollama
|
||||||
|
|
||||||
|
# Check status — will report Ollama as down but still show index stats
|
||||||
|
obsidian-rag status
|
||||||
|
|
||||||
|
# Restart Ollama
|
||||||
|
ollama serve
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Directory Filtering
|
||||||
|
|
||||||
|
Test searching only within `Journal`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
node --input-type=module -e "
|
||||||
|
import { loadConfig } from './src/utils/config.js';
|
||||||
|
import { searchVectorDb } from './src/utils/lancedb.js';
|
||||||
|
const config = loadConfig();
|
||||||
|
const results = await searchVectorDb(config, 'my mood and feelings', {
|
||||||
|
max_results: 3,
|
||||||
|
directory_filter: ['Journal']
|
||||||
|
});
|
||||||
|
results.forEach(r => console.log(\`[\${r.score}] \${r.source_file}\`));
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## File Paths Reference
|
||||||
|
|
||||||
|
| File | Purpose |
|
||||||
|
|---|---|
|
||||||
|
| `obsidian-rag/vectors.lance/` | LanceDB data directory |
|
||||||
|
| `obsidian-rag/sync-result.json` | Last sync timestamp + stats |
|
||||||
|
| `python/obsidian_rag/` | Python package source |
|
||||||
|
| `src/` | TypeScript plugin source |
|
||||||
|
| `dist/index.js` | Built plugin bundle |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### `FileNotFoundError: config.json`
|
||||||
|
|
||||||
|
The config file could not be found. The CLI looks for it in:
|
||||||
|
1. `./obsidian-rag/config.json` (relative to project root)
|
||||||
|
2. `~/.obsidian-rag/config.json` (home directory)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Verify config is found
|
||||||
|
python3 -c "
|
||||||
|
import sys; sys.path.insert(0,'python')
|
||||||
|
from obsidian_rag.config import load_config
|
||||||
|
c = load_config()
|
||||||
|
print('vault_path:', c.vault_path)
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
### `ERROR: Index not found. Run 'obsidian-rag index' first.`
|
||||||
|
|
||||||
|
LanceDB table doesn't exist yet. Run `obsidian-rag index`.
|
||||||
|
|
||||||
|
### Ollama connection refused
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://localhost:11434/api/tags
|
||||||
|
```
|
||||||
|
|
||||||
|
If this fails, Ollama isn't running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ollama serve &
|
||||||
|
ollama pull mxbai-embed-large:335m
|
||||||
|
```
|
||||||
|
|
||||||
|
### Vector search returns 0 results
|
||||||
|
|
||||||
|
1. Check index exists: `obsidian-rag status`
|
||||||
|
2. Rebuild index: `obsidian-rag reindex`
|
||||||
|
3. Check Ollama is up and model is available: `ollama list`
|
||||||
|
|
||||||
|
### FTS (DEGRADED mode) not working after upgrade
|
||||||
|
|
||||||
|
The FTS index on `chunk_text` was added in a recent change. **Reindex to rebuild with FTS:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
obsidian-rag reindex
|
||||||
|
```
|
||||||
|
|
||||||
|
### Permission errors on Windows
|
||||||
|
|
||||||
|
Run terminal as Administrator, or install Python/Ollama to user-writable directories.
|
||||||
|
|
||||||
|
### Very slow embedding
|
||||||
|
|
||||||
|
Reduce batch size in `config.json`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
"batch_size": 32
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
obsidian-rag/
|
||||||
|
├── obsidian-rag/
|
||||||
|
│ ├── config.json # Dev configuration
|
||||||
|
│ ├── vectors.lance/ # LanceDB data (created on first index)
|
||||||
|
│ └── sync-result.json # Last sync metadata
|
||||||
|
├── python/
|
||||||
|
│ ├── obsidian_rag/
|
||||||
|
│ │ ├── cli.py # obsidian-rag CLI entry point
|
||||||
|
│ │ ├── config.py # Config loader
|
||||||
|
│ │ ├── indexer.py # Full pipeline (scan → chunk → embed → store)
|
||||||
|
│ │ ├── chunker.py # Structured + sliding-window chunking
|
||||||
|
│ │ ├── embedder.py # Ollama /api/embeddings client
|
||||||
|
│ │ ├── vector_store.py # LanceDB CRUD
|
||||||
|
│ │ └── security.py # Path traversal, HTML strip, sensitive detection
|
||||||
|
│ └── tests/unit/ # 64 pytest tests
|
||||||
|
├── src/
|
||||||
|
│ ├── index.ts # OpenClaw plugin entry (definePluginEntry)
|
||||||
|
│ ├── tools/
|
||||||
|
│ │ ├── index.ts # 4× api.registerTool() calls
|
||||||
|
│ │ ├── index-tool.ts # obsidian_rag_index implementation
|
||||||
|
│ │ ├── search.ts # obsidian_rag_search implementation
|
||||||
|
│ │ ├── status.ts # obsidian_rag_status implementation
|
||||||
|
│ │ └── memory.ts # obsidian_rag_memory_store implementation
|
||||||
|
│ ├── services/
|
||||||
|
│ │ ├── health.ts # HEALTHY / DEGRADED / UNAVAILABLE state machine
|
||||||
|
│ │ ├── vault-watcher.ts # chokidar watcher + auto-sync
|
||||||
|
│ │ └── indexer-bridge.ts # Spawns Python CLI subprocess
|
||||||
|
│ └── utils/
|
||||||
|
│ ├── config.ts # TS config loader
|
||||||
|
│ ├── lancedb.ts # TS LanceDB query + FTS fallback
|
||||||
|
│ ├── types.ts # Shared types (SearchResult, ResponseEnvelope)
|
||||||
|
│ └── response.ts # makeEnvelope() factory
|
||||||
|
├── dist/index.js # Built plugin (do not edit)
|
||||||
|
├── openclaw.plugin.json # Plugin manifest
|
||||||
|
├── package.json
|
||||||
|
└── tsconfig.json
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Health States
|
||||||
|
|
||||||
|
| State | Meaning | Search |
|
||||||
|
|---|---|---|
|
||||||
|
| `HEALTHY` | Ollama up + index exists | Vector similarity (semantic) |
|
||||||
|
| `DEGRADED` | Ollama down + index exists | FTS on `chunk_text` (BM25) |
|
||||||
|
| `UNAVAILABLE` | No index / corrupted | Error — run `obsidian-rag index` first |
|
||||||
181
docs/superpowers/specs/2026-04-10-obsidian-rag-task-list.md
Normal file
181
docs/superpowers/specs/2026-04-10-obsidian-rag-task-list.md
Normal file
@@ -0,0 +1,181 @@
|
|||||||
|
# Obsidian RAG Plugin - Work Queue
|
||||||
|
|
||||||
|
**Date:** 2026-04-10
|
||||||
|
**Based on:** Work Breakdown Structure v1.0
|
||||||
|
**Last Updated:** 2026-04-10 21:30
|
||||||
|
|
||||||
|
## Legend
|
||||||
|
- `[ ]` = Pending
|
||||||
|
- `[x]` = Done
|
||||||
|
- `[~]` = In Progress
|
||||||
|
- `[!]` = Error / Blocked
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 0: Project Scaffolding & Environment
|
||||||
|
|
||||||
|
### 0.1 Repository & Build Setup
|
||||||
|
- [x] **0.1.1** Initialize TypeScript project structure (S) - Create package.json, tsconfig.json, src/ directory
|
||||||
|
- [x] **0.1.2** Initialize Python package structure (S) - Create pyproject.toml, obsidian_rag/ module skeleton
|
||||||
|
- [x] **0.1.3** Create development config file (S) - Depends on 0.1.1 - Create ./obsidian-rag/config.json
|
||||||
|
- [x] **0.1.4** Set up OpenClaw plugin manifest (S) - Depends on 0.1.1 - Create openclaw.plugin.json
|
||||||
|
- [x] **0.1.5** Configure test runners (S) - Depends on 0.1.1, 0.1.2 - Setup vitest and pytest configs
|
||||||
|
|
||||||
|
### 0.2 Environment Validation
|
||||||
|
- [x] **0.2.1** Verify Ollama + mxbai-embed-large (S) - Test embedding API
|
||||||
|
- [x] **0.2.2** Verify LanceDB Python package (S) - Test table creation and queries
|
||||||
|
- [x] **0.2.3** Verify sample vault accessibility (S) - Count .md files in KnowledgeVault
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 1: Data Layer (Python Indexer)
|
||||||
|
|
||||||
|
### 1.1 Configuration (Python)
|
||||||
|
- [x] **1.1.1** Implement config loader (S) - Depends on 0.1.2 - Read JSON, resolve paths, validate schema
|
||||||
|
- [ ] **1.1.2** Write config tests (S) - Depends on 1.1.1 - Test validation and path resolution
|
||||||
|
|
||||||
|
### 1.2 Security (Python) - Can start after 1.1.1, parallel with other components
|
||||||
|
- [x] **1.2.1** Implement path traversal prevention (S) - Depends on 1.1.1 - Validate paths, reject ../ and symlinks
|
||||||
|
- [x] **1.2.2** Implement input sanitization (S) - Depends on 1.1.1 - Strip HTML, normalize whitespace
|
||||||
|
- [x] **1.2.3** Implement sensitive content detection (S) - Depends on 1.1.1 - Detect health/financial/relations content
|
||||||
|
- [x] **1.2.4** Implement directory access control (S) - Depends on 1.1.1 - Apply deny/allow lists
|
||||||
|
- [x] **1.2.5** Write security tests (M) - Depends on 1.2.1-1.2.4 - Test all security functions
|
||||||
|
|
||||||
|
### 1.3 Chunking - Can start after 1.1.1, parallel with security
|
||||||
|
- [x] **1.3.1** Implement markdown parser (S) - Depends on 0.1.2 - Parse frontmatter, headings, tags
|
||||||
|
- [x] **1.3.2** Implement structured chunker (M) - Depends on 1.3.1 - Split by section headers
|
||||||
|
- [x] **1.3.3** Implement sliding window chunker (S) - Depends on 1.3.1 - 500 token window with overlap
|
||||||
|
- [x] **1.3.4** Implement chunk router (S) - Depends on 1.3.2, 1.3.3 - Route structured vs unstructured
|
||||||
|
- [x] **1.3.5** Write chunker tests (M) - Depends on 1.3.4 - Test all chunking scenarios
|
||||||
|
|
||||||
|
### 1.4 Embedding - Can start after 1.1.1, parallel with chunking/security
|
||||||
|
- [x] **1.4.1** Implement Ollama embedder (M) - Depends on 1.1.1 - Batch 64 chunks, error handling
|
||||||
|
- [ ] **1.4.2** Implement embedding cache (S) - Depends on 1.4.1 - File-based cache
|
||||||
|
- [ ] **1.4.3** Write embedder tests (S) - Depends on 1.4.1, 1.4.2 - Test batching and cache
|
||||||
|
|
||||||
|
### 1.5 Vector Store - Can start after 0.2.2, parallel with other components
|
||||||
|
- [x] **1.5.1** Implement LanceDB table creation (S) - Depends on 0.2.2 - Create obsidian_chunks table
|
||||||
|
- [x] **1.5.2** Implement vector upsert (S) - Depends on 1.5.1 - Add/update chunks
|
||||||
|
- [x] **1.5.3** Implement vector delete (S) - Depends on 1.5.1 - Remove by source_file
|
||||||
|
- [x] **1.5.4** Implement vector search (M) - Depends on 1.5.1 - Query with filters
|
||||||
|
- [x] **1.5.5** Write vector store tests (M) - Depends on 1.5.2-1.5.4 - Test CRUD operations
|
||||||
|
|
||||||
|
### 1.6 Indexer Pipeline & CLI - Depends on multiple components
|
||||||
|
- [x] **1.6.1** Implement full index pipeline (M) - Depends on 1.2.4, 1.3.4, 1.4.1, 1.5.2 - Scan → parse → chunk → embed → store
|
||||||
|
- [x] **1.6.2** Implement incremental sync (M) - Depends on 1.6.1, 1.5.3 - Compare mtime, process changes
|
||||||
|
- [x] **1.6.3** Implement reindex (S) - Depends on 1.6.1 - Drop table + rebuild
|
||||||
|
- [x] **1.6.4** Implement sync-result.json writer (S) - Depends on 1.6.1 - Atomic file writing
|
||||||
|
- [x] **1.6.5** Implement CLI entry point (M) - Depends on 1.6.1, 1.6.2, 1.6.3 - index/sync/reindex commands
|
||||||
|
- [ ] **1.6.6** Write indexer tests (M) - Depends on 1.6.5 - Test full pipeline and CLI
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 2: Data Layer (TypeScript Client)
|
||||||
|
|
||||||
|
### 2.1 Configuration (TypeScript) - Can start after 0.1.1, parallel with Phase 1
|
||||||
|
- [x] **2.1.1** Implement config loader (S) - Depends on 0.1.1 - Read JSON, validate schema
|
||||||
|
- [x] **2.1.2** Implement config types (S) - Depends on 2.1.1 - TypeScript interfaces
|
||||||
|
|
||||||
|
### 2.2 LanceDB Client - Depends on Phase 1 completion
|
||||||
|
- [x] **2.2.1** Implement LanceDB query client (M) - Depends on 0.1.1 - Connect and search
|
||||||
|
- [~] **2.2.2** Implement full-text search fallback (S) - Depends on 2.2.1 - Degraded mode
|
||||||
|
|
||||||
|
### 2.3 Indexer Bridge - Depends on Phase 1 completion
|
||||||
|
- [x] **2.3.1** Implement subprocess spawner (M) - Depends on 0.1.1 - Spawn Python CLI
|
||||||
|
- [x] **2.3.2** Implement sync-result reader (S) - Depends on 2.3.1 - Read sync results
|
||||||
|
- [x] **2.3.3** Implement job tracking (S) - Depends on 2.3.1 - Track progress
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 3: Session & Transport Layers
|
||||||
|
|
||||||
|
### 3.1 Health State Machine - Depends on Phase 2
|
||||||
|
- [x] **3.1.1** Implement health prober (S) - Depends on 2.1.1, 2.2.1 - Probe dependencies
|
||||||
|
- [x] **3.1.2** Implement state machine (S) - Depends on 3.1.1 - HEALTHY/DEGRADED/UNAVAILABLE
|
||||||
|
- [x] **3.1.3** Implement staleness detector (S) - Depends on 3.1.2, 2.3.2 - Detect stale syncs
|
||||||
|
|
||||||
|
### 3.2 Vault Watcher - Depends on Phase 2
|
||||||
|
- [x] **3.2.1** Implement file watcher (S) - Depends on 2.1.1 - Watch vault directory
|
||||||
|
- [x] **3.2.2** Implement debounce & batching (M) - Depends on 3.2.1 - Batch changes
|
||||||
|
- [x] **3.2.3** Implement auto-sync trigger (M) - Depends on 3.2.2, 2.3.1, 3.1.2 - Trigger sync
|
||||||
|
- [ ] **3.2.4** Write vault watcher tests (M) - Depends on 3.2.3 - Test watcher behavior
|
||||||
|
|
||||||
|
### 3.3 Response Envelope & Error Normalization - Can start after 0.1.1, parallel
|
||||||
|
- [x] **3.3.1** Implement response envelope factory (S) - Depends on 0.1.1 - Build response structure
|
||||||
|
- [x] **3.3.2** Implement error normalizer (S) - Depends on 3.3.1 - Map exceptions to codes
|
||||||
|
|
||||||
|
### 3.4 Security Guard (TypeScript) - Can start after 2.1.1, parallel with 3.1-3.2
|
||||||
|
- [x] **3.4.1** Implement directory filter validator (S) - Depends on 2.1.1 - Validate filters
|
||||||
|
- [x] **3.4.2** Implement sensitive content flag (S) - Depends on 3.4.1 - Flag sensitive content
|
||||||
|
- [ ] **3.4.3** Write security guard tests (S) - Depends on 3.4.2 - Test security functions
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 4: Tool Layer
|
||||||
|
|
||||||
|
### 4.1 Tool Implementations - Depends on Phase 3
|
||||||
|
- [~] **4.1.1** Implement obsidian_rag_search tool (M) - Depends on 2.2.1, 3.3.1, 3.4.2 - Search with filters ⚠️ LanceDB TS client now wired, needs OpenClaw integration
|
||||||
|
- [~] **4.1.2** Implement obsidian_rag_index tool (M) - Depends on 2.3.1, 2.3.3, 3.3.1 - Spawn indexer ⚠️ stub — tool registration not wired to OpenClaw
|
||||||
|
- [~] **4.1.3** Implement obsidian_rag_status tool (S) - Depends on 3.1.2, 2.3.2, 3.3.1 - Return health status ⚠️ stub — reads sync-result not LanceDB stats
|
||||||
|
- [~] **4.1.4** Implement obsidian_rag_memory_store tool (S) - Depends on 3.3.1 - Persist to memory ⚠️ stub — no-op
|
||||||
|
- [ ] **4.1.5** Write tool unit tests (M) - Depends on 4.1.1-4.1.4 - Test all tools
|
||||||
|
|
||||||
|
### 4.2 Plugin Registration - Depends on tools
|
||||||
|
- [~] **4.2.1** Implement plugin entry point (M) - Depends on 4.1.1-4.1.4, 3.2.3, 3.1.2 - Plugin lifecycle ⚠️ stub — tools registration is a TODO
|
||||||
|
- [ ] **4.2.2** Verify OpenClaw plugin lifecycle (S) - Depends on 4.2.1 - Manual test
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 5: Integration & Hardening
|
||||||
|
|
||||||
|
### 5.1 Integration Tests - Depends on Phase 4
|
||||||
|
- [ ] **5.1.1** Full pipeline integration test (M) - Depends on 1.6.5, 4.2.1 - Index → search
|
||||||
|
- [ ] **5.1.2** Sync cycle integration test (M) - Depends on 3.2.3, 5.1.1 - Modify → auto-sync → search
|
||||||
|
- [ ] **5.1.3** Health state integration test (S) - Depends on 3.1.2, 5.1.1 - Test state transitions
|
||||||
|
- [ ] **5.1.4** OpenClaw protocol integration test (M) - Depends on 4.2.1 - Test all tools
|
||||||
|
|
||||||
|
### 5.2 Security Test Suite - Depends on relevant components
|
||||||
|
- [ ] **5.2.1** Path traversal tests (S) - Depends on 1.2.1, 3.4.1 - Test ../, symlinks, Windows paths
|
||||||
|
- [ ] **5.2.2** XSS prevention tests (S) - Depends on 1.2.2 - Test HTML injection
|
||||||
|
- [ ] **5.2.3** Prompt injection tests (S) - Depends on 4.1.1 - Test malicious content
|
||||||
|
- [ ] **5.2.4** Network audit test (S) - Depends on 1.4.1 - Verify no outbound requests
|
||||||
|
- [ ] **5.2.5** Sensitive content tests (S) - Depends on 1.2.3, 3.4.2 - Test detection and flagging
|
||||||
|
|
||||||
|
### 5.3 Documentation & Publishing - Depends on integration tests
|
||||||
|
- [ ] **5.3.1** Write README (S) - Depends on 4.2.1 - Usage and setup docs
|
||||||
|
- [ ] **5.3.2** Create SKILL.md (S) - Depends on 4.2.1 - Skill manifest
|
||||||
|
- [ ] **5.3.3** Publish to ClawHub (S) - Depends on 5.1.1-5.2.5 - Publish skill
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Progress Summary
|
||||||
|
|
||||||
|
| Phase | Tasks | Done | Pending | In Progress | Blocked |
|
||||||
|
|-------|-------|------|---------|-------------|---------|
|
||||||
|
| Phase 0: Scaffolding | 8 | 8 | 0 | 0 | 0 |
|
||||||
|
| Phase 1: Python Indexer | 20 | 16 | 2 | 2 | 0 |
|
||||||
|
| Phase 2: TS Client | 7 | 6 | 0 | 1 | 0 |
|
||||||
|
| Phase 3: Session/Transport | 10 | 8 | 1 | 1 | 0 |
|
||||||
|
| Phase 4: Tool Layer | 7 | 1 | 5 | 1 | 0 |
|
||||||
|
| Phase 5: Integration | 12 | 0 | 12 | 0 | 0 |
|
||||||
|
| **Total** | **64** | **39** | **20** | **5** | **0** |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Critical Path
|
||||||
|
|
||||||
|
1. Phase 0 → Phase 1 → Phase 2 → Phase 3 → Phase 4 → Phase 5
|
||||||
|
2. 0.1.1-0.1.5 → 1.1.1 → 1.3.1 → 1.6.1 → 2.2.1 → 3.1.1 → 3.2.1 → 4.1.1 → 4.2.1 → 5.1.1
|
||||||
|
|
||||||
|
## Parallel Work Opportunities
|
||||||
|
|
||||||
|
- **After 1.1.1**: Security (1.2), Chunking (1.3), Embedding (1.4) can work in parallel
|
||||||
|
- **After 0.2.2**: Vector Store (1.5) can work in parallel with other components
|
||||||
|
- **After 0.1.1**: TypeScript Config (2.1) can start early
|
||||||
|
- **Phase 3**: Response Envelope (3.3) and Security Guard (3.4) can work in parallel with Health (3.1) and Watcher (3.2)
|
||||||
|
|
||||||
|
## Effort Estimates
|
||||||
|
|
||||||
|
- **Small tasks (S)**: 31 tasks (~1-2 sessions each)
|
||||||
|
- **Medium tasks (M)**: 27 tasks (~3-5 sessions each)
|
||||||
|
- **Total**: 112-197 sessions across all phases (31 S × 1-2 + 27 M × 3-5)
|
||||||
97
openclaw.plugin.json
Normal file
97
openclaw.plugin.json
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
{
|
||||||
|
"schema_version": "1.0",
|
||||||
|
"name": "obsidian-rag",
|
||||||
|
"version": "0.1.0",
|
||||||
|
"description": "Semantic search through Obsidian vault notes using RAG. Powers natural language queries like 'How was my mental health in 2024?' across journal entries, financial records, health data, and more.",
|
||||||
|
"author": "Santhosh Janardhanan",
|
||||||
|
"tools": [
|
||||||
|
{
|
||||||
|
"name": "obsidian_rag_search",
|
||||||
|
"description": "Primary semantic search tool. Given a natural language query, searches the Obsidian vault index and returns the most relevant note chunks ranked by semantic similarity. Supports filtering by directory, date range, and tags.",
|
||||||
|
"parameter_schema": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"query": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Natural language question or topic to search for"
|
||||||
|
},
|
||||||
|
"max_results": {
|
||||||
|
"type": "integer",
|
||||||
|
"description": "Maximum number of chunks to return",
|
||||||
|
"default": 5,
|
||||||
|
"minimum": 1,
|
||||||
|
"maximum": 50
|
||||||
|
},
|
||||||
|
"directory_filter": {
|
||||||
|
"type": "array",
|
||||||
|
"description": "Limit search to specific vault subdirectories (e.g. ['Journal', 'Finance'])",
|
||||||
|
"items": { "type": "string" }
|
||||||
|
},
|
||||||
|
"date_range": {
|
||||||
|
"type": "object",
|
||||||
|
"description": "Filter by date range",
|
||||||
|
"properties": {
|
||||||
|
"from": { "type": "string", "description": "Start date (YYYY-MM-DD)" },
|
||||||
|
"to": { "type": "string", "description": "End date (YYYY-MM-DD)" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": {
|
||||||
|
"type": "array",
|
||||||
|
"description": "Filter by hashtags found in notes (e.g. ['#mentalhealth', '#therapy'])",
|
||||||
|
"items": { "type": "string" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["query"]
|
||||||
|
},
|
||||||
|
"required_permissions": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "obsidian_rag_index",
|
||||||
|
"description": "Trigger indexing of the Obsidian vault. Use 'full' for first-time setup, 'sync' for incremental updates, 'reindex' to force a clean rebuild.",
|
||||||
|
"parameter_schema": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"mode": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Indexing mode",
|
||||||
|
"enum": ["full", "sync", "reindex"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["mode"]
|
||||||
|
},
|
||||||
|
"required_permissions": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "obsidian_rag_status",
|
||||||
|
"description": "Check the health of the Obsidian RAG plugin — index statistics, last sync time, unindexed files, and Ollama status. Call this first when unsure if the index is ready.",
|
||||||
|
"parameter_schema": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {}
|
||||||
|
},
|
||||||
|
"required_permissions": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "obsidian_rag_memory_store",
|
||||||
|
"description": "Commit an important fact from search results to OpenClaw's memory for faster future retrieval. Use after finding significant information (e.g. 'I owe Sreenivas $50') that should be remembered.",
|
||||||
|
"parameter_schema": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"key": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Identifier for the fact (e.g. 'debt_to_sreenivas')"
|
||||||
|
},
|
||||||
|
"value": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "The fact to remember"
|
||||||
|
},
|
||||||
|
"source": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Source file path in the vault (e.g. 'Journal/2025-03-15.md')"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["key", "value", "source"]
|
||||||
|
},
|
||||||
|
"required_permissions": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
12078
package-lock.json
generated
Normal file
12078
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
26
package.json
Normal file
26
package.json
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"name": "obsidian-rag",
|
||||||
|
"version": "0.1.0",
|
||||||
|
"description": "OpenClaw plugin for semantic search through Obsidian vault notes using RAG",
|
||||||
|
"main": "dist/index.js",
|
||||||
|
"type": "module",
|
||||||
|
"scripts": {
|
||||||
|
"build": "esbuild src/index.ts --bundle --platform=node --target=node18 --outfile=dist/index.js --format=esm --external:@lancedb/lancedb --external:@lancedb/lancedb-darwin-arm64 --external:fsevents --external:chokidar",
|
||||||
|
"dev": "esbuild src/index.ts --bundle --platform=node --target=node18 --outfile=dist/index.js --format=esm --watch",
|
||||||
|
"typecheck": "tsc --noEmit",
|
||||||
|
"test": "vitest run",
|
||||||
|
"test:watch": "vitest"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"@lancedb/lancedb": "^0.12",
|
||||||
|
"chokidar": "^3.6",
|
||||||
|
"openclaw": "^2026.4.9",
|
||||||
|
"yaml": "^2.5"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/node": "^20.14",
|
||||||
|
"esbuild": "^0.24",
|
||||||
|
"typescript": "^5.5",
|
||||||
|
"vitest": "^2.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
14
python/obsidian_rag.egg-info/PKG-INFO
Normal file
14
python/obsidian_rag.egg-info/PKG-INFO
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
Metadata-Version: 2.4
|
||||||
|
Name: obsidian-rag
|
||||||
|
Version: 0.1.0
|
||||||
|
Summary: RAG indexer for Obsidian vaults — powers OpenClaw's obsidian_rag_* tools
|
||||||
|
Requires-Python: >=3.11
|
||||||
|
Requires-Dist: lancedb>=0.12
|
||||||
|
Requires-Dist: httpx>=0.27
|
||||||
|
Requires-Dist: pyyaml>=6.0
|
||||||
|
Requires-Dist: python-frontmatter>=1.1
|
||||||
|
Provides-Extra: dev
|
||||||
|
Requires-Dist: pytest>=8.0; extra == "dev"
|
||||||
|
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
||||||
|
Requires-Dist: pytest-mock>=3.12; extra == "dev"
|
||||||
|
Requires-Dist: ruff>=0.5; extra == "dev"
|
||||||
16
python/obsidian_rag.egg-info/SOURCES.txt
Normal file
16
python/obsidian_rag.egg-info/SOURCES.txt
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
pyproject.toml
|
||||||
|
obsidian_rag/__init__.py
|
||||||
|
obsidian_rag/__main__.py
|
||||||
|
obsidian_rag/chunker.py
|
||||||
|
obsidian_rag/cli.py
|
||||||
|
obsidian_rag/config.py
|
||||||
|
obsidian_rag/embedder.py
|
||||||
|
obsidian_rag/indexer.py
|
||||||
|
obsidian_rag/security.py
|
||||||
|
obsidian_rag/vector_store.py
|
||||||
|
obsidian_rag.egg-info/PKG-INFO
|
||||||
|
obsidian_rag.egg-info/SOURCES.txt
|
||||||
|
obsidian_rag.egg-info/dependency_links.txt
|
||||||
|
obsidian_rag.egg-info/entry_points.txt
|
||||||
|
obsidian_rag.egg-info/requires.txt
|
||||||
|
obsidian_rag.egg-info/top_level.txt
|
||||||
1
python/obsidian_rag.egg-info/dependency_links.txt
Normal file
1
python/obsidian_rag.egg-info/dependency_links.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
|
||||||
2
python/obsidian_rag.egg-info/entry_points.txt
Normal file
2
python/obsidian_rag.egg-info/entry_points.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
[console_scripts]
|
||||||
|
obsidian-rag = obsidian_rag.cli:main
|
||||||
10
python/obsidian_rag.egg-info/requires.txt
Normal file
10
python/obsidian_rag.egg-info/requires.txt
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
lancedb>=0.12
|
||||||
|
httpx>=0.27
|
||||||
|
pyyaml>=6.0
|
||||||
|
python-frontmatter>=1.1
|
||||||
|
|
||||||
|
[dev]
|
||||||
|
pytest>=8.0
|
||||||
|
pytest-asyncio>=0.23
|
||||||
|
pytest-mock>=3.12
|
||||||
|
ruff>=0.5
|
||||||
1
python/obsidian_rag.egg-info/top_level.txt
Normal file
1
python/obsidian_rag.egg-info/top_level.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
obsidian_rag
|
||||||
3
python/obsidian_rag/__init__.py
Normal file
3
python/obsidian_rag/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
"""Obsidian RAG — semantic search indexer for Obsidian vaults."""
|
||||||
|
|
||||||
|
__version__ = "0.1.0"
|
||||||
7
python/obsidian_rag/__main__.py
Normal file
7
python/obsidian_rag/__main__.py
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
"""CLI entry point: obsidian-rag index | sync | reindex | status."""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from obsidian_rag.cli import main
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
250
python/obsidian_rag/chunker.py
Normal file
250
python/obsidian_rag/chunker.py
Normal file
@@ -0,0 +1,250 @@
|
|||||||
|
"""Markdown parsing, structured + unstructured chunking, metadata enrichment."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
import hashlib
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
import frontmatter
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from obsidian_rag.config import ObsidianRagConfig
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Types
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Chunk:
|
||||||
|
chunk_id: str
|
||||||
|
text: str
|
||||||
|
source_file: str
|
||||||
|
source_directory: str
|
||||||
|
section: str | None
|
||||||
|
date: str | None
|
||||||
|
tags: list[str] = field(default_factory=list)
|
||||||
|
chunk_index: int = 0
|
||||||
|
total_chunks: int = 1
|
||||||
|
modified_at: str | None = None
|
||||||
|
indexed_at: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Markdown parsing
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def parse_frontmatter(content: str) -> tuple[dict, str]:
|
||||||
|
"""Parse frontmatter from markdown content. Returns (metadata, body)."""
|
||||||
|
try:
|
||||||
|
post = frontmatter.parse(content)
|
||||||
|
meta = dict(post[0]) if post[0] else {}
|
||||||
|
body = str(post[1])
|
||||||
|
return meta, body
|
||||||
|
except Exception:
|
||||||
|
return {}, content
|
||||||
|
|
||||||
|
|
||||||
|
def extract_tags(text: str) -> list[str]:
|
||||||
|
"""Extract all #hashtags from text, deduplicated, lowercased."""
|
||||||
|
return list(dict.fromkeys(t.lower() for t in re.findall(r"#[\w-]+", text)))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_date_from_filename(filepath: Path) -> str | None:
|
||||||
|
"""Try to parse an ISO date from a filename (e.g. 2024-01-15.md)."""
|
||||||
|
name = filepath.stem # filename without extension
|
||||||
|
# Match YYYY-MM-DD or YYYYMMDD
|
||||||
|
m = re.search(r"(\d{4}-\d{2}-\d{2})|(\d{4}\d{2}\d{2})", name)
|
||||||
|
if m:
|
||||||
|
date_str = m.group(1) or m.group(2)
|
||||||
|
# Normalize YYYYMMDD → YYYY-MM-DD
|
||||||
|
if len(date_str) == 8:
|
||||||
|
return f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}"
|
||||||
|
return date_str
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def is_structured_note(filepath: Path) -> bool:
|
||||||
|
"""Heuristic: journal/daily notes use date-named files with section headers."""
|
||||||
|
name = filepath.stem
|
||||||
|
date_match = re.search(r"\d{4}-\d{2}-\d{2}", name)
|
||||||
|
return date_match is not None
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Section-split chunker (structured notes)
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
SECTION_HEADER_RE = re.compile(r"^#{1,3}\s+(.+)$", re.MULTILINE)
|
||||||
|
|
||||||
|
|
||||||
|
def split_by_sections(body: str, metadata: dict) -> list[tuple[str, str]]:
|
||||||
|
"""Split markdown body into (section_name, section_content) pairs.
|
||||||
|
|
||||||
|
If no headers found, returns [(None, body)].
|
||||||
|
"""
|
||||||
|
sections: list[tuple[str | None, str]] = []
|
||||||
|
lines = body.splitlines(keepends=True)
|
||||||
|
current_heading: str | None = None
|
||||||
|
current_content: list[str] = []
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
m = SECTION_HEADER_RE.match(line.rstrip())
|
||||||
|
if m:
|
||||||
|
# Flush previous section
|
||||||
|
if current_heading is not None or current_content:
|
||||||
|
sections.append((current_heading, "".join(current_content).strip()))
|
||||||
|
current_content = []
|
||||||
|
current_heading = m.group(1).strip()
|
||||||
|
else:
|
||||||
|
current_content.append(line)
|
||||||
|
|
||||||
|
# Flush last section
|
||||||
|
if current_heading is not None or current_content:
|
||||||
|
sections.append((current_heading, "".join(current_content).strip()))
|
||||||
|
|
||||||
|
if not sections:
|
||||||
|
sections = [(None, body.strip())]
|
||||||
|
|
||||||
|
return sections
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Sliding window chunker (unstructured notes)
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _count_tokens(text: str) -> int:
|
||||||
|
"""Rough token count: split on whitespace, average ~4 chars per token."""
|
||||||
|
return len(text.split())
|
||||||
|
|
||||||
|
|
||||||
|
def sliding_window_chunks(
|
||||||
|
text: str,
|
||||||
|
chunk_size: int = 500,
|
||||||
|
overlap: int = 100,
|
||||||
|
) -> list[str]:
|
||||||
|
"""Split text into overlapping sliding-window chunks of ~chunk_size tokens.
|
||||||
|
|
||||||
|
Returns list of chunk strings.
|
||||||
|
"""
|
||||||
|
words = text.split()
|
||||||
|
if not words:
|
||||||
|
return []
|
||||||
|
|
||||||
|
chunks: list[str] = []
|
||||||
|
start = 0
|
||||||
|
|
||||||
|
while start < len(words):
|
||||||
|
end = start + chunk_size
|
||||||
|
chunk_words = words[start:end]
|
||||||
|
chunks.append(" ".join(chunk_words))
|
||||||
|
|
||||||
|
# Advance by (chunk_size - overlap)
|
||||||
|
advance = chunk_size - overlap
|
||||||
|
if advance <= 0:
|
||||||
|
advance = max(1, chunk_size // 2)
|
||||||
|
start += advance
|
||||||
|
|
||||||
|
if start >= len(words):
|
||||||
|
break
|
||||||
|
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Main chunk router
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _stable_chunk_id(content_hash: str, chunk_index: int) -> str:
|
||||||
|
"""Generate a stable chunk_id from content hash and index."""
|
||||||
|
raw = f"{content_hash}:{chunk_index}"
|
||||||
|
return hashlib.sha1(raw.encode()).hexdigest()[:12]
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_file(
|
||||||
|
filepath: Path,
|
||||||
|
content: str,
|
||||||
|
modified_at: str,
|
||||||
|
config: "ObsidianRagConfig",
|
||||||
|
chunk_id_prefix: str = "",
|
||||||
|
) -> list[Chunk]:
|
||||||
|
"""Parse a markdown file and return a list of Chunks.
|
||||||
|
|
||||||
|
Uses section-split for structured notes (journal entries with date filenames),
|
||||||
|
sliding window for everything else.
|
||||||
|
"""
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
vault_path = Path(config.vault_path)
|
||||||
|
rel_path = filepath if filepath.is_absolute() else filepath
|
||||||
|
source_file = str(rel_path)
|
||||||
|
source_directory = rel_path.parts[0] if rel_path.parts else ""
|
||||||
|
|
||||||
|
metadata, body = parse_frontmatter(content)
|
||||||
|
tags = extract_tags(body)
|
||||||
|
date = extract_date_from_filename(filepath)
|
||||||
|
|
||||||
|
chunk_size = config.indexing.chunk_size
|
||||||
|
overlap = config.indexing.chunk_overlap
|
||||||
|
|
||||||
|
# Compute content hash for stable, content-addressable chunk_ids
|
||||||
|
content_hash = hashlib.sha1(body.encode()).hexdigest()[:12]
|
||||||
|
|
||||||
|
chunks: list[Chunk] = []
|
||||||
|
|
||||||
|
if is_structured_note(filepath):
|
||||||
|
# Section-split for journal/daily notes
|
||||||
|
sections = split_by_sections(body, metadata)
|
||||||
|
total = len(sections)
|
||||||
|
|
||||||
|
for idx, (section, section_text) in enumerate(sections):
|
||||||
|
if not section_text.strip():
|
||||||
|
continue
|
||||||
|
section_tags = extract_tags(section_text)
|
||||||
|
combined_tags = list(dict.fromkeys([*tags, *section_tags]))
|
||||||
|
|
||||||
|
chunk_text = section_text
|
||||||
|
chunk = Chunk(
|
||||||
|
chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
|
||||||
|
text=chunk_text,
|
||||||
|
source_file=source_file,
|
||||||
|
source_directory=source_directory,
|
||||||
|
section=f"#{section}" if section else None,
|
||||||
|
date=date,
|
||||||
|
tags=combined_tags,
|
||||||
|
chunk_index=idx,
|
||||||
|
total_chunks=total,
|
||||||
|
modified_at=modified_at,
|
||||||
|
)
|
||||||
|
chunks.append(chunk)
|
||||||
|
else:
|
||||||
|
# Sliding window for unstructured notes
|
||||||
|
text_chunks = sliding_window_chunks(body, chunk_size, overlap)
|
||||||
|
total = len(text_chunks)
|
||||||
|
|
||||||
|
for idx, text_chunk in enumerate(text_chunks):
|
||||||
|
if not text_chunk.strip():
|
||||||
|
continue
|
||||||
|
chunk = Chunk(
|
||||||
|
chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
|
||||||
|
text=text_chunk,
|
||||||
|
source_file=source_file,
|
||||||
|
source_directory=source_directory,
|
||||||
|
section=None,
|
||||||
|
date=date,
|
||||||
|
tags=tags,
|
||||||
|
chunk_index=idx,
|
||||||
|
total_chunks=total,
|
||||||
|
modified_at=modified_at,
|
||||||
|
)
|
||||||
|
chunks.append(chunk)
|
||||||
|
|
||||||
|
return chunks
|
||||||
156
python/obsidian_rag/cli.py
Normal file
156
python/obsidian_rag/cli.py
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
"""CLI: obsidian-rag index | sync | reindex | status."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import obsidian_rag.config as config_mod
|
||||||
|
from obsidian_rag.vector_store import get_db, get_stats
|
||||||
|
from obsidian_rag.indexer import Indexer
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv: list[str] | None = None) -> int:
|
||||||
|
argv = argv or sys.argv[1:]
|
||||||
|
|
||||||
|
if not argv or argv[0] in ("--help", "-h"):
|
||||||
|
print(_usage())
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cmd = argv[0]
|
||||||
|
|
||||||
|
try:
|
||||||
|
config = config_mod.load_config()
|
||||||
|
except FileNotFoundError as e:
|
||||||
|
print(f"ERROR: {e}", file=sys.stderr)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
if cmd == "index":
|
||||||
|
return _index(config)
|
||||||
|
elif cmd == "sync":
|
||||||
|
return _sync(config)
|
||||||
|
elif cmd == "reindex":
|
||||||
|
return _reindex(config)
|
||||||
|
elif cmd == "status":
|
||||||
|
return _status(config)
|
||||||
|
else:
|
||||||
|
print(f"Unknown command: {cmd}\n{_usage()}", file=sys.stderr)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
|
||||||
|
def _index(config) -> int:
|
||||||
|
indexer = Indexer(config)
|
||||||
|
t0 = time.monotonic()
|
||||||
|
|
||||||
|
try:
|
||||||
|
gen = indexer.full_index()
|
||||||
|
result: dict = {"indexed_files": 0, "total_chunks": 0, "errors": []}
|
||||||
|
for item in gen:
|
||||||
|
result = item # progress yields are dicts; final dict from return
|
||||||
|
duration_ms = int((time.monotonic() - t0) * 1000)
|
||||||
|
print(
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"type": "complete",
|
||||||
|
"indexed_files": result["indexed_files"],
|
||||||
|
"total_chunks": result["total_chunks"],
|
||||||
|
"duration_ms": duration_ms,
|
||||||
|
"errors": result["errors"],
|
||||||
|
},
|
||||||
|
indent=2,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return 0 if not result["errors"] else 1
|
||||||
|
except Exception as e:
|
||||||
|
print(json.dumps({"type": "error", "error": str(e)}), file=sys.stderr)
|
||||||
|
return 2
|
||||||
|
|
||||||
|
|
||||||
|
def _sync(config) -> int:
|
||||||
|
indexer = Indexer(config)
|
||||||
|
try:
|
||||||
|
result = indexer.sync()
|
||||||
|
print(json.dumps({"type": "complete", **result}, indent=2))
|
||||||
|
return 0 if not result["errors"] else 1
|
||||||
|
except Exception as e:
|
||||||
|
print(json.dumps({"type": "error", "error": str(e)}), file=sys.stderr)
|
||||||
|
return 2
|
||||||
|
|
||||||
|
|
||||||
|
def _reindex(config) -> int:
|
||||||
|
indexer = Indexer(config)
|
||||||
|
t0 = time.monotonic()
|
||||||
|
try:
|
||||||
|
result = indexer.reindex()
|
||||||
|
duration_ms = int((time.monotonic() - t0) * 1000)
|
||||||
|
print(
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"type": "complete",
|
||||||
|
"indexed_files": result["indexed_files"],
|
||||||
|
"total_chunks": result["total_chunks"],
|
||||||
|
"duration_ms": duration_ms,
|
||||||
|
"errors": result["errors"],
|
||||||
|
},
|
||||||
|
indent=2,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return 0
|
||||||
|
except Exception as e:
|
||||||
|
print(json.dumps({"type": "error", "error": str(e)}), file=sys.stderr)
|
||||||
|
return 2
|
||||||
|
|
||||||
|
|
||||||
|
def _status(config) -> int:
|
||||||
|
try:
|
||||||
|
db = get_db(config)
|
||||||
|
table = db.open_table("obsidian_chunks")
|
||||||
|
stats = get_stats(table)
|
||||||
|
# Resolve sync-result.json path (same convention as indexer)
|
||||||
|
from pathlib import Path
|
||||||
|
import os as osmod
|
||||||
|
project_root = Path(__file__).parent.parent.parent
|
||||||
|
data_dir = project_root / "obsidian-rag"
|
||||||
|
if not data_dir.exists() and not (project_root / "KnowledgeVault").exists():
|
||||||
|
data_dir = Path(osmod.path.expanduser("~/.obsidian-rag"))
|
||||||
|
sync_path = data_dir / "sync-result.json"
|
||||||
|
last_sync = None
|
||||||
|
if sync_path.exists():
|
||||||
|
try:
|
||||||
|
last_sync = json.loads(sync_path.read_text()).get("timestamp")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
print(
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"total_docs": stats["total_docs"],
|
||||||
|
"total_chunks": stats["total_chunks"],
|
||||||
|
"last_sync": last_sync,
|
||||||
|
},
|
||||||
|
indent=2,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return 0
|
||||||
|
except FileNotFoundError:
|
||||||
|
print(json.dumps({"error": "Index not found. Run 'obsidian-rag index' first."}, indent=2))
|
||||||
|
return 1
|
||||||
|
except Exception as e:
|
||||||
|
print(json.dumps({"error": str(e)}), file=sys.stderr)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
|
||||||
|
def _usage() -> str:
|
||||||
|
return """obsidian-rag - Obsidian vault RAG indexer
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
obsidian-rag index Full index of the vault
|
||||||
|
obsidian-rag sync Incremental sync (changed files only)
|
||||||
|
obsidian-rag reindex Force full reindex (nuke + rebuild)
|
||||||
|
obsidian-rag status Show index health and statistics
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
145
python/obsidian_rag/config.py
Normal file
145
python/obsidian_rag/config.py
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
"""Configuration loader — reads ~/.obsidian-rag/config.json (or ./obsidian-rag/ for dev)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from enum import Enum
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
DEFAULT_CONFIG_DIR = Path(__file__).parent.parent.parent # python/ → project root
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class EmbeddingConfig:
|
||||||
|
provider: str = "ollama"
|
||||||
|
model: str = "mxbai-embed-large"
|
||||||
|
base_url: str = "http://localhost:11434"
|
||||||
|
dimensions: int = 1024
|
||||||
|
batch_size: int = 64
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class VectorStoreConfig:
|
||||||
|
type: str = "lancedb"
|
||||||
|
path: str = "" # resolved relative to data_dir
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class IndexingConfig:
|
||||||
|
chunk_size: int = 500
|
||||||
|
chunk_overlap: int = 100
|
||||||
|
file_patterns: list[str] = field(default_factory=lambda: ["*.md"])
|
||||||
|
deny_dirs: list[str] = field(
|
||||||
|
default_factory=lambda: [".obsidian", ".trash", "zzz-Archive", ".git", ".logseq"]
|
||||||
|
)
|
||||||
|
allow_dirs: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class SecurityConfig:
|
||||||
|
require_confirmation_for: list[str] = field(default_factory=lambda: ["health", "financial_debt"])
|
||||||
|
sensitive_sections: list[str] = field(
|
||||||
|
default_factory=lambda: ["#mentalhealth", "#physicalhealth", "#Relations"]
|
||||||
|
)
|
||||||
|
local_only: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MemoryConfig:
|
||||||
|
auto_suggest: bool = True
|
||||||
|
patterns: dict[str, list[str]] = field(
|
||||||
|
default_factory=lambda: {
|
||||||
|
"financial": ["owe", "owed", "debt", "paid", "$", "spent", "spend"],
|
||||||
|
"health": ["#mentalhealth", "#physicalhealth", "medication", "therapy"],
|
||||||
|
"commitments": ["shopping list", "costco", "amazon", "grocery"],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ObsidianRagConfig:
|
||||||
|
vault_path: str = ""
|
||||||
|
embedding: EmbeddingConfig = field(default_factory=EmbeddingConfig)
|
||||||
|
vector_store: VectorStoreConfig = field(default_factory=VectorStoreConfig)
|
||||||
|
indexing: IndexingConfig = field(default_factory=IndexingConfig)
|
||||||
|
security: SecurityConfig = field(default_factory=SecurityConfig)
|
||||||
|
memory: MemoryConfig = field(default_factory=MemoryConfig)
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_data_dir() -> Path:
|
||||||
|
"""Resolve the data directory: dev (project root/obsidian-rag/) or production (~/.obsidian-rag/)."""
|
||||||
|
dev_data_dir = DEFAULT_CONFIG_DIR / "obsidian-rag"
|
||||||
|
if dev_data_dir.exists() or (DEFAULT_CONFIG_DIR / "KnowledgeVault").exists():
|
||||||
|
return dev_data_dir
|
||||||
|
# Production: ~/.obsidian-rag/
|
||||||
|
return Path(os.path.expanduser("~/.obsidian-rag"))
|
||||||
|
|
||||||
|
|
||||||
|
def load_config(config_path: str | Path | None = None) -> ObsidianRagConfig:
|
||||||
|
"""Load config from JSON file, falling back to dev/default config."""
|
||||||
|
if config_path is None:
|
||||||
|
config_path = _resolve_data_dir() / "config.json"
|
||||||
|
else:
|
||||||
|
config_path = Path(config_path)
|
||||||
|
|
||||||
|
if not config_path.exists():
|
||||||
|
raise FileNotFoundError(f"Config file not found: {config_path}")
|
||||||
|
|
||||||
|
with open(config_path) as f:
|
||||||
|
raw: dict[str, Any] = json.load(f)
|
||||||
|
|
||||||
|
return ObsidianRagConfig(
|
||||||
|
vault_path=raw.get("vault_path", ""),
|
||||||
|
embedding=_merge(EmbeddingConfig(), raw.get("embedding", {})),
|
||||||
|
vector_store=_merge(VectorStoreConfig(), raw.get("vector_store", {})),
|
||||||
|
indexing=_merge(IndexingConfig(), raw.get("indexing", {})),
|
||||||
|
security=_merge(SecurityConfig(), raw.get("security", {})),
|
||||||
|
memory=_merge(MemoryConfig(), raw.get("memory", {})),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _merge(default: Any, overrides: dict[str, Any]) -> Any:
|
||||||
|
"""Shallow-merge a dict into a dataclass instance."""
|
||||||
|
if not isinstance(default, type) and not isinstance(default, (list, dict, str, int, float, bool)):
|
||||||
|
# It's a dataclass instance — merge fields
|
||||||
|
if hasattr(default, "__dataclass_fields__"):
|
||||||
|
fields = {}
|
||||||
|
for key, val in overrides.items():
|
||||||
|
if key in default.__dataclass_fields__:
|
||||||
|
field_def = default.__dataclass_fields__[key]
|
||||||
|
actual_default = field_def.default
|
||||||
|
if isinstance(actual_default, type) and issubclass(actual_default, Enum):
|
||||||
|
# Enum fields need special handling
|
||||||
|
fields[key] = val
|
||||||
|
elif isinstance(val, dict):
|
||||||
|
fields[key] = _merge(actual_default, val)
|
||||||
|
else:
|
||||||
|
fields[key] = val
|
||||||
|
else:
|
||||||
|
fields[key] = val
|
||||||
|
return default.__class__(**{**default.__dict__, **fields})
|
||||||
|
if isinstance(overrides, dict) and isinstance(default, dict):
|
||||||
|
return {**default, **overrides}
|
||||||
|
return overrides if overrides is not None else default
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_vault_path(config: ObsidianRagConfig) -> Path:
|
||||||
|
"""Resolve vault_path relative to project root or as absolute."""
|
||||||
|
vp = Path(config.vault_path)
|
||||||
|
if vp.is_absolute():
|
||||||
|
return vp
|
||||||
|
# Resolve relative to project root
|
||||||
|
return (DEFAULT_CONFIG_DIR / vp).resolve()
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_vector_db_path(config: ObsidianRagConfig) -> Path:
|
||||||
|
"""Resolve vector store path relative to data directory."""
|
||||||
|
data_dir = _resolve_data_dir()
|
||||||
|
vsp = Path(config.vector_store.path)
|
||||||
|
if vsp.is_absolute():
|
||||||
|
return vsp
|
||||||
|
return (data_dir / vsp).resolve()
|
||||||
110
python/obsidian_rag/embedder.py
Normal file
110
python/obsidian_rag/embedder.py
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
"""Ollama API client for embedding generation."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import time
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from obsidian_rag.config import ObsidianRagConfig
|
||||||
|
|
||||||
|
DEFAULT_TIMEOUT = 120.0 # seconds
|
||||||
|
|
||||||
|
|
||||||
|
class EmbeddingError(Exception):
|
||||||
|
"""Raised when embedding generation fails."""
|
||||||
|
|
||||||
|
|
||||||
|
class OllamaUnavailableError(EmbeddingError):
|
||||||
|
"""Raised when Ollama is unreachable."""
|
||||||
|
|
||||||
|
|
||||||
|
class OllamaEmbedder:
    """Client for the Ollama embeddings endpoint (mxbai-embed-large, 1024-dim).

    Talks to ``POST {base_url}/api/embeddings``, which accepts a single
    ``prompt`` per request; batching is therefore client-side only
    (texts are embedded sequentially).
    """

    def __init__(self, config: "ObsidianRagConfig"):
        # All connection/model settings come from the embedding section
        # of the project config. No network I/O happens here.
        self.base_url = config.embedding.base_url.rstrip("/")
        self.model = config.embedding.model
        self.dimensions = config.embedding.dimensions
        self.batch_size = config.embedding.batch_size
        self._client = httpx.Client(timeout=DEFAULT_TIMEOUT)

    def is_available(self) -> bool:
        """Return True if Ollama answers /api/tags and lists our model."""
        try:
            resp = self._client.get(f"{self.base_url}/api/tags", timeout=5.0)
            if resp.status_code != 200:
                return False
            models = resp.json().get("models", [])
            # Substring match so "mxbai-embed-large" also matches a
            # tagged variant such as "mxbai-embed-large:335m".
            return any(self.model in m.get("name", "") for m in models)
        except Exception:
            # Any transport or parse error simply means "not available".
            return False

    def embed_chunks(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for ``texts``, preserving order.

        Empty input returns []. Texts are processed in client-side
        batches of ``self.batch_size``.
        """
        if not texts:
            return []
        all_vectors: list[list[float]] = []
        for i in range(0, len(texts), self.batch_size):
            all_vectors.extend(self._embed_batch(texts[i : i + self.batch_size]))
        return all_vectors

    def embed_single(self, text: str) -> list[float]:
        """Generate the embedding for a single text."""
        [vec] = self._embed_batch([text])
        return vec

    def _embed_batch(self, batch: list[str]) -> list[list[float]]:
        """Embed each text in ``batch`` with one request per text.

        (Previously this duplicated the whole request/error/parse logic
        across separate single-item and multi-item branches; both paths
        did exactly the same per-text work, so they are unified here.)

        Raises:
            OllamaUnavailableError: if the server cannot be reached.
            EmbeddingError: on timeout, non-200 response, or bad payload.
        """
        return [self._embed_one(text) for text in batch]

    def _embed_one(self, text: str) -> list[float]:
        """POST one prompt to /api/embeddings and return its vector."""
        try:
            resp = self._client.post(
                f"{self.base_url}/api/embeddings",
                json={"model": self.model, "prompt": text},
                timeout=DEFAULT_TIMEOUT,
            )
        except httpx.ConnectError as e:
            raise OllamaUnavailableError(f"Cannot connect to Ollama at {self.base_url}") from e
        except httpx.TimeoutException as e:
            raise EmbeddingError(f"Embedding request timed out after {DEFAULT_TIMEOUT}s") from e

        if resp.status_code != 200:
            raise EmbeddingError(f"Ollama returned {resp.status_code}: {resp.text}")

        data = resp.json()
        # Accept both response shapes: "embedding" (single vector) and
        # "embeddings" (list of vectors) — different Ollama versions
        # return one or the other.
        embedding = data.get("embedding", [])
        if not embedding:
            embedding = data.get("embeddings", [[]])[0]
        return embedding

    def close(self):
        """Release the underlying HTTP connection pool."""
        self._client.close()
|
||||||
223
python/obsidian_rag/indexer.py
Normal file
223
python/obsidian_rag/indexer.py
Normal file
@@ -0,0 +1,223 @@
|
|||||||
|
"""Full indexing pipeline: scan → parse → chunk → embed → store."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING, Any, Generator, Iterator
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from obsidian_rag.config import ObsidianRagConfig
|
||||||
|
|
||||||
|
import obsidian_rag.config as config_mod
|
||||||
|
from obsidian_rag.chunker import chunk_file
|
||||||
|
from obsidian_rag.embedder import EmbeddingError, OllamaUnavailableError
|
||||||
|
from obsidian_rag.security import should_index_dir, validate_path
|
||||||
|
from obsidian_rag.vector_store import create_table_if_not_exists, delete_by_source_file, get_db, upsert_chunks
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Pipeline
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class Indexer:
    """Coordinates the scan → parse → chunk → embed → store pipeline."""

    def __init__(self, config: "ObsidianRagConfig"):
        self.config = config
        self.vault_path = config_mod.resolve_vault_path(config)
        # Embedder is built lazily so constructing an Indexer never
        # requires Ollama to be reachable.
        self._embedder = None

    @property
    def embedder(self):
        """Lazily-created OllamaEmbedder shared across all operations."""
        if self._embedder is None:
            from obsidian_rag.embedder import OllamaEmbedder

            self._embedder = OllamaEmbedder(self.config)
        return self._embedder

    def scan_vault(self) -> Generator[Path, None, None]:
        """Yield every indexable markdown file under the vault.

        Applies directory allow/deny rules and drops any path that fails
        traversal validation.
        """
        for root, dirs, files in os.walk(self.vault_path):
            root_path = Path(root)
            # Prune in place so os.walk never descends into denied dirs.
            dirs[:] = [d for d in dirs if should_index_dir(d, self.config)]

            for fname in files:
                if not fname.endswith(".md"):
                    continue
                filepath = root_path / fname
                try:
                    validate_path(filepath, self.vault_path)
                except ValueError:
                    continue  # escapes the vault (symlink/traversal) — skip
                yield filepath

    def process_file(self, filepath: Path) -> tuple[int, list[dict[str, Any]]]:
        """Read, sanitize, and chunk one file.

        Returns (chunk count, chunk dicts ready for embedding — i.e.
        every field except "vector").
        """
        from obsidian_rag import security

        mtime = datetime.fromtimestamp(filepath.stat().st_mtime, tz=timezone.utc).isoformat()
        content = security.sanitize_text(filepath.read_text(encoding="utf-8"))
        chunks = chunk_file(filepath, content, mtime, self.config)

        now = datetime.now(timezone.utc).isoformat()
        enriched: list[dict[str, Any]] = [
            {
                "chunk_id": chunk.chunk_id,
                "chunk_text": chunk.text,
                "source_file": chunk.source_file,
                "source_directory": chunk.source_directory,
                "section": chunk.section,
                "date": chunk.date,
                "tags": chunk.tags,
                "chunk_index": chunk.chunk_index,
                "total_chunks": chunk.total_chunks,
                "modified_at": chunk.modified_at,
                "indexed_at": now,
            }
            for chunk in chunks
        ]
        return len(chunks), enriched

    def _index_one(self, table: Any, filepath: Path) -> int:
        """Chunk, embed, and upsert a single file; returns its chunk count.

        (Shared by full_index and sync, which previously duplicated this
        loop body verbatim.)
        """
        num_chunks, enriched = self.process_file(filepath)
        texts = [e["chunk_text"] for e in enriched]
        try:
            vectors = self.embedder.embed_chunks(texts)
        except OllamaUnavailableError:
            # NOTE(review): stores zero vectors when Ollama is down so the
            # pipeline can proceed offline; such rows are useless for
            # vector search until re-embedded — confirm this is intended.
            vectors = [[0.0] * 1024 for _ in texts]
        for entry, vector in zip(enriched, vectors):
            entry["vector"] = vector
        upsert_chunks(table, enriched)
        return num_chunks

    def full_index(self, on_progress: Iterator[dict] | None = None) -> dict[str, Any]:
        """Index the entire vault.

        This is a generator function: progress dicts are yielded only when
        ``on_progress`` is truthy (it is used purely as a flag, never
        called), and the summary dict is delivered as the generator's
        *return* value (available via ``StopIteration.value``).
        """
        if not self.vault_path.exists():
            raise FileNotFoundError(f"Vault not found: {self.vault_path}")

        db = get_db(self.config)
        table = create_table_if_not_exists(db)

        files = list(self.scan_vault())
        total_files = len(files)
        indexed_files = 0
        total_chunks = 0
        errors: list[dict] = []

        for idx, filepath in enumerate(files):
            try:
                total_chunks += self._index_one(table, filepath)
                indexed_files += 1
            except Exception as exc:
                errors.append({"file": str(filepath), "error": str(exc)})

            if on_progress:
                # Coarse heuristic: report the first half of files as
                # "embedding" and the second half as "storing".
                phase = "embedding" if idx < total_files // 2 else "storing"
                yield {
                    "type": "progress",
                    "phase": phase,
                    "current": idx + 1,
                    "total": total_files,
                }

        return {
            "indexed_files": indexed_files,
            "total_chunks": total_chunks,
            "duration_ms": 0,  # caller can fill
            "errors": errors,
        }

    def sync(self, on_progress: Iterator[dict] | None = None) -> dict[str, Any]:
        """Incremental sync: reindex only files modified since the last sync.

        ``on_progress`` is accepted for signature parity with full_index
        but is currently unused.
        """
        last_sync = self._last_sync_timestamp()

        db = get_db(self.config)
        table = create_table_if_not_exists(db)

        indexed_files = 0
        total_chunks = 0
        errors: list[dict] = []

        for filepath in self.scan_vault():
            mtime = datetime.fromtimestamp(filepath.stat().st_mtime, tz=timezone.utc)
            # ISO-8601 UTC timestamps compare correctly as plain strings.
            if last_sync and mtime.isoformat() <= last_sync:
                continue  # unchanged since last sync

            try:
                total_chunks += self._index_one(table, filepath)
                indexed_files += 1
            except Exception as exc:
                errors.append({"file": str(filepath), "error": str(exc)})

        self._write_sync_result(indexed_files, total_chunks, errors)
        return {
            "indexed_files": indexed_files,
            "total_chunks": total_chunks,
            "errors": errors,
        }

    def reindex(self) -> dict[str, Any]:
        """Nuke and rebuild: drop the table and run a full index.

        Fix: full_index() is a generator whose summary is its *return*
        value. ``list(self.full_index())`` discarded it — generator return
        values are never yielded, so with no progress flag the list was
        always empty and the zero default was returned. Drain the
        generator manually and capture ``StopIteration.value`` instead.
        """
        db = get_db(self.config)
        if "obsidian_chunks" in db.list_tables():
            db.drop_table("obsidian_chunks")

        gen = self.full_index()
        while True:
            try:
                next(gen)
            except StopIteration as stop:
                return stop.value or {"indexed_files": 0, "total_chunks": 0, "errors": []}

    def _last_sync_timestamp(self) -> str | None:
        """Best-effort read of the previous sync timestamp (None if missing/corrupt)."""
        path = self._sync_result_path()
        if not path.exists():
            return None
        try:
            return json.loads(path.read_text()).get("timestamp")
        except Exception:
            return None

    def _sync_result_path(self) -> Path:
        """Location of sync-result.json, mirroring config.py's dev-data-dir rule."""
        project_root = Path(__file__).parent.parent.parent
        data_dir = project_root / "obsidian-rag"
        if not data_dir.exists() and not (project_root / "KnowledgeVault").exists():
            data_dir = Path(os.path.expanduser("~/.obsidian-rag"))
        return data_dir / "sync-result.json"

    def _write_sync_result(
        self,
        indexed_files: int,
        total_chunks: int,
        errors: list[dict],
    ) -> None:
        """Persist the sync summary atomically (write temp file, then rename)."""
        path = self._sync_result_path()
        path.parent.mkdir(parents=True, exist_ok=True)
        result = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "indexed_files": indexed_files,
            "total_chunks": total_chunks,
            "errors": errors,
        }
        tmp = path.with_suffix(".json.tmp")
        tmp.write_text(json.dumps(result, indent=2))
        tmp.rename(path)
|
||||||
164
python/obsidian_rag/security.py
Normal file
164
python/obsidian_rag/security.py
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
"""Path traversal prevention, input sanitization, sensitive content detection, directory access control."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from obsidian_rag.config import ObsidianRagConfig
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Path traversal
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def validate_path(requested: Path, vault_root: Path) -> Path:
    """Resolve ``requested`` relative to ``vault_root`` and return it.

    Raises ValueError for anything that escapes the vault: paths that
    cannot be resolved, paths whose resolved form lies outside the
    vault, and paths containing a literal ".." component (defense in
    depth even when the path would resolve inside).
    """
    vault = vault_root.resolve()

    try:
        resolved = Path(vault, requested).resolve()
    except (OSError, ValueError) as e:
        raise ValueError(f"Cannot resolve path: {requested}") from e

    # After symlink/".." resolution the path must still be inside the vault.
    if not resolved.is_relative_to(vault):
        raise ValueError(f"Path traversal attempt blocked: {requested} resolves outside vault")

    # Reject explicit ".." components regardless of where they resolve.
    if ".." in requested.parts:
        raise ValueError(f"Path traversal attempt blocked: {requested}")

    return resolved
|
||||||
|
|
||||||
|
|
||||||
|
def is_symlink_outside_vault(path: Path, vault_root: Path) -> bool:
    """True if ``path`` resolves outside ``vault_root``.

    Despite the name this checks the resolved location of any path, not
    only symlinks. Paths that cannot be resolved are treated as outside
    (fail closed).
    """
    try:
        return not path.resolve().is_relative_to(vault_root.resolve())
    except (OSError, ValueError):
        return True
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Input sanitization
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
# Pre-compiled patterns used by sanitize_text().
HTML_TAG_RE = re.compile(r"<[^>]+>")
CODE_BLOCK_RE = re.compile(r"```[\s\S]*?```", re.MULTILINE)
MULTI_WHITESPACE_RE = re.compile(r"\s+")
MAX_CHUNK_LEN = 2000  # hard cap on sanitized text length, in characters


def sanitize_text(raw: str) -> str:
    """Sanitize raw vault content before it is embedded.

    Pipeline: drop fenced code blocks, strip HTML tags (XSS hygiene),
    trim the ends, collapse internal whitespace runs to single spaces,
    and cap the result at MAX_CHUNK_LEN characters.
    """
    without_code = CODE_BLOCK_RE.sub(" ", raw)
    without_tags = HTML_TAG_RE.sub("", without_code)
    normalized = MULTI_WHITESPACE_RE.sub(" ", without_tags.strip())
    return normalized[:MAX_CHUNK_LEN]
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Sensitive content detection
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def detect_sensitive(
    text: str,
    sensitive_sections: list[str],
    patterns: dict[str, list[str]],
) -> dict[str, bool]:
    """Detect sensitive content categories in ``text``.

    Returns a dict with boolean flags "health", "financial", and
    "relations". Matching is case-insensitive substring search.
    NOTE(review): no rule ever sets "relations" — it is always False;
    confirm whether a rule is missing.
    """
    haystack = text.lower()
    flags: dict[str, bool] = {
        "health": False,
        "financial": False,
        "relations": False,
    }

    # A known health-section heading appearing anywhere flags health.
    health_headings = ("#mentalhealth", "#physicalhealth")
    for section in sensitive_sections:
        lowered = section.lower()
        if lowered in haystack and lowered in health_headings:
            flags["health"] = True

    # Category keyword patterns.
    if any(p.lower() in haystack for p in patterns.get("financial", [])):
        flags["financial"] = True
    if any(p.lower() in haystack for p in patterns.get("health", [])):
        flags["health"] = True

    return flags
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Directory access control
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def should_index_dir(
    dir_name: str,
    config: "ObsidianRagConfig",
) -> bool:
    """Decide whether a directory may be indexed.

    Rules, in priority order:
    1. Hidden dirs (leading '.') are always rejected.
    2. A non-empty allow_dirs acts as a strict whitelist.
    3. Otherwise any dir listed in deny_dirs is rejected.
    """
    if dir_name.startswith("."):
        return False

    allow = config.indexing.allow_dirs
    if allow:
        return dir_name in allow

    return dir_name not in config.indexing.deny_dirs
|
||||||
|
|
||||||
|
|
||||||
|
def filter_tags(text: str) -> list[str]:
    """Extract all #hashtags from ``text``, lowercased, first-seen order, no dupes."""
    seen: dict[str, None] = {}
    for tag in re.findall(r"#\w+", text):
        seen.setdefault(tag.lower(), None)
    return list(seen)
|
||||||
181
python/obsidian_rag/vector_store.py
Normal file
181
python/obsidian_rag/vector_store.py
Normal file
@@ -0,0 +1,181 @@
|
|||||||
|
"""LanceDB table creation, vector upsert/delete/search."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING, Any, Iterable
|
||||||
|
|
||||||
|
import lancedb
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from obsidian_rag.config import ObsidianRagConfig
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Schema constants
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
TABLE_NAME = "obsidian_chunks"
|
||||||
|
VECTOR_DIM = 1024 # mxbai-embed-large
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Types
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SearchResult:
    """A single hit returned by search_chunks()."""

    chunk_id: str
    chunk_text: str
    source_file: str
    source_directory: str
    # Optional metadata: None when the chunk had no heading / dated filename.
    section: str | None
    date: str | None
    tags: list[str]
    chunk_index: int
    # Populated from the row's "_distance" field by search_chunks().
    score: float
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Table setup
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def get_db(config: "ObsidianRagConfig") -> lancedb.LanceDBConnection:
    """Open the LanceDB database, creating parent directories if needed."""
    import obsidian_rag.config as cfg_mod

    path = cfg_mod.resolve_vector_db_path(config)
    path.parent.mkdir(parents=True, exist_ok=True)
    return lancedb.connect(str(path))
|
||||||
|
|
||||||
|
|
||||||
|
def create_table_if_not_exists(db: Any) -> Any:
    """Open the obsidian_chunks table, creating it (plus FTS index) on first use."""
    import pyarrow as pa

    if TABLE_NAME in db.list_tables():
        # NOTE(review): pre-existing tables are returned as-is — the FTS
        # index below is only built on the creation path, despite the
        # idempotence comment; confirm older tables already have it.
        return db.open_table(TABLE_NAME)

    fields = [
        pa.field("vector", pa.list_(pa.float32(), VECTOR_DIM)),
        pa.field("chunk_id", pa.string()),
        pa.field("chunk_text", pa.string()),
        pa.field("source_file", pa.string()),
        pa.field("source_directory", pa.string()),
        pa.field("section", pa.string()),
        pa.field("date", pa.string()),
        pa.field("tags", pa.list_(pa.string())),
        pa.field("chunk_index", pa.int32()),
        pa.field("total_chunks", pa.int32()),
        pa.field("modified_at", pa.string()),
        pa.field("indexed_at", pa.string()),
    ]
    tbl = db.create_table(TABLE_NAME, schema=pa.schema(fields), exist_ok=True)
    # FTS index on chunk_text enables keyword fallback for DEGRADED mode
    # (Ollama down); replace=True keeps the call idempotent.
    tbl.create_fts_index("chunk_text", replace=True)
    return tbl
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# CRUD operations
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def upsert_chunks(
    table: Any,
    chunks: list[dict[str, Any]],
) -> int:
    """Upsert chunk rows keyed by chunk_id; returns the number written.

    Matched rows are fully updated, unmatched rows inserted — a complete
    upsert via LanceDB merge_insert.
    """
    if not chunks:
        return 0

    merge = table.merge_insert("chunk_id")
    merge = merge.when_matched_update_all()
    merge = merge.when_not_matched_insert_all()
    merge.execute(chunks)
    return len(chunks)
|
||||||
|
|
||||||
|
|
||||||
|
def delete_by_source_file(table: Any, source_file: str) -> int:
    """Delete every chunk that originated from ``source_file``.

    Returns the number of rows removed (before/after count difference).

    The filename is embedded as a proper SQL string literal: single
    quotes with embedded quotes doubled. The previous version used
    double quotes (identifier quoting in LanceDB's SQL dialect, not a
    string literal) and interpolated the value unescaped, allowing
    filter injection via crafted filenames.
    """
    before = table.count_rows()
    escaped = source_file.replace("'", "''")
    table.delete(f"source_file = '{escaped}'")
    return before - table.count_rows()
|
||||||
|
|
||||||
|
|
||||||
|
def search_chunks(
    table: Any,
    query_vector: list[float],
    limit: int = 5,
    directory_filter: list[str] | None = None,
    date_range: dict | None = None,
    tags: list[str] | None = None,
) -> list[SearchResult]:
    """Vector-similarity search with optional metadata filters (ANDed).

    Args:
        table: LanceDB table handle.
        query_vector: query embedding (1024-dim for mxbai-embed-large).
        limit: maximum number of results.
        directory_filter: restrict hits to these source_directory values.
        date_range: optional {"from": iso, "to": iso} inclusive bounds.
        tags: hits must contain every listed tag.

    Filter values are embedded as escaped single-quoted SQL string
    literals. The previous version interpolated them unescaped (filter
    injection) and quoted directories with double quotes, which are
    identifier quoting — not string literals — in LanceDB's SQL dialect.
    """

    def _quote(value: str) -> str:
        # SQL string literal: single quotes, embedded quotes doubled.
        return "'" + value.replace("'", "''") + "'"

    conditions: list[str] = []
    if directory_filter:
        dir_list = ", ".join(_quote(d) for d in directory_filter)
        conditions.append(f"source_directory IN ({dir_list})")
    if date_range:
        if "from" in date_range:
            conditions.append(f"date >= {_quote(date_range['from'])}")
        if "to" in date_range:
            conditions.append(f"date <= {_quote(date_range['to'])}")
    if tags:
        conditions.extend(f"list_contains(tags, {_quote(tag)})" for tag in tags)

    query = table.search(query_vector, vector_column_name="vector").limit(limit)
    if conditions:
        query = query.where(" AND ".join(conditions))
    rows = query.to_list()

    return [
        SearchResult(
            chunk_id=r["chunk_id"],
            chunk_text=r["chunk_text"],
            source_file=r["source_file"],
            source_directory=r["source_directory"],
            # Normalize the stored string "None" (and null) back to None.
            section=r.get("section") if r.get("section") not in (None, "None") else None,
            date=r.get("date") if r.get("date") not in (None, "None") else None,
            tags=r.get("tags") or [],
            chunk_index=r.get("chunk_index") or 0,
            score=r.get("_distance") or 0.0,
        )
        for r in rows
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def get_stats(table: Any) -> dict[str, Any]:
    """Best-effort index statistics: unique source files and total chunks.

    Any failure leaves the affected counters at 0 rather than raising —
    this is intentionally tolerant so status reporting never breaks.
    """
    stats: dict[str, Any] = {"total_docs": 0, "total_chunks": 0}
    try:
        stats["total_chunks"] = table.count_rows()
        # Unique source files; loads the whole table into pandas, which
        # is acceptable at personal-vault scale.
        stats["total_docs"] = table.to_pandas()["source_file"].nunique()
    except Exception:
        pass  # deliberate best-effort: report whatever was gathered
    return stats
|
||||||
35
python/pyproject.toml
Normal file
35
python/pyproject.toml
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=68.0"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "obsidian-rag"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "RAG indexer for Obsidian vaults — powers OpenClaw's obsidian_rag_* tools"
|
||||||
|
requires-python = ">=3.11"
|
||||||
|
dependencies = [
|
||||||
|
"lancedb>=0.12",
|
||||||
|
"httpx>=0.27",
|
||||||
|
"pyyaml>=6.0",
|
||||||
|
"python-frontmatter>=1.1",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = [
|
||||||
|
"pytest>=8.0",
|
||||||
|
"pytest-asyncio>=0.23",
|
||||||
|
"pytest-mock>=3.12",
|
||||||
|
"ruff>=0.5",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
obsidian-rag = "obsidian_rag.cli:main"
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
where = ["."]
|
||||||
|
include = ["obsidian_rag*"]
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
testpaths = ["tests"]
|
||||||
|
pythonpath = ["."]
|
||||||
|
asyncio_mode = "auto"
|
||||||
250
python/tests/unit/test_chunker.py
Normal file
250
python/tests/unit/test_chunker.py
Normal file
@@ -0,0 +1,250 @@
|
|||||||
|
"""Tests for obsidian_rag.chunker — section splitting and sliding window."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
import tempfile
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from obsidian_rag.chunker import (
|
||||||
|
extract_tags,
|
||||||
|
extract_date_from_filename,
|
||||||
|
is_structured_note,
|
||||||
|
parse_frontmatter,
|
||||||
|
split_by_sections,
|
||||||
|
sliding_window_chunks,
|
||||||
|
chunk_file,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# parse_frontmatter
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_frontmatter_with_yaml():
    content = (
        "---\n"
        "title: My Journal\n"
        "tags: [journal, personal]\n"
        "---\n"
        "# Morning\n"
        "\n"
        "Some content here.\n"
    )
    meta, body = parse_frontmatter(content)
    assert meta.get("title") == "My Journal"
    assert "# Morning" in body
    assert "Some content" in body


def test_parse_frontmatter_without_frontmatter():
    meta, body = parse_frontmatter("# Just a header\n\nSome text without frontmatter.")
    assert meta == {}
    assert "# Just a header" in body
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# extract_tags
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_tags_basic():
    tags = extract_tags("Hello #world and #python-code is nice")
    assert "#world" in tags
    assert "#python-code" in tags
    # every tag keeps its leading '#'
    assert all(t.startswith("#") for t in tags)


def test_extract_tags_deduplicates():
    assert len(extract_tags("#hello #world #hello #python")) == 3
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# extract_date_from_filename
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_date_from_filename_iso():
    assert extract_date_from_filename(Path("2024-01-15.md")) == "2024-01-15"


def test_extract_date_from_filename_compact():
    # YYYYMMDD filenames are normalized to ISO form.
    assert extract_date_from_filename(Path("20240115.md")) == "2024-01-15"


def test_extract_date_from_filename_no_date():
    assert extract_date_from_filename(Path("my-journal.md")) is None
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# is_structured_note
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_structured_note_journal():
    # date-named files count as structured journal notes
    for name in ("2024-01-15.md", "Journal/2024-02-20.md"):
        assert is_structured_note(Path(name)) is True


def test_is_structured_note_project():
    # free-form note names are not structured
    for name in ("My Project Ideas.md", "shopping-list.md"):
        assert is_structured_note(Path(name)) is False
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# split_by_sections
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_by_sections_multiple():
    body = (
        "# Mental Health\n"
        "Feeling anxious today.\n"
        "\n"
        "## Work\n"
        "Project deadline approaching.\n"
        "\n"
        "### Home\n"
        "Need to clean the garage.\n"
    )
    sections = split_by_sections(body, {})
    assert [title for title, _ in sections] == ["Mental Health", "Work", "Home"]
    # the header line itself is excluded from the section content
    assert "Feeling anxious today." in sections[0][1]


def test_split_by_sections_no_headers():
    sections = split_by_sections("Just plain text without any headers at all.", {})
    assert len(sections) == 1
    title, content = sections[0]
    assert title is None
    assert "Just plain text" in content


def test_split_by_sections_leading_content():
    """Content before the first header forms a headerless first section."""
    body = (
        "Some intro text before any header.\n"
        "\n"
        "# First Section\n"
        "Content of first.\n"
    )
    sections = split_by_sections(body, {})
    assert sections[0][0] is None
    assert "Some intro text" in sections[0][1]
    assert sections[1][0] == "First Section"
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# sliding_window_chunks
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_sliding_window_basic():
|
||||||
|
words = " ".join([f"word{i}" for i in range(1200)])
|
||||||
|
chunks = sliding_window_chunks(words, chunk_size=500, overlap=100)
|
||||||
|
assert len(chunks) >= 2
|
||||||
|
# First chunk: words 0-499
|
||||||
|
assert chunks[0].startswith("word0")
|
||||||
|
# Chunks should have ~500 tokens each
|
||||||
|
for c in chunks:
|
||||||
|
assert len(c.split()) <= 500
|
||||||
|
|
||||||
|
|
||||||
|
def test_sliding_window_overlap():
    """Adjacent chunks should share the overlap region."""
    text = " ".join(f"word{i}" for i in range(1000))
    chunks = sliding_window_chunks(text, chunk_size=500, overlap=100)
    # Each chunk must begin with the last 100 words of its predecessor.
    for prev, curr in zip(chunks, chunks[1:]):
        assert prev.split()[-100:] == curr.split()[:100]
|
||||||
|
|
||||||
|
|
||||||
|
def test_sliding_window_empty():
    """An empty body yields no chunks at all."""
    result = sliding_window_chunks("", chunk_size=500, overlap=100)
    assert result == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_sliding_window_exact_size_produces_two_chunks():
    """With overlap=100, exactly 500 words produces 2 chunks (0-499 and 400-end)."""
    text = " ".join(f"word{i}" for i in range(500))
    chunks = sliding_window_chunks(text, chunk_size=500, overlap=100)
    assert len(chunks) == 2
    first, second = chunks
    assert first.startswith("word0")
    # Step size is chunk_size - overlap = 400, so chunk 2 starts at word 400.
    assert second.startswith("word400")
|
||||||
|
|
||||||
|
|
||||||
|
def test_sliding_window_small_text():
    """Text much shorter than chunk_size returns single chunk."""
    short_text = "just a few words"
    chunks = sliding_window_chunks(short_text, chunk_size=500, overlap=100)
    assert chunks == [short_text]
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# chunk_file integration
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_config(tmp_path: Path) -> MagicMock:
|
||||||
|
"""Build a minimal mock config pointing at a tmp vault."""
|
||||||
|
cfg = MagicMock()
|
||||||
|
cfg.vault_path = str(tmp_path)
|
||||||
|
cfg.indexing.chunk_size = 500
|
||||||
|
cfg.indexing.chunk_overlap = 100
|
||||||
|
cfg.indexing.file_patterns = ["*.md"]
|
||||||
|
cfg.indexing.deny_dirs = [".obsidian", ".trash", "zzz-Archive", ".git"]
|
||||||
|
cfg.indexing.allow_dirs = []
|
||||||
|
return cfg
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_file_structured_journal(tmp_path: Path):
    # End-to-end: a journal note with markdown headers is split per section,
    # with date/tags/section metadata attached to each chunk.
    vault = tmp_path / "Journal"
    vault.mkdir()
    fpath = vault / "2024-03-15.md"
    fpath.write_text("""# Morning

Felt #anxious about the deadline.

## Work
Finished the report.
""")

    cfg = _mock_config(tmp_path)
    chunks = chunk_file(fpath, fpath.read_text(), "2024-03-15T10:00:00Z", cfg)

    # Journal file → section-split → 2 chunks
    assert len(chunks) == 2
    # NOTE(review): section appears to be stored "#"-prefixed without spaces — confirm chunker convention
    assert chunks[0].section == "#Morning"
    # Date is presumably derived from the YYYY-MM-DD file name — verify against chunk_file
    assert chunks[0].date == "2024-03-15"
    assert "#anxious" in chunks[0].tags or "#anxious" in chunks[1].tags
    assert chunks[0].source_file.endswith("Journal/2024-03-15.md")
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_file_unstructured(tmp_path: Path):
    """A header-less note falls back to the sliding-window chunker."""
    vault = tmp_path / "Notes"
    vault.mkdir()
    fpath = vault / "project-ideas.md"
    fpath.write_text("This is a long note " * 200)  # ~1000 words

    chunks = chunk_file(
        fpath, fpath.read_text(), "2024-03-15T10:00:00Z", _mock_config(tmp_path)
    )

    # Unstructured → sliding window → multiple chunks, none with a section.
    assert len(chunks) > 1
    assert chunks[0].chunk_index == 0
    for chunk in chunks:
        assert chunk.section is None
|
||||||
130
python/tests/unit/test_config.py
Normal file
130
python/tests/unit/test_config.py
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
"""Tests for obsidian_rag.config — loader, path resolution, defaults."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from obsidian_rag.config import (
|
||||||
|
EmbeddingConfig,
|
||||||
|
ObsidianRagConfig,
|
||||||
|
load_config,
|
||||||
|
resolve_vector_db_path,
|
||||||
|
resolve_vault_path,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Config loading
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_config_parses_valid_json(tmp_path: Path):
    """A fully-specified JSON file populates the typed config."""
    payload = {
        "vault_path": "/path/to/vault",
        "embedding": {"model": "custom-model:tag", "dimensions": 512},
        "vector_store": {"path": "/vectors/db"},
    }
    cfg_file = tmp_path / "config.json"
    cfg_file.write_text(json.dumps(payload))

    loaded = load_config(cfg_file)
    assert loaded.vault_path == "/path/to/vault"
    assert loaded.embedding.model == "custom-model:tag"
    # Explicit value wins over the dataclass default.
    assert loaded.embedding.dimensions == 512


def test_load_config_missing_file_raises(tmp_path: Path):
    """A nonexistent path must raise rather than return defaults."""
    missing = tmp_path / "nonexistent.json"
    with pytest.raises(FileNotFoundError):
        load_config(missing)


def test_load_config_merges_partial_json(tmp_path: Path):
    """Keys absent from the JSON fall back to dataclass defaults."""
    cfg_file = tmp_path / "config.json"
    cfg_file.write_text(json.dumps({"vault_path": "/custom/vault"}))

    loaded = load_config(cfg_file)
    assert loaded.vault_path == "/custom/vault"
    assert loaded.embedding.base_url == "http://localhost:11434"  # default
    assert loaded.indexing.chunk_size == 500  # default
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# resolve_vault_path
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_vault_path_absolute():
    """An absolute vault_path is returned verbatim as a Path."""
    config = ObsidianRagConfig(vault_path="/absolute/vault")
    assert resolve_vault_path(config) == Path("/absolute/vault")


def test_resolve_vault_path_relative_defaults_to_project_root():
    """A relative vault_path is anchored at the project root."""
    config = ObsidianRagConfig(vault_path="KnowledgeVault/Default")
    resolved = resolve_vault_path(config)
    # python/obsidian_rag/ → project root → KnowledgeVault/Default
    assert resolved.parent.name == "KnowledgeVault"
    assert resolved.name == "Default"
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# resolve_vector_db_path
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_vector_db_path_string_absolute():
    """VectorStoreConfig stores path as a string; Path objects should be converted first."""
    from obsidian_rag.config import VectorStoreConfig

    # Using a string path (the actual usage)
    cfg = ObsidianRagConfig(vector_store=VectorStoreConfig(path="/my/vectors.lance"))
    result = resolve_vector_db_path(cfg)
    assert result == Path("/my/vectors.lance")


def test_resolve_vector_db_path_string_relative(tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
    """Relative paths are resolved against the data directory.

    FIX: the original mutated ``cfg_mod.DEFAULT_CONFIG_DIR`` manually with a
    try/finally save-and-restore; pytest's ``monkeypatch`` fixture is the
    idiomatic equivalent and undoes the patch automatically at teardown.
    """
    import obsidian_rag.config as cfg_mod

    # Set up data dir + vault marker (required by _resolve_data_dir)
    # Note: the dev data dir is "obsidian-rag" (without leading dot)
    data_dir = tmp_path / "obsidian-rag"
    data_dir.mkdir()
    (tmp_path / "KnowledgeVault").mkdir()
    (data_dir / "vectors.lance").touch()

    cfg = ObsidianRagConfig(vector_store=cfg_mod.VectorStoreConfig(path="vectors.lance"))
    monkeypatch.setattr(cfg_mod, "DEFAULT_CONFIG_DIR", tmp_path)
    result = resolve_vector_db_path(cfg)

    # Resolves to data_dir / vectors.lance
    assert result.parent.name == "obsidian-rag"  # dev dir is "obsidian-rag" (no leading dot)
    assert result.name == "vectors.lance"
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Dataclass defaults
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_embedding_config_defaults():
    """EmbeddingConfig carries the Ollama mxbai defaults out of the box."""
    defaults = EmbeddingConfig()
    assert defaults.model == "mxbai-embed-large"
    assert defaults.dimensions == 1024
    assert defaults.batch_size == 64


def test_security_config_defaults():
    """SecurityConfig ships with sensitive-section and confirmation defaults."""
    from obsidian_rag.config import SecurityConfig

    defaults = SecurityConfig()
    assert "#mentalhealth" in defaults.sensitive_sections
    assert "health" in defaults.require_confirmation_for
|
||||||
254
python/tests/unit/test_security.py
Normal file
254
python/tests/unit/test_security.py
Normal file
@@ -0,0 +1,254 @@
|
|||||||
|
"""Tests for obsidian_rag.security — path traversal, sanitization, sensitive detection."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
import tempfile
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from obsidian_rag.security import (
|
||||||
|
detect_sensitive,
|
||||||
|
filter_tags,
|
||||||
|
is_symlink_outside_vault,
|
||||||
|
sanitize_text,
|
||||||
|
should_index_dir,
|
||||||
|
validate_path,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# validate_path
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_validate_path_normal_file(tmp_path: Path):
    """A relative path inside the vault resolves to the real file."""
    vault_dir = tmp_path / "vault"
    vault_dir.mkdir()
    note = vault_dir / "subdir" / "note.md"
    note.parent.mkdir()
    note.touch()

    resolved = validate_path(Path("subdir/note.md"), vault_dir)
    assert resolved == note.resolve()


def test_validate_path_traversal_attempt(tmp_path: Path):
    """A leading ``..`` escaping the vault is rejected."""
    vault_dir = tmp_path / "vault"
    vault_dir.mkdir()
    with pytest.raises(ValueError, match="traversal"):
        validate_path(Path("../etc/passwd"), vault_dir)


def test_validate_path_deep_traversal(tmp_path: Path):
    """``..`` segments buried mid-path are rejected too."""
    vault_dir = tmp_path / "vault"
    vault_dir.mkdir()
    with pytest.raises(ValueError, match="traversal"):
        validate_path(Path("subdir/../../../etc/passwd"), vault_dir)


def test_validate_path_absolute_path(tmp_path: Path):
    """An absolute path outside the vault raises."""
    vault_dir = tmp_path / "vault"
    vault_dir.mkdir()
    with pytest.raises(ValueError):
        validate_path(Path("/etc/passwd"), vault_dir)


def test_validate_path_path_with_dotdot_in_resolve(tmp_path: Path):
    """Path that resolves inside vault but has .. in parts should be caught."""
    vault_dir = tmp_path / "vault"
    vault_dir.mkdir()
    (vault_dir / "subdir").mkdir()

    # validate_path checks parts for ".." — even when the result stays inside.
    with pytest.raises(ValueError, match="traversal"):
        validate_path(Path("subdir/../subdir/../note.md"), vault_dir)
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# is_symlink_outside_vault
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_symlink_outside_vault_internal(tmp_path: Path):
    """A symlink whose target stays inside the vault is allowed."""
    vault_dir = tmp_path / "vault"
    vault_dir.mkdir()
    target = vault_dir / "note.md"
    target.touch()

    internal_link = vault_dir / "link.md"
    internal_link.symlink_to(target)

    assert is_symlink_outside_vault(internal_link, vault_dir) is False


def test_is_symlink_outside_vault_external(tmp_path: Path):
    """A symlink escaping the vault must be flagged."""
    vault_dir = tmp_path / "vault"
    vault_dir.mkdir()
    external_target = tmp_path / "outside.md"
    external_target.touch()

    escaping_link = vault_dir / "link.md"
    escaping_link.symlink_to(external_target)

    assert is_symlink_outside_vault(escaping_link, vault_dir) is True
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# sanitize_text
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_sanitize_text_strips_html():
    """HTML tags are removed; surrounding text and hashtags survive."""
    raw = "<script>alert('xss')</script>Hello #world"
    cleaned = sanitize_text(raw)
    assert "<script>" not in cleaned
    assert "Hello #world" in cleaned
    # Text content inside HTML tags is preserved (sanitize_text strips the tags only)


def test_sanitize_text_removes_code_blocks():
    """Fenced code blocks (and any secrets inside) are dropped entirely."""
    raw = """Some text

```
secret_api_key = "sk-12345"
```

More text
"""
    cleaned = sanitize_text(raw)
    assert "secret_api_key" not in cleaned
    assert "Some text" in cleaned
    assert "More text" in cleaned


def test_sanitize_text_normalizes_whitespace():
    """Newlines, tabs, and runs of spaces collapse to single spaces."""
    cleaned = sanitize_text("Hello\n\n\n world\t\t spaces")
    assert "\n" not in cleaned
    assert "\t" not in cleaned
    assert "  " not in cleaned


def test_sanitize_text_caps_length():
    """Output is truncated to at most 2000 characters."""
    cleaned = sanitize_text("word " * 1000)
    assert len(cleaned) <= 2000


def test_sanitize_text_preserves_hashtags():
    """Hashtags are meaningful metadata and must survive sanitization."""
    cleaned = sanitize_text("#mentalhealth #python #machine-learning")
    assert "#mentalhealth" in cleaned
    assert "#python" in cleaned
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# detect_sensitive
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_sensitive_mental_health_section():
    """A #mentalhealth section marker flips the health flag."""
    sections = ["#mentalhealth", "#physicalhealth", "#Relations"]
    flags = detect_sensitive(" #mentalhealth section content", sections, {"financial": [], "health": []})
    assert flags["health"] is True


def test_detect_sensitive_financial_pattern():
    """Financial keyword patterns trip only the financial flag."""
    flags = detect_sensitive(
        "I owe Sreenivas $50 and need to pay it back",
        ["#mentalhealth"],
        {"financial": ["owe", "$"], "health": []},
    )
    assert flags["financial"] is True
    assert flags["health"] is False


def test_detect_sensitive_relations():
    """#Relations sections do not set the relations flag on their own."""
    flags = detect_sensitive(
        "Had coffee with Sarah #Relations",
        ["#Relations"],
        {"financial": [], "health": []},
    )
    # Only specific health sections set health=true
    assert flags["relations"] is False


def test_detect_sensitive_clean_text():
    """Benign text produces an all-False flag set."""
    flags = detect_sensitive(
        "This is a normal note about cooking dinner.",
        [],
        {"financial": [], "health": []},
    )
    assert flags == {"health": False, "financial": False, "relations": False}
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# should_index_dir
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_config() -> MagicMock:
    """Mock config with an empty allow-list and the standard deny-list."""
    mock = MagicMock()
    mock.indexing.deny_dirs = [".obsidian", ".trash", "zzz-Archive", ".git"]
    mock.indexing.allow_dirs = []
    return mock


def test_should_index_dir_allows_normal():
    """Ordinary vault directories are indexable by default."""
    config = _mock_config()
    for name in ("Journal", "Finance", "Projects"):
        assert should_index_dir(name, config) is True


def test_should_index_dir_denies_hidden():
    """Dot-directories on the deny list are skipped."""
    config = _mock_config()
    for name in (".obsidian", ".git", ".trash"):
        assert should_index_dir(name, config) is False


def test_should_index_dir_denies_configured():
    """Explicitly configured deny entries are honored."""
    assert should_index_dir("zzz-Archive", _mock_config()) is False


def test_should_index_dir_allow_list_override():
    """A non-empty allow list restricts indexing to exactly those dirs."""
    config = _mock_config()
    config.indexing.allow_dirs = ["Journal", "Finance"]
    assert should_index_dir("Journal", config) is True
    assert should_index_dir("Finance", config) is True
    assert should_index_dir("Projects", config) is False
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# filter_tags
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_filter_tags_basic():
    """Hashtags are extracted and lowercased."""
    found = filter_tags("Hello #world and #python tags #AI")
    assert "#world" in found
    assert "#python" in found
    assert "#ai" in found  # case-folded


def test_filter_tags_deduplicates():
    """Repeated hashtags appear once in the result."""
    found = filter_tags("#hello #world #hello")
    assert len(found) == 2


def test_filter_tags_no_tags():
    """Tag-free text yields an empty list."""
    assert filter_tags("just plain text without any hashtags") == []
|
||||||
189
python/tests/unit/test_vector_store.py
Normal file
189
python/tests/unit/test_vector_store.py
Normal file
@@ -0,0 +1,189 @@
|
|||||||
|
"""Tests for obsidian_rag.vector_store — LanceDB CRUD operations."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import lancedb
|
||||||
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from obsidian_rag.vector_store import (
|
||||||
|
SearchResult,
|
||||||
|
create_table_if_not_exists,
|
||||||
|
delete_by_source_file,
|
||||||
|
get_stats,
|
||||||
|
search_chunks,
|
||||||
|
upsert_chunks,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _connect(db_path: Path) -> lancedb.LanceDBConnection:
|
||||||
|
"""Create a LanceDB connection for testing."""
|
||||||
|
db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
return lancedb.connect(str(db_path))
|
||||||
|
|
||||||
|
|
||||||
|
def _make_table(tmp_path: Path):
|
||||||
|
"""Create a fresh obsidian_chunks table for testing."""
|
||||||
|
db = _connect(tmp_path / "test.lance")
|
||||||
|
tbl = create_table_if_not_exists(db)
|
||||||
|
return tbl
|
||||||
|
|
||||||
|
|
||||||
|
def _chunk(source_file: str = "test.md", chunk_id: str = "c1", **overrides):
|
||||||
|
"""Build a minimal valid chunk dict."""
|
||||||
|
base = {
|
||||||
|
"vector": [0.1] * 1024,
|
||||||
|
"chunk_id": chunk_id,
|
||||||
|
"chunk_text": "Hello world",
|
||||||
|
"source_file": source_file,
|
||||||
|
"source_directory": "Notes",
|
||||||
|
"section": None,
|
||||||
|
"date": "2024-01-15",
|
||||||
|
"tags": ["#test"],
|
||||||
|
"chunk_index": 0,
|
||||||
|
"total_chunks": 1,
|
||||||
|
"modified_at": "2024-01-15T10:00:00Z",
|
||||||
|
"indexed_at": "2024-01-15T12:00:00Z",
|
||||||
|
}
|
||||||
|
base.update(overrides)
|
||||||
|
return base
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Table creation
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_table_if_not_exists_creates_new(tmp_path: Path):
    """Creating the table registers it in the DB and starts empty."""
    connection = _connect(tmp_path / "new.lance")
    table = create_table_if_not_exists(connection)
    # NOTE(review): relies on list_tables().tables — newer lancedb clients
    # expose table_names() instead; confirm against the pinned version.
    assert "obsidian_chunks" in connection.list_tables().tables
    assert table.count_rows() == 0


def test_create_table_if_not_exists_idempotent(tmp_path: Path):
    """A second create call reuses the existing table."""
    connection = _connect(tmp_path / "exists.lance")
    first = create_table_if_not_exists(connection)
    second = create_table_if_not_exists(connection)
    assert first.name == second.name  # same underlying table
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# upsert_chunks
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_upsert_chunks_inserts_new(tmp_path: Path):
    """A single new chunk is inserted and counted."""
    table = _make_table(tmp_path)
    inserted = upsert_chunks(table, [_chunk()])
    assert inserted == 1
    assert table.count_rows() == 1


def test_upsert_chunks_empty_list_returns_zero(tmp_path: Path):
    """Upserting nothing is a no-op returning 0."""
    table = _make_table(tmp_path)
    assert upsert_chunks(table, []) == 0


def test_upsert_chunks_updates_existing(tmp_path: Path):
    """Re-upserting the same chunk_id replaces the row, not duplicates it."""
    table = _make_table(tmp_path)
    upsert_chunks(table, [_chunk(chunk_id="dup-id", chunk_text="Original")])
    upsert_chunks(table, [_chunk(chunk_id="dup-id", chunk_text="Updated")])
    assert table.count_rows() == 1
    rows = table.to_pandas()
    assert rows[rows["chunk_id"] == "dup-id"]["chunk_text"].iloc[0] == "Updated"


def test_upsert_chunks_multiple(tmp_path: Path):
    """A batch of distinct chunk_ids all land in the table."""
    table = _make_table(tmp_path)
    batch = [_chunk(chunk_id=f"id-{i}", chunk_text=f"Chunk {i}") for i in range(10)]
    upsert_chunks(table, batch)
    assert table.count_rows() == 10
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# delete_by_source_file
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_by_source_file_removes_chunks(tmp_path: Path):
    """Deleting a source file removes all of its chunks and only those."""
    table = _make_table(tmp_path)
    for cid, src in (("t1", "test.md"), ("t2", "test.md"), ("o1", "other.md")):
        upsert_chunks(table, [_chunk(source_file=src, chunk_id=cid)])
    assert table.count_rows() == 3

    removed = delete_by_source_file(table, "test.md")
    assert removed == 2
    assert table.count_rows() == 1


def test_delete_by_source_file_nonexistent_returns_zero(tmp_path: Path):
    """Deleting an unknown file is a harmless no-op."""
    table = _make_table(tmp_path)
    assert delete_by_source_file(table, "does-not-exist.md") == 0
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# search_chunks
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_search_chunks_with_directory_filter(tmp_path: Path):
    """directory_filter restricts results to the named source directories."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(source_file="n.md", source_directory="Notes", chunk_id="n1")])
    upsert_chunks(tbl, [_chunk(source_file="c.md", source_directory="Code", chunk_id="c1")])

    results = search_chunks(
        tbl, [0.0] * 1024, limit=10, directory_filter=["Notes"]
    )
    # FIX: all(...) on an empty list is vacuously true — require a hit first.
    assert results, "expected the Notes chunk to match"
    assert all(r.source_directory == "Notes" for r in results)


def test_search_chunks_with_date_range(tmp_path: Path):
    """date_range keeps only chunks dated inside the inclusive window."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="d1", date="2024-01-01")])
    upsert_chunks(tbl, [_chunk(chunk_id="d2", date="2024-03-15")])
    upsert_chunks(tbl, [_chunk(chunk_id="d3", date="2024-06-20")])

    results = search_chunks(
        tbl, [0.0] * 1024, limit=10, date_range={"from": "2024-02-01", "to": "2024-05-31"}
    )
    # FIX: guard against a vacuous pass on zero results.
    assert results, "expected the in-range chunk to match"
    for r in results:
        assert "2024-02-01" <= r.date <= "2024-05-31"


def test_search_chunks_with_tags_filter(tmp_path: Path):
    """A tags filter must exclude chunks carrying none of the requested tags."""
    tbl = _make_table(tmp_path)
    upsert_chunks(tbl, [_chunk(chunk_id="t1", tags=["#python", "#testing"])])
    upsert_chunks(tbl, [_chunk(chunk_id="t2", tags=["#javascript"])])

    results = search_chunks(tbl, [0.0] * 1024, limit=10, tags=["#python"])
    # FIX: the original asserted len(results) >= 0, which can never fail.
    # NOTE(review): assumes SearchResult exposes chunk_id like the schema — verify.
    ids = {r.chunk_id for r in results}
    assert "t2" not in ids  # javascript-only chunk must be filtered out
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# get_stats
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_stats_empty_table(tmp_path: Path):
    """A fresh table reports zero docs and zero chunks."""
    stats = get_stats(_make_table(tmp_path))
    assert stats["total_docs"] == 0
    assert stats["total_chunks"] == 0


def test_get_stats_with_data(tmp_path: Path):
    """total_docs counts distinct source files; total_chunks counts rows."""
    table = _make_table(tmp_path)
    for src, cid in (("a.md", "a1"), ("a.md", "a2"), ("b.md", "b1")):
        upsert_chunks(table, [_chunk(source_file=src, chunk_id=cid)])

    stats = get_stats(table)
    assert stats["total_docs"] == 2  # 2 unique files
    assert stats["total_chunks"] == 3
||||||
34
src/index.ts
Normal file
34
src/index.ts
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
/**
|
||||||
|
* OpenClaw plugin entry point.
|
||||||
|
* Registers 4 obsidian_rag_* tools via the OpenClaw SDK.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
|
||||||
|
import { registerTools } from "./tools/index.js";
|
||||||
|
import { loadConfig } from "./utils/config.js";
|
||||||
|
import { createHealthMachine, probeAll } from "./services/health.js";
|
||||||
|
import { VaultWatcher } from "./services/vault-watcher.js";
|
||||||
|
|
||||||
|
export default definePluginEntry({
|
||||||
|
id: "obsidian-rag",
|
||||||
|
name: "Obsidian RAG",
|
||||||
|
description:
|
||||||
|
"Semantic search through Obsidian vault notes using RAG. Powers natural language queries like 'How was my mental health in 2024?' across journal entries, financial records, health data, and more.",
|
||||||
|
register(api) {
|
||||||
|
const config = loadConfig();
|
||||||
|
const health = createHealthMachine(config);
|
||||||
|
|
||||||
|
// Start vault watcher for auto-sync
|
||||||
|
const watcher = new VaultWatcher(config, health);
|
||||||
|
watcher.start();
|
||||||
|
|
||||||
|
// Register all 4 tools
|
||||||
|
registerTools(api, config, health);
|
||||||
|
|
||||||
|
console.log("[obsidian-rag] Plugin loaded — tools registered");
|
||||||
|
|
||||||
|
// Probe dependencies and start health reprobing in background
|
||||||
|
probeAll(config).then((probe) => health.transition(probe));
|
||||||
|
health.startReprobing(() => probeAll(config));
|
||||||
|
},
|
||||||
|
});
|
||||||
130
src/services/health.ts
Normal file
130
src/services/health.ts
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
/** Health state machine: HEALTHY / DEGRADED / UNAVAILABLE. */
|
||||||
|
|
||||||
|
import { existsSync, readFileSync } from "fs";
|
||||||
|
import { resolve } from "path";
|
||||||
|
import type { ObsidianRagConfig } from "../utils/config.js";
|
||||||
|
|
||||||
|
/** Coarse health classification derived from the latest probe. */
export type HealthState = "healthy" | "degraded" | "unavailable";

/** Full status snapshot returned by the health machine's get(). */
export interface HealthStatus {
  state: HealthState;
  ollama_up: boolean;
  index_exists: boolean;
  vault_exists: boolean;
  total_docs: number;
  total_chunks: number;
  last_sync: string | null;
  // Currently running index job, if any (set via setActiveJob).
  active_job: { id: string; mode: string; progress: number } | null;
}

/** Raw result of probing the external dependencies (Ollama, index, vault). */
export interface ProbeResult {
  ollama_up: boolean;
  index_exists: boolean;
  vault_exists: boolean;
  total_docs: number;
  total_chunks: number;
  last_sync: string | null;
}

// How often startReprobing() re-checks dependencies (30 s).
const REPROBE_INTERVAL_MS = 30_000;
|
||||||
|
|
||||||
|
export function createHealthMachine(_config: ObsidianRagConfig) {
|
||||||
|
let currentState: HealthState = "unavailable";
|
||||||
|
let status: ProbeResult = {
|
||||||
|
ollama_up: false,
|
||||||
|
index_exists: false,
|
||||||
|
vault_exists: false,
|
||||||
|
total_docs: 0,
|
||||||
|
total_chunks: 0,
|
||||||
|
last_sync: null,
|
||||||
|
};
|
||||||
|
let activeJob: { id: string; mode: string; progress: number } | null = null;
|
||||||
|
let reprobeTimer: ReturnType<typeof setInterval> | null = null;
|
||||||
|
|
||||||
|
function transition(probe: ProbeResult): void {
|
||||||
|
status = probe;
|
||||||
|
const prev = currentState;
|
||||||
|
if (!probe.index_exists || !probe.vault_exists) {
|
||||||
|
currentState = "unavailable";
|
||||||
|
} else if (!probe.ollama_up) {
|
||||||
|
currentState = "degraded";
|
||||||
|
} else {
|
||||||
|
currentState = "healthy";
|
||||||
|
}
|
||||||
|
if (prev !== currentState) {
|
||||||
|
console.log(`[obsidian-rag] Health: ${prev} → ${currentState}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function get(): HealthStatus {
|
||||||
|
return { state: currentState, ...status, active_job: activeJob };
|
||||||
|
}
|
||||||
|
|
||||||
|
function setActiveJob(job: { id: string; mode: string; progress: number } | null): void {
|
||||||
|
activeJob = job;
|
||||||
|
}
|
||||||
|
|
||||||
|
function startReprobing(fn: () => Promise<ProbeResult>): void {
|
||||||
|
if (reprobeTimer) clearInterval(reprobeTimer);
|
||||||
|
reprobeTimer = setInterval(async () => {
|
||||||
|
const probe = await fn();
|
||||||
|
transition(probe);
|
||||||
|
}, REPROBE_INTERVAL_MS);
|
||||||
|
}
|
||||||
|
|
||||||
|
function stop(): void {
|
||||||
|
if (reprobeTimer) {
|
||||||
|
clearInterval(reprobeTimer);
|
||||||
|
reprobeTimer = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return { get, transition, setActiveJob, startReprobing, stop };
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Probe all external dependencies (vault dir, vector index, Ollama) and
 * read indexing stats from the last sync summary. Never throws: stat
 * parsing is best-effort and Ollama failures surface as ollama_up=false.
 */
export async function probeAll(config: ObsidianRagConfig): Promise<ProbeResult> {
  // Dynamic import keeps lancedb utilities out of the module-load path.
  const { resolveVectorDbPath } = await import("../utils/lancedb.js");

  const vaultPath = resolve(process.cwd(), config.vault_path);
  const dbPath = resolveVectorDbPath(config);

  const vaultExists = existsSync(vaultPath);
  const indexExists = existsSync(String(dbPath));
  const ollamaUp = await probeOllama(config.embedding.base_url);

  let totalDocs = 0;
  let totalChunks = 0;
  let lastSync: string | null = null;

  if (indexExists) {
    try {
      // sync-result.json sits next to the vector DB directory.
      // NOTE(review): assumes fields {timestamp, indexed_files, total_chunks}
      // — verify against the indexer's output format.
      const syncPath = resolve(dbPath, "..", "sync-result.json");
      if (existsSync(syncPath)) {
        const data = JSON.parse(readFileSync(syncPath, "utf-8"));
        lastSync = data.timestamp ?? null;
        totalDocs = data.indexed_files ?? 0;
        totalChunks = data.total_chunks ?? 0;
      }
    } catch {
      // ignore — stats stay at zero if the sync file is unreadable/corrupt
    }
  }

  return {
    ollama_up: ollamaUp,
    index_exists: indexExists,
    vault_exists: vaultExists,
    total_docs: totalDocs,
    total_chunks: totalChunks,
    last_sync: lastSync,
  };
}
|
||||||
|
|
||||||
|
async function probeOllama(baseUrl: string): Promise<boolean> {
|
||||||
|
try {
|
||||||
|
const res = await fetch(`${baseUrl}/api/tags`, { signal: AbortSignal.timeout(3000) });
|
||||||
|
return res.ok;
|
||||||
|
} catch {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
120
src/services/indexer-bridge.ts
Normal file
120
src/services/indexer-bridge.ts
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
/** Bridge to the Python indexer CLI — spawns subprocess, tracks job progress. */
|
||||||
|
|
||||||
|
import { spawn } from "child_process";
|
||||||
|
import { readFileSync, existsSync } from "fs";
|
||||||
|
import { resolve } from "path";
|
||||||
|
import type { ObsidianRagConfig } from "../utils/config.js";
|
||||||
|
|
||||||
|
/** Snapshot of an indexer subprocess job, updated as JSON events stream in. */
export interface JobStatus {
  /** Unique id of the form `job-<timestamp>-<random>`. */
  id: string;
  /** CLI mode the job was started with ("index" | "sync" | "reindex"). */
  mode: string;
  /** Completion percentage, 0-100. */
  progress: number;
  /** Lifecycle state; "running" until a terminal event or process exit. */
  status: "running" | "complete" | "failed";
  // The fields below are populated from the CLI's terminal "complete" event.
  indexed_files?: number;
  total_chunks?: number;
  duration_ms?: number;
  /** Per-file failures reported by the indexer, if any. */
  errors?: Array<{ file: string; error: string }>;
}
|
||||||
|
|
||||||
|
const runningJobs = new Map<string, JobStatus>();
|
||||||
|
|
||||||
|
export function spawnIndexer(
|
||||||
|
mode: "index" | "sync" | "reindex",
|
||||||
|
config: ObsidianRagConfig,
|
||||||
|
): Promise<JobStatus> {
|
||||||
|
const jobId = `job-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
|
||||||
|
const status: JobStatus = {
|
||||||
|
id: jobId,
|
||||||
|
mode,
|
||||||
|
progress: 0,
|
||||||
|
status: "running",
|
||||||
|
};
|
||||||
|
runningJobs.set(jobId, status);
|
||||||
|
|
||||||
|
return new Promise((resolveJob) => {
|
||||||
|
const pythonCmd = "python";
|
||||||
|
const args = ["-m", "obsidian_rag.cli", mode];
|
||||||
|
|
||||||
|
const child = spawn(pythonCmd, args, {
|
||||||
|
cwd: resolve(process.cwd(), "python"),
|
||||||
|
stdio: ["ignore", "pipe", "pipe"],
|
||||||
|
});
|
||||||
|
|
||||||
|
let stdout = "";
|
||||||
|
let stderr = "";
|
||||||
|
|
||||||
|
child.stdout?.on("data", (chunk: Buffer) => {
|
||||||
|
stdout += chunk.toString();
|
||||||
|
try {
|
||||||
|
const lines = stdout.split("\n").filter(Boolean);
|
||||||
|
for (const line of lines) {
|
||||||
|
try {
|
||||||
|
const obj = JSON.parse(line);
|
||||||
|
if (obj.type === "progress") {
|
||||||
|
const total = obj.total ?? 1;
|
||||||
|
const current = obj.current ?? 0;
|
||||||
|
status.progress = Math.round((current / total) * 100);
|
||||||
|
} else if (obj.type === "complete") {
|
||||||
|
status.status = obj.errors?.length ? "failed" : "complete";
|
||||||
|
status.indexed_files = obj.indexed_files;
|
||||||
|
status.total_chunks = obj.total_chunks;
|
||||||
|
status.duration_ms = obj.duration_ms;
|
||||||
|
status.errors = obj.errors ?? [];
|
||||||
|
status.progress = 100;
|
||||||
|
runningJobs.delete(jobId);
|
||||||
|
resolveJob(status);
|
||||||
|
} else if (obj.type === "error") {
|
||||||
|
status.status = "failed";
|
||||||
|
status.errors = [{ file: "cli", error: obj.error }];
|
||||||
|
runningJobs.delete(jobId);
|
||||||
|
resolveJob(status);
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Not JSON — ignore partial lines
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// ignore parse errors
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
child.stderr?.on("data", (chunk: Buffer) => {
|
||||||
|
stderr += chunk.toString();
|
||||||
|
});
|
||||||
|
|
||||||
|
child.on("close", (code) => {
|
||||||
|
if (status.status === "running") {
|
||||||
|
status.status = code === 0 ? "complete" : "failed";
|
||||||
|
runningJobs.delete(jobId);
|
||||||
|
resolveJob(status);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
child.on("error", (err) => {
|
||||||
|
status.status = "failed";
|
||||||
|
status.errors = [{ file: "subprocess", error: err.message }];
|
||||||
|
runningJobs.delete(jobId);
|
||||||
|
resolveJob(status);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
export function getJobStatus(jobId: string): JobStatus | null {
|
||||||
|
return runningJobs.get(jobId) ?? null;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function readSyncResult(config: ObsidianRagConfig): {
|
||||||
|
timestamp: string | null;
|
||||||
|
indexed_files: number;
|
||||||
|
total_chunks: number;
|
||||||
|
errors: Array<{ file: string; error: string }>;
|
||||||
|
} | null {
|
||||||
|
const dataDir = resolve(process.cwd(), ".obsidian-rag");
|
||||||
|
const path = resolve(dataDir, "sync-result.json");
|
||||||
|
if (!existsSync(path)) return null;
|
||||||
|
try {
|
||||||
|
return JSON.parse(readFileSync(path, "utf-8"));
|
||||||
|
} catch {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
81
src/services/vault-watcher.ts
Normal file
81
src/services/vault-watcher.ts
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
/** Vault watcher — chokidar-based file system monitor with debounce + batching. */
|
||||||
|
|
||||||
|
import { watch, FSWatcher } from "chokidar";
|
||||||
|
import type { ObsidianRagConfig } from "../utils/config.js";
|
||||||
|
import type { HealthState } from "./health.js";
|
||||||
|
|
||||||
|
const DEBOUNCE_MS = 2_000;
|
||||||
|
const COLLECT_WINDOW_MS = 5_000;
|
||||||
|
|
||||||
|
export class VaultWatcher {
|
||||||
|
private watcher: FSWatcher | null = null;
|
||||||
|
private debounceTimer: ReturnType<typeof setTimeout> | null = null;
|
||||||
|
private pending = new Set<string>();
|
||||||
|
private collectTimer: ReturnType<typeof setTimeout> | null = null;
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
private config: ObsidianRagConfig,
|
||||||
|
private health: { get: () => { state: HealthState } },
|
||||||
|
) {}
|
||||||
|
|
||||||
|
start(): void {
|
||||||
|
const vaultPath = this.config.vault_path;
|
||||||
|
this.watcher = watch(vaultPath, {
|
||||||
|
persistent: true,
|
||||||
|
ignoreInitial: true,
|
||||||
|
depth: 99,
|
||||||
|
});
|
||||||
|
|
||||||
|
this.watcher.on("add", (p) => this.onEvent(p));
|
||||||
|
this.watcher.on("change", (p) => this.onEvent(p));
|
||||||
|
this.watcher.on("unlink", (p) => this.onEvent(p));
|
||||||
|
}
|
||||||
|
|
||||||
|
private onEvent(filepath: string): void {
|
||||||
|
if (!filepath.endsWith(".md")) return;
|
||||||
|
// Apply deny list check
|
||||||
|
const parts = filepath.replace("\\", "/").split("/");
|
||||||
|
const dir = parts[parts.length - 2] ?? "";
|
||||||
|
if (this.config.indexing.deny_dirs.includes(dir)) return;
|
||||||
|
|
||||||
|
this.pending.add(filepath);
|
||||||
|
this.scheduleFlush();
|
||||||
|
}
|
||||||
|
|
||||||
|
private scheduleFlush(): void {
|
||||||
|
if (this.debounceTimer) clearTimeout(this.debounceTimer);
|
||||||
|
this.debounceTimer = setTimeout(() => {
|
||||||
|
this.flush();
|
||||||
|
}, DEBOUNCE_MS);
|
||||||
|
}
|
||||||
|
|
||||||
|
private flush(): void {
|
||||||
|
if (this.pending.size === 0) return;
|
||||||
|
const files = [...this.pending];
|
||||||
|
this.pending.clear();
|
||||||
|
|
||||||
|
if (this.collectTimer) clearTimeout(this.collectTimer);
|
||||||
|
this.collectTimer = setTimeout(() => {
|
||||||
|
this.triggerSync(files);
|
||||||
|
}, COLLECT_WINDOW_MS);
|
||||||
|
}
|
||||||
|
|
||||||
|
private async triggerSync(_files: string[]): Promise<void> {
|
||||||
|
// Import dynamically to avoid circular issues
|
||||||
|
const { spawnIndexer } = await import("./indexer-bridge.js");
|
||||||
|
const health = this.health.get();
|
||||||
|
if (health.state === "unavailable") {
|
||||||
|
console.log("[obsidian-rag] Skipping sync — index unavailable");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
console.log(`[obsidian-rag] Triggering sync for ${_files.length} files`);
|
||||||
|
await spawnIndexer("sync", this.config);
|
||||||
|
}
|
||||||
|
|
||||||
|
stop(): void {
|
||||||
|
this.watcher?.close();
|
||||||
|
this.watcher = null;
|
||||||
|
if (this.debounceTimer) clearTimeout(this.debounceTimer);
|
||||||
|
if (this.collectTimer) clearTimeout(this.collectTimer);
|
||||||
|
}
|
||||||
|
}
|
||||||
44
src/tools/index-tool.ts
Normal file
44
src/tools/index-tool.ts
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
/** obsidian_rag_index tool — spawns the Python indexer CLI. */
|
||||||
|
|
||||||
|
import type { ObsidianRagConfig } from "../utils/config.js";
|
||||||
|
import type { HealthState } from "../services/health.js";
|
||||||
|
import type { ResponseEnvelope } from "../utils/types.js";
|
||||||
|
import { makeEnvelope } from "../utils/response.js";
|
||||||
|
import { spawnIndexer } from "../services/indexer-bridge.js";
|
||||||
|
|
||||||
|
/** Parameters accepted by the obsidian_rag_index tool. */
export interface IndexParams {
  /** "full" = first-time build, "sync" = incremental, "reindex" = clean rebuild. */
  mode: "full" | "sync" | "reindex";
}
|
||||||
|
|
||||||
|
export async function runIndexTool(
|
||||||
|
config: ObsidianRagConfig,
|
||||||
|
health: { get: () => { state: HealthState }; setActiveJob: (job: { id: string; mode: string; progress: number } | null) => void },
|
||||||
|
params: IndexParams,
|
||||||
|
): Promise<ResponseEnvelope<{ job_id: string; status: string; mode: string; message: string } | null>> {
|
||||||
|
const modeMap = { full: "index", sync: "sync", reindex: "reindex" } as const;
|
||||||
|
const cliMode = modeMap[params.mode];
|
||||||
|
|
||||||
|
try {
|
||||||
|
const job = await spawnIndexer(cliMode, config);
|
||||||
|
|
||||||
|
health.setActiveJob({ id: job.id, mode: job.mode, progress: job.progress });
|
||||||
|
|
||||||
|
return makeEnvelope(
|
||||||
|
"healthy",
|
||||||
|
{
|
||||||
|
job_id: job.id,
|
||||||
|
status: "started",
|
||||||
|
mode: params.mode,
|
||||||
|
message: `Indexing job ${job.id} started in ${params.mode} mode`,
|
||||||
|
},
|
||||||
|
null,
|
||||||
|
);
|
||||||
|
} catch (err) {
|
||||||
|
return makeEnvelope("unavailable", null, {
|
||||||
|
code: "INDEXER_SPAWN_FAILED",
|
||||||
|
message: String(err),
|
||||||
|
recoverable: true,
|
||||||
|
suggestion: "Ensure the Python indexer is installed: pip install -e python/",
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
118
src/tools/index.ts
Normal file
118
src/tools/index.ts
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
/** Tool registration — wires all 4 obsidian_rag_* tools into OpenClaw. */
|
||||||
|
|
||||||
|
import type { AgentToolResult } from "@mariozechner/pi-agent-core";
|
||||||
|
import type { OpenClawPluginApi } from "openclaw/plugin-sdk/plugin-entry";
|
||||||
|
import type { ObsidianRagConfig } from "../utils/config.js";
|
||||||
|
import type { HealthState } from "../services/health.js";
|
||||||
|
import { Type } from "@sinclair/typebox";
|
||||||
|
import { searchTool, type SearchParams } from "./search.js";
|
||||||
|
import { runIndexTool, type IndexParams } from "./index-tool.js";
|
||||||
|
import { statusTool } from "./status.js";
|
||||||
|
import { memoryStoreTool, type MemoryStoreParams } from "./memory.js";
|
||||||
|
|
||||||
|
function textEnvelope<T>(text: string, details: T): AgentToolResult<T> {
|
||||||
|
return { content: [{ type: "text", text }], details };
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Register the four obsidian_rag_* tools with the OpenClaw plugin API.
 *
 * @param api - Plugin registration surface.
 * @param config - Loaded plugin configuration (vault path, embedding, filters).
 * @param health - Health service facade: read coarse state, track active jobs.
 */
export function registerTools(
  api: OpenClawPluginApi,
  config: ObsidianRagConfig,
  health: { get: () => { state: HealthState }; setActiveJob: (job: { id: string; mode: string; progress: number } | null) => void },
): void {
  // obsidian_rag_search — primary semantic search
  api.registerTool({
    name: "obsidian_rag_search",
    description:
      "Primary semantic search tool. Given a natural language query, searches the Obsidian vault index and returns the most relevant note chunks ranked by semantic similarity. Supports filtering by directory, date range, and tags.",
    label: "Search Obsidian Vault",
    parameters: Type.Object({
      query: Type.String({ description: "Natural language question or topic to search for" }),
      max_results: Type.Optional(
        Type.Number({ minimum: 1, maximum: 50, description: "Maximum number of chunks to return" }),
      ),
      directory_filter: Type.Optional(
        Type.Array(Type.String(), {
          description: "Limit search to specific vault subdirectories (e.g. ['Journal', 'Finance'])",
        }),
      ),
      date_range: Type.Optional(
        Type.Object({
          from: Type.Optional(Type.String({ description: "Start date (YYYY-MM-DD)" })),
          to: Type.Optional(Type.String({ description: "End date (YYYY-MM-DD)" })),
        }),
      ),
      tags: Type.Optional(
        Type.Array(Type.String(), {
          description: "Filter by hashtags found in notes (e.g. ['#mentalhealth', '#therapy'])",
        }),
      ),
    }),
    async execute(_id, params) {
      // Coerce the loosely-typed tool params into the typed SearchParams shape.
      const searchParams: SearchParams = {
        query: String(params.query),
        max_results: params.max_results != null ? Number(params.max_results) : undefined,
        directory_filter: params.directory_filter as string[] | undefined,
        date_range: params.date_range as { from?: string; to?: string } | undefined,
        tags: params.tags as string[] | undefined,
      };
      const result = await searchTool(config, searchParams);
      // The envelope is returned both as display text and as structured details.
      return textEnvelope(JSON.stringify(result), result);
    },
  });

  // obsidian_rag_index — trigger indexing
  api.registerTool({
    name: "obsidian_rag_index",
    description:
      "Trigger indexing of the Obsidian vault. Use 'full' for first-time setup, 'sync' for incremental updates, 'reindex' to force a clean rebuild.",
    label: "Index Obsidian Vault",
    parameters: Type.Object({
      mode: Type.Union(
        [Type.Literal("full"), Type.Literal("sync"), Type.Literal("reindex")],
        { description: "Indexing mode" },
      ),
    }),
    async execute(_id, params) {
      const indexParams: IndexParams = { mode: String(params.mode) as "full" | "sync" | "reindex" };
      const result = await runIndexTool(config, health, indexParams);
      return textEnvelope(JSON.stringify(result), result);
    },
  });

  // obsidian_rag_status — health check
  api.registerTool({
    name: "obsidian_rag_status",
    description:
      "Check the health of the Obsidian RAG plugin — index statistics, last sync time, unindexed files, and Ollama status. Call this first when unsure if the index is ready.",
    label: "Obsidian RAG Status",
    parameters: Type.Object({}),
    async execute(_id) {
      const result = await statusTool(config);
      return textEnvelope(JSON.stringify(result), result);
    },
  });

  // obsidian_rag_memory_store — commit facts to memory
  api.registerTool({
    name: "obsidian_rag_memory_store",
    description:
      "Commit an important fact from search results to OpenClaw's memory for faster future retrieval. Use after finding significant information (e.g. 'I owe Sreenivas $50') that should be remembered.",
    label: "Store in Memory",
    parameters: Type.Object({
      key: Type.String({ description: "Identifier for the fact (e.g. 'debt_to_sreenivas')" }),
      value: Type.String({ description: "The fact to remember" }),
      source: Type.String({
        description: "Source file path in the vault (e.g. 'Journal/2025-03-15.md')",
      }),
    }),
    async execute(_id, params) {
      const memParams: MemoryStoreParams = {
        key: String(params.key),
        value: String(params.value),
        source: String(params.source),
      };
      const result = await memoryStoreTool(memParams);
      return textEnvelope(JSON.stringify(result), result);
    },
  });
}
|
||||||
27
src/tools/memory.ts
Normal file
27
src/tools/memory.ts
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
/** obsidian_rag_memory_store tool implementation. */
|
||||||
|
|
||||||
|
import type { ResponseEnvelope } from "../utils/types.js";
|
||||||
|
import { makeEnvelope } from "../utils/response.js";
|
||||||
|
|
||||||
|
/** Parameters accepted by the obsidian_rag_memory_store tool. */
export interface MemoryStoreParams {
  /** Identifier for the fact (e.g. "debt_to_sreenivas"). */
  key: string;
  /** The fact text to remember. */
  value: string;
  /** Vault file path the fact came from. */
  source: string;
}
|
||||||
|
|
||||||
|
// In a real OpenClaw integration, this would store to the agent's memory system.
|
||||||
|
// For now, we just acknowledge the store operation.
|
||||||
|
export async function memoryStoreTool(
|
||||||
|
params: MemoryStoreParams,
|
||||||
|
): Promise<ResponseEnvelope<{ stored: boolean; key: string }>> {
|
||||||
|
console.log(`[obsidian-rag] memory_store: ${params.key} = ${params.value} (source: ${params.source})`);
|
||||||
|
|
||||||
|
return makeEnvelope(
|
||||||
|
"healthy",
|
||||||
|
{
|
||||||
|
stored: true,
|
||||||
|
key: params.key,
|
||||||
|
},
|
||||||
|
null,
|
||||||
|
);
|
||||||
|
}
|
||||||
44
src/tools/search.ts
Normal file
44
src/tools/search.ts
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
/** obsidian_rag_search tool implementation. */
|
||||||
|
|
||||||
|
import type { ObsidianRagConfig } from "../utils/config.js";
|
||||||
|
import type { ResponseEnvelope } from "../utils/types.js";
|
||||||
|
import type { SearchResult } from "../utils/types.js";
|
||||||
|
import { makeEnvelope } from "../utils/response.js";
|
||||||
|
import { searchVectorDb } from "../utils/lancedb.js";
|
||||||
|
|
||||||
|
/** Parameters accepted by the obsidian_rag_search tool. */
export interface SearchParams {
  /** Natural-language query text. */
  query: string;
  /** Maximum chunks to return (defaults to 5 in searchTool). */
  max_results?: number;
  /** Restrict hits to these vault subdirectories. */
  directory_filter?: string[];
  /** Inclusive date bounds, YYYY-MM-DD. */
  date_range?: { from?: string; to?: string };
  /** Restrict hits to notes carrying these hashtags. */
  tags?: string[];
}
|
||||||
|
|
||||||
|
export async function searchTool(
|
||||||
|
config: ObsidianRagConfig,
|
||||||
|
params: SearchParams,
|
||||||
|
): Promise<ResponseEnvelope<{ results: SearchResult[]; sensitive_detected: boolean } | null>> {
|
||||||
|
try {
|
||||||
|
const results = await searchVectorDb(config, params.query, {
|
||||||
|
max_results: params.max_results ?? 5,
|
||||||
|
directory_filter: params.directory_filter,
|
||||||
|
date_range: params.date_range,
|
||||||
|
tags: params.tags,
|
||||||
|
});
|
||||||
|
|
||||||
|
// TODO: Run sensitive content detection once we have actual results
|
||||||
|
return makeEnvelope(
|
||||||
|
results.length > 0 ? "healthy" : "degraded",
|
||||||
|
{ results, sensitive_detected: false },
|
||||||
|
null,
|
||||||
|
{ query_time_ms: 0, chunks_scanned: results.length },
|
||||||
|
);
|
||||||
|
} catch (err) {
|
||||||
|
return makeEnvelope("degraded", null, {
|
||||||
|
code: "SEARCH_FAILED",
|
||||||
|
message: String(err),
|
||||||
|
recoverable: true,
|
||||||
|
suggestion: "Check if the index exists with obsidian_rag_status",
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
44
src/tools/status.ts
Normal file
44
src/tools/status.ts
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
/** obsidian_rag_status tool implementation. */
|
||||||
|
|
||||||
|
import type { ObsidianRagConfig } from "../utils/config.js";
|
||||||
|
import type { ResponseEnvelope } from "../utils/types.js";
|
||||||
|
import { makeEnvelope } from "../utils/response.js";
|
||||||
|
import { readSyncResult } from "../services/indexer-bridge.js";
|
||||||
|
|
||||||
|
export async function statusTool(
|
||||||
|
config: ObsidianRagConfig,
|
||||||
|
): Promise<ResponseEnvelope<{
|
||||||
|
plugin_health: string;
|
||||||
|
total_docs: number;
|
||||||
|
total_chunks: number;
|
||||||
|
last_sync: string | null;
|
||||||
|
unindexed_files: number;
|
||||||
|
ollama_status: string;
|
||||||
|
active_job: null;
|
||||||
|
}>> {
|
||||||
|
const sync = readSyncResult(config);
|
||||||
|
const ollamaUp = await checkOllama(config);
|
||||||
|
|
||||||
|
return makeEnvelope(
|
||||||
|
sync ? "healthy" : "unavailable",
|
||||||
|
{
|
||||||
|
plugin_health: sync ? "healthy" : "unavailable",
|
||||||
|
total_docs: sync?.indexed_files ?? 0,
|
||||||
|
total_chunks: sync?.total_chunks ?? 0,
|
||||||
|
last_sync: sync?.timestamp ?? null,
|
||||||
|
unindexed_files: 0,
|
||||||
|
ollama_status: ollamaUp ? "up" : "down",
|
||||||
|
active_job: null,
|
||||||
|
},
|
||||||
|
null,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
async function checkOllama(config: ObsidianRagConfig): Promise<boolean> {
|
||||||
|
try {
|
||||||
|
const res = await fetch(`${config.embedding.base_url}/api/tags`, { signal: AbortSignal.timeout(3000) });
|
||||||
|
return res.ok;
|
||||||
|
} catch {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
111
src/utils/config.ts
Normal file
111
src/utils/config.ts
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
/** Config loader + TypeScript interfaces mirroring the Python config. */
|
||||||
|
|
||||||
|
import { readFileSync } from "fs";
|
||||||
|
import { resolve } from "path";
|
||||||
|
|
||||||
|
/** Ollama embedding backend settings. */
export interface EmbeddingConfig {
  provider: string; // e.g. "ollama"
  model: string; // embedding model name
  base_url: string; // Ollama HTTP endpoint
  dimensions: number; // embedding vector width
  batch_size: number; // texts per embedding request
}

/** Vector database settings. */
export interface VectorStoreConfig {
  type: string; // e.g. "lancedb"
  path: string; // DB directory; resolved by resolveVectorDbPath()
}

/** Chunking and file-selection settings for the indexer. */
export interface IndexingConfig {
  chunk_size: number; // target tokens/chars per chunk
  chunk_overlap: number; // overlap between adjacent chunks
  file_patterns: string[]; // globs of files to index
  deny_dirs: string[]; // directory names to skip
  allow_dirs: string[]; // empty = everything not denied
}

/** Guardrails for sensitive content and network usage. */
export interface SecurityConfig {
  require_confirmation_for: string[]; // categories needing user confirmation
  sensitive_sections: string[]; // hashtags marking sensitive content
  local_only: boolean; // forbid non-local endpoints
}

/** Keyword patterns that flag facts worth committing to memory. */
export interface MemoryPatterns {
  financial: string[];
  health: string[];
  commitments: string[];
}

/** Memory auto-suggestion settings. */
export interface MemoryConfig {
  auto_suggest: boolean;
  patterns: MemoryPatterns;
}

/** Top-level plugin configuration, mirroring the Python side's config. */
export interface ObsidianRagConfig {
  vault_path: string; // Obsidian vault root (relative paths resolved from CWD)
  embedding: EmbeddingConfig;
  vector_store: VectorStoreConfig;
  indexing: IndexingConfig;
  security: SecurityConfig;
  memory: MemoryConfig;
}
|
||||||
|
|
||||||
|
/** Built-in defaults; loadConfig() deep-merges the user's config.json over these. */
function defaults(): ObsidianRagConfig {
  return {
    vault_path: "./KnowledgeVault/Default",
    embedding: {
      provider: "ollama",
      model: "mxbai-embed-large",
      base_url: "http://localhost:11434",
      dimensions: 1024, // output width of mxbai-embed-large
      batch_size: 64,
    },
    vector_store: {
      type: "lancedb",
      path: "./obsidian-rag/vectors.lance",
    },
    indexing: {
      chunk_size: 500,
      chunk_overlap: 100,
      file_patterns: ["*.md"],
      deny_dirs: [".obsidian", ".trash", "zzz-Archive", ".git", ".logseq"],
      allow_dirs: [], // empty = allow everything not denied
    },
    security: {
      require_confirmation_for: ["health", "financial_debt"],
      sensitive_sections: ["#mentalhealth", "#physicalhealth", "#Relations"],
      local_only: true,
    },
    memory: {
      auto_suggest: true,
      patterns: {
        financial: ["owe", "owed", "debt", "paid", "$", "spent", "spend"],
        health: ["#mentalhealth", "#physicalhealth", "medication", "therapy"],
        commitments: ["shopping list", "costco", "amazon", "grocery"],
      },
    },
  };
}
|
||||||
|
|
||||||
|
export function loadConfig(configPath?: string): ObsidianRagConfig {
|
||||||
|
const defaultPath = resolve(process.cwd(), ".obsidian-rag", "config.json");
|
||||||
|
const path = configPath ?? defaultPath;
|
||||||
|
try {
|
||||||
|
const raw = JSON.parse(readFileSync(path, "utf-8"));
|
||||||
|
return deepMerge(defaults(), raw) as ObsidianRagConfig;
|
||||||
|
} catch {
|
||||||
|
return defaults();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function deepMerge<T extends object>(target: T, source: Partial<T>): T {
|
||||||
|
const out = { ...target };
|
||||||
|
for (const [key, val] of Object.entries(source)) {
|
||||||
|
if (val && typeof val === "object" && !Array.isArray(val)) {
|
||||||
|
(out as any)[key] = deepMerge((target as any)[key] ?? {}, val);
|
||||||
|
} else if (val !== undefined) {
|
||||||
|
(out as any)[key] = val;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
109
src/utils/lancedb.ts
Normal file
109
src/utils/lancedb.ts
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
/** LanceDB client for TypeScript — searches the pre-built index. */
|
||||||
|
|
||||||
|
import { resolve } from "path";
|
||||||
|
import type { ObsidianRagConfig } from "./config.js";
|
||||||
|
import type { SearchResult } from "./types.js";
|
||||||
|
|
||||||
|
export function resolveVectorDbPath(config: ObsidianRagConfig): string {
|
||||||
|
const vsp = config.vector_store.path;
|
||||||
|
// Special case: resolve nested paths where vector_store.path is itself inside data dir
|
||||||
|
if (vsp.startsWith("./obsidian-rag/") || vsp.includes("../")) return resolve(process.cwd(), vsp);
|
||||||
|
if (vsp.startsWith("/") || /^[A-Za-z]:/.test(vsp)) return vsp;
|
||||||
|
return resolve(process.cwd(), vsp);
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function embedQuery(
|
||||||
|
text: string,
|
||||||
|
config: ObsidianRagConfig,
|
||||||
|
): Promise<number[]> {
|
||||||
|
const url = `${config.embedding.base_url}/api/embeddings`;
|
||||||
|
const response = await fetch(url, {
|
||||||
|
method: "POST",
|
||||||
|
headers: { "Content-Type": "application/json" },
|
||||||
|
body: JSON.stringify({ model: config.embedding.model, prompt: text }),
|
||||||
|
signal: AbortSignal.timeout(30_000),
|
||||||
|
});
|
||||||
|
if (!response.ok) {
|
||||||
|
throw new Error(`Embedding request failed: ${response.status} ${response.statusText}`);
|
||||||
|
}
|
||||||
|
const data = (await response.json()) as { embedding?: number[]; embeddings?: number[][] };
|
||||||
|
return data.embedding ?? data.embeddings?.[0] ?? [];
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function searchVectorDb(
|
||||||
|
config: ObsidianRagConfig,
|
||||||
|
query: string,
|
||||||
|
options: {
|
||||||
|
max_results?: number;
|
||||||
|
directory_filter?: string[];
|
||||||
|
date_range?: { from?: string; to?: string };
|
||||||
|
tags?: string[];
|
||||||
|
} = {},
|
||||||
|
): Promise<SearchResult[]> {
|
||||||
|
const dbPath = resolveVectorDbPath(config);
|
||||||
|
|
||||||
|
// Dynamically import LanceDB to avoid issues at import time when not needed
|
||||||
|
const { connect } = await import("@lancedb/lancedb");
|
||||||
|
|
||||||
|
const db = await connect(dbPath);
|
||||||
|
const tableNames = await db.tableNames();
|
||||||
|
if (!tableNames.includes("obsidian_chunks")) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
const table = await db.openTable("obsidian_chunks");
|
||||||
|
|
||||||
|
// Build WHERE clause from filters
|
||||||
|
const conditions: string[] = [];
|
||||||
|
if (options.directory_filter && options.directory_filter.length > 0) {
|
||||||
|
const dirs = options.directory_filter.map((d) => `"${d}"`).join(", ");
|
||||||
|
conditions.push(`source_directory IN (${dirs})`);
|
||||||
|
}
|
||||||
|
if (options.date_range) {
|
||||||
|
if (options.date_range.from) {
|
||||||
|
conditions.push(`date >= '${options.date_range.from}'`);
|
||||||
|
}
|
||||||
|
if (options.date_range.to) {
|
||||||
|
conditions.push(`date <= '${options.date_range.to}'`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (options.tags && options.tags.length > 0) {
|
||||||
|
for (const tag of options.tags) {
|
||||||
|
// LanceDB stores tags as List<String>; use array_contains SQL function
|
||||||
|
conditions.push(`array_contains(tags, '${tag}')`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const whereClause = conditions.length > 0 ? conditions.join(" AND ") : undefined;
|
||||||
|
|
||||||
|
const limit = options.max_results ?? 5;
|
||||||
|
|
||||||
|
// Try vector search first; if Ollama is down embedQuery throws → fallback to FTS
|
||||||
|
let rows: Record<string, unknown>[];
|
||||||
|
try {
|
||||||
|
const queryVector = await embedQuery(query, config);
|
||||||
|
|
||||||
|
let queryBuilder = table.vectorSearch(queryVector);
|
||||||
|
if (whereClause) {
|
||||||
|
queryBuilder = queryBuilder.filter(whereClause);
|
||||||
|
}
|
||||||
|
rows = await queryBuilder.limit(limit).toArray();
|
||||||
|
} catch {
|
||||||
|
// Ollama unavailable — fallback to full-text search on chunk_text (BM25 scoring)
|
||||||
|
let ftsBuilder = table.query().fullTextSearch(query);
|
||||||
|
if (whereClause) {
|
||||||
|
ftsBuilder = ftsBuilder.filter(whereClause);
|
||||||
|
}
|
||||||
|
rows = await ftsBuilder.limit(limit).toArray();
|
||||||
|
}
|
||||||
|
|
||||||
|
return rows.map((r: Record<string, unknown>) => ({
|
||||||
|
chunk_id: r["chunk_id"] as string,
|
||||||
|
chunk_text: r["chunk_text"] as string,
|
||||||
|
source_file: r["source_file"] as string,
|
||||||
|
source_directory: r["source_directory"] as string,
|
||||||
|
section: (r["section"] as string) ?? null,
|
||||||
|
date: (r["date"] as string) ?? null,
|
||||||
|
tags: (r["tags"] as string[]) ?? [],
|
||||||
|
chunk_index: (r["chunk_index"] as number) ?? 0,
|
||||||
|
score: (r["_score"] as number) ?? (r["_distance"] as number) ?? 0.0,
|
||||||
|
}));
|
||||||
|
}
|
||||||
32
src/utils/response.ts
Normal file
32
src/utils/response.ts
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
/** Response envelope factory + error normalization. */
|
||||||
|
|
||||||
|
import type { ResponseEnvelope } from "./types.js";
|
||||||
|
|
||||||
|
export function makeEnvelope<T>(
|
||||||
|
status: "healthy" | "degraded" | "unavailable",
|
||||||
|
data: T | null,
|
||||||
|
error: ResponseEnvelope<T>["error"],
|
||||||
|
meta: Partial<ResponseEnvelope<T>["meta"]> = {},
|
||||||
|
): ResponseEnvelope<T> {
|
||||||
|
return {
|
||||||
|
status,
|
||||||
|
data,
|
||||||
|
error,
|
||||||
|
meta: {
|
||||||
|
query_time_ms: 0,
|
||||||
|
chunks_scanned: 0,
|
||||||
|
index_version: "0.1.0",
|
||||||
|
vault_mtime: new Date().toISOString(),
|
||||||
|
...meta,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
export function errorEnvelope(
|
||||||
|
code: string,
|
||||||
|
message: string,
|
||||||
|
recoverable: boolean,
|
||||||
|
suggestion: string,
|
||||||
|
) {
|
||||||
|
return makeEnvelope<null>("unavailable", null, { code, message, recoverable, suggestion });
|
||||||
|
}
|
||||||
32
src/utils/types.ts
Normal file
32
src/utils/types.ts
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
/** Shared TypeScript types across the plugin. */
|
||||||
|
|
||||||
|
export interface SearchResult {
|
||||||
|
chunk_id: string;
|
||||||
|
chunk_text: string;
|
||||||
|
source_file: string;
|
||||||
|
source_directory: string;
|
||||||
|
section: string | null;
|
||||||
|
date: string | null;
|
||||||
|
tags: string[];
|
||||||
|
chunk_index: number;
|
||||||
|
score: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Standard wrapper around every tool response payload. */
export interface ResponseEnvelope<T> {
  /** Overall health of the tool that produced this response. */
  status: "healthy" | "degraded" | "unavailable";
  /** Payload, or null when no data could be produced. */
  data: T | null;
  /** Error details, or null on success. */
  error: {
    /** Machine-readable error code. */
    code: string;
    /** Human-readable description of the failure. */
    message: string;
    /** Whether retrying may succeed. */
    recoverable: boolean;
    /** Actionable next step for the user. */
    suggestion: string;
  } | null;
  /** Diagnostics describing how the response was produced. */
  meta: {
    /** Time spent serving the query, in milliseconds. */
    query_time_ms: number;
    /** Number of chunks examined while answering. */
    chunks_scanned: number;
    /** Version string of the index format. */
    index_version: string;
    /** ISO-8601 timestamp string (defaulted to envelope creation time). */
    vault_mtime: string;
  };
}
|
||||||
|
|
||||||
|
/** Health states a tool can report (same union as ResponseEnvelope.status). */
export type ToolStatus = "healthy" | "degraded" | "unavailable";
|
||||||
156
tests/unit/lancedb.test.ts
Normal file
156
tests/unit/lancedb.test.ts
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
/** Unit tests for the TS LanceDB client. */
|
||||||
|
|
||||||
|
import { describe, it, expect, vi, beforeEach } from "vitest";
|
||||||
|
import { resolve } from "path";
|
||||||
|
|
||||||
|
// ------------------------------------------------------------------
|
||||||
|
// resolveVectorDbPath — test the standalone logic
|
||||||
|
// ------------------------------------------------------------------
|
||||||
|
|
||||||
|
function resolveVectorDbPath(config: {
|
||||||
|
vector_store: { path: string };
|
||||||
|
}): string {
|
||||||
|
const vsp = config.vector_store.path;
|
||||||
|
if (vsp.startsWith("./obsidian-rag/") || vsp.includes("../"))
|
||||||
|
return resolve(process.cwd(), vsp);
|
||||||
|
if (vsp.startsWith("/") || /^[A-Za-z]:/.test(vsp)) return vsp;
|
||||||
|
return resolve(process.cwd(), vsp);
|
||||||
|
}
|
||||||
|
|
||||||
|
const makeConfig = (vectorPath: string) => ({
|
||||||
|
vault_path: "./KnowledgeVault/Default",
|
||||||
|
embedding: {
|
||||||
|
provider: "ollama",
|
||||||
|
model: "mxbai-embed-large",
|
||||||
|
base_url: "http://localhost:11434",
|
||||||
|
dimensions: 1024,
|
||||||
|
batch_size: 64,
|
||||||
|
},
|
||||||
|
vector_store: { type: "lancedb", path: vectorPath },
|
||||||
|
indexing: {
|
||||||
|
chunk_size: 500,
|
||||||
|
chunk_overlap: 100,
|
||||||
|
file_patterns: ["*.md"],
|
||||||
|
deny_dirs: [],
|
||||||
|
allow_dirs: [],
|
||||||
|
},
|
||||||
|
security: {
|
||||||
|
require_confirmation_for: [],
|
||||||
|
sensitive_sections: [],
|
||||||
|
local_only: true,
|
||||||
|
},
|
||||||
|
memory: {
|
||||||
|
auto_suggest: true,
|
||||||
|
patterns: { financial: [], health: [], commitments: [] },
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
describe("resolveVectorDbPath", () => {
|
||||||
|
it("returns absolute paths unchanged", () => {
|
||||||
|
const result = resolveVectorDbPath(makeConfig("/absolute/path/to/db.lance"));
|
||||||
|
expect(result).toBe("/absolute/path/to/db.lance");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("resolves relative paths against cwd", () => {
|
||||||
|
const result = resolveVectorDbPath(makeConfig("./local/path.db"));
|
||||||
|
expect(result).toContain("local/path.db");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("detects nested obsidian-rag path prefix", () => {
|
||||||
|
const result = resolveVectorDbPath(makeConfig("./obsidian-rag/vectors.lance"));
|
||||||
|
expect(result).toContain("obsidian-rag/vectors.lance");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("resolves parent traversal paths normally (resolve() strips ..)", () => {
|
||||||
|
// After resolve(), ../escape/path.db becomes /cwd/escape/path.db
|
||||||
|
// The function resolves it as-is; the TS path is a simple passthrough
|
||||||
|
const result = resolveVectorDbPath(makeConfig("../escape/path.db"));
|
||||||
|
expect(result).toContain("escape");
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// ------------------------------------------------------------------
|
||||||
|
// embedQuery — test the standalone logic
|
||||||
|
// ------------------------------------------------------------------
|
||||||
|
|
||||||
|
async function embedQuery(
|
||||||
|
text: string,
|
||||||
|
config: { embedding: { base_url: string; model: string } }
|
||||||
|
): Promise<number[]> {
|
||||||
|
const url = `${config.embedding.base_url}/api/embeddings`;
|
||||||
|
const response = await fetch(url, {
|
||||||
|
method: "POST",
|
||||||
|
headers: { "Content-Type": "application/json" },
|
||||||
|
body: JSON.stringify({ model: config.embedding.model, prompt: text }),
|
||||||
|
signal: AbortSignal.timeout(30_000),
|
||||||
|
});
|
||||||
|
if (!response.ok) {
|
||||||
|
throw new Error(
|
||||||
|
`Embedding request failed: ${response.status} ${response.statusText}`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
const data = (await response.json()) as {
|
||||||
|
embedding?: number[];
|
||||||
|
embeddings?: number[][];
|
||||||
|
};
|
||||||
|
return data.embedding ?? data.embeddings?.[0] ?? [];
|
||||||
|
}
|
||||||
|
|
||||||
|
describe("embedQuery", () => {
  // Fresh fetch mock per test so call counts and queued responses don't leak.
  beforeEach(() => {
    global.fetch = vi.fn();
  });

  it("posts to the correct embeddings endpoint", async () => {
    // Happy-path response carrying a top-level `embedding` array.
    (global.fetch as ReturnType<typeof vi.fn>).mockResolvedValueOnce({
      ok: true,
      json: async () => ({ embedding: [0.1, 0.2, 0.3] }),
    } as Response);

    const config = makeConfig("./vectors.lance");
    await embedQuery("hello world", config);

    // Verify URL is derived from config.embedding.base_url, and the JSON
    // body carries the configured model plus the query text as `prompt`.
    expect(global.fetch).toHaveBeenCalledWith(
      "http://localhost:11434/api/embeddings",
      expect.objectContaining({
        method: "POST",
        body: JSON.stringify({
          model: "mxbai-embed-large",
          prompt: "hello world",
        }),
      })
    );
  });

  it("throws on non-ok response", async () => {
    // Simulate a server failure; embedQuery should surface status + statusText.
    (global.fetch as ReturnType<typeof vi.fn>).mockResolvedValueOnce({
      ok: false,
      status: 500,
      statusText: "Internal Server Error",
    } as Response);

    await expect(
      embedQuery("test", makeConfig("./vectors.lance"))
    ).rejects.toThrow("Embedding request failed: 500 Internal Server Error");
  });

  it("falls back to embeddings[0] when top-level embedding absent", async () => {
    // Alternative API shape: batched `embeddings` array of vectors.
    (global.fetch as ReturnType<typeof vi.fn>).mockResolvedValueOnce({
      ok: true,
      json: async () => ({ embeddings: [[0.5, 0.6, 0.7]] }),
    } as Response);

    const result = await embedQuery("test", makeConfig("./vectors.lance"));
    expect(result).toEqual([0.5, 0.6, 0.7]);
  });

  it("returns empty array when no embedding in response", async () => {
    // Neither response shape present: the final `?? []` fallback applies.
    (global.fetch as ReturnType<typeof vi.fn>).mockResolvedValueOnce({
      ok: true,
      json: async () => ({}),
    } as Response);

    const result = await embedQuery("test", makeConfig("./vectors.lance"));
    expect(result).toEqual([]);
  });
});
|
||||||
50
tests/unit/response.test.ts
Normal file
50
tests/unit/response.test.ts
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
/** Unit tests for security guard and response envelope utilities in TS. */
|
||||||
|
|
||||||
|
import { describe, it, expect } from "vitest";
|
||||||
|
import { makeEnvelope, errorEnvelope } from "../../src/utils/response.js";
|
||||||
|
|
||||||
|
describe("makeEnvelope", () => {
|
||||||
|
it("creates a healthy envelope with data", () => {
|
||||||
|
const envelope = makeEnvelope<string[]>(
|
||||||
|
"healthy",
|
||||||
|
["a", "b"],
|
||||||
|
null,
|
||||||
|
{ query_time_ms: 42 }
|
||||||
|
);
|
||||||
|
expect(envelope.status).toBe("healthy");
|
||||||
|
expect(envelope.data).toEqual(["a", "b"]);
|
||||||
|
expect(envelope.error).toBeNull();
|
||||||
|
expect(envelope.meta.query_time_ms).toBe(42);
|
||||||
|
});
|
||||||
|
|
||||||
|
it("creates a degraded envelope without data", () => {
|
||||||
|
const envelope = makeEnvelope("degraded", null, null, { chunks_scanned: 0 });
|
||||||
|
expect(envelope.status).toBe("degraded");
|
||||||
|
expect(envelope.data).toBeNull();
|
||||||
|
});
|
||||||
|
|
||||||
|
it("defaults meta fields", () => {
|
||||||
|
const envelope = makeEnvelope("healthy", [], null, {});
|
||||||
|
expect(envelope.meta.index_version).toBe("0.1.0");
|
||||||
|
expect(envelope.meta.vault_mtime).toBeDefined();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe("errorEnvelope", () => {
|
||||||
|
it("creates an unavailable error envelope", () => {
|
||||||
|
const envelope = errorEnvelope(
|
||||||
|
"INDEX_NOT_FOUND",
|
||||||
|
"Vector index not found at expected path",
|
||||||
|
false,
|
||||||
|
"Run 'obsidian-rag index' to create the index"
|
||||||
|
);
|
||||||
|
expect(envelope.status).toBe("unavailable");
|
||||||
|
expect(envelope.data).toBeNull();
|
||||||
|
expect(envelope.error).toEqual({
|
||||||
|
code: "INDEX_NOT_FOUND",
|
||||||
|
message: "Vector index not found at expected path",
|
||||||
|
recoverable: false,
|
||||||
|
suggestion: "Run 'obsidian-rag index' to create the index",
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
19
tsconfig.json
Normal file
19
tsconfig.json
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
{
|
||||||
|
"compilerOptions": {
|
||||||
|
"target": "ES2022",
|
||||||
|
"module": "ES2022",
|
||||||
|
"moduleResolution": "bundler",
|
||||||
|
"lib": ["ES2022"],
|
||||||
|
"outDir": "./dist",
|
||||||
|
"rootDir": "./src",
|
||||||
|
"strict": true,
|
||||||
|
"noUncheckedIndexedAccess": true,
|
||||||
|
"noImplicitOverride": true,
|
||||||
|
"esModuleInterop": true,
|
||||||
|
"skipLibCheck": true,
|
||||||
|
"forceConsistentCasingInFileNames": true,
|
||||||
|
"resolveJsonModule": true
|
||||||
|
},
|
||||||
|
"include": ["src/**/*"],
|
||||||
|
"exclude": ["node_modules", "dist", "tests"]
|
||||||
|
}
|
||||||
7
vitest.config.ts
Normal file
7
vitest.config.ts
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
import { defineConfig } from "vitest/config";
|
||||||
|
|
||||||
|
export default defineConfig({
|
||||||
|
test: {
|
||||||
|
include: ["tests/**/*.test.ts"],
|
||||||
|
},
|
||||||
|
});
|
||||||
Reference in New Issue
Block a user