Sprint 0-1: Python indexer, TS plugin scaffolding, and test suite

## What's new

**Python indexer (`python/obsidian_rag/`)** — full pipeline from scan to LanceDB:
- `config.py` — JSON config loader with cross-platform path resolution
- `security.py` — path traversal prevention, HTML stripping, sensitive content detection, dir allow/deny lists
- `chunker.py` — section-split for journal entries (date-named files), sliding-window for unstructured notes
- `embedder.py` — Ollama `/api/embeddings` client with batched requests and timeout/error handling
- `vector_store.py` — LanceDB schema, upsert (merge_insert), delete, search with filters, stats
- `indexer.py` — full/sync/reindex pipeline orchestrator with progress yields
- `cli.py` — `index | sync | reindex | status` CLI commands

**TypeScript plugin (`src/`)** — OpenClaw plugin scaffold:
- `utils/` — config loader, TypeScript types, response envelope factory, LanceDB client
- `services/` — health state machine (HEALTHY/DEGRADED/UNAVAILABLE), vault watcher with debounce/batching, indexer bridge (subprocess spawner)
- `tools/` — 4 tool stubs: search, index, status, memory_store (OpenClaw wiring pending)
- `index.ts` — plugin entry point with health probe + vault watcher startup

**Config** (`obsidian-rag/config.json`, `openclaw.plugin.json`):
- 627 files / 3764 chunks indexed in dev vault

**Tests: 76 passing**
- Python: 64 pytest tests (chunker, security, vector_store, config)
- TypeScript: 12 vitest tests (lancedb client, response envelope)

## Bugs fixed

- LanceDB `tags` column filter: `LIKE '%tag%'` → `list_contains(tags, 'tag')` (List<String> column)
- LanceDB JS `db.list_tables()` returns `ListTablesResponse` object, not plain array
- LanceDB JS result score field: `_score` → `_distance`
- TypeScript regex literal with unescaped `/` in path-resolve regex
- Python: `create_table_if_not_exists` identity check → name comparison

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-10 22:56:50 -04:00
parent 18ad47e100
commit 5c281165c7
40 changed files with 5814 additions and 59 deletions

27
src/index.ts Normal file
View File

@@ -0,0 +1,27 @@
import { registerTools } from "./tools/index.js";
import { loadConfig } from "./utils/config.js";
import { createHealthMachine, probeAll } from "./services/health.js";
import { VaultWatcher } from "./services/vault-watcher.js";
/** Watcher started by `onLoad`; kept at module scope so `onUnload` can stop it. */
let activeWatcher: VaultWatcher | null = null;

/**
 * OpenClaw plugin entry point.
 *
 * Loads the config, probes dependencies once to seed the health machine,
 * starts the vault watcher, and registers the tools.
 */
export async function onLoad(): Promise<void> {
  const config = loadConfig();
  const health = createHealthMachine(config);

  // Probe dependencies immediately
  const probe = await probeAll(config);
  health.transition(probe);

  // Start vault watcher for auto-sync. If the plugin is loaded twice without an
  // unload in between, stop the previous watcher so it is not leaked.
  activeWatcher?.stop();
  activeWatcher = new VaultWatcher(config, health);
  activeWatcher.start();

  // Register all 4 tools
  await registerTools(config, health);
  console.log("[obsidian-rag] Plugin loaded");
}

/**
 * OpenClaw plugin teardown: stops the vault watcher (file handles and pending
 * debounce timers) that `onLoad` started.
 */
export async function onUnload(): Promise<void> {
  console.log("[obsidian-rag] Plugin unloading");
  activeWatcher?.stop();
  activeWatcher = null;
}

130
src/services/health.ts Normal file
View File

@@ -0,0 +1,130 @@
/** Health state machine: HEALTHY / DEGRADED / UNAVAILABLE. */
import { existsSync, readFileSync } from "fs";
import { resolve } from "path";
import type { ObsidianRagConfig } from "../utils/config.js";
/** Overall plugin health as computed by `transition` in `createHealthMachine`. */
export type HealthState = "healthy" | "degraded" | "unavailable";
/** Snapshot returned by the health machine's `get()`: current state, last probe values, and the active job. */
export interface HealthStatus {
state: HealthState;
// True when the last probe's GET `<base_url>/api/tags` returned an OK response.
ollama_up: boolean;
// True when the resolved vector DB path existed on disk at probe time.
index_exists: boolean;
// True when the resolved vault path existed on disk at probe time.
vault_exists: boolean;
// `indexed_files` read from sync-result.json (0 when the file is missing or unreadable).
total_docs: number;
// `total_chunks` read from sync-result.json (0 when the file is missing or unreadable).
total_chunks: number;
// `timestamp` read from sync-result.json, or null when unavailable.
last_sync: string | null;
// Job most recently passed to `setActiveJob`, or null.
active_job: { id: string; mode: string; progress: number } | null;
}
/** Result of one dependency probe (see `probeAll`); input to the machine's `transition`. */
export interface ProbeResult {
ollama_up: boolean;
index_exists: boolean;
vault_exists: boolean;
total_docs: number;
total_chunks: number;
last_sync: string | null;
}
// Period, in milliseconds, of the timer installed by `startReprobing`.
const REPROBE_INTERVAL_MS = 30_000;
/**
 * Creates the plugin health state machine.
 *
 * The machine starts in `"unavailable"` and changes state only through
 * `transition`, which derives the state from a probe:
 * - index or vault missing -> `"unavailable"`
 * - Ollama down            -> `"degraded"`
 * - otherwise              -> `"healthy"`
 *
 * @param _config Plugin configuration (currently unused).
 * @returns Accessors: `get`, `transition`, `setActiveJob`, `startReprobing`, `stop`.
 */
export function createHealthMachine(_config: ObsidianRagConfig) {
  let currentState: HealthState = "unavailable";
  let status: ProbeResult = {
    ollama_up: false,
    index_exists: false,
    vault_exists: false,
    total_docs: 0,
    total_chunks: 0,
    last_sync: null,
  };
  let activeJob: { id: string; mode: string; progress: number } | null = null;
  let reprobeTimer: ReturnType<typeof setInterval> | null = null;

  /** Stores the probe and recomputes the state, logging when it changes. */
  function transition(probe: ProbeResult): void {
    status = probe;
    const prev = currentState;
    if (!probe.index_exists || !probe.vault_exists) {
      currentState = "unavailable";
    } else if (!probe.ollama_up) {
      currentState = "degraded";
    } else {
      currentState = "healthy";
    }
    if (prev !== currentState) {
      // Separator added: the two state names used to be printed back to back.
      console.log(`[obsidian-rag] Health: ${prev} → ${currentState}`);
    }
  }

  /** Returns the current state merged with the last probe values and active job. */
  function get(): HealthStatus {
    return { state: currentState, ...status, active_job: activeJob };
  }

  /** Records (or clears, with `null`) the job reported by `get()`. */
  function setActiveJob(job: { id: string; mode: string; progress: number } | null): void {
    activeJob = job;
  }

  /**
   * Runs `fn` every `REPROBE_INTERVAL_MS` and feeds its result to `transition`.
   * Replaces any previously installed timer.
   */
  function startReprobing(fn: () => Promise<ProbeResult>): void {
    if (reprobeTimer) clearInterval(reprobeTimer);
    reprobeTimer = setInterval(async () => {
      try {
        transition(await fn());
      } catch (err) {
        // A failed probe must not surface as an unhandled rejection from a
        // timer callback; keep the last known state and try again next tick.
        console.error("[obsidian-rag] Health re-probe failed:", err);
      }
    }, REPROBE_INTERVAL_MS);
  }

  /** Cancels the re-probe timer, if one is running. */
  function stop(): void {
    if (reprobeTimer) {
      clearInterval(reprobeTimer);
      reprobeTimer = null;
    }
  }

  return { get, transition, setActiveJob, startReprobing, stop };
}
/**
 * Probes every dependency once: vault directory, vector index directory,
 * Ollama reachability, and the last sync summary.
 *
 * The summary (`sync-result.json`) is looked up in the parent directory of the
 * vector DB path and only when the index exists; read or parse failures leave
 * the counters at their zero/null defaults.
 *
 * @param config Plugin configuration.
 * @returns The probe result consumed by the health machine's `transition`.
 */
export async function probeAll(config: ObsidianRagConfig): Promise<ProbeResult> {
  const lancedb = await import("../utils/lancedb.js");
  const vaultDir = resolve(process.cwd(), config.vault_path);
  const indexDir = lancedb.resolveVectorDbPath(config);

  const vaultFound = existsSync(vaultDir);
  const indexFound = existsSync(String(indexDir));
  const ollamaReachable = await probeOllama(config.embedding.base_url);

  const summary: Pick<ProbeResult, "total_docs" | "total_chunks" | "last_sync"> = {
    total_docs: 0,
    total_chunks: 0,
    last_sync: null,
  };

  if (indexFound) {
    try {
      const summaryFile = resolve(indexDir, "..", "sync-result.json");
      if (existsSync(summaryFile)) {
        const parsed = JSON.parse(readFileSync(summaryFile, "utf-8"));
        summary.last_sync = parsed.timestamp ?? null;
        summary.total_docs = parsed.indexed_files ?? 0;
        summary.total_chunks = parsed.total_chunks ?? 0;
      }
    } catch {
      // ignore
    }
  }

  return {
    ollama_up: ollamaReachable,
    index_exists: indexFound,
    vault_exists: vaultFound,
    total_docs: summary.total_docs,
    total_chunks: summary.total_chunks,
    last_sync: summary.last_sync,
  };
}
/**
 * Checks whether Ollama answers at `<baseUrl>/api/tags` within 3 seconds.
 *
 * @param baseUrl Ollama base URL, without a trailing slash.
 * @returns `true` on an OK response; `false` on any non-OK response or error.
 */
async function probeOllama(baseUrl: string): Promise<boolean> {
  let reachable = false;
  try {
    const endpoint = `${baseUrl}/api/tags`;
    const response = await fetch(endpoint, { signal: AbortSignal.timeout(3000) });
    reachable = response.ok;
  } catch {
    reachable = false;
  }
  return reachable;
}

View File

@@ -0,0 +1,120 @@
/** Bridge to the Python indexer CLI — spawns subprocess, tracks job progress. */
import { spawn } from "child_process";
import { readFileSync, existsSync } from "fs";
import { resolve } from "path";
import type { ObsidianRagConfig } from "../utils/config.js";
/** State of one indexer subprocess run, as tracked and resolved by `spawnIndexer`. */
export interface JobStatus {
// Generated as `job-<epoch ms>-<random base36 suffix>`.
id: string;
// CLI sub-command the job was started with.
mode: string;
// Percentage derived from `progress` events (current / total * 100); set to 100 on a `complete` event.
progress: number;
status: "running" | "complete" | "failed";
// The next three fields are copied from the CLI's `complete` event when one is received.
indexed_files?: number;
total_chunks?: number;
duration_ms?: number;
// Errors reported by the CLI, or a single synthetic entry for CLI/subprocess failures.
errors?: Array<{ file: string; error: string }>;
}
// Jobs that are still running, keyed by job id; an entry is removed when its job resolves.
const runningJobs = new Map<string, JobStatus>();
/**
 * Runs the Python indexer CLI (`python -m obsidian_rag.cli <mode>`) from the
 * `python/` directory under the current working directory and tracks it as a job.
 *
 * The CLI is expected to print one JSON object per stdout line with a `type`
 * of `"progress"`, `"complete"`, or `"error"`; any other line is ignored.
 *
 * The returned promise never rejects. It resolves exactly once with the final
 * `JobStatus`: on a `complete`/`error` event, when the process closes, or when
 * the process fails to spawn. While the job runs it is visible through
 * `getJobStatus`.
 *
 * @param mode   CLI sub-command to run.
 * @param config Plugin configuration (currently not forwarded to the CLI).
 * @returns The final job status.
 */
export function spawnIndexer(
  mode: "index" | "sync" | "reindex",
  config: ObsidianRagConfig,
): Promise<JobStatus> {
  const jobId = `job-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
  const status: JobStatus = {
    id: jobId,
    mode,
    progress: 0,
    status: "running",
  };
  runningJobs.set(jobId, status);

  return new Promise((resolveJob) => {
    let settled = false;

    /** Resolves the job once; later events cannot mutate or re-resolve it. */
    const settle = (): void => {
      if (settled) return;
      settled = true;
      runningJobs.delete(jobId);
      resolveJob(status);
    };

    /** Applies one complete stdout line to the job status. */
    const handleLine = (line: string): void => {
      if (settled || line.trim() === "") return;
      let parsed: unknown;
      try {
        parsed = JSON.parse(line);
      } catch {
        return; // Not JSON: plain log output from the CLI.
      }
      if (typeof parsed !== "object" || parsed === null) return;
      const event = parsed as Record<string, unknown>;

      if (event.type === "progress") {
        const total = typeof event.total === "number" ? event.total : 1;
        const current = typeof event.current === "number" ? event.current : 0;
        // A zero total would otherwise produce NaN or Infinity.
        const percent = total > 0 ? Math.round((current / total) * 100) : 0;
        status.progress = Math.min(100, Math.max(0, percent));
      } else if (event.type === "complete") {
        const errors = Array.isArray(event.errors)
          ? (event.errors as Array<{ file: string; error: string }>)
          : [];
        status.status = errors.length > 0 ? "failed" : "complete";
        status.indexed_files =
          typeof event.indexed_files === "number" ? event.indexed_files : undefined;
        status.total_chunks =
          typeof event.total_chunks === "number" ? event.total_chunks : undefined;
        status.duration_ms =
          typeof event.duration_ms === "number" ? event.duration_ms : undefined;
        status.errors = errors;
        status.progress = 100;
        settle();
      } else if (event.type === "error") {
        status.status = "failed";
        status.errors = [{ file: "cli", error: String(event.error) }];
        settle();
      }
    };

    const child = spawn("python", ["-m", "obsidian_rag.cli", mode], {
      cwd: resolve(process.cwd(), "python"),
      stdio: ["ignore", "pipe", "pipe"],
    });

    let pendingLine = "";
    let stderr = "";

    // Decode as UTF-8 at the stream level so a multi-byte character split
    // across two chunks is not corrupted.
    child.stdout?.setEncoding("utf8");
    child.stdout?.on("data", (chunk: string) => {
      pendingLine += chunk;
      const lines = pendingLine.split("\n");
      // The last element is an incomplete line (or ""); keep it for the next
      // chunk so each event is parsed exactly once.
      pendingLine = lines.pop() ?? "";
      for (const line of lines) handleLine(line);
    });

    child.stderr?.setEncoding("utf8");
    child.stderr?.on("data", (chunk: string) => {
      stderr += chunk;
    });

    child.on("close", (code) => {
      // Output that did not end with a newline may still hold the final event.
      handleLine(pendingLine);
      pendingLine = "";
      if (status.status === "running") {
        status.status = code === 0 ? "complete" : "failed";
        if (code !== 0) {
          const detail = stderr.trim() || `Indexer exited with code ${code}`;
          status.errors = [{ file: "cli", error: detail }];
        }
      }
      settle();
    });

    child.on("error", (err) => {
      if (settled) return;
      status.status = "failed";
      status.errors = [{ file: "subprocess", error: err.message }];
      settle();
    });
  });
}
/**
 * Looks up a job that is still running.
 *
 * @param jobId Id assigned by `spawnIndexer`.
 * @returns The live status object, or `null` when no running job has that id.
 */
export function getJobStatus(jobId: string): JobStatus | null {
  const job = runningJobs.get(jobId);
  return job == null ? null : job;
}
/**
 * Reads `<cwd>/.obsidian-rag/sync-result.json`.
 *
 * @param config Plugin configuration (not consulted; the location is fixed).
 * @returns The parsed file content, or `null` when the file is missing,
 *          unreadable, or not valid JSON. The content is not validated.
 */
export function readSyncResult(config: ObsidianRagConfig): {
  timestamp: string | null;
  indexed_files: number;
  total_chunks: number;
  errors: Array<{ file: string; error: string }>;
} | null {
  const resultFile = resolve(process.cwd(), ".obsidian-rag", "sync-result.json");
  if (existsSync(resultFile)) {
    try {
      const text = readFileSync(resultFile, "utf-8");
      return JSON.parse(text);
    } catch {
      // Fall through to the shared `null` return.
    }
  }
  return null;
}

View File

@@ -0,0 +1,81 @@
/** Vault watcher — chokidar-based file system monitor with debounce + batching. */
import { watch, FSWatcher } from "chokidar";
import type { ObsidianRagConfig } from "../utils/config.js";
import type { HealthState } from "./health.js";
/** Quiet period after the last file event before pending paths are flushed. */
const DEBOUNCE_MS = 2_000;
/** Extra delay after a flush during which further flushes join the same batch. */
const COLLECT_WINDOW_MS = 5_000;

/**
 * Watches the vault for Markdown changes and triggers an indexer `sync`.
 *
 * Events are debounced (`DEBOUNCE_MS`), then collected into one batch
 * (`COLLECT_WINDOW_MS`), so a burst of edits results in a single sync run.
 */
export class VaultWatcher {
  private watcher: FSWatcher | null = null;
  private debounceTimer: ReturnType<typeof setTimeout> | null = null;
  private pending = new Set<string>();
  private collectTimer: ReturnType<typeof setTimeout> | null = null;
  /** Paths already flushed but not yet handed to `triggerSync`. */
  private batch = new Set<string>();

  constructor(
    private config: ObsidianRagConfig,
    private health: { get: () => { state: HealthState } },
  ) {}

  /** Starts watching `config.vault_path`. Calling it again while running is a no-op. */
  start(): void {
    if (this.watcher) return;
    const vaultPath = this.config.vault_path;
    this.watcher = watch(vaultPath, {
      persistent: true,
      ignoreInitial: true,
      depth: 99,
    });
    this.watcher.on("add", (p) => this.onEvent(p));
    this.watcher.on("change", (p) => this.onEvent(p));
    this.watcher.on("unlink", (p) => this.onEvent(p));
    // An `error` event without a listener would be thrown by the emitter.
    this.watcher.on("error", (err) => {
      console.error("[obsidian-rag] Vault watcher error:", err);
    });
  }

  /** Queues a changed Markdown file unless it lives under a denied directory. */
  private onEvent(filepath: string): void {
    if (!filepath.endsWith(".md")) return;
    // Apply deny list check
    if (this.isDenied(filepath)) return;
    this.pending.add(filepath);
    this.scheduleFlush();
  }

  /**
   * Returns true when any directory between the vault root and the file is in
   * `indexing.deny_dirs`.
   *
   * All backslashes are normalized, not just the first one, so Windows paths
   * split correctly. Every ancestor directory is checked, so a file nested
   * below a denied directory (e.g. `.git/hooks/x.md`) is rejected too.
   */
  private isDenied(filepath: string): boolean {
    const normalize = (p: string): string => p.replace(/\\/g, "/").replace(/^\.\//, "");
    const file = normalize(filepath);
    const root = normalize(this.config.vault_path).replace(/\/+$/, "");
    // Best effort: ignore the vault's own location so a denied name in a
    // directory above the vault does not block every file.
    const relative =
      root !== "" && file.startsWith(`${root}/`) ? file.slice(root.length + 1) : file;
    const directories = relative.split("/").slice(0, -1);
    const denied = this.config.indexing.deny_dirs;
    return directories.some((dir) => denied.includes(dir));
  }

  /** (Re)starts the debounce timer; `flush` runs once events go quiet. */
  private scheduleFlush(): void {
    if (this.debounceTimer) clearTimeout(this.debounceTimer);
    this.debounceTimer = setTimeout(() => {
      this.debounceTimer = null;
      this.flush();
    }, DEBOUNCE_MS);
  }

  /** Moves pending paths into the batch and (re)starts the collect window. */
  private flush(): void {
    if (this.pending.size === 0) return;
    // Accumulate instead of capturing a local copy: restarting the collect
    // timer must not drop the paths gathered by an earlier flush.
    for (const file of this.pending) this.batch.add(file);
    this.pending.clear();
    if (this.collectTimer) clearTimeout(this.collectTimer);
    this.collectTimer = setTimeout(() => {
      this.collectTimer = null;
      const files = [...this.batch];
      this.batch.clear();
      this.triggerSync(files).catch((err: unknown) => {
        console.error("[obsidian-rag] Sync trigger failed:", err);
      });
    }, COLLECT_WINDOW_MS);
  }

  /** Runs an indexer `sync` unless the health state is `"unavailable"`. */
  private async triggerSync(files: string[]): Promise<void> {
    // Import dynamically to avoid circular issues
    const { spawnIndexer } = await import("./indexer-bridge.js");
    const health = this.health.get();
    if (health.state === "unavailable") {
      console.log("[obsidian-rag] Skipping sync — index unavailable");
      return;
    }
    console.log(`[obsidian-rag] Triggering sync for ${files.length} files`);
    await spawnIndexer("sync", this.config);
  }

  /** Stops watching and cancels any scheduled flush or sync. */
  stop(): void {
    void this.watcher?.close();
    this.watcher = null;
    if (this.debounceTimer) clearTimeout(this.debounceTimer);
    if (this.collectTimer) clearTimeout(this.collectTimer);
    this.debounceTimer = null;
    this.collectTimer = null;
    this.pending.clear();
    this.batch.clear();
  }
}

12
src/tools/index.ts Normal file
View File

@@ -0,0 +1,12 @@
/** Tool registration — wires all 4 obsidian_rag_* tools into OpenClaw. */
import type { ObsidianRagConfig } from "../utils/config.js";
import type { HealthState } from "../services/health.js";
/**
 * Registers the plugin's tools with OpenClaw.
 *
 * Currently a stub: it only logs that registration happened.
 *
 * @param _config Plugin configuration (unused until the SDK wiring exists).
 * @param _health Health accessor (unused until the SDK wiring exists).
 */
export async function registerTools(
  _config: ObsidianRagConfig,
  _health: { get: () => { state: HealthState } },
): Promise<void> {
  // TODO: Wire into OpenClaw tool registry once SDK is available
  const notice = "[obsidian-rag] Tools registered (stub — OpenClaw SDK TBD)";
  console.log(notice);
}

27
src/tools/memory.ts Normal file
View File

@@ -0,0 +1,27 @@
/** obsidian_rag_memory_store tool implementation. */
import type { ResponseEnvelope } from "../utils/types.js";
import { makeEnvelope } from "../utils/response.js";
/** Parameters accepted by `memoryStoreTool`. */
export interface MemoryStoreParams {
// Identifier of the memory entry; echoed back in the tool response.
key: string;
// Content to remember. NOTE(review): currently only written to the log, not persisted.
value: string;
// Origin of the memory — presumably the note or chunk it came from; confirm with callers.
source: string;
}
/**
 * Placeholder for the `obsidian_rag_memory_store` tool.
 *
 * Nothing is persisted yet: the call is logged and acknowledged. A real
 * OpenClaw integration would hand the entry to the agent's memory system.
 *
 * @param params Entry to store.
 * @returns A `"healthy"` envelope with `stored: true` and the entry's key.
 */
export async function memoryStoreTool(
  params: MemoryStoreParams,
): Promise<ResponseEnvelope<{ stored: boolean; key: string }>> {
  const { key, value, source } = params;
  console.log(`[obsidian-rag] memory_store: ${key} = ${value} (source: ${source})`);
  const acknowledgement = { stored: true, key };
  return makeEnvelope("healthy", acknowledgement, null);
}

44
src/tools/search.ts Normal file
View File

@@ -0,0 +1,44 @@
/** obsidian_rag_search tool implementation. */
import type { ObsidianRagConfig } from "../utils/config.js";
import type { ResponseEnvelope } from "../utils/types.js";
import type { SearchResult } from "../utils/types.js";
import { makeEnvelope } from "../utils/response.js";
import { searchVectorDb } from "../utils/lancedb.js";
/** Parameters accepted by `searchTool`; all filters are forwarded to `searchVectorDb`. */
export interface SearchParams {
// Free-text query; it is embedded and used for the vector search.
query: string;
// Maximum number of rows to return; `searchTool` defaults this to 5.
max_results?: number;
// Restricts results to rows whose `source_directory` is one of these values.
directory_filter?: string[];
// Inclusive bounds compared against the row's `date` column. Format presumably ISO dates — confirm against the indexer.
date_range?: { from?: string; to?: string };
// Every listed tag must be present in the row's `tags` list.
tags?: string[];
}
/**
 * `obsidian_rag_search` tool: semantic search over the indexed vault.
 *
 * @param config Plugin configuration.
 * @param params Query text and optional filters; `max_results` defaults to 5.
 * @returns An envelope that is
 *   - `"healthy"` with the results when at least one row matched,
 *   - `"degraded"` with an empty result list when nothing matched,
 *   - `"degraded"` with a `SEARCH_FAILED` error when the search threw.
 *   `meta.query_time_ms` is the measured wall-clock duration of the search.
 */
export async function searchTool(
  config: ObsidianRagConfig,
  params: SearchParams,
): Promise<ResponseEnvelope<{ results: SearchResult[]; sensitive_detected: boolean } | null>> {
  // Measured so `meta.query_time_ms` reports the real duration instead of 0.
  const startedAt = Date.now();
  try {
    const results = await searchVectorDb(config, params.query, {
      max_results: params.max_results ?? 5,
      directory_filter: params.directory_filter,
      date_range: params.date_range,
      tags: params.tags,
    });
    // TODO: Run sensitive content detection once we have actual results
    return makeEnvelope(
      results.length > 0 ? "healthy" : "degraded",
      { results, sensitive_detected: false },
      null,
      { query_time_ms: Date.now() - startedAt, chunks_scanned: results.length },
    );
  } catch (err) {
    return makeEnvelope(
      "degraded",
      null,
      {
        code: "SEARCH_FAILED",
        message: String(err),
        recoverable: true,
        suggestion: "Check if the index exists with obsidian_rag_status",
      },
      { query_time_ms: Date.now() - startedAt },
    );
  }
}

44
src/tools/status.ts Normal file
View File

@@ -0,0 +1,44 @@
/** obsidian_rag_status tool implementation. */
import type { ObsidianRagConfig } from "../utils/config.js";
import type { ResponseEnvelope } from "../utils/types.js";
import { makeEnvelope } from "../utils/response.js";
import { readSyncResult } from "../services/indexer-bridge.js";
/**
 * `obsidian_rag_status` tool: reports index statistics and dependency health.
 *
 * Health follows the same rules as the health state machine:
 * - no sync result on disk -> `"unavailable"`
 * - Ollama not reachable   -> `"degraded"` (previously reported as healthy)
 * - otherwise              -> `"healthy"`
 *
 * @param config Plugin configuration.
 * @returns An envelope with counters from the last sync result.
 *          `unindexed_files` and `active_job` are not tracked yet and are
 *          always `0` / `null`.
 */
export async function statusTool(
  config: ObsidianRagConfig,
): Promise<ResponseEnvelope<{
  plugin_health: string;
  total_docs: number;
  total_chunks: number;
  last_sync: string | null;
  unindexed_files: number;
  ollama_status: string;
  active_job: null;
}>> {
  const sync = readSyncResult(config);
  const ollamaUp = await checkOllama(config);

  let health: "healthy" | "degraded" | "unavailable";
  if (!sync) {
    health = "unavailable";
  } else if (!ollamaUp) {
    health = "degraded";
  } else {
    health = "healthy";
  }

  return makeEnvelope(
    health,
    {
      plugin_health: health,
      total_docs: sync?.indexed_files ?? 0,
      total_chunks: sync?.total_chunks ?? 0,
      last_sync: sync?.timestamp ?? null,
      unindexed_files: 0,
      ollama_status: ollamaUp ? "up" : "down",
      active_job: null,
    },
    null,
  );
}
/**
 * Checks whether Ollama answers at `<embedding.base_url>/api/tags` within 3 seconds.
 *
 * @param config Plugin configuration.
 * @returns `true` on an OK response; `false` on any non-OK response or error.
 */
async function checkOllama(config: ObsidianRagConfig): Promise<boolean> {
  let up = false;
  try {
    const endpoint = `${config.embedding.base_url}/api/tags`;
    const response = await fetch(endpoint, { signal: AbortSignal.timeout(3000) });
    up = response.ok;
  } catch {
    up = false;
  }
  return up;
}

111
src/utils/config.ts Normal file
View File

@@ -0,0 +1,111 @@
/** Config loader + TypeScript interfaces mirroring the Python config. */
import { readFileSync } from "fs";
import { resolve } from "path";
/** Embedding backend settings. */
export interface EmbeddingConfig {
// Backend name; the built-in default is "ollama".
provider: string;
// Model name sent in the embedding request body.
model: string;
// Base URL used to build the `/api/tags` and `/api/embeddings` endpoints.
base_url: string;
// Embedding vector length (default 1024). Not read by the TypeScript code shown here — presumably used by the Python indexer.
dimensions: number;
// Default 64. Presumably the indexer's embedding batch size — confirm on the Python side.
batch_size: number;
}
/** Location and kind of the vector index. */
export interface VectorStoreConfig {
// Store kind; the built-in default is "lancedb".
type: string;
// Path of the vector database; see `resolveVectorDbPath` for how relative paths are resolved.
path: string;
}
/** Settings controlling which files are indexed and how they are chunked. */
export interface IndexingConfig {
// Chunking parameters (defaults 500 / 100). Units are not visible here — confirm on the Python side.
chunk_size: number;
chunk_overlap: number;
// File name patterns to index; default ["*.md"].
file_patterns: string[];
// Directory names whose files are skipped; the vault watcher checks file paths against this list.
deny_dirs: string[];
// Default is empty. Presumably an allow list that restricts indexing when non-empty — confirm.
allow_dirs: string[];
}
/** Sensitive-content settings. Not read by the TypeScript code shown here. */
export interface SecurityConfig {
require_confirmation_for: string[];
sensitive_sections: string[];
local_only: boolean;
}
/** Keyword lists per memory category. */
export interface MemoryPatterns {
financial: string[];
health: string[];
commitments: string[];
}
/** Memory-suggestion settings. Not read by the TypeScript code shown here. */
export interface MemoryConfig {
auto_suggest: boolean;
patterns: MemoryPatterns;
}
/** Root plugin configuration as returned by `loadConfig`. */
export interface ObsidianRagConfig {
// Vault directory; resolved against the current working directory when probed.
vault_path: string;
embedding: EmbeddingConfig;
vector_store: VectorStoreConfig;
indexing: IndexingConfig;
security: SecurityConfig;
memory: MemoryConfig;
}
/**
 * Builds the built-in configuration used when no config file is available
 * and as the base that user settings are merged onto.
 *
 * @returns A fresh object on every call, so callers may mutate it freely.
 */
function defaults(): ObsidianRagConfig {
  const embedding = {
    provider: "ollama",
    model: "mxbai-embed-large",
    base_url: "http://localhost:11434",
    dimensions: 1024,
    batch_size: 64,
  };

  const vectorStore = {
    type: "lancedb",
    path: "./obsidian-rag/vectors.lance",
  };

  const indexing = {
    chunk_size: 500,
    chunk_overlap: 100,
    file_patterns: ["*.md"],
    deny_dirs: [".obsidian", ".trash", "zzz-Archive", ".git", ".logseq"],
    allow_dirs: [] as string[],
  };

  const security = {
    require_confirmation_for: ["health", "financial_debt"],
    sensitive_sections: ["#mentalhealth", "#physicalhealth", "#Relations"],
    local_only: true,
  };

  const memory = {
    auto_suggest: true,
    patterns: {
      financial: ["owe", "owed", "debt", "paid", "$", "spent", "spend"],
      health: ["#mentalhealth", "#physicalhealth", "medication", "therapy"],
      commitments: ["shopping list", "costco", "amazon", "grocery"],
    },
  };

  return {
    vault_path: "./KnowledgeVault/Default",
    embedding,
    vector_store: vectorStore,
    indexing,
    security,
    memory,
  };
}
/**
 * Loads the plugin configuration, merged on top of the built-in defaults.
 *
 * Never throws. The defaults are returned when the file is missing (silently,
 * since running without a config file is a normal setup) or when it cannot be
 * read or is not a JSON object (with a warning, so a broken config is not
 * mistaken for a working one).
 *
 * @param configPath Config file to read; defaults to
 *                   `<cwd>/.obsidian-rag/config.json`.
 * @returns The effective configuration.
 */
export function loadConfig(configPath?: string): ObsidianRagConfig {
  const path = configPath ?? resolve(process.cwd(), ".obsidian-rag", "config.json");

  let text: string;
  try {
    text = readFileSync(path, "utf-8");
  } catch (err) {
    const missing =
      typeof err === "object" &&
      err !== null &&
      (err as { code?: unknown }).code === "ENOENT";
    if (!missing) {
      console.warn(`[obsidian-rag] Cannot read config at ${path}; using defaults: ${String(err)}`);
    }
    return defaults();
  }

  try {
    const raw: unknown = JSON.parse(text);
    // Only a JSON object can be merged; an array or scalar is a broken config.
    if (typeof raw !== "object" || raw === null || Array.isArray(raw)) {
      throw new Error("top-level value must be a JSON object");
    }
    return deepMerge(defaults(), raw as Partial<ObsidianRagConfig>);
  } catch (err) {
    console.warn(`[obsidian-rag] Invalid config at ${path}; using defaults: ${String(err)}`);
    return defaults();
  }
}
/**
 * Recursively merges `source` onto a shallow copy of `target`.
 *
 * - Plain objects are merged key by key; arrays and scalars replace the
 *   target value wholesale.
 * - `undefined` values in `source` are skipped; `null` overrides.
 * - When `source` has an object where `target` has a non-object (or nothing),
 *   the source object is merged onto an empty object.
 * - The keys `__proto__`, `constructor`, and `prototype` are ignored so that
 *   parsed JSON cannot replace the result's prototype.
 *
 * Neither argument is mutated.
 *
 * @param target Base values.
 * @param source Overrides.
 * @returns The merged copy.
 */
function deepMerge<T extends object>(target: T, source: Partial<T>): T {
  const isPlainObject = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);
  const unsafeKeys = new Set(["__proto__", "constructor", "prototype"]);

  const merged: Record<string, unknown> = {};
  Object.assign(merged, target);

  for (const [key, val] of Object.entries(source)) {
    if (unsafeKeys.has(key) || val === undefined) continue;
    if (isPlainObject(val)) {
      const base = Object.prototype.hasOwnProperty.call(merged, key) ? merged[key] : undefined;
      merged[key] = deepMerge(isPlainObject(base) ? base : {}, val);
    } else {
      merged[key] = val;
    }
  }
  return merged as T;
}

100
src/utils/lancedb.ts Normal file
View File

@@ -0,0 +1,100 @@
/** LanceDB client for TypeScript — searches the pre-built index. */
import { resolve } from "path";
import type { ObsidianRagConfig } from "./config.js";
import type { SearchResult } from "./types.js";
/**
 * Resolves `config.vector_store.path` to the path handed to LanceDB.
 *
 * An absolute path (leading `/` or a drive letter such as `C:`) is returned
 * unchanged, unless it contains `../` or starts with `./obsidian-rag/`.
 * Everything else is resolved against the current working directory.
 *
 * @param config Plugin configuration.
 * @returns The vector database path.
 */
export function resolveVectorDbPath(config: ObsidianRagConfig): string {
  const configured = config.vector_store.path;
  const looksAbsolute = configured.startsWith("/") || /^[A-Za-z]:/.test(configured);
  const needsResolving = configured.startsWith("./obsidian-rag/") || configured.includes("../");
  return looksAbsolute && !needsResolving ? configured : resolve(process.cwd(), configured);
}
/**
 * Embeds `text` through `POST <embedding.base_url>/api/embeddings`.
 *
 * Accepts both response shapes: `{ embedding: number[] }` and
 * `{ embeddings: number[][] }` (first vector is used).
 *
 * @param text   Text to embed.
 * @param config Plugin configuration; supplies the base URL and model name.
 * @returns The embedding vector (never empty).
 * @throws Error when the request fails, times out (30 s), returns a non-OK
 *         status, or the response contains no embedding vector.
 */
export async function embedQuery(
  text: string,
  config: ObsidianRagConfig,
): Promise<number[]> {
  const url = `${config.embedding.base_url}/api/embeddings`;
  const response = await fetch(url, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ model: config.embedding.model, prompt: text }),
    signal: AbortSignal.timeout(30_000),
  });
  if (!response.ok) {
    throw new Error(`Embedding request failed: ${response.status} ${response.statusText}`);
  }
  const data = (await response.json()) as { embedding?: number[]; embeddings?: number[][] };
  const vector = data.embedding ?? data.embeddings?.[0];
  // Fail here with a clear message rather than handing an empty vector to
  // the vector search, which would fail later with an unrelated error.
  if (!Array.isArray(vector) || vector.length === 0) {
    throw new Error("Embedding response contained no embedding vector");
  }
  return vector;
}
/** Quotes a value as a SQL string literal, doubling embedded single quotes. */
function sqlStringLiteral(value: string): string {
  return `'${String(value).replace(/'/g, "''")}'`;
}

/** Builds the SQL filter for a search, or `undefined` when no filter applies. */
function buildSearchFilter(options: {
  directory_filter?: string[];
  date_range?: { from?: string; to?: string };
  tags?: string[];
}): string | undefined {
  const conditions: string[] = [];

  if (options.directory_filter && options.directory_filter.length > 0) {
    // Single quotes: in SQL, double quotes denote identifiers (column names),
    // not string values.
    const dirs = options.directory_filter.map(sqlStringLiteral).join(", ");
    conditions.push(`source_directory IN (${dirs})`);
  }
  if (options.date_range?.from) {
    conditions.push(`date >= ${sqlStringLiteral(options.date_range.from)}`);
  }
  if (options.date_range?.to) {
    conditions.push(`date <= ${sqlStringLiteral(options.date_range.to)}`);
  }
  for (const tag of options.tags ?? []) {
    // LanceDB stores tags as List<String>; use array_contains SQL function
    conditions.push(`array_contains(tags, ${sqlStringLiteral(tag)})`);
  }

  return conditions.length > 0 ? conditions.join(" AND ") : undefined;
}

/**
 * Runs a vector search against the `obsidian_chunks` table.
 *
 * The query text is embedded with `embedQuery`; the optional filters are
 * combined with AND. All filter values are emitted as escaped SQL string
 * literals, so quotes in tags, directories, or dates cannot alter the filter.
 *
 * @param config  Plugin configuration.
 * @param query   Free-text query.
 * @param options Optional filters; `max_results` defaults to 5, and values
 *                that are not a finite number >= 1 fall back to 5.
 * @returns Matching chunks; an empty list when the table does not exist yet.
 *          `score` carries LanceDB's `_distance` value.
 * @throws Error when connecting, embedding, or querying fails.
 */
export async function searchVectorDb(
  config: ObsidianRagConfig,
  query: string,
  options: {
    max_results?: number;
    directory_filter?: string[];
    date_range?: { from?: string; to?: string };
    tags?: string[];
  } = {},
): Promise<SearchResult[]> {
  const dbPath = resolveVectorDbPath(config);

  // Dynamically import LanceDB to avoid issues at import time when not needed
  const { connect } = await import("@lancedb/lancedb");
  const db = await connect(dbPath);
  const tableNames = await db.tableNames();
  if (!tableNames.includes("obsidian_chunks")) {
    return [];
  }
  const table = await db.openTable("obsidian_chunks");

  // Embed the query text
  const queryVector = await embedQuery(query, config);

  const whereClause = buildSearchFilter(options);
  const requested = options.max_results ?? 5;
  const limit = Number.isFinite(requested) && requested >= 1 ? Math.floor(requested) : 5;

  // LanceDB JS SDK: table.vectorSearch(vector).filter(...).limit(...).toArray()
  let queryBuilder = table.vectorSearch(queryVector);
  if (whereClause) {
    queryBuilder = queryBuilder.filter(whereClause);
  }
  const rows = await queryBuilder.limit(limit).toArray();

  return rows.map((r: Record<string, unknown>) => ({
    chunk_id: r["chunk_id"] as string,
    chunk_text: r["chunk_text"] as string,
    source_file: r["source_file"] as string,
    source_directory: r["source_directory"] as string,
    section: (r["section"] as string) ?? null,
    date: (r["date"] as string) ?? null,
    tags: (r["tags"] as string[]) ?? [],
    chunk_index: (r["chunk_index"] as number) ?? 0,
    score: (r["_distance"] as number) ?? 0.0,
  }));
}

32
src/utils/response.ts Normal file
View File

@@ -0,0 +1,32 @@
/** Response envelope factory + error normalization. */
import type { ResponseEnvelope } from "./types.js";
/**
 * Builds a response envelope.
 *
 * @param status Health status to report.
 * @param data   Payload, or `null`.
 * @param error  Error details, or `null`.
 * @param meta   Overrides for the default metadata. Defaults are zero
 *               counters, index version `"0.1.0"`, and the current time as
 *               `vault_mtime`.
 * @returns The assembled envelope.
 */
export function makeEnvelope<T>(
  status: "healthy" | "degraded" | "unavailable",
  data: T | null,
  error: ResponseEnvelope<T>["error"],
  meta: Partial<ResponseEnvelope<T>["meta"]> = {},
): ResponseEnvelope<T> {
  const baseMeta = {
    query_time_ms: 0,
    chunks_scanned: 0,
    index_version: "0.1.0",
    vault_mtime: new Date().toISOString(),
  };
  const mergedMeta = { ...baseMeta, ...meta };
  return { status, data, error, meta: mergedMeta };
}
/**
 * Shorthand for an `"unavailable"` envelope that carries only an error.
 *
 * @param code        Machine-readable error code.
 * @param message     Human-readable description.
 * @param recoverable Whether retrying can succeed.
 * @param suggestion  Hint for resolving the problem.
 * @returns An envelope with `data: null` and default metadata.
 */
export function errorEnvelope(
  code: string,
  message: string,
  recoverable: boolean,
  suggestion: string,
) {
  const error = { code, message, recoverable, suggestion };
  return makeEnvelope<null>("unavailable", null, error);
}

32
src/utils/types.ts Normal file
View File

@@ -0,0 +1,32 @@
/** Shared TypeScript types across the plugin. */
/** One chunk returned by a vector search; fields map to columns of the `obsidian_chunks` table. */
export interface SearchResult {
chunk_id: string;
chunk_text: string;
source_file: string;
source_directory: string;
// null when the row has no section value.
section: string | null;
// null when the row has no date value.
date: string | null;
// Empty when the row has no tags.
tags: string[];
chunk_index: number;
// Populated from LanceDB's `_distance` column (0 when absent).
score: number;
}
/** Uniform wrapper returned by every tool. */
export interface ResponseEnvelope<T> {
status: "healthy" | "degraded" | "unavailable";
// Tool payload; null when the call failed.
data: T | null;
// Error details; null when the call succeeded.
error: {
code: string;
message: string;
recoverable: boolean;
suggestion: string;
} | null;
meta: {
query_time_ms: number;
chunks_scanned: number;
index_version: string;
// NOTE(review): `makeEnvelope` fills this with the current time, not a vault modification time.
vault_mtime: string;
};
}
/** Same union as `ResponseEnvelope["status"]`. */
export type ToolStatus = "healthy" | "degraded" | "unavailable";