Sprint 0-1: Python indexer, TS plugin scaffolding, and test suite

## What's new

**Python indexer (`python/obsidian_rag/`)** — full pipeline from scan to LanceDB:
- `config.py` — JSON config loader with cross-platform path resolution
- `security.py` — path traversal prevention, HTML stripping, sensitive content detection, dir allow/deny lists
- `chunker.py` — section-split for journal entries (date-named files), sliding-window for unstructured notes
- `embedder.py` — Ollama `/api/embeddings` client with batched requests and timeout/error handling
- `vector_store.py` — LanceDB schema, upsert (merge_insert), delete, search with filters, stats
- `indexer.py` — full/sync/reindex pipeline orchestrator with progress yields
- `cli.py` — `index | sync | reindex | status` CLI commands

**TypeScript plugin (`src/`)** — OpenClaw plugin scaffold:
- `utils/` — config loader, TypeScript types, response envelope factory, LanceDB client
- `services/` — health state machine (HEALTHY/DEGRADED/UNAVAILABLE), vault watcher with debounce/batching, indexer bridge (subprocess spawner)
- `tools/` — 4 tool stubs: search, index, status, memory_store (OpenClaw wiring pending)
- `index.ts` — plugin entry point with health probe + vault watcher startup

**Config** (`obsidian-rag/config.json`, `openclaw.plugin.json`):
- 627 files / 3764 chunks indexed in dev vault

**Tests: 76 passing**
- Python: 64 pytest tests (chunker, security, vector_store, config)
- TypeScript: 12 vitest tests (lancedb client, response envelope)

## Bugs fixed

- LanceDB `tags` column filter: `LIKE '%tag%'` → `list_contains(tags, 'tag')` (List<String> column)
- LanceDB JS `db.list_tables()` returns `ListTablesResponse` object, not plain array
- LanceDB JS result score field: `_score` → `_distance`
- TypeScript regex literal with unescaped `/` in path-resolve regex
- Python: `create_table_if_not_exists` identity check → name comparison

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-10 22:56:50 -04:00
parent 18ad47e100
commit 5c281165c7
40 changed files with 5814 additions and 59 deletions

100
src/utils/lancedb.ts Normal file
View File

@@ -0,0 +1,100 @@
/** LanceDB client for TypeScript — searches the pre-built index. */
import { resolve } from "path";
import type { ObsidianRagConfig } from "./config.js";
import type { SearchResult } from "./types.js";
/**
 * Work out the filesystem location of the vector database from the config.
 *
 * A configured path that looks absolute (leading `/` or a drive-letter prefix
 * such as `C:`) and contains no `../` segment is handed back untouched. Every
 * other value — including `./obsidian-rag/...` paths and anything containing
 * `../` — is resolved against the current working directory.
 *
 * @param config - Plugin configuration; only `vector_store.path` is read.
 * @returns The path to pass to the LanceDB connection.
 */
export function resolveVectorDbPath(config: ObsidianRagConfig): string {
  const configuredPath = config.vector_store.path;
  const hasParentSegment = configuredPath.includes("../");
  const looksAbsolute =
    configuredPath.startsWith("/") || /^[A-Za-z]:/.test(configuredPath);

  // Only a clean absolute path is returned verbatim; `resolve` handles the rest
  // (for an absolute path with `../` it simply normalises the segments).
  if (looksAbsolute && !hasParentSegment) {
    return configuredPath;
  }
  return resolve(process.cwd(), configuredPath);
}
/**
 * Turn `text` into an embedding vector via the configured embedding server.
 *
 * Sends a JSON `POST` to `<base_url>/api/embeddings` carrying the configured
 * model name and the text as `prompt`. The request is aborted after 30 seconds.
 * The response may carry the vector either as `embedding` or as the first
 * entry of `embeddings`.
 *
 * @param text - Query text to embed.
 * @param config - Plugin configuration; `embedding.base_url` and
 *   `embedding.model` are read.
 * @returns The embedding vector for `text`.
 * @throws Error if the server answers with a non-OK status, or if the response
 *   contains no non-empty embedding vector.
 */
export async function embedQuery(
  text: string,
  config: ObsidianRagConfig,
): Promise<number[]> {
  // Drop trailing slashes so "http://host:11434/" does not produce "//api/embeddings".
  const baseUrl = config.embedding.base_url.replace(/\/+$/, "");
  const url = `${baseUrl}/api/embeddings`;
  const response = await fetch(url, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ model: config.embedding.model, prompt: text }),
    signal: AbortSignal.timeout(30_000),
  });
  if (!response.ok) {
    throw new Error(`Embedding request failed: ${response.status} ${response.statusText}`);
  }
  const data = (await response.json()) as { embedding?: number[]; embeddings?: number[][] };
  const vector = data.embedding ?? data.embeddings?.[0];
  // Fail loudly here rather than handing an empty vector to the vector search,
  // where it would surface as an unrelated-looking error.
  if (!Array.isArray(vector) || vector.length === 0) {
    throw new Error("Embedding response did not contain an embedding vector");
  }
  return vector;
}
/** Render `value` as a single-quoted SQL string literal, doubling any embedded quotes. */
function sqlStringLiteral(value: string): string {
  return `'${value.replace(/'/g, "''")}'`;
}

/** Combine the optional search filters into one SQL predicate, or `undefined` when none apply. */
function buildWhereClause(filters: {
  directory_filter?: string[];
  date_range?: { from?: string; to?: string };
  tags?: string[];
}): string | undefined {
  const conditions: string[] = [];
  if (filters.directory_filter && filters.directory_filter.length > 0) {
    // String values must be single-quoted: double quotes denote identifiers in SQL.
    const dirs = filters.directory_filter.map((d) => sqlStringLiteral(d)).join(", ");
    conditions.push(`source_directory IN (${dirs})`);
  }
  if (filters.date_range?.from) {
    conditions.push(`date >= ${sqlStringLiteral(filters.date_range.from)}`);
  }
  if (filters.date_range?.to) {
    conditions.push(`date <= ${sqlStringLiteral(filters.date_range.to)}`);
  }
  for (const tag of filters.tags ?? []) {
    // `tags` is a list column, so each requested tag becomes its own
    // array_contains condition; all of them must match.
    conditions.push(`array_contains(tags, ${sqlStringLiteral(tag)})`);
  }
  return conditions.length > 0 ? conditions.join(" AND ") : undefined;
}

/**
 * Run a vector similarity search against the `obsidian_chunks` table.
 *
 * Connects to the database at the path derived from `config`, embeds `query`
 * with {@link embedQuery}, and returns the nearest rows, optionally narrowed by
 * directory, date range and tags. All filter values are escaped as SQL string
 * literals before being placed in the predicate.
 *
 * @param config - Plugin configuration (vector store path and embedding settings).
 * @param query - Free-text query to embed and search for.
 * @param options - Optional filters. `max_results` defaults to 5;
 *   `directory_filter` matches any listed `source_directory`; `date_range`
 *   bounds the `date` column inclusively; every entry in `tags` must be present.
 * @returns Matching chunks; `score` carries the row's `_distance` value. An
 *   empty array is returned when the table does not exist.
 * @throws Error propagated from the embedding request or from LanceDB.
 */
export async function searchVectorDb(
  config: ObsidianRagConfig,
  query: string,
  options: {
    max_results?: number;
    directory_filter?: string[];
    date_range?: { from?: string; to?: string };
    tags?: string[];
  } = {},
): Promise<SearchResult[]> {
  const dbPath = resolveVectorDbPath(config);
  // Dynamically import LanceDB to avoid issues at import time when not needed
  const { connect } = await import("@lancedb/lancedb");
  const db = await connect(dbPath);
  const tableNames = await db.tableNames();
  if (!tableNames.includes("obsidian_chunks")) {
    return [];
  }
  const table = await db.openTable("obsidian_chunks");

  // Embed the query text
  const queryVector = await embedQuery(query, config);

  const whereClause = buildWhereClause(options);
  const limit = options.max_results ?? 5;

  // LanceDB JS SDK: table.vectorSearch(vector).filter(...).limit(...).toArray()
  let queryBuilder = table.vectorSearch(queryVector);
  if (whereClause) {
    queryBuilder = queryBuilder.filter(whereClause);
  }
  const rows = await queryBuilder.limit(limit).toArray();

  return rows.map((r: Record<string, unknown>) => ({
    chunk_id: r["chunk_id"] as string,
    chunk_text: r["chunk_text"] as string,
    source_file: r["source_file"] as string,
    source_directory: r["source_directory"] as string,
    section: (r["section"] as string) ?? null,
    date: (r["date"] as string) ?? null,
    tags: (r["tags"] as string[]) ?? [],
    chunk_index: (r["chunk_index"] as number) ?? 0,
    score: (r["_distance"] as number) ?? 0.0,
  }));
}