Files
obsidian-rag/src/utils/lancedb.ts
Santhosh Janardhanan e15e4ff856 fix: add configSchema to openclaw.plugin.json, add search CLI command, fix total_docs stat
- Add required configSchema to openclaw.plugin.json for OpenClaw plugin discovery
- Add search command to CLI with --limit, --dir, --from-date, --to-date, --tags filters
- Fix get_stats() to properly count unique docs (was returning 0 for non-null values)
- Remove hardcoded max_results default of 5; search now returns all results by default
- Update INSTALL.md and design docs with correct OpenClaw extension path instructions
2026-04-11 20:01:09 -04:00

116 lines
4.0 KiB
TypeScript

/** LanceDB client for TypeScript — searches the pre-built index. */
import { resolve } from "path";
import type { ObsidianRagConfig } from "./config.js";
import type { SearchResult } from "./types.js";
/**
 * Determines the filesystem location of the vector store from `config.vector_store.path`.
 *
 * A path that looks absolute (leading `/` or a drive-letter prefix such as `C:`)
 * is returned untouched, unless it starts with `./obsidian-rag/` or contains
 * `../`. Every other path is resolved against the current working directory.
 *
 * @param config - Plugin configuration; only `vector_store.path` is read.
 * @returns The configured path as-is, or its resolution against `process.cwd()`.
 */
export function resolveVectorDbPath(config: ObsidianRagConfig): string {
  const configuredPath = config.vector_store.path;
  const looksAbsolute = configuredPath.startsWith("/") || /^[A-Za-z]:/.test(configuredPath);
  // Nested data-dir paths and anything with a parent-directory hop always go through resolve().
  const forceResolve = configuredPath.startsWith("./obsidian-rag/") || configuredPath.includes("../");
  if (looksAbsolute && !forceResolve) {
    return configuredPath;
  }
  return resolve(process.cwd(), configuredPath);
}
/**
 * Requests an embedding vector for `text` from the configured embedding server.
 *
 * Sends a JSON POST to `<base_url>/api/embeddings` carrying the configured model
 * and the text as `prompt`; the request is aborted after 30 seconds. The response
 * may carry the vector either as `embedding` (a single vector) or as `embeddings`
 * (a list of vectors, of which the first is used).
 *
 * @param text - The text to embed.
 * @param config - Plugin configuration; `embedding.base_url` and `embedding.model` are read.
 * @returns The non-empty embedding vector.
 * @throws Error when the server answers with a non-OK status, or when the response
 *   contains no non-empty vector. Rejections from `fetch` itself (network failure,
 *   timeout) propagate unchanged.
 */
export async function embedQuery(
  text: string,
  config: ObsidianRagConfig,
): Promise<number[]> {
  // Drop trailing slashes so "http://host:11434/" does not produce "//api/embeddings".
  const baseUrl = config.embedding.base_url.replace(/\/+$/, "");
  const response = await fetch(`${baseUrl}/api/embeddings`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ model: config.embedding.model, prompt: text }),
    signal: AbortSignal.timeout(30_000),
  });
  if (!response.ok) {
    throw new Error(`Embedding request failed: ${response.status} ${response.statusText}`);
  }

  const data = (await response.json()) as { embedding?: unknown; embeddings?: unknown } | null;
  const firstOfMany = Array.isArray(data?.embeddings) ? (data.embeddings as unknown[])[0] : undefined;
  // Prefer `embedding`, but fall through to `embeddings[0]` when it is missing or empty.
  for (const candidate of [data?.embedding, firstOfMany]) {
    if (Array.isArray(candidate) && candidate.length > 0) {
      return candidate as number[];
    }
  }
  // An empty vector is never a usable query; fail loudly instead of returning [].
  throw new Error("Embedding response did not contain an embedding vector");
}
/** Renders `value` as a single-quoted SQL string literal, doubling embedded single quotes. */
function quoteSqlString(value: string): string {
  return `'${value.replace(/'/g, "''")}'`;
}

/**
 * Builds the SQL filter predicate for the search filters, or `undefined` when no
 * filter is active. All conditions are combined with `AND`.
 */
function buildWhereClause(filters: {
  directory_filter?: string[];
  date_range?: { from?: string; to?: string };
  tags?: string[];
}): string | undefined {
  const conditions: string[] = [];
  const { directory_filter: directories, date_range: dateRange, tags } = filters;

  if (directories && directories.length > 0) {
    // Single-quoted literals: in standard SQL double quotes delimit identifiers, not strings.
    const literals = directories.map((dir) => quoteSqlString(dir)).join(", ");
    conditions.push(`source_directory IN (${literals})`);
  }
  if (dateRange?.from) {
    conditions.push(`date >= ${quoteSqlString(dateRange.from)}`);
  }
  if (dateRange?.to) {
    conditions.push(`date <= ${quoteSqlString(dateRange.to)}`);
  }
  for (const tag of tags ?? []) {
    // LanceDB stores tags as List<String>; use array_contains SQL function
    conditions.push(`array_contains(tags, ${quoteSqlString(tag)})`);
  }

  return conditions.length > 0 ? conditions.join(" AND ") : undefined;
}

/**
 * Searches the `obsidian_chunks` table of the pre-built LanceDB index.
 *
 * The query text is embedded with `embedQuery` and used for a vector search. If
 * anything in that path throws (embedding request or the vector query itself),
 * the search is retried as a full-text search on the raw query string with the
 * same filters and limit.
 *
 * @param config - Plugin configuration; locates the database and the embedding server.
 * @param query - Free-text search query.
 * @param options - Optional filters:
 *   - `max_results`: passed to the query builder's `limit()`; when omitted no
 *     explicit limit is set. NOTE(review): the library's own default limit then
 *     applies — confirm this matches the intended "return all results" behaviour.
 *   - `directory_filter`: keep rows whose `source_directory` is one of these values.
 *   - `date_range`: inclusive `from` / `to` bounds compared against `date`.
 *   - `tags`: every listed tag must be contained in the row's `tags`.
 * @returns Matching chunks, or an empty array when the table does not exist.
 *   `score` is the row's `_score` when present, otherwise `_distance`, otherwise 0.
 * @throws Whatever the database connection or the fallback full-text query throws.
 */
export async function searchVectorDb(
  config: ObsidianRagConfig,
  query: string,
  options: {
    max_results?: number;
    directory_filter?: string[];
    date_range?: { from?: string; to?: string };
    tags?: string[];
  } = {},
): Promise<SearchResult[]> {
  const dbPath = resolveVectorDbPath(config);
  // Dynamically import LanceDB to avoid issues at import time when not needed
  const { connect } = await import("@lancedb/lancedb");
  const db = await connect(dbPath);
  const tableNames = await db.tableNames();
  if (!tableNames.includes("obsidian_chunks")) {
    return [];
  }
  const table = await db.openTable("obsidian_chunks");

  // Filter values come from callers (CLI flags / tool arguments), so they are
  // escaped before being embedded in the predicate.
  const whereClause = buildWhereClause(options);
  const limit = options.max_results;

  // Try vector search first; if Ollama is down embedQuery throws → fallback to FTS
  let rows: Record<string, unknown>[];
  try {
    const queryVector = await embedQuery(query, config);
    let queryBuilder = table.vectorSearch(queryVector);
    if (whereClause) {
      queryBuilder = queryBuilder.filter(whereClause);
    }
    if (limit !== undefined) {
      queryBuilder = queryBuilder.limit(limit);
    }
    rows = await queryBuilder.toArray();
  } catch {
    // Deliberate best-effort: any failure above falls back to full-text search.
    let ftsBuilder = table.query().fullTextSearch(query);
    if (whereClause) {
      ftsBuilder = ftsBuilder.filter(whereClause);
    }
    if (limit !== undefined) {
      ftsBuilder = ftsBuilder.limit(limit);
    }
    rows = await ftsBuilder.toArray();
  }

  return rows.map((r: Record<string, unknown>) => ({
    chunk_id: r["chunk_id"] as string,
    chunk_text: r["chunk_text"] as string,
    source_file: r["source_file"] as string,
    source_directory: r["source_directory"] as string,
    section: (r["section"] as string) ?? null,
    date: (r["date"] as string) ?? null,
    tags: (r["tags"] as string[]) ?? [],
    chunk_index: (r["chunk_index"] as number) ?? 0,
    // Vector rows carry `_distance`, full-text rows carry `_score`; the two use different scales.
    score: (r["_score"] as number) ?? (r["_distance"] as number) ?? 0.0,
  }));
}