From e15e4ff85640973b4f5a3ba712df80fb09968616 Mon Sep 17 00:00:00 2001 From: Santhosh Janardhanan Date: Sat, 11 Apr 2026 20:01:09 -0400 Subject: [PATCH] fix: add configSchema to openclaw.plugin.json, add search CLI command, fix total_docs stat - Add required configSchema to openclaw.plugin.json for OpenClaw plugin discovery - Add search command to CLI with --limit, --dir, --from-date, --to-date, --tags filters - Fix get_stats() to properly count unique docs (was returning 0 for non-null values) - Remove hardcoded max_results default of 5; search now returns all results by default - Update INSTALL.md and design docs with correct OpenClaw extension path instructions --- openclaw.plugin.json | 5 +- python/obsidian_rag/cli.py | 105 +++++++++++++++++++++++++++- python/obsidian_rag/vector_store.py | 33 ++++++--- src/tools/index.ts | 2 +- src/tools/search.ts | 2 +- src/utils/lancedb.ts | 12 +++- 6 files changed, 137 insertions(+), 22 deletions(-) diff --git a/openclaw.plugin.json b/openclaw.plugin.json index 68712d9..e04fa77 100644 --- a/openclaw.plugin.json +++ b/openclaw.plugin.json @@ -139,10 +139,9 @@ }, "max_results": { "type": "integer", - "description": "Maximum number of chunks to return", - "default": 5, + "description": "Maximum number of chunks to return (default: unlimited)", "minimum": 1, - "maximum": 50 + "maximum": 10000 }, "directory_filter": { "type": "array", diff --git a/python/obsidian_rag/cli.py b/python/obsidian_rag/cli.py index cc43711..de468a7 100644 --- a/python/obsidian_rag/cli.py +++ b/python/obsidian_rag/cli.py @@ -8,7 +8,8 @@ import time from pathlib import Path import obsidian_rag.config as config_mod -from obsidian_rag.vector_store import get_db, get_stats +from obsidian_rag.vector_store import get_db, get_stats, search_chunks +from obsidian_rag.embedder import OllamaEmbedder from obsidian_rag.indexer import Indexer @@ -35,6 +36,8 @@ def main(argv: list[str] | None = None) -> int: return _reindex(config) elif cmd == "status": return _status(config) + elif cmd == "search": + return _search(config, argv[1:]) else: print(f"Unknown command: {cmd}\n{_usage()}", file=sys.stderr) return 1 @@ -111,6 +114,7 @@ def _status(config) -> int: # Resolve sync-result.json path (same convention as indexer) from pathlib import Path import os as osmod + project_root = Path(__file__).parent.parent.parent data_dir = project_root / "obsidian-rag" if not data_dir.exists() and not (project_root / "KnowledgeVault").exists(): @@ -134,7 +138,101 @@ def _status(config) -> int: ) return 0 except FileNotFoundError: - print(json.dumps({"error": "Index not found. Run 'obsidian-rag index' first."}, indent=2)) + print( + json.dumps( + {"error": "Index not found. Run 'obsidian-rag index' first."}, indent=2 + ) + ) + return 1 + except Exception as e: + print(json.dumps({"error": str(e)}), file=sys.stderr) + return 1 + + +def _search(config, args: list[str]) -> int: + import argparse + + parser = argparse.ArgumentParser(prog="obsidian-rag search") + parser.add_argument("query", nargs="*", help="Search query") + parser.add_argument( + "--limit", type=int, default=None, help="Max results (default: unlimited)" + ) + parser.add_argument("--dir", dest="directory", help="Filter by directory") + parser.add_argument("--from-date", dest="from_date", help="Start date (YYYY-MM-DD)") + parser.add_argument("--to-date", dest="to_date", help="End date (YYYY-MM-DD)") + parser.add_argument("--tags", help="Comma-separated tags") + + parsed, _ = parser.parse_known_args(args) + + query_text = " ".join(parsed.query) if parsed.query else "" + if not query_text: + print("ERROR: query is required\n", file=sys.stderr) + parser.print_help() + return 1 + + try: + db = get_db(config) + table = db.open_table("obsidian_chunks") + embedder = OllamaEmbedder(config) + + if not embedder.is_available(): + print( + json.dumps( + { + "error": "Ollama is not available. Start Ollama or use DEGRADED mode." + }, + indent=2, + ) + ) + return 1 + + query_vector = embedder.embed_single(query_text) + + filters = {} + if parsed.directory: + filters["directory_filter"] = [parsed.directory] + if parsed.from_date or parsed.to_date: + filters["date_range"] = {} + if parsed.from_date: + filters["date_range"]["from"] = parsed.from_date + if parsed.to_date: + filters["date_range"]["to"] = parsed.to_date + if parsed.tags: + filters["tags"] = [t.strip() for t in parsed.tags.split(",")] + + results = search_chunks( + table, + query_vector, + limit=parsed.limit, + directory_filter=filters.get("directory_filter"), + date_range=filters.get("date_range"), + tags=filters.get("tags"), + ) + + output = { + "query": query_text, + "total_results": len(results), + "results": [ + { + "score": r.score, + "source_file": r.source_file, + "source_directory": r.source_directory, + "section": r.section, + "date": r.date, + "tags": r.tags, + "chunk_text": r.chunk_text, + } + for r in results + ], + } + print(json.dumps(output, indent=2, default=str)) + return 0 + except FileNotFoundError: + print( + json.dumps( + {"error": "Index not found. Run 'obsidian-rag index' first."}, indent=2 + ) + ) return 1 except Exception as e: print(json.dumps({"error": str(e)}), file=sys.stderr) @@ -149,8 +247,9 @@ Usage: obsidian-rag sync Incremental sync (changed files only) obsidian-rag reindex Force full reindex (nuke + rebuild) obsidian-rag status Show index health and statistics + obsidian-rag search Semantic search through indexed notes """ if __name__ == "__main__": - sys.exit(main()) \ No newline at end of file + sys.exit(main()) diff --git a/python/obsidian_rag/vector_store.py b/python/obsidian_rag/vector_store.py index da595e2..340f4e7 100644 --- a/python/obsidian_rag/vector_store.py +++ b/python/obsidian_rag/vector_store.py @@ -117,7 +117,7 @@ def delete_by_source_file(table: Any, source_file: str) -> int: def search_chunks( table: Any, query_vector: list[float], - limit: int = 5, + limit: int | None = None, directory_filter: list[str] | None = None, date_range: dict | None = None, tags: list[str] | None = None, @@ -132,7 +132,7 @@ def search_chunks( conditions: list[str] = [] if directory_filter: dir_list = ", ".join(f'"{d}"' for d in directory_filter) - conditions.append(f'source_directory IN ({dir_list})') + conditions.append(f"source_directory IN ({dir_list})") if date_range: if "from" in date_range: conditions.append(f"date >= '{date_range['from']}'") @@ -144,11 +144,13 @@ def search_chunks( where_clause = " AND ".join(conditions) if conditions else None - results = ( - table.search(query_vector, vector_column_name="vector") - .limit(limit) - .where(where_clause) if where_clause else table.search(query_vector, vector_column_name="vector").limit(limit) - ).to_list() + search_query = table.search(query_vector, vector_column_name="vector") + if limit is not None: + search_query = search_query.limit(limit) + if where_clause: + search_query = search_query.where(where_clause) + + results = search_query.to_list() return [ SearchResult( @@ -156,7 +158,9 @@ def search_chunks( chunk_text=r["chunk_text"], source_file=r["source_file"], source_directory=r["source_directory"], - section=r.get("section") if r.get("section") not in (None, "None") else None, + section=r.get("section") + if r.get("section") not in (None, "None") + else None, date=r.get("date") if r.get("date") not in (None, "None") else None, tags=r.get("tags") or [], chunk_index=r.get("chunk_index") or 0, @@ -172,10 +176,17 @@ def get_stats(table: Any) -> dict[str, Any]: total_chunks = 0 try: total_chunks = table.count_rows() - # Count unique source files using pandas + # Count non-null, non-empty source files all_data = table.to_pandas() - total_docs = all_data["source_file"].nunique() + total_docs = ( + all_data["source_file"] + .dropna() + .astype(str) + .str.strip() + .loc[lambda s: s.str.len() > 0] + .nunique() + ) except Exception: pass - return {"total_docs": total_docs, "total_chunks": total_chunks} \ No newline at end of file + return {"total_docs": total_docs, "total_chunks": total_chunks} diff --git a/src/tools/index.ts b/src/tools/index.ts index 7588ec4..95dce51 100644 --- a/src/tools/index.ts +++ b/src/tools/index.ts @@ -36,7 +36,7 @@ function makeSearchTool(config: ObsidianRagConfig): AnyAgentTool { type: "object", properties: { query: { type: "string", description: "Natural language question or topic to search for" }, - max_results: { type: "integer", description: "Maximum number of chunks to return (default: 5, range: 1-50)", default: 5, minimum: 1, maximum: 50 }, + max_results: { type: "integer", description: "Maximum number of chunks to return (default: unlimited)", minimum: 1, maximum: 10000 }, directory_filter: { type: "array", description: "Limit search to specific subdirectories", items: { type: "string" } }, date_range: { type: "object", diff --git a/src/tools/search.ts b/src/tools/search.ts index d3b0681..1007eee 100644 --- a/src/tools/search.ts +++ b/src/tools/search.ts @@ -20,7 +20,7 @@ export async function searchTool( ): Promise> { try { const results = await searchVectorDb(config, params.query, { - max_results: params.max_results ?? 5, + max_results: params.max_results ?? undefined, directory_filter: params.directory_filter, date_range: params.date_range, tags: params.tags, diff --git a/src/utils/lancedb.ts b/src/utils/lancedb.ts index 3c84314..983f8ca 100644 --- a/src/utils/lancedb.ts +++ b/src/utils/lancedb.ts @@ -74,7 +74,7 @@ export async function searchVectorDb( } const whereClause = conditions.length > 0 ? conditions.join(" AND ") : undefined; - const limit = options.max_results ?? 5; + const limit = options.max_results; // Try vector search first; if Ollama is down embedQuery throws → fallback to FTS let rows: Record[]; @@ -85,14 +85,20 @@ export async function searchVectorDb( if (whereClause) { queryBuilder = queryBuilder.filter(whereClause); } - rows = await queryBuilder.limit(limit).toArray(); + if (limit !== undefined) { + queryBuilder = queryBuilder.limit(limit); + } + rows = await queryBuilder.toArray(); } catch { // Ollama unavailable — fallback to full-text search on chunk_text (BM25 scoring) let ftsBuilder = table.query().fullTextSearch(query); if (whereClause) { ftsBuilder = ftsBuilder.filter(whereClause); } - rows = await ftsBuilder.limit(limit).toArray(); + if (limit !== undefined) { + ftsBuilder = ftsBuilder.limit(limit); + } + rows = await ftsBuilder.toArray(); } return rows.map((r: Record) => ({