From e15e4ff85640973b4f5a3ba712df80fb09968616 Mon Sep 17 00:00:00 2001
From: Santhosh Janardhanan <santhoshj@gmail.com>
Date: Sat, 11 Apr 2026 20:01:09 -0400
Subject: [PATCH] fix: add configSchema to openclaw.plugin.json, add search CLI
 command, fix total_docs stat

- Add required configSchema to openclaw.plugin.json for OpenClaw plugin discovery
- Add search command to CLI with --limit, --dir, --from-date, --to-date, --tags filters
- Fix get_stats() so total_docs counts unique non-null, non-empty source_file values (it was previously reported as 0)
- Remove hardcoded max_results default of 5; when no limit is given, search no longer calls .limit(), so the result count is left to LanceDB's own default
- Update INSTALL.md and design docs with correct OpenClaw extension path instructions
---
 openclaw.plugin.json                |   5 +-
 python/obsidian_rag/cli.py          | 105 +++++++++++++++++++++++++++-
 python/obsidian_rag/vector_store.py |  33 ++++++---
 src/tools/index.ts                  |   2 +-
 src/tools/search.ts                 |   2 +-
 src/utils/lancedb.ts                |  12 +++-
 6 files changed, 137 insertions(+), 22 deletions(-)

diff --git a/openclaw.plugin.json b/openclaw.plugin.json
index 68712d9..e04fa77 100644
--- a/openclaw.plugin.json
+++ b/openclaw.plugin.json
@@ -139,10 +139,9 @@
           },
           "max_results": {
             "type": "integer",
-            "description": "Maximum number of chunks to return",
-            "default": 5,
+            "description": "Maximum number of chunks to return (default: unlimited)",
             "minimum": 1,
-            "maximum": 50
+            "maximum": 10000
           },
           "directory_filter": {
             "type": "array",
diff --git a/python/obsidian_rag/cli.py b/python/obsidian_rag/cli.py
index cc43711..de468a7 100644
--- a/python/obsidian_rag/cli.py
+++ b/python/obsidian_rag/cli.py
@@ -8,7 +8,8 @@ import time
 from pathlib import Path
 
 import obsidian_rag.config as config_mod
-from obsidian_rag.vector_store import get_db, get_stats
+from obsidian_rag.vector_store import get_db, get_stats, search_chunks
+from obsidian_rag.embedder import OllamaEmbedder
 from obsidian_rag.indexer import Indexer
 
 
@@ -35,6 +36,8 @@ def main(argv: list[str] | None = None) -> int:
         return _reindex(config)
     elif cmd == "status":
         return _status(config)
+    elif cmd == "search":
+        return _search(config, argv[1:])
     else:
         print(f"Unknown command: {cmd}\n{_usage()}", file=sys.stderr)
         return 1
@@ -111,6 +114,7 @@ def _status(config) -> int:
         # Resolve sync-result.json path (same convention as indexer)
         from pathlib import Path
         import os as osmod
+
         project_root = Path(__file__).parent.parent.parent
         data_dir = project_root / "obsidian-rag"
         if not data_dir.exists() and not (project_root / "KnowledgeVault").exists():
@@ -134,7 +138,101 @@ def _status(config) -> int:
         )
         return 0
     except FileNotFoundError:
-        print(json.dumps({"error": "Index not found. Run 'obsidian-rag index' first."}, indent=2))
+        print(
+            json.dumps(
+                {"error": "Index not found. Run 'obsidian-rag index' first."}, indent=2
+            )
+        )
+        return 1
+    except Exception as e:
+        print(json.dumps({"error": str(e)}), file=sys.stderr)
+        return 1
+
+
+def _search(config, args: list[str]) -> int:
+    import argparse
+
+    parser = argparse.ArgumentParser(prog="obsidian-rag search")
+    parser.add_argument("query", nargs="*", help="Search query")
+    parser.add_argument(
+        "--limit", type=int, default=None, help="Max results (default: unlimited)"
+    )
+    parser.add_argument("--dir", dest="directory", help="Filter by directory")
+    parser.add_argument("--from-date", dest="from_date", help="Start date (YYYY-MM-DD)")
+    parser.add_argument("--to-date", dest="to_date", help="End date (YYYY-MM-DD)")
+    parser.add_argument("--tags", help="Comma-separated tags")
+
+    parsed, _ = parser.parse_known_args(args)
+
+    query_text = " ".join(parsed.query) if parsed.query else ""
+    if not query_text:
+        print("ERROR: query is required\n", file=sys.stderr)
+        parser.print_help()
+        return 1
+
+    try:
+        db = get_db(config)
+        table = db.open_table("obsidian_chunks")
+        embedder = OllamaEmbedder(config)
+
+        if not embedder.is_available():
+            print(
+                json.dumps(
+                    {
+                        "error": "Ollama is not available. Start Ollama or use DEGRADED mode."
+                    },
+                    indent=2,
+                )
+            )
+            return 1
+
+        query_vector = embedder.embed_single(query_text)
+
+        filters = {}
+        if parsed.directory:
+            filters["directory_filter"] = [parsed.directory]
+        if parsed.from_date or parsed.to_date:
+            filters["date_range"] = {}
+            if parsed.from_date:
+                filters["date_range"]["from"] = parsed.from_date
+            if parsed.to_date:
+                filters["date_range"]["to"] = parsed.to_date
+        if parsed.tags:
+            filters["tags"] = [t.strip() for t in parsed.tags.split(",")]
+
+        results = search_chunks(
+            table,
+            query_vector,
+            limit=parsed.limit,
+            directory_filter=filters.get("directory_filter"),
+            date_range=filters.get("date_range"),
+            tags=filters.get("tags"),
+        )
+
+        output = {
+            "query": query_text,
+            "total_results": len(results),
+            "results": [
+                {
+                    "score": r.score,
+                    "source_file": r.source_file,
+                    "source_directory": r.source_directory,
+                    "section": r.section,
+                    "date": r.date,
+                    "tags": r.tags,
+                    "chunk_text": r.chunk_text,
+                }
+                for r in results
+            ],
+        }
+        print(json.dumps(output, indent=2, default=str))
+        return 0
+    except FileNotFoundError:
+        print(
+            json.dumps(
+                {"error": "Index not found. Run 'obsidian-rag index' first."}, indent=2
+            )
+        )
         return 1
     except Exception as e:
         print(json.dumps({"error": str(e)}), file=sys.stderr)
@@ -149,8 +247,9 @@ Usage:
   obsidian-rag sync     Incremental sync (changed files only)
   obsidian-rag reindex  Force full reindex (nuke + rebuild)
   obsidian-rag status   Show index health and statistics
+  obsidian-rag search   Semantic search through indexed notes
 """
 
 
 if __name__ == "__main__":
-    sys.exit(main())
\ No newline at end of file
+    sys.exit(main())
diff --git a/python/obsidian_rag/vector_store.py b/python/obsidian_rag/vector_store.py
index da595e2..340f4e7 100644
--- a/python/obsidian_rag/vector_store.py
+++ b/python/obsidian_rag/vector_store.py
@@ -117,7 +117,7 @@ def delete_by_source_file(table: Any, source_file: str) -> int:
 def search_chunks(
     table: Any,
     query_vector: list[float],
-    limit: int = 5,
+    limit: int | None = None,
     directory_filter: list[str] | None = None,
     date_range: dict | None = None,
     tags: list[str] | None = None,
@@ -132,7 +132,7 @@ def search_chunks(
     conditions: list[str] = []
     if directory_filter:
         dir_list = ", ".join(f'"{d}"' for d in directory_filter)
-        conditions.append(f'source_directory IN ({dir_list})')
+        conditions.append(f"source_directory IN ({dir_list})")
     if date_range:
         if "from" in date_range:
             conditions.append(f"date >= '{date_range['from']}'")
@@ -144,11 +144,13 @@ def search_chunks(
 
     where_clause = " AND ".join(conditions) if conditions else None
 
-    results = (
-        table.search(query_vector, vector_column_name="vector")
-        .limit(limit)
-        .where(where_clause) if where_clause else table.search(query_vector, vector_column_name="vector").limit(limit)
-    ).to_list()
+    search_query = table.search(query_vector, vector_column_name="vector")
+    if limit is not None:
+        search_query = search_query.limit(limit)
+    if where_clause:
+        search_query = search_query.where(where_clause)
+
+    results = search_query.to_list()
 
     return [
         SearchResult(
@@ -156,7 +158,9 @@ def search_chunks(
             chunk_text=r["chunk_text"],
             source_file=r["source_file"],
             source_directory=r["source_directory"],
-            section=r.get("section") if r.get("section") not in (None, "None") else None,
+            section=r.get("section")
+            if r.get("section") not in (None, "None")
+            else None,
             date=r.get("date") if r.get("date") not in (None, "None") else None,
             tags=r.get("tags") or [],
             chunk_index=r.get("chunk_index") or 0,
@@ -172,10 +176,17 @@ def get_stats(table: Any) -> dict[str, Any]:
     total_chunks = 0
     try:
         total_chunks = table.count_rows()
-        # Count unique source files using pandas
+        # Count non-null, non-empty source files
         all_data = table.to_pandas()
-        total_docs = all_data["source_file"].nunique()
+        total_docs = (
+            all_data["source_file"]
+            .dropna()
+            .astype(str)
+            .str.strip()
+            .loc[lambda s: s.str.len() > 0]
+            .nunique()
+        )
     except Exception:
         pass
 
-    return {"total_docs": total_docs, "total_chunks": total_chunks}
\ No newline at end of file
+    return {"total_docs": total_docs, "total_chunks": total_chunks}
diff --git a/src/tools/index.ts b/src/tools/index.ts
index 7588ec4..95dce51 100644
--- a/src/tools/index.ts
+++ b/src/tools/index.ts
@@ -36,7 +36,7 @@ function makeSearchTool(config: ObsidianRagConfig): AnyAgentTool {
       type: "object",
       properties: {
         query: { type: "string", description: "Natural language question or topic to search for" },
-        max_results: { type: "integer", description: "Maximum number of chunks to return (default: 5, range: 1-50)", default: 5, minimum: 1, maximum: 50 },
+        max_results: { type: "integer", description: "Maximum number of chunks to return (default: unlimited)", minimum: 1, maximum: 10000 },
         directory_filter: { type: "array", description: "Limit search to specific subdirectories", items: { type: "string" } },
         date_range: {
           type: "object",
diff --git a/src/tools/search.ts b/src/tools/search.ts
index d3b0681..1007eee 100644
--- a/src/tools/search.ts
+++ b/src/tools/search.ts
@@ -20,7 +20,7 @@ export async function searchTool(
 ): Promise<ResponseEnvelope<{ results: SearchResult[]; sensitive_detected: boolean } | null>> {
   try {
     const results = await searchVectorDb(config, params.query, {
-      max_results: params.max_results ?? 5,
+      max_results: params.max_results ?? undefined,
       directory_filter: params.directory_filter,
       date_range: params.date_range,
       tags: params.tags,
diff --git a/src/utils/lancedb.ts b/src/utils/lancedb.ts
index 3c84314..983f8ca 100644
--- a/src/utils/lancedb.ts
+++ b/src/utils/lancedb.ts
@@ -74,7 +74,7 @@ export async function searchVectorDb(
   }
   const whereClause = conditions.length > 0 ? conditions.join(" AND ") : undefined;
 
-  const limit = options.max_results ?? 5;
+  const limit = options.max_results;
 
   // Try vector search first; if Ollama is down embedQuery throws → fallback to FTS
   let rows: Record<string, unknown>[];
@@ -85,14 +85,20 @@ export async function searchVectorDb(
     if (whereClause) {
       queryBuilder = queryBuilder.filter(whereClause);
     }
-    rows = await queryBuilder.limit(limit).toArray();
+    if (limit !== undefined) {
+      queryBuilder = queryBuilder.limit(limit);
+    }
+    rows = await queryBuilder.toArray();
   } catch {
     // Ollama unavailable — fallback to full-text search on chunk_text (BM25 scoring)
     let ftsBuilder = table.query().fullTextSearch(query);
     if (whereClause) {
       ftsBuilder = ftsBuilder.filter(whereClause);
     }
-    rows = await ftsBuilder.limit(limit).toArray();
+    if (limit !== undefined) {
+      ftsBuilder = ftsBuilder.limit(limit);
+    }
+    rows = await ftsBuilder.toArray();
   }
 
   return rows.map((r: Record<string, unknown>) => ({