fix: add configSchema to openclaw.plugin.json, add search CLI command, fix total_docs stat

- Add required configSchema to openclaw.plugin.json for OpenClaw plugin discovery
- Add search command to CLI with --limit, --dir, --from-date, --to-date, --tags filters
- Fix get_stats() to properly count unique docs (was returning 0 for non-null values)
- Remove hardcoded max_results default of 5; search now returns all results by default
- Update INSTALL.md and design docs with correct OpenClaw extension path instructions
2026-04-11 20:01:09 -04:00
parent de3b9c1c12
commit e15e4ff856
6 changed files with 137 additions and 22 deletions
--- a/openclaw.plugin.json
+++ b/openclaw.plugin.json
@@ -139,10 +139,9 @@
          },
          "max_results": {
            "type": "integer",
-            "description": "Maximum number of chunks to return",
-            "default": 5,
+            "description": "Maximum number of chunks to return (default: unlimited)",
            "minimum": 1,
-            "maximum": 50
+            "maximum": 10000
          },
          "directory_filter": {
            "type": "array",
--- a/python/obsidian_rag/cli.py
+++ b/python/obsidian_rag/cli.py
@@ -8,7 +8,8 @@ import time
 from pathlib import Path

 import obsidian_rag.config as config_mod
-from obsidian_rag.vector_store import get_db, get_stats
+from obsidian_rag.vector_store import get_db, get_stats, search_chunks
+from obsidian_rag.embedder import OllamaEmbedder
 from obsidian_rag.indexer import Indexer


@@ -35,6 +36,8 @@ def main(argv: list[str] | None = None) -> int:
        return _reindex(config)
    elif cmd == "status":
        return _status(config)
+    elif cmd == "search":
+        return _search(config, argv[1:])
    else:
        print(f"Unknown command: {cmd}\n{_usage()}", file=sys.stderr)
        return 1
@@ -111,6 +114,7 @@ def _status(config) -> int:
        # Resolve sync-result.json path (same convention as indexer)
        from pathlib import Path
        import os as osmod
+
        project_root = Path(__file__).parent.parent.parent
        data_dir = project_root / "obsidian-rag"
        if not data_dir.exists() and not (project_root / "KnowledgeVault").exists():
@@ -134,7 +138,101 @@ def _status(config) -> int:
        )
        return 0
    except FileNotFoundError:
-        print(json.dumps({"error": "Index not found. Run 'obsidian-rag index' first."}, indent=2))
+        print(
+            json.dumps(
+                {"error": "Index not found. Run 'obsidian-rag index' first."}, indent=2
+            )
+        )
+        return 1
+    except Exception as e:
+        print(json.dumps({"error": str(e)}), file=sys.stderr)
+        return 1
+
+
+def _search(config, args: list[str]) -> int:
+    import argparse
+
+    parser = argparse.ArgumentParser(prog="obsidian-rag search")
+    parser.add_argument("query", nargs="*", help="Search query")
+    parser.add_argument(
+        "--limit", type=int, default=None, help="Max results (default: unlimited)"
+    )
+    parser.add_argument("--dir", dest="directory", help="Filter by directory")
+    parser.add_argument("--from-date", dest="from_date", help="Start date (YYYY-MM-DD)")
+    parser.add_argument("--to-date", dest="to_date", help="End date (YYYY-MM-DD)")
+    parser.add_argument("--tags", help="Comma-separated tags")
+
+    parsed, _ = parser.parse_known_args(args)
+
+    query_text = " ".join(parsed.query) if parsed.query else ""
+    if not query_text:
+        print("ERROR: query is required\n", file=sys.stderr)
+        parser.print_help()
+        return 1
+
+    try:
+        db = get_db(config)
+        table = db.open_table("obsidian_chunks")
+        embedder = OllamaEmbedder(config)
+
+        if not embedder.is_available():
+            print(
+                json.dumps(
+                    {
+                        "error": "Ollama is not available. Start Ollama or use DEGRADED mode."
+                    },
+                    indent=2,
+                )
+            )
+            return 1
+
+        query_vector = embedder.embed_single(query_text)
+
+        filters = {}
+        if parsed.directory:
+            filters["directory_filter"] = [parsed.directory]
+        if parsed.from_date or parsed.to_date:
+            filters["date_range"] = {}
+            if parsed.from_date:
+                filters["date_range"]["from"] = parsed.from_date
+            if parsed.to_date:
+                filters["date_range"]["to"] = parsed.to_date
+        if parsed.tags:
+            filters["tags"] = [t.strip() for t in parsed.tags.split(",")]
+
+        results = search_chunks(
+            table,
+            query_vector,
+            limit=parsed.limit,
+            directory_filter=filters.get("directory_filter"),
+            date_range=filters.get("date_range"),
+            tags=filters.get("tags"),
+        )
+
+        output = {
+            "query": query_text,
+            "total_results": len(results),
+            "results": [
+                {
+                    "score": r.score,
+                    "source_file": r.source_file,
+                    "source_directory": r.source_directory,
+                    "section": r.section,
+                    "date": r.date,
+                    "tags": r.tags,
+                    "chunk_text": r.chunk_text,
+                }
+                for r in results
+            ],
+        }
+        print(json.dumps(output, indent=2, default=str))
+        return 0
+    except FileNotFoundError:
+        print(
+            json.dumps(
+                {"error": "Index not found. Run 'obsidian-rag index' first."}, indent=2
+            )
+        )
        return 1
    except Exception as e:
        print(json.dumps({"error": str(e)}), file=sys.stderr)
@@ -149,8 +247,9 @@ Usage:
  obsidian-rag sync     Incremental sync (changed files only)
  obsidian-rag reindex  Force full reindex (nuke + rebuild)
  obsidian-rag status   Show index health and statistics
+  obsidian-rag search   Semantic search through indexed notes
 """


 if __name__ == "__main__":
-    sys.exit(main())
+    sys.exit(main())
--- a/python/obsidian_rag/vector_store.py
+++ b/python/obsidian_rag/vector_store.py
@@ -117,7 +117,7 @@ def delete_by_source_file(table: Any, source_file: str) -> int:
 def search_chunks(
    table: Any,
    query_vector: list[float],
-    limit: int = 5,
+    limit: int | None = None,
    directory_filter: list[str] | None = None,
    date_range: dict | None = None,
    tags: list[str] | None = None,
@@ -132,7 +132,7 @@ def search_chunks(
    conditions: list[str] = []
    if directory_filter:
        dir_list = ", ".join(f'"{d}"' for d in directory_filter)
-        conditions.append(f'source_directory IN ({dir_list})')
+        conditions.append(f"source_directory IN ({dir_list})")
    if date_range:
        if "from" in date_range:
            conditions.append(f"date >= '{date_range['from']}'")
@@ -144,11 +144,13 @@ def search_chunks(

    where_clause = " AND ".join(conditions) if conditions else None

-    results = (
-        table.search(query_vector, vector_column_name="vector")
-        .limit(limit)
-        .where(where_clause) if where_clause else table.search(query_vector, vector_column_name="vector").limit(limit)
-    ).to_list()
+    search_query = table.search(query_vector, vector_column_name="vector")
+    if limit is not None:
+        search_query = search_query.limit(limit)
+    if where_clause:
+        search_query = search_query.where(where_clause)
+
+    results = search_query.to_list()

    return [
        SearchResult(
@@ -156,7 +158,9 @@ def search_chunks(
            chunk_text=r["chunk_text"],
            source_file=r["source_file"],
            source_directory=r["source_directory"],
-            section=r.get("section") if r.get("section") not in (None, "None") else None,
+            section=r.get("section")
+            if r.get("section") not in (None, "None")
+            else None,
            date=r.get("date") if r.get("date") not in (None, "None") else None,
            tags=r.get("tags") or [],
            chunk_index=r.get("chunk_index") or 0,
@@ -172,10 +176,17 @@ def get_stats(table: Any) -> dict[str, Any]:
    total_chunks = 0
    try:
        total_chunks = table.count_rows()
-        # Count unique source files using pandas
+        # Count non-null, non-empty source files
        all_data = table.to_pandas()
-        total_docs = all_data["source_file"].nunique()
+        total_docs = (
+            all_data["source_file"]
+            .dropna()
+            .astype(str)
+            .str.strip()
+            .loc[lambda s: s.str.len() > 0]
+            .nunique()
+        )
    except Exception:
        pass

-    return {"total_docs": total_docs, "total_chunks": total_chunks}
+    return {"total_docs": total_docs, "total_chunks": total_chunks}
--- a/src/tools/index.ts
+++ b/src/tools/index.ts
@@ -36,7 +36,7 @@ function makeSearchTool(config: ObsidianRagConfig): AnyAgentTool {
      type: "object",
      properties: {
        query: { type: "string", description: "Natural language question or topic to search for" },
-        max_results: { type: "integer", description: "Maximum number of chunks to return (default: 5, range: 1-50)", default: 5, minimum: 1, maximum: 50 },
+        max_results: { type: "integer", description: "Maximum number of chunks to return (default: unlimited)", minimum: 1, maximum: 10000 },
        directory_filter: { type: "array", description: "Limit search to specific subdirectories", items: { type: "string" } },
        date_range: {
          type: "object",
--- a/src/tools/search.ts
+++ b/src/tools/search.ts
@@ -20,7 +20,7 @@ export async function searchTool(
 ): Promise<ResponseEnvelope<{ results: SearchResult[]; sensitive_detected: boolean } | null>> {
  try {
    const results = await searchVectorDb(config, params.query, {
-      max_results: params.max_results ?? 5,
+      max_results: params.max_results ?? undefined,
      directory_filter: params.directory_filter,
      date_range: params.date_range,
      tags: params.tags,
--- a/src/utils/lancedb.ts
+++ b/src/utils/lancedb.ts
@@ -74,7 +74,7 @@ export async function searchVectorDb(
  }
  const whereClause = conditions.length > 0 ? conditions.join(" AND ") : undefined;

-  const limit = options.max_results ?? 5;
+  const limit = options.max_results;

  // Try vector search first; if Ollama is down embedQuery throws → fallback to FTS
  let rows: Record<string, unknown>[];
@@ -85,14 +85,20 @@ export async function searchVectorDb(
    if (whereClause) {
      queryBuilder = queryBuilder.filter(whereClause);
    }
-    rows = await queryBuilder.limit(limit).toArray();
+    if (limit !== undefined) {
+      queryBuilder = queryBuilder.limit(limit);
+    }
+    rows = await queryBuilder.toArray();
  } catch {
    // Ollama unavailable — fallback to full-text search on chunk_text (BM25 scoring)
    let ftsBuilder = table.query().fullTextSearch(query);
    if (whereClause) {
      ftsBuilder = ftsBuilder.filter(whereClause);
    }
-    rows = await ftsBuilder.limit(limit).toArray();
+    if (limit !== undefined) {
+      ftsBuilder = ftsBuilder.limit(limit);
+    }
+    rows = await ftsBuilder.toArray();
  }

  return rows.map((r: Record<string, unknown>) => ({