Compare commits

8 Commits

Author SHA1 Message Date
21b9704e21 fix(indexer): use upsert_chunks return value for chunk count
Previously total_chunks counted from process_file return (num_chunks)
which could differ from actual stored count if upsert silently failed.
Now using stored count returned by upsert_chunks.

Also fixes cli._index to skip progress yields when building result.
2026-04-12 02:16:19 -04:00
4ab504e87c fix(indexer): write sync-result.json in reindex() not just sync()
reindex() was consuming the full_index() generator but never calling
_write_sync_result(), leaving sync-result.json stale while CLI output
showed correct indexed_files/total_chunks.
2026-04-12 01:49:43 -04:00
9d919dc237 fix(config): use 'obsidian-rag' not '.obsidian-rag' for dev config path 2026-04-12 01:03:00 -04:00
fe428511d1 fix(bridge): dev data dir is 'obsidian-rag' not '.obsidian-rag'
Python's _resolve_data_dir() uses 'obsidian-rag' (no dot).
TypeScript was using '.obsidian-rag' (with dot) — mismatch caused
sync-result.json to never be found by the agent plugin.
2026-04-12 00:56:10 -04:00
a12e27b83a fix(bridge): resolve data dir same as Python for sync-result.json
Both readSyncResult() and probeAll() now mirror Python's
_resolve_data_dir() logic: dev detection (cwd/.obsidian-rag
or cwd/KnowledgeVault) then home dir fallback.

Previously readSyncResult always used cwd/.obsidian-rag (wrong for
server deployments) and probeAll resolved sync-result.json relative
to db path (wrong for absolute paths like /home/san/.obsidian-rag/).
2026-04-12 00:10:38 -04:00
34f3ce97f7 feat(indexer): hierarchical chunking for large sections
- Section-split first for structured notes
- Large sections (>max_section_chars) broken via sliding-window
- Small sections stay intact with heading preserved
- Adds max_section_chars config (default 4000)
- 2 new TDD tests for hierarchical chunking
2026-04-11 23:58:05 -04:00
a744c0c566 docs: add AGENTS.md with repo-specific guidance 2026-04-11 23:16:47 -04:00
d946cf34e1 fix(indexer): truncate chunks exceeding Ollama context window 2026-04-11 23:12:13 -04:00
11 changed files with 253 additions and 65 deletions

76
AGENTS.md Normal file
View File

@@ -0,0 +1,76 @@
# AGENTS.md
## Stack
Two independent packages in one repo:
| Directory | Role | Entry | Build |
|-----------|------|-------|-------|
| `src/` | TypeScript OpenClaw plugin | `src/index.ts` | esbuild → `dist/index.js` |
| `python/` | Python CLI indexer | `obsidian_rag/cli.py` | pip install -e |
## Commands
**TypeScript (OpenClaw plugin):**
```bash
npm run build # esbuild → dist/index.js
npm run typecheck # tsc --noEmit
npm run test # vitest run
```
**Python (RAG indexer):**
```bash
pip install -e python/ # editable install
obsidian-rag index|sync|reindex|status # CLI
pytest python/ # tests
ruff check python/ # lint
```
## OpenClaw Plugin Install
Plugin `package.json` MUST have:
```json
"openclaw": {
"extensions": ["./dist/index.js"],
"hook": []
}
```
- `extensions` must be an array of string paths
- the key is `hook` (singular), not `hooks`
## Config
User config at `~/.obsidian-rag/config.json` or `./obsidian-rag/` dev config.
Key indexing fields:
- `indexing.chunk_size` — sliding window chunk size (default 500)
- `indexing.chunk_overlap` — overlap between chunks (default 100)
- `indexing.max_section_chars` — max chars per section before hierarchical split (default 4000)
Key security fields:
- `security.require_confirmation_for` — list of categories (e.g. `["health", "financial_debt"]`). Empty list disables guard.
- `security.auto_approve_sensitive` — `true` bypasses sensitive content prompts.
- `security.local_only` — `true` blocks non-localhost Ollama.
## Ollama Context Length
`python/obsidian_rag/embedder.py` truncates chunks at `MAX_CHUNK_CHARS = 8000` before embedding. If Ollama returns a 500 error, reduce `max_section_chars` (so large sections are split into smaller sub-chunks) or reduce `chunk_size` in config.
## Hierarchical Chunking
Structured notes (date-named files) use section-split first, then sliding-window within sections that exceed `max_section_chars`. Small sections stay intact; large sections are broken into sub-chunks with the parent section heading preserved.
## Sensitive Content Guard
Triggered by categories in `require_confirmation_for`. Raises `SensitiveContentError` from `obsidian_rag/indexer.py`.
To disable: set `require_confirmation_for: []` or `auto_approve_sensitive: true` in config.
## Architecture
```
User query → OpenClaw (TypeScript plugin src/index.ts)
→ obsidian_rag_* tools (python/obsidian_rag/)
→ Ollama embeddings (http://localhost:11434)
→ LanceDB vector store
```

View File

@@ -65,6 +65,11 @@
"type": "integer",
"minimum": 0
},
"max_section_chars": {
"type": "integer",
"minimum": 1,
"description": "Max chars per section before splitting into sub-chunks. Default 4000."
},
"file_patterns": {
"type": "array",
"items": {

View File

@@ -3,7 +3,6 @@
from __future__ import annotations
import re
import unicodedata
import hashlib
from dataclasses import dataclass, field
from pathlib import Path
@@ -181,9 +180,7 @@ def chunk_file(
Uses section-split for structured notes (journal entries with date filenames),
sliding window for everything else.
"""
import uuid
vault_path = Path(config.vault_path)
rel_path = filepath if filepath.is_absolute() else filepath
source_file = str(rel_path)
source_directory = rel_path.parts[0] if rel_path.parts else ""
@@ -201,7 +198,6 @@ def chunk_file(
chunks: list[Chunk] = []
if is_structured_note(filepath):
# Section-split for journal/daily notes
sections = split_by_sections(body, metadata)
total = len(sections)
@@ -211,20 +207,38 @@ def chunk_file(
section_tags = extract_tags(section_text)
combined_tags = list(dict.fromkeys([*tags, *section_tags]))
chunk_text = section_text
chunk = Chunk(
chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
text=chunk_text,
source_file=source_file,
source_directory=source_directory,
section=f"#{section}" if section else None,
date=date,
tags=combined_tags,
chunk_index=idx,
total_chunks=total,
modified_at=modified_at,
)
chunks.append(chunk)
section_heading = f"#{section}" if section else None
if len(section_text) > config.indexing.max_section_chars:
sub_chunks = sliding_window_chunks(section_text, chunk_size, overlap)
sub_total = len(sub_chunks)
for sub_idx, sub_text in enumerate(sub_chunks):
chunk = Chunk(
chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}_{sub_idx}",
text=sub_text,
source_file=source_file,
source_directory=source_directory,
section=section_heading,
date=date,
tags=combined_tags,
chunk_index=sub_idx,
total_chunks=sub_total,
modified_at=modified_at,
)
chunks.append(chunk)
else:
chunk = Chunk(
chunk_id=f"{chunk_id_prefix}{_stable_chunk_id(content_hash, idx)}",
text=section_text,
source_file=source_file,
source_directory=source_directory,
section=section_heading,
date=date,
tags=combined_tags,
chunk_index=idx,
total_chunks=total,
modified_at=modified_at,
)
chunks.append(chunk)
else:
# Sliding window for unstructured notes
text_chunks = sliding_window_chunks(body, chunk_size, overlap)

View File

@@ -51,7 +51,10 @@ def _index(config) -> int:
gen = indexer.full_index()
result: dict = {"indexed_files": 0, "total_chunks": 0, "errors": []}
for item in gen:
result = item # progress yields are dicts; final dict from return
if item.get("type") == "complete":
result = item
elif item.get("type") == "progress":
pass # skip progress logs in result
duration_ms = int((time.monotonic() - t0) * 1000)
print(
json.dumps(

View File

@@ -3,7 +3,6 @@
from __future__ import annotations
import json
import os
from enum import Enum
from dataclasses import dataclass, field
from pathlib import Path
@@ -32,6 +31,7 @@ class VectorStoreConfig:
class IndexingConfig:
chunk_size: int = 500
chunk_overlap: int = 100
max_section_chars: int = 4000
file_patterns: list[str] = field(default_factory=lambda: ["*.md"])
deny_dirs: list[str] = field(
default_factory=lambda: [

View File

@@ -12,6 +12,7 @@ if TYPE_CHECKING:
from obsidian_rag.config import ObsidianRagConfig
DEFAULT_TIMEOUT = 120.0 # seconds
MAX_CHUNK_CHARS = 8000 # safe default for most Ollama models
class EmbeddingError(Exception):
@@ -44,7 +45,7 @@ class OllamaEmbedder:
return
parsed = urllib.parse.urlparse(self.base_url)
if parsed.hostname not in ['localhost', '127.0.0.1', '::1']:
if parsed.hostname not in ["localhost", "127.0.0.1", "::1"]:
raise SecurityError(
f"Remote embedding service not allowed when local_only=True: {self.base_url}"
)
@@ -84,23 +85,31 @@ class OllamaEmbedder:
# For batch, call /api/embeddings multiple times sequentially
if len(batch) == 1:
endpoint = f"{self.base_url}/api/embeddings"
payload = {"model": self.model, "prompt": batch[0]}
prompt = batch[0][:MAX_CHUNK_CHARS]
payload = {"model": self.model, "prompt": prompt}
else:
# For batch, use /api/embeddings with "input" (multiple calls)
results = []
for text in batch:
truncated = text[:MAX_CHUNK_CHARS]
try:
resp = self._client.post(
f"{self.base_url}/api/embeddings",
json={"model": self.model, "prompt": text},
json={"model": self.model, "prompt": truncated},
timeout=DEFAULT_TIMEOUT,
)
except httpx.ConnectError as e:
raise OllamaUnavailableError(f"Cannot connect to Ollama at {self.base_url}") from e
raise OllamaUnavailableError(
f"Cannot connect to Ollama at {self.base_url}"
) from e
except httpx.TimeoutException as e:
raise EmbeddingError(f"Embedding request timed out after {DEFAULT_TIMEOUT}s") from e
raise EmbeddingError(
f"Embedding request timed out after {DEFAULT_TIMEOUT}s"
) from e
if resp.status_code != 200:
raise EmbeddingError(f"Ollama returned {resp.status_code}: {resp.text}")
raise EmbeddingError(
f"Ollama returned {resp.status_code}: {resp.text}"
)
data = resp.json()
embedding = data.get("embedding", [])
if not embedding:
@@ -111,9 +120,13 @@ class OllamaEmbedder:
try:
resp = self._client.post(endpoint, json=payload, timeout=DEFAULT_TIMEOUT)
except httpx.ConnectError as e:
raise OllamaUnavailableError(f"Cannot connect to Ollama at {self.base_url}") from e
raise OllamaUnavailableError(
f"Cannot connect to Ollama at {self.base_url}"
) from e
except httpx.TimeoutException as e:
raise EmbeddingError(f"Embedding request timed out after {DEFAULT_TIMEOUT}s") from e
raise EmbeddingError(
f"Embedding request timed out after {DEFAULT_TIMEOUT}s"
) from e
if resp.status_code != 200:
raise EmbeddingError(f"Ollama returned {resp.status_code}: {resp.text}")

View File

@@ -4,8 +4,6 @@ from __future__ import annotations
import json
import os
import time
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING, Any, Generator, Iterator
@@ -16,10 +14,13 @@ if TYPE_CHECKING:
import obsidian_rag.config as config_mod
from obsidian_rag.config import _resolve_data_dir
from obsidian_rag.chunker import chunk_file
from obsidian_rag.embedder import EmbeddingError, OllamaUnavailableError, SecurityError
from obsidian_rag.embedder import OllamaUnavailableError
from obsidian_rag.security import should_index_dir, validate_path
from obsidian_rag.audit_logger import AuditLogger
from obsidian_rag.vector_store import create_table_if_not_exists, delete_by_source_file, get_db, upsert_chunks
from obsidian_rag.vector_store import (
create_table_if_not_exists,
get_db,
upsert_chunks,
)
# ----------------------------------------------------------------------
# Pipeline
@@ -43,6 +44,7 @@ class Indexer:
def embedder(self):
if self._embedder is None:
from obsidian_rag.embedder import OllamaEmbedder
self._embedder = OllamaEmbedder(self.config)
return self._embedder
@@ -50,6 +52,7 @@ class Indexer:
def audit_logger(self):
if self._audit_logger is None:
from obsidian_rag.audit_logger import AuditLogger
log_dir = _resolve_data_dir() / "audit"
self._audit_logger = AuditLogger(log_dir / "audit.log")
return self._audit_logger
@@ -64,9 +67,9 @@ class Indexer:
for chunk in chunks:
sensitivity = security.detect_sensitive(
chunk['chunk_text'],
chunk["chunk_text"],
self.config.security.sensitive_sections,
self.config.memory.patterns
self.config.memory.patterns,
)
for category in sensitive_categories:
@@ -99,7 +102,11 @@ class Indexer:
"""Index a single file. Returns (num_chunks, enriched_chunks)."""
from obsidian_rag import security
mtime = str(datetime.fromtimestamp(filepath.stat().st_mtime, tz=timezone.utc).isoformat())
mtime = str(
datetime.fromtimestamp(
filepath.stat().st_mtime, tz=timezone.utc
).isoformat()
)
content = filepath.read_text(encoding="utf-8")
# Sanitize
content = security.sanitize_text(content)
@@ -151,18 +158,19 @@ class Indexer:
# Log sensitive content access
for chunk in enriched:
from obsidian_rag import security
sensitivity = security.detect_sensitive(
chunk['chunk_text'],
chunk["chunk_text"],
self.config.security.sensitive_sections,
self.config.memory.patterns
self.config.memory.patterns,
)
for category in ['health', 'financial', 'relations']:
for category in ["health", "financial", "relations"]:
if sensitivity.get(category, False):
self.audit_logger.log_sensitive_access(
str(chunk['source_file']),
str(chunk["source_file"]),
category,
'index',
{'chunk_id': chunk['chunk_id']}
"index",
{"chunk_id": chunk["chunk_id"]},
)
# Embed chunks
@@ -176,8 +184,8 @@ class Indexer:
for e, v in zip(enriched, vectors):
e["vector"] = v
# Store
upsert_chunks(table, enriched)
total_chunks += num_chunks
stored = upsert_chunks(table, enriched)
total_chunks += stored
indexed_files += 1
except Exception as exc:
errors.append({"file": str(filepath), "error": str(exc)})
@@ -249,9 +257,16 @@ class Indexer:
db = get_db(self.config)
if "obsidian_chunks" in db.list_tables():
db.drop_table("obsidian_chunks")
# full_index is a generator — materialize it to get the final dict
results = list(self.full_index())
return results[-1] if results else {"indexed_files": 0, "total_chunks": 0, "errors": []}
final = (
results[-1]
if results
else {"indexed_files": 0, "total_chunks": 0, "errors": []}
)
self._write_sync_result(
final["indexed_files"], final["total_chunks"], final["errors"]
)
return final
def _sync_result_path(self) -> Path:
# Use the same dev-data-dir convention as config.py
@@ -313,18 +328,19 @@ class Indexer:
# Log sensitive content access
for chunk in enriched:
from obsidian_rag import security
sensitivity = security.detect_sensitive(
chunk['chunk_text'],
chunk["chunk_text"],
self.config.security.sensitive_sections,
self.config.memory.patterns
self.config.memory.patterns,
)
for category in ['health', 'financial', 'relations']:
for category in ["health", "financial", "relations"]:
if sensitivity.get(category, False):
self.audit_logger.log_sensitive_access(
str(chunk['source_file']),
str(chunk["source_file"]),
category,
'index',
{'chunk_id': chunk['chunk_id']}
"index",
{"chunk_id": chunk["chunk_id"]},
)
# Embed chunks
@@ -353,4 +369,3 @@ class Indexer:
if not data_dir.exists() and not (project_root / "KnowledgeVault").exists():
data_dir = Path(os.path.expanduser("~/.obsidian-rag"))
return data_dir / "sync-result.json"

View File

@@ -206,6 +206,7 @@ def _mock_config(tmp_path: Path) -> MagicMock:
cfg.vault_path = str(tmp_path)
cfg.indexing.chunk_size = 500
cfg.indexing.chunk_overlap = 100
cfg.indexing.max_section_chars = 4000
cfg.indexing.file_patterns = ["*.md"]
cfg.indexing.deny_dirs = [".obsidian", ".trash", "zzz-Archive", ".git"]
cfg.indexing.allow_dirs = []
@@ -248,3 +249,41 @@ def test_chunk_file_unstructured(tmp_path: Path):
assert len(chunks) > 1
assert all(c.section is None for c in chunks)
assert chunks[0].chunk_index == 0
def test_large_section_split_into_sub_chunks(tmp_path: Path):
"""Large section (exceeding max_section_chars) is split via sliding window."""
vault = tmp_path / "Notes"
vault.mkdir()
fpath = vault / "2024-03-15-Podcast.md"
large_content = "word " * 3000 # ~15000 chars, exceeds MAX_SECTION_CHARS
fpath.write_text(f"# Episode Notes\n\n{large_content}")
cfg = _mock_config(tmp_path)
cfg.indexing.max_section_chars = 4000
chunks = chunk_file(fpath, fpath.read_text(), "2024-03-15T10:00:00Z", cfg)
# Large section should be split into multiple sub-chunks
assert len(chunks) > 1
# Each sub-chunk should preserve the section heading
for chunk in chunks:
assert chunk.section == "#Episode Notes", (
f"Expected #Episode Notes, got {chunk.section}"
)
def test_small_section_kept_intact(tmp_path: Path):
"""Small section (under max_section_chars) remains a single chunk."""
vault = tmp_path / "Notes"
vault.mkdir()
fpath = vault / "2024-03-15-Short.md"
fpath.write_text("# Notes\n\nShort content here.")
cfg = _mock_config(tmp_path)
cfg.indexing.max_section_chars = 4000
chunks = chunk_file(fpath, fpath.read_text(), "2024-03-15T10:00:00Z", cfg)
# Small section → single chunk
assert len(chunks) == 1
assert chunks[0].section == "#Notes"
assert chunks[0].text.strip().endswith("Short content here.")

View File

@@ -98,7 +98,8 @@ export async function probeAll(config: ObsidianRagConfig): Promise<ProbeResult>
if (indexExists) {
try {
const syncPath = resolve(dbPath, "..", "sync-result.json");
const dataDir = resolveDataDir();
const syncPath = resolve(dataDir, "sync-result.json");
if (existsSync(syncPath)) {
const data = JSON.parse(readFileSync(syncPath, "utf-8"));
lastSync = data.timestamp ?? null;
@@ -120,6 +121,17 @@ export async function probeAll(config: ObsidianRagConfig): Promise<ProbeResult>
};
}
function resolveDataDir(): string {
const cwd = process.cwd();
const devDataDir = resolve(cwd, "obsidian-rag");
const devVaultMarker = resolve(cwd, "KnowledgeVault");
if (existsSync(devDataDir) || existsSync(devVaultMarker)) {
return devDataDir;
}
const home = process.env.HOME ?? process.env.USERPROFILE ?? "";
return resolve(home, ".obsidian-rag");
}
async function probeOllama(baseUrl: string): Promise<boolean> {
try {
const res = await fetch(`${baseUrl}/api/tags`, { signal: AbortSignal.timeout(3000) });

View File

@@ -109,7 +109,7 @@ export function readSyncResult(config: ObsidianRagConfig): {
total_chunks: number;
errors: Array<{ file: string; error: string }>;
} | null {
const dataDir = resolve(process.cwd(), ".obsidian-rag");
const dataDir = _resolveDataDir();
const path = resolve(dataDir, "sync-result.json");
if (!existsSync(path)) return null;
try {
@@ -118,3 +118,14 @@ export function readSyncResult(config: ObsidianRagConfig): {
return null;
}
}
function _resolveDataDir(): string {
const cwd = process.cwd();
const devDataDir = resolve(cwd, "obsidian-rag");
const devVaultMarker = resolve(cwd, "KnowledgeVault");
if (existsSync(devDataDir) || existsSync(devVaultMarker)) {
return devDataDir;
}
const home = process.env.HOME ?? process.env.USERPROFILE ?? "";
return resolve(home, ".obsidian-rag");
}

View File

@@ -88,7 +88,7 @@ function defaults(): ObsidianRagConfig {
}
export function loadConfig(configPath?: string): ObsidianRagConfig {
const defaultPath = resolve(process.cwd(), ".obsidian-rag", "config.json");
const defaultPath = resolve(process.cwd(), "obsidian-rag", "config.json");
const path = configPath ?? defaultPath;
try {
const raw = JSON.parse(readFileSync(path, "utf-8"));