# obsidian-rag/python/tests/unit/test_chunker.py

"""Tests for obsidian_rag.chunker — section splitting and sliding window."""

from __future__ import annotations

from pathlib import Path
import tempfile
from unittest.mock import MagicMock

import pytest

from obsidian_rag.chunker import (
    extract_tags,
    extract_date_from_filename,
    is_structured_note,
    parse_frontmatter,
    split_by_sections,
    sliding_window_chunks,
    chunk_file,
)


# ----------------------------------------------------------------------
# parse_frontmatter
# ----------------------------------------------------------------------


def test_parse_frontmatter_with_yaml():
    """YAML frontmatter yields metadata; the markdown after it is the body."""
    document = (
        "---\n"
        "title: My Journal\n"
        "tags: [journal, personal]\n"
        "---\n"
        "# Morning\n"
        "\n"
        "Some content here.\n"
    )
    metadata, remainder = parse_frontmatter(document)

    assert metadata.get("title") == "My Journal"
    # Both the header and the paragraph must survive in the returned body.
    for expected in ("# Morning", "Some content"):
        assert expected in remainder


def test_parse_frontmatter_without_frontmatter():
    """With no frontmatter block the metadata is empty and the text is kept."""
    plain = "# Just a header\n\nSome text without frontmatter."
    parsed = parse_frontmatter(plain)
    metadata, remainder = parsed

    assert metadata == {}
    assert "# Just a header" in remainder


# ----------------------------------------------------------------------
# extract_tags
# ----------------------------------------------------------------------


def test_extract_tags_basic():
    """Hashtags in running text are found, hyphenated ones included."""
    found = extract_tags("Hello #world and #python-code is nice")

    for expected in ("#world", "#python-code"):
        assert expected in found
    # Every returned tag keeps its leading '#'.
    assert all(tag.startswith("#") for tag in found)


def test_extract_tags_deduplicates():
    """A tag that appears twice in the text is reported only once."""
    found = extract_tags("#hello #world #hello #python")
    # Distinct tags in the input: #hello, #world, #python.
    assert len(found) == 3


# ----------------------------------------------------------------------
# extract_date_from_filename
# ----------------------------------------------------------------------


def test_extract_date_from_filename_iso():
    """A ``YYYY-MM-DD`` file stem is returned as that same date string."""
    dated_note = Path("2024-01-15.md")
    extracted = extract_date_from_filename(dated_note)
    assert extracted == "2024-01-15"


def test_extract_date_from_filename_compact():
    """A compact ``YYYYMMDD`` file stem is returned in ``YYYY-MM-DD`` form."""
    compact_note = Path("20240115.md")
    extracted = extract_date_from_filename(compact_note)
    assert extracted == "2024-01-15"


def test_extract_date_from_filename_no_date():
    """A file name without a date yields ``None``."""
    extracted = extract_date_from_filename(Path("my-journal.md"))
    assert extracted is None


# ----------------------------------------------------------------------
# is_structured_note
# ----------------------------------------------------------------------


def test_is_structured_note_journal():
    """Date-named notes are structured, with or without a parent folder."""
    for note_name in ("2024-01-15.md", "Journal/2024-02-20.md"):
        assert is_structured_note(Path(note_name)) is True


def test_is_structured_note_project():
    """Notes whose names carry no date are not treated as structured."""
    for note_name in ("My Project Ideas.md", "shopping-list.md"):
        assert is_structured_note(Path(note_name)) is False


# ----------------------------------------------------------------------
# split_by_sections
# ----------------------------------------------------------------------


def test_split_by_sections_multiple():
    """Headers of level 1, 2 and 3 each open their own section."""
    markdown = (
        "# Mental Health\n"
        "Feeling anxious today.\n"
        "\n"
        "## Work\n"
        "Project deadline approaching.\n"
        "\n"
        "### Home\n"
        "Need to clean the garage.\n"
    )
    parsed = split_by_sections(markdown, {})

    assert len(parsed) == 3
    # Index 0 of each section is its title, without the leading '#' marks.
    titles = [section[0] for section in parsed]
    assert titles == ["Mental Health", "Work", "Home"]
    # Index 1 is the section content: the text written under the header.
    assert "Feeling anxious today." in parsed[0][1]


def test_split_by_sections_no_headers():
    """Header-less text comes back as one section whose title is ``None``."""
    parsed = split_by_sections("Just plain text without any headers at all.", {})

    assert len(parsed) == 1
    only_section = parsed[0]
    assert only_section[0] is None
    assert "Just plain text" in only_section[1]


def test_split_by_sections_leading_content():
    """Text ahead of the first header forms its own untitled leading section."""
    markdown = (
        "Some intro text before any header.\n"
        "\n"
        "# First Section\n"
        "Content of first.\n"
    )
    parsed = split_by_sections(markdown, {})

    # The preamble comes first and has no title.
    preamble = parsed[0]
    assert preamble[0] is None
    assert "Some intro text" in preamble[1]
    # The first real header follows it.
    assert parsed[1][0] == "First Section"


# ----------------------------------------------------------------------
# sliding_window_chunks
# ----------------------------------------------------------------------


def test_sliding_window_basic():
    """1200 words with a 500-word window give several chunks of <= 500 words."""
    text = " ".join(f"word{i}" for i in range(1200))
    windows = sliding_window_chunks(text, chunk_size=500, overlap=100)

    assert len(windows) >= 2
    # The first window starts at the very beginning of the text.
    assert windows[0].startswith("word0")
    # No window may exceed the requested chunk size (counted in words).
    assert all(len(window.split()) <= 500 for window in windows)


def test_sliding_window_overlap():
    """Adjacent chunks should share the overlap region.

    For every pair of neighbouring chunks, the last ``overlap`` words of the
    earlier chunk must equal the first ``overlap`` words of the later one.
    """
    overlap = 100
    text = " ".join(f"word{i}" for i in range(1000))
    chunks = sliding_window_chunks(text, chunk_size=500, overlap=overlap)

    # Guard against a vacuous pass: with a single chunk the loop below would
    # never execute and the test would succeed without checking anything.
    assert len(chunks) >= 2

    for previous, current in zip(chunks, chunks[1:]):
        prev_words = previous.split()
        curr_words = current.split()
        assert prev_words[-overlap:] == curr_words[:overlap]


def test_sliding_window_empty():
    """Empty input produces an empty list of chunks."""
    result = sliding_window_chunks("", chunk_size=500, overlap=100)
    assert result == []


def test_sliding_window_exact_size_produces_two_chunks():
    """Exactly ``chunk_size`` words still yield a second, trailing chunk.

    With chunk_size=500 and overlap=100 the window advances by
    500 - 100 = 400 words, so the second chunk starts at ``word400``.
    """
    text = " ".join(f"word{i}" for i in range(500))
    windows = sliding_window_chunks(text, chunk_size=500, overlap=100)

    assert len(windows) == 2
    for window, first_word in zip(windows, ("word0", "word400")):
        assert window.startswith(first_word)


def test_sliding_window_small_text():
    """Text far shorter than ``chunk_size`` comes back as one unchanged chunk."""
    short_text = "just a few words"
    windows = sliding_window_chunks(short_text, chunk_size=500, overlap=100)

    assert len(windows) == 1
    assert windows[0] == short_text


# ----------------------------------------------------------------------
# chunk_file integration
# ----------------------------------------------------------------------


def _mock_config(tmp_path: Path) -> MagicMock:
    """Return a ``MagicMock`` config whose vault is *tmp_path*.

    ``vault_path`` is the string form of *tmp_path*; the ``indexing``
    attributes are filled with fixed values shared by the chunk_file tests.
    """
    indexing_settings = {
        "chunk_size": 500,
        "chunk_overlap": 100,
        "max_section_chars": 4000,
        "file_patterns": ["*.md"],
        "deny_dirs": [".obsidian", ".trash", "zzz-Archive", ".git"],
        "allow_dirs": [],
    }

    config = MagicMock()
    config.vault_path = str(tmp_path)
    # ``config.indexing`` is an auto-created child mock; set plain values on it.
    for option, value in indexing_settings.items():
        setattr(config.indexing, option, value)
    return config


def test_chunk_file_structured_journal(tmp_path: Path):
    """A date-named journal note is chunked per section with its metadata.

    Expects one chunk per header, the first section label ``#Morning``, the
    date taken from the file name, the ``#anxious`` tag on one of the chunks,
    and a ``source_file`` ending in the note's vault-relative path.
    """
    vault = tmp_path / "Journal"
    vault.mkdir()
    fpath = vault / "2024-03-15.md"
    # Explicit UTF-8 keeps the fixture independent of the platform's locale
    # encoding, for both the write and the read-back.
    fpath.write_text(
        """# Morning

Felt #anxious about the deadline.

## Work
Finished the report.
""",
        encoding="utf-8",
    )

    cfg = _mock_config(tmp_path)
    chunks = chunk_file(
        fpath, fpath.read_text(encoding="utf-8"), "2024-03-15T10:00:00Z", cfg
    )

    # Journal file → section-split → 2 chunks
    assert len(chunks) == 2
    assert chunks[0].section == "#Morning"
    assert chunks[0].date == "2024-03-15"
    assert "#anxious" in chunks[0].tags or "#anxious" in chunks[1].tags
    assert chunks[0].source_file.endswith("Journal/2024-03-15.md")


def test_chunk_file_unstructured(tmp_path: Path):
    """A note without a date in its name is chunked by sliding window.

    Expects more than one chunk, no section label on any chunk, and a
    ``chunk_index`` of 0 on the first chunk.
    """
    vault = tmp_path / "Notes"
    vault.mkdir()
    fpath = vault / "project-ideas.md"
    # Explicit UTF-8 keeps the fixture independent of the platform's locale
    # encoding, for both the write and the read-back.
    fpath.write_text("This is a long note " * 200, encoding="utf-8")  # ~1000 words

    cfg = _mock_config(tmp_path)
    chunks = chunk_file(
        fpath, fpath.read_text(encoding="utf-8"), "2024-03-15T10:00:00Z", cfg
    )

    # Unstructured → sliding window → multiple chunks
    assert len(chunks) > 1
    assert all(c.section is None for c in chunks)
    assert chunks[0].chunk_index == 0


def test_large_section_split_into_sub_chunks(tmp_path: Path):
    """Large section (exceeding max_section_chars) is split via sliding window.

    Every resulting sub-chunk must still carry the section label of the
    header it came from.
    """
    vault = tmp_path / "Notes"
    vault.mkdir()
    fpath = vault / "2024-03-15-Podcast.md"
    large_content = "word " * 3000  # 15000 chars, well above the 4000 limit
    # Explicit UTF-8 keeps the fixture independent of the platform's locale
    # encoding, for both the write and the read-back.
    fpath.write_text(f"# Episode Notes\n\n{large_content}", encoding="utf-8")

    cfg = _mock_config(tmp_path)
    # Set the limit here as well so the test states the threshold it relies on.
    cfg.indexing.max_section_chars = 4000
    chunks = chunk_file(
        fpath, fpath.read_text(encoding="utf-8"), "2024-03-15T10:00:00Z", cfg
    )

    # Large section should be split into multiple sub-chunks
    assert len(chunks) > 1
    # Each sub-chunk should preserve the section heading
    for chunk in chunks:
        assert chunk.section == "#Episode Notes", (
            f"Expected #Episode Notes, got {chunk.section}"
        )


def test_small_section_kept_intact(tmp_path: Path):
    """Small section (under max_section_chars) remains a single chunk.

    The single chunk keeps the section label and ends with the section text.
    """
    vault = tmp_path / "Notes"
    vault.mkdir()
    fpath = vault / "2024-03-15-Short.md"
    # Explicit UTF-8 keeps the fixture independent of the platform's locale
    # encoding, for both the write and the read-back.
    fpath.write_text("# Notes\n\nShort content here.", encoding="utf-8")

    cfg = _mock_config(tmp_path)
    # Set the limit here as well so the test states the threshold it relies on.
    cfg.indexing.max_section_chars = 4000
    chunks = chunk_file(
        fpath, fpath.read_text(encoding="utf-8"), "2024-03-15T10:00:00Z", cfg
    )

    # Small section → single chunk
    assert len(chunks) == 1
    assert chunks[0].section == "#Notes"
    assert chunks[0].text.strip().endswith("Short content here.")