"""Tests for training data extractor."""
import tempfile
from pathlib import Path

import pytest

from companion.config import Config, IndexingConfig, VaultConfig
from companion.forge.extract import (
    TrainingDataExtractor,
    TrainingExample,
    _create_training_example,
    _extract_date_from_filename,
    _has_reflection_patterns,
    _has_reflection_tags,
    _is_likely_reflection,
    extract_training_data,
)


def test_has_reflection_tags():
    """Tags like #reflection and #decision are recognized; #worklog is not."""
    for tagged in ("#reflection on today's events", "#decision made today"):
        assert _has_reflection_tags(tagged)
    assert not _has_reflection_tags("#worklog entry")


def test_has_reflection_patterns():
    """First-person introspective phrasing is detected; plain facts are not."""
    reflective_samples = (
        "I think this is important",
        "I wonder if I should change",
        "Looking back, I see the pattern",
    )
    for sample in reflective_samples:
        assert _has_reflection_patterns(sample)
    assert not _has_reflection_patterns("The meeting was at 3pm")


def test_is_likely_reflection():
    """Either a reflection tag or reflective phrasing marks text as reflection."""
    cases = [
        ("#reflection I think this matters", True),
        ("I realize now that I was wrong", True),
        ("Just a regular note", False),
    ]
    for text, expected in cases:
        assert bool(_is_likely_reflection(text)) == expected


def test_extract_date_from_filename():
    """Date-bearing filenames yield the date portion; others yield None."""
    expectations = {
        "2026-04-12.md": "2026-04-12",
        "12-Apr-2026.md": "12-Apr-2026",
        "2026-04-12-journal.md": "2026-04-12",
    }
    for filename, expected in expectations.items():
        assert _extract_date_from_filename(filename) == expected
    assert _extract_date_from_filename("notes.md") is None


def test_create_training_example():
    """A reflective chunk yields a 3-message chat example carrying its metadata."""
    text = (
        "#reflection I think I need to reconsider my approach. "
        "The way I've been handling this isn't working."
    )
    example = _create_training_example(
        chunk_text=text,
        source_file="journal/2026-04-12.md",
        tags=["#reflection"],
        date="2026-04-12",
    )

    assert example is not None
    # system -> user -> assistant, with the chunk itself as the assistant turn
    roles = [message["role"] for message in example.messages]
    assert roles == ["system", "user", "assistant"]
    assert example.messages[-1]["content"] == text
    assert example.source_file == "journal/2026-04-12.md"


def test_create_training_example_too_short():
    """Chunks below the minimum length are rejected (returns None)."""
    result = _create_training_example(
        chunk_text="I think.",  # Too short
        source_file="test.md",
        tags=["#reflection"],
        date=None,
    )
    assert result is None


def test_create_training_example_no_reflection():
    """Long but non-reflective text is rejected (returns None)."""
    mundane = "This is just a regular note about the meeting at 3pm. Nothing special." * 5
    result = _create_training_example(
        chunk_text=mundane,
        source_file="test.md",
        tags=["#work"],
        date=None,
    )
    assert result is None


def test_training_example_to_dict():
    """to_dict() exposes messages and metadata as plain dict entries."""
    example = TrainingExample(
        messages=[
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi"},
        ],
        source_file="test.md",
        tags=["#test"],
        date="2026-04-12",
    )

    as_dict = example.to_dict()
    assert as_dict["messages"][0]["role"] == "user"
    assert as_dict["source_file"] == "test.md"
    assert as_dict["date"] == "2026-04-12"


class TestTrainingDataExtractor:
    """Integration-style tests for TrainingDataExtractor against a temp vault.

    Fixes applied in review:
    - Removed the broken ``_save_to_jsonl_helper`` placeholder and the orphaned
      config-dict/statement residue that followed it (it was a syntax error left
      over from an aborted refactor, duplicating ``test_extract_from_single_file``).
    - ``test_get_stats`` now uses ``_get_config_dict`` instead of its own inline
      copy of the entire config dict.
    - The write-config-then-load dance shared by all tests lives in ``_load_config``.
    """

    def _get_config_dict(self, vault_path: Path) -> dict:
        """Return a minimal but complete config dict pointing at *vault_path*."""
        return {
            "companion": {
                "name": "SAN",
                "persona": {
                    "role": "companion",
                    "tone": "reflective",
                    "style": "questioning",
                    "boundaries": [],
                },
                "memory": {
                    "session_turns": 20,
                    "persistent_store": "",
                    "summarize_after": 10,
                },
                "chat": {
                    "streaming": True,
                    "max_response_tokens": 2048,
                    "default_temperature": 0.7,
                    "allow_temperature_override": True,
                },
            },
            "vault": {
                "path": str(vault_path),
                "indexing": {
                    "auto_sync": False,
                    "auto_sync_interval_minutes": 1440,
                    "watch_fs_events": False,
                    "file_patterns": ["*.md"],
                    "deny_dirs": [".git"],
                    "deny_patterns": [],
                },
                "chunking_rules": {},
            },
            "rag": {
                "embedding": {
                    "provider": "ollama",
                    "model": "mxbai-embed-large",
                    "base_url": "http://localhost:11434",
                    "dimensions": 1024,
                    "batch_size": 32,
                },
                "vector_store": {"type": "lancedb", "path": ".test.vectors"},
                "search": {
                    "default_top_k": 8,
                    "max_top_k": 20,
                    "similarity_threshold": 0.75,
                    "hybrid_search": {
                        "enabled": False,
                        "keyword_weight": 0.3,
                        "semantic_weight": 0.7,
                    },
                    "filters": {
                        "date_range_enabled": True,
                        "tag_filter_enabled": True,
                        "directory_filter_enabled": True,
                    },
                },
            },
            "model": {
                "inference": {
                    "backend": "llama.cpp",
                    "model_path": "",
                    "context_length": 8192,
                    "gpu_layers": 35,
                    "batch_size": 512,
                    "threads": 8,
                },
                "fine_tuning": {
                    "base_model": "",
                    "output_dir": "",
                    "lora_rank": 16,
                    "lora_alpha": 32,
                    "learning_rate": 0.0002,
                    "batch_size": 4,
                    "gradient_accumulation_steps": 4,
                    "num_epochs": 3,
                    "warmup_steps": 100,
                    "save_steps": 500,
                    "eval_steps": 250,
                    "training_data_path": "",
                    "validation_split": 0.1,
                },
                "retrain_schedule": {
                    "auto_reminder": True,
                    "default_interval_days": 90,
                    "reminder_channels": [],
                },
            },
            "api": {
                "host": "127.0.0.1",
                "port": 7373,
                "cors_origins": [],
                "auth": {"enabled": False},
            },
            "ui": {
                "web": {
                    "enabled": True,
                    "theme": "obsidian",
                    "features": {
                        "streaming": True,
                        "citations": True,
                        "source_preview": True,
                    },
                },
                "cli": {"enabled": True, "rich_output": True},
            },
            "logging": {
                "level": "INFO",
                "file": "",
                "max_size_mb": 100,
                "backup_count": 5,
            },
            "security": {
                "local_only": True,
                "vault_path_traversal_check": True,
                "sensitive_content_detection": True,
                "sensitive_patterns": [],
                "require_confirmation_for_external_apis": True,
            },
        }

    def _load_config(self, tmp: str, vault_path: Path):
        """Write the minimal config dict to *tmp* as JSON and load it.

        Imports are function-local to match the original tests' style.
        """
        import json

        from companion.config import load_config

        config_path = Path(tmp) / "test_config.json"
        with open(config_path, "w") as f:
            json.dump(self._get_config_dict(vault_path), f)
        return load_config(config_path)

    def test_extract_from_single_file(self):
        """A journal note with reflective sections yields chat-format examples."""
        with tempfile.TemporaryDirectory() as tmp:
            vault = Path(tmp)
            journal = vault / "Journal" / "2026" / "04"
            journal.mkdir(parents=True)

            content = """#DayInShort: Busy day

#reflection I think I need to slow down. The pace has been unsustainable.

#work Normal work day with meetings.

#insight I realize that I've been prioritizing urgency over importance.
"""
            (journal / "2026-04-12.md").write_text(content, encoding="utf-8")

            config = self._load_config(tmp, vault)
            extractor = TrainingDataExtractor(config)
            examples = extractor.extract()

            # Should extract at least 2 reflection examples
            assert len(examples) >= 2

            # Check they have the right structure
            for ex in examples:
                assert len(ex.messages) == 3
                assert ex.messages[2]["role"] == "assistant"

    def test_save_to_jsonl(self):
        """save_to_jsonl writes one JSON line per example and reports the count."""
        with tempfile.TemporaryDirectory() as tmp:
            output = Path(tmp) / "training.jsonl"

            examples = [
                TrainingExample(
                    messages=[
                        {"role": "system", "content": "sys"},
                        {"role": "user", "content": "user"},
                        {"role": "assistant", "content": "assistant"},
                    ],
                    source_file="test.md",
                    tags=["#test"],
                    date="2026-04-12",
                )
            ]

            config = self._load_config(tmp, Path(tmp))
            extractor = TrainingDataExtractor(config)
            extractor.examples = examples

            count = extractor.save_to_jsonl(output)
            assert count == 1

            # Verify file content
            lines = output.read_text(encoding="utf-8").strip().split("\n")
            assert len(lines) == 1
            assert "assistant" in lines[0]

    def test_get_stats(self):
        """get_stats aggregates totals, average length, and tag frequencies."""
        examples = [
            TrainingExample(
                messages=[
                    {"role": "system", "content": "sys"},
                    {"role": "user", "content": "user"},
                    {"role": "assistant", "content": "a" * 100},
                ],
                source_file="test1.md",
                tags=["#reflection", "#learning"],
                date="2026-04-12",
            ),
            TrainingExample(
                messages=[
                    {"role": "system", "content": "sys"},
                    {"role": "user", "content": "user"},
                    {"role": "assistant", "content": "b" * 200},
                ],
                source_file="test2.md",
                tags=["#reflection", "#decision"],
                date="2026-04-13",
            ),
        ]

        with tempfile.TemporaryDirectory() as tmp:
            config = self._load_config(tmp, Path(tmp))
            extractor = TrainingDataExtractor(config)
            extractor.examples = examples

            stats = extractor.get_stats()
            assert stats["total"] == 2
            assert stats["avg_length"] == 150  # (100 + 200) // 2
            assert len(stats["top_tags"]) > 0
            # "#reflection" appears in both examples, so it must rank first
            assert stats["top_tags"][0][0] == "#reflection"