WIP: Phase 4 forge extract module with tests

This commit is contained in:
2026-04-13 15:14:35 -04:00
parent 922e724cfe
commit f944bdc573
11 changed files with 1780 additions and 0 deletions

31
tests/test_api.py Normal file
View File

@@ -0,0 +1,31 @@
"""Simple smoke tests for FastAPI backend."""
import pytest
def test_api_imports():
    """Test that API module imports correctly."""
    # Any import-time error in the backend surfaces here
    from companion.api import app, ChatRequest

    for exported in (app, ChatRequest):
        assert exported is not None
def test_chat_request_model():
    """Test ChatRequest model validation."""
    from companion.api import ChatRequest

    # Request with an explicit session id
    with_session = ChatRequest(message="hello", session_id="abc123")
    assert with_session.message == "hello"
    assert with_session.session_id == "abc123"

    # Request carrying a temperature override
    with_temp = ChatRequest(message="hello", temperature=0.7)
    assert with_temp.temperature == 0.7

    # Minimal request: optional fields default to None
    minimal = ChatRequest(message="hello")
    assert minimal.session_id is None
    assert minimal.temperature is None

604
tests/test_forge_extract.py Normal file
View File

@@ -0,0 +1,604 @@
"""Tests for training data extractor."""
import tempfile
from pathlib import Path
import pytest
from companion.config import Config, VaultConfig, IndexingConfig
from companion.forge.extract import (
TrainingDataExtractor,
TrainingExample,
_create_training_example,
_extract_date_from_filename,
_has_reflection_patterns,
_has_reflection_tags,
_is_likely_reflection,
extract_training_data,
)
def test_has_reflection_tags():
    """Reflection-style tags are recognized; unrelated tags are not."""
    for tagged_text in ("#reflection on today's events", "#decision made today"):
        assert _has_reflection_tags(tagged_text)
    assert not _has_reflection_tags("#worklog entry")
def test_has_reflection_patterns():
    """Introspective phrasing is detected; plain factual text is not."""
    reflective_samples = [
        "I think this is important",
        "I wonder if I should change",
        "Looking back, I see the pattern",
    ]
    for sample in reflective_samples:
        assert _has_reflection_patterns(sample)
    assert not _has_reflection_patterns("The meeting was at 3pm")
def test_is_likely_reflection():
    """Both tag-based and pattern-based signals count as reflection."""
    cases = [
        ("#reflection I think this matters", True),
        ("I realize now that I was wrong", True),
        ("Just a regular note", False),
    ]
    for text, expected in cases:
        assert bool(_is_likely_reflection(text)) == expected
def test_extract_date_from_filename():
    """Date-bearing filenames yield their date string; others yield None."""
    expectations = {
        "2026-04-12.md": "2026-04-12",
        "12-Apr-2026.md": "12-Apr-2026",
        "2026-04-12-journal.md": "2026-04-12",
    }
    for filename, expected_date in expectations.items():
        assert _extract_date_from_filename(filename) == expected_date
    assert _extract_date_from_filename("notes.md") is None
def test_create_training_example():
    """A reflective chunk becomes a 3-message chat example."""
    text = "#reflection I think I need to reconsider my approach. The way I've been handling this isn't working."
    example = _create_training_example(
        chunk_text=text,
        source_file="journal/2026-04-12.md",
        tags=["#reflection"],
        date="2026-04-12",
    )
    assert example is not None
    # system/user/assistant ordering, with the chunk as the assistant turn
    roles = [message["role"] for message in example.messages]
    assert roles == ["system", "user", "assistant"]
    assert example.messages[-1]["content"] == text
    assert example.source_file == "journal/2026-04-12.md"
def test_create_training_example_too_short():
    """Chunks below the minimum length are rejected."""
    result = _create_training_example(
        chunk_text="I think.",  # Too short
        source_file="test.md",
        tags=["#reflection"],
        date=None,
    )
    assert result is None
def test_create_training_example_no_reflection():
    """Long but non-reflective text is rejected."""
    mundane = "This is just a regular note about the meeting at 3pm. Nothing special." * 5
    result = _create_training_example(
        chunk_text=mundane,
        source_file="test.md",
        tags=["#work"],
        date=None,
    )
    assert result is None
def test_training_example_to_dict():
    """to_dict preserves messages and metadata fields."""
    example = TrainingExample(
        messages=[
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi"},
        ],
        source_file="test.md",
        tags=["#test"],
        date="2026-04-12",
    )
    as_dict = example.to_dict()
    assert as_dict["messages"][0]["role"] == "user"
    assert as_dict["source_file"] == "test.md"
    assert as_dict["date"] == "2026-04-12"
class TestTrainingDataExtractor:
    """End-to-end tests for TrainingDataExtractor against a temp vault.

    NOTE(review): the original class body contained an orphaned config-dict
    fragment and a duplicated test tail (remnants of extracting
    ``_get_config_dict``) that made the file a SyntaxError; that dead code and
    the unused ``_save_to_jsonl_helper`` placeholder are removed here.
    ``test_get_stats`` now reuses ``_get_config_dict`` instead of repeating the
    whole config inline, and the write-config-then-load boilerplate shared by
    all three tests lives in ``_load_config``.
    """

    def _get_config_dict(self, vault_path: Path) -> dict:
        """Return minimal config dict for testing."""
        return {
            "companion": {
                "name": "SAN",
                "persona": {
                    "role": "companion",
                    "tone": "reflective",
                    "style": "questioning",
                    "boundaries": [],
                },
                "memory": {
                    "session_turns": 20,
                    "persistent_store": "",
                    "summarize_after": 10,
                },
                "chat": {
                    "streaming": True,
                    "max_response_tokens": 2048,
                    "default_temperature": 0.7,
                    "allow_temperature_override": True,
                },
            },
            "vault": {
                "path": str(vault_path),
                "indexing": {
                    "auto_sync": False,
                    "auto_sync_interval_minutes": 1440,
                    "watch_fs_events": False,
                    "file_patterns": ["*.md"],
                    "deny_dirs": [".git"],
                    "deny_patterns": [],
                },
                "chunking_rules": {},
            },
            "rag": {
                "embedding": {
                    "provider": "ollama",
                    "model": "mxbai-embed-large",
                    "base_url": "http://localhost:11434",
                    "dimensions": 1024,
                    "batch_size": 32,
                },
                "vector_store": {"type": "lancedb", "path": ".test.vectors"},
                "search": {
                    "default_top_k": 8,
                    "max_top_k": 20,
                    "similarity_threshold": 0.75,
                    "hybrid_search": {
                        "enabled": False,
                        "keyword_weight": 0.3,
                        "semantic_weight": 0.7,
                    },
                    "filters": {
                        "date_range_enabled": True,
                        "tag_filter_enabled": True,
                        "directory_filter_enabled": True,
                    },
                },
            },
            "model": {
                "inference": {
                    "backend": "llama.cpp",
                    "model_path": "",
                    "context_length": 8192,
                    "gpu_layers": 35,
                    "batch_size": 512,
                    "threads": 8,
                },
                "fine_tuning": {
                    "base_model": "",
                    "output_dir": "",
                    "lora_rank": 16,
                    "lora_alpha": 32,
                    "learning_rate": 0.0002,
                    "batch_size": 4,
                    "gradient_accumulation_steps": 4,
                    "num_epochs": 3,
                    "warmup_steps": 100,
                    "save_steps": 500,
                    "eval_steps": 250,
                    "training_data_path": "",
                    "validation_split": 0.1,
                },
                "retrain_schedule": {
                    "auto_reminder": True,
                    "default_interval_days": 90,
                    "reminder_channels": [],
                },
            },
            "api": {
                "host": "127.0.0.1",
                "port": 7373,
                "cors_origins": [],
                "auth": {"enabled": False},
            },
            "ui": {
                "web": {
                    "enabled": True,
                    "theme": "obsidian",
                    "features": {
                        "streaming": True,
                        "citations": True,
                        "source_preview": True,
                    },
                },
                "cli": {"enabled": True, "rich_output": True},
            },
            "logging": {
                "level": "INFO",
                "file": "",
                "max_size_mb": 100,
                "backup_count": 5,
            },
            "security": {
                "local_only": True,
                "vault_path_traversal_check": True,
                "sensitive_content_detection": True,
                "sensitive_patterns": [],
                "require_confirmation_for_external_apis": True,
            },
        }

    def _load_config(self, tmp_dir: Path):
        """Write the test config under *tmp_dir* and load it as a Config.

        Serializing to JSON and re-reading via ``load_config`` exercises the
        same loading path production uses; the vault path is *tmp_dir* itself.
        """
        import json

        from companion.config import load_config

        config_path = tmp_dir / "test_config.json"
        with open(config_path, "w") as f:
            json.dump(self._get_config_dict(tmp_dir), f)
        return load_config(config_path)

    def test_extract_from_single_file(self):
        """A journal note with reflective lines yields training examples."""
        with tempfile.TemporaryDirectory() as tmp:
            vault = Path(tmp)
            journal = vault / "Journal" / "2026" / "04"
            journal.mkdir(parents=True)
            content = """#DayInShort: Busy day
#reflection I think I need to slow down. The pace has been unsustainable.
#work Normal work day with meetings.
#insight I realize that I've been prioritizing urgency over importance.
"""
            (journal / "2026-04-12.md").write_text(content, encoding="utf-8")
            config = self._load_config(vault)
            extractor = TrainingDataExtractor(config)
            examples = extractor.extract()
            # Should extract at least 2 reflection examples
            assert len(examples) >= 2
            # Check they have the right structure
            for ex in examples:
                assert len(ex.messages) == 3
                assert ex.messages[2]["role"] == "assistant"

    def test_save_to_jsonl(self):
        """save_to_jsonl writes one JSON line per example and returns count."""
        with tempfile.TemporaryDirectory() as tmp:
            output = Path(tmp) / "training.jsonl"
            examples = [
                TrainingExample(
                    messages=[
                        {"role": "system", "content": "sys"},
                        {"role": "user", "content": "user"},
                        {"role": "assistant", "content": "assistant"},
                    ],
                    source_file="test.md",
                    tags=["#test"],
                    date="2026-04-12",
                )
            ]
            config = self._load_config(Path(tmp))
            extractor = TrainingDataExtractor(config)
            extractor.examples = examples
            count = extractor.save_to_jsonl(output)
            assert count == 1
            # Verify file content
            lines = output.read_text(encoding="utf-8").strip().split("\n")
            assert len(lines) == 1
            assert "assistant" in lines[0]

    def test_get_stats(self):
        """get_stats aggregates counts, average length, and top tags."""
        examples = [
            TrainingExample(
                messages=[
                    {"role": "system", "content": "sys"},
                    {"role": "user", "content": "user"},
                    {"role": "assistant", "content": "a" * 100},
                ],
                source_file="test1.md",
                tags=["#reflection", "#learning"],
                date="2026-04-12",
            ),
            TrainingExample(
                messages=[
                    {"role": "system", "content": "sys"},
                    {"role": "user", "content": "user"},
                    {"role": "assistant", "content": "b" * 200},
                ],
                source_file="test2.md",
                tags=["#reflection", "#decision"],
                date="2026-04-13",
            ),
        ]
        with tempfile.TemporaryDirectory() as tmp:
            config = self._load_config(Path(tmp))
            extractor = TrainingDataExtractor(config)
            extractor.examples = examples
            stats = extractor.get_stats()
            assert stats["total"] == 2
            assert stats["avg_length"] == 150  # (100 + 200) // 2
            assert len(stats["top_tags"]) > 0
            assert stats["top_tags"][0][0] == "#reflection"