WIP: Phase 4 forge extract module with tests
This commit is contained in:
31
tests/test_api.py
Normal file
31
tests/test_api.py
Normal file
@@ -0,0 +1,31 @@
|
||||
"""Simple smoke tests for FastAPI backend."""
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_api_imports():
    """Smoke test: the API module must import and expose its public objects."""
    # An import error anywhere in companion.api surfaces here immediately.
    from companion.api import app, ChatRequest

    for exported in (app, ChatRequest):
        assert exported is not None
|
||||
|
||||
|
||||
def test_chat_request_model():
    """Exercise ChatRequest validation for full, partial, and minimal inputs."""
    from companion.api import ChatRequest

    # Full request: message plus session id.
    full = ChatRequest(message="hello", session_id="abc123")
    assert (full.message, full.session_id) == ("hello", "abc123")

    # Request carrying an explicit sampling temperature.
    with_temp = ChatRequest(message="hello", temperature=0.7)
    assert with_temp.temperature == 0.7

    # Minimal request: optional fields default to None.
    minimal = ChatRequest(message="hello")
    assert minimal.session_id is None
    assert minimal.temperature is None
|
||||
604
tests/test_forge_extract.py
Normal file
604
tests/test_forge_extract.py
Normal file
@@ -0,0 +1,604 @@
|
||||
"""Tests for training data extractor."""
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from companion.config import Config, VaultConfig, IndexingConfig
|
||||
from companion.forge.extract import (
|
||||
TrainingDataExtractor,
|
||||
TrainingExample,
|
||||
_create_training_example,
|
||||
_extract_date_from_filename,
|
||||
_has_reflection_patterns,
|
||||
_has_reflection_tags,
|
||||
_is_likely_reflection,
|
||||
extract_training_data,
|
||||
)
|
||||
|
||||
|
||||
def test_has_reflection_tags():
    """Reflective tags (#reflection, #decision) are detected; others are not."""
    for reflective in ("#reflection on today's events", "#decision made today"):
        assert _has_reflection_tags(reflective)
    assert not _has_reflection_tags("#worklog entry")
|
||||
|
||||
|
||||
def test_has_reflection_patterns():
    """Introspective phrasing is recognized; plain factual text is not."""
    reflective_samples = (
        "I think this is important",
        "I wonder if I should change",
        "Looking back, I see the pattern",
    )
    for sample in reflective_samples:
        assert _has_reflection_patterns(sample)
    assert not _has_reflection_patterns("The meeting was at 3pm")
|
||||
|
||||
|
||||
def test_is_likely_reflection():
    """Combined heuristic accepts tagged or introspective text, rejects plain notes."""
    cases = [
        ("#reflection I think this matters", True),
        ("I realize now that I was wrong", True),
        ("Just a regular note", False),
    ]
    for text, expected in cases:
        assert bool(_is_likely_reflection(text)) == expected
|
||||
|
||||
|
||||
def test_extract_date_from_filename():
    """Dates are pulled from common journal filename shapes; None otherwise."""
    expectations = {
        "2026-04-12.md": "2026-04-12",
        "12-Apr-2026.md": "12-Apr-2026",
        "2026-04-12-journal.md": "2026-04-12",
        "notes.md": None,
    }
    for filename, expected in expectations.items():
        assert _extract_date_from_filename(filename) == expected
|
||||
|
||||
|
||||
def test_create_training_example():
    """A reflective chunk becomes a system/user/assistant chat example."""
    chunk = (
        "#reflection I think I need to reconsider my approach. "
        "The way I've been handling this isn't working."
    )
    result = _create_training_example(
        chunk_text=chunk,
        source_file="journal/2026-04-12.md",
        tags=["#reflection"],
        date="2026-04-12",
    )

    assert result is not None
    # Exactly three messages, in the standard chat order.
    roles = [message["role"] for message in result.messages]
    assert roles == ["system", "user", "assistant"]
    # The assistant turn carries the original chunk verbatim.
    assert result.messages[-1]["content"] == chunk
    assert result.source_file == "journal/2026-04-12.md"
|
||||
|
||||
|
||||
def test_create_training_example_too_short():
    """Chunks below the minimum length are rejected outright."""
    result = _create_training_example(
        chunk_text="I think.",  # below the minimum length threshold
        source_file="test.md",
        tags=["#reflection"],
        date=None,
    )
    assert result is None
|
||||
|
||||
|
||||
def test_create_training_example_no_reflection():
    """Long but non-reflective text yields no training example."""
    mundane = "This is just a regular note about the meeting at 3pm. Nothing special." * 5
    result = _create_training_example(
        chunk_text=mundane,
        source_file="test.md",
        tags=["#work"],  # no reflection tag, no reflective phrasing
        date=None,
    )
    assert result is None
|
||||
|
||||
|
||||
def test_training_example_to_dict():
    """to_dict() serializes messages and metadata into a plain dict."""
    sample = TrainingExample(
        messages=[
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi"},
        ],
        source_file="test.md",
        tags=["#test"],
        date="2026-04-12",
    )

    payload = sample.to_dict()
    assert payload["messages"][0]["role"] == "user"
    assert payload["source_file"] == "test.md"
    assert payload["date"] == "2026-04-12"
|
||||
|
||||
|
||||
class TestTrainingDataExtractor:
|
||||
def _get_config_dict(self, vault_path: Path) -> dict:
|
||||
"""Return minimal config dict for testing."""
|
||||
return {
|
||||
"companion": {
|
||||
"name": "SAN",
|
||||
"persona": {
|
||||
"role": "companion",
|
||||
"tone": "reflective",
|
||||
"style": "questioning",
|
||||
"boundaries": [],
|
||||
},
|
||||
"memory": {
|
||||
"session_turns": 20,
|
||||
"persistent_store": "",
|
||||
"summarize_after": 10,
|
||||
},
|
||||
"chat": {
|
||||
"streaming": True,
|
||||
"max_response_tokens": 2048,
|
||||
"default_temperature": 0.7,
|
||||
"allow_temperature_override": True,
|
||||
},
|
||||
},
|
||||
"vault": {
|
||||
"path": str(vault_path),
|
||||
"indexing": {
|
||||
"auto_sync": False,
|
||||
"auto_sync_interval_minutes": 1440,
|
||||
"watch_fs_events": False,
|
||||
"file_patterns": ["*.md"],
|
||||
"deny_dirs": [".git"],
|
||||
"deny_patterns": [],
|
||||
},
|
||||
"chunking_rules": {},
|
||||
},
|
||||
"rag": {
|
||||
"embedding": {
|
||||
"provider": "ollama",
|
||||
"model": "mxbai-embed-large",
|
||||
"base_url": "http://localhost:11434",
|
||||
"dimensions": 1024,
|
||||
"batch_size": 32,
|
||||
},
|
||||
"vector_store": {"type": "lancedb", "path": ".test.vectors"},
|
||||
"search": {
|
||||
"default_top_k": 8,
|
||||
"max_top_k": 20,
|
||||
"similarity_threshold": 0.75,
|
||||
"hybrid_search": {
|
||||
"enabled": False,
|
||||
"keyword_weight": 0.3,
|
||||
"semantic_weight": 0.7,
|
||||
},
|
||||
"filters": {
|
||||
"date_range_enabled": True,
|
||||
"tag_filter_enabled": True,
|
||||
"directory_filter_enabled": True,
|
||||
},
|
||||
},
|
||||
},
|
||||
"model": {
|
||||
"inference": {
|
||||
"backend": "llama.cpp",
|
||||
"model_path": "",
|
||||
"context_length": 8192,
|
||||
"gpu_layers": 35,
|
||||
"batch_size": 512,
|
||||
"threads": 8,
|
||||
},
|
||||
"fine_tuning": {
|
||||
"base_model": "",
|
||||
"output_dir": "",
|
||||
"lora_rank": 16,
|
||||
"lora_alpha": 32,
|
||||
"learning_rate": 0.0002,
|
||||
"batch_size": 4,
|
||||
"gradient_accumulation_steps": 4,
|
||||
"num_epochs": 3,
|
||||
"warmup_steps": 100,
|
||||
"save_steps": 500,
|
||||
"eval_steps": 250,
|
||||
"training_data_path": "",
|
||||
"validation_split": 0.1,
|
||||
},
|
||||
"retrain_schedule": {
|
||||
"auto_reminder": True,
|
||||
"default_interval_days": 90,
|
||||
"reminder_channels": [],
|
||||
},
|
||||
},
|
||||
"api": {
|
||||
"host": "127.0.0.1",
|
||||
"port": 7373,
|
||||
"cors_origins": [],
|
||||
"auth": {"enabled": False},
|
||||
},
|
||||
"ui": {
|
||||
"web": {
|
||||
"enabled": True,
|
||||
"theme": "obsidian",
|
||||
"features": {
|
||||
"streaming": True,
|
||||
"citations": True,
|
||||
"source_preview": True,
|
||||
},
|
||||
},
|
||||
"cli": {"enabled": True, "rich_output": True},
|
||||
},
|
||||
"logging": {
|
||||
"level": "INFO",
|
||||
"file": "",
|
||||
"max_size_mb": 100,
|
||||
"backup_count": 5,
|
||||
},
|
||||
"security": {
|
||||
"local_only": True,
|
||||
"vault_path_traversal_check": True,
|
||||
"sensitive_content_detection": True,
|
||||
"sensitive_patterns": [],
|
||||
"require_confirmation_for_external_apis": True,
|
||||
},
|
||||
}
|
||||
|
||||
def test_extract_from_single_file(self):
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
vault = Path(tmp)
|
||||
journal = vault / "Journal" / "2026" / "04"
|
||||
journal.mkdir(parents=True)
|
||||
|
||||
content = """#DayInShort: Busy day
|
||||
|
||||
#reflection I think I need to slow down. The pace has been unsustainable.
|
||||
|
||||
#work Normal work day with meetings.
|
||||
|
||||
#insight I realize that I've been prioritizing urgency over importance.
|
||||
"""
|
||||
(journal / "2026-04-12.md").write_text(content, encoding="utf-8")
|
||||
|
||||
# Use helper method for config
|
||||
from companion.config import load_config
|
||||
import json
|
||||
|
||||
config_dict = self._get_config_dict(vault)
|
||||
config_path = Path(tmp) / "test_config.json"
|
||||
with open(config_path, "w") as f:
|
||||
json.dump(config_dict, f)
|
||||
|
||||
config = load_config(config_path)
|
||||
extractor = TrainingDataExtractor(config)
|
||||
examples = extractor.extract()
|
||||
|
||||
# Should extract at least 2 reflection examples
|
||||
assert len(examples) >= 2
|
||||
|
||||
# Check they have the right structure
|
||||
for ex in examples:
|
||||
assert len(ex.messages) == 3
|
||||
assert ex.messages[2]["role"] == "assistant"
|
||||
|
||||
def _save_to_jsonl_helper(self):
|
||||
"""Helper extracted to reduce nesting."""
|
||||
pass # placeholder
|
||||
"companion": {
|
||||
"name": "SAN",
|
||||
"persona": {
|
||||
"role": "companion",
|
||||
"tone": "reflective",
|
||||
"style": "questioning",
|
||||
"boundaries": [],
|
||||
},
|
||||
"memory": {
|
||||
"session_turns": 20,
|
||||
"persistent_store": "",
|
||||
"summarize_after": 10,
|
||||
},
|
||||
"chat": {
|
||||
"streaming": True,
|
||||
"max_response_tokens": 2048,
|
||||
"default_temperature": 0.7,
|
||||
"allow_temperature_override": True,
|
||||
},
|
||||
},
|
||||
"vault": {
|
||||
"path": str(vault),
|
||||
"indexing": {
|
||||
"auto_sync": False,
|
||||
"auto_sync_interval_minutes": 1440,
|
||||
"watch_fs_events": False,
|
||||
"file_patterns": ["*.md"],
|
||||
"deny_dirs": [".git"],
|
||||
"deny_patterns": [],
|
||||
},
|
||||
"chunking_rules": {},
|
||||
},
|
||||
"rag": {
|
||||
"embedding": {
|
||||
"provider": "ollama",
|
||||
"model": "mxbai-embed-large",
|
||||
"base_url": "http://localhost:11434",
|
||||
"dimensions": 1024,
|
||||
"batch_size": 32,
|
||||
},
|
||||
"vector_store": {"type": "lancedb", "path": ".test.vectors"},
|
||||
"search": {
|
||||
"default_top_k": 8,
|
||||
"max_top_k": 20,
|
||||
"similarity_threshold": 0.75,
|
||||
"hybrid_search": {
|
||||
"enabled": False,
|
||||
"keyword_weight": 0.3,
|
||||
"semantic_weight": 0.7,
|
||||
},
|
||||
"filters": {
|
||||
"date_range_enabled": True,
|
||||
"tag_filter_enabled": True,
|
||||
"directory_filter_enabled": True,
|
||||
},
|
||||
},
|
||||
},
|
||||
"model": {
|
||||
"inference": {
|
||||
"backend": "llama.cpp",
|
||||
"model_path": "",
|
||||
"context_length": 8192,
|
||||
"gpu_layers": 35,
|
||||
"batch_size": 512,
|
||||
"threads": 8,
|
||||
},
|
||||
"fine_tuning": {
|
||||
"base_model": "",
|
||||
"output_dir": "",
|
||||
"lora_rank": 16,
|
||||
"lora_alpha": 32,
|
||||
"learning_rate": 0.0002,
|
||||
"batch_size": 4,
|
||||
"gradient_accumulation_steps": 4,
|
||||
"num_epochs": 3,
|
||||
"warmup_steps": 100,
|
||||
"save_steps": 500,
|
||||
"eval_steps": 250,
|
||||
"training_data_path": "",
|
||||
"validation_split": 0.1,
|
||||
},
|
||||
"retrain_schedule": {
|
||||
"auto_reminder": True,
|
||||
"default_interval_days": 90,
|
||||
"reminder_channels": [],
|
||||
},
|
||||
},
|
||||
"api": {
|
||||
"host": "127.0.0.1",
|
||||
"port": 7373,
|
||||
"cors_origins": [],
|
||||
"auth": {"enabled": False},
|
||||
},
|
||||
"ui": {
|
||||
"web": {
|
||||
"enabled": True,
|
||||
"theme": "obsidian",
|
||||
"features": {
|
||||
"streaming": True,
|
||||
"citations": True,
|
||||
"source_preview": True,
|
||||
},
|
||||
},
|
||||
"cli": {"enabled": True, "rich_output": True},
|
||||
},
|
||||
"logging": {
|
||||
"level": "INFO",
|
||||
"file": "",
|
||||
"max_size_mb": 100,
|
||||
"backup_count": 5,
|
||||
},
|
||||
"security": {
|
||||
"local_only": True,
|
||||
"vault_path_traversal_check": True,
|
||||
"sensitive_content_detection": True,
|
||||
"sensitive_patterns": [],
|
||||
"require_confirmation_for_external_apis": True,
|
||||
},
|
||||
}
|
||||
|
||||
config_path = Path(tmp) / "test_config.json"
|
||||
with open(config_path, "w") as f:
|
||||
json.dump(config_dict, f)
|
||||
|
||||
config = load_config(config_path)
|
||||
extractor = TrainingDataExtractor(config)
|
||||
examples = extractor.extract()
|
||||
|
||||
# Should extract at least 2 reflection examples
|
||||
assert len(examples) >= 2
|
||||
|
||||
# Check they have the right structure
|
||||
for ex in examples:
|
||||
assert len(ex.messages) == 3
|
||||
assert ex.messages[2]["role"] == "assistant"
|
||||
|
||||
def test_save_to_jsonl(self):
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
output = Path(tmp) / "training.jsonl"
|
||||
|
||||
examples = [
|
||||
TrainingExample(
|
||||
messages=[
|
||||
{"role": "system", "content": "sys"},
|
||||
{"role": "user", "content": "user"},
|
||||
{"role": "assistant", "content": "assistant"},
|
||||
],
|
||||
source_file="test.md",
|
||||
tags=["#test"],
|
||||
date="2026-04-12",
|
||||
)
|
||||
]
|
||||
|
||||
# Create minimal config for extractor
|
||||
config_dict = self._get_config_dict(Path(tmp))
|
||||
config_path = Path(tmp) / "test_config.json"
|
||||
import json
|
||||
|
||||
with open(config_path, "w") as f:
|
||||
json.dump(config_dict, f)
|
||||
|
||||
from companion.config import load_config
|
||||
|
||||
config = load_config(config_path)
|
||||
extractor = TrainingDataExtractor(config)
|
||||
extractor.examples = examples
|
||||
|
||||
count = extractor.save_to_jsonl(output)
|
||||
assert count == 1
|
||||
|
||||
# Verify file content
|
||||
lines = output.read_text(encoding="utf-8").strip().split("\n")
|
||||
assert len(lines) == 1
|
||||
assert "assistant" in lines[0]
|
||||
|
||||
def test_get_stats(self):
|
||||
examples = [
|
||||
TrainingExample(
|
||||
messages=[
|
||||
{"role": "system", "content": "sys"},
|
||||
{"role": "user", "content": "user"},
|
||||
{"role": "assistant", "content": "a" * 100},
|
||||
],
|
||||
source_file="test1.md",
|
||||
tags=["#reflection", "#learning"],
|
||||
date="2026-04-12",
|
||||
),
|
||||
TrainingExample(
|
||||
messages=[
|
||||
{"role": "system", "content": "sys"},
|
||||
{"role": "user", "content": "user"},
|
||||
{"role": "assistant", "content": "b" * 200},
|
||||
],
|
||||
source_file="test2.md",
|
||||
tags=["#reflection", "#decision"],
|
||||
date="2026-04-13",
|
||||
),
|
||||
]
|
||||
|
||||
# Create minimal config
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
config_dict = {
|
||||
"companion": {
|
||||
"name": "SAN",
|
||||
"persona": {
|
||||
"role": "companion",
|
||||
"tone": "reflective",
|
||||
"style": "questioning",
|
||||
"boundaries": [],
|
||||
},
|
||||
"memory": {
|
||||
"session_turns": 20,
|
||||
"persistent_store": "",
|
||||
"summarize_after": 10,
|
||||
},
|
||||
"chat": {
|
||||
"streaming": True,
|
||||
"max_response_tokens": 2048,
|
||||
"default_temperature": 0.7,
|
||||
"allow_temperature_override": True,
|
||||
},
|
||||
},
|
||||
"vault": {
|
||||
"path": str(tmp),
|
||||
"indexing": {
|
||||
"auto_sync": False,
|
||||
"auto_sync_interval_minutes": 1440,
|
||||
"watch_fs_events": False,
|
||||
"file_patterns": ["*.md"],
|
||||
"deny_dirs": [".git"],
|
||||
"deny_patterns": [],
|
||||
},
|
||||
"chunking_rules": {},
|
||||
},
|
||||
"rag": {
|
||||
"embedding": {
|
||||
"provider": "ollama",
|
||||
"model": "mxbai-embed-large",
|
||||
"base_url": "http://localhost:11434",
|
||||
"dimensions": 1024,
|
||||
"batch_size": 32,
|
||||
},
|
||||
"vector_store": {"type": "lancedb", "path": ".test.vectors"},
|
||||
"search": {
|
||||
"default_top_k": 8,
|
||||
"max_top_k": 20,
|
||||
"similarity_threshold": 0.75,
|
||||
"hybrid_search": {
|
||||
"enabled": False,
|
||||
"keyword_weight": 0.3,
|
||||
"semantic_weight": 0.7,
|
||||
},
|
||||
"filters": {
|
||||
"date_range_enabled": True,
|
||||
"tag_filter_enabled": True,
|
||||
"directory_filter_enabled": True,
|
||||
},
|
||||
},
|
||||
},
|
||||
"model": {
|
||||
"inference": {
|
||||
"backend": "llama.cpp",
|
||||
"model_path": "",
|
||||
"context_length": 8192,
|
||||
"gpu_layers": 35,
|
||||
"batch_size": 512,
|
||||
"threads": 8,
|
||||
},
|
||||
"fine_tuning": {
|
||||
"base_model": "",
|
||||
"output_dir": "",
|
||||
"lora_rank": 16,
|
||||
"lora_alpha": 32,
|
||||
"learning_rate": 0.0002,
|
||||
"batch_size": 4,
|
||||
"gradient_accumulation_steps": 4,
|
||||
"num_epochs": 3,
|
||||
"warmup_steps": 100,
|
||||
"save_steps": 500,
|
||||
"eval_steps": 250,
|
||||
"training_data_path": "",
|
||||
"validation_split": 0.1,
|
||||
},
|
||||
"retrain_schedule": {
|
||||
"auto_reminder": True,
|
||||
"default_interval_days": 90,
|
||||
"reminder_channels": [],
|
||||
},
|
||||
},
|
||||
"api": {
|
||||
"host": "127.0.0.1",
|
||||
"port": 7373,
|
||||
"cors_origins": [],
|
||||
"auth": {"enabled": False},
|
||||
},
|
||||
"ui": {
|
||||
"web": {
|
||||
"enabled": True,
|
||||
"theme": "obsidian",
|
||||
"features": {
|
||||
"streaming": True,
|
||||
"citations": True,
|
||||
"source_preview": True,
|
||||
},
|
||||
},
|
||||
"cli": {"enabled": True, "rich_output": True},
|
||||
},
|
||||
"logging": {
|
||||
"level": "INFO",
|
||||
"file": "",
|
||||
"max_size_mb": 100,
|
||||
"backup_count": 5,
|
||||
},
|
||||
"security": {
|
||||
"local_only": True,
|
||||
"vault_path_traversal_check": True,
|
||||
"sensitive_content_detection": True,
|
||||
"sensitive_patterns": [],
|
||||
"require_confirmation_for_external_apis": True,
|
||||
},
|
||||
}
|
||||
config_path = Path(tmp) / "test_config.json"
|
||||
import json
|
||||
|
||||
with open(config_path, "w") as f:
|
||||
json.dump(config_dict, f)
|
||||
|
||||
from companion.config import load_config
|
||||
|
||||
config = load_config(config_path)
|
||||
extractor = TrainingDataExtractor(config)
|
||||
extractor.examples = examples
|
||||
|
||||
stats = extractor.get_stats()
|
||||
assert stats["total"] == 2
|
||||
assert stats["avg_length"] == 150 # (100 + 200) // 2
|
||||
assert len(stats["top_tags"]) > 0
|
||||
assert stats["top_tags"][0][0] == "#reflection"
|
||||
Reference in New Issue
Block a user