WIP: Phase 4 forge extract module with tests

This commit is contained in:
2026-04-13 15:14:35 -04:00
parent 922e724cfe
commit f944bdc573
11 changed files with 1780 additions and 0 deletions

31
tests/test_api.py Normal file
View File

@@ -0,0 +1,31 @@
"""Simple smoke tests for FastAPI backend."""
import pytest
def test_api_imports():
    """Test that API module imports correctly."""
    # Any import-time error in the backend surfaces here
    from companion.api import app, ChatRequest

    for exported in (app, ChatRequest):
        assert exported is not None
def test_chat_request_model():
    """Test ChatRequest model validation."""
    from companion.api import ChatRequest

    # Request with an explicit session id
    with_session = ChatRequest(message="hello", session_id="abc123")
    assert with_session.message == "hello"
    assert with_session.session_id == "abc123"

    # Request carrying a temperature override
    with_temp = ChatRequest(message="hello", temperature=0.7)
    assert with_temp.temperature == 0.7

    # Minimal request: optional fields default to None
    minimal = ChatRequest(message="hello")
    assert minimal.session_id is None
    assert minimal.temperature is None

604
tests/test_forge_extract.py Normal file
View File

@@ -0,0 +1,604 @@
"""Tests for training data extractor."""
import tempfile
from pathlib import Path
import pytest
from companion.config import Config, VaultConfig, IndexingConfig
from companion.forge.extract import (
TrainingDataExtractor,
TrainingExample,
_create_training_example,
_extract_date_from_filename,
_has_reflection_patterns,
_has_reflection_tags,
_is_likely_reflection,
extract_training_data,
)
def test_has_reflection_tags():
    """Reflection-style tags are recognized; unrelated tags are not."""
    for tagged_text in ("#reflection on today's events", "#decision made today"):
        assert _has_reflection_tags(tagged_text)
    assert not _has_reflection_tags("#worklog entry")
def test_has_reflection_patterns():
    """Introspective phrasing is detected; plain factual text is not."""
    reflective_samples = [
        "I think this is important",
        "I wonder if I should change",
        "Looking back, I see the pattern",
    ]
    for sample in reflective_samples:
        assert _has_reflection_patterns(sample)
    assert not _has_reflection_patterns("The meeting was at 3pm")
def test_is_likely_reflection():
    """Both tag-based and pattern-based signals count as reflection."""
    cases = [
        ("#reflection I think this matters", True),
        ("I realize now that I was wrong", True),
        ("Just a regular note", False),
    ]
    for text, expected in cases:
        assert bool(_is_likely_reflection(text)) == expected
def test_extract_date_from_filename():
    """Date-bearing filenames yield their date string; others yield None."""
    expectations = {
        "2026-04-12.md": "2026-04-12",
        "12-Apr-2026.md": "12-Apr-2026",
        "2026-04-12-journal.md": "2026-04-12",
    }
    for filename, expected_date in expectations.items():
        assert _extract_date_from_filename(filename) == expected_date
    assert _extract_date_from_filename("notes.md") is None
def test_create_training_example():
    """A reflective chunk becomes a 3-message chat example."""
    text = "#reflection I think I need to reconsider my approach. The way I've been handling this isn't working."
    example = _create_training_example(
        chunk_text=text,
        source_file="journal/2026-04-12.md",
        tags=["#reflection"],
        date="2026-04-12",
    )
    assert example is not None
    # system/user/assistant ordering, with the chunk as the assistant turn
    roles = [message["role"] for message in example.messages]
    assert roles == ["system", "user", "assistant"]
    assert example.messages[-1]["content"] == text
    assert example.source_file == "journal/2026-04-12.md"
def test_create_training_example_too_short():
    """Chunks below the minimum length are rejected."""
    result = _create_training_example(
        chunk_text="I think.",  # Too short
        source_file="test.md",
        tags=["#reflection"],
        date=None,
    )
    assert result is None
def test_create_training_example_no_reflection():
    """Long but non-reflective text is rejected."""
    mundane = "This is just a regular note about the meeting at 3pm. Nothing special." * 5
    result = _create_training_example(
        chunk_text=mundane,
        source_file="test.md",
        tags=["#work"],
        date=None,
    )
    assert result is None
def test_training_example_to_dict():
    """to_dict preserves messages and metadata fields."""
    example = TrainingExample(
        messages=[
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi"},
        ],
        source_file="test.md",
        tags=["#test"],
        date="2026-04-12",
    )
    as_dict = example.to_dict()
    assert as_dict["messages"][0]["role"] == "user"
    assert as_dict["source_file"] == "test.md"
    assert as_dict["date"] == "2026-04-12"
class TestTrainingDataExtractor:
    """End-to-end tests for TrainingDataExtractor against a temp vault.

    NOTE(review): the original class body contained an orphaned config-dict
    fragment and a duplicated test tail (remnants of extracting
    ``_get_config_dict``) that made the file a SyntaxError; that dead code and
    the unused ``_save_to_jsonl_helper`` placeholder are removed here.
    ``test_get_stats`` now reuses ``_get_config_dict`` instead of repeating the
    whole config inline, and the write-config-then-load boilerplate shared by
    all three tests lives in ``_load_config``.
    """

    def _get_config_dict(self, vault_path: Path) -> dict:
        """Return minimal config dict for testing."""
        return {
            "companion": {
                "name": "SAN",
                "persona": {
                    "role": "companion",
                    "tone": "reflective",
                    "style": "questioning",
                    "boundaries": [],
                },
                "memory": {
                    "session_turns": 20,
                    "persistent_store": "",
                    "summarize_after": 10,
                },
                "chat": {
                    "streaming": True,
                    "max_response_tokens": 2048,
                    "default_temperature": 0.7,
                    "allow_temperature_override": True,
                },
            },
            "vault": {
                "path": str(vault_path),
                "indexing": {
                    "auto_sync": False,
                    "auto_sync_interval_minutes": 1440,
                    "watch_fs_events": False,
                    "file_patterns": ["*.md"],
                    "deny_dirs": [".git"],
                    "deny_patterns": [],
                },
                "chunking_rules": {},
            },
            "rag": {
                "embedding": {
                    "provider": "ollama",
                    "model": "mxbai-embed-large",
                    "base_url": "http://localhost:11434",
                    "dimensions": 1024,
                    "batch_size": 32,
                },
                "vector_store": {"type": "lancedb", "path": ".test.vectors"},
                "search": {
                    "default_top_k": 8,
                    "max_top_k": 20,
                    "similarity_threshold": 0.75,
                    "hybrid_search": {
                        "enabled": False,
                        "keyword_weight": 0.3,
                        "semantic_weight": 0.7,
                    },
                    "filters": {
                        "date_range_enabled": True,
                        "tag_filter_enabled": True,
                        "directory_filter_enabled": True,
                    },
                },
            },
            "model": {
                "inference": {
                    "backend": "llama.cpp",
                    "model_path": "",
                    "context_length": 8192,
                    "gpu_layers": 35,
                    "batch_size": 512,
                    "threads": 8,
                },
                "fine_tuning": {
                    "base_model": "",
                    "output_dir": "",
                    "lora_rank": 16,
                    "lora_alpha": 32,
                    "learning_rate": 0.0002,
                    "batch_size": 4,
                    "gradient_accumulation_steps": 4,
                    "num_epochs": 3,
                    "warmup_steps": 100,
                    "save_steps": 500,
                    "eval_steps": 250,
                    "training_data_path": "",
                    "validation_split": 0.1,
                },
                "retrain_schedule": {
                    "auto_reminder": True,
                    "default_interval_days": 90,
                    "reminder_channels": [],
                },
            },
            "api": {
                "host": "127.0.0.1",
                "port": 7373,
                "cors_origins": [],
                "auth": {"enabled": False},
            },
            "ui": {
                "web": {
                    "enabled": True,
                    "theme": "obsidian",
                    "features": {
                        "streaming": True,
                        "citations": True,
                        "source_preview": True,
                    },
                },
                "cli": {"enabled": True, "rich_output": True},
            },
            "logging": {
                "level": "INFO",
                "file": "",
                "max_size_mb": 100,
                "backup_count": 5,
            },
            "security": {
                "local_only": True,
                "vault_path_traversal_check": True,
                "sensitive_content_detection": True,
                "sensitive_patterns": [],
                "require_confirmation_for_external_apis": True,
            },
        }

    def _load_config(self, tmp_dir: Path):
        """Write the test config under *tmp_dir* and load it as a Config.

        Serializing to JSON and re-reading via ``load_config`` exercises the
        same loading path production uses; the vault path is *tmp_dir* itself.
        """
        import json

        from companion.config import load_config

        config_path = tmp_dir / "test_config.json"
        with open(config_path, "w") as f:
            json.dump(self._get_config_dict(tmp_dir), f)
        return load_config(config_path)

    def test_extract_from_single_file(self):
        """A journal note with reflective lines yields training examples."""
        with tempfile.TemporaryDirectory() as tmp:
            vault = Path(tmp)
            journal = vault / "Journal" / "2026" / "04"
            journal.mkdir(parents=True)
            content = """#DayInShort: Busy day
#reflection I think I need to slow down. The pace has been unsustainable.
#work Normal work day with meetings.
#insight I realize that I've been prioritizing urgency over importance.
"""
            (journal / "2026-04-12.md").write_text(content, encoding="utf-8")
            config = self._load_config(vault)
            extractor = TrainingDataExtractor(config)
            examples = extractor.extract()
            # Should extract at least 2 reflection examples
            assert len(examples) >= 2
            # Check they have the right structure
            for ex in examples:
                assert len(ex.messages) == 3
                assert ex.messages[2]["role"] == "assistant"

    def test_save_to_jsonl(self):
        """save_to_jsonl writes one JSON line per example and returns count."""
        with tempfile.TemporaryDirectory() as tmp:
            output = Path(tmp) / "training.jsonl"
            examples = [
                TrainingExample(
                    messages=[
                        {"role": "system", "content": "sys"},
                        {"role": "user", "content": "user"},
                        {"role": "assistant", "content": "assistant"},
                    ],
                    source_file="test.md",
                    tags=["#test"],
                    date="2026-04-12",
                )
            ]
            config = self._load_config(Path(tmp))
            extractor = TrainingDataExtractor(config)
            extractor.examples = examples
            count = extractor.save_to_jsonl(output)
            assert count == 1
            # Verify file content
            lines = output.read_text(encoding="utf-8").strip().split("\n")
            assert len(lines) == 1
            assert "assistant" in lines[0]

    def test_get_stats(self):
        """get_stats aggregates counts, average length, and top tags."""
        examples = [
            TrainingExample(
                messages=[
                    {"role": "system", "content": "sys"},
                    {"role": "user", "content": "user"},
                    {"role": "assistant", "content": "a" * 100},
                ],
                source_file="test1.md",
                tags=["#reflection", "#learning"],
                date="2026-04-12",
            ),
            TrainingExample(
                messages=[
                    {"role": "system", "content": "sys"},
                    {"role": "user", "content": "user"},
                    {"role": "assistant", "content": "b" * 200},
                ],
                source_file="test2.md",
                tags=["#reflection", "#decision"],
                date="2026-04-13",
            ),
        ]
        with tempfile.TemporaryDirectory() as tmp:
            config = self._load_config(Path(tmp))
            extractor = TrainingDataExtractor(config)
            extractor.examples = examples
            stats = extractor.get_stats()
            assert stats["total"] == 2
            assert stats["avg_length"] == 150  # (100 + 200) // 2
            assert len(stats["top_tags"]) > 0
            assert stats["top_tags"][0][0] == "#reflection"