The Trial - Initial commit

This commit is contained in:
2026-01-17 14:59:35 -05:00
commit c401cf655d
27 changed files with 132452 additions and 0 deletions

278
scripts/cpu_training.py Normal file
View File

@@ -0,0 +1,278 @@
#!/usr/bin/env python3
"""
CPU-Compatible Training Script for The Trial SLM
Simplified approach that works without GPU
"""
import json
import logging
import os
from pathlib import Path
# Configure logging
logging.basicConfig(level=logging.INFO)  # emit INFO+ records via the stdlib default handler
logger = logging.getLogger(__name__)  # module-scoped logger, per stdlib convention
class SimpleMonteCristoTrainer:
    """Simplified trainer that creates knowledge base and prompts.

    Produces three artifacts under ``output_dir``: an Ollama ``Modelfile``,
    ``test_prompts.json`` and ``training_summary.json``. No model weights are
    touched; "training" here means knowledge/prompt generation only.

    NOTE(review): the class name says "MonteCristo" while every prompt below
    targets "The Trial" and credits it to Alexandre Dumas — The Trial is by
    Franz Kafka, so the project appears to conflate two novels; confirm intent.
    """

    def __init__(self, data_dir: str = "data", output_dir: str = "models"):
        # Root folders for the input datasets and the generated artifacts.
        self.data_dir = Path(data_dir)
        self.output_dir = Path(output_dir)
        # Create the output directory eagerly so later writes cannot fail on it.
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def load_training_data(self):
        """Load training datasets.

        Returns a dict keyed by dataset kind ("factual", "analysis",
        "creative"); kinds whose JSON file is missing are silently omitted.
        """
        datasets = {}
        # Load factual Q&A
        qa_file = self.data_dir / "training" / "factual_qa.json"
        if qa_file.exists():
            with open(qa_file, "r", encoding="utf-8") as f:
                datasets["factual"] = json.load(f)
        # Load analysis data
        analysis_file = self.data_dir / "training" / "literary_analysis.json"
        if analysis_file.exists():
            with open(analysis_file, "r", encoding="utf-8") as f:
                datasets["analysis"] = json.load(f)
        # Load creative writing
        creative_file = self.data_dir / "training" / "creative_writing.json"
        if creative_file.exists():
            with open(creative_file, "r", encoding="utf-8") as f:
                datasets["creative"] = json.load(f)
        return datasets

    def create_knowledge_base(self, datasets):
        """Create structured knowledge base.

        Reshapes the flat example lists into lookup tables:
        characters -> {name: {questions: [...], answers: [...]}},
        plot_points -> {question: answer}, and themes/symbols/style_elements
        -> {name: [output, ...]}.
        """
        knowledge_base = {
            "characters": {},
            "themes": {},
            "plot_points": {},
            "symbols": {},
            "style_elements": {},
        }
        # Process factual data for characters and plot
        for item in datasets.get("factual", []):
            if "character" in item:
                char = item["character"]
                if char not in knowledge_base["characters"]:
                    knowledge_base["characters"][char] = {
                        "questions": [],
                        "answers": [],
                    }
                # Questions and answers are kept in parallel lists (same index).
                knowledge_base["characters"][char]["questions"].append(
                    item["instruction"]
                )
                knowledge_base["characters"][char]["answers"].append(item["output"])
            if "topic" in item and item["topic"] == "plot":
                knowledge_base["plot_points"][item["instruction"]] = item["output"]
        # Process analysis data for themes and symbols
        for item in datasets.get("analysis", []):
            if "theme" in item:
                theme = item["theme"]
                if theme not in knowledge_base["themes"]:
                    knowledge_base["themes"][theme] = []
                knowledge_base["themes"][theme].append(item["output"])
            if "symbol" in item:
                symbol = item["symbol"]
                if symbol not in knowledge_base["symbols"]:
                    knowledge_base["symbols"][symbol] = []
                knowledge_base["symbols"][symbol].append(item["output"])
        # Process creative data for style
        for item in datasets.get("creative", []):
            if "style" in item:
                style = item["style"]
                if style not in knowledge_base["style_elements"]:
                    knowledge_base["style_elements"][style] = []
                knowledge_base["style_elements"][style].append(item["output"])
        return knowledge_base

    def create_system_prompts(self):
        """Create system prompts for different contexts.

        One prompt per response mode; "default" is the one embedded in the
        Modelfile. NOTE(review): all four credit the novel to Alexandre
        Dumas — see class-level note.
        """
        system_prompts = {
            "default": 'You are a specialized AI assistant expert on "The Trial" by Alexandre Dumas. You have deep knowledge of the novel\'s plot, characters, themes, historical context, and literary significance. Provide accurate, insightful, and engaging responses about all aspects of this classic work of literature.',
            "factual": 'You provide factual information about "The Trial". Focus on accurate details about plot events, character descriptions, historical context, and verifiable information from the novel. Be precise and cite specific chapters or events when possible.',
            "analysis": 'You provide literary analysis of "The Trial". Focus on themes, symbolism, narrative techniques, character development, and the work\'s place in literary history. Offer insightful interpretations supported by textual evidence.',
            "creative": 'You write in the style of Alexandre Dumas and "The Trial". Use dramatic language, romantic adventure elements, rich descriptions, and the narrative voice characteristic of 19th-century French literature.',
        }
        return system_prompts

    def create_ollama_modelfile(self, knowledge_base, system_prompts):
        """Create Ollama Modelfile.

        Writes ``output_dir/Modelfile`` and returns its path. The knowledge
        base is appended only as ``#`` comment lines — presumably ignored by
        Ollama at model-build time, so it documents rather than conditions
        the model (TODO confirm against Modelfile semantics).
        """
        # Start building the Modelfile content
        lines = [
            "# The Trial Literary Analysis SLM",
            "# Based on llama3.2:3b with specialized knowledge",
            "",
            "FROM llama3.2:3b",
            "",
            f"# System prompt",
            f"SYSTEM {system_prompts['default']}",
            "",
            "# Parameters for better literary analysis",
            "PARAMETER temperature 0.7",
            "PARAMETER top_p 0.9",
            "PARAMETER top_k 40",
            "PARAMETER repeat_penalty 1.1",
            "",
            "# Context window for longer passages",
            "PARAMETER num_ctx 4096",
            "",
            "# The Trial Knowledge Base",
        ]
        # Add knowledge sections as comments
        lines.extend(
            [
                "# Character Information:",
                f"# {json.dumps(knowledge_base['characters'], indent=2)}",
                "",
                "# Theme Analysis:",
                f"# {json.dumps(knowledge_base['themes'], indent=2)}",
                "",
                "# Plot Points:",
                f"# {json.dumps(knowledge_base['plot_points'], indent=2)}",
                "",
                "# Symbolism:",
                f"# {json.dumps(knowledge_base['symbols'], indent=2)}",
                "",
                "# Style Elements:",
                f"# {json.dumps(knowledge_base['style_elements'], indent=2)}",
            ]
        )
        modelfile_content = "\n".join(lines)
        # Save Modelfile
        modelfile_path = self.output_dir / "Modelfile"
        with open(modelfile_path, "w", encoding="utf-8") as f:
            f.write(modelfile_content)
        logger.info(f"Created Modelfile: {modelfile_path}")
        return modelfile_path

    def create_test_prompts(self):
        """Create test prompts for validation.

        Writes ``output_dir/test_prompts.json`` and returns its path.
        NOTE(review): the "factual" prompt asks about Edmond Dantès /
        Château d'If — Monte Cristo material, not The Trial.
        """
        test_prompts = [
            {
                "category": "factual",
                "prompt": "Who is Edmond Dantès and what happens to him at the beginning of the novel?",
                "expected_elements": [
                    "sailor",
                    "Pharaon",
                    "Marseilles",
                    "betrayal",
                    "Château d'If",
                ],
            },
            {
                "category": "analysis",
                "prompt": "Analyze the theme of revenge in The Trial.",
                "expected_elements": [
                    "justice",
                    "vengeance",
                    "morality",
                    "consequences",
                ],
            },
            {
                "category": "creative",
                "prompt": "Write a short passage in Dumas' style describing a dramatic confrontation.",
                "expected_elements": ["dramatic", "romantic", "adventure", "emotional"],
            },
        ]
        test_file = self.output_dir / "test_prompts.json"
        with open(test_file, "w", encoding="utf-8") as f:
            json.dump(test_prompts, f, indent=2, ensure_ascii=False)
        logger.info(f"Created test prompts: {test_file}")
        return test_file

    def train_model(self):
        """Execute simplified training process.

        Pipeline: load datasets -> build knowledge base -> build prompts ->
        write Modelfile and test prompts -> write and return a summary dict
        (also saved to ``output_dir/training_summary.json``).
        """
        logger.info("Starting simplified The Trial SLM training...")
        # Load data
        datasets = self.load_training_data()
        logger.info(f"Loaded datasets: {list(datasets.keys())}")
        # Create knowledge base
        knowledge_base = self.create_knowledge_base(datasets)
        logger.info("Created structured knowledge base")
        # Create system prompts
        system_prompts = self.create_system_prompts()
        logger.info("Created system prompts")
        # Create Ollama Modelfile
        modelfile_path = self.create_ollama_modelfile(knowledge_base, system_prompts)
        # Create test prompts
        test_file = self.create_test_prompts()
        # Create training summary
        summary = {
            "training_method": "cpu_knowledge_injection",
            "datasets_used": list(datasets.keys()),
            "total_examples": sum(len(items) for items in datasets.values()),
            "knowledge_base_size": {
                "characters": len(knowledge_base["characters"]),
                "themes": len(knowledge_base["themes"]),
                "plot_points": len(knowledge_base["plot_points"]),
                "symbols": len(knowledge_base["symbols"]),
                "style_elements": len(knowledge_base["style_elements"]),
            },
            "output_files": {
                "modelfile": str(modelfile_path),
                "test_prompts": str(test_file),
            },
        }
        summary_file = self.output_dir / "training_summary.json"
        with open(summary_file, "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)
        logger.info("CPU-based training completed successfully!")
        logger.info(f"Training summary: {summary_file}")
        return summary
def main():
    """Main training function: run the pipeline and print follow-up steps."""
    banner = "=" * 60
    logger.info("The Trial SLM - CPU-Compatible Training")
    logger.info(banner)
    trainer = SimpleMonteCristoTrainer()
    try:
        trainer.train_model()
        logger.info(banner)
        logger.info("TRAINING COMPLETED SUCCESSFULLY!")
        logger.info(banner)
        logger.info("Next steps:")
        logger.info("1. Test the model: ollama create the-trial -f models/Modelfile")
        logger.info("2. Run the model: ollama run the-trial")
        logger.info("3. Test with provided prompts")
        logger.info(banner)
    except Exception as exc:
        logger.error(f"Training failed: {exc}")
        raise


if __name__ == "__main__":
    main()

366
scripts/data_preparation.py Normal file
View File

@@ -0,0 +1,366 @@
#!/usr/bin/env python3
"""
Data Preparation Script for The Trial Literary Analysis SLM
Downloads and processes source texts for training
"""
import json
import os
import re
from pathlib import Path
# from bs4 import BeautifulSoup # Not needed for basic functionality
# import pandas as pd # Not needed for basic functionality
from typing import Dict, List, Tuple
import requests
class TrialDataPrep:
    """Download, parse and reshape "The Trial" into training datasets.

    Outputs land under ``base_dir``: raw/ (downloaded text), processed/
    (parsed chapters) and training/ (JSON example files).
    """

    def __init__(self, base_dir: str = "data"):
        self.base_dir = Path(base_dir)
        self.raw_dir = self.base_dir / "raw"
        self.processed_dir = self.base_dir / "processed"
        self.training_dir = self.base_dir / "training"
        # Create directories
        for dir_path in [self.raw_dir, self.processed_dir, self.training_dir]:
            dir_path.mkdir(parents=True, exist_ok=True)

    def download_gutenberg_text(self) -> str | None:
        """Download The Trial from Project Gutenberg.

        Returns the saved file path as a string, or None on any failure.
        NOTE(review): requests.get has no timeout, so a stalled connection
        can hang this call indefinitely.
        """
        print("Downloading The Trial from Project Gutenberg...")
        # Project Gutenberg URL for The Trial by Franz Kafka
        url = "https://www.gutenberg.org/files/7849/7849-0.txt"
        try:
            response = requests.get(url)
            response.raise_for_status()
            text = response.text
            file_path = self.raw_dir / "the_trial_full.txt"
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(text)
            print(f"Downloaded and saved to {file_path}")
            print(f"Text length: {len(text):,} characters")
            return str(file_path)
        except Exception as e:
            print(f"Error downloading text: {e}")
            return None

    def parse_chapters(self, text_file: str) -> List[Dict]:
        """Parse the full text into chapters.

        NOTE(review): because chapter_pattern contains a capturing group,
        re.split interleaves the captured chapter words with the content
        segments (parts = [pre, cap1, content1, cap2, content2, ...]);
        zipping parts[1:] with the matches therefore pairs each match with
        the captured word or the *previous* chapter's content, so "content"
        here is misaligned by one. Needs parts[2::2] (or equivalent) to pair
        correctly — fix before relying on chapter contents.
        """
        print("Parsing chapters...")
        with open(text_file, "r", encoding="utf-8") as f:
            text = f.read()
        # Find chapter boundaries
        chapters = []
        chapter_pattern = r"Chapter (One|Two|Three|Four|Five|Six|Seven|Eight|Nine|Ten|\d+|\d+\-\d+)"
        # Split by chapter pattern
        parts = re.split(chapter_pattern, text)
        # Extract chapter titles and content
        chapter_matches = list(re.finditer(chapter_pattern, text))
        for i, (part, match) in enumerate(zip(parts[1:], chapter_matches)):
            chapter_num = i + 1
            chapter_title = match.group().strip()
            # Clean up the content
            content = part.strip()
            # Remove Gutenberg header/footer if present
            if "*** START OF" in content:
                content = content.split("*** START OF")[1]
            if "*** END OF" in content:
                content = content.split("*** END OF")[0]
            chapters.append(
                {
                    "chapter_number": chapter_num,
                    "title": chapter_title,
                    "content": content.strip(),
                    "word_count": len(content.split()),
                }
            )
        print(f"Parsed {len(chapters)} chapters")
        # Save parsed chapters
        chapters_file = self.processed_dir / "chapters.json"
        with open(chapters_file, "w", encoding="utf-8") as f:
            json.dump(chapters, f, indent=2, ensure_ascii=False)
        return chapters

    def create_factual_qa_dataset(self, chapters: List[Dict]) -> List[Dict]:
        """Create factual Q&A pairs from the text.

        Writes training/factual_qa.json and returns the pair list. Character
        questions get a generic templated answer; relevant_content is only
        used as an existence gate — the chapter text itself never reaches the
        answer (NOTE(review): likely unfinished; confirm intent).
        """
        print("Creating factual Q&A dataset...")
        qa_pairs = []
        # Character-based questions (name -> descriptive keywords, currently unused
        # beyond documentation).
        characters = {
            "Josef K.": ["protagonist", "bank clerk", "arrested", "defendant"],
            "Frau Grubach": ["landlady", "lodging", "concerned", "witness"],
            "Fräulein Bürstner": ["neighbor", "tenant", "romantic interest", "confidante"],
            "Inspector": ["authority", "police", "interrogator", "bureaucrat"],
            "Uncle Albert": ["uncle", "lawyer", "family", "advisor"],
            "Leni": ["nurse", "court attendant", "seductress", "helper"],
            "Huld": ["lawyer", "advocate", "legal system", "professional"],
            "Titorelli": ["painter", "court informant", "opportunist", "corrupt"],
        }
        for character, keywords in characters.items():
            # Create questions about each character
            questions = [
                f"Who is {character}?",
                f"What is {character}'s role in the story?",
                f"Describe {character}'s character.",
                f"What happens to {character}?",
                f"Who is {character} related to?",
            ]
            for question in questions:
                # Find relevant chapters
                relevant_content = []
                for chapter in chapters:
                    if character.lower() in chapter["content"].lower():
                        relevant_content.append(chapter["content"])
                if relevant_content:
                    # Create a simple answer based on the content
                    answer = f"{character} is a character in The Trial by Franz Kafka. "
                    answer += f"Key information about {character} can be found throughout the novel."
                    qa_pairs.append(
                        {
                            "instruction": question,
                            "input": "",
                            "output": answer,
                            "category": "factual",
                            "character": character,
                        }
                    )
        # Plot-based questions
        plot_questions = [
            ("Where does Josef K. get arrested?", "In his boarding house room"),
            ("What is Josef K.'s profession?", "Bank clerk"),
            ("Who is Josef K.'s landlady?", "Frau Grubach"),
            ("What crime is Josef K. accused of?", "It's never revealed"),
            ("Who is Josef K.'s first defender?", "Uncle Albert brings him Huld"),
            ("Where does the final execution take place?", "In a quarry"),
            ("What happens to Josef K. at the end?", "He is executed"),
        ]
        for question, answer in plot_questions:
            qa_pairs.append(
                {
                    "instruction": question,
                    "input": "",
                    "output": answer,
                    "category": "factual",
                    "topic": "plot",
                }
            )
        # Save factual Q&A dataset
        qa_file = self.training_dir / "factual_qa.json"
        with open(qa_file, "w", encoding="utf-8") as f:
            json.dump(qa_pairs, f, indent=2, ensure_ascii=False)
        print(f"Created {len(qa_pairs)} factual Q&A pairs")
        return qa_pairs

    def create_literary_analysis_dataset(self, chapters: List[Dict]) -> List[Dict]:
        """Create literary analysis examples.

        Writes training/literary_analysis.json and returns the example list.
        The `chapters` argument is accepted but not read — analysis content
        is hard-coded below.
        """
        print("Creating literary analysis dataset...")
        analysis_examples = []
        # Theme analysis: one example per (theme, description) pair.
        themes = {
            "Bureaucratic Absurdity": [
                "The central theme driving the novel's surreal atmosphere",
                "Explored through the incomprehensible legal proceedings",
                "Questions the nature of law and authority",
            ],
            "Guilt and Innocence": [
                "Contrast between actual guilt and presumed guilt",
                "Josef K.'s struggle to prove his innocence",
                "The impossibility of defending against unknown charges",
            ],
            "Alienation": [
                "Josef K.'s isolation from society and support",
                "The indifference of others to his plight",
                "Existential loneliness in the face of absurdity",
            ],
            "Authority and Oppression": [
                "The oppressive nature of anonymous authority",
                "Powerlessness of the individual against the system",
                "The psychological impact of institutional power",
            ],
        }
        for theme, descriptions in themes.items():
            for desc in descriptions:
                analysis_examples.append(
                    {
                        "instruction": f"Analyze the theme of {theme} in The Trial.",
                        "input": "",
                        "output": f"{theme} is a major theme in The Trial. {desc}",
                        "category": "analysis",
                        "theme": theme,
                    }
                )
        # Symbolism analysis: each value is a single comma-joined meanings string.
        symbols = {
            "The Court": ["Obscure authority, labyrinthine bureaucracy, inaccessible justice"],
            "The Law": ["Abstract power, incomprehensible system, moral judgment"],
            "The Door": ["Barriers, access, blocked opportunities, judgment"],
            "The Cathedral": ["Spiritual emptiness, failed guidance, institutional corruption"],
        }
        for symbol, meanings in symbols.items():
            analysis_examples.append(
                {
                    "instruction": f"What does {symbol} symbolize in the novel?",
                    "input": "",
                    "output": f"{symbol} symbolizes {', '.join(meanings)} in The Trial.",
                    "category": "analysis",
                    "symbol": symbol,
                }
            )
        # Save analysis dataset
        analysis_file = self.training_dir / "literary_analysis.json"
        with open(analysis_file, "w", encoding="utf-8") as f:
            json.dump(analysis_examples, f, indent=2, ensure_ascii=False)
        print(f"Created {len(analysis_examples)} literary analysis examples")
        return analysis_examples

    def create_creative_writing_dataset(self, chapters: List[Dict]) -> List[Dict]:
        """Create creative writing examples in Kafka's style.

        Writes training/creative_writing.json and returns the example list.
        `chapters` is accepted but unused; outputs are placeholder text, not
        actual style samples.
        """
        print("Creating creative writing dataset...")
        creative_examples = []
        # Style imitation prompts
        style_prompts = [
            "Write a passage describing a character's anxiety about bureaucratic proceedings in Kafka's style.",
            "Create a dialogue between two characters discussing incomprehensible legal matters.",
            "Describe a surreal scene in the absurdist, nightmarish style.",
            "Write a passage about the psychological impact of institutional power.",
            "Create a scene showing the contrast between individual and anonymous authority.",
        ]
        for prompt in style_prompts:
            creative_examples.append(
                {
                    "instruction": prompt,
                    "input": "",
                    "output": "This would demonstrate the absurdist, nightmarish style characteristic of Kafka, with precise language and psychological depth.",
                    "category": "creative",
                    "style": "kafka",
                }
            )
        # Save creative writing dataset
        creative_file = self.training_dir / "creative_writing.json"
        with open(creative_file, "w", encoding="utf-8") as f:
            json.dump(creative_examples, f, indent=2, ensure_ascii=False)
        print(f"Created {len(creative_examples)} creative writing examples")
        return creative_examples

    def create_combined_dataset(self) -> str:
        """Combine all datasets into training format.

        Reads the three per-category JSON files (skipping missing ones),
        concatenates them into training/the_trial_combined.json, writes
        dataset_stats.json alongside, and returns the combined file path.
        """
        print("Creating combined training dataset...")
        # Load all datasets
        datasets = {}
        dataset_files = {
            "factual": "factual_qa.json",
            "analysis": "literary_analysis.json",
            "creative": "creative_writing.json",
        }
        for name, filename in dataset_files.items():
            file_path = self.training_dir / filename
            if file_path.exists():
                with open(file_path, "r", encoding="utf-8") as f:
                    datasets[name] = json.load(f)
        # Combine all examples
        combined_examples = []
        for category, examples in datasets.items():
            combined_examples.extend(examples)
        # Save combined dataset
        combined_file = self.training_dir / "the_trial_combined.json"
        with open(combined_file, "w", encoding="utf-8") as f:
            json.dump(combined_examples, f, indent=2, ensure_ascii=False)
        print(f"Created combined dataset with {len(combined_examples)} examples")
        # Create dataset statistics
        stats = {
            "total_examples": len(combined_examples),
            "categories": {name: len(examples) for name, examples in datasets.items()},
            "estimated_tokens": len(combined_examples) * 150,  # Rough estimate
        }
        stats_file = self.training_dir / "dataset_stats.json"
        with open(stats_file, "w", encoding="utf-8") as f:
            json.dump(stats, f, indent=2, ensure_ascii=False)
        print(f"Dataset statistics: {stats}")
        return str(combined_file)
def main():
    """Main execution function: download, parse, and build all datasets."""
    prep = TrialDataPrep()

    # Step 1: Download source text
    text_file = prep.download_gutenberg_text()
    if not text_file:
        print("Failed to download source text")
        return

    # Step 2: Parse chapters
    chapters = prep.parse_chapters(text_file)

    # Step 3: Create training datasets
    factual_qa = prep.create_factual_qa_dataset(chapters)
    literary_analysis = prep.create_literary_analysis_dataset(chapters)
    creative_writing = prep.create_creative_writing_dataset(chapters)

    # Step 4: Create combined dataset
    combined_file = prep.create_combined_dataset()

    divider = "=" * 50
    print("\n" + divider)
    print("Data Preparation Complete!")
    print(divider)
    for label, value in (
        ("Chapters parsed", len(chapters)),
        ("Factual Q&A pairs", len(factual_qa)),
        ("Literary analysis examples", len(literary_analysis)),
        ("Creative writing examples", len(creative_writing)),
        ("Combined dataset", combined_file),
    ):
        print(f"{label}: {value}")
    print(divider)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,13 @@
# The Trial Literary Analysis SLM
FROM llama3.2:3b
SYSTEM You are a specialized AI assistant expert on "The Trial" by Franz Kafka. You have deep knowledge of the novel's plot, characters, themes, historical context, and literary significance. Provide accurate, insightful, and engaging responses about all aspects of this classic work of literature.
# Parameters for better literary analysis
PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER repeat_penalty 1.1
PARAMETER num_ctx 4096
MESSAGE system You are an expert on The Trial. Key knowledge: Josef K. is the protagonist, arrested one morning without being told his crime, and struggles against an opaque court until his execution. Main themes: bureaucratic absurdity, guilt, alienation. Key characters: Frau Grubach, Fräulein Bürstner, Huld, Leni, Titorelli. Setting: an unnamed city, the court offices, the cathedral, the quarry.

View File

@@ -0,0 +1,154 @@
#!/usr/bin/env python3
"""
Ollama Integration Script
Creates and tests the The Trial model in Ollama
"""
import json
import subprocess
import sys
from pathlib import Path
def check_ollama_available():
    """Return True if the `ollama` CLI answers `--version`, else False."""
    try:
        proc = subprocess.run(
            ["ollama", "--version"], capture_output=True, text=True, timeout=10
        )
    except Exception as exc:
        # Covers missing binary, timeout, and any other launch failure.
        print(f"! Error checking Ollama: {exc}")
        return False
    if proc.returncode != 0:
        print("! Ollama not responding")
        return False
    print(f"+ Ollama available: {proc.stdout.strip()}")
    return True
def create_modelfile():
    """Create a simplified Modelfile.

    Writes a minimal Ollama Modelfile for the `the-trial` model to
    models/Modelfile_simple and returns its Path.

    Fixes vs. original: the parent directory is now created first (the
    original raised FileNotFoundError on a checkout without `models/`), and
    the embedded prompt credits The Trial to its actual author, Franz Kafka,
    with knowledge drawn from that novel instead of The Count of Monte Cristo.
    """
    modelfile_content = """# The Trial Literary Analysis SLM
FROM llama3.2:3b
SYSTEM You are a specialized AI assistant expert on "The Trial" by Franz Kafka. You have deep knowledge of the novel's plot, characters, themes, historical context, and literary significance. Provide accurate, insightful, and engaging responses about all aspects of this classic work of literature.
# Parameters for better literary analysis
PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER repeat_penalty 1.1
PARAMETER num_ctx 4096
MESSAGE system You are an expert on The Trial. Key knowledge: Josef K. is the protagonist, arrested one morning without being told his crime, and struggles against an opaque court until his execution. Main themes: bureaucratic absurdity, guilt, alienation. Key characters: Frau Grubach, Fräulein Bürstner, Huld, Leni, Titorelli. Setting: an unnamed city, the court offices, the cathedral, the quarry.
"""
    modelfile_path = Path("models/Modelfile_simple")
    # Ensure models/ exists so the write below cannot fail on a fresh checkout.
    modelfile_path.parent.mkdir(parents=True, exist_ok=True)
    with open(modelfile_path, "w", encoding="utf-8") as f:
        f.write(modelfile_content)
    print(f"+ Created simplified Modelfile: {modelfile_path}")
    return modelfile_path
def create_model_ollama(modelfile_path):
    """Build (or rebuild) the `the-trial` model in Ollama from a Modelfile.

    Returns True on success, False on any failure (timeout, missing CLI,
    non-zero exit from `ollama create`).
    """
    print("Creating The Trial model in Ollama...")
    try:
        # Remove any stale model first; failures here are deliberately ignored.
        subprocess.run(["ollama", "rm", "the-trial"], capture_output=True, timeout=30)
        build = subprocess.run(
            ["ollama", "create", "the-trial", "-f", str(modelfile_path)],
            capture_output=True,
            text=True,
            timeout=300,
        )
    except subprocess.TimeoutExpired:
        print("! Model creation timed out")
        return False
    except Exception as exc:
        print(f"! Error creating model: {exc}")
        return False
    if build.returncode != 0:
        print(f"! Error creating model: {build.stderr}")
        return False
    print("+ Model created successfully!")
    return True
def test_model():
    """Smoke-test the model with sample questions, printing each answer."""
    test_questions = [
        "Who is Edmond Dantès?",
        "What is the main theme of The Trial?",
        "Describe the symbolism of the island in the novel.",
    ]
    print("Testing The Trial model...")
    for i, question in enumerate(test_questions, 1):
        print(f"\n--- Test {i}: {question} ---")
        try:
            proc = subprocess.run(
                ["ollama", "run", "the-trial", question],
                capture_output=True,
                text=True,
                timeout=60,
            )
        except subprocess.TimeoutExpired:
            print("Question timed out")
            continue
        except Exception as exc:
            print(f"Error: {exc}")
            continue
        if proc.returncode == 0:
            # Show only the first 200 characters of the answer.
            print(f"Answer: {proc.stdout.strip()[:200]}...")
        else:
            print(f"Error: {proc.stderr}")
def main():
    """Main integration function: check Ollama, build the model, smoke-test it."""
    divider = "=" * 50
    print("The Trial SLM - Ollama Integration")
    print(divider)
    # Guard clause: nothing works without the Ollama CLI.
    if not check_ollama_available():
        print("Please ensure Ollama is installed and running")
        sys.exit(1)
    modelfile_path = create_modelfile()
    if not create_model_ollama(modelfile_path):
        print("\n" + divider)
        print("MODEL CREATION FAILED")
        print(divider)
        sys.exit(1)
    print("\n" + divider)
    print("MODEL CREATED SUCCESSFULLY!")
    print(divider)
    test_model()
    print("\n" + divider)
    print("INTEGRATION COMPLETE!")
    print("Usage: ollama run the-trial")
    print(divider)


if __name__ == "__main__":
    main()

427
scripts/optimized_train.py Normal file
View File

@@ -0,0 +1,427 @@
#!/usr/bin/env python3
"""
Optimized GPU Training Script for The Trial SLM
Uses QLoRA with CUDA acceleration
"""
import json
import logging
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional
import torch
import wandb
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
DataCollatorForLanguageModeling,
Trainer,
TrainingArguments,
)
# Configure logging
logging.basicConfig(level=logging.INFO)  # emit INFO+ records via the stdlib default handler
logger = logging.getLogger(__name__)  # module-scoped logger, per stdlib convention
@dataclass
class OptimizedTrainingConfig:
    """Optimized configuration for QLoRA training.

    Field defaults that call ``torch.cuda.*`` are evaluated once, when this
    class is defined at import time, so they reflect the hardware visible
    then (detect_hardware may later override some of them per instance).
    """

    # Model configuration - use local Ollama model
    # NOTE(review): this points into Ollama's local model store, which is not
    # a transformers-format checkpoint directory — confirm that
    # AutoTokenizer/AutoModel can actually load from this path.
    base_model: str = "C:/Users/simpl/.ollama/models/llama3.2:3b"
    adapter_name: str = "the-trial-adapter-v2"
    # Data configuration
    # NOTE(review): data_preparation.py writes "the_trial_combined.json";
    # this default expects "monte_cristo_combined.json" — verify which file
    # actually exists before training.
    dataset_path: str = "data/training/monte_cristo_combined.json"
    max_seq_length: int = 2048  # truncation cap applied at tokenization time
    # GPU/CPU configuration
    device_map: str = "auto"
    torch_dtype: str = "bfloat16"
    use_cpu: bool = False
    # QLoRA configuration
    use_4bit: bool = True
    use_nested_quant: bool = False
    bnb_4bit_compute_dtype: str = "bfloat16"
    bnb_4bit_quant_type: str = "nf4"
    # LoRA configuration
    lora_r: int = 32  # Increased for better learning
    lora_alpha: int = 64  # Increased for better learning
    lora_dropout: float = 0.05  # Reduced dropout
    target_modules: List[str] = field(
        default_factory=lambda: [
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
            "lm_head",  # Include language model head
        ]
    )
    # Training arguments
    output_dir: str = "models/monte_cristo_qlora_v2"
    num_train_epochs: int = 5  # More epochs
    per_device_train_batch_size: int = 4 if torch.cuda.is_available() else 1
    gradient_accumulation_steps: int = 4 if torch.cuda.is_available() else 16
    learning_rate: float = 1e-4  # Slightly higher learning rate
    weight_decay: float = 0.01
    warmup_ratio: float = 0.05
    max_grad_norm: float = 0.5
    # Optimization
    optim: str = "adamw_torch" if torch.cuda.is_available() else "adafactor"
    lr_scheduler_type: str = "cosine_with_restarts"
    logging_steps: int = 5
    save_steps: int = 50
    eval_steps: int = 50
    save_total_limit: int = 5
    # Performance optimizations
    # fp16 on pre-Ampere GPUs (compute capability < 8), bf16 on Ampere+;
    # `and` short-circuits, so get_device_capability only runs when CUDA exists.
    fp16: bool = (
        torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] < 8
    )
    bf16: bool = (
        torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
    )
    gradient_checkpointing: bool = True
    dataloader_pin_memory: bool = torch.cuda.is_available()
    dataloader_num_workers: int = 0  # Simplified for Windows
    # Mixed precision and memory
    # NOTE(review): TrainingArguments has no `dataloader_prefetch` parameter
    # (recent versions expose `dataloader_prefetch_factor`); passing this
    # through in create_trainer likely raises TypeError — verify.
    dataloader_prefetch: bool = torch.cuda.is_available()
    remove_unused_columns: bool = False
    # Miscellaneous
    report_to: str = "none"  # Disable wandb for simpler execution
    run_name: str = "the-trial-qlora-v2"
    seed: int = 42
    # CPU-specific optimizations (defined here but not read elsewhere in view)
    use_cpu_offload: bool = not torch.cuda.is_available()
    offload_state_dict: bool = not torch.cuda.is_available()
class OptimizedMonteCristoTrainer:
"""Optimized trainer for The Trial SLM"""
    def __init__(self, config: OptimizedTrainingConfig):
        """Store the config, then run one-time directory and hardware setup."""
        self.config = config
        self.setup_directories()
        # May mutate config fields (batch size, dtype, ...) based on detected hardware.
        self.detect_hardware()
def setup_directories(self):
"""Create necessary directories"""
Path(self.config.output_dir).mkdir(parents=True, exist_ok=True)
    def detect_hardware(self):
        """Detect and configure for hardware.

        GPU present: logs device facts and scales the per-device batch size
        with VRAM (roughly one sample per 8 GB, clamped to [1, 4]).
        CPU only: disables 4-bit quantization, falls back to float32 and
        compensates the tiny batch with heavier gradient accumulation.
        """
        if torch.cuda.is_available():
            gpu_count = torch.cuda.device_count()
            gpu_name = torch.cuda.get_device_name(0)
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
            gpu_capability = torch.cuda.get_device_capability(0)
            logger.info(f"GPU detected: {gpu_name}")
            logger.info(f"GPU memory: {gpu_memory:.1f} GB")
            logger.info(f"GPU capability: {gpu_capability}")
            logger.info(f"GPU count: {gpu_count}")
            self.config.use_cpu = False
            self.config.per_device_train_batch_size = max(
                1, min(4, int(gpu_memory / 8))
            )
        else:
            logger.info("No GPU detected - using CPU optimizations")
            self.config.use_cpu = True
            self.config.use_4bit = False  # Disable 4-bit on CPU
            self.config.torch_dtype = "float32"
            self.config.gradient_accumulation_steps = 32
    def load_tokenizer(self) -> AutoTokenizer:
        """Load and configure tokenizer.

        Returns the fast tokenizer for ``config.base_model``, right-padded,
        with the pad token falling back to EOS when the model defines none.
        """
        logger.info(f"Loading tokenizer for {self.config.base_model}")
        tokenizer = AutoTokenizer.from_pretrained(
            self.config.base_model,
            trust_remote_code=True,
            padding_side="right",  # pad after the text so content stays left-aligned
            use_fast=True,
        )
        # Set pad token if not present
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        logger.info(f"Tokenizer vocab size: {tokenizer.vocab_size}")
        return tokenizer
    def load_model(self) -> AutoModelForCausalLM:
        """Load model with appropriate configuration.

        CPU path: full-precision float32, no quantization. GPU path: 4-bit
        NF4 quantization via bitsandbytes, then ``prepare_model_for_kbit_training``
        so the quantized base is ready for QLoRA fine-tuning.
        """
        logger.info(f"Loading model {self.config.base_model}")
        if self.config.use_cpu:
            logger.info("Loading model for CPU training")
            model = AutoModelForCausalLM.from_pretrained(
                self.config.base_model,
                torch_dtype=torch.float32,
                device_map="cpu",
                trust_remote_code=True,
                low_cpu_mem_usage=True,  # stream weights in to limit peak RAM
            )
        else:
            # Configure 4-bit quantization for GPU
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=self.config.use_4bit,
                bnb_4bit_quant_type=self.config.bnb_4bit_quant_type,
                bnb_4bit_compute_dtype=getattr(
                    torch, self.config.bnb_4bit_compute_dtype
                ),
                bnb_4bit_use_double_quant=self.config.use_nested_quant,
            )
            model = AutoModelForCausalLM.from_pretrained(
                self.config.base_model,
                quantization_config=bnb_config,
                device_map=self.config.device_map,
                trust_remote_code=True,
                torch_dtype=getattr(torch, self.config.torch_dtype),
            )
            # k-bit prep (casting/freezing) is applied only on the GPU path.
            model = prepare_model_for_kbit_training(model)
        logger.info(f"Model loaded on device: {next(model.parameters()).device}")
        return model
    def setup_lora(self, model: AutoModelForCausalLM) -> AutoModelForCausalLM:
        """Setup LoRA adapter with optimized settings.

        Wraps ``model`` with a PEFT LoRA adapter over the modules listed in
        the config and logs the trainable-parameter budget.
        """
        logger.info("Setting up optimized LoRA adapter")
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            r=self.config.lora_r,
            lora_alpha=self.config.lora_alpha,
            lora_dropout=self.config.lora_dropout,
            target_modules=self.config.target_modules,
            bias="none",  # leave base-model biases untrained
            init_lora_weights=True,
        )
        model = get_peft_model(model, lora_config)
        # Print trainable parameters
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        all_params = sum(p.numel() for p in model.parameters())
        logger.info(f"Trainable parameters: {trainable_params:,}")
        logger.info(f"All parameters: {all_params:,}")
        logger.info(f"Trainable%: {100 * trainable_params / all_params:.2f}%")
        return model
    def load_and_preprocess_data(self, tokenizer: AutoTokenizer) -> Dataset:
        """Load and preprocess training data.

        Reads the JSON example list at ``config.dataset_path`` (expects
        "instruction"/"input"/"output" and optionally "category" keys),
        formats each example into a Llama-3 chat template, and tokenizes.
        Note: labels are a plain copy of input_ids, so the loss is computed
        over the prompt tokens as well, not just the assistant answer.
        NOTE(review): the system prompt credits The Trial to Alexandre
        Dumas — The Trial is by Franz Kafka; confirm intended wording.
        """
        logger.info(f"Loading dataset from {self.config.dataset_path}")
        # Load dataset
        with open(self.config.dataset_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        logger.info(f"Loaded {len(data)} training examples")
        # Convert to HuggingFace Dataset
        dataset = Dataset.from_list(data)

        # Enhanced tokenization function
        def tokenize_function(examples):
            # Format prompts for better training
            prompts = []
            for i in range(len(examples["instruction"])):
                instruction = examples["instruction"][i]
                input_text = examples["input"][i] if examples["input"][i] else ""
                output = examples["output"][i]
                category = (
                    examples["category"][i] if "category" in examples else "general"
                )
                # Enhanced prompt format with category context
                if category == "factual":
                    context = "Answer this factual question about The Trial."
                elif category == "analysis":
                    context = (
                        "Provide literary analysis for this question about The Trial."
                    )
                elif category == "creative":
                    context = "Respond creatively in the style of Alexandre Dumas to this prompt."
                else:
                    context = "Respond to this question about The Trial."
                # Llama-3 chat template: system / user / assistant turns.
                prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an expert on Alexandre Dumas' "The Trial". {context}
<|eot_id|><|start_header_id|>user<|end_header_id|>
{instruction}
{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{output}<|eot_id|>"""
                prompts.append(prompt)
            # Tokenize with optimizations
            tokenized = tokenizer(
                prompts,
                truncation=True,
                padding=False,  # dynamic padding is left to the data collator
                max_length=self.config.max_seq_length,
                return_tensors=None,
                return_attention_mask=False,
            )
            # Set labels for causal LM
            tokenized["labels"] = tokenized["input_ids"].copy()
            return tokenized

        # Apply tokenization
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=dataset.column_names,
            desc="Tokenizing dataset",
        )
        logger.info(f"Tokenized dataset: {len(tokenized_dataset)} examples")
        return tokenized_dataset
def create_trainer(
self,
model: AutoModelForCausalLM,
tokenizer: AutoTokenizer,
train_dataset: Dataset,
) -> Trainer:
"""Create optimized Trainer instance"""
# Enhanced training arguments
training_args = TrainingArguments(
output_dir=self.config.output_dir,
num_train_epochs=self.config.num_train_epochs,
per_device_train_batch_size=self.config.per_device_train_batch_size,
gradient_accumulation_steps=self.config.gradient_accumulation_steps,
learning_rate=self.config.learning_rate,
weight_decay=self.config.weight_decay,
warmup_ratio=self.config.warmup_ratio,
max_grad_norm=self.config.max_grad_norm,
optim=self.config.optim,
lr_scheduler_type=self.config.lr_scheduler_type,
logging_steps=self.config.logging_steps,
save_steps=self.config.save_steps,
eval_steps=self.config.eval_steps,
save_total_limit=self.config.save_total_limit,
fp16=self.config.fp16,
bf16=self.config.bf16,
gradient_checkpointing=self.config.gradient_checkpointing,
dataloader_pin_memory=self.config.dataloader_pin_memory,
dataloader_num_workers=self.config.dataloader_num_workers,
dataloader_prefetch=self.config.dataloader_prefetch,
dataloader_persistent_workers=False, # Simplified
report_to=self.config.report_to,
run_name=self.config.run_name,
seed=self.config.seed,
# Performance optimizations
remove_unused_columns=self.config.remove_unused_columns,
include_tokens_per_second=True,
# Memory optimizations
dataloader_drop_last=True,
)
# Optimized data collator
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False,
pad_to_multiple_of=8,
return_tensors="pt",
)
# Create trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=None,
data_collator=data_collator,
tokenizer=tokenizer,
)
return trainer
def train(self):
"""Execute optimized training"""
logger.info("Starting optimized The Trial SLM training...")
logger.info(f"Using {'GPU' if not self.config.use_cpu else 'CPU'} training")
# Load components
tokenizer = self.load_tokenizer()
model = self.load_model()
model = self.setup_lora(model)
train_dataset = self.load_and_preprocess_data(tokenizer)
trainer = self.create_trainer(model, tokenizer, train_dataset)
# Train model
logger.info("Beginning training...")
trainer.train()
# Save final model
logger.info("Saving final model...")
trainer.save_model()
tokenizer.save_pretrained(self.config.output_dir)
# Save adapter separately for Ollama
adapter_path = Path(self.config.output_dir) / "adapter_model"
if adapter_path.exists():
logger.info(f"Adapter saved to {adapter_path}")
logger.info("Training completed successfully!")
return trainer, model
def main():
    """Entry point: build the config, run training, and report the outcome.

    Any exception from training is logged and re-raised so the process
    exits with a traceback rather than failing silently.
    """
    logger.info("The Trial SLM - Optimized GPU/CPU Training")
    logger.info("=" * 60)
    # Config auto-detects hardware; the trainer wraps the whole pipeline.
    config = OptimizedTrainingConfig()
    trainer_instance = OptimizedMonteCristoTrainer(config)
    try:
        trainer, model = trainer_instance.train()
        logger.info("=" * 60)
        logger.info("OPTIMIZED TRAINING COMPLETED SUCCESSFULLY!")
        logger.info(f"Model saved to: {config.output_dir}")
        logger.info("=" * 60)
    except Exception as e:
        logger.error(f"Training failed: {e}")
        raise


if __name__ == "__main__":
    main()

135
scripts/start_training.py Normal file
View File

@@ -0,0 +1,135 @@
#!/usr/bin/env python3
"""
Model Setup and Training Launcher
Prepares the environment and starts training
"""
import os
import subprocess
import sys
from pathlib import Path
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
def check_gpu():
    """Report whether a CUDA GPU is usable; print device details when it is.

    Returns True only when PyTorch is installed and CUDA is available.
    """
    try:
        import torch
    except ImportError:
        print("⚠ PyTorch not installed")
        return False
    if not torch.cuda.is_available():
        print("⚠ No GPU detected. Training will be very slow.")
        return False
    print(f"✓ GPU detected: {torch.cuda.get_device_name()}")
    print(
        f"✓ VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB"
    )
    return True
def check_model_access():
    """Probe whether the gated Llama base model can actually be loaded.

    Any failure (missing package, missing HF token, no gated access) is
    reported and mapped to False rather than raised.
    """
    try:
        from transformers import AutoTokenizer

        AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
    except Exception as e:
        print(f"⚠ Cannot access base model: {e}")
        print("Note: You may need to request access to Llama models on HuggingFace")
        return False
    print("✓ Base model accessible")
    return True
def setup_environment():
    """Create output directories and export training-related env vars."""
    print("Setting up training environment...")
    # Ensure every output location exists before training starts writing.
    for dir_path in ("models", "models/monte_cristo_qlora", "logs"):
        Path(dir_path).mkdir(parents=True, exist_ok=True)
    # Silence tokenizer fork warnings and tag runs for Weights & Biases.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    os.environ["WANDB_PROJECT"] = "the-trial-slm"
    # Propagate the HuggingFace token when the .env file provided one.
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        os.environ["HF_TOKEN"] = hf_token
        print("✓ Hugging Face token set")
    print("✓ Environment setup complete")
def start_training():
    """Run pre-flight checks, set up the environment, and launch training.

    Returns True when the training subprocess exits with code 0, False on
    missing model access, a nonzero exit, or user interrupt.
    """
    print("\n" + "=" * 60)
    print("The Trial SLM TRAINING")
    print("=" * 60)
    # Pre-flight checks
    print("Running pre-flight checks...")
    # GPU presence is informational only: CPU training is allowed, just slow.
    check_gpu()
    model_accessible = check_model_access()
    if not model_accessible:
        print("\n⚠ Cannot proceed without model access")
        print("Please ensure you have:")
        print("1. A HuggingFace account")
        print("2. Requested access to meta-llama/Llama-3.2-3B-Instruct")
        print("3. Set your HuggingFace token: hf_token=<your_token>")
        return False
    # Setup environment
    setup_environment()
    # Start training
    print("\nStarting training...")
    try:
        # FIX: do not pass check=True here — it raises CalledProcessError on
        # any nonzero exit, which made the explicit returncode branch below
        # unreachable dead code. Inspect the return code ourselves instead.
        result = subprocess.run([sys.executable, "scripts/train_qlora.py"])
    except KeyboardInterrupt:
        print("\nTraining interrupted by user")
        return False
    if result.returncode == 0:
        print("\n" + "=" * 60)
        print("TRAINING COMPLETED SUCCESSFULLY!")
        print("=" * 60)
        print("Next steps:")
        print("1. Check the model in models/monte_cristo_qlora/")
        print("2. Run the integration script to prepare for Ollama")
        print("3. Test the model with sample queries")
        return True
    print(f"Training failed with return code: {result.returncode}")
    return False
def main():
    """Launcher entry point: run training and exit nonzero on failure."""
    print("The Trial Literary Analysis SLM - Training Launcher")
    print("=" * 60)
    if start_training():
        print("\n✓ Ready for next phase: Ollama Integration")
    else:
        print("\n⚠ Training failed. Check logs for details.")
        sys.exit(1)


if __name__ == "__main__":
    main()

116
scripts/test_environment.py Normal file
View File

@@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""
Test script to verify our training setup
"""
import os
import sys
def test_imports():
    """Verify that every package required for training can be imported.

    Prints one line per package and stops at the first missing one.
    """
    print("Testing package imports...")
    # (module name, display label) pairs, checked in dependency order.
    required = [
        ("torch", "PyTorch"),
        ("transformers", "Transformers"),
        ("peft", "PEFT"),
        ("datasets", "Datasets"),
    ]
    for module_name, label in required:
        try:
            module = __import__(module_name)
        except ImportError:
            print(f"X {label} not installed")
            return False
        print(f"+ {label} {module.__version__}")
    return True
def test_gpu():
    """Check CUDA availability; returns True only when a GPU is usable."""
    print("\nTesting GPU...")
    try:
        import torch

        if torch.cuda.is_available():
            print(f"+ GPU detected: {torch.cuda.get_device_name()}")
            print(
                f"+ VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB"
            )
            return True
        else:
            print("! No GPU detected - training will be very slow")
            return False
    # FIX: was a bare ``except:``, which also swallows KeyboardInterrupt and
    # SystemExit; Exception covers import/CUDA probing failures only.
    except Exception:
        print("X Cannot check GPU")
        return False
def test_data():
    """Check that the combined training dataset exists and summarize it.

    Prints the example count and the per-category distribution when found.
    """
    print("\nTesting training data...")
    data_file = "data/training/monte_cristo_combined.json"
    if not os.path.exists(data_file):
        print("X Training data not found")
        return False
    import json

    with open(data_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    print(f"+ Training data found: {len(data)} examples")
    # Tally examples per category for a quick distribution overview.
    categories = {}
    for item in data:
        cat = item.get("category", "unknown")
        categories[cat] = categories.get(cat, 0) + 1
    print(f"+ Categories: {categories}")
    return True
def main():
    """Run all environment checks and report the aggregate result."""
    print("The Trial SLM - Environment Test")
    print("=" * 50)
    # Run every check eagerly so all diagnostics print before summarizing.
    results = [test_imports(), test_gpu(), test_data()]
    passed = all(results)
    print("\n" + "=" * 50)
    if passed:
        print("+ ALL TESTS PASSED - Ready for training!")
    else:
        print("! Some tests failed - fix issues before training")
    print("=" * 50)
    return passed


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)

355
scripts/train_qlora.py Normal file
View File

@@ -0,0 +1,355 @@
#!/usr/bin/env python3
"""
QLoRA Training Script for The Trial Literary Analysis SLM
Uses parameter-efficient fine-tuning to adapt a base model
"""
import json
import logging
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional
import torch
import wandb
from datasets import Dataset, load_dataset
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
DataCollatorForLanguageModeling,
Trainer,
TrainingArguments,
)
# Configure logging
logging.basicConfig(level=logging.INFO)
# Module-level logger, per stdlib convention (named after the module).
logger = logging.getLogger(__name__)
@dataclass
class TrainingConfig:
    """Configuration for QLoRA training.

    All knobs for the fine-tuning pipeline live here: base model identity,
    dataset location, 4-bit quantization settings, LoRA hyperparameters,
    and HuggingFace Trainer arguments. Field names in the Trainer sections
    mirror the corresponding ``TrainingArguments`` parameters.
    """
    # Model configuration
    base_model: str = "meta-llama/Llama-3.2-3B-Instruct"
    adapter_name: str = "the-trial-adapter"
    # Data configuration
    dataset_path: str = "data/training/monte_cristo_combined.json"
    max_seq_length: int = 2048
    # QLoRA configuration (passed through to BitsAndBytesConfig)
    use_4bit: bool = True
    use_nested_quant: bool = False
    # Name of a torch dtype attribute, resolved via getattr(torch, ...).
    bnb_4bit_compute_dtype: str = "bfloat16"
    bnb_4bit_quant_type: str = "nf4"
    # LoRA configuration
    lora_r: int = 16
    # Effective LoRA scaling is lora_alpha / lora_r (here 32/16 = 2).
    lora_alpha: int = 32
    lora_dropout: float = 0.1
    # Adapt all attention and MLP projections of the Llama architecture.
    target_modules: List[str] = field(
        default_factory=lambda: [
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ]
    )
    # Training arguments
    output_dir: str = "models/monte_cristo_qlora"
    num_train_epochs: int = 3
    per_device_train_batch_size: int = 1
    # Effective batch size = per-device batch * accumulation steps (1 * 8).
    gradient_accumulation_steps: int = 8
    learning_rate: float = 2e-5
    weight_decay: float = 0.0
    warmup_ratio: float = 0.03
    max_grad_norm: float = 1.0
    # Optimization
    optim: str = "paged_adamw_32bit"
    lr_scheduler_type: str = "cosine"
    logging_steps: int = 10
    save_steps: int = 100
    eval_steps: int = 100
    save_total_limit: int = 3
    # Hardware
    fp16: bool = False
    bf16: bool = True
    gradient_checkpointing: bool = True
    dataloader_pin_memory: bool = False
    # Miscellaneous
    report_to: str = "wandb"
    run_name: str = "the-trial-qlora"
    seed: int = 42
class MonteCristoTrainer:
    """Trainer class for The Trial SLM.

    Orchestrates the full QLoRA fine-tuning pipeline: 4-bit quantized model
    loading, LoRA adapter attachment, dataset tokenization into the Llama-3
    chat format, and HuggingFace Trainer execution. All settings come from
    the ``TrainingConfig`` passed to the constructor.
    """
    def __init__(self, config: TrainingConfig):
        # Keep the config and eagerly prepare output dirs + experiment logging.
        self.config = config
        self.setup_directories()
        self.setup_logging()
    def setup_directories(self):
        """Create necessary directories"""
        Path(self.config.output_dir).mkdir(parents=True, exist_ok=True)
    def setup_logging(self):
        """Setup logging and wandb"""
        # Only start a wandb run when the config asks for wandb reporting.
        if self.config.report_to == "wandb":
            wandb.init(
                project="the-trial-slm",
                name=self.config.run_name,
                config=self.config.__dict__,
            )
    def load_tokenizer(self) -> AutoTokenizer:
        """Load and configure tokenizer.

        Uses right-side padding (the safe choice for causal-LM training).
        """
        logger.info(f"Loading tokenizer for {self.config.base_model}")
        tokenizer = AutoTokenizer.from_pretrained(
            self.config.base_model, trust_remote_code=True, padding_side="right"
        )
        # Set pad token if not present — Llama tokenizers ship without one,
        # so reuse EOS to make batch padding possible.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        logger.info(f"Tokenizer vocab size: {tokenizer.vocab_size}")
        return tokenizer
    def load_model(self) -> AutoModelForCausalLM:
        """Load model with QLoRA configuration.

        Loads the base model in 4-bit (bitsandbytes) and prepares it for
        k-bit training (gradient-checkpointing-friendly, frozen base).
        """
        logger.info(f"Loading model {self.config.base_model}")
        # Configure 4-bit quantization
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=self.config.use_4bit,
            bnb_4bit_quant_type=self.config.bnb_4bit_quant_type,
            # Resolve the configured dtype name (e.g. "bfloat16") to a torch dtype.
            bnb_4bit_compute_dtype=getattr(torch, self.config.bnb_4bit_compute_dtype),
            bnb_4bit_use_double_quant=self.config.use_nested_quant,
        )
        # Load model; device_map="auto" lets accelerate place the weights.
        model = AutoModelForCausalLM.from_pretrained(
            self.config.base_model,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
        )
        # Prepare model for k-bit training
        model = prepare_model_for_kbit_training(model)
        logger.info(f"Model loaded on device: {next(model.parameters()).device}")
        return model
    def setup_lora(self, model: AutoModelForCausalLM) -> AutoModelForCausalLM:
        """Setup LoRA adapter and report the trainable-parameter budget."""
        logger.info("Setting up LoRA adapter")
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            r=self.config.lora_r,
            lora_alpha=self.config.lora_alpha,
            lora_dropout=self.config.lora_dropout,
            target_modules=self.config.target_modules,
            bias="none",
        )
        # Get PEFT model (wraps the frozen base with trainable LoRA layers).
        model = get_peft_model(model, lora_config)
        # Print trainable parameters so the LoRA footprint is visible in logs.
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        all_params = sum(p.numel() for p in model.parameters())
        logger.info(f"Trainable parameters: {trainable_params:,}")
        logger.info(f"All parameters: {all_params:,}")
        logger.info(f"Trainable%: {100 * trainable_params / all_params:.2f}%")
        return model
    def load_and_preprocess_data(self, tokenizer: AutoTokenizer) -> Dataset:
        """Load and preprocess training data.

        Reads the JSON instruction dataset, renders each record into the
        Llama-3 chat template, and tokenizes with truncation at
        ``max_seq_length``. Labels mirror input_ids (causal LM).
        """
        logger.info(f"Loading dataset from {self.config.dataset_path}")
        # Load dataset
        with open(self.config.dataset_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        logger.info(f"Loaded {len(data)} training examples")
        # Convert to HuggingFace Dataset
        dataset = Dataset.from_list(data)
        # Tokenization function (operates on column-wise batches from .map)
        def tokenize_function(examples):
            # Format the prompt
            prompts = []
            for i in range(len(examples["instruction"])):
                instruction = examples["instruction"][i]
                input_text = examples["input"][i] if examples["input"][i] else ""
                output = examples["output"][i]
                # Create prompt in instruction format
                prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
{instruction}
{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{output}<|eot_id|>"""
                prompts.append(prompt)
            # Tokenize; padding is left to the data collator at batch time.
            tokenized = tokenizer(
                prompts,
                truncation=True,
                padding=False,
                max_length=self.config.max_seq_length,
                return_tensors=None,
            )
            # Set labels for causal LM
            tokenized["labels"] = tokenized["input_ids"].copy()
            return tokenized
        # Apply tokenization and drop the original text columns.
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=dataset.column_names,
            desc="Tokenizing dataset",
        )
        logger.info(f"Tokenized dataset: {len(tokenized_dataset)} examples")
        return tokenized_dataset
    def create_trainer(
        self,
        model: AutoModelForCausalLM,
        tokenizer: AutoTokenizer,
        train_dataset: Dataset,
    ) -> Trainer:
        """Create Trainer instance from the config's TrainingArguments."""
        # Training arguments
        training_args = TrainingArguments(
            output_dir=self.config.output_dir,
            num_train_epochs=self.config.num_train_epochs,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            learning_rate=self.config.learning_rate,
            weight_decay=self.config.weight_decay,
            warmup_ratio=self.config.warmup_ratio,
            max_grad_norm=self.config.max_grad_norm,
            optim=self.config.optim,
            lr_scheduler_type=self.config.lr_scheduler_type,
            logging_steps=self.config.logging_steps,
            save_steps=self.config.save_steps,
            eval_steps=self.config.eval_steps,
            save_total_limit=self.config.save_total_limit,
            fp16=self.config.fp16,
            bf16=self.config.bf16,
            gradient_checkpointing=self.config.gradient_checkpointing,
            dataloader_pin_memory=self.config.dataloader_pin_memory,
            report_to=self.config.report_to,
            run_name=self.config.run_name,
            seed=self.config.seed,
            # Performance optimizations
            dataloader_num_workers=0,
            remove_unused_columns=False,
        )
        # Data collator: mlm=False -> plain causal-LM batching; padding to a
        # multiple of 8 keeps tensor shapes friendly to tensor cores.
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
            pad_to_multiple_of=8,
        )
        # Create trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=None,  # No evaluation dataset for now
            data_collator=data_collator,
            tokenizer=tokenizer,
        )
        return trainer
    def train(self):
        """Execute training.

        Runs the full pipeline and saves the model + tokenizer to
        ``output_dir``; returns (trainer, model).
        """
        logger.info("Starting The Trial SLM training")
        # Load components in dependency order.
        tokenizer = self.load_tokenizer()
        model = self.load_model()
        model = self.setup_lora(model)
        train_dataset = self.load_and_preprocess_data(tokenizer)
        trainer = self.create_trainer(model, tokenizer, train_dataset)
        # Train model
        logger.info("Beginning training...")
        trainer.train()
        # Save final model
        logger.info("Saving final model...")
        trainer.save_model()
        tokenizer.save_pretrained(self.config.output_dir)
        # Save adapter separately for Ollama — report its location if PEFT
        # wrote a standalone adapter directory.
        adapter_path = Path(self.config.output_dir) / "adapter_model"
        if adapter_path.exists():
            logger.info(f"Adapter saved to {adapter_path}")
        logger.info("Training completed successfully!")
        return trainer, model
def main():
    """Entry point: configure, train, and report; wandb is closed either way."""
    config = TrainingConfig()
    trainer_instance = MonteCristoTrainer(config)
    try:
        trainer_instance.train()
    except Exception as e:
        logger.error(f"Training failed: {e}")
        raise
    else:
        banner = "=" * 50
        logger.info(banner)
        logger.info("TRAINING COMPLETED SUCCESSFULLY!")
        logger.info(f"Model saved to: {config.output_dir}")
        logger.info(banner)
    finally:
        # Close out the wandb run whether training succeeded or not.
        if config.report_to == "wandb":
            wandb.finish()


if __name__ == "__main__":
    main()