The Trial - Initial commit

commit c401cf655d
2026-01-17 14:59:35 -05:00
27 changed files with 132452 additions and 0 deletions

scripts/optimized_train.py (new file, 427 lines)

@@ -0,0 +1,427 @@
#!/usr/bin/env python3
"""
Optimized GPU Training Script for The Trial SLM
Uses QLoRA with CUDA acceleration; falls back to full-precision training on CPU
"""
import json
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import List
import torch
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
DataCollatorForLanguageModeling,
Trainer,
TrainingArguments,
)
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class OptimizedTrainingConfig:
"""Optimized configuration for QLoRA training"""
# Model configuration - use local Ollama model
base_model: str = "C:/Users/simpl/.ollama/models/llama3.2:3b"
adapter_name: str = "the-trial-adapter-v2"
# Data configuration
dataset_path: str = "data/training/monte_cristo_combined.json"
max_seq_length: int = 2048
# GPU/CPU configuration
device_map: str = "auto"
torch_dtype: str = "bfloat16"
use_cpu: bool = False
# QLoRA configuration
use_4bit: bool = True
use_nested_quant: bool = False
bnb_4bit_compute_dtype: str = "bfloat16"
bnb_4bit_quant_type: str = "nf4"
# LoRA configuration
    lora_r: int = 32  # Higher rank for more adapter capacity
    lora_alpha: int = 64  # Scaled with r (alpha / r = 2.0)
    lora_dropout: float = 0.05  # Light regularization
target_modules: List[str] = field(
default_factory=lambda: [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
"lm_head", # Include language model head
]
)
# Training arguments
output_dir: str = "models/monte_cristo_qlora_v2"
num_train_epochs: int = 5 # More epochs
per_device_train_batch_size: int = 4 if torch.cuda.is_available() else 1
gradient_accumulation_steps: int = 4 if torch.cuda.is_available() else 16
learning_rate: float = 1e-4 # Slightly higher learning rate
weight_decay: float = 0.01
warmup_ratio: float = 0.05
max_grad_norm: float = 0.5
# Optimization
optim: str = "adamw_torch" if torch.cuda.is_available() else "adafactor"
lr_scheduler_type: str = "cosine_with_restarts"
logging_steps: int = 5
save_steps: int = 50
eval_steps: int = 50
save_total_limit: int = 5
# Performance optimizations
fp16: bool = (
torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] < 8
)
bf16: bool = (
torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
)
gradient_checkpointing: bool = True
dataloader_pin_memory: bool = torch.cuda.is_available()
dataloader_num_workers: int = 0 # Simplified for Windows
    # Dataloader / memory behavior
remove_unused_columns: bool = False
# Miscellaneous
report_to: str = "none" # Disable wandb for simpler execution
run_name: str = "the-trial-qlora-v2"
seed: int = 42
# CPU-specific optimizations
use_cpu_offload: bool = not torch.cuda.is_available()
offload_state_dict: bool = not torch.cuda.is_available()
class OptimizedMonteCristoTrainer:
"""Optimized trainer for The Trial SLM"""
def __init__(self, config: OptimizedTrainingConfig):
self.config = config
self.setup_directories()
self.detect_hardware()
def setup_directories(self):
"""Create necessary directories"""
Path(self.config.output_dir).mkdir(parents=True, exist_ok=True)
def detect_hardware(self):
"""Detect and configure for hardware"""
if torch.cuda.is_available():
gpu_count = torch.cuda.device_count()
gpu_name = torch.cuda.get_device_name(0)
gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
gpu_capability = torch.cuda.get_device_capability(0)
logger.info(f"GPU detected: {gpu_name}")
logger.info(f"GPU memory: {gpu_memory:.1f} GB")
logger.info(f"GPU capability: {gpu_capability}")
logger.info(f"GPU count: {gpu_count}")
self.config.use_cpu = False
self.config.per_device_train_batch_size = max(
1, min(4, int(gpu_memory / 8))
)
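            # Rough heuristic: one sample per ~8 GB of VRAM, clamped to [1, 4]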
else:
logger.info("No GPU detected - using CPU optimizations")
self.config.use_cpu = True
self.config.use_4bit = False # Disable 4-bit on CPU
self.config.torch_dtype = "float32"
self.config.gradient_accumulation_steps = 32
def load_tokenizer(self) -> AutoTokenizer:
"""Load and configure tokenizer"""
logger.info(f"Loading tokenizer for {self.config.base_model}")
tokenizer = AutoTokenizer.from_pretrained(
self.config.base_model,
trust_remote_code=True,
padding_side="right",
use_fast=True,
)
# Set pad token if not present
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
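        # Llama-family tokenizers ship without a pad token; reusing EOS
        # avoids resizing the embedding matrix.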
logger.info(f"Tokenizer vocab size: {tokenizer.vocab_size}")
return tokenizer
def load_model(self) -> AutoModelForCausalLM:
"""Load model with appropriate configuration"""
logger.info(f"Loading model {self.config.base_model}")
if self.config.use_cpu:
logger.info("Loading model for CPU training")
model = AutoModelForCausalLM.from_pretrained(
self.config.base_model,
torch_dtype=torch.float32,
device_map="cpu",
trust_remote_code=True,
low_cpu_mem_usage=True,
)
else:
# Configure 4-bit quantization for GPU
bnb_config = BitsAndBytesConfig(
load_in_4bit=self.config.use_4bit,
bnb_4bit_quant_type=self.config.bnb_4bit_quant_type,
bnb_4bit_compute_dtype=getattr(
torch, self.config.bnb_4bit_compute_dtype
),
bnb_4bit_use_double_quant=self.config.use_nested_quant,
)
model = AutoModelForCausalLM.from_pretrained(
self.config.base_model,
quantization_config=bnb_config,
device_map=self.config.device_map,
trust_remote_code=True,
torch_dtype=getattr(torch, self.config.torch_dtype),
)
model = prepare_model_for_kbit_training(model)
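            # Freezes the quantized base weights, upcasts norms to fp32, and
            # enables input gradients so gradient checkpointing works with
            # 4-bit weights.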
logger.info(f"Model loaded on device: {next(model.parameters()).device}")
return model
def setup_lora(self, model: AutoModelForCausalLM) -> AutoModelForCausalLM:
"""Setup LoRA adapter with optimized settings"""
logger.info("Setting up optimized LoRA adapter")
lora_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=self.config.lora_r,
lora_alpha=self.config.lora_alpha,
lora_dropout=self.config.lora_dropout,
target_modules=self.config.target_modules,
bias="none",
init_lora_weights=True,
)
model = get_peft_model(model, lora_config)
# Print trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
all_params = sum(p.numel() for p in model.parameters())
logger.info(f"Trainable parameters: {trainable_params:,}")
logger.info(f"All parameters: {all_params:,}")
logger.info(f"Trainable%: {100 * trainable_params / all_params:.2f}%")
return model
def load_and_preprocess_data(self, tokenizer: AutoTokenizer) -> Dataset:
"""Load and preprocess training data"""
logger.info(f"Loading dataset from {self.config.dataset_path}")
# Load dataset
with open(self.config.dataset_path, "r", encoding="utf-8") as f:
data = json.load(f)
logger.info(f"Loaded {len(data)} training examples")
# Convert to HuggingFace Dataset
dataset = Dataset.from_list(data)
# Enhanced tokenization function
def tokenize_function(examples):
# Format prompts for better training
prompts = []
for i in range(len(examples["instruction"])):
instruction = examples["instruction"][i]
input_text = examples["input"][i] if examples["input"][i] else ""
output = examples["output"][i]
category = (
examples["category"][i] if "category" in examples else "general"
)
# Enhanced prompt format with category context
if category == "factual":
context = "Answer this factual question about The Trial."
elif category == "analysis":
context = (
"Provide literary analysis for this question about The Trial."
)
elif category == "creative":
context = "Respond creatively in the style of Alexandre Dumas to this prompt."
else:
context = "Respond to this question about The Trial."
prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an expert on Franz Kafka's "The Trial". {context}
<|eot_id|><|start_header_id|>user<|end_header_id|>
{instruction}
{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{output}<|eot_id|>"""
prompts.append(prompt)
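            # The markup above mirrors the Llama 3 instruct chat template;
            # ending the assistant turn with <|eot_id|> teaches the model
            # where to stop generating.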
# Tokenize with optimizations
tokenized = tokenizer(
prompts,
truncation=True,
padding=False,
max_length=self.config.max_seq_length,
return_tensors=None,
return_attention_mask=False,
)
# Set labels for causal LM
tokenized["labels"] = tokenized["input_ids"].copy()
return tokenized
# Apply tokenization
tokenized_dataset = dataset.map(
tokenize_function,
batched=True,
remove_columns=dataset.column_names,
desc="Tokenizing dataset",
)
logger.info(f"Tokenized dataset: {len(tokenized_dataset)} examples")
return tokenized_dataset
def create_trainer(
self,
model: AutoModelForCausalLM,
tokenizer: AutoTokenizer,
train_dataset: Dataset,
) -> Trainer:
"""Create optimized Trainer instance"""
# Enhanced training arguments
training_args = TrainingArguments(
output_dir=self.config.output_dir,
num_train_epochs=self.config.num_train_epochs,
per_device_train_batch_size=self.config.per_device_train_batch_size,
gradient_accumulation_steps=self.config.gradient_accumulation_steps,
learning_rate=self.config.learning_rate,
weight_decay=self.config.weight_decay,
warmup_ratio=self.config.warmup_ratio,
max_grad_norm=self.config.max_grad_norm,
optim=self.config.optim,
lr_scheduler_type=self.config.lr_scheduler_type,
logging_steps=self.config.logging_steps,
save_steps=self.config.save_steps,
eval_steps=self.config.eval_steps,
save_total_limit=self.config.save_total_limit,
fp16=self.config.fp16,
bf16=self.config.bf16,
gradient_checkpointing=self.config.gradient_checkpointing,
dataloader_pin_memory=self.config.dataloader_pin_memory,
dataloader_num_workers=self.config.dataloader_num_workers,
            # prefetch_factor requires num_workers > 0, so it is not set here
            dataloader_persistent_workers=False,
report_to=self.config.report_to,
run_name=self.config.run_name,
seed=self.config.seed,
# Performance optimizations
remove_unused_columns=self.config.remove_unused_columns,
include_tokens_per_second=True,
            # Drop the last incomplete batch so every step sees a full batch
            dataloader_drop_last=True,
)
# Optimized data collator
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False,
pad_to_multiple_of=8,
return_tensors="pt",
)
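        # pad_to_multiple_of=8 aligns padded lengths with Tensor Core tile
        # sizes, which helps fp16/bf16 throughput on NVIDIA GPUs.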
# Create trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=None,
data_collator=data_collator,
tokenizer=tokenizer,
)
return trainer
def train(self):
"""Execute optimized training"""
logger.info("Starting optimized The Trial SLM training...")
logger.info(f"Using {'GPU' if not self.config.use_cpu else 'CPU'} training")
# Load components
tokenizer = self.load_tokenizer()
model = self.load_model()
model = self.setup_lora(model)
train_dataset = self.load_and_preprocess_data(tokenizer)
trainer = self.create_trainer(model, tokenizer, train_dataset)
# Train model
logger.info("Beginning training...")
trainer.train()
# Save final model
logger.info("Saving final model...")
trainer.save_model()
tokenizer.save_pretrained(self.config.output_dir)
# Save adapter separately for Ollama
adapter_path = Path(self.config.output_dir) / "adapter_model"
if adapter_path.exists():
logger.info(f"Adapter saved to {adapter_path}")
logger.info("Training completed successfully!")
return trainer, model
def main():
"""Main training function"""
logger.info("The Trial SLM - Optimized GPU/CPU Training")
logger.info("=" * 60)
# Configuration with auto-detection
config = OptimizedTrainingConfig()
# Create trainer
trainer_instance = OptimizedMonteCristoTrainer(config)
# Execute training
try:
trainer, model = trainer_instance.train()
logger.info("=" * 60)
logger.info("OPTIMIZED TRAINING COMPLETED SUCCESSFULLY!")
logger.info(f"Model saved to: {config.output_dir}")
logger.info("=" * 60)
except Exception as e:
logger.error(f"Training failed: {e}")
raise
if __name__ == "__main__":
main()