#!/usr/bin/env python3
|
|
"""
|
|
Optimized GPU Training Script for The Trial SLM
|
|
Uses QLoRA with CUDA acceleration
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional
|
|
|
|
import torch
|
|
import wandb
|
|
from datasets import Dataset
|
|
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
|
|
from transformers import (
|
|
AutoModelForCausalLM,
|
|
AutoTokenizer,
|
|
BitsAndBytesConfig,
|
|
DataCollatorForLanguageModeling,
|
|
Trainer,
|
|
TrainingArguments,
|
|
)
|
|
|
|
# Configure logging: root logger at INFO, plus the conventional
# module-level logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class OptimizedTrainingConfig:
    """Optimized configuration for QLoRA training.

    NOTE: every default that calls ``torch.cuda`` is evaluated once, at
    import (class-definition) time, so hardware-dependent defaults are
    frozen per process; ``OptimizedMonteCristoTrainer.detect_hardware``
    later overrides several of them in place.
    """

    # Model configuration - use local Ollama model
    # NOTE(review): Ollama stores models as opaque blobs, not HuggingFace
    # checkpoint directories -- confirm `from_pretrained` can load this path.
    base_model: str = "C:/Users/simpl/.ollama/models/llama3.2:3b"
    adapter_name: str = "the-trial-adapter-v2"

    # Data configuration
    # JSON file: list of records with instruction/input/output (and
    # optionally category) keys -- see load_and_preprocess_data.
    dataset_path: str = "data/training/monte_cristo_combined.json"
    max_seq_length: int = 2048  # tokenizer truncation length (tokens)

    # GPU/CPU configuration
    device_map: str = "auto"
    torch_dtype: str = "bfloat16"  # must name an attribute of `torch`
    use_cpu: bool = False

    # QLoRA configuration
    use_4bit: bool = True
    use_nested_quant: bool = False  # bitsandbytes double quantization
    bnb_4bit_compute_dtype: str = "bfloat16"
    bnb_4bit_quant_type: str = "nf4"

    # LoRA configuration
    lora_r: int = 32  # Increased for better learning
    lora_alpha: int = 64  # Increased for better learning
    lora_dropout: float = 0.05  # Reduced dropout
    target_modules: List[str] = field(
        default_factory=lambda: [
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
            "lm_head",  # Include language model head
        ]
    )

    # Training arguments
    output_dir: str = "models/monte_cristo_qlora_v2"
    num_train_epochs: int = 5  # More epochs
    per_device_train_batch_size: int = 4 if torch.cuda.is_available() else 1
    gradient_accumulation_steps: int = 4 if torch.cuda.is_available() else 16
    learning_rate: float = 1e-4  # Slightly higher learning rate
    weight_decay: float = 0.01
    warmup_ratio: float = 0.05
    max_grad_norm: float = 0.5

    # Optimization
    optim: str = "adamw_torch" if torch.cuda.is_available() else "adafactor"
    lr_scheduler_type: str = "cosine_with_restarts"
    logging_steps: int = 5
    save_steps: int = 50
    eval_steps: int = 50  # NOTE(review): no eval dataset is ever supplied
    save_total_limit: int = 5

    # Performance optimizations
    # fp16 for pre-Ampere GPUs (compute capability < 8), bf16 for Ampere+;
    # mutually exclusive by construction.
    fp16: bool = (
        torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] < 8
    )
    bf16: bool = (
        torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
    )
    gradient_checkpointing: bool = True
    dataloader_pin_memory: bool = torch.cuda.is_available()
    dataloader_num_workers: int = 0  # Simplified for Windows

    # Mixed precision and memory
    # NOTE(review): `dataloader_prefetch` is not a TrainingArguments
    # parameter (the real knob is `dataloader_prefetch_factor`, an int) --
    # verify before forwarding this value to TrainingArguments.
    dataloader_prefetch: bool = torch.cuda.is_available()
    remove_unused_columns: bool = False

    # Miscellaneous
    report_to: str = "none"  # Disable wandb for simpler execution
    run_name: str = "the-trial-qlora-v2"
    seed: int = 42

    # CPU-specific optimizations
    # NOTE(review): these two flags are never read elsewhere in this file.
    use_cpu_offload: bool = not torch.cuda.is_available()
    offload_state_dict: bool = not torch.cuda.is_available()
|
|
|
|
|
|
class OptimizedMonteCristoTrainer:
    """Optimized trainer for The Trial SLM.

    Drives the full QLoRA fine-tuning pipeline from an
    ``OptimizedTrainingConfig``: hardware detection, tokenizer/model loading
    (4-bit quantized on GPU, float32 on CPU), LoRA adapter setup, dataset
    tokenization into the Llama-3 chat template, and the HuggingFace
    ``Trainer`` loop.
    """

    def __init__(self, config: OptimizedTrainingConfig):
        self.config = config
        self.setup_directories()
        self.detect_hardware()

    def setup_directories(self):
        """Create the output directory tree (idempotent)."""
        Path(self.config.output_dir).mkdir(parents=True, exist_ok=True)

    def detect_hardware(self):
        """Detect available hardware and mutate the config in place.

        GPU path: logs device info and scales the per-device batch size with
        VRAM (roughly one sample per 8 GB, clamped to [1, 4]).
        CPU path: disables 4-bit quantization (bitsandbytes requires CUDA),
        forces float32, and compensates the tiny batch with heavier gradient
        accumulation.
        """
        if torch.cuda.is_available():
            gpu_count = torch.cuda.device_count()
            gpu_name = torch.cuda.get_device_name(0)
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
            gpu_capability = torch.cuda.get_device_capability(0)

            logger.info(f"GPU detected: {gpu_name}")
            logger.info(f"GPU memory: {gpu_memory:.1f} GB")
            logger.info(f"GPU capability: {gpu_capability}")
            logger.info(f"GPU count: {gpu_count}")

            self.config.use_cpu = False
            # Heuristic: ~8 GB of VRAM per sample, clamped to [1, 4].
            self.config.per_device_train_batch_size = max(
                1, min(4, int(gpu_memory / 8))
            )

        else:
            logger.info("No GPU detected - using CPU optimizations")
            self.config.use_cpu = True
            self.config.use_4bit = False  # Disable 4-bit on CPU
            self.config.torch_dtype = "float32"
            self.config.gradient_accumulation_steps = 32

    def load_tokenizer(self) -> AutoTokenizer:
        """Load and configure the tokenizer for ``config.base_model``.

        Right-padding is used (required for causal-LM training); a missing
        pad token is aliased to EOS.
        """
        logger.info(f"Loading tokenizer for {self.config.base_model}")

        tokenizer = AutoTokenizer.from_pretrained(
            self.config.base_model,
            trust_remote_code=True,
            padding_side="right",
            use_fast=True,
        )

        # Set pad token if not present (Llama tokenizers ship without one).
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        logger.info(f"Tokenizer vocab size: {tokenizer.vocab_size}")
        return tokenizer

    def load_model(self) -> AutoModelForCausalLM:
        """Load the base model for the detected hardware.

        CPU: plain float32 weights, no quantization.
        GPU: 4-bit NF4 quantization via bitsandbytes, then
        ``prepare_model_for_kbit_training`` (casts norms to fp32, enables
        input-gradient hooks) -- only needed on the quantized path.
        """
        logger.info(f"Loading model {self.config.base_model}")

        if self.config.use_cpu:
            logger.info("Loading model for CPU training")
            model = AutoModelForCausalLM.from_pretrained(
                self.config.base_model,
                torch_dtype=torch.float32,
                device_map="cpu",
                trust_remote_code=True,
                low_cpu_mem_usage=True,
            )
        else:
            # Configure 4-bit quantization for GPU
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=self.config.use_4bit,
                bnb_4bit_quant_type=self.config.bnb_4bit_quant_type,
                bnb_4bit_compute_dtype=getattr(
                    torch, self.config.bnb_4bit_compute_dtype
                ),
                bnb_4bit_use_double_quant=self.config.use_nested_quant,
            )

            model = AutoModelForCausalLM.from_pretrained(
                self.config.base_model,
                quantization_config=bnb_config,
                device_map=self.config.device_map,
                trust_remote_code=True,
                torch_dtype=getattr(torch, self.config.torch_dtype),
            )

            # k-bit preparation only applies to the quantized (GPU) model;
            # running it on the float32 CPU model is unnecessary.
            model = prepare_model_for_kbit_training(model)

        logger.info(f"Model loaded on device: {next(model.parameters()).device}")
        return model

    def setup_lora(self, model: AutoModelForCausalLM) -> AutoModelForCausalLM:
        """Wrap the model with a LoRA adapter and log parameter counts."""
        logger.info("Setting up optimized LoRA adapter")

        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            r=self.config.lora_r,
            lora_alpha=self.config.lora_alpha,
            lora_dropout=self.config.lora_dropout,
            target_modules=self.config.target_modules,
            bias="none",
            init_lora_weights=True,
        )

        model = get_peft_model(model, lora_config)

        # Report how much of the model is actually trainable.
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        all_params = sum(p.numel() for p in model.parameters())

        logger.info(f"Trainable parameters: {trainable_params:,}")
        logger.info(f"All parameters: {all_params:,}")
        logger.info(f"Trainable%: {100 * trainable_params / all_params:.2f}%")

        return model

    def load_and_preprocess_data(self, tokenizer: AutoTokenizer) -> Dataset:
        """Load the JSON dataset and tokenize it into Llama-3 chat prompts.

        Each record needs ``instruction`` and ``output``; ``input`` and
        ``category`` are optional. Labels are a copy of input_ids (causal LM).
        """
        logger.info(f"Loading dataset from {self.config.dataset_path}")

        # Load dataset (a JSON list of example dicts).
        with open(self.config.dataset_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        logger.info(f"Loaded {len(data)} training examples")

        # Convert to HuggingFace Dataset
        dataset = Dataset.from_list(data)

        def tokenize_function(examples):
            """Batched map fn: format prompts, tokenize, attach labels."""
            prompts = []
            for i in range(len(examples["instruction"])):
                instruction = examples["instruction"][i]
                # BUG FIX: guard the optional "input" column like "category"
                # below -- the original indexed it unconditionally and raised
                # KeyError on datasets without that column.
                input_text = (
                    examples["input"][i]
                    if "input" in examples and examples["input"][i]
                    else ""
                )
                output = examples["output"][i]
                category = (
                    examples["category"][i] if "category" in examples else "general"
                )

                # Category-specific system context steers the response style.
                if category == "factual":
                    context = "Answer this factual question about The Trial."
                elif category == "analysis":
                    context = (
                        "Provide literary analysis for this question about The Trial."
                    )
                elif category == "creative":
                    context = "Respond creatively in the style of Alexandre Dumas to this prompt."
                else:
                    context = "Respond to this question about The Trial."

                prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert on Alexandre Dumas' "The Trial". {context}

<|eot_id|><|start_header_id|>user<|end_header_id|>

{instruction}

{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{output}<|eot_id|>"""

                prompts.append(prompt)

            # Tokenize; padding is deferred to the data collator.
            tokenized = tokenizer(
                prompts,
                truncation=True,
                padding=False,
                max_length=self.config.max_seq_length,
                return_tensors=None,
                return_attention_mask=False,
            )

            # Labels for causal LM. BUG FIX: the original used
            # `input_ids.copy()`, a shallow copy sharing the inner
            # per-example lists with input_ids; copy each row so labels are
            # independent.
            tokenized["labels"] = [list(ids) for ids in tokenized["input_ids"]]

            return tokenized

        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=dataset.column_names,
            desc="Tokenizing dataset",
        )

        logger.info(f"Tokenized dataset: {len(tokenized_dataset)} examples")
        return tokenized_dataset

    def create_trainer(
        self,
        model: AutoModelForCausalLM,
        tokenizer: AutoTokenizer,
        train_dataset: Dataset,
    ) -> Trainer:
        """Build the HuggingFace ``Trainer`` from config values.

        No eval dataset is configured, so ``eval_steps`` is effectively
        inert (default evaluation strategy is "no").
        """
        training_args = TrainingArguments(
            output_dir=self.config.output_dir,
            num_train_epochs=self.config.num_train_epochs,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            learning_rate=self.config.learning_rate,
            weight_decay=self.config.weight_decay,
            warmup_ratio=self.config.warmup_ratio,
            max_grad_norm=self.config.max_grad_norm,
            optim=self.config.optim,
            lr_scheduler_type=self.config.lr_scheduler_type,
            logging_steps=self.config.logging_steps,
            save_steps=self.config.save_steps,
            eval_steps=self.config.eval_steps,
            save_total_limit=self.config.save_total_limit,
            fp16=self.config.fp16,
            bf16=self.config.bf16,
            gradient_checkpointing=self.config.gradient_checkpointing,
            dataloader_pin_memory=self.config.dataloader_pin_memory,
            dataloader_num_workers=self.config.dataloader_num_workers,
            # BUG FIX: the original passed `dataloader_prefetch=...`, which
            # is not a TrainingArguments parameter (the real knob is
            # `dataloader_prefetch_factor`, an int requiring num_workers > 0)
            # and raised TypeError; the unsupported kwarg is dropped.
            dataloader_persistent_workers=False,  # Simplified
            report_to=self.config.report_to,
            run_name=self.config.run_name,
            seed=self.config.seed,
            # Performance optimizations
            remove_unused_columns=self.config.remove_unused_columns,
            include_tokens_per_second=True,
            # Memory optimizations
            dataloader_drop_last=True,
        )

        # Causal-LM collator: dynamic padding to a multiple of 8 (tensor-core
        # friendly), no masked-LM objective.
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
            pad_to_multiple_of=8,
            return_tensors="pt",
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=None,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )

        return trainer

    def train(self):
        """Execute the full pipeline; return ``(trainer, model)``."""
        logger.info("Starting optimized The Trial SLM training...")
        logger.info(f"Using {'GPU' if not self.config.use_cpu else 'CPU'} training")

        # Load components
        tokenizer = self.load_tokenizer()
        model = self.load_model()
        model = self.setup_lora(model)
        train_dataset = self.load_and_preprocess_data(tokenizer)
        trainer = self.create_trainer(model, tokenizer, train_dataset)

        # Train model
        logger.info("Beginning training...")
        trainer.train()

        # Save final model and tokenizer side by side.
        logger.info("Saving final model...")
        trainer.save_model()
        tokenizer.save_pretrained(self.config.output_dir)

        # Save adapter separately for Ollama (informational only: just logs
        # whether PEFT wrote an adapter directory).
        adapter_path = Path(self.config.output_dir) / "adapter_model"
        if adapter_path.exists():
            logger.info(f"Adapter saved to {adapter_path}")

        logger.info("Training completed successfully!")

        return trainer, model
|
|
|
|
|
|
def main():
    """Main training entry point: build the config, run training, report.

    Re-raises any training failure after logging it so the process exits
    non-zero.
    """
    logger.info("The Trial SLM - Optimized GPU/CPU Training")
    logger.info("=" * 60)

    # Configuration with auto-detection (hardware tweaks happen in the
    # trainer's detect_hardware).
    config = OptimizedTrainingConfig()

    # Create trainer
    trainer_instance = OptimizedMonteCristoTrainer(config)

    # Execute training
    try:
        trainer, model = trainer_instance.train()
        logger.info("=" * 60)
        logger.info("OPTIMIZED TRAINING COMPLETED SUCCESSFULLY!")
        logger.info(f"Model saved to: {config.output_dir}")
        logger.info("=" * 60)

    except Exception as e:
        # BUG FIX: use logger.exception so the full traceback is recorded
        # before re-raising (logger.error dropped the stack trace).
        logger.exception(f"Training failed: {e}")
        raise
|
|
|
|
|
|
# Script entry point: run training only when executed directly, not on import.
if __name__ == "__main__":
    main()
|