The Trial - Initial commit
This commit is contained in:
427
scripts/optimized_train.py
Normal file
427
scripts/optimized_train.py
Normal file
@@ -0,0 +1,427 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Optimized GPU Training Script for The Trial SLM
|
||||
Uses QLoRA with CUDA acceleration
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import torch
|
||||
import wandb
|
||||
from datasets import Dataset
|
||||
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
|
||||
from transformers import (
|
||||
AutoModelForCausalLM,
|
||||
AutoTokenizer,
|
||||
BitsAndBytesConfig,
|
||||
DataCollatorForLanguageModeling,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
)
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class OptimizedTrainingConfig:
    """Optimized configuration for QLoRA training.

    All defaults are evaluated once, at class-definition (import) time.
    In particular, every ``torch.cuda.is_available()`` /
    ``torch.cuda.get_device_capability(0)`` expression below is a snapshot
    taken when this module is imported, not when training starts.
    ``OptimizedMonteCristoTrainer.detect_hardware`` later overrides several
    of these fields on the instance.
    """

    # Model configuration - use local Ollama model
    # NOTE(review): hard-coded, user-specific Windows path — portable only on
    # this machine; consider an env var or CLI argument.
    base_model: str = "C:/Users/simpl/.ollama/models/llama3.2:3b"
    adapter_name: str = "the-trial-adapter-v2"

    # Data configuration
    # NOTE(review): file is named "monte_cristo_*" while the project text
    # says "The Trial" — presumably intentional reuse; verify dataset.
    dataset_path: str = "data/training/monte_cristo_combined.json"
    # Maximum tokenized sequence length; longer prompts are truncated.
    max_seq_length: int = 2048

    # GPU/CPU configuration
    device_map: str = "auto"
    torch_dtype: str = "bfloat16"
    # Flipped to True by detect_hardware() when no CUDA device is found.
    use_cpu: bool = False

    # QLoRA configuration
    use_4bit: bool = True
    use_nested_quant: bool = False
    # Name of a torch dtype attribute (resolved via getattr(torch, ...)).
    bnb_4bit_compute_dtype: str = "bfloat16"
    bnb_4bit_quant_type: str = "nf4"

    # LoRA configuration
    lora_r: int = 32  # Increased for better learning
    lora_alpha: int = 64  # Increased for better learning
    lora_dropout: float = 0.05  # Reduced dropout
    # Mutable default correctly supplied via default_factory.
    target_modules: List[str] = field(
        default_factory=lambda: [
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
            "lm_head",  # Include language model head
        ]
    )

    # Training arguments
    output_dir: str = "models/monte_cristo_qlora_v2"
    num_train_epochs: int = 5  # More epochs
    # Evaluated at import time; may be re-tuned per-GPU by detect_hardware().
    per_device_train_batch_size: int = 4 if torch.cuda.is_available() else 1
    gradient_accumulation_steps: int = 4 if torch.cuda.is_available() else 16
    learning_rate: float = 1e-4  # Slightly higher learning rate
    weight_decay: float = 0.01
    warmup_ratio: float = 0.05
    max_grad_norm: float = 0.5

    # Optimization
    optim: str = "adamw_torch" if torch.cuda.is_available() else "adafactor"
    lr_scheduler_type: str = "cosine_with_restarts"
    logging_steps: int = 5
    save_steps: int = 50
    # NOTE(review): eval_steps is configured but no eval dataset is ever
    # supplied to the Trainer, so evaluation never runs.
    eval_steps: int = 50
    save_total_limit: int = 5

    # Performance optimizations
    # fp16 on pre-Ampere GPUs (compute capability < 8), bf16 on Ampere+;
    # both False on CPU. The `and` short-circuits, so
    # get_device_capability is only called when CUDA is available.
    fp16: bool = (
        torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] < 8
    )
    bf16: bool = (
        torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
    )
    gradient_checkpointing: bool = True
    dataloader_pin_memory: bool = torch.cuda.is_available()
    dataloader_num_workers: int = 0  # Simplified for Windows
    # Mixed precision and memory
    dataloader_prefetch: bool = torch.cuda.is_available()
    remove_unused_columns: bool = False

    # Miscellaneous
    report_to: str = "none"  # Disable wandb for simpler execution
    run_name: str = "the-trial-qlora-v2"
    seed: int = 42

    # CPU-specific optimizations
    use_cpu_offload: bool = not torch.cuda.is_available()
    offload_state_dict: bool = not torch.cuda.is_available()
|
||||
|
||||
|
||||
class OptimizedMonteCristoTrainer:
    """Optimized trainer for The Trial SLM.

    Orchestrates a full QLoRA fine-tuning run: hardware detection,
    tokenizer/model loading (4-bit quantized on GPU, fp32 on CPU),
    LoRA adapter setup, dataset tokenization, and Trainer execution.
    """

    def __init__(self, config: OptimizedTrainingConfig):
        """Store the config, create output directories, and adapt to hardware."""
        self.config = config
        self.setup_directories()
        self.detect_hardware()

    def setup_directories(self):
        """Create necessary directories (output dir, including parents)."""
        Path(self.config.output_dir).mkdir(parents=True, exist_ok=True)

    def detect_hardware(self):
        """Detect available hardware and mutate the config accordingly.

        On GPU: logs device info and sizes the per-device batch from VRAM.
        On CPU: disables 4-bit quantization, forces float32, and raises
        gradient accumulation to compensate for the tiny batch size.
        """
        if torch.cuda.is_available():
            gpu_count = torch.cuda.device_count()
            gpu_name = torch.cuda.get_device_name(0)
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
            gpu_capability = torch.cuda.get_device_capability(0)

            logger.info(f"GPU detected: {gpu_name}")
            logger.info(f"GPU memory: {gpu_memory:.1f} GB")
            logger.info(f"GPU capability: {gpu_capability}")
            logger.info(f"GPU count: {gpu_count}")

            self.config.use_cpu = False
            # Heuristic: ~8 GB of VRAM per sample, clamped to [1, 4].
            self.config.per_device_train_batch_size = max(
                1, min(4, int(gpu_memory / 8))
            )
        else:
            logger.info("No GPU detected - using CPU optimizations")
            self.config.use_cpu = True
            self.config.use_4bit = False  # Disable 4-bit on CPU
            self.config.torch_dtype = "float32"
            self.config.gradient_accumulation_steps = 32

    def load_tokenizer(self) -> AutoTokenizer:
        """Load and configure the tokenizer for the base model.

        Ensures a pad token exists (falls back to EOS) so that padding in
        the data collator works.
        """
        logger.info(f"Loading tokenizer for {self.config.base_model}")

        tokenizer = AutoTokenizer.from_pretrained(
            self.config.base_model,
            trust_remote_code=True,
            padding_side="right",
            use_fast=True,
        )

        # Set pad token if not present
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        logger.info(f"Tokenizer vocab size: {tokenizer.vocab_size}")
        return tokenizer

    def load_model(self) -> AutoModelForCausalLM:
        """Load the base model.

        CPU path: plain fp32 load. GPU path: 4-bit NF4 quantized load
        (bitsandbytes) followed by k-bit training preparation.
        """
        logger.info(f"Loading model {self.config.base_model}")

        if self.config.use_cpu:
            logger.info("Loading model for CPU training")
            model = AutoModelForCausalLM.from_pretrained(
                self.config.base_model,
                torch_dtype=torch.float32,
                device_map="cpu",
                trust_remote_code=True,
                low_cpu_mem_usage=True,
            )
        else:
            # Configure 4-bit quantization for GPU
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=self.config.use_4bit,
                bnb_4bit_quant_type=self.config.bnb_4bit_quant_type,
                # Resolve the dtype name (e.g. "bfloat16") to a torch dtype.
                bnb_4bit_compute_dtype=getattr(
                    torch, self.config.bnb_4bit_compute_dtype
                ),
                bnb_4bit_use_double_quant=self.config.use_nested_quant,
            )

            model = AutoModelForCausalLM.from_pretrained(
                self.config.base_model,
                quantization_config=bnb_config,
                device_map=self.config.device_map,
                trust_remote_code=True,
                torch_dtype=getattr(torch, self.config.torch_dtype),
            )

            # Enable gradient checkpointing hooks / cast norms for k-bit training.
            model = prepare_model_for_kbit_training(model)

        logger.info(f"Model loaded on device: {next(model.parameters()).device}")
        return model

    def setup_lora(self, model: AutoModelForCausalLM) -> AutoModelForCausalLM:
        """Attach a LoRA adapter and log the trainable-parameter budget."""
        logger.info("Setting up optimized LoRA adapter")

        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            r=self.config.lora_r,
            lora_alpha=self.config.lora_alpha,
            lora_dropout=self.config.lora_dropout,
            target_modules=self.config.target_modules,
            bias="none",
            init_lora_weights=True,
        )

        model = get_peft_model(model, lora_config)

        # Print trainable parameters
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        all_params = sum(p.numel() for p in model.parameters())

        logger.info(f"Trainable parameters: {trainable_params:,}")
        logger.info(f"All parameters: {all_params:,}")
        logger.info(f"Trainable%: {100 * trainable_params / all_params:.2f}%")

        return model

    def load_and_preprocess_data(self, tokenizer: AutoTokenizer) -> Dataset:
        """Load the JSON instruction dataset and tokenize it.

        Each example is rendered into the Llama-3 chat template with a
        category-dependent system context, then tokenized with truncation
        at ``max_seq_length``. Labels are a copy of the input ids
        (causal-LM objective; the collator re-derives padded labels).
        """
        logger.info(f"Loading dataset from {self.config.dataset_path}")

        # Load dataset
        with open(self.config.dataset_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        logger.info(f"Loaded {len(data)} training examples")

        # Convert to HuggingFace Dataset
        dataset = Dataset.from_list(data)

        # Enhanced tokenization function
        def tokenize_function(examples):
            # Format prompts for better training
            prompts = []
            for i in range(len(examples["instruction"])):
                instruction = examples["instruction"][i]
                input_text = examples["input"][i] if examples["input"][i] else ""
                output = examples["output"][i]
                category = (
                    examples["category"][i] if "category" in examples else "general"
                )

                # Enhanced prompt format with category context
                if category == "factual":
                    context = "Answer this factual question about The Trial."
                elif category == "analysis":
                    context = (
                        "Provide literary analysis for this question about The Trial."
                    )
                elif category == "creative":
                    context = "Respond creatively in the style of Alexandre Dumas to this prompt."
                else:
                    context = "Respond to this question about The Trial."

                prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert on Alexandre Dumas' "The Trial". {context}

<|eot_id|><|start_header_id|>user<|end_header_id|>

{instruction}

{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{output}<|eot_id|>"""

                prompts.append(prompt)

            # Tokenize with optimizations
            tokenized = tokenizer(
                prompts,
                truncation=True,
                padding=False,
                max_length=self.config.max_seq_length,
                return_tensors=None,
                return_attention_mask=False,
            )

            # Set labels for causal LM
            tokenized["labels"] = tokenized["input_ids"].copy()

            return tokenized

        # Apply tokenization
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=dataset.column_names,
            desc="Tokenizing dataset",
        )

        logger.info(f"Tokenized dataset: {len(tokenized_dataset)} examples")
        return tokenized_dataset

    def create_trainer(
        self,
        model: AutoModelForCausalLM,
        tokenizer: AutoTokenizer,
        train_dataset: Dataset,
    ) -> Trainer:
        """Create an optimized Trainer instance.

        BUGFIX: the original passed ``dataloader_prefetch=...`` to
        TrainingArguments, which has no such field — TrainingArguments is a
        dataclass, so the unknown keyword raised TypeError and training
        could never start. The real parameter is
        ``dataloader_prefetch_factor``, but it requires
        ``dataloader_num_workers > 0`` (we use 0), so the argument is
        simply omitted.
        """

        # Enhanced training arguments
        training_args = TrainingArguments(
            output_dir=self.config.output_dir,
            num_train_epochs=self.config.num_train_epochs,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            learning_rate=self.config.learning_rate,
            weight_decay=self.config.weight_decay,
            warmup_ratio=self.config.warmup_ratio,
            max_grad_norm=self.config.max_grad_norm,
            optim=self.config.optim,
            lr_scheduler_type=self.config.lr_scheduler_type,
            logging_steps=self.config.logging_steps,
            save_steps=self.config.save_steps,
            eval_steps=self.config.eval_steps,
            save_total_limit=self.config.save_total_limit,
            fp16=self.config.fp16,
            bf16=self.config.bf16,
            gradient_checkpointing=self.config.gradient_checkpointing,
            dataloader_pin_memory=self.config.dataloader_pin_memory,
            dataloader_num_workers=self.config.dataloader_num_workers,
            dataloader_persistent_workers=False,  # Simplified
            report_to=self.config.report_to,
            run_name=self.config.run_name,
            seed=self.config.seed,
            # Performance optimizations
            remove_unused_columns=self.config.remove_unused_columns,
            include_tokens_per_second=True,
            # Memory optimizations
            dataloader_drop_last=True,
        )

        # Optimized data collator (mlm=False => causal-LM labels)
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
            pad_to_multiple_of=8,
            return_tensors="pt",
        )

        # Create trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=None,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )

        return trainer

    def train(self):
        """Execute the full training pipeline.

        Returns:
            tuple: (trainer, model) after training and saving.
        """
        logger.info("Starting optimized The Trial SLM training...")
        logger.info(f"Using {'GPU' if not self.config.use_cpu else 'CPU'} training")

        # Load components
        tokenizer = self.load_tokenizer()
        model = self.load_model()
        model = self.setup_lora(model)
        train_dataset = self.load_and_preprocess_data(tokenizer)
        trainer = self.create_trainer(model, tokenizer, train_dataset)

        # Train model
        logger.info("Beginning training...")
        trainer.train()

        # Save final model
        logger.info("Saving final model...")
        trainer.save_model()
        tokenizer.save_pretrained(self.config.output_dir)

        # Save adapter separately for Ollama
        adapter_path = Path(self.config.output_dir) / "adapter_model"
        if adapter_path.exists():
            logger.info(f"Adapter saved to {adapter_path}")

        logger.info("Training completed successfully!")

        return trainer, model
|
||||
|
||||
|
||||
def main():
    """Entry point: build the config, run training, and report the outcome."""
    banner = "=" * 60
    logger.info("The Trial SLM - Optimized GPU/CPU Training")
    logger.info(banner)

    # Auto-detecting configuration (GPU/CPU settings resolved on construction)
    cfg = OptimizedTrainingConfig()

    # The trainer wires up directories and hardware detection in __init__
    runner = OptimizedMonteCristoTrainer(cfg)

    try:
        trainer, model = runner.train()
    except Exception as e:
        # Log the failure, then propagate so the process exits non-zero.
        logger.error(f"Training failed: {e}")
        raise
    else:
        logger.info(banner)
        logger.info("OPTIMIZED TRAINING COMPLETED SUCCESSFULLY!")
        logger.info(f"Model saved to: {cfg.output_dir}")
        logger.info(banner)
|
||||
|
||||
|
||||
# Run training only when executed as a script (not on import).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user