#!/usr/bin/env python3
"""
Optimized GPU/CPU training script for The Trial SLM.

Uses QLoRA with CUDA acceleration when a GPU is available and falls back to
CPU-friendly settings otherwise.
"""
import json
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import List

import torch
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class OptimizedTrainingConfig:
    """Optimized configuration for QLoRA training"""

    # Model configuration - use local Ollama model
    base_model: str = "C:/Users/simpl/.ollama/models/llama3.2:3b"
    adapter_name: str = "the-trial-adapter-v2"

    # Data configuration
    dataset_path: str = "data/training/monte_cristo_combined.json"
    max_seq_length: int = 2048

    # GPU/CPU configuration
    device_map: str = "auto"
    torch_dtype: str = "bfloat16"
    use_cpu: bool = False

    # QLoRA configuration
    use_4bit: bool = True
    use_nested_quant: bool = False
    bnb_4bit_compute_dtype: str = "bfloat16"
    bnb_4bit_quant_type: str = "nf4"

    # LoRA configuration
    lora_r: int = 32  # Increased for better learning
    lora_alpha: int = 64  # Increased for better learning
    lora_dropout: float = 0.05  # Reduced dropout
    target_modules: List[str] = field(
        default_factory=lambda: [
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
            "lm_head",  # Include language model head
        ]
    )

    # Training arguments
    output_dir: str = "models/monte_cristo_qlora_v2"
    num_train_epochs: int = 5  # More epochs
    per_device_train_batch_size: int = 4 if torch.cuda.is_available() else 1
    gradient_accumulation_steps: int = 4 if torch.cuda.is_available() else 16
    learning_rate: float = 1e-4  # Slightly higher learning rate
    weight_decay: float = 0.01
    warmup_ratio: float = 0.05
    max_grad_norm: float = 0.5

    # Optimization
    optim: str = "adamw_torch" if torch.cuda.is_available() else "adafactor"
    lr_scheduler_type: str = "cosine_with_restarts"
    logging_steps: int = 5
    save_steps: int = 50
    eval_steps: int = 50
    save_total_limit: int = 5

    # Performance optimizations
    fp16: bool = (
        torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] < 8
    )
    bf16: bool = (
        torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
    )
    gradient_checkpointing: bool = True
    dataloader_pin_memory: bool = torch.cuda.is_available()
    dataloader_num_workers: int = 0  # Simplified for Windows

    # Memory
    remove_unused_columns: bool = False

    # Miscellaneous
    report_to: str = "none"  # Disable wandb for simpler execution
    run_name: str = "the-trial-qlora-v2"
    seed: int = 42

    # CPU-specific optimizations
    use_cpu_offload: bool = not torch.cuda.is_available()
    offload_state_dict: bool = not torch.cuda.is_available()
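
# The defaults above auto-detect CUDA; individual fields can be overridden when
# the config is constructed. A minimal sketch (the values below are illustrative
# assumptions for a smaller GPU, not tuned settings):
#
#   config = OptimizedTrainingConfig(
#       per_device_train_batch_size=1,
#       gradient_accumulation_steps=16,
#       max_seq_length=1024,
#   )
#
# The effective batch size seen by the optimizer is
# per_device_train_batch_size * gradient_accumulation_steps
# (4 * 4 = 16 with the GPU defaults above).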


class OptimizedMonteCristoTrainer:
    """Optimized trainer for The Trial SLM"""

    def __init__(self, config: OptimizedTrainingConfig):
        self.config = config
        self.setup_directories()
        self.detect_hardware()

    def setup_directories(self):
        """Create necessary directories"""
        Path(self.config.output_dir).mkdir(parents=True, exist_ok=True)

    def detect_hardware(self):
        """Detect and configure for hardware"""
        if torch.cuda.is_available():
            gpu_count = torch.cuda.device_count()
            gpu_name = torch.cuda.get_device_name(0)
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
            gpu_capability = torch.cuda.get_device_capability(0)
            logger.info(f"GPU detected: {gpu_name}")
            logger.info(f"GPU memory: {gpu_memory:.1f} GB")
            logger.info(f"GPU capability: {gpu_capability}")
            logger.info(f"GPU count: {gpu_count}")
            self.config.use_cpu = False
            self.config.per_device_train_batch_size = max(
                1, min(4, int(gpu_memory / 8))
            )
        else:
            logger.info("No GPU detected - using CPU optimizations")
            self.config.use_cpu = True
            self.config.use_4bit = False  # Disable 4-bit on CPU
            self.config.torch_dtype = "float32"
            self.config.gradient_accumulation_steps = 32

    def load_tokenizer(self) -> AutoTokenizer:
        """Load and configure tokenizer"""
        logger.info(f"Loading tokenizer for {self.config.base_model}")
        tokenizer = AutoTokenizer.from_pretrained(
            self.config.base_model,
            trust_remote_code=True,
            padding_side="right",
            use_fast=True,
        )

        # Set pad token if not present
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        logger.info(f"Tokenizer vocab size: {tokenizer.vocab_size}")
        return tokenizer

    def load_model(self) -> AutoModelForCausalLM:
        """Load model with appropriate configuration"""
        logger.info(f"Loading model {self.config.base_model}")

        if self.config.use_cpu:
            logger.info("Loading model for CPU training")
            model = AutoModelForCausalLM.from_pretrained(
                self.config.base_model,
                torch_dtype=torch.float32,
                device_map="cpu",
                trust_remote_code=True,
                low_cpu_mem_usage=True,
            )
        else:
            # Configure 4-bit quantization for GPU
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=self.config.use_4bit,
                bnb_4bit_quant_type=self.config.bnb_4bit_quant_type,
                bnb_4bit_compute_dtype=getattr(
                    torch, self.config.bnb_4bit_compute_dtype
                ),
                bnb_4bit_use_double_quant=self.config.use_nested_quant,
            )
            model = AutoModelForCausalLM.from_pretrained(
                self.config.base_model,
                quantization_config=bnb_config,
                device_map=self.config.device_map,
                trust_remote_code=True,
                torch_dtype=getattr(torch, self.config.torch_dtype),
            )
            model = prepare_model_for_kbit_training(model)

        logger.info(f"Model loaded on device: {next(model.parameters()).device}")
        return model

    def setup_lora(self, model: AutoModelForCausalLM) -> AutoModelForCausalLM:
        """Setup LoRA adapter with optimized settings"""
        logger.info("Setting up optimized LoRA adapter")

        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            r=self.config.lora_r,
            lora_alpha=self.config.lora_alpha,
            lora_dropout=self.config.lora_dropout,
            target_modules=self.config.target_modules,
            bias="none",
            init_lora_weights=True,
        )
        model = get_peft_model(model, lora_config)

        # Print trainable parameters
        trainable_params = sum(
            p.numel() for p in model.parameters() if p.requires_grad
        )
        all_params = sum(p.numel() for p in model.parameters())
        logger.info(f"Trainable parameters: {trainable_params:,}")
        logger.info(f"All parameters: {all_params:,}")
        logger.info(f"Trainable%: {100 * trainable_params / all_params:.2f}%")

        return model
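
    # load_and_preprocess_data below assumes the JSON file is a list of
    # instruction-tuning records with "instruction", "input", and "output"
    # fields plus an optional "category" field. A sketch of one record (the
    # values are illustrative, not taken from the actual dataset):
    #
    #   {
    #     "instruction": "Summarize the opening chapter.",
    #     "input": "",
    #     "output": "The chapter introduces ...",
    #     "category": "factual"
    #   }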

    def load_and_preprocess_data(self, tokenizer: AutoTokenizer) -> Dataset:
        """Load and preprocess training data"""
        logger.info(f"Loading dataset from {self.config.dataset_path}")

        # Load dataset
        with open(self.config.dataset_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        logger.info(f"Loaded {len(data)} training examples")

        # Convert to HuggingFace Dataset
        dataset = Dataset.from_list(data)

        # Enhanced tokenization function
        def tokenize_function(examples):
            # Format prompts for better training
            prompts = []
            for i in range(len(examples["instruction"])):
                instruction = examples["instruction"][i]
                input_text = examples["input"][i] if examples["input"][i] else ""
                output = examples["output"][i]
                category = (
                    examples["category"][i] if "category" in examples else "general"
                )

                # Enhanced prompt format with category context
                if category == "factual":
                    context = "Answer this factual question about The Trial."
                elif category == "analysis":
                    context = (
                        "Provide literary analysis for this question about The Trial."
                    )
                elif category == "creative":
                    context = (
                        "Respond creatively in the style of Alexandre Dumas "
                        "to this prompt."
                    )
                else:
                    context = "Respond to this question about The Trial."

                prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert on Alexandre Dumas' "The Trial". {context}
<|eot_id|><|start_header_id|>user<|end_header_id|>

{instruction}
{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{output}<|eot_id|>"""
                prompts.append(prompt)

            # Tokenize with optimizations
            tokenized = tokenizer(
                prompts,
                truncation=True,
                padding=False,
                max_length=self.config.max_seq_length,
                return_tensors=None,
                return_attention_mask=False,
            )
            # Labels are created by the data collator (mlm=False copies
            # input_ids and masks padding with -100); precomputed
            # variable-length labels cannot be padded by the default collator,
            # so they are deliberately not set here.
            return tokenized

        # Apply tokenization
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=dataset.column_names,
            desc="Tokenizing dataset",
        )
        logger.info(f"Tokenized dataset: {len(tokenized_dataset)} examples")
        return tokenized_dataset

    def create_trainer(
        self,
        model: AutoModelForCausalLM,
        tokenizer: AutoTokenizer,
        train_dataset: Dataset,
    ) -> Trainer:
        """Create optimized Trainer instance"""
        # Enhanced training arguments
        training_args = TrainingArguments(
            output_dir=self.config.output_dir,
            num_train_epochs=self.config.num_train_epochs,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            learning_rate=self.config.learning_rate,
            weight_decay=self.config.weight_decay,
            warmup_ratio=self.config.warmup_ratio,
            max_grad_norm=self.config.max_grad_norm,
            optim=self.config.optim,
            lr_scheduler_type=self.config.lr_scheduler_type,
            logging_steps=self.config.logging_steps,
            save_steps=self.config.save_steps,
            eval_steps=self.config.eval_steps,
            save_total_limit=self.config.save_total_limit,
            fp16=self.config.fp16,
            bf16=self.config.bf16,
            gradient_checkpointing=self.config.gradient_checkpointing,
            dataloader_pin_memory=self.config.dataloader_pin_memory,
            dataloader_num_workers=self.config.dataloader_num_workers,
            dataloader_persistent_workers=False,  # Simplified
            report_to=self.config.report_to,
            run_name=self.config.run_name,
            seed=self.config.seed,
            # Performance optimizations
            remove_unused_columns=self.config.remove_unused_columns,
            include_tokens_per_second=True,
            # Memory optimizations
            dataloader_drop_last=True,
        )

        # Optimized data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
            pad_to_multiple_of=8,
            return_tensors="pt",
        )

        # Create trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=None,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )
        return trainer
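
    # Checkpoints are written to output_dir every save_steps steps. If a run is
    # interrupted, the standard Trainer API can resume from the latest
    # checkpoint; a sketch of how the call in train() below could be adapted:
    #
    #   trainer.train(resume_from_checkpoint=True)
    #
    # Note that eval_steps has no effect here because no eval_dataset is passed.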

    def train(self):
        """Execute optimized training"""
        logger.info("Starting optimized The Trial SLM training...")
        logger.info(f"Using {'GPU' if not self.config.use_cpu else 'CPU'} training")

        # Load components
        tokenizer = self.load_tokenizer()
        model = self.load_model()
        model = self.setup_lora(model)
        train_dataset = self.load_and_preprocess_data(tokenizer)
        trainer = self.create_trainer(model, tokenizer, train_dataset)

        # Train model
        logger.info("Beginning training...")
        trainer.train()

        # Save final model
        logger.info("Saving final model...")
        trainer.save_model()
        tokenizer.save_pretrained(self.config.output_dir)

        # Report the adapter weights kept for later use with Ollama
        # (PEFT writes them as adapter_model.safetensors in output_dir)
        adapter_path = Path(self.config.output_dir) / "adapter_model.safetensors"
        if adapter_path.exists():
            logger.info(f"Adapter saved to {adapter_path}")

        logger.info("Training completed successfully!")
        return trainer, model


def main():
    """Main training function"""
    logger.info("The Trial SLM - Optimized GPU/CPU Training")
    logger.info("=" * 60)

    # Configuration with auto-detection
    config = OptimizedTrainingConfig()

    # Create trainer
    trainer_instance = OptimizedMonteCristoTrainer(config)

    # Execute training
    try:
        trainer, model = trainer_instance.train()
        logger.info("=" * 60)
        logger.info("OPTIMIZED TRAINING COMPLETED SUCCESSFULLY!")
        logger.info(f"Model saved to: {config.output_dir}")
        logger.info("=" * 60)
    except Exception as e:
        logger.error(f"Training failed: {e}")
        raise


if __name__ == "__main__":
    main()
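

# Loading the trained adapter for a quick inference check (a minimal sketch,
# assuming the adapter saved above and that the base model loads the same way
# as during training; not part of the training run itself):
#
#   from peft import PeftModel
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#
#   config = OptimizedTrainingConfig()
#   tokenizer = AutoTokenizer.from_pretrained(config.output_dir)
#   base = AutoModelForCausalLM.from_pretrained(config.base_model)
#   model = PeftModel.from_pretrained(base, config.output_dir)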