The Trial - Initial commit

commit c401cf655d
2026-01-17 14:59:35 -05:00
27 changed files with 132452 additions and 0 deletions

scripts/optimized_train.py (new file, 427 lines)

@@ -0,0 +1,427 @@
#!/usr/bin/env python3
"""
Optimized GPU Training Script for The Trial SLM
Uses QLoRA with CUDA acceleration; falls back to full-precision training on CPU
"""
import json
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import List
import torch
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
DataCollatorForLanguageModeling,
Trainer,
TrainingArguments,
)
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class OptimizedTrainingConfig:
"""Optimized configuration for QLoRA training"""
# Model configuration - use local Ollama model
base_model: str = "C:/Users/simpl/.ollama/models/llama3.2:3b"
adapter_name: str = "the-trial-adapter-v2"
# Data configuration
dataset_path: str = "data/training/monte_cristo_combined.json"
max_seq_length: int = 2048
# GPU/CPU configuration
device_map: str = "auto"
torch_dtype: str = "bfloat16"
use_cpu: bool = False
# QLoRA configuration
use_4bit: bool = True
use_nested_quant: bool = False
bnb_4bit_compute_dtype: str = "bfloat16"
bnb_4bit_quant_type: str = "nf4"
# LoRA configuration
    lora_r: int = 32  # Higher rank for more adapter capacity
    lora_alpha: int = 64  # Scaled with r (alpha / r = 2.0)
    lora_dropout: float = 0.05  # Light regularization
target_modules: List[str] = field(
default_factory=lambda: [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
"lm_head", # Include language model head
]
)
# Training arguments
output_dir: str = "models/monte_cristo_qlora_v2"
num_train_epochs: int = 5 # More epochs
per_device_train_batch_size: int = 4 if torch.cuda.is_available() else 1
gradient_accumulation_steps: int = 4 if torch.cuda.is_available() else 16
learning_rate: float = 1e-4 # Slightly higher learning rate
weight_decay: float = 0.01
warmup_ratio: float = 0.05
max_grad_norm: float = 0.5
# Optimization
optim: str = "adamw_torch" if torch.cuda.is_available() else "adafactor"
lr_scheduler_type: str = "cosine_with_restarts"
logging_steps: int = 5
save_steps: int = 50
eval_steps: int = 50
save_total_limit: int = 5
# Performance optimizations
fp16: bool = (
torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] < 8
)
bf16: bool = (
torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
)
gradient_checkpointing: bool = True
dataloader_pin_memory: bool = torch.cuda.is_available()
dataloader_num_workers: int = 0 # Simplified for Windows
    # Dataloader / memory behavior
remove_unused_columns: bool = False
# Miscellaneous
report_to: str = "none" # Disable wandb for simpler execution
run_name: str = "the-trial-qlora-v2"
seed: int = 42
# CPU-specific optimizations
use_cpu_offload: bool = not torch.cuda.is_available()
offload_state_dict: bool = not torch.cuda.is_available()
class OptimizedMonteCristoTrainer:
"""Optimized trainer for The Trial SLM"""
def __init__(self, config: OptimizedTrainingConfig):
self.config = config
self.setup_directories()
self.detect_hardware()
def setup_directories(self):
"""Create necessary directories"""
Path(self.config.output_dir).mkdir(parents=True, exist_ok=True)
def detect_hardware(self):
"""Detect and configure for hardware"""
if torch.cuda.is_available():
gpu_count = torch.cuda.device_count()
gpu_name = torch.cuda.get_device_name(0)
gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
gpu_capability = torch.cuda.get_device_capability(0)
logger.info(f"GPU detected: {gpu_name}")
logger.info(f"GPU memory: {gpu_memory:.1f} GB")
logger.info(f"GPU capability: {gpu_capability}")
logger.info(f"GPU count: {gpu_count}")
self.config.use_cpu = False
self.config.per_device_train_batch_size = max(
1, min(4, int(gpu_memory / 8))
)
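            # Rough heuristic: one sample per ~8 GB of VRAM, clamped to [1, 4]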
else:
logger.info("No GPU detected - using CPU optimizations")
self.config.use_cpu = True
self.config.use_4bit = False # Disable 4-bit on CPU
self.config.torch_dtype = "float32"
self.config.gradient_accumulation_steps = 32
def load_tokenizer(self) -> AutoTokenizer:
"""Load and configure tokenizer"""
logger.info(f"Loading tokenizer for {self.config.base_model}")
tokenizer = AutoTokenizer.from_pretrained(
self.config.base_model,
trust_remote_code=True,
padding_side="right",
use_fast=True,
)
# Set pad token if not present
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
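        # Llama-family tokenizers ship without a pad token; reusing EOS
        # avoids resizing the embedding matrix.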
logger.info(f"Tokenizer vocab size: {tokenizer.vocab_size}")
return tokenizer
def load_model(self) -> AutoModelForCausalLM:
"""Load model with appropriate configuration"""
logger.info(f"Loading model {self.config.base_model}")
if self.config.use_cpu:
logger.info("Loading model for CPU training")
model = AutoModelForCausalLM.from_pretrained(
self.config.base_model,
torch_dtype=torch.float32,
device_map="cpu",
trust_remote_code=True,
low_cpu_mem_usage=True,
)
else:
# Configure 4-bit quantization for GPU
bnb_config = BitsAndBytesConfig(
load_in_4bit=self.config.use_4bit,
bnb_4bit_quant_type=self.config.bnb_4bit_quant_type,
bnb_4bit_compute_dtype=getattr(
torch, self.config.bnb_4bit_compute_dtype
),
bnb_4bit_use_double_quant=self.config.use_nested_quant,
)
model = AutoModelForCausalLM.from_pretrained(
self.config.base_model,
quantization_config=bnb_config,
device_map=self.config.device_map,
trust_remote_code=True,
torch_dtype=getattr(torch, self.config.torch_dtype),
)
model = prepare_model_for_kbit_training(model)
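            # Freezes the quantized base weights, upcasts norms to fp32, and
            # enables input gradients so gradient checkpointing works with
            # 4-bit weights.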
logger.info(f"Model loaded on device: {next(model.parameters()).device}")
return model
def setup_lora(self, model: AutoModelForCausalLM) -> AutoModelForCausalLM:
"""Setup LoRA adapter with optimized settings"""
logger.info("Setting up optimized LoRA adapter")
lora_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=self.config.lora_r,
lora_alpha=self.config.lora_alpha,
lora_dropout=self.config.lora_dropout,
target_modules=self.config.target_modules,
bias="none",
init_lora_weights=True,
)
model = get_peft_model(model, lora_config)
# Print trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
all_params = sum(p.numel() for p in model.parameters())
logger.info(f"Trainable parameters: {trainable_params:,}")
logger.info(f"All parameters: {all_params:,}")
logger.info(f"Trainable%: {100 * trainable_params / all_params:.2f}%")
return model
def load_and_preprocess_data(self, tokenizer: AutoTokenizer) -> Dataset:
"""Load and preprocess training data"""
logger.info(f"Loading dataset from {self.config.dataset_path}")
# Load dataset
with open(self.config.dataset_path, "r", encoding="utf-8") as f:
data = json.load(f)
logger.info(f"Loaded {len(data)} training examples")
# Convert to HuggingFace Dataset
dataset = Dataset.from_list(data)
# Enhanced tokenization function
def tokenize_function(examples):
# Format prompts for better training
prompts = []
for i in range(len(examples["instruction"])):
instruction = examples["instruction"][i]
input_text = examples["input"][i] if examples["input"][i] else ""
output = examples["output"][i]
category = (
examples["category"][i] if "category" in examples else "general"
)
# Enhanced prompt format with category context
if category == "factual":
context = "Answer this factual question about The Trial."
elif category == "analysis":
context = (
"Provide literary analysis for this question about The Trial."
)
elif category == "creative":
context = "Respond creatively in the style of Alexandre Dumas to this prompt."
else:
context = "Respond to this question about The Trial."
prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an expert on Franz Kafka's "The Trial". {context}
<|eot_id|><|start_header_id|>user<|end_header_id|>
{instruction}
{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{output}<|eot_id|>"""
prompts.append(prompt)
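            # The markup above mirrors the Llama 3 instruct chat template;
            # ending the assistant turn with <|eot_id|> teaches the model
            # where to stop generating.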
# Tokenize with optimizations
tokenized = tokenizer(
prompts,
truncation=True,
padding=False,
max_length=self.config.max_seq_length,
return_tensors=None,
return_attention_mask=False,
)
# Set labels for causal LM
tokenized["labels"] = tokenized["input_ids"].copy()
return tokenized
# Apply tokenization
tokenized_dataset = dataset.map(
tokenize_function,
batched=True,
remove_columns=dataset.column_names,
desc="Tokenizing dataset",
)
logger.info(f"Tokenized dataset: {len(tokenized_dataset)} examples")
return tokenized_dataset
def create_trainer(
self,
model: AutoModelForCausalLM,
tokenizer: AutoTokenizer,
train_dataset: Dataset,
) -> Trainer:
"""Create optimized Trainer instance"""
# Enhanced training arguments
training_args = TrainingArguments(
output_dir=self.config.output_dir,
num_train_epochs=self.config.num_train_epochs,
per_device_train_batch_size=self.config.per_device_train_batch_size,
gradient_accumulation_steps=self.config.gradient_accumulation_steps,
learning_rate=self.config.learning_rate,
weight_decay=self.config.weight_decay,
warmup_ratio=self.config.warmup_ratio,
max_grad_norm=self.config.max_grad_norm,
optim=self.config.optim,
lr_scheduler_type=self.config.lr_scheduler_type,
logging_steps=self.config.logging_steps,
save_steps=self.config.save_steps,
eval_steps=self.config.eval_steps,
save_total_limit=self.config.save_total_limit,
fp16=self.config.fp16,
bf16=self.config.bf16,
gradient_checkpointing=self.config.gradient_checkpointing,
dataloader_pin_memory=self.config.dataloader_pin_memory,
dataloader_num_workers=self.config.dataloader_num_workers,
            # prefetch_factor requires num_workers > 0, so it is not set here
            dataloader_persistent_workers=False,
report_to=self.config.report_to,
run_name=self.config.run_name,
seed=self.config.seed,
# Performance optimizations
remove_unused_columns=self.config.remove_unused_columns,
include_tokens_per_second=True,
            # Drop the last incomplete batch so every step sees a full batch
            dataloader_drop_last=True,
)
# Optimized data collator
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False,
pad_to_multiple_of=8,
return_tensors="pt",
)
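        # pad_to_multiple_of=8 aligns padded lengths with Tensor Core tile
        # sizes, which helps fp16/bf16 throughput on NVIDIA GPUs.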
# Create trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=None,
data_collator=data_collator,
tokenizer=tokenizer,
)
return trainer
def train(self):
"""Execute optimized training"""
logger.info("Starting optimized The Trial SLM training...")
logger.info(f"Using {'GPU' if not self.config.use_cpu else 'CPU'} training")
# Load components
tokenizer = self.load_tokenizer()
model = self.load_model()
model = self.setup_lora(model)
train_dataset = self.load_and_preprocess_data(tokenizer)
trainer = self.create_trainer(model, tokenizer, train_dataset)
# Train model
logger.info("Beginning training...")
trainer.train()
# Save final model
logger.info("Saving final model...")
trainer.save_model()
tokenizer.save_pretrained(self.config.output_dir)
# Save adapter separately for Ollama
adapter_path = Path(self.config.output_dir) / "adapter_model"
if adapter_path.exists():
logger.info(f"Adapter saved to {adapter_path}")
logger.info("Training completed successfully!")
return trainer, model
def main():
"""Main training function"""
logger.info("The Trial SLM - Optimized GPU/CPU Training")
logger.info("=" * 60)
# Configuration with auto-detection
config = OptimizedTrainingConfig()
# Create trainer
trainer_instance = OptimizedMonteCristoTrainer(config)
# Execute training
try:
trainer, model = trainer_instance.train()
logger.info("=" * 60)
logger.info("OPTIMIZED TRAINING COMPLETED SUCCESSFULLY!")
logger.info(f"Model saved to: {config.output_dir}")
logger.info("=" * 60)
except Exception as e:
logger.error(f"Training failed: {e}")
raise
if __name__ == "__main__":
main()