Model Training: Best Practices and Patterns

Published: November 2025 | 25 min read

The Model Training Lifecycle

Model training is the core of any machine learning project, where algorithms learn patterns from data to make predictions or decisions. A well-structured training pipeline is crucial for developing robust, maintainable, and high-performing models.

Key Components of Model Training

  1. Data Preparation
     - Feature engineering
     - Train/validation/test splits
     - Data augmentation
     - Class imbalance handling (see the split and class-weighting sketch after this list)

  2. Model Architecture
     - Model selection
     - Architecture design
     - Custom layers/blocks
     - Transfer learning

  3. Training Process
     - Loss functions
     - Optimizers
     - Learning rate scheduling
     - Regularization techniques

  4. Evaluation
     - Metrics selection
     - Cross-validation
     - Error analysis
     - Model interpretation
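
To make the data-preparation items concrete, here is a minimal sketch of a stratified train/validation split with balanced class weights, assuming scikit-learn is available (the arrays and the 90/10 imbalance are synthetic, for illustration only):

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# Synthetic, imbalanced dataset: 1,000 samples, ~90% class 0 and ~10% class 1
X = np.random.randn(1000, 20)
y = np.random.choice([0, 1], size=1000, p=[0.9, 0.1])

# Stratified split keeps the class ratio identical in both partitions
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 'balanced' weights upweight the minority class
class_weights = compute_class_weight(
    class_weight='balanced', classes=np.unique(y_train), y=y_train
)
print(class_weights)

The resulting weights can be converted to a tensor and passed to a weighted loss such as nn.CrossEntropyLoss(weight=...), or used to build a torch.utils.data.WeightedRandomSampler.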

Implementation with PyTorch Lightning

import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from torchmetrics import Accuracy
from torchvision import datasets, transforms

class LitModel(pl.LightningModule):
    def __init__(self, learning_rate=1e-3):
        super().__init__()
        self.save_hyperparameters()

        # Model architecture
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

        # Metrics
        self.train_accuracy = Accuracy(task="multiclass", num_classes=10)
        self.val_accuracy = Accuracy(task="multiclass", num_classes=10)
        self.test_accuracy = Accuracy(task="multiclass", num_classes=10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)  # log-probabilities, paired with F.nll_loss in the step methods

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.nll_loss(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = self.train_accuracy(preds, y)

        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.nll_loss(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = self.val_accuracy(preds, y)

        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.nll_loss(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = self.test_accuracy(preds, y)

        self.log('test_loss', loss, prog_bar=True)
        self.log('test_acc', acc, prog_bar=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.1, patience=3, verbose=True
        )
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss',
                'interval': 'epoch',
                'frequency': 1
            }
        }

class MNISTDataModule(pl.LightningDataModule):
    def __init__(self, data_dir='./data', batch_size=64):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])

    def prepare_data(self):
        # Download data
        datasets.MNIST(self.data_dir, train=True, download=True)
        datasets.MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        # Assign train/val datasets
        if stage == 'fit' or stage is None:
            mnist_full = datasets.MNIST(
                self.data_dir, train=True, transform=self.transform
            )
            self.mnist_train, self.mnist_val = random_split(
                mnist_full, [55000, 5000]
            )
        # Assign test dataset
        if stage == 'test' or stage is None:
            self.mnist_test = datasets.MNIST(
                self.data_dir, train=False, transform=self.transform
            )

    def train_dataloader(self):
        return DataLoader(
            self.mnist_train, 
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4
        )

    def val_dataloader(self):
        return DataLoader(
            self.mnist_val, 
            batch_size=self.batch_size,
            num_workers=4
        )

    def test_dataloader(self):
        return DataLoader(
            self.mnist_test, 
            batch_size=self.batch_size,
            num_workers=4
        )

def train_model():
    # Seed everything first so deterministic=True below yields repeatable runs
    pl.seed_everything(42, workers=True)

    # Initialize data module and model
    dm = MNISTDataModule(batch_size=64)
    model = LitModel(learning_rate=1e-3)

    # Initialize callbacks
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        monitor='val_loss',
        dirpath='checkpoints/',
        filename='mnist-{epoch:02d}-{val_loss:.2f}',
        save_top_k=3,
        mode='min',
    )

    early_stop_callback = pl.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        verbose=True,
        mode='min'
    )

    # Initialize trainer
    trainer = pl.Trainer(
        max_epochs=20,
        callbacks=[checkpoint_callback, early_stop_callback],
        accelerator='auto',
        devices='auto',  # use available GPUs, otherwise fall back to CPU
        log_every_n_steps=50,
        deterministic=True
    )

    # Train the model
    trainer.fit(model, dm)

    # Test the model using the best checkpoint
    trainer.test(datamodule=dm, ckpt_path='best')

    return model

if __name__ == "__main__":
    model = train_model()
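
Once training finishes, the best checkpoint written by ModelCheckpoint can be reloaded for inference. A short usage sketch (the checkpoint filename is hypothetical; in practice read it from checkpoint_callback.best_model_path):

# Reload the best weights; hyperparameters are restored via save_hyperparameters()
model = LitModel.load_from_checkpoint('checkpoints/mnist-05-0.04.ckpt')  # hypothetical path
model.eval()

# Single prediction on a dummy MNIST-shaped input (1 x 1 x 28 x 28)
with torch.no_grad():
    sample = torch.randn(1, 1, 28, 28)
    predicted_class = model(sample).argmax(dim=1)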

Advanced Model Training Techniques

1. Mixed Precision Training

trainer = pl.Trainer(
    precision='16-mixed',  # mixed precision ('precision=16' on Lightning 1.x; 'bf16-mixed' on hardware with bfloat16 support)
    # ... other trainer args
)

2. Distributed Training

trainer = pl.Trainer(
    strategy='ddp',  # Distributed Data Parallel
    accelerator='gpu',
    devices=4,  # Number of GPUs
    # ... other trainer args
)

3. Gradient Clipping

trainer = pl.Trainer(
    gradient_clip_val=0.5,           # clip the global gradient norm at 0.5
    gradient_clip_algorithm='norm',  # 'norm' (default) or 'value'
    # ... other trainer args
)
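
For reference, the same norm-based clipping in a plain PyTorch training loop, shown on a tiny stand-in model so the snippet runs on its own:

import torch
import torch.nn as nn

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

x, y = torch.randn(8, 10), torch.randn(8, 1)
loss = nn.functional.mse_loss(model(x), y)

loss.backward()
# Rescale gradients so their global L2 norm does not exceed 0.5
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
optimizer.step()
optimizer.zero_grad()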

Model Training Best Practices

  1. Reproducibility
     - Set random seeds
     - Version control everything
     - Log hyperparameters

  2. Monitoring
     - Track training metrics
     - Visualize learning curves
     - Monitor hardware utilization

  3. Regularization
     - Dropout
     - Weight decay
     - Early stopping
     - Data augmentation

  4. Hyperparameter Tuning
     - Grid/random search
     - Bayesian optimization
     - Population-based training (see the tuning sketch after this list)
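
As an example of the tuning workflow, here is a minimal search sketch using Optuna (an assumed choice of tuner, not something the article prescribes), reusing the LitModel and MNISTDataModule classes defined above and seeding each trial for reproducibility:

import optuna
import pytorch_lightning as pl

def objective(trial):
    # Seed each trial so runs are comparable and reproducible
    pl.seed_everything(42, workers=True)

    # Sample the learning rate on a log scale
    lr = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    model = LitModel(learning_rate=lr)
    dm = MNISTDataModule(batch_size=64)

    trainer = pl.Trainer(max_epochs=3, logger=False, enable_progress_bar=False)
    trainer.fit(model, dm)

    # Optimize the validation loss logged in validation_step
    return trainer.callback_metrics['val_loss'].item()

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)
print(study.best_params)

Optuna's default TPE sampler gives Bayesian-style search; swapping in optuna.samplers.RandomSampler recovers plain random search.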

Common Challenges and Solutions

Challenge           | Solution
Overfitting         | Increase dropout, add L2 regularization, use more data
Underfitting        | Increase model capacity, train longer, reduce regularization
Vanishing Gradients | Use proper weight initialization, batch normalization
Exploding Gradients | Gradient clipping, weight normalization
Slow Training       | Mixed precision, larger batch size, better optimizer
High Memory Usage   | Gradient checkpointing, smaller batch size, model parallelism
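
The last row mentions gradient checkpointing, which trades extra compute for memory by recomputing intermediate activations during the backward pass instead of storing them. A minimal sketch using torch.utils.checkpoint (the module below is an illustrative stand-in, not part of the article's model):

import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class CheckpointedNet(nn.Module):
    def __init__(self):
        super().__init__()
        # A deliberately "heavy" block whose activations we avoid storing
        self.heavy_block = nn.Sequential(
            nn.Linear(512, 512), nn.ReLU(),
            nn.Linear(512, 512), nn.ReLU(),
        )
        self.head = nn.Linear(512, 10)

    def forward(self, x):
        # Activations inside heavy_block are recomputed during backward instead of stored
        x = checkpoint(self.heavy_block, x, use_reentrant=False)
        return self.head(x)

model = CheckpointedNet()
out = model(torch.randn(4, 512, requires_grad=True))
out.sum().backward()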

Next Steps

  1. Implement model versioning
  2. Set up model evaluation pipelines
  3. Create model cards for documentation
  4. Plan for model monitoring in production