PyTorch Basics
- Tensors: Multi-dimensional arrays for numerical computation, with optional GPU acceleration.
- Autograd: Automatic differentiation for gradient computation during backpropagation (see the sketch after the snippet below).
- nn Module: High-level API for building complex neural network architectures.
- Device management: Easily move computations between CPU and GPU.
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Pick the GPU if one is available
x = torch.tensor([1, 2, 3], device=device)  # Tensor created directly on the chosen device
layer = torch.nn.Linear(3, 1)
layer = layer.to(device)  # Move the layer's parameters to the same device
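A minimal autograd sketch (the scalars w and b below are illustrative, not part of the snippet above): tensors created with requires_grad=True record operations, and backward() fills in their .grad fields.
w = torch.tensor(2.0, requires_grad=True)
b = torch.tensor(1.0, requires_grad=True)
loss = (w * 3.0 + b - 10.0) ** 2     # Simple squared-error expression built from w and b
loss.backward()                      # Autograd computes d(loss)/dw and d(loss)/db
print(w.grad, b.grad)                # Gradients are stored on the leaf tensors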
Training
- Loss functions: Measure model performance (e.g., MSELoss, CrossEntropyLoss).
- Backpropagation: Compute gradients for efficient parameter updates.
- Optimizers: Update model parameters to minimize loss (e.g., SGD, Adam).
- Learning rate schedulers: Adjust learning rates for better convergence.
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

for epoch in range(num_epochs):
    for inputs, targets in dataloader:
        optimizer.zero_grad()               # Reset gradients from the previous step
        outputs = model(inputs)             # Forward pass
        loss = criterion(outputs, targets)
        loss.backward()                     # Backpropagation: compute gradients
        optimizer.step()                    # Update parameters
    scheduler.step()                        # Adjust the learning rate once per epoch
Inference
- Model evaluation: Set model to eval mode to disable dropout and batch normalization updates.
- No gradient computation: Use torch.no_grad() for memory efficiency during inference.
- Model deployment: Export models with TorchScript or ONNX for production environments (see the ONNX sketch after the snippet below).
model.eval()                         # Disable dropout; BatchNorm uses its running statistics
with torch.no_grad():                # No gradient tracking: lower memory use, faster inference
    predictions = model(test_data)

# TorchScript export
scripted_model = torch.jit.script(model)
scripted_model.save("model.pt")
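A minimal ONNX export sketch; the (1, 3, 224, 224) dummy input shape is an assumption for illustration.
dummy_input = torch.randn(1, 3, 224, 224)      # Assumed example input shape; adjust to your model
torch.onnx.export(model, dummy_input, "model.onnx",
                  input_names=["input"], output_names=["output"])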
torch.nn Module Class Hierarchy
classDiagram
    Module <|-- Sequential
    Module <|-- ModuleList
    Module <|-- ModuleDict
    Module <|-- Linear
    Module <|-- Conv2d
    Module <|-- RNNBase
    RNNBase <|-- RNN
    RNNBase <|-- LSTM
    RNNBase <|-- GRU
    class Module{
        +forward()
        +parameters()
        +to(device)
        +add_module()
    }
    class Linear{
        +in_features
        +out_features
    }
    class Conv2d{
        +in_channels
        +out_channels
        +kernel_size
    }
    class RNNBase{
        +input_size
        +hidden_size
    }
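A short usage sketch for the container classes in the diagram (layer sizes are arbitrary):
import torch.nn as nn
seq = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 1))                  # Chains modules in order
blocks = nn.ModuleList([nn.Linear(32, 32) for _ in range(3)])                         # Registers submodules for custom control flow
heads = nn.ModuleDict({"classify": nn.Linear(32, 10), "regress": nn.Linear(32, 1)})   # Named submodules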
Early Stopping
- Monitor validation loss: Track performance on unseen data to prevent overfitting.
- Patience: Set a threshold for epochs without improvement before stopping.
- Best model checkpoint: Save the model with the lowest validation loss.
- Restore best model: Load the best checkpoint after training.
best_loss, patience, counter = float('inf'), 10, 0
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer)
    val_loss = validate(model, val_loader)
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')  # Checkpoint the best model so far
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break
model.load_state_dict(torch.load('best_model.pth'))  # Restore the best checkpoint
Hyperparameter Tuning
- Grid search: Exhaustive search over specified parameter values (see the plain grid-search sketch after the Ray Tune example below).
- Random search: Sample from parameter distributions for efficient exploration.
- Bayesian optimization: Efficient search guided by a probabilistic model (e.g., GPyOpt).
- Population-based training: Evolve hyperparameters during training.
from ray import tune
from ray.tune.schedulers import ASHAScheduler

def train_func(config):
    model = Net(config["hidden_size"])
    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
    for epoch in range(10):
        loss = train_epoch(model, optimizer)
        tune.report(loss=loss)              # Report the metric back to Tune

analysis = tune.run(
    train_func,
    config={
        "hidden_size": tune.choice([32, 64, 128]),
        "lr": tune.loguniform(1e-4, 1e-1)
    },
    metric="loss",
    mode="min",
    scheduler=ASHAScheduler(),
    num_samples=50
)
best_config = analysis.get_best_config(metric="loss", mode="min")
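A plain grid-search sketch without external libraries; train_and_evaluate is a hypothetical helper that returns a validation loss.
from itertools import product
grid = {"lr": [1e-3, 1e-2], "hidden_size": [64, 128]}
best_cfg, best_val = None, float("inf")
for lr, hidden in product(grid["lr"], grid["hidden_size"]):
    val_loss = train_and_evaluate(lr=lr, hidden_size=hidden)   # Hypothetical training helper
    if val_loss < best_val:
        best_cfg, best_val = {"lr": lr, "hidden_size": hidden}, val_loss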
DataLoaders
- Dataset: Define custom datasets for efficient data handling.
- DataLoader: Batch loading with multi-processing and prefetching.
- Transforms: Apply data augmentation and preprocessing on-the-fly.
- Samplers: Control the order of iteration over the dataset (see the sampler sketch after the snippet below).
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class CustomDataset(Dataset):
    def __init__(self, data, labels, transform=None):
        self.data, self.labels = data, labels
        self.transform = transform
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        sample = self.data[idx]
        if self.transform:
            sample = self.transform(sample)
        return sample, self.labels[idx]

transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])
dataset = CustomDataset(data, labels, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)
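A sampler sketch using WeightedRandomSampler to oversample a rare class; the per-example weights below are illustrative.
from torch.utils.data import WeightedRandomSampler
sample_weights = [0.9 if label == 1 else 0.1 for label in labels]   # Illustrative weights favoring class 1
sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
dataloader = DataLoader(dataset, batch_size=32, sampler=sampler)    # shuffle must stay False when a sampler is set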
Optimizers
- SGD: Stochastic Gradient Descent with momentum and weight decay options.
- Adam: Adaptive Moment Estimation for per-parameter learning rates.
- RMSprop: Root Mean Square Propagation, adapts learning rates based on moving average of squared gradients.
- AdamW: Adam variant with improved weight decay regularization.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-5)   # SGD with momentum and L2 penalty
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-8)    # Adaptive per-parameter learning rates
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.01, alpha=0.99)                    # Moving average of squared gradients
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)              # Decoupled weight decay
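Optimizers also accept per-parameter groups, e.g. a smaller learning rate for a backbone than for a head; model.backbone and model.head are hypothetical submodules.
optimizer = torch.optim.AdamW([
    {"params": model.backbone.parameters(), "lr": 1e-4},   # Hypothetical pretrained backbone: small lr
    {"params": model.head.parameters(), "lr": 1e-3},       # Hypothetical task head: larger lr
], weight_decay=0.01)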
Layer Types
- Linear (Dense): Fully connected layer for linear transformations.
- Conv2d: 2D convolution for image processing tasks.
- LSTM/GRU: Recurrent layers for sequential data processing.
- Transformer: Self-attention-based layers for NLP and beyond (see the encoder-layer sketch after the model below).
- BatchNorm: Normalize activations for stable training.
- Dropout: Regularization technique to prevent overfitting.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ComplexModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.bn = nn.BatchNorm2d(64)
        self.pool = nn.MaxPool2d(2)
        self.lstm = nn.LSTM(64, 128, batch_first=True)
        self.fc = nn.Linear(128, 10)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.pool(F.relu(self.bn(self.conv(x))))   # (N, 64, H/2, W/2)
        x = x.flatten(2).permute(0, 2, 1)              # (N, H*W/4, 64): sequence of 64-dim features for the LSTM
        x, _ = self.lstm(x)
        x = self.dropout(x[:, -1, :])                  # Keep the last LSTM output
        return self.fc(x)

model = ComplexModel()
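A short sketch of the Transformer layer mentioned above; d_model, nhead, and the tensor sizes are arbitrary examples.
encoder_layer = nn.TransformerEncoderLayer(d_model=128, nhead=8, batch_first=True)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
tokens = torch.randn(4, 16, 128)       # (batch, sequence, embedding) -- arbitrary sizes
encoded = encoder(tokens)              # Output has the same shape as the input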