Unverified Commit f83ea813 authored by Hongxin Liu's avatar Hongxin Liu Committed by GitHub
Browse files

[example] add train resnet/vit with booster example (#3694)

* [example] add train vit with booster example

* [example] update readme

* [example] add train resnet with booster example

* [example] enable ci

* [example] enable ci

* [example] add requirements

* [hotfix] fix analyzer init

* [example] update requirements
parent 2629f971
# Distributed Data Parallel
# Train ResNet on CIFAR-10 from scratch
## 🚀 Quick Start
This example provides a training script and an evaluation script. The training script provides an example of training ResNet on CIFAR10 dataset from scratch.
- Training Arguments
- `-r`, `--resume`: resume from checkpoint file path
- `-c`, `--checkpoint`: the folder to save checkpoints
- `-i`, `--interval`: epoch interval to save checkpoints
- `-f`, `--fp16`: use fp16
- `-p`, `--plugin`: Plugin to use. Choices: `torch_ddp`, `torch_ddp_fp16`, `low_level_zero`. Defaults to `torch_ddp`.
- `-r`, `--resume`: Resume from checkpoint file path. Defaults to `-1`, which means not resuming.
- `-c`, `--checkpoint`: The folder to save checkpoints. Defaults to `./checkpoint`.
- `-i`, `--interval`: Epoch interval to save checkpoints. Defaults to `5`. If set to `0`, no checkpoint will be saved.
- `--target_acc`: Target accuracy. Raise exception if not reached. Defaults to `None`.
- Eval Arguments
- `-e`, `--epoch`: select the epoch to evaluate
- `-c`, `--checkpoint`: the folder where checkpoints are found
### Install requirements
```bash
pip install -r requirements.txt
```
### Train
......@@ -22,7 +28,10 @@ This example provides a training script and an evaluation script. The training s
colossalai run --nproc_per_node 2 train.py -c ./ckpt-fp32
# train with torch DDP with mixed precision training
colossalai run --nproc_per_node 2 train.py -c ./ckpt-fp16 --fp16
colossalai run --nproc_per_node 2 train.py -c ./ckpt-fp16 -p torch_ddp_fp16
# train with low level zero
colossalai run --nproc_per_node 2 train.py -c ./ckpt-low_level_zero -p low_level_zero
```
### Eval
......@@ -33,12 +42,15 @@ python eval.py -c ./ckpt-fp32 -e 80
# evaluate fp16 mixed precision training
python eval.py -c ./ckpt-fp16 -e 80
# evaluate low level zero training
python eval.py -c ./ckpt-low_level_zero -e 80
```
Expected accuracy performance will be:
| Model | Single-GPU Baseline FP32 | Booster DDP with FP32 | Booster DDP with FP16 |
| --------- | ------------------------ | --------------------- | --------------------- |
| ResNet-18 | 85.85% | 85.03% | 85.12% |
| Model | Single-GPU Baseline FP32 | Booster DDP with FP32 | Booster DDP with FP16 | Booster Low Level Zero |
| --------- | ------------------------ | --------------------- | --------------------- | ---------------------- |
| ResNet-18 | 85.85% | 84.91% | 85.46% | 84.50% |
**Note: the baseline is adapted from the [script](https://pytorch-tutorial.readthedocs.io/en/latest/tutorial/chapter03_intermediate/3_2_2_cnn_resnet_cifar10/) to use `torchvision.models.resnet18`**
#!/bin/bash
set -xe
export DATA=/data/scratch/cifar-10
pip install -r requirements.txt
for plugin in "torch_ddp" "torch_ddp_fp16" "low_level_zero"; do
colossalai run --nproc_per_node 4 train.py --interval 0 --target_acc 0.84 --plugin $plugin
done
import argparse
import os
from pathlib import Path
import torch
import torch.distributed as dist
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.optim import Optimizer
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import DataLoader
from tqdm import tqdm
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
# ==============================
# Prepare Hyperparameters
# ==============================
NUM_EPOCHS = 80
LEARNING_RATE = 1e-3
def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
# trainsform
transform_train = transforms.Compose(
[transforms.Pad(4),
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32),
transforms.ToTensor()])
transform_test = transforms.ToTensor()
# CIFAR-10 dataset
data_path = os.environ.get('DATA', './data')
with coordinator.priority_execution():
train_dataset = torchvision.datasets.CIFAR10(root=data_path,
train=True,
transform=transform_train,
download=True)
test_dataset = torchvision.datasets.CIFAR10(root=data_path,
train=False,
transform=transform_test,
download=True)
# Data loader
train_dataloader = plugin.prepare_train_dataloader(train_dataset,
batch_size=batch_size,
shuffle=True,
drop_last=True)
test_dataloader = plugin.prepare_train_dataloader(test_dataset,
batch_size=batch_size,
shuffle=False,
drop_last=False)
return train_dataloader, test_dataloader
@torch.no_grad()
def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
model.eval()
correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
for images, labels in test_dataloader:
images = images.cuda()
labels = labels.cuda()
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
dist.all_reduce(correct)
dist.all_reduce(total)
accuracy = correct.item() / total.item()
if coordinator.is_master():
print(f'Accuracy of the model on the test images: {accuracy * 100:.2f} %')
return accuracy
def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, criterion: nn.Module, train_dataloader: DataLoader,
booster: Booster, coordinator: DistCoordinator):
model.train()
with tqdm(train_dataloader, desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]', disable=not coordinator.is_master()) as pbar:
for images, labels in pbar:
images = images.cuda()
labels = labels.cuda()
# Forward pass
outputs = model(images)
loss = criterion(outputs, labels)
# Backward and optimize
booster.backward(loss, optimizer)
optimizer.step()
optimizer.zero_grad()
# Print log info
pbar.set_postfix({'loss': loss.item()})
def main():
# ==============================
# Parse Arguments
# ==============================
parser = argparse.ArgumentParser()
# FIXME(ver217): gemini is not supported resnet now
parser.add_argument('-p',
'--plugin',
type=str,
default='torch_ddp',
choices=['torch_ddp', 'torch_ddp_fp16', 'low_level_zero'],
help="plugin to use")
parser.add_argument('-r', '--resume', type=int, default=-1, help="resume from the epoch's checkpoint")
parser.add_argument('-c', '--checkpoint', type=str, default='./checkpoint', help="checkpoint directory")
parser.add_argument('-i', '--interval', type=int, default=5, help="interval of saving checkpoint")
parser.add_argument('--target_acc',
type=float,
default=None,
help="target accuracy. Raise exception if not reached")
args = parser.parse_args()
# ==============================
# Prepare Checkpoint Directory
# ==============================
if args.interval > 0:
Path(args.checkpoint).mkdir(parents=True, exist_ok=True)
# ==============================
# Launch Distributed Environment
# ==============================
colossalai.launch_from_torch(config={})
coordinator = DistCoordinator()
# update the learning rate with linear scaling
# old_gpu_num / old_lr = new_gpu_num / new_lr
global LEARNING_RATE
LEARNING_RATE *= coordinator.world_size
# ==============================
# Instantiate Plugin and Booster
# ==============================
booster_kwargs = {}
if args.plugin == 'torch_ddp_fp16':
booster_kwargs['mixed_precision'] = 'fp16'
if args.plugin.startswith('torch_ddp'):
plugin = TorchDDPPlugin()
elif args.plugin == 'gemini':
plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5)
elif args.plugin == 'low_level_zero':
plugin = LowLevelZeroPlugin(initial_scale=2**5)
booster = Booster(plugin=plugin, **booster_kwargs)
# ==============================
# Prepare Dataloader
# ==============================
train_dataloader, test_dataloader = build_dataloader(100, coordinator, plugin)
# ====================================
# Prepare model, optimizer, criterion
# ====================================
# resent50
model = torchvision.models.resnet18(num_classes=10)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = HybridAdam(model.parameters(), lr=LEARNING_RATE)
# lr scheduler
lr_scheduler = MultiStepLR(optimizer, milestones=[20, 40, 60, 80], gamma=1 / 3)
# ==============================
# Boost with ColossalAI
# ==============================
model, optimizer, criterion, _, lr_scheduler = booster.boost(model,
optimizer,
criterion=criterion,
lr_scheduler=lr_scheduler)
# ==============================
# Resume from checkpoint
# ==============================
if args.resume >= 0:
booster.load_model(model, f'{args.checkpoint}/model_{args.resume}.pth')
booster.load_optimizer(optimizer, f'{args.checkpoint}/optimizer_{args.resume}.pth')
booster.load_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{args.resume}.pth')
# ==============================
# Train model
# ==============================
start_epoch = args.resume if args.resume >= 0 else 0
for epoch in range(start_epoch, NUM_EPOCHS):
train_epoch(epoch, model, optimizer, criterion, train_dataloader, booster, coordinator)
lr_scheduler.step()
# save checkpoint
if args.interval > 0 and (epoch + 1) % args.interval == 0:
booster.save_model(model, f'{args.checkpoint}/model_{epoch + 1}.pth')
booster.save_optimizer(optimizer, f'{args.checkpoint}/optimizer_{epoch + 1}.pth')
booster.save_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{epoch + 1}.pth')
accuracy = evaluate(model, test_dataloader, coordinator)
if args.target_acc is not None:
assert accuracy >= args.target_acc, f'Accuracy {accuracy} is lower than target accuracy {args.target_acc}'
if __name__ == '__main__':
main()
# Train ViT on CIFAR-10 from scratch
## 🚀 Quick Start
This example provides a training script, which provides an example of training ViT on CIFAR10 dataset from scratch.
- Training Arguments
- `-p`, `--plugin`: Plugin to use. Choices: `torch_ddp`, `torch_ddp_fp16`, `low_level_zero`. Defaults to `torch_ddp`.
- `-r`, `--resume`: Resume from checkpoint file path. Defaults to `-1`, which means not resuming.
- `-c`, `--checkpoint`: The folder to save checkpoints. Defaults to `./checkpoint`.
- `-i`, `--interval`: Epoch interval to save checkpoints. Defaults to `5`. If set to `0`, no checkpoint will be saved.
- `--target_acc`: Target accuracy. Raise exception if not reached. Defaults to `None`.
### Install requirements
```bash
pip install -r requirements.txt
```
### Train
```bash
# train with torch DDP with fp32
colossalai run --nproc_per_node 4 train.py -c ./ckpt-fp32
# train with torch DDP with mixed precision training
colossalai run --nproc_per_node 4 train.py -c ./ckpt-fp16 -p torch_ddp_fp16
# train with low level zero
colossalai run --nproc_per_node 4 train.py -c ./ckpt-low_level_zero -p low_level_zero
```
Expected accuracy performance will be:
| Model | Single-GPU Baseline FP32 | Booster DDP with FP32 | Booster DDP with FP16 | Booster Low Level Zero |
| --------- | ------------------------ | --------------------- | --------------------- | ---------------------- |
| ViT | 83.00% | 84.03% | 84.00% | 84.43% |
colossalai
timm
torch
torchvision
tqdm
#!/bin/bash
set -xe
export DATA=/data/scratch/cifar-10
pip install -r requirements.txt
for plugin in "torch_ddp" "torch_ddp_fp16" "low_level_zero"; do
colossalai run --nproc_per_node 4 train.py --interval 0 --target_acc 0.83 --plugin $plugin
done
import argparse
import os
from pathlib import Path
import torch
import torch.distributed as dist
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from timm.models.vision_transformer import _cfg, _create_vision_transformer
from torch.optim import Optimizer
from torch.utils.data import DataLoader
from tqdm import tqdm
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
from colossalai.cluster import DistCoordinator
from colossalai.nn.lr_scheduler import LinearWarmupLR
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device
# ==============================
# Prepare Hyperparameters
# ==============================
NUM_EPOCHS = 60
WARMUP_EPOCSH = 5
LEARNING_RATE = 1e-3
def vit_cifar(**kwargs):
pretrained_cfg = _cfg(num_classes=10, input_size=(3, 32, 32), crop_pct=1.0)
model_kwargs = dict(patch_size=4, embed_dim=512, depth=6, num_heads=8, drop_rate=0.1, mlp_ratio=1.0, **kwargs)
model = _create_vision_transformer('vit_cifar', pretrained_cfg=pretrained_cfg, **model_kwargs)
return model
def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
# trainsform
transform_train = transforms.Compose([
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize((0.49139968, 0.48215827, 0.44653124), (0.24703233, 0.24348505, 0.26158768)),
])
transform_test = transforms.Compose([
transforms.Resize(32),
transforms.ToTensor(),
transforms.Normalize((0.49139968, 0.48215827, 0.44653124), (0.24703233, 0.24348505, 0.26158768)),
])
# CIFAR-10 dataset
data_path = os.environ.get('DATA', './data')
with coordinator.priority_execution():
train_dataset = torchvision.datasets.CIFAR10(root=data_path,
train=True,
transform=transform_train,
download=True)
test_dataset = torchvision.datasets.CIFAR10(root=data_path,
train=False,
transform=transform_test,
download=True)
# Data loader
train_dataloader = plugin.prepare_train_dataloader(train_dataset,
batch_size=batch_size,
shuffle=True,
drop_last=True)
test_dataloader = plugin.prepare_train_dataloader(test_dataset,
batch_size=batch_size,
shuffle=False,
drop_last=False)
return train_dataloader, test_dataloader
@torch.no_grad()
def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
model.eval()
correct = torch.zeros(1, dtype=torch.int64, device=get_current_device())
total = torch.zeros(1, dtype=torch.int64, device=get_current_device())
for images, labels in test_dataloader:
images = images.cuda()
labels = labels.cuda()
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
dist.all_reduce(correct)
dist.all_reduce(total)
accuracy = correct.item() / total.item()
if coordinator.is_master():
print(f'Accuracy of the model on the test images: {accuracy * 100:.2f} %')
return accuracy
def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, criterion: nn.Module, train_dataloader: DataLoader,
booster: Booster, coordinator: DistCoordinator):
model.train()
with tqdm(train_dataloader, desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]', disable=not coordinator.is_master()) as pbar:
for images, labels in pbar:
images = images.cuda()
labels = labels.cuda()
# Forward pass
outputs = model(images)
loss = criterion(outputs, labels)
# Backward and optimize
booster.backward(loss, optimizer)
optimizer.step()
optimizer.zero_grad()
# Print log info
pbar.set_postfix({'loss': loss.item()})
def main():
# ==============================
# Parse Arguments
# ==============================
parser = argparse.ArgumentParser()
# FIXME(ver217): gemini is not supported resnet now
parser.add_argument('-p',
'--plugin',
type=str,
default='torch_ddp',
choices=['torch_ddp', 'torch_ddp_fp16', 'low_level_zero'],
help="plugin to use")
parser.add_argument('-r', '--resume', type=int, default=-1, help="resume from the epoch's checkpoint")
parser.add_argument('-c', '--checkpoint', type=str, default='./checkpoint', help="checkpoint directory")
parser.add_argument('-i', '--interval', type=int, default=5, help="interval of saving checkpoint")
parser.add_argument('--target_acc',
type=float,
default=None,
help="target accuracy. Raise exception if not reached")
args = parser.parse_args()
# ==============================
# Prepare Checkpoint Directory
# ==============================
if args.interval > 0:
Path(args.checkpoint).mkdir(parents=True, exist_ok=True)
# ==============================
# Launch Distributed Environment
# ==============================
colossalai.launch_from_torch(config={})
coordinator = DistCoordinator()
# update the learning rate with linear scaling
# old_gpu_num / old_lr = new_gpu_num / new_lr
global LEARNING_RATE
LEARNING_RATE *= coordinator.world_size
# ==============================
# Instantiate Plugin and Booster
# ==============================
booster_kwargs = {}
if args.plugin == 'torch_ddp_fp16':
booster_kwargs['mixed_precision'] = 'fp16'
if args.plugin.startswith('torch_ddp'):
plugin = TorchDDPPlugin()
elif args.plugin == 'gemini':
plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, initial_scale=2**5)
elif args.plugin == 'low_level_zero':
plugin = LowLevelZeroPlugin(initial_scale=2**5)
booster = Booster(plugin=plugin, **booster_kwargs)
# ==============================
# Prepare Dataloader
# ==============================
train_dataloader, test_dataloader = build_dataloader(512, coordinator, plugin)
# ====================================
# Prepare model, optimizer, criterion
# ====================================
# resent50
model = torchvision.models.resnet18(num_classes=10)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = HybridAdam(model.parameters(), lr=LEARNING_RATE)
# lr scheduler
lr_scheduler = LinearWarmupLR(optimizer, NUM_EPOCHS, WARMUP_EPOCSH)
# ==============================
# Boost with ColossalAI
# ==============================
model, optimizer, criterion, train_dataloader, lr_scheduler = booster.boost(model,
optimizer,
criterion=criterion,
dataloader=train_dataloader,
lr_scheduler=lr_scheduler)
# ==============================
# Resume from checkpoint
# ==============================
if args.resume >= 0:
booster.load_model(model, f'{args.checkpoint}/model_{args.resume}.pth')
booster.load_optimizer(optimizer, f'{args.checkpoint}/optimizer_{args.resume}.pth')
booster.load_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{args.resume}.pth')
# ==============================
# Train model
# ==============================
start_epoch = args.resume if args.resume >= 0 else 0
for epoch in range(start_epoch, NUM_EPOCHS):
train_epoch(epoch, model, optimizer, criterion, train_dataloader, booster, coordinator)
lr_scheduler.step()
# save checkpoint
if args.interval > 0 and (epoch + 1) % args.interval == 0:
booster.save_model(model, f'{args.checkpoint}/model_{epoch + 1}.pth')
booster.save_optimizer(optimizer, f'{args.checkpoint}/optimizer_{epoch + 1}.pth')
booster.save_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{epoch + 1}.pth')
accuracy = evaluate(model, test_dataloader, coordinator)
if args.target_acc is not None:
assert accuracy >= args.target_acc, f'Accuracy {accuracy} is lower than target accuracy {args.target_acc}'
if __name__ == '__main__':
main()
......@@ -10,6 +10,12 @@ This example provides a training script, which provides an example of finetuning
- `--target_f1`: Target f1 score. Raise exception if not reached. Defaults to `None`.
### Install requirements
```bash
pip install -r requirements.txt
```
### Train
```bash
......
colossalai
datasets
torch
tqdm
transformers
scipy
scikit-learn
#!/bin/bash
set -xe
pip install -r requirements.txt
for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero"; do
torchrun --standalone --nproc_per_node 4 finetune.py --target_f1 0.86 --plugin $plugin
done
#!/usr/bin/env
echo "The CI integration will be completed when the API is stable"
#!/bin/bash
set -xe
# FIXME(ver217): only run bert finetune to save time
cd glue_bert && bash ./test_ci.sh && cd ..
import argparse
from pathlib import Path
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.optim.lr_scheduler import MultiStepLR
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin
from colossalai.cluster import DistCoordinator
# ==============================
# Parse Arguments
# ==============================
parser = argparse.ArgumentParser()
parser.add_argument('-r', '--resume', type=int, default=-1, help="resume from the epoch's checkpoint")
parser.add_argument('-c', '--checkpoint', type=str, default='./checkpoint', help="checkpoint directory")
parser.add_argument('-i', '--interval', type=int, default=5, help="interval of saving checkpoint")
parser.add_argument('-f', '--fp16', action='store_true', help="use fp16")
args = parser.parse_args()
# ==============================
# Prepare Checkpoint Directory
# ==============================
Path(args.checkpoint).mkdir(parents=True, exist_ok=True)
# ==============================
# Prepare Hyperparameters
# ==============================
NUM_EPOCHS = 80
LEARNING_RATE = 1e-3
START_EPOCH = args.resume if args.resume >= 0 else 0
# ==============================
# Launch Distributed Environment
# ==============================
colossalai.launch_from_torch(config={})
coordinator = DistCoordinator()
# update the learning rate with linear scaling
# old_gpu_num / old_lr = new_gpu_num / new_lr
LEARNING_RATE *= coordinator.world_size
# ==============================
# Prepare Booster
# ==============================
plugin = TorchDDPPlugin()
if args.fp16:
booster = Booster(mixed_precision='fp16', plugin=plugin)
else:
booster = Booster(plugin=plugin)
# ==============================
# Prepare Train Dataset
# ==============================
transform = transforms.Compose(
[transforms.Pad(4),
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32),
transforms.ToTensor()])
# CIFAR-10 dataset
with coordinator.priority_execution():
train_dataset = torchvision.datasets.CIFAR10(root='./data/', train=True, transform=transform, download=True)
# ====================================
# Prepare model, optimizer, criterion
# ====================================
# resent50
model = torchvision.models.resnet18(num_classes=10).cuda()
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# lr scheduler
lr_scheduler = MultiStepLR(optimizer, milestones=[20, 40, 60, 80], gamma=1 / 3)
# prepare dataloader with torch ddp plugin
train_dataloader = plugin.prepare_train_dataloader(train_dataset, batch_size=100, shuffle=True)
# ==============================
# Resume from checkpoint
# ==============================
if args.resume >= 0:
booster.load_model(model, f'{args.checkpoint}/model_{args.resume}.pth')
booster.load_optimizer(optimizer, f'{args.checkpoint}/optimizer_{args.resume}.pth')
booster.load_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{args.resume}.pth')
# ==============================
# Boost with ColossalAI
# ==============================
model, optimizer, criterion, train_dataloader, lr_scheduler = booster.boost(model, optimizer, criterion,
train_dataloader, lr_scheduler)
# ==============================
# Train model
# ==============================
total_step = len(train_dataloader)
for epoch in range(START_EPOCH, NUM_EPOCHS):
for i, (images, labels) in enumerate(train_dataloader):
images = images.cuda()
labels = labels.cuda()
# Forward pass
outputs = model(images)
loss = criterion(outputs, labels)
# Backward and optimize
optimizer.zero_grad()
booster.backward(loss, optimizer)
optimizer.step()
if (i + 1) % 100 == 0:
print("Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}".format(epoch + 1, NUM_EPOCHS, i + 1, total_step,
loss.item()))
lr_scheduler.step()
# save checkpoint every 5 epoch
if (epoch + 1) % args.interval == 0:
booster.save_model(model, f'{args.checkpoint}/model_{epoch + 1}.pth')
booster.save_optimizer(optimizer, f'{args.checkpoint}/optimizer_{epoch + 1}.pth')
booster.save_lr_scheduler(lr_scheduler, f'{args.checkpoint}/lr_scheduler_{epoch + 1}.pth')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment