Commit 78e8e038 authored by Sugon_ldc

add new model
#!/usr/bin/env bash
export HIP_VISIBLE_DEVICES=0,1,2,3
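# torch.distributed.run launches one worker process per device and sets the
# WORLD_SIZE and LOCAL_RANK environment variables that train.py reads at startup.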
python -m torch.distributed.run --nproc_per_node 4 train.py --batch-size=128 --mode=small --print-freq=1 --dataset=CIFAR10 --ema-decay=0 --label-smoothing=0 --lr=0.2 --save-epoch-freq=10 --lr-decay=cos --lr-min=0 --warmup-epochs=5 --weight-decay=6e-5 --num-epochs=400 --num-workers=2 --width-multiplier=1 --data-dir /data/ --save-path /data/mobilenetv3out/ $1 $2
# -*- coding: UTF-8 -*-
'''
statistical information and display
Ref: https://github.com/pytorch/examples/blob/master/imagenet/main.py
'''
import torch
def accuracy(output, target, topk=(1,)):
    '''
    Computes the accuracy over the k top predictions for the specified values of k
    '''
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
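# Minimal usage sketch (illustrative only, values are made up):
#   logits = torch.randn(8, 10)               # batch of 8 samples, 10 classes
#   labels = torch.randint(0, 10, (8,))
#   top1, top5 = accuracy(logits, labels, topk=(1, 5))
#   # each element is a 1-element tensor holding the percentage for that k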
class AverageMeter(object):
    '''
    Computes and stores the average and current value
    '''
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        # fmtstr = '{name}:{val' + self.fmt + '}(avg:{avg' + self.fmt + '})'
        fmtstr = 'Avg {name}:{avg' + self.fmt + '}'
        return fmtstr.format(**self.__dict__)
class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print(' '.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
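# Minimal usage sketch (illustrative only):
#   losses = AverageMeter('Loss', ':.4e')
#   progress = ProgressMeter(100, [losses], prefix='Epoch: [0]')
#   losses.update(0.5, n=32)                  # running average over 32 samples
#   progress.display(10)                      # prints 'Epoch: [0][ 10/100] Avg Loss:...'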
# -*- coding: UTF-8 -*-
'''
Train the model
Ref: https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html
'''
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import datetime
import os
from mobileNetV3 import MobileNetV3
import argparse
import copy
from statistics import *  # the local statistics.py above (accuracy, meters), not the stdlib module
from EMA import EMA
from LabelSmoothing import LabelSmoothingLoss
# from DataLoader import dataloaders
from ResultWriter import ResultWriter
from CosineLR import *
from Mixup import mixup_data, mixup_criterion
from collections import OrderedDict
import torch.distributed as dist
def train(args, model, dataloader, loader_len, criterion, optimizer, scheduler, use_gpu, epoch, ema=None, save_file_name='train.csv'):
    '''
    train the model for one epoch
    '''
    # save result every epoch
    resultWriter = ResultWriter(args.save_path, save_file_name)
    if epoch == 0:
        resultWriter.create_csv(['epoch', 'loss', 'top-1', 'top-5', 'lr'])
    # use gpu or not
    device = torch.device('cuda' if use_gpu else 'cpu')
    # statistical information
    batch_time = AverageMeter('Time', ':6.3f')
    sample_time = AverageMeter('samples/sec', ':6.2f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    # note: local_rank is a module-level variable set in the __main__ block below
    progress = ProgressMeter(
        loader_len,
        [batch_time, data_time, sample_time, losses, top1, top5],
        prefix="{} Epoch: [{}] rank {} ".format(datetime.datetime.fromtimestamp(time.time()), epoch, local_rank))
    # update lr here if using StepLR (stepped once per epoch)
    if args.lr_decay == 'step':
        scheduler.step(epoch)
    # set model to training mode
    model.train()
    end = time.time()
    # iterate over data
    for i, (inputs, labels) in enumerate(dataloader):
        # measure data loading time
        data_time.update(time.time() - end)
        inputs = inputs.to(device)
        labels = labels.to(device)
        if args.mixup:
            # using mixup
            inputs, labels_a, labels_b, lam = mixup_data(inputs, labels, args.mixup_alpha)
            outputs = model(inputs)
            loss = mixup_criterion(criterion, outputs, labels_a, labels_b, lam)
            acc1_a, acc5_a = accuracy(outputs, labels_a, topk=(1, 5))
            acc1_b, acc5_b = accuracy(outputs, labels_b, topk=(1, 5))
            # measure accuracy as the same convex combination used for the loss
            acc1 = lam * acc1_a + (1 - lam) * acc1_b
            acc5 = lam * acc5_a + (1 - lam) * acc5_b
        else:
            # normal forward
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # measure accuracy and record loss
            acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))
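        # (illustrative note on the mixup branch above, in the usual formulation
        # that Mixup.py is expected to follow: mixup_data returns
        #   x = lam * x_i + (1 - lam) * x_j   with lam ~ Beta(alpha, alpha),
        # and mixup_criterion computes
        #   lam * criterion(out, y_a) + (1 - lam) * criterion(out, y_b).)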
        # zero the parameter gradients
        optimizer.zero_grad()
        losses.update(loss.item(), inputs.size(0))
        top1.update(acc1[0], inputs.size(0))
        top5.update(acc5[0], inputs.size(0))
        # backward + optimize
        loss.backward()
        if args.lr_decay == 'cos':
            # cosine lr decay is stepped once per iteration
            scheduler.step(epoch * loader_len + i)
        elif args.lr_decay == 'sgdr':
            # sgdr (warm restarts) is also stepped per iteration
            scheduler.step(epoch + i / loader_len)
        optimizer.step()
        if args.ema_decay > 0:
            # EMA update after each optimizer step (every iteration)
            ema.update()
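            # (illustrative note: EMA.py is not shown in this commit, but the usual
            # exponential-moving-average update it implements is
            #   shadow[name] = decay * shadow[name] + (1 - decay) * param[name])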
        batch_time.update(time.time() - end)
        # per-process throughput; args.batch_size is the per-GPU batch size
        sample_time.update(args.batch_size / batch_time.avg)
        end = time.time()
        if i % args.print_freq == 0:
            progress.display(i)
    # write training result to file
    resultWriter.write_csv([epoch, losses.avg, top1.avg.item(), top5.avg.item(), scheduler.optimizer.param_groups[0]['lr']])
    print()
    # get_lr() is buggy in pytorch 1.1.0 (see https://github.com/pytorch/pytorch/issues/22107),
    # so read the lr from the optimizer's param groups instead
    # print('lr:%.6f' % scheduler.get_lr()[0])
    print('lr:%.6f' % scheduler.optimizer.param_groups[0]['lr'])
    print('{} Train *** rank:{} Loss:{losses.avg:.2e} Acc@1:{top1.avg:.2f} Acc@5:{top5.avg:.2f}'.format(datetime.datetime.fromtimestamp(time.time()), local_rank, losses=losses, top1=top1, top5=top5))
    if epoch % args.save_epoch_freq == 0 and epoch != 0 and local_rank == 0:
        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)
        torch.save(model.state_dict(), os.path.join(args.save_path, "epoch_" + str(epoch) + ".pth"))
    return sample_time.avg
def validate(args, model, dataloader, loader_len, criterion, use_gpu, epoch, ema=None, save_file_name='val.csv'):
    '''
    validate the model
    '''
    # save result every epoch
    resultWriter = ResultWriter(args.save_path, save_file_name)
    if epoch == 0:
        resultWriter.create_csv(['epoch', 'loss', 'top-1', 'top-5'])
    device = torch.device('cuda' if use_gpu else 'cpu')
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        loader_len,
        [batch_time, data_time, losses, top1, top5],
        prefix="{} Epoch: [{}] rank {} ".format(datetime.datetime.fromtimestamp(time.time()), epoch, local_rank))
    if args.ema_decay > 0:
        # apply EMA shadow weights at the validation stage
        ema.apply_shadow()
    # set model to evaluate mode
    model.eval()
    end = time.time()
    # iterate over data
    for i, (inputs, labels) in enumerate(dataloader):
        # measure data loading time
        data_time.update(time.time() - end)
        inputs = inputs.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        # measure accuracy and record loss
        acc1, acc5 = accuracy(outputs, labels, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(acc1[0], inputs.size(0))
        top5.update(acc5[0], inputs.size(0))
        batch_time.update(time.time() - end)
        end = time.time()
    if args.ema_decay > 0:
        # restore the original parameters after validation
        ema.restore()
    # write val result to file
    resultWriter.write_csv([epoch, losses.avg, top1.avg.item(), top5.avg.item()])
    print('{} Val *** rank:{} Loss:{losses.avg:.2e} Acc@1:{top1.avg:.2f} Acc@5:{top5.avg:.2f}'.format(datetime.datetime.fromtimestamp(time.time()), local_rank, losses=losses, top1=top1, top5=top5))
    if epoch % args.save_epoch_freq == 0 and epoch != 0 and local_rank == 0:
        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)
        torch.save(model.state_dict(), os.path.join(args.save_path, "epoch_" + str(epoch) + ".pth"))
    top1_acc = top1.avg.item()
    top5_acc = top5.avg.item()
    return top1_acc, top5_acc
def train_model(args, model, dataloader, loaders_len, criterion, optimizer, scheduler, use_gpu):
    '''
    train and validate over all epochs, keeping the best weights
    '''
    since = time.time()
    ema = None
    # exponential moving average
    if args.ema_decay > 0:
        ema = EMA(model, decay=args.ema_decay)
        ema.register()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    correspond_top5 = 0.0
    sample_time = 0.0
    for epoch in range(args.start_epoch, args.num_epochs):
        epoch_time = time.time()
        sample_time = train(args, model, dataloader['train'], loaders_len['train'], criterion, optimizer, scheduler, use_gpu, epoch, ema)
        top1_acc, top5_acc = validate(args, model, dataloader['val'], loaders_len['val'], criterion, use_gpu, epoch, ema)
        epoch_time = time.time() - epoch_time
        print('Time of epoch-[{:d}/{:d}] : {:.0f}h {:.0f}m {:.0f}s\n'.format(epoch, args.num_epochs, epoch_time // 3600, (epoch_time % 3600) // 60, epoch_time % 60))
        # deep copy the model if it has higher top-1 accuracy
        if top1_acc > best_acc:
            best_acc = top1_acc
            correspond_top5 = top5_acc
            if args.ema_decay > 0:
                ema.apply_shadow()
            best_model_wts = copy.deepcopy(model.state_dict())
            if args.ema_decay > 0:
                ema.restore()
    print(os.path.split(args.save_path)[-1])
    print('{} Best val top-1 Accuracy: {:4f}'.format(datetime.datetime.fromtimestamp(time.time()), best_acc))
    print('{} Corresponding top-5 Accuracy: {:4f}'.format(datetime.datetime.fromtimestamp(time.time()), correspond_top5))
    time_elapsed = time.time() - since
    print('{} Training complete in {:.0f}h {:.0f}m {:.0f}s, samples/sec {:.2f}'.format(datetime.datetime.fromtimestamp(time.time()), time_elapsed // 3600, (time_elapsed % 3600) // 60, time_elapsed % 60, sample_time))
    # load best model weights
    model.load_state_dict(best_model_wts)
    # save best model weights
    if args.save and local_rank == 0:
        torch.save(model.state_dict(), os.path.join(args.save_path, 'best_model_wts-' + '{:.2f}'.format(best_acc) + '.pth'))
    return model
def write_pid_file(pid_file_path):
    '''Write a pid file so the process can be watched later.
    Each run writes the current pid to the same path.
    '''
    if os.path.exists(pid_file_path):
        os.remove(pid_file_path)
    with open(pid_file_path, "w") as file_d:
        file_d.write("%s\n" % os.getpid())
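# A watcher can later read the pid back, e.g. (sketch, path taken from the
# launch command at the bottom of this file):
#   with open('/data/pid.txt') as f:
#       pid = int(f.read().strip())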
if __name__ == '__main__':
    import warnings
    warnings.filterwarnings('ignore')
    parser = argparse.ArgumentParser(description='PyTorch implementation of MobileNetV3')
    # root directory of images
    parser.add_argument('--data-dir', type=str, default='/media/data2/chenjiarong/ImageData')
    parser.add_argument('--batch-size', type=int, default=256)
    parser.add_argument('--num-epochs', type=int, default=150)
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--num-workers', type=int, default=4)
    # parser.add_argument('--gpus', type=str, default='0')
    parser.add_argument('--print-freq', type=int, default=1000)
    parser.add_argument('--save-epoch-freq', type=int, default=1)
    parser.add_argument('--save-path', type=str, default='/media/data2/chenjiarong/saved-model/MobileNetV3')
    parser.add_argument('-save', default=False, action='store_true', help='save model or not')
    parser.add_argument('--resume', type=str, default='', help='for training from a checkpoint')
    parser.add_argument('--start-epoch', type=int, default=0, help='corresponding to the epoch of resume')
    parser.add_argument('--ema-decay', type=float, default=0.9999, help='decay of the exponential moving average')
    parser.add_argument('--dataset', type=str, default='ImageNet', help='the dataset to train on')
    parser.add_argument('-dali', default=False, action='store_true', help='use DALI or not')
    parser.add_argument('--mode', type=str, default='large', help='large or small MobileNetV3')
    # parser.add_argument('--num-class', type=int, default=1000)
    parser.add_argument('--width-multiplier', type=float, default=1.0, help='width multiplier')
    parser.add_argument('--dropout', type=float, default=0.2, help='dropout rate')
    parser.add_argument('--label-smoothing', type=float, default=0.1, help='label smoothing')
    parser.add_argument('--lr-decay', type=str, default='step', help='learning rate decay method: step, cos or sgdr')
    parser.add_argument('--step-size', type=int, default=3, help='step size in StepLR()')
    parser.add_argument('--gamma', type=float, default=0.99, help='gamma in StepLR()')
    parser.add_argument('--lr-min', type=float, default=0, help='minimum lr used in CosineWarmupLR')
    parser.add_argument('--warmup-epochs', type=int, default=0, help='warmup epochs used in CosineWarmupLR')
    parser.add_argument('--T-0', type=int, default=10, help='T_0 in CosineAnnealingWarmRestarts')
    parser.add_argument('--T-mult', type=int, default=2, help='T_mult in CosineAnnealingWarmRestarts')
    parser.add_argument('--decay-rate', type=float, default=1, help='decay rate in CosineAnnealingWarmRestarts')
    parser.add_argument('--optimizer', type=str, default='sgd', help='optimizer')
    parser.add_argument('--weight-decay', type=float, default=1e-5, help='weight decay')
    parser.add_argument('--bn-momentum', type=float, default=0.1, help='momentum in BatchNorm2d')
    parser.add_argument('-use-seed', default=False, action='store_true', help='use a fixed random seed or not')
    parser.add_argument('--seed', type=int, default=1, help='random seed')
    parser.add_argument('-deterministic', default=False, action='store_true', help='torch.backends.cudnn.deterministic')
    parser.add_argument('-nbd', default=False, action='store_true', help='no bias decay')
    parser.add_argument('-zero-gamma', default=False, action='store_true', help='zero gamma in BatchNorm2d at init')
    parser.add_argument('-mixup', default=False, action='store_true', help='mixup or not')
    parser.add_argument('--mixup-alpha', type=float, default=0.2, help='alpha used in mixup')
    parser.add_argument('--log_dir', type=str, default='/data/flagperf/training/result/', help='log directory in container')
    args = parser.parse_args()
    # note: --log_dir is used here as the pid *file* path (see the launch command at the bottom)
    write_pid_file(args.log_dir)
    args.lr_decay = args.lr_decay.lower()
    args.dataset = args.dataset.lower()
    args.optimizer = args.optimizer.lower()
    # folder to save what we need, named like:
    # MobileNetV3-mode-dataset-width_multiplier-dropout-lr-batch_size-ema_decay-label_smoothing-...
    folder_name = ['MobileNetV3', args.mode, args.dataset, 'wm' + str(args.width_multiplier),
                   'dp' + str(args.dropout), 'lr' + str(args.lr), 'bs' + str(args.batch_size),
                   'ed' + str(args.ema_decay), 'ls' + str(args.label_smoothing),
                   args.optimizer + str(args.weight_decay), 'bn' + str(args.bn_momentum),
                   'epochs' + str(args.num_epochs),
                   'seed' + (str(args.seed) if args.use_seed else 'None'),
                   'determin' + str(args.deterministic), 'NoBiasDecay' + str(args.nbd),
                   'zeroGamma' + str(args.zero_gamma),
                   'mixup' + (str(args.mixup_alpha) if args.mixup else 'False')]
    if args.lr_decay == 'step':
        folder_name.append(args.lr_decay + str(args.step_size) + '&' + str(args.gamma))
    elif args.lr_decay == 'cos':
        folder_name.append(args.lr_decay + str(args.warmup_epochs) + '&' + str(args.lr_min))
    elif args.lr_decay == 'sgdr':
        folder_name.append(args.lr_decay + str(args.T_0) + '&' + str(args.T_mult) + '&' + str(args.warmup_epochs) + '&' + str(args.decay_rate))
    folder_name = '-'.join(folder_name)
    args.save_path = os.path.join(args.save_path, folder_name)
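    # e.g. with the flags from the launch script at the top this resolves to roughly
    # <save-path>/MobileNetV3-small-cifar10-wm1.0-dp0.2-lr0.2-bs128-ed0.0-ls0.0-sgd6e-05-bn0.1-epochs400-seedNone-determinFalse-NoBiasDecayFalse-zeroGammaFalse-mixupFalse-cos5&0.0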
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    # torch.distributed.run sets these environment variables for every worker
    world_size = int(os.environ["WORLD_SIZE"])
    local_rank = int(os.environ['LOCAL_RANK'])
    # on ROCm/DCU builds of PyTorch the 'nccl' backend name is served by RCCL
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(local_rank)
    # use gpu or not
    use_gpu = torch.cuda.is_available()
    print("use_gpu:{}".format(use_gpu))
    # set random seed
    if args.use_seed:
        print('Using fixed random seed')
        torch.manual_seed(args.seed)
    else:
        print('Not using a fixed random seed')
    if use_gpu:
        if args.use_seed:
            torch.cuda.manual_seed(args.seed)
            if torch.cuda.device_count() > 1:
                torch.cuda.manual_seed_all(args.seed)
        if args.deterministic:
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
        else:
            torch.backends.cudnn.deterministic = False
            torch.backends.cudnn.benchmark = True
        print('torch.backends.cudnn.deterministic:' + str(args.deterministic))
    # read data
    # dataloaders = dataloaders(args)
    if args.dali and (args.dataset == 'tinyimagenet' or args.dataset == 'imagenet'):
        if args.dataset == 'imagenet':
            from DALIDataLoader import get_dali_imageNet_train_loader, get_dali_imageNet_val_loader
            train_loader, train_loader_len = get_dali_imageNet_train_loader(data_path=args.data_dir, batch_size=args.batch_size, seed=args.seed, num_threads=args.num_workers)
            val_loader, val_loader_len = get_dali_imageNet_val_loader(data_path=args.data_dir, batch_size=args.batch_size, seed=args.seed, num_threads=args.num_workers)
        elif args.dataset == 'tinyimagenet':
            from DALIDataLoader import get_dali_tinyImageNet_train_loader, get_dali_tinyImageNet_val_loader
            train_loader, train_loader_len = get_dali_tinyImageNet_train_loader(data_path=args.data_dir, batch_size=args.batch_size, seed=args.seed, num_threads=args.num_workers)
            val_loader, val_loader_len = get_dali_tinyImageNet_val_loader(data_path=args.data_dir, batch_size=args.batch_size, seed=args.seed, num_threads=args.num_workers)
        dataloaders = {'train': train_loader, 'val': val_loader}
        loaders_len = {'train': train_loader_len, 'val': val_loader_len}
    else:
        from DataLoader import dataloaders
        loaders = dataloaders(args)
        train_loader = loaders['train']
        train_loader_len = len(train_loader)
        val_loader = loaders['val']
        val_loader_len = len(val_loader)
        dataloaders = {'train': train_loader, 'val': val_loader}
        loaders_len = {'train': train_loader_len, 'val': val_loader_len}
    # different input size and number of classes for different datasets
    if args.dataset == 'imagenet':
        input_size = 224
        num_class = 1000
    elif args.dataset == 'tinyimagenet':
        input_size = 56
        num_class = 200
    elif args.dataset == 'cifar100':
        input_size = 32
        num_class = 100
    elif args.dataset == 'cifar10' or args.dataset == 'svhn':
        input_size = 32
        num_class = 10
    else:
        raise ValueError('unsupported dataset: ' + args.dataset)
    # get model
    model = MobileNetV3(mode=args.mode, classes_num=num_class, input_size=input_size,
                        width_multiplier=args.width_multiplier, dropout=args.dropout,
                        BN_momentum=args.bn_momentum, zero_gamma=args.zero_gamma)
    if use_gpu:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        model.to(torch.device('cpu'))
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            state_dict = torch.load(args.resume)
            new_state_dict = OrderedDict()
            for k, v in state_dict.items():
                # skip the classifier so it can be re-initialized for the new class count
                if 'classifier' in k:
                    continue
                # DDP wraps the model, so checkpoint keys need the 'module.' prefix
                name = "module." + k
                if 'featureList.1.conv2.1' in k:
                    name = name.replace('featureList.1.conv2.1', 'featureList.1.conv2.1.lastBN')
                new_state_dict[name] = v
            model.load_state_dict(new_state_dict, strict=False)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            exit()
    if args.label_smoothing > 0:
        # using Label Smoothing
        criterion = LabelSmoothingLoss(num_class, label_smoothing=args.label_smoothing)
    else:
        criterion = nn.CrossEntropyLoss()
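    # (illustrative note: LabelSmoothing.py is not shown in this commit; the standard
    # label-smoothing target it is expected to implement is
    #   (1 - smoothing) * one_hot(label) + smoothing / num_class)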
    if args.optimizer == 'sgd':
        if args.nbd:
            from NoBiasDecay import noBiasDecay
            # no bias decay: weight decay is applied only to the weight parameter groups
            optimizer_ft = optim.SGD(
                noBiasDecay(model, args.lr, args.weight_decay),
                momentum=0.9)
        else:
            optimizer_ft = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.weight_decay)
    elif args.optimizer == 'rmsprop':
        optimizer_ft = optim.RMSprop(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.weight_decay)
    elif args.optimizer == 'adam':
        optimizer_ft = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # name the scheduler scheduler_ft so it does not shadow the imported lr_scheduler module
    if args.lr_decay == 'step':
        # decay LR by a factor of gamma every step_size epochs
        scheduler_ft = lr_scheduler.StepLR(optimizer_ft, step_size=args.step_size, gamma=args.gamma)
    elif args.lr_decay == 'cos':
        scheduler_ft = CosineWarmupLR(optimizer=optimizer_ft, epochs=args.num_epochs, iter_in_one_epoch=loaders_len['train'], lr_min=args.lr_min, warmup_epochs=args.warmup_epochs)
    elif args.lr_decay == 'sgdr':
        scheduler_ft = CosineAnnealingWarmRestarts(optimizer=optimizer_ft, T_0=args.T_0, T_mult=args.T_mult, warmup_epochs=args.warmup_epochs, decay_rate=args.decay_rate)
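    # Note: StepLR is stepped once per epoch in train(), while CosineWarmupLR and
    # CosineAnnealingWarmRestarts are stepped every iteration (see the lr_decay
    # branches inside the training loop above).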
    model = train_model(args=args,
                        model=model,
                        dataloader=dataloaders,
                        loaders_len=loaders_len,
                        criterion=criterion,
                        optimizer=optimizer_ft,
                        scheduler=scheduler_ft,
                        use_gpu=use_gpu)
python -m torch.distributed.run --nproc_per_node 4 train.py --batch-size=128 --mode=small --print-freq=1 --dataset=CIFAR10 --ema-decay=0 --label-smoothing=0 --lr=0.2 --save-epoch-freq=10 --lr-decay=cos --lr-min=0 --warmup-epochs=5 --weight-decay=6e-5 --num-epochs=400 --num-workers=2 --width-multiplier=1 --data-dir /data/ --save-path /data/mobilenetv3out/ --log_dir /data/pid.txt 2>&1 | tee mobilenetv3_dcu_`date +%Y%m%d%H%M%S`.log