Unverified Commit aa316742 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Merge pull request #233 from microsoft/master

merge master
parents 3fe117f0 24fa4619
...@@ -16,13 +16,8 @@ NNI 中也内置了一些流程的模型压缩算法。 ...@@ -16,13 +16,8 @@ NNI 中也内置了一些流程的模型压缩算法。
:maxdepth: 2 :maxdepth: 2
概述 <Compressor/Overview> 概述 <Compressor/Overview>
Level Pruner <Compressor/Pruner> 快速入门 <Compressor/QuickStart>
AGP Pruner <Compressor/Pruner> Pruner <pruners>
L1Filter Pruner <Compressor/l1filterpruner> Quantizer <quantizers>
Slim Pruner <Compressor/SlimPruner> 模型加速 <Compressor/ModelSpeedup>
Lottery Ticket Pruner <Compressor/LotteryTicketHypothesis>
FPGM Pruner <Compressor/Pruner>
Naive Quantizer <Compressor/Quantizer>
QAT Quantizer <Compressor/Quantizer>
DoReFa Quantizer <Compressor/Quantizer>
自动模型压缩 <Compressor/AutoCompression> 自动模型压缩 <Compressor/AutoCompression>
############## ##########################
NAS 算法 神经网络架构搜索
############## ##########################
自动化的神经网络架构(NAS)搜索在寻找更好的模型方面发挥着越来越重要的作用。 自动化的神经网络架构(NAS)搜索在寻找更好的模型方面发挥着越来越重要的作用。
最近的研究工作证明了自动化 NAS 的可行性,并发现了一些超越手动设计和调整的模型。 最近的研究工作证明了自动化 NAS 的可行性,并发现了一些超越手动调整的模型。
代表工作有 NASNet, ENAS, DARTS, Network Morphism, 以及 Evolution 等。 新的算法还在不断涌现。 代表工作有 NASNet, ENAS, DARTS, Network Morphism, 以及 Evolution 等。 此外,新的创新不断涌现。
但是,要实现NAS算法需要花费大量的精力,并且很难在新算法中重用现有算法的代码。 但是,要实现 NAS 算法需要花费大量的精力,并且很难在新算法中重用现有算法的代码。
为了促进 NAS 创新 (如, 设计实现新的 NAS 模型,比较不同的 NAS 模型), 为了促进 NAS 创新 (如, 设计实现新的 NAS 模型,比较不同的 NAS 模型),
易于使用且灵活的编程接口非常重要。 易于使用且灵活的编程接口非常重要。
以此为出发点,我们的目标是在 NNI 中提供统一的架构 因此,我们为 NAS 提供统一的接口
来加速 NAS 创新,并更快的将最先进的算法用于现实世界的问题上。 来加速 NAS 创新,并更快的将最先进的算法用于现实世界的问题上。
详细信息,参考以下教程: 详细信息,参考以下教程:
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
概述 <NAS/Overview> 概述 <NAS/Overview>
NAS 接口 <NAS/NasInterface> 快速入门 <NAS/QuickStart>
教程 <NAS/NasGuide>
ENAS <NAS/ENAS> ENAS <NAS/ENAS>
DARTS <NAS/DARTS> DARTS <NAS/DARTS>
P-DARTS <NAS/PDARTS> P-DARTS <NAS/PDARTS>
SPOS <NAS/SPOS> SPOS <NAS/SPOS>
CDARTS <NAS/CDARTS> CDARTS <NAS/CDARTS>
ProxylessNAS <NAS/Proxylessnas>
自定义 NAS 算法 <NAS/Advanced>
API 参考 <NAS/NasReference>
...@@ -2,12 +2,11 @@ ...@@ -2,12 +2,11 @@
================== ==================
.. toctree:: .. toctree::
:maxdepth: 3 :maxdepth: 2
命令行<Tutorial/Nnictl> nnictl 命令 <Tutorial/Nnictl>
Python API<sdk_reference> Experiment 配置 <Tutorial/ExperimentConfig>
Annotation<Tutorial/AnnotationSpec>
配置<Tutorial/ExperimentConfig>
搜索空间<Tutorial/SearchSpaceSpec> 搜索空间<Tutorial/SearchSpaceSpec>
实现训练平台<TrainingService/HowToImplementTrainingService> NNI Annotation<Tutorial/AnnotationSpec>
Framework Library <SupportedFramework_Library> SDK API 参考 <sdk_reference>
支持的框架和库 <SupportedFramework_Library>
import math import math
import argparse
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from torchvision import datasets, transforms from torchvision import datasets, transforms
from nni.compression.torch import L1FilterPruner from nni.compression.torch import ActivationMeanRankFilterPruner
from models.cifar10.vgg import VGG from models.cifar10.vgg import VGG
...@@ -40,6 +41,12 @@ def test(model, device, test_loader): ...@@ -40,6 +41,12 @@ def test(model, device, test_loader):
def main(): def main():
parser = argparse.ArgumentParser("multiple gpu with pruning")
parser.add_argument("--epochs", type=int, default=160)
parser.add_argument("--retrain", default=False, action="store_true")
parser.add_argument("--parallel", default=False, action="store_true")
args = parser.parse_args()
torch.manual_seed(0) torch.manual_seed(0)
device = torch.device('cuda') device = torch.device('cuda')
train_loader = torch.utils.data.DataLoader( train_loader = torch.utils.data.DataLoader(
...@@ -63,10 +70,11 @@ def main(): ...@@ -63,10 +70,11 @@ def main():
model.to(device) model.to(device)
# Train the base VGG-16 model # Train the base VGG-16 model
if args.retrain:
print('=' * 10 + 'Train the unpruned base model' + '=' * 10) print('=' * 10 + 'Train the unpruned base model' + '=' * 10)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4) optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 160, 0) lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 160, 0)
for epoch in range(160): for epoch in range(args.epochs):
train(model, device, train_loader, optimizer) train(model, device, train_loader, optimizer)
test(model, device, test_loader) test(model, device, test_loader)
lr_scheduler.step(epoch) lr_scheduler.step(epoch)
...@@ -88,8 +96,16 @@ def main(): ...@@ -88,8 +96,16 @@ def main():
# Prune model and test accuracy without fine tuning. # Prune model and test accuracy without fine tuning.
print('=' * 10 + 'Test on the pruned model before fine tune' + '=' * 10) print('=' * 10 + 'Test on the pruned model before fine tune' + '=' * 10)
pruner = L1FilterPruner(model, configure_list) pruner = ActivationMeanRankFilterPruner(model, configure_list)
model = pruner.compress() model = pruner.compress()
if args.parallel:
if torch.cuda.device_count() > 1:
print("use {} gpus for pruning".format(torch.cuda.device_count()))
model = nn.DataParallel(model)
else:
print("only detect 1 gpu, fall back")
model.to(device)
test(model, device, test_loader) test(model, device, test_loader)
# top1 = 88.19% # top1 = 88.19%
......
import torch import torch
import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from torchvision import datasets, transforms from torchvision import datasets, transforms
from nni.compression.torch import FPGMPruner from nni.compression.torch import FPGMPruner
...@@ -6,17 +7,17 @@ from nni.compression.torch import FPGMPruner ...@@ -6,17 +7,17 @@ from nni.compression.torch import FPGMPruner
class Mnist(torch.nn.Module): class Mnist(torch.nn.Module):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.conv1 = torch.nn.Conv2d(1, 20, 5, 1) self.conv1 = nn.Conv2d(1, 20, 5, 1)
self.conv2 = torch.nn.Conv2d(20, 50, 5, 1) self.conv2 = nn.Conv2d(20, 50, 5, 1)
self.fc1 = torch.nn.Linear(4 * 4 * 50, 500) self.fc1 = nn.Linear(4 * 4 * 50, 500)
self.fc2 = torch.nn.Linear(500, 10) self.fc2 = nn.Linear(500, 10)
def forward(self, x): def forward(self, x):
x = F.relu(self.conv1(x)) x = F.relu(self.conv1(x))
x = F.max_pool2d(x, 2, 2) x = F.max_pool2d(x, 2, 2)
x = F.relu(self.conv2(x)) x = F.relu(self.conv2(x))
x = F.max_pool2d(x, 2, 2) x = F.max_pool2d(x, 2, 2)
x = x.view(-1, 4 * 4 * 50) x = x.view(x.size(0), -1)
x = F.relu(self.fc1(x)) x = F.relu(self.fc1(x))
x = self.fc2(x) x = self.fc2(x)
return F.log_softmax(x, dim=1) return F.log_softmax(x, dim=1)
...@@ -27,8 +28,14 @@ class Mnist(torch.nn.Module): ...@@ -27,8 +28,14 @@ class Mnist(torch.nn.Module):
return num_zero_filters, num_filters, float(num_zero_filters)/num_filters return num_zero_filters, num_filters, float(num_zero_filters)/num_filters
def print_conv_filter_sparsity(self): def print_conv_filter_sparsity(self):
if isinstance(self.conv1, nn.Conv2d):
conv1_data = self._get_conv_weight_sparsity(self.conv1) conv1_data = self._get_conv_weight_sparsity(self.conv1)
conv2_data = self._get_conv_weight_sparsity(self.conv2) conv2_data = self._get_conv_weight_sparsity(self.conv2)
else:
# self.conv1 is wrapped as PrunerModuleWrapper
conv1_data = self._get_conv_weight_sparsity(self.conv1.module)
conv2_data = self._get_conv_weight_sparsity(self.conv2.module)
print('conv1: num zero filters: {}, num filters: {}, sparsity: {:.4f}'.format(conv1_data[0], conv1_data[1], conv1_data[2])) print('conv1: num zero filters: {}, num filters: {}, sparsity: {:.4f}'.format(conv1_data[0], conv1_data[1], conv1_data[2]))
print('conv2: num zero filters: {}, num filters: {}, sparsity: {:.4f}'.format(conv2_data[0], conv2_data[1], conv2_data[2])) print('conv2: num zero filters: {}, num filters: {}, sparsity: {:.4f}'.format(conv2_data[0], conv2_data[1], conv2_data[2]))
......
...@@ -71,6 +71,8 @@ if __name__ == '__main__': ...@@ -71,6 +71,8 @@ if __name__ == '__main__':
pruner = LotteryTicketPruner(model, configure_list, optimizer) pruner = LotteryTicketPruner(model, configure_list, optimizer)
pruner.compress() pruner.compress()
#model = nn.DataParallel(model)
for i in pruner.get_prune_iterations(): for i in pruner.get_prune_iterations():
pruner.prune_iteration_start() pruner.prune_iteration_start()
loss = 0 loss = 0
......
...@@ -55,7 +55,7 @@ def test(model, device, test_loader): ...@@ -55,7 +55,7 @@ def test(model, device, test_loader):
def main(): def main():
torch.manual_seed(0) torch.manual_seed(0)
device = torch.device('cpu') device = torch.device('cuda')
trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_loader = torch.utils.data.DataLoader( train_loader = torch.utils.data.DataLoader(
...@@ -66,7 +66,7 @@ def main(): ...@@ -66,7 +66,7 @@ def main():
batch_size=1000, shuffle=True) batch_size=1000, shuffle=True)
model = Mnist() model = Mnist()
model.to(device) model = model.to(device)
'''you can change this to LevelPruner to implement it '''you can change this to LevelPruner to implement it
pruner = LevelPruner(configure_list) pruner = LevelPruner(configure_list)
...@@ -82,14 +82,14 @@ def main(): ...@@ -82,14 +82,14 @@ def main():
pruner = AGP_Pruner(model, configure_list) pruner = AGP_Pruner(model, configure_list)
model = pruner.compress() model = pruner.compress()
model = model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5) optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
for epoch in range(10): for epoch in range(10):
pruner.update_epoch(epoch) pruner.update_epoch(epoch)
print('# Epoch {} #'.format(epoch)) print('# Epoch {} #'.format(epoch))
train(model, device, train_loader, optimizer) train(model, device, train_loader, optimizer)
test(model, device, test_loader) test(model, device, test_loader)
pruner.export_model('model.pth', 'mask.pth', 'model.onnx', [1, 1, 28, 28]) pruner.export_model('model.pth', 'mask.pth', 'model.onnx', [1, 1, 28, 28], device)
if __name__ == '__main__': if __name__ == '__main__':
......
import argparse
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from models.cifar10.vgg import VGG
from nni.compression.speedup.torch import ModelSpeedup
from nni.compression.torch import apply_compression_results
torch.manual_seed(0)
use_mask = False
def apoz_speedup(masks_file, model_checkpoint):
    """Benchmark an APoZ-pruned VGG-16 on CUDA.

    With ``use_mask`` set, the masks are applied in place (layers keep
    their original size); otherwise ModelSpeedup replaces pruned layers
    with smaller ones before timing 32 forward passes.
    """
    device = torch.device('cuda')
    net = VGG(depth=16)
    net.to(device)
    net.eval()

    batch = torch.randn(64, 3, 32, 32)
    if use_mask:
        # Simulated pruning: weights are masked but shapes are unchanged.
        apply_compression_results(net, masks_file)
        batch = batch.to(device)
        start = time.time()
        for _ in range(32):
            out = net(batch)
        print('mask elapsed time: ', time.time() - start)
        return
    # Real speedup: shrink the pruned layers, then time the smaller model.
    ModelSpeedup(net, batch.to(device), masks_file).speedup_model()
    batch = batch.to(device)
    start = time.time()
    for _ in range(32):
        out = net(batch)
    print('speedup elapsed time: ', time.time() - start)
def l1filter_speedup(masks_file, model_checkpoint):
    """Benchmark an L1-filter-pruned VGG-16 on CUDA.

    Either applies the masks in place (``use_mask``) or shrinks pruned
    layers through ModelSpeedup, then times 32 forward passes.
    """
    device = torch.device('cuda')
    net = VGG(depth=16)
    net.to(device)
    net.eval()

    batch = torch.randn(64, 3, 32, 32)
    if use_mask:
        # Simulated pruning: weights are masked but shapes are unchanged.
        apply_compression_results(net, masks_file)
        batch = batch.to(device)
        start = time.time()
        for _ in range(32):
            out = net(batch)
        print('mask elapsed time: ', time.time() - start)
        return
    # Real speedup: replace pruned layers with smaller ones, then time.
    ModelSpeedup(net, batch.to(device), masks_file).speedup_model()
    batch = batch.to(device)
    start = time.time()
    for _ in range(32):
        out = net(batch)
    print('speedup elapsed time: ', time.time() - start)
def fpgm_speedup(masks_file, model_checkpoint):
    """Benchmark an FPGM-pruned MNIST model on CPU.

    Either applies the masks in place (``use_mask``) or shrinks pruned
    layers through ModelSpeedup, then times 40 forward passes.
    """
    from fpgm_torch_mnist import Mnist
    device = torch.device('cpu')
    net = Mnist()
    net.to(device)
    net.print_conv_filter_sparsity()

    batch = torch.randn(64, 1, 28, 28)
    if use_mask:
        # Simulated pruning: weights are masked but shapes are unchanged.
        apply_compression_results(net, masks_file)
        batch = batch.to(device)
        start = time.time()
        for _ in range(40):
            out = net(batch)
        print('mask elapsed time: ', time.time() - start)
        return
    # Real speedup: replace pruned layers with smaller ones, then time.
    ModelSpeedup(net, batch.to(device), masks_file).speedup_model()
    batch = batch.to(device)
    start = time.time()
    for _ in range(40):
        out = net(batch)
    print('speedup elapsed time: ', time.time() - start)
def slim_speedup(masks_file, model_checkpoint):
    """Benchmark a Slim-pruned VGG-19 on CUDA.

    Either applies the masks in place (``use_mask``) or shrinks pruned
    layers through ModelSpeedup, then times 32 forward passes.
    """
    device = torch.device('cuda')
    net = VGG(depth=19)
    net.to(device)
    net.eval()

    batch = torch.randn(64, 3, 32, 32)
    if use_mask:
        # Simulated pruning: weights are masked but shapes are unchanged.
        apply_compression_results(net, masks_file)
        batch = batch.to(device)
        start = time.time()
        for _ in range(32):
            out = net(batch)
        print('mask elapsed time: ', time.time() - start)
        return
    # Real speedup: replace pruned layers with smaller ones, then time.
    ModelSpeedup(net, batch.to(device), masks_file).speedup_model()
    batch = batch.to(device)
    start = time.time()
    for _ in range(32):
        out = net(batch)
    print('speedup elapsed time: ', time.time() - start)
if __name__ == '__main__':
    parser = argparse.ArgumentParser("speedup")
    parser.add_argument("--example_name", type=str, default="slim", help="the name of pruning example")
    parser.add_argument("--masks_file", type=str, default=None, help="the path of the masks file")
    parser.add_argument("--model_checkpoint", type=str, default=None, help="the path of checkpointed model")
    args = parser.parse_args()

    # Map each example name to its benchmark function and default masks file.
    examples = {
        'slim': (slim_speedup, 'mask_vgg19_cifar10.pth'),
        'fpgm': (fpgm_speedup, 'mask.pth'),
        'l1filter': (l1filter_speedup, 'mask_vgg16_cifar10.pth'),
        'apoz': (apoz_speedup, 'mask_vgg16_cifar10.pth'),
    }
    if args.example_name not in examples:
        raise ValueError('unsupported example_name: {}'.format(args.example_name))
    run, default_masks = examples[args.example_name]
    if args.masks_file is None:
        args.masks_file = default_masks
    run(args.masks_file, args.model_checkpoint)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from nni.compression.torch import SlimPruner
class fc1(nn.Module):
    """Small MNIST classifier: one Conv/BN/ReLU stem followed by a
    three-layer fully-connected head.

    Attribute names are part of the checkpoint format (state_dict keys),
    so they must not be renamed.
    """

    def __init__(self, num_classes=10):
        super(fc1, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU(inplace=True)
        self.linear1 = nn.Linear(32*28*28, 300)
        self.relu2 = nn.ReLU(inplace=True)
        self.linear2 = nn.Linear(300, 100)
        self.relu3 = nn.ReLU(inplace=True)
        self.linear3 = nn.Linear(100, num_classes)

    def forward(self, x):
        # Convolutional stem (padding keeps the 28x28 spatial size).
        feats = self.relu1(self.bn1(self.conv1(x)))
        # Flatten everything but the batch dimension, then run the MLP head.
        feats = torch.flatten(feats, 1)
        feats = self.relu2(self.linear1(feats))
        feats = self.relu3(self.linear2(feats))
        return self.linear3(feats)
def train(model, train_loader, optimizer, criterion, device):
    """Train `model` for one pass over `train_loader`.

    Returns the scalar loss value of the last batch.
    """
    model.train()
    for imgs, targets in train_loader:
        imgs, targets = imgs.to(device), targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(imgs), targets)
        loss.backward()
        optimizer.step()
    return loss.item()
def test(model, test_loader, criterion, device):
    """Evaluate `model` on `test_loader` and return top-1 accuracy (percent).

    Parameters
    ----------
    model : torch.nn.Module
    test_loader : torch.utils.data.DataLoader
        Must expose `.dataset` (used for the denominator).
    criterion : unused
        Kept only for signature compatibility with existing callers.
    device : torch.device

    Returns
    -------
    float
        Accuracy as a percentage of `len(test_loader.dataset)`.
    """
    model.eval()
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Count top-1 hits. The previous per-batch NLL accumulation was
            # dead code (computed but never returned or printed) and also
            # assumed log-probabilities; it has been removed.
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()
    accuracy = 100. * correct / len(test_loader.dataset)
    return accuracy
if __name__ == '__main__':
    # Standard MNIST normalization constants (training-set mean/std).
    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
    traindataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
    testdataset = datasets.MNIST('./data', train=False, transform=transform)
    # NOTE(review): num_workers=10 assumes a machine with that many spare
    # cores — confirm for the target environment.
    train_loader = torch.utils.data.DataLoader(traindataset, batch_size=60, shuffle=True, num_workers=10, drop_last=False)
    test_loader = torch.utils.data.DataLoader(testdataset, batch_size=60, shuffle=False, num_workers=10, drop_last=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = fc1()
    criterion = nn.CrossEntropyLoss()

    # Prune 86% of BatchNorm2d scaling factors with SlimPruner.
    # NOTE(review): 'prune_iterations' looks like a LotteryTicketPruner key
    # (see the commented-out iteration loop below) — verify SlimPruner's
    # config schema actually accepts/uses it.
    configure_list = [{
        'prune_iterations': 5,
        'sparsity': 0.86,
        'op_types': ['BatchNorm2d']
    }]
    pruner = SlimPruner(model, configure_list)
    pruner.compress()
    # Wrap in DataParallel only when more than one GPU is visible.
    if torch.cuda.device_count()>1:
        model = nn.DataParallel(model)
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1.2e-3)
    # Print parameter names — useful to inspect the pruner's wrapper modules
    # (and the extra 'module.' prefix added by DataParallel).
    for name, par in model.named_parameters():
        print(name)
    # for i in pruner.get_prune_iterations():
    #     pruner.prune_iteration_start()
    loss = 0
    accuracy = 0
    # Fine-tune the pruned model and report loss/accuracy every epoch.
    for epoch in range(10):
        loss = train(model, train_loader, optimizer, criterion, device)
        accuracy = test(model, test_loader, criterion, device)
        print('current epoch: {0}, loss: {1}, accuracy: {2}'.format(epoch, loss, accuracy))
        # print('prune iteration: {0}, loss: {1}, accuracy: {2}'.format(i, loss, accuracy))
    # Export the pruned weights and the binary masks for later speedup.
    pruner.export_model('model.pth', 'mask.pth')
\ No newline at end of file
import math import math
import argparse
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
...@@ -6,7 +7,6 @@ from torchvision import datasets, transforms ...@@ -6,7 +7,6 @@ from torchvision import datasets, transforms
from nni.compression.torch import SlimPruner from nni.compression.torch import SlimPruner
from models.cifar10.vgg import VGG from models.cifar10.vgg import VGG
def updateBN(model): def updateBN(model):
for m in model.modules(): for m in model.modules():
if isinstance(m, nn.BatchNorm2d): if isinstance(m, nn.BatchNorm2d):
...@@ -49,6 +49,13 @@ def test(model, device, test_loader): ...@@ -49,6 +49,13 @@ def test(model, device, test_loader):
def main(): def main():
parser = argparse.ArgumentParser("multiple gpu with pruning")
parser.add_argument("--epochs", type=int, default=160)
parser.add_argument("--retrain", default=False, action="store_true")
parser.add_argument("--parallel", default=False, action="store_true")
args = parser.parse_args()
torch.manual_seed(0) torch.manual_seed(0)
device = torch.device('cuda') device = torch.device('cuda')
train_loader = torch.utils.data.DataLoader( train_loader = torch.utils.data.DataLoader(
...@@ -70,15 +77,16 @@ def main(): ...@@ -70,15 +77,16 @@ def main():
model = VGG(depth=19) model = VGG(depth=19)
model.to(device) model.to(device)
# Train the base VGG-19 model # Train the base VGG-19 model
if args.retrain:
print('=' * 10 + 'Train the unpruned base model' + '=' * 10) print('=' * 10 + 'Train the unpruned base model' + '=' * 10)
epochs = 160 epochs = args.epochs
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4) optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)
for epoch in range(epochs): for epoch in range(epochs):
if epoch in [epochs * 0.5, epochs * 0.75]: if epoch in [epochs * 0.5, epochs * 0.75]:
for param_group in optimizer.param_groups: for param_group in optimizer.param_groups:
param_group['lr'] *= 0.1 param_group['lr'] *= 0.1
print("epoch {}".format(epoch))
train(model, device, train_loader, optimizer, True) train(model, device, train_loader, optimizer, True)
test(model, device, test_loader) test(model, device, test_loader)
torch.save(model.state_dict(), 'vgg19_cifar10.pth') torch.save(model.state_dict(), 'vgg19_cifar10.pth')
...@@ -99,9 +107,14 @@ def main(): ...@@ -99,9 +107,14 @@ def main():
print('=' * 10 + 'Test the pruned model before fine tune' + '=' * 10) print('=' * 10 + 'Test the pruned model before fine tune' + '=' * 10)
pruner = SlimPruner(model, configure_list) pruner = SlimPruner(model, configure_list)
model = pruner.compress() model = pruner.compress()
test(model, device, test_loader) if args.parallel:
# top1 = 93.55% if torch.cuda.device_count() > 1:
print("use {} gpus for pruning".format(torch.cuda.device_count()))
model = nn.DataParallel(model)
# model = nn.DataParallel(model, device_ids=[0, 1])
else:
print("only detect 1 gpu, fall back")
model.to(device)
# Fine tune the pruned model for 40 epochs and test accuracy # Fine tune the pruned model for 40 epochs and test accuracy
print('=' * 10 + 'Fine tuning' + '=' * 10) print('=' * 10 + 'Fine tuning' + '=' * 10)
optimizer_finetune = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-4) optimizer_finetune = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-4)
......
# 加速掩码的模型
*此功能还处于预览版。*
## 介绍
剪枝算法通常都用权重掩码来模拟实际的剪枝。 掩码可以用来检查某个剪枝(或稀疏)算法的模型性能,但还没有真正加速。 模型加速才是模型剪枝的最终目标。因此提供了此工具,来帮助基于用户提供的掩码(掩码来自于剪枝算法),将已有模型转换成小模型。
有两种剪枝算法。 一种是细粒度的剪枝,不改变权重形状,和输入输出的张量。 稀疏内核会被用来加速细粒度剪枝的层。 另一类是粗粒度的剪枝(例如,通道),通常,权重形状,输入输出张量会有所改变。 要加速这类剪枝算法,不需要使用稀疏内核,只需要用更小的层来替换。 由于开源社区中对稀疏内核的支持还比较有限,当前仅支持粗粒度剪枝,会在将来再支持细粒度的剪枝算法。
## 设计和实现
为了加速模型,被剪枝的层应该被替换掉,要么为粗粒度掩码使用较小的层,要么用稀疏内核来替换细粒度的掩码。 粗粒度掩码通常会改变权重的形状,或输入输出张量,因此,应该通过形状推断,来检查是否其它未被剪枝的层由于形状变化而需要改变形状。 因此,在设计中,主要有两个步骤:第一,做形状推理,找出所有应该替换的模块;第二,替换模块。 第一步需要模型的拓扑(即连接),我们使用了 `jit.trace` 来获取 PyTorch 的模型图。
对于每个模块,要准备四个函数,三个用于形状推理,一个用于模块替换。 三个形状推理函数是:给定权重形状推断输入/输出形状,给定输入形状推断权重/输出形状,给定输出形状推断权重/输入形状。 模块替换功能返回一个较小的新创建的模块。
## 用法
```python
from nni.compression.speedup.torch import ModelSpeedup
# model: 要加速的模型
# dummy_input: 模型的示例输入,传给 `jit.trace`
# masks_file: 剪枝算法创建的掩码文件
m_speedup = ModelSpeedup(model, dummy_input.to(device), masks_file)
m_speedup.speedup_model()
dummy_input = dummy_input.to(device)
start = time.time()
out = model(dummy_input)
print('elapsed time: ', time.time() - start)
```
完整示例参考[这里](https://github.com/microsoft/nni/tree/master/examples/model_compress/model_speedup.py)
注意:当前实现仅用于 torch 1.3.1 和 torchvision 0.4.2
## 局限性
由于每个模块需要 4 个函数用于形状推理和模块替换,因此工作量较大,当前仅实现了示例所需的函数。 如果要加速自己的模型,但当前不支持,欢迎贡献。
对于 PyTorch,仅提供了替换模块,如果是在 `forward` 中的函数,当前不支持。 一种解决方案是将函数变为 PyTorch 模块。
## 示例的加速结果
实验代码可在[这里](https://github.com/microsoft/nni/tree/master/examples/model_compress/model_speedup.py)找到。
### slim Pruner 示例
在一块 V100 GPU 上, 输入张量:`torch.randn(64, 3, 32, 32)`
| 次数 | 掩码时延 | 加速后的时延 |
| -- | ------- | -------- |
| 1 | 0.01197 | 0.005107 |
| 2 | 0.02019 | 0.008769 |
| 4 | 0.02733 | 0.014809 |
| 8 | 0.04310 | 0.027441 |
| 16 | 0.07731 | 0.05008 |
| 32 | 0.14464 | 0.10027 |
### fpgm Pruner 示例
在 CPU 上, 输入张量:`torch.randn(64, 1, 28, 28)`, 方差较大
| 次数 | 掩码时延 | 加速后的时延 |
| --- | ------- | -------- |
| 1 | 0.01383 | 0.01839 |
| 2 | 0.01167 | 0.003558 |
| 4 | 0.01636 | 0.01088 |
| 40 | 0.14412 | 0.08268 |
| 40 | 1.29385 | 0.14408 |
| 40 | 0.41035 | 0.46162 |
| 400 | 6.29020 | 5.82143 |
### l1filter Pruner 示例
在一块 V100 GPU 上, 输入张量:`torch.randn(64, 3, 32, 32)`
| 次数 | 掩码时延 | 加速后的时延 |
| -- | ------- | -------- |
| 1 | 0.01026 | 0.003677 |
| 2 | 0.01657 | 0.008161 |
| 4 | 0.02458 | 0.020018 |
| 8 | 0.03498 | 0.025504 |
| 16 | 0.06757 | 0.047523 |
| 32 | 0.10487 | 0.086442 |
### APoZ Pruner 示例
在一块 V100 GPU 上, 输入张量:`torch.randn(64, 3, 32, 32)`
| 次数 | 掩码时延 | 加速后的时延 |
| -- | ------- | -------- |
| 1 | 0.01389 | 0.004208 |
| 2 | 0.01628 | 0.008310 |
| 4 | 0.02521 | 0.014008 |
| 8 | 0.03386 | 0.023923 |
| 16 | 0.06042 | 0.046183 |
| 32 | 0.12421 | 0.087113 |
\ No newline at end of file
import os
import numpy as np
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
def get_split_list(in_dim, child_num):
    """Split `in_dim` into `child_num` integer parts differing by at most 1.

    The remainder is handed out one unit at a time to the leading parts,
    so the result sums to `in_dim` and is non-increasing.
    """
    base, extra = divmod(in_dim, child_num)
    return [base + 1 if i < extra else base for i in range(child_num)]
class DataProvider:
    """Abstract description of a dataset used for architecture search.

    Subclasses implement the metadata accessors and use
    `random_sample_valid_set` to carve a reproducible, class-balanced
    validation split out of the training data.
    """

    VALID_SEED = 0  # random seed for the validation set

    @staticmethod
    def name():
        """ Return name of the dataset """
        raise NotImplementedError

    @property
    def data_shape(self):
        """ Return shape as python list of one data entry """
        raise NotImplementedError

    @property
    def n_classes(self):
        """ Return `int` of num classes """
        raise NotImplementedError

    @property
    def save_path(self):
        """ local path to save the data """
        raise NotImplementedError

    @property
    def data_url(self):
        """ link to download the data """
        raise NotImplementedError

    @staticmethod
    def random_sample_valid_set(train_labels, valid_size, n_classes):
        """Split sample indexes into train / validation parts.

        The validation part holds `valid_size` indexes spread as evenly as
        possible across the `n_classes` labels; the split is deterministic
        thanks to the fixed VALID_SEED.
        """
        total = len(train_labels)
        assert total > valid_size

        # Seeded generator keeps the permutation reproducible across runs.
        gen = torch.Generator()
        gen.manual_seed(DataProvider.VALID_SEED)
        shuffled = torch.randperm(total, generator=gen).tolist()

        # Per-class validation quota: valid_size split as evenly as possible
        # (inlined equivalent of get_split_list).
        quota = [valid_size // n_classes + (1 if i < valid_size % n_classes else 0)
                 for i in range(n_classes)]

        train_indexes, valid_indexes = [], []
        for idx in shuffled:
            label = train_labels[idx]
            # Normalize the label to a plain int (floats and one-hot arrays
            # are both accepted).
            if isinstance(label, float):
                label = int(label)
            elif isinstance(label, np.ndarray):
                label = np.argmax(label)
            else:
                assert isinstance(label, int)
            if quota[label] > 0:
                valid_indexes.append(idx)
                quota[label] -= 1
            else:
                train_indexes.append(idx)
        return train_indexes, valid_indexes
class ImagenetDataProvider(DataProvider):
    """DataProvider for an ImageFolder-style ImageNet layout.

    Expects `<save_path>/train` and `<save_path>/val` directories. When
    `valid_size` is given, a class-balanced validation split is carved out
    of the training set; otherwise the test loader doubles as validation.
    """

    def __init__(self, save_path=None, train_batch_size=256, test_batch_size=512, valid_size=None,
                 n_worker=32, resize_scale=0.08, distort_color=None):
        self._save_path = save_path
        train_transforms = self.build_train_transform(distort_color, resize_scale)
        train_dataset = datasets.ImageFolder(self.train_path, train_transforms)

        if valid_size is not None:
            # A float valid_size is interpreted as a fraction of the train set.
            if isinstance(valid_size, float):
                valid_size = int(valid_size * len(train_dataset))
            else:
                assert isinstance(valid_size, int), 'invalid valid_size: %s' % valid_size
            train_indexes, valid_indexes = self.random_sample_valid_set(
                [cls for _, cls in train_dataset.samples], valid_size, self.n_classes,
            )
            train_sampler = torch.utils.data.sampler.SubsetRandomSampler(train_indexes)
            valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(valid_indexes)

            # The validation subset reuses the training images but with
            # deterministic (resize + center-crop) transforms.
            valid_dataset = datasets.ImageFolder(self.train_path, transforms.Compose([
                transforms.Resize(self.resize_value),
                transforms.CenterCrop(self.image_size),
                transforms.ToTensor(),
                self.normalize,
            ]))

            self.train = torch.utils.data.DataLoader(
                train_dataset, batch_size=train_batch_size, sampler=train_sampler,
                num_workers=n_worker, pin_memory=True,
            )
            self.valid = torch.utils.data.DataLoader(
                valid_dataset, batch_size=test_batch_size, sampler=valid_sampler,
                num_workers=n_worker, pin_memory=True,
            )
        else:
            self.train = torch.utils.data.DataLoader(
                train_dataset, batch_size=train_batch_size, shuffle=True,
                num_workers=n_worker, pin_memory=True,
            )
            self.valid = None

        self.test = torch.utils.data.DataLoader(
            datasets.ImageFolder(self.valid_path, transforms.Compose([
                transforms.Resize(self.resize_value),
                transforms.CenterCrop(self.image_size),
                transforms.ToTensor(),
                self.normalize,
            ])), batch_size=test_batch_size, shuffle=False, num_workers=n_worker, pin_memory=True,
        )

        # Without an explicit validation split, validate on the test set.
        if self.valid is None:
            self.valid = self.test

    @staticmethod
    def name():
        return 'imagenet'

    @property
    def data_shape(self):
        return 3, self.image_size, self.image_size  # C, H, W

    @property
    def n_classes(self):
        return 1000

    @property
    def save_path(self):
        # Lazily fall back to the default dataset location.
        if self._save_path is None:
            self._save_path = '/dataset/imagenet'
        return self._save_path

    @property
    def data_url(self):
        raise ValueError('unable to download ImageNet')

    @property
    def train_path(self):
        return os.path.join(self.save_path, 'train')

    @property
    def valid_path(self):
        # Bug fix: go through the `save_path` property (not the raw
        # `_save_path` attribute, as before) so the default location is
        # applied even when no explicit save_path was configured —
        # consistent with `train_path`.
        return os.path.join(self.save_path, 'val')

    @property
    def normalize(self):
        # Standard ImageNet channel statistics.
        return transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    def build_train_transform(self, distort_color, resize_scale):
        """Build the training pipeline: random crop/flip plus optional
        color jitter ('strong', 'normal', or none)."""
        print('Color jitter: %s' % distort_color)
        if distort_color == 'strong':
            color_transform = transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1)
        elif distort_color == 'normal':
            color_transform = transforms.ColorJitter(brightness=32. / 255., saturation=0.5)
        else:
            color_transform = None
        if color_transform is None:
            train_transforms = transforms.Compose([
                transforms.RandomResizedCrop(self.image_size, scale=(resize_scale, 1.0)),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                self.normalize,
            ])
        else:
            train_transforms = transforms.Compose([
                transforms.RandomResizedCrop(self.image_size, scale=(resize_scale, 1.0)),
                transforms.RandomHorizontalFlip(),
                color_transform,
                transforms.ToTensor(),
                self.normalize,
            ])
        return train_transforms

    @property
    def resize_value(self):
        return 256

    @property
    def image_size(self):
        return 224
\ No newline at end of file
import os
import sys
import logging
from argparse import ArgumentParser
import torch
import datasets
from putils import get_parameters
from model import SearchMobileNet
from nni.nas.pytorch.proxylessnas import ProxylessNasTrainer
from retrain import Retrain
logger = logging.getLogger('nni_proxylessnas')
if __name__ == "__main__":
    parser = ArgumentParser("proxylessnas")
    # configurations of the model
    parser.add_argument("--n_cell_stages", default='4,4,4,4,4,1', type=str)
    parser.add_argument("--stride_stages", default='2,2,2,1,2,1', type=str)
    parser.add_argument("--width_stages", default='24,40,80,96,192,320', type=str)
    parser.add_argument("--bn_momentum", default=0.1, type=float)
    parser.add_argument("--bn_eps", default=1e-3, type=float)
    parser.add_argument("--dropout_rate", default=0, type=float)
    parser.add_argument("--no_decay_keys", default='bn', type=str, choices=[None, 'bn', 'bn#bias'])
    # configurations of imagenet dataset
    parser.add_argument("--data_path", default='/data/imagenet/', type=str)
    parser.add_argument("--train_batch_size", default=256, type=int)
    parser.add_argument("--test_batch_size", default=500, type=int)
    parser.add_argument("--n_worker", default=32, type=int)
    parser.add_argument("--resize_scale", default=0.08, type=float)
    parser.add_argument("--distort_color", default='normal', type=str, choices=['normal', 'strong', 'None'])
    # configurations for training mode
    parser.add_argument("--train_mode", default='search', type=str, choices=['search', 'retrain'])
    # configurations for search
    parser.add_argument("--checkpoint_path", default='./search_mobile_net.pt', type=str)
    parser.add_argument("--arch_path", default='./arch_path.pt', type=str)
    parser.add_argument("--no-warmup", dest='warmup', action='store_false')
    # configurations for retrain
    parser.add_argument("--exported_arch_path", default=None, type=str)

    args = parser.parse_args()
    # Retraining requires an architecture exported by a previous search run.
    if args.train_mode == 'retrain' and args.exported_arch_path is None:
        logger.error('When --train_mode is retrain, --exported_arch_path must be specified.')
        sys.exit(-1)

    model = SearchMobileNet(width_stages=[int(i) for i in args.width_stages.split(',')],
                            n_cell_stages=[int(i) for i in args.n_cell_stages.split(',')],
                            stride_stages=[int(i) for i in args.stride_stages.split(',')],
                            n_classes=1000,
                            dropout_rate=args.dropout_rate,
                            bn_param=(args.bn_momentum, args.bn_eps))
    logger.info('SearchMobileNet model create done')
    model.init_model()
    logger.info('SearchMobileNet model init done')

    # move network to GPU if available
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    logger.info('Creating data provider...')
    data_provider = datasets.ImagenetDataProvider(save_path=args.data_path,
                                                  train_batch_size=args.train_batch_size,
                                                  test_batch_size=args.test_batch_size,
                                                  valid_size=None,
                                                  n_worker=args.n_worker,
                                                  resize_scale=args.resize_scale,
                                                  distort_color=args.distort_color)
    logger.info('Creating data provider done')

    # Bug fix: `momentum`/`nesterov` were previously assigned only inside the
    # `if args.no_decay_keys:` branch, so the `else` branch below raised
    # NameError. Define them up front so both branches can use them.
    momentum, nesterov = 0.9, True
    if args.no_decay_keys:
        # Exclude the listed parameter groups (e.g. BN) from weight decay.
        keys = args.no_decay_keys
        optimizer = torch.optim.SGD([
            {'params': get_parameters(model, keys, mode='exclude'), 'weight_decay': 4e-5},
            {'params': get_parameters(model, keys, mode='include'), 'weight_decay': 0},
        ], lr=0.05, momentum=momentum, nesterov=nesterov)
    else:
        optimizer = torch.optim.SGD(get_parameters(model), lr=0.05, momentum=momentum, nesterov=nesterov, weight_decay=4e-5)

    if args.train_mode == 'search':
        # this is architecture search
        logger.info('Creating ProxylessNasTrainer...')
        trainer = ProxylessNasTrainer(model,
                                      model_optim=optimizer,
                                      train_loader=data_provider.train,
                                      valid_loader=data_provider.valid,
                                      device=device,
                                      warmup=args.warmup,
                                      ckpt_path=args.checkpoint_path,
                                      arch_path=args.arch_path)
        logger.info('Start to train with ProxylessNasTrainer...')
        trainer.train()
        logger.info('Training done')
        trainer.export(args.arch_path)
        logger.info('Best architecture exported in %s', args.arch_path)
    elif args.train_mode == 'retrain':
        # this is retrain
        from nni.nas.pytorch.fixed import apply_fixed_architecture
        assert os.path.isfile(args.exported_arch_path), \
            "exported_arch_path {} should be a file.".format(args.exported_arch_path)
        apply_fixed_architecture(model, args.exported_arch_path, device=device)
        trainer = Retrain(model, optimizer, device, data_provider, n_epochs=300)
        trainer.run()
import torch
import torch.nn as nn
import math
import ops
import putils
from nni.nas import pytorch as nas
class SearchMobileNet(nn.Module):
    """ProxylessNAS search space: a MobileNetV2-style supernet whose cells are
    ``LayerChoice`` mutables over MBConv candidate ops.

    Parameters
    ----------
    width_stages : sequence of int
        width (output channels) of each cell stage in the block
    n_cell_stages : sequence of int
        number of cells in each cell stage
    stride_stages : sequence of int
        stride of each cell stage in the block
    width_mult : int
        the scale factor of width
    n_classes : int
        number of classification classes
    dropout_rate : float
        dropout rate of the classifier head
    bn_param : tuple of (float, float)
        (momentum, eps) applied to every batch-norm layer
    """

    def __init__(self,
                 width_stages=(24, 40, 80, 96, 192, 320),
                 n_cell_stages=(4, 4, 4, 4, 4, 1),
                 stride_stages=(2, 2, 2, 1, 2, 1),
                 width_mult=1, n_classes=1000,
                 dropout_rate=0, bn_param=(0.1, 1e-3)):
        super(SearchMobileNet, self).__init__()

        input_channel = putils.make_divisible(32 * width_mult, 8)
        first_cell_width = putils.make_divisible(16 * width_mult, 8)
        # BUG FIX: the original rewrote `width_stages` in place, mutating both
        # the caller's list and the shared mutable default. Build a fresh list.
        width_stages = [putils.make_divisible(w * width_mult, 8) for w in width_stages]

        # first conv (stem)
        first_conv = ops.ConvLayer(3, input_channel, kernel_size=3, stride=2, use_bn=True, act_func='relu6', ops_order='weight_bn_act')
        # first block: fixed MBConv with expansion 1
        first_block = ops.OPS['3x3_MBConv1'](input_channel, first_cell_width, 1)
        input_channel = first_cell_width

        blocks = [first_block]
        stage_cnt = 0
        for width, n_cell, s in zip(width_stages, n_cell_stages, stride_stages):
            for i in range(n_cell):
                # Only the first cell of each stage may downsample.
                stride = s if i == 0 else 1
                op_candidates = [ops.OPS['3x3_MBConv3'](input_channel, width, stride),
                                 ops.OPS['3x3_MBConv6'](input_channel, width, stride),
                                 ops.OPS['5x5_MBConv3'](input_channel, width, stride),
                                 ops.OPS['5x5_MBConv6'](input_channel, width, stride),
                                 ops.OPS['7x7_MBConv3'](input_channel, width, stride),
                                 ops.OPS['7x7_MBConv6'](input_channel, width, stride)]
                if stride == 1 and input_channel == width:
                    # Not the first cell of the stage: allow the zero op so the
                    # cell can be skipped, and add an identity shortcut.
                    op_candidates.append(ops.OPS['Zero'](input_channel, width, stride))
                    shortcut = ops.IdentityLayer(input_channel, input_channel)
                else:
                    shortcut = None
                # The original built the same LayerChoice in both branches of
                # the condition above; collapsed into one construction.
                conv_op = nas.mutables.LayerChoice(op_candidates,
                                                   return_mask=True,
                                                   key="s{}_c{}".format(stage_cnt, i))
                blocks.append(ops.MobileInvertedResidualBlock(conv_op, shortcut, op_candidates))
                input_channel = width
            stage_cnt += 1

        # feature mix layer
        # BUG FIX: was `putils.make_devisible` (typo) -- raised AttributeError
        # whenever width_mult > 1.0.
        last_channel = putils.make_divisible(1280 * width_mult, 8) if width_mult > 1.0 else 1280
        feature_mix_layer = ops.ConvLayer(input_channel, last_channel, kernel_size=1, use_bn=True, act_func='relu6', ops_order='weight_bn_act')
        classifier = ops.LinearLayer(last_channel, n_classes, dropout_rate=dropout_rate)

        self.first_conv = first_conv
        self.blocks = nn.ModuleList(blocks)
        self.feature_mix_layer = feature_mix_layer
        self.global_avg_pooling = nn.AdaptiveAvgPool2d(1)
        self.classifier = classifier

        # set bn param
        self.set_bn_param(momentum=bn_param[0], eps=bn_param[1])

    def forward(self, x):
        """Stem -> cells -> 1x1 mix conv -> global average pool -> classifier."""
        x = self.first_conv(x)
        for block in self.blocks:
            x = block(x)
        x = self.feature_mix_layer(x)
        x = self.global_avg_pooling(x)
        x = x.view(x.size(0), -1)  # flatten (N, C, 1, 1) -> (N, C)
        x = self.classifier(x)
        return x

    def set_bn_param(self, momentum, eps):
        """Apply (momentum, eps) to every BatchNorm1d/2d module in the network."""
        for m in self.modules():
            if isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d)):
                m.momentum = momentum
                m.eps = eps

    def init_model(self, model_init='he_fout', init_div_groups=False):
        """Initialize weights in place.

        Parameters
        ----------
        model_init : str
            'he_fout' (fan-out) or 'he_fin' (fan-in) He-normal init for convs.
        init_div_groups : bool
            if True, divide the fan by the conv's group count.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                if model_init == 'he_fout':
                    n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                elif model_init == 'he_fin':
                    n = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
                else:
                    raise NotImplementedError('unsupported model_init: %s' % model_init)
                if init_div_groups:
                    n /= m.groups
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d)):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                stdv = 1. / math.sqrt(m.weight.size(1))
                m.weight.data.uniform_(-stdv, stdv)
                if m.bias is not None:
                    m.bias.data.zero_()
from collections import OrderedDict
import torch
import torch.nn as nn
from putils import get_same_padding, build_activation
def _mbconv(kernel, expand):
    """Factory for an MBConv op builder with fixed kernel size and expansion."""
    def _build(in_C, out_C, stride):
        return MBInvertedConvLayer(in_C, out_C, kernel, stride, expand)
    return _build


# Registry of candidate ops: name -> builder(in_C, out_C, stride).
OPS = {
    'Identity': lambda in_C, out_C, stride: IdentityLayer(in_C, out_C, ops_order='weight_bn_act'),
    'Zero': lambda in_C, out_C, stride: ZeroLayer(stride=stride),
}
# '3x3_MBConv1' ... '7x7_MBConv6': every (kernel, expansion) combination.
for _kernel in (3, 5, 7):
    for _expand in range(1, 7):
        OPS['{k}x{k}_MBConv{e}'.format(k=_kernel, e=_expand)] = _mbconv(_kernel, _expand)
class MobileInvertedResidualBlock(nn.Module):
    """MobileNetV2-style inverted residual block wrapping a (possibly mutable)
    conv op and an optional identity shortcut.

    Parameters
    ----------
    mobile_inverted_conv : nn.Module
        the conv op; here a ``LayerChoice`` returning ``(output, mask)``.
    shortcut : nn.Module or None
        identity shortcut; None disables the residual connection.
    op_candidates_list : list of nn.Module
        candidate ops behind the LayerChoice, used to detect the zero op.
    """

    def __init__(self, mobile_inverted_conv, shortcut, op_candidates_list):
        super(MobileInvertedResidualBlock, self).__init__()
        self.mobile_inverted_conv = mobile_inverted_conv
        self.shortcut = shortcut
        self.op_candidates_list = op_candidates_list

    def forward(self, x):
        # LayerChoice was built with return_mask=True, so it returns both the
        # output and the index/mask of the chosen candidate.
        out, idx = self.mobile_inverted_conv(x)
        # TODO: unify idx format
        if not isinstance(idx, int):
            # idx is a one-hot mask tensor; convert it to the chosen position.
            idx = (idx == 1).nonzero()
        if self.op_candidates_list[idx].is_zero_layer():
            # Zero op selected: the block degenerates to identity.
            res = x
        elif self.shortcut is None:
            res = out
        else:
            conv_x = out
            skip_x = self.shortcut(x)
            res = skip_x + conv_x
        return res
class ShuffleLayer(nn.Module):
    """Channel shuffle (as in ShuffleNet): interleave channels across groups."""

    def __init__(self, groups):
        super(ShuffleLayer, self).__init__()
        self.groups = groups

    def forward(self, x):
        n, c, h, w = x.size()
        # (n, c, h, w) -> (n, g, c/g, h, w), swap the group/channel axes,
        # then flatten back to (n, c, h, w).
        grouped = x.view(n, self.groups, c // self.groups, h, w)
        shuffled = grouped.transpose(1, 2).contiguous()
        return shuffled.view(n, -1, h, w)
class Base2DLayer(nn.Module):
    """Base class for 2-D conv-style layers whose sub-ops (weight / bn / act /
    dropout) are assembled in a configurable order.

    Parameters
    ----------
    in_channels, out_channels : int
        channel counts of the layer.
    use_bn : bool
        whether to insert a BatchNorm2d.
    act_func : str or None
        activation name understood by ``build_activation``.
    dropout_rate : float
        Dropout2d probability; 0 disables dropout.
    ops_order : str
        underscore-separated op order, e.g. 'weight_bn_act' or 'bn_act_weight'.
    """

    def __init__(self, in_channels, out_channels,
                 use_bn=True, act_func='relu', dropout_rate=0, ops_order='weight_bn_act'):
        super(Base2DLayer, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.use_bn = use_bn
        self.act_func = act_func
        self.dropout_rate = dropout_rate
        self.ops_order = ops_order

        """ modules """
        modules = {}
        # batch norm
        if self.use_bn:
            # BN normalizes whatever precedes it in ops_order, so its channel
            # count depends on whether it comes before or after the weight op.
            if self.bn_before_weight:
                modules['bn'] = nn.BatchNorm2d(in_channels)
            else:
                modules['bn'] = nn.BatchNorm2d(out_channels)
        else:
            modules['bn'] = None
        # activation; in-place only when activation is not the first op
        modules['act'] = build_activation(self.act_func, self.ops_list[0] != 'act')
        # dropout
        if self.dropout_rate > 0:
            modules['dropout'] = nn.Dropout2d(self.dropout_rate, inplace=True)
        else:
            modules['dropout'] = None
        # weight (implemented by subclasses; a dict of named modules or None)
        modules['weight'] = self.weight_op()
        # add modules in ops_order; dropout is registered just before weight
        for op in self.ops_list:
            if modules[op] is None:
                continue
            elif op == 'weight':
                if modules['dropout'] is not None:
                    self.add_module('dropout', modules['dropout'])
                for key in modules['weight']:
                    self.add_module(key, modules['weight'][key])
            else:
                self.add_module(op, modules[op])

    @property
    def ops_list(self):
        # e.g. 'weight_bn_act' -> ['weight', 'bn', 'act']
        return self.ops_order.split('_')

    @property
    def bn_before_weight(self):
        # True iff 'bn' appears before 'weight' in ops_order.
        for op in self.ops_list:
            if op == 'bn':
                return True
            elif op == 'weight':
                return False
        raise ValueError('Invalid ops_order: %s' % self.ops_order)

    def weight_op(self):
        # Subclasses must return the weight module(s) (or None).
        raise NotImplementedError

    def forward(self, x):
        # Registration order in __init__ defines execution order here.
        for module in self._modules.values():
            x = module(x)
        return x

    @staticmethod
    def is_zero_layer():
        return False
class ConvLayer(Base2DLayer):
    """Conv2d (plus optional channel shuffle) wrapped in the Base2DLayer
    weight/bn/act/dropout machinery.

    Parameters
    ----------
    kernel_size : int or tuple of int
        convolution kernel size (odd, per get_same_padding).
    stride, dilation, groups : int
        standard Conv2d parameters.
    bias : bool
        whether the conv has a bias term.
    has_shuffle : bool
        if True and groups > 1, append a ShuffleLayer after the conv.
    """

    def __init__(self, in_channels, out_channels,
                 kernel_size=3, stride=1, dilation=1, groups=1, bias=False, has_shuffle=False,
                 use_bn=True, act_func='relu', dropout_rate=0, ops_order='weight_bn_act'):
        # These must be set before super().__init__, which calls weight_op().
        self.kernel_size = kernel_size
        self.stride = stride
        self.dilation = dilation
        self.groups = groups
        self.bias = bias
        self.has_shuffle = has_shuffle
        super(ConvLayer, self).__init__(in_channels, out_channels, use_bn, act_func, dropout_rate, ops_order)

    def weight_op(self):
        padding = get_same_padding(self.kernel_size)
        if isinstance(padding, int):
            padding *= self.dilation
        else:
            # BUG FIX: get_same_padding returns a *tuple* for tuple kernel
            # sizes, and the original `padding[0] *= self.dilation` raised
            # TypeError (tuples are immutable). Build a new tuple instead.
            padding = (padding[0] * self.dilation, padding[1] * self.dilation)
        weight_dict = OrderedDict()
        weight_dict['conv'] = nn.Conv2d(
            self.in_channels, self.out_channels, kernel_size=self.kernel_size, stride=self.stride, padding=padding,
            dilation=self.dilation, groups=self.groups, bias=self.bias
        )
        if self.has_shuffle and self.groups > 1:
            weight_dict['shuffle'] = ShuffleLayer(self.groups)
        return weight_dict
class IdentityLayer(Base2DLayer):
    """Pass-through layer: contributes no weight op, so it applies only the
    bn/act/dropout configured on the base class (all disabled by default)."""

    def __init__(self, in_channels, out_channels,
                 use_bn=False, act_func=None, dropout_rate=0, ops_order='weight_bn_act'):
        super(IdentityLayer, self).__init__(in_channels, out_channels, use_bn, act_func, dropout_rate, ops_order)

    def weight_op(self):
        # Identity has no weights.
        return None
class LinearLayer(nn.Module):
    """Linear head with optional bn/act/dropout assembled in a configurable
    order (mirrors Base2DLayer, but for 1-D feature vectors).

    Parameters
    ----------
    in_features, out_features : int
        linear layer dimensions.
    bias : bool
        whether the linear layer has a bias term.
    use_bn : bool
        whether to insert a BatchNorm1d.
    act_func : str or None
        activation name understood by ``build_activation``.
    dropout_rate : float
        Dropout probability; 0 disables dropout.
    ops_order : str
        underscore-separated op order, e.g. 'weight_bn_act'.
    """

    def __init__(self, in_features, out_features, bias=True,
                 use_bn=False, act_func=None, dropout_rate=0, ops_order='weight_bn_act'):
        super(LinearLayer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.bias = bias
        self.use_bn = use_bn
        self.act_func = act_func
        self.dropout_rate = dropout_rate
        self.ops_order = ops_order

        """ modules """
        modules = {}
        # batch norm: channel count depends on whether bn precedes the weight op
        if self.use_bn:
            if self.bn_before_weight:
                modules['bn'] = nn.BatchNorm1d(in_features)
            else:
                modules['bn'] = nn.BatchNorm1d(out_features)
        else:
            modules['bn'] = None
        # activation; in-place only when activation is not the first op
        modules['act'] = build_activation(self.act_func, self.ops_list[0] != 'act')
        # dropout
        if self.dropout_rate > 0:
            modules['dropout'] = nn.Dropout(self.dropout_rate, inplace=True)
        else:
            modules['dropout'] = None
        # linear
        modules['weight'] = {'linear': nn.Linear(self.in_features, self.out_features, self.bias)}
        # add modules in ops_order; dropout is registered just before weight
        for op in self.ops_list:
            if modules[op] is None:
                continue
            elif op == 'weight':
                if modules['dropout'] is not None:
                    self.add_module('dropout', modules['dropout'])
                for key in modules['weight']:
                    self.add_module(key, modules['weight'][key])
            else:
                self.add_module(op, modules[op])

    @property
    def ops_list(self):
        # e.g. 'weight_bn_act' -> ['weight', 'bn', 'act']
        return self.ops_order.split('_')

    @property
    def bn_before_weight(self):
        # True iff 'bn' appears before 'weight' in ops_order.
        for op in self.ops_list:
            if op == 'bn':
                return True
            elif op == 'weight':
                return False
        raise ValueError('Invalid ops_order: %s' % self.ops_order)

    def forward(self, x):
        # Registration order in __init__ defines execution order here.
        for module in self._modules.values():
            x = module(x)
        return x

    @staticmethod
    def is_zero_layer():
        return False
class MBInvertedConvLayer(nn.Module):
    """
    Mobile inverted bottleneck conv: 1x1 expansion -> depthwise conv -> 1x1
    linear projection.
    This layer is introduced in section 4.2 in the paper https://arxiv.org/pdf/1812.00332.pdf
    """

    def __init__(self, in_channels, out_channels,
                 kernel_size=3, stride=1, expand_ratio=6, mid_channels=None):
        super(MBInvertedConvLayer, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.expand_ratio = expand_ratio
        self.mid_channels = mid_channels

        # Hidden width: an explicit mid_channels wins, otherwise expand in_channels.
        if self.mid_channels is None:
            hidden_dim = round(self.in_channels * self.expand_ratio)
        else:
            hidden_dim = self.mid_channels

        # 1x1 expansion; skipped entirely when there is nothing to expand.
        self.inverted_bottleneck = None
        if self.expand_ratio != 1:
            self.inverted_bottleneck = nn.Sequential(OrderedDict([
                ('conv', nn.Conv2d(self.in_channels, hidden_dim, 1, 1, 0, bias=False)),
                ('bn', nn.BatchNorm2d(hidden_dim)),
                ('act', nn.ReLU6(inplace=True)),
            ]))

        # Depthwise conv (groups == channels) with 'same' padding.
        pad = get_same_padding(self.kernel_size)
        self.depth_conv = nn.Sequential(OrderedDict([
            ('conv', nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride, pad, groups=hidden_dim, bias=False)),
            ('bn', nn.BatchNorm2d(hidden_dim)),
            ('act', nn.ReLU6(inplace=True)),
        ]))
        # 1x1 linear projection (no activation).
        self.point_linear = nn.Sequential(OrderedDict([
            ('conv', nn.Conv2d(hidden_dim, out_channels, 1, 1, 0, bias=False)),
            ('bn', nn.BatchNorm2d(out_channels)),
        ]))

    def forward(self, x):
        if self.inverted_bottleneck is not None:
            x = self.inverted_bottleneck(x)
        x = self.depth_conv(x)
        return self.point_linear(x)

    @staticmethod
    def is_zero_layer():
        return False
class ZeroLayer(nn.Module):
    """The 'none' candidate op: outputs zeros shaped like its input.

    ``stride`` is kept for interface parity with the other ops but is unused
    in forward: this layer is only added as a candidate when stride == 1
    (see SearchMobileNet), so no spatial downsampling is needed.
    """

    def __init__(self, stride):
        super(ZeroLayer, self).__init__()
        self.stride = stride

    def forward(self, x):
        # x * 0 (rather than allocating torch.zeros) preserves device/dtype
        # and keeps the op in the autograd graph. The original carried a
        # dead commented-out downsampling implementation as an executed
        # string literal; it has been removed.
        return x * 0

    @staticmethod
    def is_zero_layer():
        return True
import torch.nn as nn
def get_parameters(model, keys=None, mode='include'):
    """Yield model parameters, optionally filtered by name substrings.

    Parameters
    ----------
    model : nn.Module
        the model whose parameters are enumerated.
    keys : list of str or None
        substrings matched against parameter names; None yields everything.
    mode : str
        'include' -> only parameters whose name contains some key;
        'exclude' -> only parameters whose name contains no key.

    Raises
    ------
    ValueError
        if ``keys`` is given and ``mode`` is neither 'include' nor 'exclude'.
    """
    if keys is None:
        for _, param in model.named_parameters():
            yield param
    elif mode == 'include':
        for name, param in model.named_parameters():
            if any(key in name for key in keys):
                yield param
    elif mode == 'exclude':
        for name, param in model.named_parameters():
            if not any(key in name for key in keys):
                yield param
    else:
        raise ValueError('do not support: %s' % mode)
def get_same_padding(kernel_size):
    """Return the padding that preserves spatial size for an odd kernel.

    Accepts an int or a 2-tuple of ints; a tuple is handled per dimension
    and a tuple of paddings is returned.
    """
    if isinstance(kernel_size, tuple):
        assert len(kernel_size) == 2, 'invalid kernel size: %s' % kernel_size
        return get_same_padding(kernel_size[0]), get_same_padding(kernel_size[1])
    assert isinstance(kernel_size, int), 'kernel size should be either `int` or `tuple`'
    assert kernel_size % 2 > 0, 'kernel size should be odd number'
    return kernel_size // 2
def build_activation(act_func, inplace=True):
    """Create an activation module by name.

    ``None`` returns ``None`` (no activation); unknown names raise ValueError.
    ``inplace`` only applies to relu/relu6.
    """
    if act_func is None:
        return None
    if act_func == 'relu':
        return nn.ReLU(inplace=inplace)
    if act_func == 'relu6':
        return nn.ReLU6(inplace=inplace)
    if act_func == 'tanh':
        return nn.Tanh()
    if act_func == 'sigmoid':
        return nn.Sigmoid()
    raise ValueError('do not support: %s' % act_func)
def make_divisible(v, divisor, min_val=None):
    """Round ``v`` to the nearest multiple of ``divisor``, at least ``min_val``.

    This function is taken from the original tf repo; it ensures that all
    layers have a channel number that is divisible by 8:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    """
    min_val = divisor if min_val is None else min_val
    rounded = max(min_val, int(v + divisor / 2) // divisor * divisor)
    # Make sure that rounding down does not go down by more than 10%.
    return rounded + divisor if rounded < 0.9 * v else rounded
import time
import math
from datetime import timedelta
import torch
from torch import nn as nn
from nni.nas.pytorch.utils import AverageMeter
def cross_entropy_with_label_smoothing(pred, target, label_smoothing=0.1):
    """Cross entropy between ``pred`` logits and a label-smoothed target.

    Parameters
    ----------
    pred : torch.Tensor
        logits of shape (batch, n_classes).
    target : torch.Tensor
        integer class indices of shape (batch,).
    label_smoothing : float
        probability mass moved from the true class to a uniform distribution.
    """
    # FIX: make the softmax dimension explicit -- nn.LogSoftmax() with an
    # implicit dim is deprecated; the class dimension here is 1, matching
    # the scatter_(1, ...) and sum(..., 1) below.
    logsoftmax = nn.LogSoftmax(dim=1)
    n_classes = pred.size(1)
    # convert hard labels to one-hot
    target = torch.unsqueeze(target, 1)
    soft_target = torch.zeros_like(pred)
    soft_target.scatter_(1, target, 1)
    # label smoothing: (1 - eps) on the true class, eps / n_classes elsewhere
    soft_target = soft_target * (1 - label_smoothing) + label_smoothing / n_classes
    return torch.mean(torch.sum(- soft_target * logsoftmax(pred), 1))
def accuracy(output, target, topk=(1,)):
    """Return top-k accuracies (as percentage tensors), one per k in ``topk``."""
    maxk = max(topk)
    batch_size = target.size(0)
    # indices of the maxk best predictions, transposed to (maxk, batch)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    # boolean hit matrix against the broadcast targets
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    results = []
    for k in topk:
        hits = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        results.append(hits.mul_(100.0 / batch_size))
    return results
class Retrain:
    """From-scratch trainer for a fixed (exported) architecture.

    Trains with per-batch cosine LR decay and label smoothing, validates
    periodically, and reports validation/test accuracy at the end.
    """

    def __init__(self, model, optimizer, device, data_provider, n_epochs):
        self.model = model
        self.optimizer = optimizer
        self.device = device
        self.train_loader = data_provider.train
        self.valid_loader = data_provider.valid
        self.test_loader = data_provider.test
        self.n_epochs = n_epochs
        self.criterion = nn.CrossEntropyLoss()

    def run(self):
        """Train for n_epochs, then evaluate on the validation and test sets."""
        # DataParallel for multi-GPU; harmless on a single device
        self.model = torch.nn.DataParallel(self.model)
        self.model.to(self.device)
        # train
        self.train()
        # validate
        self.validate(is_test=False)
        # test
        self.validate(is_test=True)

    def train_one_epoch(self, adjust_lr_func, train_log_func, label_smoothing=0.1):
        """Run one training epoch.

        Parameters
        ----------
        adjust_lr_func : callable
            batch index -> new learning rate (also applied to the optimizer).
        train_log_func : callable
            formats a progress line from the running meters.
        label_smoothing : float
            > 0 switches the loss to label-smoothed cross entropy.

        Returns
        -------
        (AverageMeter, AverageMeter)
            running top-1 and top-5 accuracy meters for the epoch.
        """
        batch_time = AverageMeter('batch_time')
        data_time = AverageMeter('data_time')
        losses = AverageMeter('losses')
        top1 = AverageMeter('top1')
        top5 = AverageMeter('top5')
        self.model.train()
        end = time.time()
        for i, (images, labels) in enumerate(self.train_loader):
            data_time.update(time.time() - end)
            # per-batch LR schedule
            new_lr = adjust_lr_func(i)
            images, labels = images.to(self.device), labels.to(self.device)
            output = self.model(images)
            if label_smoothing > 0:
                loss = cross_entropy_with_label_smoothing(output, labels, label_smoothing)
            else:
                loss = self.criterion(output, labels)
            acc1, acc5 = accuracy(output, labels, topk=(1, 5))
            losses.update(loss, images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))
            # compute gradient and do SGD step
            self.model.zero_grad()  # or self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            # log every 10 batches and on the final batch
            if i % 10 == 0 or i + 1 == len(self.train_loader):
                batch_log = train_log_func(i, batch_time, data_time, losses, top1, top5, new_lr)
                print(batch_log)
        return top1, top5

    def train(self, validation_frequency=1):
        """Full training loop; validates every ``validation_frequency`` epochs."""
        best_acc = 0
        nBatch = len(self.train_loader)

        def train_log_func(epoch_, i, batch_time, data_time, losses, top1, top5, lr):
            # one formatted progress line for the current batch
            batch_log = 'Train [{0}][{1}/{2}]\t' \
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                        'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' \
                        'Loss {losses.val:.4f} ({losses.avg:.4f})\t' \
                        'Top-1 acc {top1.val:.3f} ({top1.avg:.3f})'. \
                format(epoch_ + 1, i, nBatch - 1,
                       batch_time=batch_time, data_time=data_time, losses=losses, top1=top1)
            batch_log += '\tTop-5 acc {top5.val:.3f} ({top5.avg:.3f})'.format(top5=top5)
            batch_log += '\tlr {lr:.5f}'.format(lr=lr)
            return batch_log

        def adjust_learning_rate(n_epochs, optimizer, epoch, batch=0, nBatch=None):
            """ adjust learning of a given optimizer and return the new learning rate """
            # cosine annealing from the fixed init_lr down to 0 over all batches
            T_total = n_epochs * nBatch
            T_cur = epoch * nBatch + batch
            # init_lr = 0.05
            new_lr = 0.5 * 0.05 * (1 + math.cos(math.pi * T_cur / T_total))
            for param_group in optimizer.param_groups:
                param_group['lr'] = new_lr
            return new_lr

        for epoch in range(self.n_epochs):
            print('\n', '-' * 30, 'Train epoch: %d' % (epoch + 1), '-' * 30, '\n')
            end = time.time()
            train_top1, train_top5 = self.train_one_epoch(
                lambda i: adjust_learning_rate(self.n_epochs, self.optimizer, epoch, i, nBatch),
                lambda i, batch_time, data_time, losses, top1, top5, new_lr:
                train_log_func(epoch, i, batch_time, data_time, losses, top1, top5, new_lr),
            )
            time_per_epoch = time.time() - end
            seconds_left = int((self.n_epochs - epoch - 1) * time_per_epoch)
            print('Time per epoch: %s, Est. complete in: %s' % (
                str(timedelta(seconds=time_per_epoch)),
                str(timedelta(seconds=seconds_left))))
            if (epoch + 1) % validation_frequency == 0:
                val_loss, val_acc, val_acc5 = self.validate(is_test=False)
                # NOTE(review): is_best is computed but never used -- checkpoint
                # saving appears to be missing or handled elsewhere; confirm.
                is_best = val_acc > best_acc
                best_acc = max(best_acc, val_acc)
                val_log = 'Valid [{0}/{1}]\tloss {2:.3f}\ttop-1 acc {3:.3f} ({4:.3f})'.\
                    format(epoch + 1, self.n_epochs, val_loss, val_acc, best_acc)
                val_log += '\ttop-5 acc {0:.3f}\tTrain top-1 {top1.avg:.3f}\ttop-5 {top5.avg:.3f}'.\
                    format(val_acc5, top1=train_top1, top5=train_top5)
                print(val_log)
            else:
                is_best = False

    def validate(self, is_test=True):
        """Evaluate on the test set (is_test=True) or the validation set.

        Returns
        -------
        (loss_avg, top1_avg, top5_avg)
            averages accumulated by the meters over the whole loader.
        """
        if is_test:
            data_loader = self.test_loader
        else:
            data_loader = self.valid_loader
        self.model.eval()
        batch_time = AverageMeter('batch_time')
        losses = AverageMeter('losses')
        top1 = AverageMeter('top1')
        top5 = AverageMeter('top5')
        end = time.time()
        with torch.no_grad():
            for i, (images, labels) in enumerate(data_loader):
                images, labels = images.to(self.device), labels.to(self.device)
                # compute output
                output = self.model(images)
                loss = self.criterion(output, labels)
                # measure accuracy and record loss
                acc1, acc5 = accuracy(output, labels, topk=(1, 5))
                losses.update(loss, images.size(0))
                top1.update(acc1[0], images.size(0))
                top5.update(acc5[0], images.size(0))
                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()
                if i % 10 == 0 or i + 1 == len(data_loader):
                    if is_test:
                        prefix = 'Test'
                    else:
                        prefix = 'Valid'
                    test_log = prefix + ': [{0}/{1}]\t'\
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'\
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'\
                        'Top-1 acc {top1.val:.3f} ({top1.avg:.3f})'.\
                        format(i, len(data_loader) - 1, batch_time=batch_time, loss=losses, top1=top1)
                    test_log += '\tTop-5 acc {top5.val:.3f} ({top5.avg:.3f})'.format(top5=top5)
                    print(test_log)
        return losses.avg, top1.avg, top5.avg
\ No newline at end of file
...@@ -23,10 +23,13 @@ trial: ...@@ -23,10 +23,13 @@ trial:
memoryMB: 8196 memoryMB: 8196
#The docker image to run nni job on pai #The docker image to run nni job on pai
image: msranni/nni:latest image: msranni/nni:latest
nniManagerNFSMountPath: /home/user/mnt
containerNFSMountPath: /mnt/data/user
paiStoragePlugin: team_wise
paiConfig: paiConfig:
#The username to login pai #The username to login pai
userName: username userName: username
#The password to login pai #The token to login pai
passWord: password token: token
#The host of restful server of pai #The host of restful server of pai
host: 10.10.10.10 host: 10.10.10.10
\ No newline at end of file
authorName: default
experimentName: example_auto-gbdt
trialConcurrency: 1
maxExecDuration: 10h
maxTrialNum: 10
#choice: local, remote, pai
trainingServicePlatform: paiYarn
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
#SMAC (SMAC should be installed through nnictl)
builtinTunerName: TPE
classArgs:
#choice: maximize, minimize
optimize_mode: minimize
trial:
command: python3 main.py
codeDir: .
gpuNum: 0
cpuNum: 1
memoryMB: 8196
#The docker image to run nni job on pai
image: msranni/nni:latest
paiYarnConfig:
#The username to login pai
userName: username
#The password to login pai
passWord: password
#The host of restful server of pai
host: 10.10.10.10
\ No newline at end of file
...@@ -23,10 +23,13 @@ trial: ...@@ -23,10 +23,13 @@ trial:
memoryMB: 8196 memoryMB: 8196
#The docker image to run nni job on pai #The docker image to run nni job on pai
image: msranni/nni:latest image: msranni/nni:latest
nniManagerNFSMountPath: /home/user/mnt
containerNFSMountPath: /mnt/data/user
paiStoragePlugin: team_wise
paiConfig: paiConfig:
#The username to login pai #The username to login pai
userName: username userName: username
#The password to login pai #The token to login pai
passWord: password token: token
#The host of restful server of pai #The host of restful server of pai
host: 10.10.10.10 host: 10.10.10.10
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment