Commit 1011377c authored by qianyj

The source code of NNI for DCU

parent abc22158
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import json
import logging
from argparse import ArgumentParser

import torch
import torch.nn as nn

import datasets
from model import CNN
from nni.nas.pytorch.callbacks import ArchitectureCheckpoint, LRSchedulerCallback
from utils import accuracy

logger = logging.getLogger('nni')

if __name__ == "__main__":
    parser = ArgumentParser("darts")
    parser.add_argument("--layers", default=8, type=int)
    parser.add_argument("--batch-size", default=64, type=int)
    parser.add_argument("--log-frequency", default=10, type=int)
    parser.add_argument("--epochs", default=50, type=int)
    parser.add_argument("--channels", default=16, type=int)
    parser.add_argument("--unrolled", default=False, action="store_true")
    parser.add_argument("--visualization", default=False, action="store_true")
    parser.add_argument("--v1", default=False, action="store_true")
    args = parser.parse_args()

    dataset_train, dataset_valid = datasets.get_dataset("cifar10")
    model = CNN(32, 3, args.channels, 10, args.layers)
    criterion = nn.CrossEntropyLoss()
    optim = torch.optim.SGD(model.parameters(), 0.025, momentum=0.9, weight_decay=3.0E-4)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, args.epochs, eta_min=0.001)

    if args.v1:
        from nni.algorithms.nas.pytorch.darts import DartsTrainer
        trainer = DartsTrainer(model,
                               loss=criterion,
                               metrics=lambda output, target: accuracy(output, target, topk=(1,)),
                               optimizer=optim,
                               num_epochs=args.epochs,
                               dataset_train=dataset_train,
                               dataset_valid=dataset_valid,
                               batch_size=args.batch_size,
                               log_frequency=args.log_frequency,
                               unrolled=args.unrolled,
                               callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("./checkpoints")])
        if args.visualization:
            trainer.enable_visualization()
        trainer.train()
    else:
        from nni.retiarii.oneshot.pytorch import DartsTrainer
        trainer = DartsTrainer(
            model=model,
            loss=criterion,
            metrics=lambda output, target: accuracy(output, target, topk=(1,)),
            optimizer=optim,
            num_epochs=args.epochs,
            dataset=dataset_train,
            batch_size=args.batch_size,
            log_frequency=args.log_frequency,
            unrolled=args.unrolled
        )
        trainer.fit()
        # Export once and reuse the result instead of calling export() three times.
        final_architecture = trainer.export()
        print('Final architecture:', final_architecture)
        json.dump(final_architecture, open('checkpoint.json', 'w'))
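# Possible follow-up (sketch, not part of this commit): the exported
# checkpoint.json can be frozen back into the search space for retraining.
# `fixed_arch` is the NNI 2.x Retiarii API for this; verify the import path
# against the installed NNI version. Left commented out so the search
# script's behavior is unchanged.
#
#     from nni.retiarii import fixed_arch
#     with fixed_arch('checkpoint.json'):
#         final_model = CNN(32, 3, args.channels, 10, args.layers)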
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k."""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()

    # one-hot case
    if target.ndimension() > 1:
        target = target.max(1)[1]

    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = dict()
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res["acc{}".format(k)] = correct_k.mul_(1.0 / batch_size).item()
    return res
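# Minimal smoke test (sketch): `accuracy` expects logits of shape
# (batch, num_classes) and integer (or one-hot) targets. Guarded so that
# importing this module stays side-effect free.
if __name__ == '__main__':
    import torch
    demo_logits = torch.randn(8, 10)
    demo_labels = torch.randint(0, 10, (8,))
    print(accuracy(demo_logits, demo_labels, topk=(1, 5)))  # e.g. {'acc1': ..., 'acc5': ...}; values vary per run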
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import tensorflow as tf

def get_dataset():
    (x_train, y_train), (x_valid, y_valid) = tf.keras.datasets.cifar10.load_data()
    x_train, x_valid = x_train / 255.0, x_valid / 255.0
    train_set = (x_train, y_train)
    valid_set = (x_valid, y_valid)
    return train_set, valid_set
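# Quick shape check (sketch, guarded): CIFAR-10 as returned by Keras.
if __name__ == '__main__':
    train_set, valid_set = get_dataset()
    print(train_set[0].shape, valid_set[0].shape)  # (50000, 32, 32, 3) (10000, 32, 32, 3)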
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import (
    AveragePooling2D,
    BatchNormalization,
    Conv2D,
    Dense,
    Dropout,
    GlobalAveragePooling2D,
    MaxPool2D,
    ReLU,
    SeparableConv2D,
)

from nni.nas.tensorflow.mutables import InputChoice, LayerChoice, MutableScope

def build_conv(filters, kernel_size, name=None):
    return Sequential([
        Conv2D(filters, kernel_size=1, use_bias=False),
        BatchNormalization(trainable=False),
        ReLU(),
        Conv2D(filters, kernel_size, padding='same'),
        BatchNormalization(trainable=False),
        ReLU(),
    ], name)

def build_separable_conv(filters, kernel_size, name=None):
    return Sequential([
        Conv2D(filters, kernel_size=1, use_bias=False),
        BatchNormalization(trainable=False),
        ReLU(),
        SeparableConv2D(filters, kernel_size, padding='same', use_bias=False),
        Conv2D(filters, kernel_size=1, use_bias=False),
        BatchNormalization(trainable=False),
        ReLU(),
    ], name)

def build_avg_pool(filters, name=None):
    return Sequential([
        Conv2D(filters, kernel_size=1, use_bias=False),
        BatchNormalization(trainable=False),
        ReLU(),
        AveragePooling2D(pool_size=3, strides=1, padding='same'),
        BatchNormalization(trainable=False),
    ], name)

def build_max_pool(filters, name=None):
    return Sequential([
        Conv2D(filters, kernel_size=1, use_bias=False),
        BatchNormalization(trainable=False),
        ReLU(),
        MaxPool2D(pool_size=3, strides=1, padding='same'),
        BatchNormalization(trainable=False),
    ], name)

class FactorizedReduce(Model):
    def __init__(self, filters):
        super().__init__()
        self.conv1 = Conv2D(filters // 2, kernel_size=1, strides=2, use_bias=False)
        self.conv2 = Conv2D(filters // 2, kernel_size=1, strides=2, use_bias=False)
        self.bn = BatchNormalization(trainable=False)

    def call(self, x):
        out1 = self.conv1(x)
        out2 = self.conv2(x[:, 1:, 1:, :])
        out = tf.concat([out1, out2], axis=3)
        out = self.bn(out)
        return out

class ENASLayer(MutableScope):
    def __init__(self, key, prev_labels, filters):
        super().__init__(key)
        self.mutable = LayerChoice([
            build_conv(filters, 3, 'conv3'),
            build_separable_conv(filters, 3, 'sepconv3'),
            build_conv(filters, 5, 'conv5'),
            build_separable_conv(filters, 5, 'sepconv5'),
            build_avg_pool(filters, 'avgpool'),
            build_max_pool(filters, 'maxpool'),
        ])
        if len(prev_labels) > 0:
            self.skipconnect = InputChoice(choose_from=prev_labels, n_chosen=None)
        else:
            self.skipconnect = None
        self.batch_norm = BatchNormalization(trainable=False)

    def call(self, prev_layers):
        out = self.mutable(prev_layers[-1])
        if self.skipconnect is not None:
            connection = self.skipconnect(prev_layers[:-1])
            if connection is not None:
                out += connection
        return self.batch_norm(out)

class GeneralNetwork(Model):
    def __init__(self, num_layers=12, filters=24, num_classes=10, dropout_rate=0.0):
        super().__init__()
        self.num_layers = num_layers

        self.stem = Sequential([
            Conv2D(filters, kernel_size=3, padding='same', use_bias=False),
            BatchNormalization()
        ])

        labels = ['layer_{}'.format(i) for i in range(num_layers)]
        self.enas_layers = []
        for i in range(num_layers):
            layer = ENASLayer(labels[i], labels[:i], filters)
            self.enas_layers.append(layer)

        pool_num = 2
        self.pool_distance = num_layers // (pool_num + 1)
        self.pool_layers = [FactorizedReduce(filters) for _ in range(pool_num)]

        self.gap = GlobalAveragePooling2D()
        self.dropout = Dropout(dropout_rate)
        self.dense = Dense(num_classes)

    def call(self, x):
        cur = self.stem(x)
        prev_outputs = [cur]

        for i, layer in enumerate(self.enas_layers):
            # Downsample every accumulated output when crossing a pooling
            # boundary so later skip connections keep matching spatial sizes.
            if i > 0 and i % self.pool_distance == 0:
                pool = self.pool_layers[i // self.pool_distance - 1]
                prev_outputs = [pool(tensor) for tensor in prev_outputs]
                cur = prev_outputs[-1]
            cur = layer(prev_outputs)
            prev_outputs.append(cur)

        cur = self.gap(cur)
        cur = self.dropout(cur)
        logits = self.dense(cur)
        return logits
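# Sanity sketch (guarded): FactorizedReduce halves the spatial resolution with
# two stride-2 1x1 convs, offset by one pixel and concatenated on channels.
if __name__ == '__main__':
    demo_in = tf.random.uniform((2, 32, 32, 8))
    demo_out = FactorizedReduce(16)(demo_in)
    print(demo_out.shape)  # (2, 16, 16, 16)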
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import (
    AveragePooling2D,
    BatchNormalization,
    Conv2D,
    Dense,
    Dropout,
    GlobalAveragePooling2D,
    MaxPool2D,
    ReLU,
    SeparableConv2D,
)

from nni.nas.tensorflow.mutables import InputChoice, LayerChoice, MutableScope

def build_conv_1x1(filters, name=None):
    return Sequential([
        Conv2D(filters, kernel_size=1, use_bias=False),
        BatchNormalization(trainable=False),
        ReLU(),
    ], name)

def build_sep_conv(filters, kernel_size, name=None):
    return Sequential([
        ReLU(),
        SeparableConv2D(filters, kernel_size, padding='same'),
        BatchNormalization(trainable=True),
    ], name)

class FactorizedReduce(Model):
    def __init__(self, filters):
        super().__init__()
        self.conv1 = Conv2D(filters // 2, kernel_size=1, strides=2, use_bias=False)
        self.conv2 = Conv2D(filters // 2, kernel_size=1, strides=2, use_bias=False)
        self.bn = BatchNormalization(trainable=False)

    def call(self, x):
        out1 = self.conv1(x)
        out2 = self.conv2(x[:, 1:, 1:, :])
        out = tf.concat([out1, out2], axis=3)
        out = self.bn(out)
        return out

class ReductionLayer(Model):
    def __init__(self, filters):
        super().__init__()
        self.reduce0 = FactorizedReduce(filters)
        self.reduce1 = FactorizedReduce(filters)

    def call(self, prevprev, prev):
        return self.reduce0(prevprev), self.reduce1(prev)

class Calibration(Model):
    def __init__(self, filters):
        super().__init__()
        self.filters = filters
        self.process = None

    def build(self, shape):
        assert len(shape) == 4  # batch_size, width, height, filters
        if shape[3] != self.filters:
            self.process = build_conv_1x1(self.filters)

    def call(self, x):
        if self.process is None:
            return x
        return self.process(x)

class Cell(Model):
    def __init__(self, cell_name, prev_labels, filters):
        super().__init__()
        self.input_choice = InputChoice(choose_from=prev_labels, n_chosen=1,
                                        return_mask=True, key=cell_name + '_input')
        self.op_choice = LayerChoice([
            build_sep_conv(filters, 3),
            build_sep_conv(filters, 5),
            AveragePooling2D(pool_size=3, strides=1, padding='same'),
            MaxPool2D(pool_size=3, strides=1, padding='same'),
            Sequential(),  # identity
        ], key=cell_name + '_op')

    def call(self, prev_layers):
        chosen_input, chosen_mask = self.input_choice(prev_layers)
        cell_out = self.op_choice(chosen_input)
        return cell_out, chosen_mask

class Node(MutableScope):
    def __init__(self, node_name, prev_node_names, filters):
        super().__init__(node_name)
        self.cell_x = Cell(node_name + '_x', prev_node_names, filters)
        self.cell_y = Cell(node_name + '_y', prev_node_names, filters)

    def call(self, prev_layers):
        out_x, mask_x = self.cell_x(prev_layers)
        out_y, mask_y = self.cell_y(prev_layers)
        return out_x + out_y, mask_x | mask_y

class ENASLayer(Model):
    def __init__(self, num_nodes, filters, reduction):
        super().__init__()
        self.preproc0 = Calibration(filters)
        self.preproc1 = Calibration(filters)

        self.nodes = []
        node_labels = [InputChoice.NO_KEY, InputChoice.NO_KEY]
        name_prefix = 'reduce' if reduction else 'normal'
        for i in range(num_nodes):
            node_labels.append('{}_node_{}'.format(name_prefix, i))
            self.nodes.append(Node(node_labels[-1], node_labels[:-1], filters))

        self.conv_ops = [Conv2D(filters, kernel_size=1, padding='same', use_bias=False)
                         for _ in range(num_nodes + 2)]
        self.bn = BatchNormalization(trainable=False)

    def call(self, prevprev, prev):
        prev_nodes_out = [self.preproc0(prevprev), self.preproc1(prev)]
        nodes_used_mask = tf.zeros(len(self.nodes) + 2, dtype=tf.bool)
        for i, node in enumerate(self.nodes):
            node_out, mask = node(prev_nodes_out)
            nodes_used_mask |= tf.pad(mask, [[0, nodes_used_mask.shape[0] - mask.shape[0]]])
            prev_nodes_out.append(node_out)

        # Loose ends (nodes never consumed by a later node) are projected
        # with a 1x1 conv and summed into the cell output.
        outputs = []
        for used, out, conv in zip(nodes_used_mask.numpy(), prev_nodes_out, self.conv_ops):
            if not used:
                outputs.append(conv(out))
        out = tf.add_n(outputs)
        return prev, self.bn(out)

class MicroNetwork(Model):
    def __init__(self, num_layers=6, num_nodes=5, out_channels=20, num_classes=10, dropout_rate=0.1):
        super().__init__()
        self.num_layers = num_layers

        self.stem = Sequential([
            Conv2D(out_channels * 3, kernel_size=3, padding='same', use_bias=False),
            BatchNormalization(),
        ])

        pool_distance = num_layers // 3
        pool_layer_indices = [pool_distance, 2 * pool_distance + 1]
        self.enas_layers = []
        filters = out_channels
        for i in range(num_layers + 2):
            if i in pool_layer_indices:
                reduction = True
                filters *= 2
                self.enas_layers.append(ReductionLayer(filters))
            else:
                reduction = False
                self.enas_layers.append(ENASLayer(num_nodes, filters, reduction))

        self.gap = GlobalAveragePooling2D()
        self.dropout = Dropout(dropout_rate)
        self.dense = Dense(num_classes)

    def call(self, x):
        prev = cur = self.stem(x)
        for layer in self.enas_layers:
            prev, cur = layer(prev, cur)
        cur = tf.keras.activations.relu(cur)
        cur = self.gap(cur)
        cur = self.dropout(cur)
        logits = self.dense(cur)
        return logits
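# Sketch (guarded): Calibration is the identity when channel counts already
# match, and a 1x1 conv projection otherwise.
if __name__ == '__main__':
    demo_in = tf.random.uniform((2, 8, 8, 16))
    print(Calibration(16)(demo_in).shape)  # (2, 8, 8, 16) -- identity path
    print(Calibration(32)(demo_in).shape)  # (2, 8, 8, 32) -- projected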
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from tensorflow.keras.losses import Reduction, SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import SGD

from nni.algorithms.nas.tensorflow import enas

import datasets
from macro import GeneralNetwork
from micro import MicroNetwork
from utils import accuracy, accuracy_metrics

# TODO: argparse
dataset_train, dataset_valid = datasets.get_dataset()
# model = GeneralNetwork()
model = MicroNetwork()
loss = SparseCategoricalCrossentropy(from_logits=True, reduction=Reduction.NONE)
optimizer = SGD(learning_rate=0.05, momentum=0.9)

trainer = enas.EnasTrainer(model,
                           loss=loss,
                           metrics=accuracy_metrics,
                           reward_function=accuracy,
                           optimizer=optimizer,
                           batch_size=64,
                           num_epochs=310,
                           dataset_train=dataset_train,
                           dataset_valid=dataset_valid)
trainer.train()
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import tensorflow as tf

def accuracy_metrics(y_true, logits):
    return {'enas_acc': accuracy(y_true, logits)}

def accuracy(y_true, logits):
    # y_true: shape=(batch_size,) or (batch_size, 1), integer labels
    # logits: shape=(batch_size, num_classes), float scores
    # returns a Python float
    batch_size = y_true.shape[0]
    y_true = tf.squeeze(y_true)
    y_pred = tf.math.argmax(logits, axis=1)
    y_pred = tf.cast(y_pred, y_true.dtype)
    equal = tf.cast(y_pred == y_true, tf.int32)
    return tf.math.reduce_sum(equal).numpy() / batch_size
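# Worked example (guarded): two samples, second prediction wrong -> 0.5.
if __name__ == '__main__':
    demo_truth = tf.constant([[0], [0]])
    demo_logits = tf.constant([[2.0, 1.0], [0.1, 3.0]])
    print(accuracy(demo_truth, demo_logits))  # 0.5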
[Documentation](https://nni.readthedocs.io/en/latest/NAS/ENAS.html)
[Documentation (Chinese)](https://nni.readthedocs.io/zh/latest/NAS/ENAS.html)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from torchvision import transforms
from torchvision.datasets import CIFAR10

def get_dataset(cls):
    MEAN = [0.49139968, 0.48215827, 0.44653124]
    STD = [0.24703233, 0.24348505, 0.26158768]
    transf = [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip()
    ]
    normalize = [
        transforms.ToTensor(),
        transforms.Normalize(MEAN, STD)
    ]
    train_transform = transforms.Compose(transf + normalize)
    valid_transform = transforms.Compose(normalize)

    if cls == "cifar10":
        dataset_train = CIFAR10(root="./data", train=True, download=True, transform=train_transform)
        dataset_valid = CIFAR10(root="./data", train=False, download=True, transform=valid_transform)
    else:
        raise NotImplementedError
    return dataset_train, dataset_valid
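# Quick check (sketch, guarded): downloads CIFAR-10 to ./data on first run.
if __name__ == '__main__':
    train, valid = get_dataset("cifar10")
    print(len(train), len(valid))  # 50000 10000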
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch.nn as nn

from nni.nas.pytorch import mutables
from ops import FactorizedReduce, ConvBranch, PoolBranch

class ENASLayer(mutables.MutableScope):
    def __init__(self, key, prev_labels, in_filters, out_filters):
        super().__init__(key)
        self.in_filters = in_filters
        self.out_filters = out_filters
        self.mutable = mutables.LayerChoice([
            ConvBranch(in_filters, out_filters, 3, 1, 1, separable=False),
            ConvBranch(in_filters, out_filters, 3, 1, 1, separable=True),
            ConvBranch(in_filters, out_filters, 5, 1, 2, separable=False),
            ConvBranch(in_filters, out_filters, 5, 1, 2, separable=True),
            PoolBranch('avg', in_filters, out_filters, 3, 1, 1),
            PoolBranch('max', in_filters, out_filters, 3, 1, 1)
        ])
        if len(prev_labels) > 0:
            self.skipconnect = mutables.InputChoice(choose_from=prev_labels, n_chosen=None)
        else:
            self.skipconnect = None
        self.batch_norm = nn.BatchNorm2d(out_filters, affine=False)

    def forward(self, prev_layers):
        out = self.mutable(prev_layers[-1])
        if self.skipconnect is not None:
            connection = self.skipconnect(prev_layers[:-1])
            if connection is not None:
                out = out + connection
        return self.batch_norm(out)

class GeneralNetwork(nn.Module):
    def __init__(self, num_layers=12, out_filters=24, in_channels=3, num_classes=10,
                 dropout_rate=0.0):
        super().__init__()
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.out_filters = out_filters

        self.stem = nn.Sequential(
            nn.Conv2d(in_channels, out_filters, 3, 1, 1, bias=False),
            nn.BatchNorm2d(out_filters)
        )

        pool_distance = self.num_layers // 3
        self.pool_layers_idx = [pool_distance - 1, 2 * pool_distance - 1]
        self.dropout_rate = dropout_rate
        self.dropout = nn.Dropout(self.dropout_rate)

        self.layers = nn.ModuleList()
        self.pool_layers = nn.ModuleList()
        labels = []
        for layer_id in range(self.num_layers):
            labels.append("layer_{}".format(layer_id))
            if layer_id in self.pool_layers_idx:
                self.pool_layers.append(FactorizedReduce(self.out_filters, self.out_filters))
            self.layers.append(ENASLayer(labels[-1], labels[:-1], self.out_filters, self.out_filters))

        self.gap = nn.AdaptiveAvgPool2d(1)
        self.dense = nn.Linear(self.out_filters, self.num_classes)

    def forward(self, x):
        bs = x.size(0)
        cur = self.stem(x)

        layers = [cur]
        for layer_id in range(self.num_layers):
            cur = self.layers[layer_id](layers)
            layers.append(cur)
            if layer_id in self.pool_layers_idx:
                # Downsample every accumulated output so later skip connections
                # keep matching spatial sizes.
                for i, layer in enumerate(layers):
                    layers[i] = self.pool_layers[self.pool_layers_idx.index(layer_id)](layer)
                cur = layers[-1]

        cur = self.gap(cur).view(bs, -1)
        cur = self.dropout(cur)
        logits = self.dense(cur)
        return logits
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch
import torch.nn as nn
import torch.nn.functional as F

from nni.nas.pytorch import mutables
from ops import FactorizedReduce, StdConv, SepConvBN, Pool

class AuxiliaryHead(nn.Module):
    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.pooling = nn.Sequential(
            nn.ReLU(),
            nn.AvgPool2d(5, 3, 2)
        )
        self.proj = nn.Sequential(
            StdConv(in_channels, 128),
            StdConv(128, 768)
        )
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(768, num_classes, bias=False)

    def forward(self, x):
        bs = x.size(0)
        x = self.pooling(x)
        x = self.proj(x)
        x = self.avg_pool(x).view(bs, -1)
        x = self.fc(x)
        return x

class Cell(nn.Module):
    def __init__(self, cell_name, prev_labels, channels):
        super().__init__()
        self.input_choice = mutables.InputChoice(choose_from=prev_labels, n_chosen=1, return_mask=True,
                                                 key=cell_name + "_input")
        self.op_choice = mutables.LayerChoice([
            SepConvBN(channels, channels, 3, 1),
            SepConvBN(channels, channels, 5, 2),
            Pool("avg", 3, 1, 1),
            Pool("max", 3, 1, 1),
            nn.Identity()
        ], key=cell_name + "_op")

    def forward(self, prev_layers):
        # Imported inside forward to keep module import free of the Retiarii dependency.
        from nni.retiarii.oneshot.pytorch.random import PathSamplingInputChoice
        out = self.input_choice(prev_layers)
        if isinstance(self.input_choice, PathSamplingInputChoice):
            # Retiarii pattern
            return out, self.input_choice.mask
        else:
            chosen_input, chosen_mask = out
            cell_out = self.op_choice(chosen_input)
            return cell_out, chosen_mask

class Node(mutables.MutableScope):
    def __init__(self, node_name, prev_node_names, channels):
        super().__init__(node_name)
        self.cell_x = Cell(node_name + "_x", prev_node_names, channels)
        self.cell_y = Cell(node_name + "_y", prev_node_names, channels)

    def forward(self, prev_layers):
        out_x, mask_x = self.cell_x(prev_layers)
        out_y, mask_y = self.cell_y(prev_layers)
        return out_x + out_y, mask_x | mask_y

class Calibration(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.process = None
        if in_channels != out_channels:
            self.process = StdConv(in_channels, out_channels)

    def forward(self, x):
        if self.process is None:
            return x
        return self.process(x)

class ReductionLayer(nn.Module):
    def __init__(self, in_channels_pp, in_channels_p, out_channels):
        super().__init__()
        self.reduce0 = FactorizedReduce(in_channels_pp, out_channels, affine=False)
        self.reduce1 = FactorizedReduce(in_channels_p, out_channels, affine=False)

    def forward(self, pprev, prev):
        return self.reduce0(pprev), self.reduce1(prev)

class ENASLayer(nn.Module):
    def __init__(self, num_nodes, in_channels_pp, in_channels_p, out_channels, reduction):
        super().__init__()
        self.preproc0 = Calibration(in_channels_pp, out_channels)
        self.preproc1 = Calibration(in_channels_p, out_channels)

        self.num_nodes = num_nodes
        name_prefix = "reduce" if reduction else "normal"
        self.nodes = nn.ModuleList()
        node_labels = [mutables.InputChoice.NO_KEY, mutables.InputChoice.NO_KEY]
        for i in range(num_nodes):
            node_labels.append("{}_node_{}".format(name_prefix, i))
            self.nodes.append(Node(node_labels[-1], node_labels[:-1], out_channels))

        self.final_conv_w = nn.Parameter(torch.zeros(out_channels, self.num_nodes + 2, out_channels, 1, 1),
                                         requires_grad=True)
        self.bn = nn.BatchNorm2d(out_channels, affine=False)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_normal_(self.final_conv_w)

    def forward(self, pprev, prev):
        pprev_, prev_ = self.preproc0(pprev), self.preproc1(prev)

        prev_nodes_out = [pprev_, prev_]
        nodes_used_mask = torch.zeros(self.num_nodes + 2, dtype=torch.bool, device=prev.device)
        for i in range(self.num_nodes):
            node_out, mask = self.nodes[i](prev_nodes_out)
            nodes_used_mask[:mask.size(0)] |= mask.to(node_out.device)
            prev_nodes_out.append(node_out)

        # Concatenate loose ends (nodes not consumed by any later node) and
        # project them back to out_channels with a 1x1 convolution.
        unused_nodes = torch.cat([out for used, out in zip(nodes_used_mask, prev_nodes_out) if not used], 1)
        unused_nodes = F.relu(unused_nodes)
        conv_weight = self.final_conv_w[:, ~nodes_used_mask, :, :, :]
        conv_weight = conv_weight.view(conv_weight.size(0), -1, 1, 1)
        out = F.conv2d(unused_nodes, conv_weight)
        return prev, self.bn(out)

class MicroNetwork(nn.Module):
    def __init__(self, num_layers=2, num_nodes=5, out_channels=24, in_channels=3, num_classes=10,
                 dropout_rate=0.0, use_aux_heads=False):
        super().__init__()
        self.num_layers = num_layers
        self.use_aux_heads = use_aux_heads

        self.stem = nn.Sequential(
            nn.Conv2d(in_channels, out_channels * 3, 3, 1, 1, bias=False),
            nn.BatchNorm2d(out_channels * 3)
        )

        pool_distance = self.num_layers // 3
        pool_layers = [pool_distance, 2 * pool_distance + 1]
        self.dropout = nn.Dropout(dropout_rate)

        self.layers = nn.ModuleList()
        c_pp = c_p = out_channels * 3
        c_cur = out_channels
        for layer_id in range(self.num_layers + 2):
            reduction = False
            if layer_id in pool_layers:
                c_cur, reduction = c_p * 2, True
                self.layers.append(ReductionLayer(c_pp, c_p, c_cur))
                c_pp = c_p = c_cur
            self.layers.append(ENASLayer(num_nodes, c_pp, c_p, c_cur, reduction))
            if self.use_aux_heads and layer_id == pool_layers[-1] + 1:
                self.layers.append(AuxiliaryHead(c_cur, num_classes))
            c_pp, c_p = c_p, c_cur

        self.gap = nn.AdaptiveAvgPool2d(1)
        self.dense = nn.Linear(c_cur, num_classes)

        self.reset_parameters()

    def reset_parameters(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)

    def forward(self, x):
        bs = x.size(0)
        prev = cur = self.stem(x)
        aux_logits = None

        for layer in self.layers:
            if isinstance(layer, AuxiliaryHead):
                if self.training:
                    aux_logits = layer(cur)
            else:
                prev, cur = layer(prev, cur)

        cur = self.gap(F.relu(cur)).view(bs, -1)
        cur = self.dropout(cur)
        logits = self.dense(cur)

        if aux_logits is not None:
            return logits, aux_logits
        return logits
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch
import torch.nn as nn

class StdConv(nn.Module):
    def __init__(self, C_in, C_out):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(C_out, affine=False),
            nn.ReLU()
        )

    def forward(self, x):
        return self.conv(x)

class PoolBranch(nn.Module):
    def __init__(self, pool_type, C_in, C_out, kernel_size, stride, padding, affine=False):
        super().__init__()
        self.preproc = StdConv(C_in, C_out)
        self.pool = Pool(pool_type, kernel_size, stride, padding)
        self.bn = nn.BatchNorm2d(C_out, affine=affine)

    def forward(self, x):
        out = self.preproc(x)
        out = self.pool(out)
        out = self.bn(out)
        return out

class SeparableConv(nn.Module):
    def __init__(self, C_in, C_out, kernel_size, stride, padding):
        super().__init__()
        self.depthwise = nn.Conv2d(C_in, C_in, kernel_size=kernel_size, padding=padding, stride=stride,
                                   groups=C_in, bias=False)
        self.pointwise = nn.Conv2d(C_in, C_out, kernel_size=1, bias=False)

    def forward(self, x):
        out = self.depthwise(x)
        out = self.pointwise(out)
        return out

class ConvBranch(nn.Module):
    def __init__(self, C_in, C_out, kernel_size, stride, padding, separable):
        super().__init__()
        self.preproc = StdConv(C_in, C_out)
        if separable:
            self.conv = SeparableConv(C_out, C_out, kernel_size, stride, padding)
        else:
            self.conv = nn.Conv2d(C_out, C_out, kernel_size, stride=stride, padding=padding)
        self.postproc = nn.Sequential(
            nn.BatchNorm2d(C_out, affine=False),
            nn.ReLU()
        )

    def forward(self, x):
        out = self.preproc(x)
        out = self.conv(out)
        out = self.postproc(out)
        return out

class FactorizedReduce(nn.Module):
    def __init__(self, C_in, C_out, affine=False):
        super().__init__()
        self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
        self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
        self.bn = nn.BatchNorm2d(C_out, affine=affine)

    def forward(self, x):
        out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1)
        out = self.bn(out)
        return out

class Pool(nn.Module):
    def __init__(self, pool_type, kernel_size, stride, padding):
        super().__init__()
        if pool_type.lower() == 'max':
            self.pool = nn.MaxPool2d(kernel_size, stride, padding)
        elif pool_type.lower() == 'avg':
            self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False)
        else:
            raise ValueError("unsupported pool type: {}".format(pool_type))

    def forward(self, x):
        return self.pool(x)

class SepConvBN(nn.Module):
    def __init__(self, C_in, C_out, kernel_size, padding):
        super().__init__()
        self.relu = nn.ReLU()
        self.conv = SeparableConv(C_in, C_out, kernel_size, 1, padding)
        self.bn = nn.BatchNorm2d(C_out, affine=True)

    def forward(self, x):
        x = self.relu(x)
        x = self.conv(x)
        x = self.bn(x)
        return x
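# Sanity sketch (guarded): FactorizedReduce halves height/width while mapping
# C_in -> C_out via two offset stride-2 1x1 convs concatenated on channels.
if __name__ == '__main__':
    demo_in = torch.randn(2, 3, 32, 32)
    print(FactorizedReduce(3, 8)(demo_in).shape)  # torch.Size([2, 8, 16, 16])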
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
from argparse import ArgumentParser

import torch
import torch.nn as nn

import datasets
from macro import GeneralNetwork
from micro import MicroNetwork
from nni.algorithms.nas.pytorch import enas
from nni.nas.pytorch.callbacks import (ArchitectureCheckpoint,
                                       LRSchedulerCallback)
from utils import accuracy, reward_accuracy

logger = logging.getLogger('nni')

if __name__ == "__main__":
    parser = ArgumentParser("enas")
    parser.add_argument("--batch-size", default=128, type=int)
    parser.add_argument("--log-frequency", default=10, type=int)
    parser.add_argument("--search-for", choices=["macro", "micro"], default="macro")
    parser.add_argument("--epochs", default=None, type=int, help="Number of epochs (default: macro 310, micro 150)")
    parser.add_argument("--visualization", default=False, action="store_true")
    parser.add_argument("--v1", default=False, action="store_true")
    args = parser.parse_args()

    dataset_train, dataset_valid = datasets.get_dataset("cifar10")
    mutator = None
    ctrl_kwargs = {}
    if args.search_for == "macro":
        model = GeneralNetwork()
        num_epochs = args.epochs or 310
    elif args.search_for == "micro":
        model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, use_aux_heads=False)
        num_epochs = args.epochs or 150
        if args.v1:
            mutator = enas.EnasMutator(model, tanh_constant=1.1, cell_exit_extra_step=True)
        else:
            ctrl_kwargs = {"tanh_constant": 1.1}
    else:
        raise AssertionError("unexpected value of --search-for")

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), 0.05, momentum=0.9, weight_decay=1.0E-4)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0.001)

    if args.v1:
        trainer = enas.EnasTrainer(model,
                                   loss=criterion,
                                   metrics=accuracy,
                                   reward_function=reward_accuracy,
                                   optimizer=optimizer,
                                   callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("./checkpoints")],
                                   batch_size=args.batch_size,
                                   num_epochs=num_epochs,
                                   dataset_train=dataset_train,
                                   dataset_valid=dataset_valid,
                                   log_frequency=args.log_frequency,
                                   mutator=mutator)
        if args.visualization:
            trainer.enable_visualization()
        trainer.train()
    else:
        from nni.retiarii.oneshot.pytorch.enas import EnasTrainer
        trainer = EnasTrainer(model,
                              loss=criterion,
                              metrics=accuracy,
                              reward_function=reward_accuracy,
                              optimizer=optimizer,
                              batch_size=args.batch_size,
                              num_epochs=num_epochs,
                              dataset=dataset_train,
                              log_frequency=args.log_frequency,
                              ctrl_kwargs=ctrl_kwargs)
        trainer.fit()
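# Possible follow-up (sketch, not part of this commit): Retiarii one-shot
# trainers also expose export() for the best architecture found; verify
# against the installed NNI version before relying on it.
#
#     import json
#     json.dump(trainer.export(), open('checkpoint.json', 'w'))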
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch

def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k."""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()

    # one-hot case
    if target.ndimension() > 1:
        target = target.max(1)[1]

    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = dict()
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res["acc{}".format(k)] = correct_k.mul_(1.0 / batch_size).item()
    return res

def reward_accuracy(output, target, topk=(1,)):
    batch_size = target.size(0)
    _, predicted = torch.max(output.data, 1)
    return (predicted == target).sum().item() / batch_size
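# Worked example (guarded): two of three predictions correct -> reward 2/3.
if __name__ == '__main__':
    demo_logits = torch.tensor([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]])
    demo_target = torch.tensor([0, 1, 1])
    print(reward_accuracy(demo_logits, demo_target))  # 0.666...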
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import (AveragePooling2D, BatchNormalization, Conv2D, Dense, MaxPool2D)
from tensorflow.keras.losses import Reduction, SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import SGD

from nni.nas.tensorflow.mutables import LayerChoice, InputChoice
from nni.algorithms.nas.tensorflow.enas import EnasTrainer

class Net(Model):
    def __init__(self):
        super().__init__()
        self.conv1 = LayerChoice([
            Conv2D(6, 3, padding='same', activation='relu'),
            Conv2D(6, 5, padding='same', activation='relu'),
        ])
        self.pool = MaxPool2D(2)
        self.conv2 = LayerChoice([
            Conv2D(16, 3, padding='same', activation='relu'),
            Conv2D(16, 5, padding='same', activation='relu'),
        ])
        self.conv3 = Conv2D(16, 1)
        self.skipconnect = InputChoice(n_candidates=1)
        self.bn = BatchNormalization()
        self.gap = AveragePooling2D(2)
        self.fc1 = Dense(120, activation='relu')
        self.fc2 = Dense(84, activation='relu')
        self.fc3 = Dense(10)

    def call(self, x):
        bs = x.shape[0]
        t = self.conv1(x)
        x = self.pool(t)
        x0 = self.conv2(x)
        x1 = self.conv3(x0)
        x0 = self.skipconnect([x0])
        if x0 is not None:
            x1 += x0
        x = self.pool(self.bn(x1))
        x = self.gap(x)
        x = tf.reshape(x, [bs, -1])
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x

def accuracy(truth, logits):
    truth = tf.reshape(truth, (-1, ))
    predicted = tf.cast(tf.math.argmax(logits, axis=1), truth.dtype)
    equal = tf.cast(predicted == truth, tf.int32)
    return tf.math.reduce_sum(equal).numpy() / equal.shape[0]

def accuracy_metrics(truth, logits):
    acc = accuracy(truth, logits)
    return {'accuracy': acc}

if __name__ == '__main__':
    cifar10 = tf.keras.datasets.cifar10
    (x_train, y_train), (x_valid, y_valid) = cifar10.load_data()
    x_train, x_valid = x_train / 255.0, x_valid / 255.0
    train_set = (x_train, y_train)
    valid_set = (x_valid, y_valid)

    net = Net()
    trainer = EnasTrainer(
        net,
        loss=SparseCategoricalCrossentropy(from_logits=True, reduction=Reduction.NONE),
        metrics=accuracy_metrics,
        reward_function=accuracy,
        optimizer=SGD(learning_rate=0.001, momentum=0.9),
        batch_size=64,
        num_epochs=2,
        dataset_train=train_set,
        dataset_valid=valid_set
    )
    trainer.train()
This is a naive example that demonstrates how to use the NNI interface to implement a NAS search space.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

from nni.nas.pytorch.mutables import LayerChoice, InputChoice
from nni.algorithms.nas.pytorch.darts import DartsTrainer

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = LayerChoice([nn.Conv2d(3, 6, 3, padding=1), nn.Conv2d(3, 6, 5, padding=2)])
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = LayerChoice([nn.Conv2d(6, 16, 3, padding=1), nn.Conv2d(6, 16, 5, padding=2)])
        self.conv3 = nn.Conv2d(16, 16, 1)
        self.skipconnect = InputChoice(n_candidates=1)
        self.bn = nn.BatchNorm2d(16)
        self.gap = nn.AdaptiveAvgPool2d(4)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        bs = x.size(0)
        x = self.pool(F.relu(self.conv1(x)))
        x0 = F.relu(self.conv2(x))
        x1 = F.relu(self.conv3(x0))
        x0 = self.skipconnect([x0])
        if x0 is not None:
            x1 += x0
        x = self.pool(self.bn(x1))
        x = self.gap(x).view(bs, -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def accuracy(output, target):
    batch_size = target.size(0)
    _, predicted = torch.max(output.data, 1)
    return {"acc1": (predicted == target).sum().item() / batch_size}

if __name__ == "__main__":
    transform = transforms.Compose([transforms.ToTensor(),
                                    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    dataset_train = torchvision.datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
    dataset_valid = torchvision.datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)

    net = Net()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    trainer = DartsTrainer(net,
                           loss=criterion,
                           metrics=accuracy,
                           optimizer=optimizer,
                           num_epochs=2,
                           dataset_train=dataset_train,
                           dataset_valid=dataset_valid,
                           batch_size=64,
                           log_frequency=10)
    trainer.enable_visualization()
    trainer.train()
    trainer.export("checkpoint.json")
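# Possible follow-up (sketch, not part of this commit): the exported
# checkpoint.json can be applied back onto a fresh Net for retraining.
# apply_fixed_architecture lives at nni.nas.pytorch.fixed in NNI v1-era
# releases; verify the import path against the installed NNI version.
#
#     from nni.nas.pytorch.fixed import apply_fixed_architecture
#     fixed_net = Net()
#     apply_fixed_architecture(fixed_net, "checkpoint.json")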