Commit 1011377c authored by qianyj

the source code of NNI for DCU

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import sys
import os
import logging
import pickle
import shutil
import random
import math
import time
import datetime
import argparse
import distutils.util
import numpy as np
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
import torch.nn.functional as Func
from model import Model
from nni.nas.pytorch.fixed import apply_fixed_architecture
from dataloader import read_data_sst
logger = logging.getLogger("nni.textnas")
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--reset_output_dir",
type=distutils.util.strtobool,
default=True,
help="Whether to clean the output dir if existed. (default: %(default)s)")
parser.add_argument(
"--child_fixed_arc",
type=str,
required=True,
help="Architecture json file. (default: %(default)s)")
parser.add_argument(
"--data_path",
type=str,
default="data",
help="Directory containing the dataset and embedding file. (default: %(default)s)")
parser.add_argument(
"--output_dir",
type=str,
default="output",
help="The output directory. (default: %(default)s)")
parser.add_argument(
"--child_lr_decay_scheme",
type=str,
default="cosine",
help="Learning rate annealing strategy, only 'cosine' supported. (default: %(default)s)")
parser.add_argument(
"--batch_size",
type=int,
default=128,
help="Number of samples each batch for training. (default: %(default)s)")
parser.add_argument(
"--eval_batch_size",
type=int,
default=128,
help="Number of samples each batch for evaluation. (default: %(default)s)")
parser.add_argument(
"--class_num",
type=int,
default=5,
help="The number of categories. (default: %(default)s)")
parser.add_argument(
"--global_seed",
type=int,
default=1234,
help="Seed for reproduction. (default: %(default)s)")
parser.add_argument(
"--max_input_length",
type=int,
default=64,
help="The maximum length of the sentence. (default: %(default)s)")
parser.add_argument(
"--num_epochs",
type=int,
default=10,
help="The number of training epochs. (default: %(default)s)")
parser.add_argument(
"--child_num_layers",
type=int,
default=24,
help="The layer number of the architecture. (default: %(default)s)")
parser.add_argument(
"--child_out_filters",
type=int,
default=256,
help="The dimension of hidden states. (default: %(default)s)")
parser.add_argument(
"--child_out_filters_scale",
type=int,
default=1,
help="The scale of hidden state dimension. (default: %(default)s)")
parser.add_argument(
"--child_lr_T_0",
type=int,
default=10,
help="The length of one cycle. (default: %(default)s)")
parser.add_argument(
"--child_lr_T_mul",
type=int,
default=2,
help="The multiplication factor per cycle. (default: %(default)s)")
parser.add_argument(
"--min_count",
type=int,
default=1,
help="The threshold to cut off low frequent words. (default: %(default)s)")
parser.add_argument(
"--train_ratio",
type=float,
default=1.0,
help="The sample ratio for the training set. (default: %(default)s)")
parser.add_argument(
"--valid_ratio",
type=float,
default=1.0,
help="The sample ratio for the dev set. (default: %(default)s)")
parser.add_argument(
"--child_grad_bound",
type=float,
default=5.0,
help="The threshold for gradient clipping. (default: %(default)s)")
parser.add_argument(
"--child_lr",
type=float,
default=0.02,
help="The initial learning rate. (default: %(default)s)")
parser.add_argument(
"--cnn_keep_prob",
type=float,
default=0.8,
help="Keep prob for cnn layer. (default: %(default)s)")
parser.add_argument(
"--final_output_keep_prob",
type=float,
default=1.0,
help="Keep prob for the last output layer. (default: %(default)s)")
parser.add_argument(
"--lstm_out_keep_prob",
type=float,
default=0.8,
help="Keep prob for the RNN layer. (default: %(default)s)")
parser.add_argument(
"--embed_keep_prob",
type=float,
default=0.8,
help="Keep prob for the embedding layer. (default: %(default)s)")
parser.add_argument(
"--attention_keep_prob",
type=float,
default=0.8,
help="Keep prob for the self-attention layer. (default: %(default)s)")
parser.add_argument(
"--child_l2_reg",
type=float,
default=3e-6,
help="Weight decay factor. (default: %(default)s)")
parser.add_argument(
"--child_lr_max",
type=float,
default=0.002,
help="The max learning rate. (default: %(default)s)")
parser.add_argument(
"--child_lr_min",
type=float,
default=0.001,
help="The min learning rate. (default: %(default)s)")
parser.add_argument(
"--child_optim_algo",
type=str,
default="adam",
help="Optimization algorithm. (default: %(default)s)")
parser.add_argument(
"--checkpoint_dir",
type=str,
default="best_checkpoint",
help="Path for saved checkpoints. (default: %(default)s)")
parser.add_argument(
"--output_type",
type=str,
default="avg",
help="Opertor type for the time steps reduction. (default: %(default)s)")
parser.add_argument(
"--multi_path",
type=distutils.util.strtobool,
default=False,
help="Search for multiple path in the architecture. (default: %(default)s)")
parser.add_argument(
"--is_binary",
type=distutils.util.strtobool,
default=False,
help="Binary label for sst dataset. (default: %(default)s)")
parser.add_argument(
"--is_cuda",
type=distutils.util.strtobool,
default=True,
help="Specify the device type. (default: %(default)s)")
parser.add_argument(
"--is_mask",
type=distutils.util.strtobool,
default=True,
help="Apply mask. (default: %(default)s)")
parser.add_argument(
"--fixed_seed",
type=distutils.util.strtobool,
default=True,
help="Fix the seed. (default: %(default)s)")
parser.add_argument(
"--load_checkpoint",
type=distutils.util.strtobool,
default=False,
help="Wether to load checkpoint. (default: %(default)s)")
parser.add_argument(
"--log_every",
type=int,
default=50,
help="How many steps to log. (default: %(default)s)")
parser.add_argument(
"--eval_every_epochs",
type=int,
default=1,
help="How many epochs to eval. (default: %(default)s)")
global FLAGS
FLAGS = parser.parse_args()
def set_random_seed(seed):
logger.info("set random seed for data reading: {}".format(seed))
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
if FLAGS.is_cuda:
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
def get_model(embedding, num_layers):
logger.info("num layers: {0}".format(num_layers))
assert FLAGS.child_fixed_arc is not None, "Architecture should be provided."
child_model = Model(
embedding=embedding,
hidden_units=FLAGS.child_out_filters_scale * FLAGS.child_out_filters,
num_layers=num_layers,
num_classes=FLAGS.class_num,
choose_from_k=5 if FLAGS.multi_path else 1,
lstm_keep_prob=FLAGS.lstm_out_keep_prob,
cnn_keep_prob=FLAGS.cnn_keep_prob,
att_keep_prob=FLAGS.attention_keep_prob,
att_mask=FLAGS.is_mask,
embed_keep_prob=FLAGS.embed_keep_prob,
final_output_keep_prob=FLAGS.final_output_keep_prob,
global_pool=FLAGS.output_type)
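    # apply_fixed_architecture replaces every mutable in the model with the
    # choice recorded in the architecture JSON file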
apply_fixed_architecture(child_model, FLAGS.child_fixed_arc)
return child_model
def eval_once(child_model, device, eval_set, criterion, valid_dataloader=None, test_dataloader=None):
if eval_set == "test":
assert test_dataloader is not None
dataloader = test_dataloader
elif eval_set == "valid":
assert valid_dataloader is not None
dataloader = valid_dataloader
else:
raise NotImplementedError("Unknown eval_set '{}'".format(eval_set))
tot_acc = 0
tot = 0
losses = []
with torch.no_grad(): # save memory
for batch in dataloader:
(sent_ids, mask), labels = batch
sent_ids = sent_ids.to(device, non_blocking=True)
mask = mask.to(device, non_blocking=True)
labels = labels.to(device, non_blocking=True)
logits = child_model((sent_ids, mask)) # run
loss = criterion(logits, labels.long())
loss = loss.mean()
preds = logits.argmax(dim=1).long()
acc = torch.eq(preds, labels.long()).long().sum().item()
losses.append(loss)
tot_acc += acc
tot += len(labels)
losses = torch.tensor(losses)
loss = losses.mean()
if tot > 0:
final_acc = float(tot_acc) / tot
else:
final_acc = 0
logger.info("Error in calculating final_acc")
return final_acc, loss
def print_user_flags(FLAGS, line_limit=80):
log_strings = "\n" + "-" * line_limit + "\n"
for flag_name in sorted(vars(FLAGS)):
value = "{}".format(getattr(FLAGS, flag_name))
log_string = flag_name
log_string += "." * (line_limit - len(flag_name) - len(value))
log_string += value
log_strings = log_strings + log_string
log_strings = log_strings + "\n"
log_strings += "-" * line_limit
logger.info(log_strings)
def count_model_params(trainable_params):
num_vars = 0
for var in trainable_params:
num_vars += np.prod([dim for dim in var.size()])
return num_vars
def update_lr(
optimizer,
epoch,
l2_reg=1e-4,
lr_warmup_val=None,
lr_init=0.1,
lr_decay_scheme="cosine",
lr_max=0.002,
lr_min=0.000000001,
lr_T_0=4,
lr_T_mul=1,
sync_replicas=False,
num_aggregate=None,
num_replicas=None):
if lr_decay_scheme == "cosine":
assert lr_max is not None, "Need lr_max to use lr_cosine"
assert lr_min is not None, "Need lr_min to use lr_cosine"
assert lr_T_0 is not None, "Need lr_T_0 to use lr_cosine"
assert lr_T_mul is not None, "Need lr_T_mul to use lr_cosine"
T_i = lr_T_0
t_epoch = epoch
last_reset = 0
while True:
t_epoch -= T_i
if t_epoch < 0:
break
last_reset += T_i
T_i *= lr_T_mul
T_curr = epoch - last_reset
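        # SGDR-style cosine annealing with warm restarts: within the current cycle of
        # length T_i, lr = lr_min + 0.5 * (lr_max - lr_min) * (1 + cos(pi * T_curr / T_i)),
        # and each new cycle is lr_T_mul times longer than the previous one.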
def _update():
            rate = T_curr / T_i * math.pi
lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + math.cos(rate))
return lr
learning_rate = _update()
else:
raise ValueError("Unknown learning rate decay scheme {}".format(lr_decay_scheme))
#update lr in optimizer
for params_group in optimizer.param_groups:
params_group['lr'] = learning_rate
return learning_rate
def train(device, data_path, output_dir, num_layers):
logger.info("Build dataloader")
train_dataset, valid_dataset, test_dataset, embedding = \
read_data_sst(data_path,
FLAGS.max_input_length,
FLAGS.min_count,
train_ratio=FLAGS.train_ratio,
valid_ratio=FLAGS.valid_ratio,
is_binary=FLAGS.is_binary)
train_dataloader = DataLoader(train_dataset, batch_size=FLAGS.batch_size, shuffle=True, pin_memory=True)
test_dataloader = DataLoader(test_dataset, batch_size=FLAGS.eval_batch_size, pin_memory=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=FLAGS.eval_batch_size, pin_memory=True)
logger.info("Build model")
child_model = get_model(embedding, num_layers)
logger.info("Finish build model")
#for name, var in child_model.named_parameters():
# logger.info(name, var.size(), var.requires_grad) # output all params
num_vars = count_model_params(child_model.parameters())
logger.info("Model has {} params".format(num_vars))
for m in child_model.modules(): # initializer
if isinstance(m, (nn.Conv1d, nn.Linear)):
nn.init.xavier_uniform_(m.weight)
criterion = nn.CrossEntropyLoss()
# get optimizer
if FLAGS.child_optim_algo == "adam":
optimizer = optim.Adam(child_model.parameters(), eps=1e-3, weight_decay=FLAGS.child_l2_reg) # with L2
else:
raise ValueError("Unknown optim_algo {}".format(FLAGS.child_optim_algo))
child_model.to(device)
criterion.to(device)
logger.info("Start training")
start_time = time.time()
step = 0
# save path
    # use the output_dir argument passed into train() for all checkpoints
    model_save_path = os.path.join(output_dir, "model.pth")
    best_model_save_path = os.path.join(output_dir, "best_model.pth")
best_acc = 0
start_epoch = 0
if FLAGS.load_checkpoint:
if os.path.isfile(model_save_path):
checkpoint = torch.load(model_save_path, map_location = torch.device('cpu'))
step = checkpoint['step']
start_epoch = checkpoint['epoch']
child_model.load_state_dict(checkpoint['child_model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
for epoch in range(start_epoch, FLAGS.num_epochs):
lr = update_lr(optimizer,
epoch,
l2_reg=FLAGS.child_l2_reg,
lr_warmup_val=None,
lr_init=FLAGS.child_lr,
lr_decay_scheme=FLAGS.child_lr_decay_scheme,
lr_max=FLAGS.child_lr_max,
lr_min=FLAGS.child_lr_min,
lr_T_0=FLAGS.child_lr_T_0,
lr_T_mul=FLAGS.child_lr_T_mul)
child_model.train()
for batch in train_dataloader:
(sent_ids, mask), labels = batch
sent_ids = sent_ids.to(device, non_blocking=True)
mask = mask.to(device, non_blocking=True)
labels = labels.to(device, non_blocking=True)
step += 1
logits = child_model((sent_ids, mask)) # run
loss = criterion(logits, labels.long())
loss = loss.mean()
preds = logits.argmax(dim=1).long()
acc = torch.eq(preds, labels.long()).long().sum().item()
optimizer.zero_grad()
loss.backward()
            assert FLAGS.child_grad_bound is not None, "Need grad_bound to clip gradients."
            # materialize the parameters into a list: the original generator would be exhausted
            # by the first clip_grad_norm_ call, so the per-parameter clipping would never run
            trainable_params = list(child_model.parameters())
            # compute the total gradient norm (the huge bound means nothing is clipped here)
            grad_norm = nn.utils.clip_grad_norm_(trainable_params, 99999999)
            # clip each parameter's gradient to the configured bound
            for param in trainable_params:
                nn.utils.clip_grad_norm_(param, FLAGS.child_grad_bound)
optimizer.step()
if step % FLAGS.log_every == 0:
curr_time = time.time()
log_string = ""
log_string += "epoch={:<6d}".format(epoch)
log_string += "ch_step={:<6d}".format(step)
log_string += " loss={:<8.6f}".format(loss)
log_string += " lr={:<8.4f}".format(lr)
log_string += " |g|={:<8.4f}".format(grad_norm)
log_string += " tr_acc={:<3d}/{:>3d}".format(acc, logits.size()[0])
log_string += " mins={:<10.2f}".format(float(curr_time - start_time) / 60)
logger.info(log_string)
epoch += 1
save_state = {
'step' : step,
'epoch' : epoch,
'child_model_state_dict' : child_model.state_dict(),
'optimizer_state_dict' : optimizer.state_dict()}
torch.save(save_state, model_save_path)
child_model.eval()
logger.info("Epoch {}: Eval".format(epoch))
eval_acc, eval_loss = eval_once(child_model, device, "test", criterion, test_dataloader=test_dataloader)
logger.info("ch_step={} {}_accuracy={:<6.4f} {}_loss={:<6.4f}".format(step, "test", eval_acc, "test", eval_loss))
if eval_acc > best_acc:
best_acc = eval_acc
logger.info("Save best model")
save_state = {
'step' : step,
'epoch' : epoch,
'child_model_state_dict' : child_model.state_dict(),
'optimizer_state_dict' : optimizer.state_dict()}
torch.save(save_state, best_model_save_path)
return eval_acc
def main():
parse_args()
if not os.path.isdir(FLAGS.output_dir):
logger.info("Path {} does not exist. Creating.".format(FLAGS.output_dir))
os.makedirs(FLAGS.output_dir)
elif FLAGS.reset_output_dir:
logger.info("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
shutil.rmtree(FLAGS.output_dir, ignore_errors=True)
os.makedirs(FLAGS.output_dir)
print_user_flags(FLAGS)
if FLAGS.fixed_seed:
set_random_seed(FLAGS.global_seed)
device = torch.device("cuda" if FLAGS.is_cuda else "cpu")
train(device, FLAGS.data_path, FLAGS.output_dir, FLAGS.child_num_layers)
if __name__ == "__main__":
main()
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
export PYTHONPATH="$(pwd)"
python3 -u retrain.py \
--train_ratio=1.0 \
--valid_ratio=1.0 \
--min_count=1 \
--is_mask=True \
--is_binary=True \
--child_lr_decay_scheme="cosine" \
--data_path="data" \
--class_num=2 \
--child_optim_algo="adam" \
--output_dir="output_sst2" \
--global_seed=1234 \
--max_input_length=64 \
--batch_size=128 \
--eval_batch_size=128 \
--num_epochs=10 \
--log_every=50 \
--eval_every_epochs=1 \
--child_num_layers=24 \
--child_out_filters=256 \
--child_l2_reg=1e-6 \
--cnn_keep_prob=0.8 \
--final_output_keep_prob=1.0 \
--embed_keep_prob=0.8 \
--lstm_out_keep_prob=0.8 \
--attention_keep_prob=0.8 \
--child_lr=0.02 \
--child_lr_max=0.002 \
--child_lr_min=5e-6 \
--child_lr_T_0=10 \
--child_lr_T_mul=2 \
--multi_path=True \
--child_fixed_arc="./arc/final_arc.json" \
--fixed_seed=True \
"$@"
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import os
import random
from argparse import ArgumentParser
from itertools import cycle
import numpy as np
import torch
import torch.nn as nn
from nni.algorithms.nas.pytorch.enas import EnasMutator, EnasTrainer
from nni.nas.pytorch.callbacks import LRSchedulerCallback
from dataloader import read_data_sst
from model import Model
from utils import accuracy
logger = logging.getLogger("nni.textnas")
class TextNASTrainer(EnasTrainer):
def __init__(self, *args, train_loader=None, valid_loader=None, test_loader=None, **kwargs):
super().__init__(*args, **kwargs)
self.train_loader = train_loader
self.valid_loader = valid_loader
self.test_loader = test_loader
def init_dataloader(self):
pass
if __name__ == "__main__":
parser = ArgumentParser("textnas")
parser.add_argument("--batch-size", default=128, type=int)
parser.add_argument("--log-frequency", default=50, type=int)
parser.add_argument("--seed", default=1234, type=int)
parser.add_argument("--epochs", default=10, type=int)
parser.add_argument("--lr", default=5e-3, type=float)
args = parser.parse_args()
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)
torch.backends.cudnn.deterministic = True
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
train_dataset, valid_dataset, test_dataset, embedding = read_data_sst("data")
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, num_workers=4, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=args.batch_size, num_workers=4, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size, num_workers=4)
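    # wrap the loaders with cycle() so the ENAS trainer can keep drawing batches
    # across its mutator/child steps without the iterators being exhausted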
train_loader, valid_loader = cycle(train_loader), cycle(valid_loader)
model = Model(embedding)
mutator = EnasMutator(model, temperature=None, tanh_constant=None, entropy_reduction="mean")
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, eps=1e-3, weight_decay=2e-6)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs, eta_min=1e-5)
trainer = TextNASTrainer(model,
loss=criterion,
metrics=lambda output, target: {"acc": accuracy(output, target)},
reward_function=accuracy,
optimizer=optimizer,
callbacks=[LRSchedulerCallback(lr_scheduler)],
batch_size=args.batch_size,
num_epochs=args.epochs,
dataset_train=None,
dataset_valid=None,
train_loader=train_loader,
valid_loader=valid_loader,
test_loader=test_loader,
log_frequency=args.log_frequency,
mutator=mutator,
mutator_lr=2e-3,
mutator_steps=500,
mutator_steps_aggregate=1,
child_steps=3000,
baseline_decay=0.99,
test_arc_per_epoch=10)
trainer.train()
os.makedirs("checkpoints", exist_ok=True)
for i in range(20):
trainer.export(os.path.join("checkpoints", "architecture_%02d.json" % i))
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import torch
import torch.nn as nn
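# INF is used as a large negative offset to keep padded positions out of max-pooling;
# EPS guards the masked average against division by zero.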
INF = 1E10
EPS = 1E-12
logger = logging.getLogger("nni.textnas")
def get_length(mask):
length = torch.sum(mask, 1)
length = length.long().cpu()
return length
class GlobalAvgPool(nn.Module):
def forward(self, x, mask):
x = torch.sum(x, 2)
length = torch.sum(mask, 1, keepdim=True).float()
length += torch.eq(length, 0.0).float() * EPS
length = length.repeat(1, x.size()[1])
x /= length
return x
class GlobalMaxPool(nn.Module):
    def forward(self, x, mask):
        # push padded positions to a large negative value so they never win the max
        mask = torch.eq(mask.float(), 0.0).float()
        mask = torch.unsqueeze(mask, dim=1).repeat(1, x.size()[1], 1)
        x, _ = torch.max(x + mask * -INF, 2)
        return x
class IteratorWrapper:
def __init__(self, loader):
self.loader = loader
self.iterator = None
def __iter__(self):
self.iterator = iter(self.loader)
return self
def __len__(self):
return len(self.loader)
def __next__(self):
data = next(self.iterator)
text, length = data.text
max_length = text.size(1)
label = data.label - 1
bs = label.size(0)
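        # build a boolean mask of shape (batch_size, max_length):
        # position j of sample i is True iff j < length of sample i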
mask = torch.arange(max_length, device=length.device).unsqueeze(0).repeat(bs, 1)
mask = mask < length.unsqueeze(-1).repeat(1, max_length)
return (text, mask), label
def accuracy(output, target):
batch_size = target.size(0)
_, predicted = torch.max(output.data, 1)
return (predicted == target).sum().item() / batch_size
from nni.retiarii import basic_unit
import nni.retiarii.nn.pytorch as nn
import warnings
import torch
import torch.nn as torch_nn
try:
    from torchvision.models.utils import load_state_dict_from_url
except ImportError:  # newer torchvision removed models.utils; the helper lives in torch.hub
    from torch.hub import load_state_dict_from_url
import torch.nn.functional as F
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).resolve().parents[2]))
# Paper suggests 0.9997 momentum, for TensorFlow. Equivalent PyTorch momentum is
# 1.0 - tensorflow.
_BN_MOMENTUM = 1 - 0.9997
_FIRST_DEPTH = 32
_MOBILENET_V2_FILTERS = [16, 24, 32, 64, 96, 160, 320]
_MOBILENET_V2_NUM_LAYERS = [1, 2, 3, 4, 3, 3, 1]
class _ResidualBlock(nn.Module):
def __init__(self, net):
super().__init__()
self.net = net
def forward(self, x):
return self.net(x) + x
class _InvertedResidual(nn.Module):
def __init__(self, in_ch, out_ch, kernel_size, stride, expansion_factor, skip, bn_momentum=0.1):
super(_InvertedResidual, self).__init__()
assert stride in [1, 2]
assert kernel_size in [3, 5]
mid_ch = in_ch * expansion_factor
self.apply_residual = skip and in_ch == out_ch and stride == 1
self.layers = nn.Sequential(
# Pointwise
nn.Conv2d(in_ch, mid_ch, 1, bias=False),
nn.BatchNorm2d(mid_ch, momentum=bn_momentum),
nn.ReLU(inplace=True),
# Depthwise
nn.Conv2d(mid_ch, mid_ch, kernel_size, padding=kernel_size // 2,
stride=stride, groups=mid_ch, bias=False),
nn.BatchNorm2d(mid_ch, momentum=bn_momentum),
nn.ReLU(inplace=True),
# Linear pointwise. Note that there's no activation.
nn.Conv2d(mid_ch, out_ch, 1, bias=False),
nn.BatchNorm2d(out_ch, momentum=bn_momentum))
def forward(self, input):
if self.apply_residual:
ret = self.layers(input) + input
else:
ret = self.layers(input)
return ret
def _stack_inverted_residual(in_ch, out_ch, kernel_size, skip, stride, exp_factor, repeats, bn_momentum):
""" Creates a stack of inverted residuals. """
assert repeats >= 1
# First one has no skip, because feature map size changes.
first = _InvertedResidual(in_ch, out_ch, kernel_size, stride, exp_factor, skip, bn_momentum=bn_momentum)
remaining = []
for _ in range(1, repeats):
remaining.append(_InvertedResidual(out_ch, out_ch, kernel_size, 1, exp_factor, skip, bn_momentum=bn_momentum))
return nn.Sequential(first, *remaining)
def _stack_normal_conv(in_ch, out_ch, kernel_size, skip, dconv, stride, repeats, bn_momentum):
assert repeats >= 1
stack = []
for i in range(repeats):
s = stride if i == 0 else 1
if dconv:
modules = [
nn.Conv2d(in_ch, in_ch, kernel_size, padding=kernel_size // 2, stride=s, groups=in_ch, bias=False),
nn.BatchNorm2d(in_ch, momentum=bn_momentum),
nn.ReLU(inplace=True),
nn.Conv2d(in_ch, out_ch, 1, padding=0, stride=1, bias=False),
nn.BatchNorm2d(out_ch, momentum=bn_momentum)
]
else:
modules = [
nn.Conv2d(in_ch, out_ch, kernel_size, padding=kernel_size // 2, stride=s, bias=False),
nn.ReLU(inplace=True),
nn.BatchNorm2d(out_ch, momentum=bn_momentum)
]
if skip and in_ch == out_ch and s == 1:
# use different implementation for skip and noskip to align with pytorch
stack.append(_ResidualBlock(nn.Sequential(*modules)))
else:
stack += modules
in_ch = out_ch
return stack
def _round_to_multiple_of(val, divisor, round_up_bias=0.9):
""" Asymmetric rounding to make `val` divisible by `divisor`. With default
bias, will round up, unless the number is no more than 10% greater than the
smaller divisible value, i.e. (83, 8) -> 80, but (84, 8) -> 88. """
assert 0.0 < round_up_bias < 1.0
new_val = max(divisor, int(val + divisor / 2) // divisor * divisor)
return new_val if new_val >= round_up_bias * val else new_val + divisor
def _get_depths(depths, alpha):
""" Scales tensor depths as in reference MobileNet code, prefers rouding up
rather than down. """
return [_round_to_multiple_of(depth * alpha, 8) for depth in depths]
class MNASNet(nn.Module):
""" MNASNet, as described in https://arxiv.org/pdf/1807.11626.pdf. This
implements the B1 variant of the model.
>>> model = MNASNet(1000, 1.0)
>>> x = torch.rand(1, 3, 224, 224)
>>> y = model(x)
>>> y.dim()
2
>>> y.nelement()
1000
"""
# Version 2 adds depth scaling in the initial stages of the network.
_version = 2
def __init__(self, alpha, depths, convops, kernel_sizes, num_layers,
skips, num_classes=1000, dropout=0.2):
super().__init__()
assert alpha > 0.0
assert len(depths) == len(convops) == len(kernel_sizes) == len(num_layers) == len(skips) == 7
self.alpha = alpha
self.num_classes = num_classes
depths = _get_depths([_FIRST_DEPTH] + depths, alpha)
base_filter_sizes = [16, 24, 40, 80, 96, 192, 320]
exp_ratios = [3, 3, 3, 6, 6, 6, 6]
strides = [1, 2, 2, 2, 1, 2, 1]
layers = [
# First layer: regular conv.
nn.Conv2d(3, depths[0], 3, padding=1, stride=2, bias=False),
nn.BatchNorm2d(depths[0], momentum=_BN_MOMENTUM),
nn.ReLU(inplace=True),
]
count = 0
# for conv, prev_depth, depth, ks, skip, stride, repeat, exp_ratio in \
# zip(convops, depths[:-1], depths[1:], kernel_sizes, skips, strides, num_layers, exp_ratios):
for filter_size, exp_ratio, stride in zip(base_filter_sizes, exp_ratios, strides):
# TODO: restrict that "choose" can only be used within mutator
ph = nn.Placeholder(label=f'mutable_{count}', **{
'kernel_size_options': [1, 3, 5],
'n_layer_options': [1, 2, 3, 4],
'op_type_options': ['__mutated__.base_mnasnet.RegularConv',
'__mutated__.base_mnasnet.DepthwiseConv',
'__mutated__.base_mnasnet.MobileConv'],
# 'se_ratio_options': [0, 0.25],
'skip_options': ['identity', 'no'],
'n_filter_options': [int(filter_size*x) for x in [0.75, 1.0, 1.25]],
'exp_ratio': exp_ratio,
'stride': stride,
'in_ch': depths[0] if count == 0 else None
})
layers.append(ph)
'''if conv == "mconv":
# MNASNet blocks: stacks of inverted residuals.
layers.append(_stack_inverted_residual(prev_depth, depth, ks, skip,
stride, exp_ratio, repeat, _BN_MOMENTUM))
else:
# Normal conv and depth-separated conv
layers += _stack_normal_conv(prev_depth, depth, ks, skip, conv == "dconv",
stride, repeat, _BN_MOMENTUM)'''
count += 1
if count >= 2:
break
layers += [
# Final mapping to classifier input.
nn.Conv2d(depths[7], 1280, 1, padding=0, stride=1, bias=False),
nn.BatchNorm2d(1280, momentum=_BN_MOMENTUM),
nn.ReLU(inplace=True),
]
self.layers = nn.Sequential(*layers)
self.classifier = nn.Sequential(nn.Dropout(p=dropout, inplace=True),
nn.Linear(1280, num_classes))
self._initialize_weights()
#self.for_test = 10
def forward(self, x):
# if self.for_test == 10:
x = self.layers(x)
# Equivalent to global avgpool and removing H and W dimensions.
x = x.mean([2, 3])
x = F.relu(x)
return self.classifier(x)
def _initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
torch_nn.init.kaiming_normal_(m.weight, mode="fan_out",
nonlinearity="relu")
if m.bias is not None:
torch_nn.init.zeros_(m.bias)
elif isinstance(m, nn.BatchNorm2d):
torch_nn.init.ones_(m.weight)
torch_nn.init.zeros_(m.bias)
elif isinstance(m, nn.Linear):
torch_nn.init.kaiming_uniform_(m.weight, mode="fan_out",
nonlinearity="sigmoid")
torch_nn.init.zeros_(m.bias)
def test_model(model):
model(torch.randn(2, 3, 224, 224))
# ====================definition of candidate op classes
BN_MOMENTUM = 1 - 0.9997
class RegularConv(nn.Module):
def __init__(self, kernel_size, in_ch, out_ch, skip, exp_ratio, stride):
super().__init__()
self.kernel_size = kernel_size
self.in_ch = in_ch
self.out_ch = out_ch
self.skip = skip
self.exp_ratio = exp_ratio
self.stride = stride
self.conv = nn.Conv2d(in_ch, out_ch, kernel_size, padding=kernel_size // 2, stride=stride, bias=False)
self.relu = nn.ReLU(inplace=True)
self.bn = nn.BatchNorm2d(out_ch, momentum=BN_MOMENTUM)
def forward(self, x):
out = self.bn(self.relu(self.conv(x)))
if self.skip == 'identity':
out = out + x
return out
class DepthwiseConv(nn.Module):
def __init__(self, kernel_size, in_ch, out_ch, skip, exp_ratio, stride):
super().__init__()
self.kernel_size = kernel_size
self.in_ch = in_ch
self.out_ch = out_ch
self.skip = skip
self.exp_ratio = exp_ratio
self.stride = stride
self.conv1 = nn.Conv2d(in_ch, in_ch, kernel_size, padding=kernel_size // 2, stride=stride, groups=in_ch, bias=False)
self.bn1 = nn.BatchNorm2d(in_ch, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(in_ch, out_ch, 1, padding=0, stride=1, bias=False)
self.bn2 = nn.BatchNorm2d(out_ch, momentum=BN_MOMENTUM)
def forward(self, x):
out = self.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
if self.skip == 'identity':
out = out + x
return out
class MobileConv(nn.Module):
def __init__(self, kernel_size, in_ch, out_ch, skip, exp_ratio, stride):
super().__init__()
self.kernel_size = kernel_size
self.in_ch = in_ch
self.out_ch = out_ch
self.skip = skip
self.exp_ratio = exp_ratio
self.stride = stride
mid_ch = in_ch * exp_ratio
self.layers = nn.Sequential(
# Pointwise
nn.Conv2d(in_ch, mid_ch, 1, bias=False),
nn.BatchNorm2d(mid_ch, momentum=BN_MOMENTUM),
nn.ReLU(inplace=True),
# Depthwise
nn.Conv2d(mid_ch, mid_ch, kernel_size, padding=(kernel_size - 1) // 2,
stride=stride, groups=mid_ch, bias=False),
nn.BatchNorm2d(mid_ch, momentum=BN_MOMENTUM),
nn.ReLU(inplace=True),
# Linear pointwise. Note that there's no activation.
nn.Conv2d(mid_ch, out_ch, 1, bias=False),
nn.BatchNorm2d(out_ch, momentum=BN_MOMENTUM))
def forward(self, x):
out = self.layers(x)
if self.skip == 'identity':
out = out + x
return out
# mnasnet0_5
ir_module = _InvertedResidual(16, 16, 3, 1, 1, True)
import logging
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).resolve().parents[2]))
from nni.retiarii import Mutator
from base_mnasnet import RegularConv, DepthwiseConv, MobileConv
_logger = logging.getLogger(__name__)
class BlockMutator(Mutator):
def __init__(self, target: str):
super(BlockMutator, self).__init__()
self.target = target
def mutate(self, model):
nodes = model.get_nodes_by_label(self.target)
assert len(nodes) == 1
node = nodes[0]
graph = node.graph
related_info = node.operation.parameters
kernel_size = self.choice(related_info['kernel_size_options'])
op_type = self.choice(related_info['op_type_options'])
#self.choice(related_info['se_ratio_options'])
skip = self.choice(related_info['skip_options'])
n_filter = self.choice(related_info['n_filter_options'])
if related_info['in_ch'] is not None:
in_ch = related_info['in_ch']
else:
assert len(node.predecessors) == 1
the_node = node.predecessors[0]
_logger.debug(repr(the_node.operation.parameters))
_logger.debug(the_node.__repr__())
in_ch = the_node.operation.parameters['out_ch']
# update the placeholder to be a new operation
node.update_operation(op_type, {
'kernel_size': kernel_size,
'in_ch': in_ch,
'out_ch': n_filter,
'skip': 'no',
'exp_ratio': related_info['exp_ratio'],
'stride': related_info['stride']
})
# insert new nodes after the placeholder
n_layer = self.choice(related_info['n_layer_options'])
for i in range(1, n_layer):
node = graph.insert_node_on_edge(node.outgoing_edges[0],
'{}_{}'.format(self.target, i),
op_type,
{'kernel_size': kernel_size,
'in_ch': n_filter,
'out_ch': n_filter,
'skip': skip,
'exp_ratio': related_info['exp_ratio'],
'stride': 1})
# fix possible shape mismatch
# TODO: use formal method function to update parameters
if len(node.successors) == 1 and 'in_channels' in node.successors[0].operation.parameters:
node.successors[0].operation.parameters['in_channels'] = n_filter
import os
import sys
import torch
from pathlib import Path
import nni.retiarii.evaluator.pytorch.lightning as pl
from nni.retiarii import serialize
from base_mnasnet import MNASNet
from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig
from nni.retiarii.strategy import TPEStrategy
from torchvision import transforms
from torchvision.datasets import CIFAR10
from mutator import BlockMutator
if __name__ == '__main__':
_DEFAULT_DEPTHS = [16, 24, 40, 80, 96, 192, 320]
_DEFAULT_CONVOPS = ["dconv", "mconv", "mconv", "mconv", "mconv", "mconv", "mconv"]
_DEFAULT_SKIPS = [False, True, True, True, True, True, True]
_DEFAULT_KERNEL_SIZES = [3, 3, 5, 5, 3, 5, 3]
_DEFAULT_NUM_LAYERS = [1, 3, 3, 3, 2, 4, 1]
base_model = MNASNet(0.5, _DEFAULT_DEPTHS, _DEFAULT_CONVOPS, _DEFAULT_KERNEL_SIZES,
_DEFAULT_NUM_LAYERS, _DEFAULT_SKIPS)
train_transform = transforms.Compose([
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
valid_transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
train_dataset = serialize(CIFAR10, root='data/cifar10', train=True, download=True, transform=train_transform)
test_dataset = serialize(CIFAR10, root='data/cifar10', train=False, download=True, transform=valid_transform)
trainer = pl.Classification(train_dataloader=pl.DataLoader(train_dataset, batch_size=100),
val_dataloaders=pl.DataLoader(test_dataset, batch_size=100),
max_epochs=1, limit_train_batches=0.2)
applied_mutators = [
BlockMutator('mutable_0'),
BlockMutator('mutable_1')
]
simple_strategy = TPEStrategy()
exp = RetiariiExperiment(base_model, trainer, applied_mutators, simple_strategy)
exp_config = RetiariiExeConfig('local')
exp_config.experiment_name = 'mnasnet_search'
exp_config.trial_concurrency = 2
exp_config.max_trial_number = 10
exp_config.training_service.use_active_gpu = False
exp_config.execution_engine = 'base'
exp.run(exp_config, 8097)
import random
import nni
import torch
import torch.nn.functional as F
# remember to import nni.retiarii.nn.pytorch as nn, instead of torch.nn as nn
import nni.retiarii.nn.pytorch as nn
import nni.retiarii.strategy as strategy
from nni.retiarii import model_wrapper
from nni.retiarii.evaluator import FunctionalEvaluator
from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment, debug_mutated_model
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST
class DepthwiseSeparableConv(nn.Module):
def __init__(self, in_ch, out_ch):
super().__init__()
self.depthwise = nn.Conv2d(in_ch, in_ch, kernel_size=3, groups=in_ch)
self.pointwise = nn.Conv2d(in_ch, out_ch, kernel_size=1)
def forward(self, x):
return self.pointwise(self.depthwise(x))
@model_wrapper
class Net(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(1, 32, 3, 1)
# LayerChoice is used to select a layer between Conv2d and DwConv.
self.conv2 = nn.LayerChoice([
nn.Conv2d(32, 64, 3, 1),
DepthwiseSeparableConv(32, 64)
])
# ValueChoice is used to select a dropout rate.
# ValueChoice can be used as parameter of modules wrapped in `nni.retiarii.nn.pytorch`
# or customized modules wrapped with `@basic_unit`.
self.dropout1 = nn.Dropout(nn.ValueChoice([0.25, 0.5, 0.75]))
self.dropout2 = nn.Dropout(0.5)
feature = nn.ValueChoice([64, 128, 256])
# Same value choice can be used multiple times
self.fc1 = nn.Linear(9216, feature)
self.fc2 = nn.Linear(feature, 10)
def forward(self, x):
x = F.relu(self.conv1(x))
x = F.max_pool2d(self.conv2(x), 2)
x = torch.flatten(self.dropout1(x), 1)
x = self.fc2(self.dropout2(F.relu(self.fc1(x))))
return x
def train_epoch(model, device, train_loader, optimizer, epoch):
loss_fn = torch.nn.CrossEntropyLoss()
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = loss_fn(output, target)
loss.backward()
optimizer.step()
if batch_idx % 10 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
def test_epoch(model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # accumulate the summed cross-entropy so the average loss below is meaningful
            test_loss += F.cross_entropy(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset), accuracy))
return accuracy
def evaluate_model(model_cls):
# "model_cls" is a class, need to instantiate
model = model_cls()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
transf = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_loader = DataLoader(MNIST('data/mnist', download=True, transform=transf), batch_size=64, shuffle=True)
test_loader = DataLoader(MNIST('data/mnist', download=True, train=False, transform=transf), batch_size=64)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # move the model to the selected device so it matches the data tensors below
    model.to(device)
for epoch in range(3):
# train the model for one epoch
train_epoch(model, device, train_loader, optimizer, epoch)
# test the model for one epoch
accuracy = test_epoch(model, device, test_loader)
# call report intermediate result. Result can be float or dict
nni.report_intermediate_result(accuracy)
# report final test result
nni.report_final_result(accuracy)
if __name__ == '__main__':
base_model = Net()
search_strategy = strategy.Random()
model_evaluator = FunctionalEvaluator(evaluate_model)
exp = RetiariiExperiment(base_model, model_evaluator, [], search_strategy)
exp_config = RetiariiExeConfig('local')
exp_config.experiment_name = 'mnist_search'
exp_config.trial_concurrency = 2
exp_config.max_trial_number = 20
exp_config.training_service.use_active_gpu = False
export_formatter = 'dict'
# uncomment this for graph-based execution engine
# exp_config.execution_engine = 'base'
# export_formatter = 'code'
exp.run(exp_config, 8081 + random.randint(0, 100))
print('Final model:')
for model_code in exp.export_top_models(formatter=export_formatter):
print(model_code)
import math
import torch.nn as nn
def truncated_normal_(tensor, mean=0, std=1):
# https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/15
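    # draw four candidates per element from N(0, 1), keep one that falls inside (-2, 2),
    # then scale by std and shift by mean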
size = tensor.shape
tmp = tensor.new_empty(size + (4,)).normal_()
valid = (tmp < 2) & (tmp > -2)
ind = valid.max(-1, keepdim=True)[1]
tensor.data.copy_(tmp.gather(-1, ind).squeeze(-1))
tensor.data.mul_(std).add_(mean)
class ConvBnRelu(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0):
super(ConvBnRelu, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.conv_bn_relu = nn.Sequential(
nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=False),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True)
)
self.reset_parameters()
def reset_parameters(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
fan_in = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
truncated_normal_(m.weight.data, mean=0., std=math.sqrt(1. / fan_in))
if isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def forward(self, x):
return self.conv_bn_relu(x)
class Conv3x3BnRelu(ConvBnRelu):
def __init__(self, in_channels, out_channels):
super(Conv3x3BnRelu, self).__init__(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
class Conv1x1BnRelu(ConvBnRelu):
def __init__(self, in_channels, out_channels):
super(Conv1x1BnRelu, self).__init__(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
Projection = Conv1x1BnRelu
import click
import nni
import nni.retiarii.evaluator.pytorch.lightning as pl
import torch.nn as nn
import torchmetrics
from nni.retiarii import model_wrapper, serialize
from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig
from nni.retiarii.nn.pytorch import NasBench101Cell
from nni.retiarii.strategy import Random
from pytorch_lightning.callbacks import LearningRateMonitor
from timm.optim import RMSpropTF
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchvision import transforms
from torchvision.datasets import CIFAR10
from base_ops import Conv3x3BnRelu, Conv1x1BnRelu, Projection
@model_wrapper
class NasBench101(nn.Module):
def __init__(self,
stem_out_channels: int = 128,
num_stacks: int = 3,
num_modules_per_stack: int = 3,
max_num_vertices: int = 7,
max_num_edges: int = 9,
num_labels: int = 10,
bn_eps: float = 1e-5,
bn_momentum: float = 0.003):
super().__init__()
op_candidates = {
'conv3x3-bn-relu': lambda num_features: Conv3x3BnRelu(num_features, num_features),
'conv1x1-bn-relu': lambda num_features: Conv1x1BnRelu(num_features, num_features),
'maxpool3x3': lambda num_features: nn.MaxPool2d(3, 1, 1)
}
# initial stem convolution
self.stem_conv = Conv3x3BnRelu(3, stem_out_channels)
layers = []
in_channels = out_channels = stem_out_channels
for stack_num in range(num_stacks):
if stack_num > 0:
downsample = nn.MaxPool2d(kernel_size=2, stride=2)
layers.append(downsample)
out_channels *= 2
for _ in range(num_modules_per_stack):
cell = NasBench101Cell(op_candidates, in_channels, out_channels,
lambda cin, cout: Projection(cin, cout),
max_num_vertices, max_num_edges, label='cell')
layers.append(cell)
in_channels = out_channels
self.features = nn.ModuleList(layers)
self.gap = nn.AdaptiveAvgPool2d(1)
self.classifier = nn.Linear(out_channels, num_labels)
for module in self.modules():
if isinstance(module, nn.BatchNorm2d):
module.eps = bn_eps
module.momentum = bn_momentum
def forward(self, x):
bs = x.size(0)
out = self.stem_conv(x)
for layer in self.features:
out = layer(out)
out = self.gap(out).view(bs, -1)
out = self.classifier(out)
return out
def reset_parameters(self):
for module in self.modules():
if isinstance(module, nn.BatchNorm2d):
module.eps = self.config.bn_eps
module.momentum = self.config.bn_momentum
class AccuracyWithLogits(torchmetrics.Accuracy):
def update(self, pred, target):
        return super().update(nn.functional.softmax(pred, dim=-1), target)
@nni.trace
class NasBench101TrainingModule(pl.LightningModule):
def __init__(self, max_epochs=108, learning_rate=0.1, weight_decay=1e-4):
super().__init__()
self.save_hyperparameters('learning_rate', 'weight_decay', 'max_epochs')
self.criterion = nn.CrossEntropyLoss()
self.accuracy = AccuracyWithLogits()
def forward(self, x):
y_hat = self.model(x)
return y_hat
def training_step(self, batch, batch_idx):
x, y = batch
y_hat = self(x)
loss = self.criterion(y_hat, y)
self.log('train_loss', loss, prog_bar=True)
self.log('train_accuracy', self.accuracy(y_hat, y), prog_bar=True)
return loss
def validation_step(self, batch, batch_idx):
x, y = batch
y_hat = self(x)
self.log('val_loss', self.criterion(y_hat, y), prog_bar=True)
self.log('val_accuracy', self.accuracy(y_hat, y), prog_bar=True)
def configure_optimizers(self):
optimizer = RMSpropTF(self.parameters(), lr=self.hparams.learning_rate,
weight_decay=self.hparams.weight_decay,
momentum=0.9, alpha=0.9, eps=1.0)
return {
'optimizer': optimizer,
            'lr_scheduler': CosineAnnealingLR(optimizer, self.hparams.max_epochs)
}
def on_validation_epoch_end(self):
nni.report_intermediate_result(self.trainer.callback_metrics['val_accuracy'].item())
def teardown(self, stage):
if stage == 'fit':
nni.report_final_result(self.trainer.callback_metrics['val_accuracy'].item())
@click.command()
@click.option('--epochs', default=108, help='Training length.')
@click.option('--batch_size', default=256, help='Batch size.')
@click.option('--port', default=8081, help='On which port the experiment is run.')
@click.option('--benchmark', is_flag=True, default=False)
def _multi_trial_test(epochs, batch_size, port, benchmark):
    # initialize dataset. Note that 50k training + 10k test images are used, which differs slightly from the paper
transf = [
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip()
]
normalize = [
transforms.ToTensor(),
transforms.Normalize([0.49139968, 0.48215827, 0.44653124], [0.24703233, 0.24348505, 0.26158768])
]
train_dataset = serialize(CIFAR10, 'data', train=True, download=True, transform=transforms.Compose(transf + normalize))
test_dataset = serialize(CIFAR10, 'data', train=False, transform=transforms.Compose(normalize))
# specify training hyper-parameters
training_module = NasBench101TrainingModule(max_epochs=epochs)
# FIXME: need to fix a bug in serializer for this to work
# lr_monitor = serialize(LearningRateMonitor, logging_interval='step')
trainer = pl.Trainer(max_epochs=epochs, gpus=1)
lightning = pl.Lightning(
lightning_module=training_module,
trainer=trainer,
train_dataloader=pl.DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
val_dataloaders=pl.DataLoader(test_dataset, batch_size=batch_size),
)
strategy = Random()
model = NasBench101()
exp = RetiariiExperiment(model, lightning, [], strategy)
exp_config = RetiariiExeConfig('local')
exp_config.trial_concurrency = 2
exp_config.max_trial_number = 20
exp_config.trial_gpu_number = 1
exp_config.training_service.use_active_gpu = False
if benchmark:
exp_config.benchmark = 'nasbench101'
exp_config.execution_engine = 'benchmark'
exp.run(exp_config, port)
if __name__ == '__main__':
_multi_trial_test()
import torch
import torch.nn as nn
OPS_WITH_STRIDE = {
'none': lambda C_in, C_out, stride: Zero(C_in, C_out, stride),
'avg_pool_3x3': lambda C_in, C_out, stride: Pooling(C_in, C_out, stride, 'avg'),
'max_pool_3x3': lambda C_in, C_out, stride: Pooling(C_in, C_out, stride, 'max'),
'conv_3x3': lambda C_in, C_out, stride: ReLUConvBN(C_in, C_out, (3, 3), (stride, stride), (1, 1), (1, 1)),
'conv_1x1': lambda C_in, C_out, stride: ReLUConvBN(C_in, C_out, (1, 1), (stride, stride), (0, 0), (1, 1)),
'skip_connect': lambda C_in, C_out, stride: nn.Identity() if stride == 1 and C_in == C_out
else FactorizedReduce(C_in, C_out, stride),
}
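# the five candidate operations of the NAS-Bench-201 cell search space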
PRIMITIVES = ['none', 'skip_connect', 'conv_1x1', 'conv_3x3', 'avg_pool_3x3']
class ReLUConvBN(nn.Module):
def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation):
super(ReLUConvBN, self).__init__()
self.op = nn.Sequential(
nn.ReLU(inplace=False),
nn.Conv2d(C_in, C_out, kernel_size, stride=stride,
padding=padding, dilation=dilation, bias=False),
nn.BatchNorm2d(C_out)
)
def forward(self, x):
return self.op(x)
class SepConv(nn.Module):
def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation):
super(SepConv, self).__init__()
self.op = nn.Sequential(
nn.ReLU(inplace=False),
nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride,
padding=padding, dilation=dilation, groups=C_in, bias=False),
nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
nn.BatchNorm2d(C_out),
)
def forward(self, x):
return self.op(x)
class Pooling(nn.Module):
def __init__(self, C_in, C_out, stride, mode):
super(Pooling, self).__init__()
if C_in == C_out:
self.preprocess = None
else:
self.preprocess = ReLUConvBN(C_in, C_out, 1, 1, 0, 1)
if mode == 'avg':
self.op = nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False)
elif mode == 'max':
self.op = nn.MaxPool2d(3, stride=stride, padding=1)
else:
raise ValueError('Invalid mode={:} in Pooling'.format(mode))
def forward(self, x):
if self.preprocess:
x = self.preprocess(x)
return self.op(x)
class Zero(nn.Module):
def __init__(self, C_in, C_out, stride):
super(Zero, self).__init__()
self.C_in = C_in
self.C_out = C_out
self.stride = stride
self.is_zero = True
def forward(self, x):
if self.C_in == self.C_out:
if self.stride == 1:
return x.mul(0.)
else:
return x[:, :, ::self.stride, ::self.stride].mul(0.)
else:
shape = list(x.shape)
shape[1] = self.C_out
zeros = x.new_zeros(shape, dtype=x.dtype, device=x.device)
return zeros
class FactorizedReduce(nn.Module):
def __init__(self, C_in, C_out, stride):
super(FactorizedReduce, self).__init__()
self.stride = stride
self.C_in = C_in
self.C_out = C_out
self.relu = nn.ReLU(inplace=False)
if stride == 2:
C_outs = [C_out // 2, C_out - C_out // 2]
self.convs = nn.ModuleList()
for i in range(2):
self.convs.append(nn.Conv2d(C_in, C_outs[i], 1, stride=stride, padding=0, bias=False))
self.pad = nn.ConstantPad2d((0, 1, 0, 1), 0)
else:
raise ValueError('Invalid stride : {:}'.format(stride))
self.bn = nn.BatchNorm2d(C_out)
def forward(self, x):
x = self.relu(x)
y = self.pad(x)
out = torch.cat([self.convs[0](x), self.convs[1](y[:, :, 1:, 1:])], dim=1)
out = self.bn(out)
return out
class ResNetBasicblock(nn.Module):
def __init__(self, inplanes, planes, stride):
super(ResNetBasicblock, self).__init__()
assert stride == 1 or stride == 2, 'invalid stride {:}'.format(stride)
self.conv_a = ReLUConvBN(inplanes, planes, 3, stride, 1, 1)
self.conv_b = ReLUConvBN(planes, planes, 3, 1, 1, 1)
if stride == 2:
self.downsample = nn.Sequential(
nn.AvgPool2d(kernel_size=2, stride=2, padding=0),
nn.Conv2d(inplanes, planes, kernel_size=1, stride=1, padding=0, bias=False))
elif inplanes != planes:
self.downsample = ReLUConvBN(inplanes, planes, 1, 1, 0, 1)
else:
self.downsample = None
self.in_dim = inplanes
self.out_dim = planes
self.stride = stride
self.num_conv = 2
def forward(self, inputs):
basicblock = self.conv_a(inputs)
basicblock = self.conv_b(basicblock)
if self.downsample is not None:
inputs = self.downsample(inputs) # residual
return inputs + basicblock
import click
import nni
import nni.retiarii.evaluator.pytorch.lightning as pl
import torch.nn as nn
import torchmetrics
from nni.retiarii import model_wrapper, serialize
from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig
from nni.retiarii.nn.pytorch import NasBench201Cell
from nni.retiarii.strategy import Random
from pytorch_lightning.callbacks import LearningRateMonitor
from timm.optim import RMSpropTF
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchvision import transforms
from torchvision.datasets import CIFAR100
from base_ops import ResNetBasicblock, PRIMITIVES, OPS_WITH_STRIDE
@model_wrapper
class NasBench201(nn.Module):
def __init__(self,
stem_out_channels: int = 16,
num_modules_per_stack: int = 5,
num_labels: int = 100):
super().__init__()
self.channels = C = stem_out_channels
self.num_modules = N = num_modules_per_stack
self.num_labels = num_labels
self.stem = nn.Sequential(
nn.Conv2d(3, C, kernel_size=3, padding=1, bias=False),
nn.BatchNorm2d(C)
)
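        # macro skeleton: three stacks of N searchable cells separated by two
        # ResNet basic blocks that halve the resolution and double the channels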
layer_channels = [C] * N + [C * 2] + [C * 2] * N + [C * 4] + [C * 4] * N
layer_reductions = [False] * N + [True] + [False] * N + [True] + [False] * N
C_prev = C
self.cells = nn.ModuleList()
for C_curr, reduction in zip(layer_channels, layer_reductions):
if reduction:
cell = ResNetBasicblock(C_prev, C_curr, 2)
else:
                # bind `prim` as a default argument; otherwise every lambda would capture the same
                # loop variable and resolve to the last primitive after the comprehension finishes
                cell = NasBench201Cell({prim: (lambda C_in, C_out, prim=prim: OPS_WITH_STRIDE[prim](C_in, C_out, 1))
                                        for prim in PRIMITIVES},
                                       C_prev, C_curr, label='cell')
self.cells.append(cell)
C_prev = C_curr
self.lastact = nn.Sequential(
nn.BatchNorm2d(C_prev),
nn.ReLU(inplace=True)
)
self.global_pooling = nn.AdaptiveAvgPool2d(1)
self.classifier = nn.Linear(C_prev, self.num_labels)
def forward(self, inputs):
feature = self.stem(inputs)
for cell in self.cells:
feature = cell(feature)
out = self.lastact(feature)
out = self.global_pooling(out)
out = out.view(out.size(0), -1)
logits = self.classifier(out)
return logits
class AccuracyWithLogits(torchmetrics.Accuracy):
def update(self, pred, target):
        return super().update(nn.functional.softmax(pred, dim=-1), target)
@nni.trace
class NasBench201TrainingModule(pl.LightningModule):
def __init__(self, max_epochs=200, learning_rate=0.1, weight_decay=5e-4):
super().__init__()
self.save_hyperparameters('learning_rate', 'weight_decay', 'max_epochs')
self.criterion = nn.CrossEntropyLoss()
self.accuracy = AccuracyWithLogits()
def forward(self, x):
y_hat = self.model(x)
return y_hat
def training_step(self, batch, batch_idx):
x, y = batch
y_hat = self(x)
loss = self.criterion(y_hat, y)
self.log('train_loss', loss, prog_bar=True)
self.log('train_accuracy', self.accuracy(y_hat, y), prog_bar=True)
return loss
def validation_step(self, batch, batch_idx):
x, y = batch
y_hat = self(x)
self.log('val_loss', self.criterion(y_hat, y), prog_bar=True)
self.log('val_accuracy', self.accuracy(y_hat, y), prog_bar=True)
def configure_optimizers(self):
optimizer = RMSpropTF(self.parameters(), lr=self.hparams.learning_rate,
weight_decay=self.hparams.weight_decay,
momentum=0.9, alpha=0.9, eps=1.0)
return {
'optimizer': optimizer,
            'lr_scheduler': CosineAnnealingLR(optimizer, self.hparams.max_epochs)
}
def on_validation_epoch_end(self):
nni.report_intermediate_result(self.trainer.callback_metrics['val_accuracy'].item())
def teardown(self, stage):
if stage == 'fit':
nni.report_final_result(self.trainer.callback_metrics['val_accuracy'].item())
@click.command()
@click.option('--epochs', default=12, help='Training length.')
@click.option('--batch_size', default=256, help='Batch size.')
@click.option('--port', default=8081, help='On which port the experiment is run.')
@click.option('--benchmark', is_flag=True, default=False)
def _multi_trial_test(epochs, batch_size, port, benchmark):
    # initialize dataset. Note that 50k training + 10k test images are used, which differs slightly from the paper
transf = [
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip()
]
normalize = [
transforms.ToTensor(),
transforms.Normalize([x / 255 for x in [129.3, 124.1, 112.4]], [x / 255 for x in [68.2, 65.4, 70.4]])
]
train_dataset = serialize(CIFAR100, 'data', train=True, download=True, transform=transforms.Compose(transf + normalize))
test_dataset = serialize(CIFAR100, 'data', train=False, transform=transforms.Compose(normalize))
# specify training hyper-parameters
training_module = NasBench201TrainingModule(max_epochs=epochs)
# FIXME: need to fix a bug in serializer for this to work
# lr_monitor = serialize(LearningRateMonitor, logging_interval='step')
trainer = pl.Trainer(max_epochs=epochs, gpus=1)
lightning = pl.Lightning(
lightning_module=training_module,
trainer=trainer,
train_dataloader=pl.DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
val_dataloaders=pl.DataLoader(test_dataset, batch_size=batch_size),
)
strategy = Random()
model = NasBench201()
exp = RetiariiExperiment(model, lightning, [], strategy)
exp_config = RetiariiExeConfig('local')
exp_config.trial_concurrency = 2
exp_config.max_trial_number = 20
exp_config.trial_gpu_number = 1
exp_config.training_service.use_active_gpu = False
if benchmark:
exp_config.benchmark = 'nasbench201-cifar100'
exp_config.execution_engine = 'benchmark'
exp.run(exp_config, port)
if __name__ == '__main__':
_multi_trial_test()
# Tuning Transformer with Retiarii
This demo is adapted from the PyTorch Transformer tutorial.
Here we show how to use the functions provided by Retiarii to tune Transformer's hyper-parameters in order to achieve better performance.
This demo is tested with PyTorch 1.9, torchtext == 0.10, and nni == 2.4.
Please change the configurations (starting on line 196) accordingly and then run: `python retiarii_transformer_demo.py`
We use WikiText-2, a built-in dataset provided by torchtext, to evaluate the Transformer on language modeling. We tune two hyper-parameters: the number of encoder layers (`n_layer`), whose default value in the original paper is 6, and the dropout rate shared by all encoder layers (`p_dropout`), whose default value is 0.1. We report validation perplexity as the metric (lower is better).
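For reference, here is a minimal sketch (mirroring the demo script included below) of how the two hyper-parameters are declared as mutables with `nn.ValueChoice`:

```python
# Sketch only: the full model definition lives in the demo script below.
import nni.retiarii.nn.pytorch as nn  # Retiarii's nn module, not torch.nn

p_dropout = nn.ValueChoice([0.1, 0.2, 0.3, 0.4, 0.5], label='p_dropout')
n_layer = nn.ValueChoice([5, 6, 7, 8, 9], label='n_layer')
encoder = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=512, nhead=8, dim_feedforward=2048, dropout=p_dropout),
    n_layer)
```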
We first tune each hyper-parameter separately, with the other fixed to its default value. The results are:
![separate](https://user-images.githubusercontent.com/22978940/136937420-80aecee9-43cc-4f8d-b282-18aec0ad3929.png)
And then we tune these two hyper-parameters jointly. The results are:
<p align="center">
<img src="https://user-images.githubusercontent.com/22978940/136937807-342fde98-6498-4cdd-abdd-4633fd15b7dc.png" width="700">
</p>
As the plots show, the search finds better hyper-parameters (`n_layer = 8`, `p_dropout = 0.2`) than the default values.
###############################################################
# This demo is adapted from PyTorch Transformer tutorial <https://pytorch.org/tutorials/beginner/transformer_tutorial.html>
# Here we show how we use functions provided by retiarii to tune Transformer's hyper-parameters,
# in order to achieve better performance.
# This demo is tested with PyTorch 1.9, torchtext == 0.10, and nni == 2.4
import torch
import torch.nn.functional as F
import nni.retiarii.nn.pytorch as nn
from nni.retiarii import model_wrapper
import nni
import nni.retiarii.strategy as strategy
from nni.retiarii.evaluator import FunctionalEvaluator
from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig
import math
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
class PositionalEncoding(nn.Module):
def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
super().__init__()
self.dropout = nn.Dropout(p=dropout)
position = torch.arange(max_len).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
pe = torch.zeros(max_len, 1, d_model)
pe[:, 0, 0::2] = torch.sin(position * div_term)
pe[:, 0, 1::2] = torch.cos(position * div_term)
self.register_buffer('pe', pe)
def forward(self, x):
"""
Args:
x: Tensor, with size [seq_len, batch_size, embedding_dim]
"""
x = x + self.pe[:x.size(0)]
return self.dropout(x)
###############################################################
# PyTorch has already provided modules for Transformer: nn.TransformerEncoderLayer and nn.TransformerEncoder,
# so we can use them directly, but note that to enable retiarii functions, we need to replace "import torch.nn as nn"
# with "import nni.retiarii.nn.pytorch as nn".
#
# We use nn.ValueChoice to make the number of encoder layers (the default is 6) and the dropout rate mutable.
# For other hyper-parameters, we follow the setting in the original paper "Attention is All You Need".
@model_wrapper # This decorator should be put on the top level module.
class Transformer(nn.Module):
def __init__(self, n_token: int, n_head: int = 8,
d_model: int = 512, d_ff: int = 2048):
super().__init__()
p_dropout = nn.ValueChoice([0.1, 0.2, 0.3, 0.4, 0.5], label='p_dropout')
n_layer = nn.ValueChoice([5, 6, 7, 8, 9], label='n_layer')
self.encoder = nn.TransformerEncoder(
nn.TransformerEncoderLayer(d_model, n_head, d_ff, p_dropout),
n_layer
)
self.d_model = d_model
self.decoder = nn.Linear(d_model, n_token)
self.embeddings = nn.Embedding(n_token, d_model)
self.position = PositionalEncoding(d_model)
def forward(self, src, src_mask):
"""
Args:
src: Tensor, with size [seq_len, batch_size]
src_mask: Tensor, with size [seq_len, seq_len]
Returns:
output: Tensor, with size [seq_len, batch_size, n_token]
"""
src = self.embeddings(src) * math.sqrt(self.d_model)
src = self.position(src)
output = self.encoder(src, src_mask)
output = self.decoder(output)
return output
###############################################################
# We wrap the whole training procedure in the fit function.
# This function takes one positional argument, model_cls, which represents one exploration (i.e., one trial).
# model_cls is automatically generated and passed in by Retiarii; we instantiate it
# via model = model_cls().
def fit(model_cls):
train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])
def process_data(raw_text_iter):
"""Converts raw text into a flat Tensor."""
data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter]
return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
train_iter, val_iter, _ = WikiText2()
train_data = process_data(train_iter)
val_data = process_data(val_iter)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def generate_batches(data, bsz):
"""Divides the data into bsz separate sequences."""
seq_len = data.size(0) // bsz
data = data[:seq_len * bsz]
data = data.view(bsz, seq_len).t().contiguous()
return data.to(device)
batch_size = 20
eval_batch_size = 10
train_data = generate_batches(train_data, batch_size)
val_data = generate_batches(val_data, eval_batch_size)
seq_len = 35
def get_seq(source, i):
"""
Args:
source: Tensor, with size [full_seq_len, batch_size]
i: int
Returns:
tuple (data, target): data has size [seq_len, batch_size]
and target has size [seq_len * batch_size]
"""
part_len = min(seq_len, len(source) - 1 - i)
data = source[i:i+part_len]
target = source[i+1:i+1+part_len].reshape(-1)
return data, target
def generate_square_subsequent_mask(sz):
"""Generates an upper-triangular matrix of -inf, with zeros on diag."""
return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
model = model_cls().to(device)
lr = 5.0
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
def train(model):
model.train()
src_mask = generate_square_subsequent_mask(seq_len).to(device)
for i in range(0, train_data.size(0) - 1, seq_len):
data, target = get_seq(train_data, i)
part_len = data.size(0)
if part_len != seq_len:
src_mask = src_mask[:part_len, :part_len]
output = model(data, src_mask)
loss = F.cross_entropy(output.view(-1, output.size(-1)), target)
optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
optimizer.step()
def evaluate(model, eval_data):
model.eval()
src_mask = generate_square_subsequent_mask(seq_len).to(device)
total_loss = 0.
with torch.no_grad():
for i in range(0, eval_data.size(0) - 1, seq_len):
data, target = get_seq(eval_data, i)
part_len = data.size(0)
if part_len != seq_len:
src_mask = src_mask[:part_len, :part_len]
output = model(data, src_mask)
output_flat = output.view(-1, output.size(-1))
total_loss += part_len * F.cross_entropy(output_flat, target).item()
return total_loss / (len(eval_data) - 1)
best_val_loss = float('inf')
for epoch in range(20):
train(model)
val_loss = evaluate(model, val_data)
if val_loss < best_val_loss:
best_val_loss = val_loss
scheduler.step()
best_val_ppl = math.exp(best_val_loss)
nni.report_final_result(best_val_ppl) # reports best validation ppl to nni as final result of one trial
if __name__ == "__main__":
train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])
n_token = len(vocab)
base_model = Transformer(n_token)
evaluator = FunctionalEvaluator(fit)
exp = RetiariiExperiment(base_model, evaluator, [], strategy.Random())
exp_config = RetiariiExeConfig('local')
exp_config.experiment_name = 'transformer tuning'
exp_config.trial_concurrency = 3 # please change configurations accordingly
exp_config.max_trial_number = 25
exp_config.trial_gpu_number = 1
exp_config.training_service.use_active_gpu = False
export_formatter = 'dict'
exp.run(exp_config, 8081)
print('Final model:')
for model_code in exp.export_top_models(optimize_mode='minimize', formatter=export_formatter):
print(model_code)
[Documentation](https://nni.readthedocs.io/en/latest/NAS/DARTS.html)
[Documentation (Chinese)](https://nni.readthedocs.io/zh/latest/NAS/DARTS.html)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import numpy as np
import torch
from torchvision import transforms
from torchvision.datasets import CIFAR10
class Cutout(object):
def __init__(self, length):
self.length = length
def __call__(self, img):
h, w = img.size(1), img.size(2)
mask = np.ones((h, w), np.float32)
y = np.random.randint(h)
x = np.random.randint(w)
y1 = np.clip(y - self.length // 2, 0, h)
y2 = np.clip(y + self.length // 2, 0, h)
x1 = np.clip(x - self.length // 2, 0, w)
x2 = np.clip(x + self.length // 2, 0, w)
mask[y1: y2, x1: x2] = 0.
mask = torch.from_numpy(mask)
mask = mask.expand_as(img)
img *= mask
return img
def get_dataset(cls, cutout_length=0):
MEAN = [0.49139968, 0.48215827, 0.44653124]
STD = [0.24703233, 0.24348505, 0.26158768]
transf = [
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip()
]
normalize = [
transforms.ToTensor(),
transforms.Normalize(MEAN, STD)
]
cutout = []
if cutout_length > 0:
cutout.append(Cutout(cutout_length))
train_transform = transforms.Compose(transf + normalize + cutout)
valid_transform = transforms.Compose(normalize)
if cls == "cifar10":
dataset_train = CIFAR10(root="./data", train=True, download=True, transform=train_transform)
dataset_valid = CIFAR10(root="./data", train=False, download=True, transform=valid_transform)
else:
raise NotImplementedError
return dataset_train, dataset_valid
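# A minimal usage sketch (assumption: run standalone as a sanity check; the DARTS retrain script
# further below consumes this module the same way via datasets.get_dataset("cifar10", cutout_length=16)).
if __name__ == "__main__":
    from torch.utils.data import DataLoader
    # Build CIFAR-10 train/valid sets; Cutout(16) is applied to training images only.
    dataset_train, dataset_valid = get_dataset("cifar10", cutout_length=16)
    loader = DataLoader(dataset_train, batch_size=4, shuffle=True, num_workers=0)
    images, labels = next(iter(loader))
    print(images.shape, labels.shape)  # expected: torch.Size([4, 3, 32, 32]) torch.Size([4])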
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from collections import OrderedDict
import torch
import torch.nn as nn
import ops
from nni.retiarii.nn.pytorch import LayerChoice, InputChoice
class AuxiliaryHead(nn.Module):
""" Auxiliary head in 2/3 place of network to let the gradient flow well """
def __init__(self, input_size, C, n_classes):
""" assuming input size 7x7 or 8x8 """
assert input_size in [7, 8]
super().__init__()
self.net = nn.Sequential(
nn.ReLU(inplace=True),
nn.AvgPool2d(5, stride=input_size - 5, padding=0, count_include_pad=False), # 2x2 out
nn.Conv2d(C, 128, kernel_size=1, bias=False),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.Conv2d(128, 768, kernel_size=2, bias=False), # 1x1 out
nn.BatchNorm2d(768),
nn.ReLU(inplace=True)
)
self.linear = nn.Linear(768, n_classes)
def forward(self, x):
out = self.net(x)
out = out.view(out.size(0), -1) # flatten
logits = self.linear(out)
return logits
class Node(nn.Module):
def __init__(self, node_id, num_prev_nodes, channels, num_downsample_connect):
super().__init__()
self.ops = nn.ModuleList()
choice_keys = []
for i in range(num_prev_nodes):
stride = 2 if i < num_downsample_connect else 1
choice_keys.append("{}_p{}".format(node_id, i))
self.ops.append(
LayerChoice(OrderedDict([
("maxpool", ops.PoolBN('max', channels, 3, stride, 1, affine=False)),
("avgpool", ops.PoolBN('avg', channels, 3, stride, 1, affine=False)),
("skipconnect", nn.Identity() if stride == 1 else ops.FactorizedReduce(channels, channels, affine=False)),
("sepconv3x3", ops.SepConv(channels, channels, 3, stride, 1, affine=False)),
("sepconv5x5", ops.SepConv(channels, channels, 5, stride, 2, affine=False)),
("dilconv3x3", ops.DilConv(channels, channels, 3, stride, 2, 2, affine=False)),
("dilconv5x5", ops.DilConv(channels, channels, 5, stride, 4, 2, affine=False))
]), label=choice_keys[-1]))
self.drop_path = ops.DropPath()
self.input_switch = InputChoice(n_candidates=len(choice_keys), n_chosen=2, label="{}_switch".format(node_id))
def forward(self, prev_nodes):
assert len(self.ops) == len(prev_nodes)
out = [op(node) for op, node in zip(self.ops, prev_nodes)]
out = [self.drop_path(o) if o is not None else None for o in out]
return self.input_switch(out)
class Cell(nn.Module):
def __init__(self, n_nodes, channels_pp, channels_p, channels, reduction_p, reduction):
super().__init__()
self.reduction = reduction
self.n_nodes = n_nodes
        # If the previous cell is a reduction cell, the current input size does not match the
        # output size of cell[k-2], so output[k-2] should be downsampled by preprocessing.
if reduction_p:
self.preproc0 = ops.FactorizedReduce(channels_pp, channels, affine=False)
else:
self.preproc0 = ops.StdConv(channels_pp, channels, 1, 1, 0, affine=False)
self.preproc1 = ops.StdConv(channels_p, channels, 1, 1, 0, affine=False)
# generate dag
self.mutable_ops = nn.ModuleList()
for depth in range(2, self.n_nodes + 2):
self.mutable_ops.append(Node("{}_n{}".format("reduce" if reduction else "normal", depth),
depth, channels, 2 if reduction else 0))
def forward(self, s0, s1):
        # s0, s1 are the outputs of the second-previous cell and the previous cell, respectively.
tensors = [self.preproc0(s0), self.preproc1(s1)]
for node in self.mutable_ops:
cur_tensor = node(tensors)
tensors.append(cur_tensor)
output = torch.cat(tensors[2:], dim=1)
return output
class CNN(nn.Module):
def __init__(self, input_size, in_channels, channels, n_classes, n_layers, n_nodes=4,
stem_multiplier=3, auxiliary=False):
super().__init__()
self.in_channels = in_channels
self.channels = channels
self.n_classes = n_classes
self.n_layers = n_layers
self.aux_pos = 2 * n_layers // 3 if auxiliary else -1
c_cur = stem_multiplier * self.channels
self.stem = nn.Sequential(
nn.Conv2d(in_channels, c_cur, 3, 1, 1, bias=False),
nn.BatchNorm2d(c_cur)
)
# for the first cell, stem is used for both s0 and s1
        # [!] channels_pp and channels_p are output channel sizes, while c_cur is the input channel size.
channels_pp, channels_p, c_cur = c_cur, c_cur, channels
self.cells = nn.ModuleList()
reduction_p, reduction = False, False
for i in range(n_layers):
reduction_p, reduction = reduction, False
            # Reduce feature map size and double channels at 1/3 and 2/3 of the layers.
if i in [n_layers // 3, 2 * n_layers // 3]:
c_cur *= 2
reduction = True
cell = Cell(n_nodes, channels_pp, channels_p, c_cur, reduction_p, reduction)
self.cells.append(cell)
c_cur_out = c_cur * n_nodes
channels_pp, channels_p = channels_p, c_cur_out
if i == self.aux_pos:
self.aux_head = AuxiliaryHead(input_size // 4, channels_p, n_classes)
self.gap = nn.AdaptiveAvgPool2d(1)
self.linear = nn.Linear(channels_p, n_classes)
def forward(self, x):
s0 = s1 = self.stem(x)
aux_logits = None
for i, cell in enumerate(self.cells):
s0, s1 = s1, cell(s0, s1)
if i == self.aux_pos and self.training:
aux_logits = self.aux_head(s1)
out = self.gap(s1)
out = out.view(out.size(0), -1) # flatten
logits = self.linear(out)
if aux_logits is not None:
return logits, aux_logits
return logits
def drop_path_prob(self, p):
for module in self.modules():
if isinstance(module, ops.DropPath):
module.p = p
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch
import torch.nn as nn
class DropPath(nn.Module):
def __init__(self, p=0.):
"""
Drop path with probability.
Parameters
----------
p : float
            Probability of a path being zeroed.
"""
super().__init__()
self.p = p
def forward(self, x):
if self.training and self.p > 0.:
keep_prob = 1. - self.p
# per data point mask
mask = torch.zeros((x.size(0), 1, 1, 1), device=x.device).bernoulli_(keep_prob)
return x / keep_prob * mask
return x
class PoolBN(nn.Module):
"""
AvgPool or MaxPool with BN. `pool_type` must be `max` or `avg`.
"""
def __init__(self, pool_type, C, kernel_size, stride, padding, affine=True):
super().__init__()
if pool_type.lower() == 'max':
self.pool = nn.MaxPool2d(kernel_size, stride, padding)
elif pool_type.lower() == 'avg':
self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False)
else:
            raise ValueError("pool_type must be 'max' or 'avg'")
self.bn = nn.BatchNorm2d(C, affine=affine)
def forward(self, x):
out = self.pool(x)
out = self.bn(out)
return out
class StdConv(nn.Module):
"""
Standard conv: ReLU - Conv - BN
"""
def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
super().__init__()
self.net = nn.Sequential(
nn.ReLU(),
nn.Conv2d(C_in, C_out, kernel_size, stride, padding, bias=False),
nn.BatchNorm2d(C_out, affine=affine)
)
def forward(self, x):
return self.net(x)
class FacConv(nn.Module):
"""
Factorized conv: ReLU - Conv(Kx1) - Conv(1xK) - BN
"""
def __init__(self, C_in, C_out, kernel_length, stride, padding, affine=True):
super().__init__()
self.net = nn.Sequential(
nn.ReLU(),
nn.Conv2d(C_in, C_in, (kernel_length, 1), stride, padding, bias=False),
nn.Conv2d(C_in, C_out, (1, kernel_length), stride, padding, bias=False),
nn.BatchNorm2d(C_out, affine=affine)
)
def forward(self, x):
return self.net(x)
class DilConv(nn.Module):
"""
(Dilated) depthwise separable conv.
ReLU - (Dilated) depthwise separable - Pointwise - BN.
If dilation == 2, 3x3 conv => 5x5 receptive field, 5x5 conv => 9x9 receptive field.
"""
def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True):
super().__init__()
self.net = nn.Sequential(
nn.ReLU(),
nn.Conv2d(C_in, C_in, kernel_size, stride, padding, dilation=dilation, groups=C_in,
bias=False),
nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False),
nn.BatchNorm2d(C_out, affine=affine)
)
def forward(self, x):
return self.net(x)
class SepConv(nn.Module):
"""
Depthwise separable conv.
DilConv(dilation=1) * 2.
"""
def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
super().__init__()
self.net = nn.Sequential(
DilConv(C_in, C_in, kernel_size, stride, padding, dilation=1, affine=affine),
DilConv(C_in, C_out, kernel_size, 1, padding, dilation=1, affine=affine)
)
def forward(self, x):
return self.net(x)
class FactorizedReduce(nn.Module):
"""
    Reduce feature map size by factorized pointwise convolution (stride=2).
"""
def __init__(self, C_in, C_out, affine=True):
super().__init__()
self.relu = nn.ReLU()
self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
self.bn = nn.BatchNorm2d(C_out, affine=affine)
def forward(self, x):
x = self.relu(x)
out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1)
out = self.bn(out)
return out
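# An illustrative sketch (assumption: run standalone) of DropPath behaviour: in training mode each
# sample's path is zeroed with probability p and the survivors are rescaled by 1 / (1 - p); in eval
# mode the input passes through unchanged.
if __name__ == "__main__":
    x = torch.ones(8, 3, 4, 4)
    drop = DropPath(p=0.5)
    drop.train()
    y = drop(x)  # per sample: either all zeros or all values scaled to 2.0
    drop.eval()
    z = drop(x)  # identity in eval mode
    print(y[:, 0, 0, 0].tolist(), torch.equal(z, x))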
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import time
from argparse import ArgumentParser
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import datasets
import utils
from model import CNN
from nni.nas.pytorch.utils import AverageMeter
from nni.retiarii import fixed_arch
logger = logging.getLogger('nni')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
writer = SummaryWriter()
def train(config, train_loader, model, optimizer, criterion, epoch):
top1 = AverageMeter("top1")
top5 = AverageMeter("top5")
losses = AverageMeter("losses")
cur_step = epoch * len(train_loader)
cur_lr = optimizer.param_groups[0]["lr"]
logger.info("Epoch %d LR %.6f", epoch, cur_lr)
writer.add_scalar("lr", cur_lr, global_step=cur_step)
model.train()
for step, (x, y) in enumerate(train_loader):
x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
bs = x.size(0)
optimizer.zero_grad()
logits, aux_logits = model(x)
loss = criterion(logits, y)
if config.aux_weight > 0.:
loss += config.aux_weight * criterion(aux_logits, y)
loss.backward()
# gradient clipping
nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
optimizer.step()
accuracy = utils.accuracy(logits, y, topk=(1, 5))
losses.update(loss.item(), bs)
top1.update(accuracy["acc1"], bs)
top5.update(accuracy["acc5"], bs)
writer.add_scalar("loss/train", loss.item(), global_step=cur_step)
writer.add_scalar("acc1/train", accuracy["acc1"], global_step=cur_step)
writer.add_scalar("acc5/train", accuracy["acc5"], global_step=cur_step)
if step % config.log_frequency == 0 or step == len(train_loader) - 1:
logger.info(
"Train: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
"Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
epoch + 1, config.epochs, step, len(train_loader) - 1, losses=losses,
top1=top1, top5=top5))
cur_step += 1
logger.info("Train: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, config.epochs, top1.avg))
def validate(config, valid_loader, model, criterion, epoch, cur_step):
top1 = AverageMeter("top1")
top5 = AverageMeter("top5")
losses = AverageMeter("losses")
model.eval()
with torch.no_grad():
for step, (X, y) in enumerate(valid_loader):
X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)
bs = X.size(0)
logits = model(X)
loss = criterion(logits, y)
accuracy = utils.accuracy(logits, y, topk=(1, 5))
losses.update(loss.item(), bs)
top1.update(accuracy["acc1"], bs)
top5.update(accuracy["acc5"], bs)
if step % config.log_frequency == 0 or step == len(valid_loader) - 1:
logger.info(
"Valid: [{:3d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
"Prec@(1,5) ({top1.avg:.1%}, {top5.avg:.1%})".format(
epoch + 1, config.epochs, step, len(valid_loader) - 1, losses=losses,
top1=top1, top5=top5))
writer.add_scalar("loss/test", losses.avg, global_step=cur_step)
writer.add_scalar("acc1/test", top1.avg, global_step=cur_step)
writer.add_scalar("acc5/test", top5.avg, global_step=cur_step)
logger.info("Valid: [{:3d}/{}] Final Prec@1 {:.4%}".format(epoch + 1, config.epochs, top1.avg))
return top1.avg
if __name__ == "__main__":
parser = ArgumentParser("darts")
parser.add_argument("--layers", default=20, type=int)
parser.add_argument("--batch-size", default=96, type=int)
parser.add_argument("--log-frequency", default=10, type=int)
parser.add_argument("--epochs", default=600, type=int)
parser.add_argument("--aux-weight", default=0.4, type=float)
parser.add_argument("--drop-path-prob", default=0.2, type=float)
parser.add_argument("--workers", default=4)
parser.add_argument("--grad-clip", default=5., type=float)
parser.add_argument("--arc-checkpoint", default="./checkpoints/epoch_0.json")
args = parser.parse_args()
dataset_train, dataset_valid = datasets.get_dataset("cifar10", cutout_length=16)
with fixed_arch(args.arc_checkpoint):
model = CNN(32, 3, 36, 10, args.layers, auxiliary=True)
criterion = nn.CrossEntropyLoss()
model.to(device)
criterion.to(device)
optimizer = torch.optim.SGD(model.parameters(), 0.025, momentum=0.9, weight_decay=3.0E-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, eta_min=1E-6)
train_loader = torch.utils.data.DataLoader(dataset_train,
batch_size=args.batch_size,
shuffle=True,
num_workers=args.workers,
pin_memory=True)
valid_loader = torch.utils.data.DataLoader(dataset_valid,
batch_size=args.batch_size,
shuffle=False,
num_workers=args.workers,
pin_memory=True)
best_top1 = 0.
for epoch in range(args.epochs):
drop_prob = args.drop_path_prob * epoch / args.epochs
model.drop_path_prob(drop_prob)
# training
train(args, train_loader, model, optimizer, criterion, epoch)
# validation
cur_step = (epoch + 1) * len(train_loader)
top1 = validate(args, valid_loader, model, criterion, epoch, cur_step)
best_top1 = max(best_top1, top1)
lr_scheduler.step()
logger.info("Final best Prec@1 = {:.4%}".format(best_top1))