Unverified commit ff1af7f2 authored by liuzhe-lz, committed by GitHub

Merge pull request #3029 from liuzhe-lz/v2.0-merge

Merge master into v2.0
parents e21a6984 3b90b9d9
docs/img/webui-img/search-trial.png: binary image replaced (21.6 KB -> 18 KB)
docs/img/webui-img/select-trial.png: binary image replaced (22.3 KB -> 22.7 KB)
# AMCPruner Example
This example shows how to use AMCPruner to prune a model.
## Step 1: train a model for pruning
Run the following command to train a MobileNetV2 model:
```bash
python3 amc_train.py --model_type mobilenetv2 --n_epoch 50
```
Once training finishes, the saved checkpoint file can be found at:
```
logs/mobilenetv2_cifar10_train-run1/ckpt.best.pth
```
## Step 2: prune the model with AMCPruner
Run the following command to prune the trained model:
```bash
python3 amc_search.py --model_type mobilenetv2 --ckpt logs/mobilenetv2_cifar10_train-run1/ckpt.best.pth
```
Once the search finishes, the pruned model and mask can be found in:
```
logs/mobilenetv2_cifar10_r0.5_search-run2
```
## Step 3: finetune the pruned model
Run `amc_train.py` again with the `--ckpt` and `--mask` options to speed up and finetune the pruned model:
```bash
python3 amc_train.py --model_type mobilenetv2 --ckpt logs/mobilenetv2_cifar10_r0.5_search-run2/best_model.pth --mask logs/mobilenetv2_cifar10_r0.5_search-run2/best_mask.pth --n_epoch 100
```
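Under the hood, the finetune run rebuilds the network, loads the pruned weights, and applies the mask with `ModelSpeedup` before training resumes. A minimal sketch of that step, assuming the CIFAR-10 input size of 32x32 and the run directory shown above (paths are illustrative; the real logic lives in `amc_train.py`):
```python
import torch
from nni.compression.torch import ModelSpeedup
from mobilenet_v2 import MobileNetV2  # model definition shipped with this example

# Illustrative paths -- substitute the run directory produced by amc_search.py.
ckpt_path = 'logs/mobilenetv2_cifar10_r0.5_search-run2/best_model.pth'
mask_path = 'logs/mobilenetv2_cifar10_r0.5_search-run2/best_mask.pth'

net = MobileNetV2(n_class=10)
net.load_state_dict(torch.load(ckpt_path, map_location=torch.device('cpu')))

# Dummy input matching CIFAR-10 (use 224x224 for ImageNet).
dummy_input = torch.randn(2, 3, 32, 32)
ModelSpeedup(net, dummy_input, mask_path, torch.device('cpu')).speedup_model()
# `net` is now physically smaller and ready for finetuning.
```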
@@ -20,7 +20,7 @@ def parse_args():
                         help='model to prune')
     parser.add_argument('--dataset', default='cifar10', type=str, choices=['cifar10', 'imagenet'], help='dataset to use (cifar/imagenet)')
     parser.add_argument('--batch_size', default=50, type=int, help='number of data batch size')
-    parser.add_argument('--data_root', default='./cifar10', type=str, help='dataset path')
+    parser.add_argument('--data_root', default='./data', type=str, help='dataset path')
     parser.add_argument('--flops_ratio', default=0.5, type=float, help='target flops ratio to preserve of the model')
     parser.add_argument('--lbound', default=0.2, type=float, help='minimum sparsity')
     parser.add_argument('--rbound', default=1., type=float, help='maximum sparsity')
...
@@ -13,6 +13,7 @@ import torch
 import torch.nn as nn
 import torch.optim as optim
 from tensorboardX import SummaryWriter
+from torchvision.models import resnet
 from nni.compression.torch.pruning.amc.lib.net_measure import measure_model
 from nni.compression.torch.pruning.amc.lib.utils import get_output_folder
@@ -27,7 +28,9 @@ from mobilenet_v2 import MobileNetV2
 def parse_args():
     parser = argparse.ArgumentParser(description='AMC train / fine-tune script')
-    parser.add_argument('--model_type', default='mobilenet', type=str, help='name of the model to train')
+    parser.add_argument('--model_type', default='mobilenet', type=str,
+                        choices=['mobilenet', 'mobilenetv2', 'resnet18', 'resnet34', 'resnet50'],
+                        help='name of the model to train')
     parser.add_argument('--dataset', default='cifar10', type=str, help='name of the dataset to train')
     parser.add_argument('--lr', default=0.05, type=float, help='learning rate')
     parser.add_argument('--n_gpu', default=4, type=int, help='number of GPUs to use')
@@ -62,17 +65,21 @@ def get_model(args):
         net = MobileNet(n_class=n_class)
     elif args.model_type == 'mobilenetv2':
         net = MobileNetV2(n_class=n_class)
+    elif args.model_type.startswith('resnet'):
+        net = resnet.__dict__[args.model_type](pretrained=True)
+        in_features = net.fc.in_features
+        net.fc = nn.Linear(in_features, n_class)
     else:
         raise NotImplementedError
     if args.ckpt_path is not None:
         # the checkpoint can be state_dict exported by amc_search.py or saved by amc_train.py
         print('=> Loading checkpoint {} ..'.format(args.ckpt_path))
-        net.load_state_dict(torch.load(args.ckpt_path))
+        net.load_state_dict(torch.load(args.ckpt_path, torch.device('cpu')))
         if args.mask_path is not None:
             SZ = 224 if args.dataset == 'imagenet' else 32
             data = torch.randn(2, 3, SZ, SZ)
-            ms = ModelSpeedup(net, data, args.mask_path)
+            ms = ModelSpeedup(net, data, args.mask_path, torch.device('cpu'))
             ms.speedup_model()

     net.to(args.device)
@@ -179,11 +186,11 @@ def adjust_learning_rate(optimizer, epoch):
     return lr

 def save_checkpoint(state, is_best, checkpoint_dir='.'):
-    filename = os.path.join(checkpoint_dir, 'ckpt.pth.tar')
+    filename = os.path.join(checkpoint_dir, 'ckpt.pth')
     print('=> Saving checkpoint to {}'.format(filename))
     torch.save(state, filename)
     if is_best:
-        shutil.copyfile(filename, filename.replace('.pth.tar', '.best.pth.tar'))
+        shutil.copyfile(filename, filename.replace('.pth', '.best.pth'))

 if __name__ == '__main__':
     args = parse_args()
...
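The new `resnet*` branch above follows a common torchvision pattern: load an ImageNet-pretrained backbone and swap the final fully connected layer for the target number of classes. A short sketch in isolation (the model name and class count are illustrative, not taken from the diff):
```python
import torch.nn as nn
from torchvision.models import resnet

# Load an ImageNet-pretrained backbone, then replace the classification head
# so it matches the target dataset (10 classes for CIFAR-10 here).
net = resnet.__dict__['resnet18'](pretrained=True)
net.fc = nn.Linear(net.fc.in_features, 10)
```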
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 import tensorflow as tf
 from tensorflow.keras import Model
 from tensorflow.keras.layers import (AveragePooling2D, BatchNormalization, Conv2D, Dense, MaxPool2D)
@@ -7,8 +10,6 @@ from tensorflow.keras.optimizers import SGD
 from nni.nas.tensorflow.mutables import LayerChoice, InputChoice
 from nni.nas.tensorflow.enas import EnasTrainer

-tf.get_logger().setLevel('ERROR')
-
 class Net(Model):
     def __init__(self):
@@ -53,35 +54,36 @@ class Net(Model):
         return x

-def accuracy(output, target):
-    bs = target.shape[0]
-    predicted = tf.cast(tf.argmax(output, 1), target.dtype)
-    target = tf.reshape(target, [-1])
-    return sum(tf.cast(predicted == target, tf.float32)) / bs
+def accuracy(truth, logits):
+    truth = tf.reshape(truth, -1)
+    predicted = tf.cast(tf.math.argmax(logits, axis=1), truth.dtype)
+    equal = tf.cast(predicted == truth, tf.int32)
+    return tf.math.reduce_sum(equal).numpy() / equal.shape[0]
+
+def accuracy_metrics(truth, logits):
+    acc = accuracy(truth, logits)
+    return {'accuracy': acc}

 if __name__ == '__main__':
     cifar10 = tf.keras.datasets.cifar10
-    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
-    x_train, x_test = x_train / 255.0, x_test / 255.0
-    split = int(len(x_train) * 0.9)
-    dataset_train = tf.data.Dataset.from_tensor_slices((x_train[:split], y_train[:split])).batch(64)
-    dataset_valid = tf.data.Dataset.from_tensor_slices((x_train[split:], y_train[split:])).batch(64)
-    dataset_test = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(64)
+    (x_train, y_train), (x_valid, y_valid) = cifar10.load_data()
+    x_train, x_valid = x_train / 255.0, x_valid / 255.0
+    train_set = (x_train, y_train)
+    valid_set = (x_valid, y_valid)

     net = Net()
     trainer = EnasTrainer(
         net,
-        loss=SparseCategoricalCrossentropy(reduction=Reduction.SUM),
-        metrics=accuracy,
+        loss=SparseCategoricalCrossentropy(from_logits=True, reduction=Reduction.NONE),
+        metrics=accuracy_metrics,
         reward_function=accuracy,
         optimizer=SGD(learning_rate=0.001, momentum=0.9),
         batch_size=64,
         num_epochs=2,
-        dataset_train=dataset_train,
-        dataset_valid=dataset_valid,
-        dataset_test=dataset_test
+        dataset_train=train_set,
+        dataset_valid=valid_set
     )
     trainer.train()
+    #trainer.export('checkpoint')
@@ -45,6 +45,7 @@ if __name__ == "__main__":
     torch.backends.cudnn.deterministic = True

     model = ShuffleNetV2OneShot()
+    flops_func = model.get_candidate_flops
     if args.load_checkpoint:
         if not args.spos_preprocessing:
             logger.warning("You might want to use SPOS preprocessing if you are loading their checkpoints.")
@@ -52,7 +53,7 @@ if __name__ == "__main__":
     model.cuda()
     if torch.cuda.device_count() > 1:  # exclude last gpu, saving for data preprocessing on gpu
         model = nn.DataParallel(model, device_ids=list(range(0, torch.cuda.device_count() - 1)))
-    mutator = SPOSSupernetTrainingMutator(model, flops_func=model.module.get_candidate_flops,
+    mutator = SPOSSupernetTrainingMutator(model, flops_func=flops_func,
                                           flops_lb=290E6, flops_ub=360E6)
     criterion = CrossEntropyLabelSmooth(1000, args.label_smoothing)
     optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate,
...
@@ -17,9 +17,9 @@ tuner:
 trial:
   command: python3 mnist.py
   codeDir: .
-  computeTarget: ${replace_to_your_computeTarget}
   image: msranni/nni
 amlConfig:
   subscriptionId: ${replace_to_your_subscriptionId}
   resourceGroup: ${replace_to_your_resourceGroup}
   workspaceName: ${replace_to_your_workspaceName}
+  computeTarget: ${replace_to_your_computeTarget}
@@ -17,9 +17,9 @@ tuner:
 trial:
   command: python3 mnist.py
   codeDir: .
-  computeTarget: ${replace_to_your_computeTarget}
   image: msranni/nni
amlConfig:
   subscriptionId: ${replace_to_your_subscriptionId}
   resourceGroup: ${replace_to_your_resourceGroup}
   workspaceName: ${replace_to_your_workspaceName}
+  computeTarget: ${replace_to_your_computeTarget}
@@ -13,21 +13,29 @@ from .mutator import EnasMutator

 logger = logging.getLogger(__name__)

-log_frequency = 100
-entropy_weight = 0.0001
-skip_weight = 0.8
-baseline_decay = 0.999
-child_steps = 500
-mutator_lr = 0.00035
-mutator_steps = 50
-mutator_steps_aggregate = 20
-aux_weight = 0.4
-test_arc_per_epoch = 1

 class EnasTrainer:
-    def __init__(self, model, loss, metrics, reward_function, optimizer, batch_size, num_epochs,
-                 dataset_train, dataset_valid):
+    def __init__(
+        self,
+        model,
+        loss,
+        metrics,
+        reward_function,
+        optimizer,
+        batch_size,
+        num_epochs,
+        dataset_train,
+        dataset_valid,
+        log_frequency=100,
+        entropy_weight=0.0001,
+        skip_weight=0.8,
+        baseline_decay=0.999,
+        child_steps=500,
+        mutator_lr=0.00035,
+        mutator_steps=50,
+        mutator_steps_aggregate=20,
+        aux_weight=0.4,
+        test_arc_per_epoch=1,
+    ):
         self.model = model
         self.loss = loss
         self.metrics = metrics
@@ -42,11 +50,21 @@ class EnasTrainer:
         self.valid_set = tf.data.Dataset.from_tensor_slices((x[split:], y[split:]))
         self.test_set = tf.data.Dataset.from_tensor_slices(dataset_valid)

-        self.mutator = EnasMutator(model)
-        self.mutator_optim = Adam(learning_rate=mutator_lr)
+        self.log_frequency = log_frequency
+        self.entropy_weight = entropy_weight
+        self.skip_weight = skip_weight
+        self.baseline_decay = baseline_decay
+        self.child_steps = child_steps
+        self.mutator_lr = mutator_lr
+        self.mutator_steps = mutator_steps
+        self.mutator_steps_aggregate = mutator_steps_aggregate
+        self.aux_weight = aux_weight
+        self.test_arc_per_epoch = test_arc_per_epoch

-        self.baseline = 0.
+        self.mutator = EnasMutator(model)
+        self.mutator_optim = Adam(learning_rate=self.mutator_lr)
+        self.baseline = 0.0

     def train(self, validate=True):
         for epoch in range(self.num_epochs):
@@ -58,14 +76,13 @@ class EnasTrainer:
     def validate(self):
         self.validate_one_epoch(-1)

     def train_one_epoch(self, epoch):
         train_loader, valid_loader = self._create_train_loader()

         # Sample model and train
         meters = AverageMeterGroup()
-        for step in range(1, child_steps + 1):
+        for step in range(1, self.child_steps + 1):
             x, y = next(train_loader)
             self.mutator.reset()
@@ -75,64 +92,88 @@ class EnasTrainer:
                     logits, aux_logits = logits
                     aux_loss = self.loss(aux_logits, y)
                 else:
-                    aux_loss = 0.
+                    aux_loss = 0.0
                 metrics = self.metrics(y, logits)
-                loss = self.loss(y, logits) + aux_weight * aux_loss
+                loss = self.loss(y, logits) + self.aux_weight * aux_loss

             grads = tape.gradient(loss, self.model.trainable_weights)
             grads = fill_zero_grads(grads, self.model.trainable_weights)
             grads, _ = tf.clip_by_global_norm(grads, 5.0)
             self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))

-            metrics['loss'] = tf.reduce_mean(loss).numpy()
+            metrics["loss"] = tf.reduce_mean(loss).numpy()
             meters.update(metrics)

-            if log_frequency and step % log_frequency == 0:
-                logger.info("Model Epoch [%d/%d] Step [%d/%d] %s", epoch + 1,
-                            self.num_epochs, step, child_steps, meters)
+            if self.log_frequency and step % self.log_frequency == 0:
+                logger.info(
+                    "Model Epoch [%d/%d] Step [%d/%d] %s",
+                    epoch + 1,
+                    self.num_epochs,
+                    step,
+                    self.child_steps,
+                    meters,
+                )

         # Train sampler (mutator)
         meters = AverageMeterGroup()
-        for mutator_step in range(1, mutator_steps + 1):
+        for mutator_step in range(1, self.mutator_steps + 1):
             grads_list = []
-            for step in range(1, mutator_steps_aggregate + 1):
+            for step in range(1, self.mutator_steps_aggregate + 1):
                 with tf.GradientTape() as tape:
                     x, y = next(valid_loader)
                     self.mutator.reset()

                     logits = self.model(x, training=False)
                     metrics = self.metrics(y, logits)
-                    reward = self.reward_function(y, logits) + entropy_weight * self.mutator.sample_entropy
-                    self.baseline = self.baseline * baseline_decay + reward * (1 - baseline_decay)
+                    reward = (
+                        self.reward_function(y, logits)
+                        + self.entropy_weight * self.mutator.sample_entropy
+                    )
+                    self.baseline = self.baseline * self.baseline_decay + reward * (
+                        1 - self.baseline_decay
+                    )
                     loss = self.mutator.sample_log_prob * (reward - self.baseline)
-                    loss += skip_weight * self.mutator.sample_skip_penalty
+                    loss += self.skip_weight * self.mutator.sample_skip_penalty

-                meters.update({
-                    'reward': reward,
-                    'loss': tf.reduce_mean(loss).numpy(),
-                    'ent': self.mutator.sample_entropy.numpy(),
-                    'log_prob': self.mutator.sample_log_prob.numpy(),
-                    'baseline': self.baseline,
-                    'skip': self.mutator.sample_skip_penalty,
-                })
+                meters.update(
+                    {
+                        "reward": reward,
+                        "loss": tf.reduce_mean(loss).numpy(),
+                        "ent": self.mutator.sample_entropy.numpy(),
+                        "log_prob": self.mutator.sample_log_prob.numpy(),
+                        "baseline": self.baseline,
+                        "skip": self.mutator.sample_skip_penalty,
+                    }
+                )

-                cur_step = step + (mutator_step - 1) * mutator_steps_aggregate
-                if log_frequency and cur_step % log_frequency == 0:
-                    logger.info("RL Epoch [%d/%d] Step [%d/%d] [%d/%d] %s", epoch + 1, self.num_epochs,
-                                mutator_step, mutator_steps, step, mutator_steps_aggregate,
-                                meters)
+                cur_step = step + (mutator_step - 1) * self.mutator_steps_aggregate
+                if self.log_frequency and cur_step % self.log_frequency == 0:
+                    logger.info(
+                        "RL Epoch [%d/%d] Step [%d/%d] [%d/%d] %s",
+                        epoch + 1,
+                        self.num_epochs,
+                        mutator_step,
+                        self.mutator_steps,
+                        step,
+                        self.mutator_steps_aggregate,
+                        meters,
+                    )

                 grads = tape.gradient(loss, self.mutator.trainable_weights)
                 grads = fill_zero_grads(grads, self.mutator.trainable_weights)
                 grads_list.append(grads)
-            total_grads = [tf.math.add_n(weight_grads) for weight_grads in zip(*grads_list)]
+            total_grads = [
+                tf.math.add_n(weight_grads) for weight_grads in zip(*grads_list)
+            ]
             total_grads, _ = tf.clip_by_global_norm(total_grads, 5.0)
-            self.mutator_optim.apply_gradients(zip(total_grads, self.mutator.trainable_weights))
+            self.mutator_optim.apply_gradients(
+                zip(total_grads, self.mutator.trainable_weights)
+            )

     def validate_one_epoch(self, epoch):
         test_loader = self._create_validate_loader()
-        for arc_id in range(test_arc_per_epoch):
+        for arc_id in range(self.test_arc_per_epoch):
             meters = AverageMeterGroup()
             for x, y in test_loader:
                 self.mutator.reset()
@@ -141,13 +182,17 @@ class EnasTrainer:
                     logits, _ = logits
                 metrics = self.metrics(y, logits)
                 loss = self.loss(y, logits)
-                metrics['loss'] = tf.reduce_mean(loss).numpy()
+                metrics["loss"] = tf.reduce_mean(loss).numpy()
                 meters.update(metrics)

-            logger.info("Test Epoch [%d/%d] Arc [%d/%d] Summary %s",
-                        epoch + 1, self.num_epochs, arc_id + 1, test_arc_per_epoch,
-                        meters.summary())
+            logger.info(
+                "Test Epoch [%d/%d] Arc [%d/%d] Summary %s",
+                epoch + 1,
+                self.num_epochs,
+                arc_id + 1,
+                self.test_arc_per_epoch,
+                meters.summary(),
+            )

     def _create_train_loader(self):
         train_set = self.train_set.shuffle(1000000).repeat().batch(self.batch_size)
...
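With this change the former module-level constants become keyword arguments, so the ENAS hyperparameters can be set per trainer instance. A hedged usage sketch, reusing `net`, `accuracy`, `accuracy_metrics`, `train_set` and `valid_set` from the CIFAR-10 example above (the overridden values simply repeat the defaults):
```python
# Sketch only: `net`, `accuracy`, `accuracy_metrics`, `train_set` and `valid_set`
# are assumed to be defined as in the CIFAR-10 example earlier on this page.
from tensorflow.keras.losses import Reduction, SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import SGD
from nni.nas.tensorflow.enas import EnasTrainer

trainer = EnasTrainer(
    net,
    loss=SparseCategoricalCrossentropy(from_logits=True, reduction=Reduction.NONE),
    metrics=accuracy_metrics,
    reward_function=accuracy,
    optimizer=SGD(learning_rate=0.001, momentum=0.9),
    batch_size=64,
    num_epochs=2,
    dataset_train=train_set,
    dataset_valid=valid_set,
    # Hyperparameters that used to be hard-coded module constants:
    child_steps=500,
    mutator_lr=0.00035,
    mutator_steps=50,
    log_frequency=100,
)
trainer.train()
```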
@@ -31,7 +31,7 @@ class Nb201TrialConfig(Model):
     Dataset used for training and evaluation. NAS-Bench-201 provides the following 4 options:
     ``cifar10-valid`` (training data is splited into 25k for training and 25k for validation,
     validation data is used for test), ``cifar10`` (training data is used in training, validation
-    data is splited into 25k for validation and 25k for testing), ``cifar100`` (same protocol as ``cifar10``),
+    data is splited into 5k for validation and 5k for testing), ``cifar100`` (same protocol as ``cifar10``),
     and ``imagenet16-120`` (a subset of 120 classes in ImageNet, downscaled to 16x16, using training data
     for training, 6k images from validation set for validation and the other 6k for testing).
     """
...
@@ -66,7 +66,12 @@ class Mutator(BaseMutator):
         if reduction_type == 'mean':
             return sum(tensor_list) / len(tensor_list)
         if reduction_type == 'concat':
-            return tf.concat(tensor_list, axis=0)
+            image_data_format = tf.keras.backend.image_data_format()
+            if image_data_format == "channels_first":
+                axis = 0
+            else:
+                axis = -1
+            return tf.concat(tensor_list, axis=axis)
         raise ValueError('Unrecognized reduction policy: "{}'.format(reduction_type))

     def _get_decision(self, mutable):
...
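The concat reduction now derives the concatenation axis from the Keras image data format instead of always using axis 0. A small standalone illustration (tensor shapes are made up, with no batch dimension, matching the axis convention in the code above):
```python
import tensorflow as tf

# Two candidate feature maps without a batch dimension:
# channels_last layout is (height, width, channels).
a = tf.zeros([16, 16, 32])
b = tf.zeros([16, 16, 64])

# Keras defaults to 'channels_last', so the channel axis is the last one;
# with 'channels_first' ((channels, height, width)) it would be axis 0.
axis = 0 if tf.keras.backend.image_data_format() == 'channels_first' else -1
merged = tf.concat([a, b], axis=axis)
print(merged.shape)  # (16, 16, 96): concatenation happens on the channel axis
```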
@@ -18,6 +18,11 @@ log_level_map = {
 _time_format = '%m/%d/%Y, %I:%M:%S %p'

+# FIXME
+# This hotfix the bug that querying installed tuners with `package_utils` will activate dispatcher logger.
+# This behavior depends on underlying implementation of `nnictl` and is likely to break in future.
+_logger_initialized = False
+
 class _LoggerFileWrapper(TextIOBase):
     def __init__(self, logger_file):
         self.file = logger_file
@@ -34,6 +39,11 @@ def init_logger(logger_file_path, log_level_name='info'):
     This will redirect anything from logging.getLogger() as well as stdout to specified file.
     logger_file_path: path of logger file (path-like object).
     """
+    global _logger_initialized
+    if _logger_initialized:
+        return
+    _logger_initialized = True
+
     if os.environ.get('NNI_PLATFORM') == 'unittest':
         return  # fixme: launching logic needs refactor
@@ -59,6 +69,11 @@ def init_standalone_logger():
     Initialize root logger for standalone mode.
     This will set NNI's log level to INFO and print its log to stdout.
     """
+    global _logger_initialized
+    if _logger_initialized:
+        return
+    _logger_initialized = True
+
     fmt = '[%(asctime)s] %(levelname)s (%(name)s) %(message)s'
     formatter = logging.Formatter(fmt, _time_format)
     handler = logging.StreamHandler(sys.stdout)
...
@@ -373,6 +373,11 @@ def set_experiment(experiment_config, mode, port, config_file_name):
             {'key': 'frameworkcontroller_config', 'value': experiment_config['frameworkcontrollerConfig']})
         request_data['clusterMetaData'].append(
             {'key': 'trial_config', 'value': experiment_config['trial']})
+    elif experiment_config['trainingServicePlatform'] == 'aml':
+        request_data['clusterMetaData'].append(
+            {'key': 'aml_config', 'value': experiment_config['amlConfig']})
+        request_data['clusterMetaData'].append(
+            {'key': 'trial_config', 'value': experiment_config['trial']})
     response = rest_post(experiment_url(port), json.dumps(request_data), REST_TIME_OUT, show_error=True)
     if check_response(response):
         return response
...
@@ -137,10 +137,15 @@ class Trial:
     def kill(self, trial_id=None):
         if trial_id == self.id or trial_id is None:
             if self.process is not None:
-                nni_log(LogType.Info, "%s: killing trial" % self.name)
-                for child in psutil.Process(self.process.pid).children(True):
-                    child.kill()
-                self.process.kill()
+                try:
+                    nni_log(LogType.Info, "%s: killing trial" % self.name)
+                    for child in psutil.Process(self.process.pid).children(True):
+                        child.kill()
+                    self.process.kill()
+                except psutil.NoSuchProcess:
+                    nni_log(LogType.Info, "kill trial %s failed: %s does not exist!" % (trial_id, self.process.pid))
+                except Exception as ex:
+                    nni_log(LogType.Error, "kill trial %s failed: %s " % (trial_id, str(ex)))
             self.cleanup()

     def cleanup(self):
...
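The guarded kill logic can be exercised in isolation. A minimal sketch, using a throwaway `sleep` subprocess as a stand-in for a trial process:
```python
import subprocess
import psutil

proc = subprocess.Popen(['sleep', '60'])  # stand-in for a running trial

try:
    # Kill the whole process tree: children first, then the parent.
    for child in psutil.Process(proc.pid).children(recursive=True):
        child.kill()
    proc.kill()
except psutil.NoSuchProcess:
    print('process %s does not exist, nothing to kill' % proc.pid)
except Exception as ex:
    print('kill failed: %s' % ex)
```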
@@ -95,6 +95,8 @@ pai:
   containerNFSMountPath:
   paiStorageConfigName:
 remote:
+  remoteConfig:
+    reuse: false
   machineList:
     - ip:
       passwd:
...