OpenDAS / nni / Commits / 867871b2

Commit 867871b2 (unverified)
Authored Jul 27, 2022 by Yuge Zhang; committed by GitHub on Jul 27, 2022.

Promote Retiarii to NAS (step 1) - move files (#5020)

Parent: 481aa292
Changes: 137

Showing 20 changed files with 0 additions and 2270 deletions.
nni/algorithms/nas/pytorch/pdarts/__init__.py  +0 -4
nni/algorithms/nas/pytorch/pdarts/mutator.py  +0 -93
nni/algorithms/nas/pytorch/pdarts/trainer.py  +0 -86
nni/algorithms/nas/pytorch/proxylessnas/__init__.py  +0 -5
nni/algorithms/nas/pytorch/proxylessnas/mutator.py  +0 -478
nni/algorithms/nas/pytorch/proxylessnas/trainer.py  +0 -500
nni/algorithms/nas/pytorch/proxylessnas/utils.py  +0 -78
nni/algorithms/nas/pytorch/random/__init__.py  +0 -4
nni/algorithms/nas/pytorch/random/mutator.py  +0 -39
nni/algorithms/nas/pytorch/spos/__init__.py  +0 -6
nni/algorithms/nas/pytorch/spos/evolution.py  +0 -223
nni/algorithms/nas/pytorch/spos/mutator.py  +0 -66
nni/algorithms/nas/pytorch/spos/trainer.py  +0 -95
nni/algorithms/nas/tensorflow/__init__.py  +0 -0
nni/algorithms/nas/tensorflow/classic_nas/__init__.py  +0 -4
nni/algorithms/nas/tensorflow/classic_nas/mutator.py  +0 -217
nni/algorithms/nas/tensorflow/enas/__init__.py  +0 -5
nni/algorithms/nas/tensorflow/enas/mutator.py  +0 -162
nni/algorithms/nas/tensorflow/enas/trainer.py  +0 -205
nni/nas/evaluator/functional.py  +0 -0
nni/algorithms/nas/pytorch/pdarts/__init__.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from .trainer import PdartsTrainer
nni/algorithms/nas/pytorch/pdarts/mutator.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import copy

import numpy as np
import torch
from torch import nn

from nni.algorithms.nas.pytorch.darts import DartsMutator
from nni.nas.pytorch.mutables import LayerChoice


class PdartsMutator(DartsMutator):
    """
    It works with PdartsTrainer to calculate ops weights,
    and drop weights in different PDARTS epochs.
    """

    def __init__(self, model, pdarts_epoch_index, pdarts_num_to_drop, switches={}):
        self.pdarts_epoch_index = pdarts_epoch_index
        self.pdarts_num_to_drop = pdarts_num_to_drop
        if switches is None:
            self.switches = {}
        else:
            self.switches = switches

        super(PdartsMutator, self).__init__(model)

        # this loop goes through mutables with different keys,
        # it mainly updates the length of choices.
        for mutable in self.mutables:
            if isinstance(mutable, LayerChoice):

                switches = self.switches.get(mutable.key, [True for j in range(len(mutable))])
                choices = self.choices[mutable.key]

                operations_count = np.sum(switches)
                # +1 and -1 are caused by the zero operation in the darts network
                # the zero operation is not in the choices list in the network, but its weight is,
                # so it needs one more weight and switch for zero.
                self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(operations_count + 1))

                self.switches[mutable.key] = switches

        # update LayerChoice instances in model,
        # it physically removes dropped choice operations.
        for module in self.model.modules():
            if isinstance(module, LayerChoice):
                switches = self.switches.get(module.key)
                choices = self.choices[module.key]
                if len(module) > len(choices):
                    # from last to first, so that removing one won't affect earlier indexes.
                    for index in range(len(switches) - 1, -1, -1):
                        if switches[index] == False:
                            del module[index]
                assert len(module) <= len(choices), "Failed to remove dropped choices."

    def export(self):
        # Cannot rely on super().export() because P-DARTS has deleted some of the choices and has misaligned length.
        results = super().sample_final()
        for mutable in self.mutables:
            if isinstance(mutable, LayerChoice):
                # As some operations are dropped physically,
                # it needs to fill False back in to track dropped operations.
                trained_result = results[mutable.key]
                trained_index = 0

                switches = self.switches[mutable.key]
                result = torch.Tensor(switches).bool()
                for index in range(len(result)):
                    if result[index]:
                        result[index] = trained_result[trained_index]
                        trained_index += 1

                results[mutable.key] = result

        return results

    def drop_paths(self):
        """
        This method is called when a PDARTS epoch is finished.
        It prepares switches for the next epoch.
        Candidate operations with a False switch will be dropped in the next epoch.
        """
        all_switches = copy.deepcopy(self.switches)
        for key in all_switches:
            switches = all_switches[key]
            idxs = []
            for j in range(len(switches)):
                if switches[j]:
                    idxs.append(j)
            sorted_weights = self.choices[key].data.cpu().numpy()[:-1]
            drop = np.argsort(sorted_weights)[:self.pdarts_num_to_drop[self.pdarts_epoch_index]]
            for idx in drop:
                switches[idxs[idx]] = False
        return all_switches
nni/algorithms/nas/pytorch/pdarts/trainer.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import json
import logging

from nni.nas.pytorch.callbacks import LRSchedulerCallback
from nni.algorithms.nas.pytorch.darts import DartsTrainer
from nni.nas.pytorch.trainer import BaseTrainer, TorchTensorEncoder

from .mutator import PdartsMutator

logger = logging.getLogger(__name__)


class PdartsTrainer(BaseTrainer):
    """
    This trainer implements the PDARTS algorithm.
    PDARTS is based on the DARTS algorithm, and provides a network-growth approach to find a deeper and better network.
    This class relies on the pdarts_num_layers and pdarts_num_to_drop parameters to control how the network grows.
    pdarts_num_layers means how many layers more than the first epoch.
    pdarts_num_to_drop means how many candidate operations should be dropped in each epoch,
    so that the grown network stays a similar size.
    """

    def __init__(self, model_creator, init_layers, metrics,
                 num_epochs, dataset_train, dataset_valid,
                 pdarts_num_layers=[0, 6, 12], pdarts_num_to_drop=[3, 2, 1],
                 mutator=None, batch_size=64, workers=4, device=None, log_frequency=None,
                 callbacks=None, unrolled=False):
        super(PdartsTrainer, self).__init__()
        self.model_creator = model_creator
        self.init_layers = init_layers
        self.pdarts_num_layers = pdarts_num_layers
        self.pdarts_num_to_drop = pdarts_num_to_drop
        self.pdarts_epoch = len(pdarts_num_to_drop)
        self.darts_parameters = {
            "metrics": metrics,
            "num_epochs": num_epochs,
            "dataset_train": dataset_train,
            "dataset_valid": dataset_valid,
            "batch_size": batch_size,
            "workers": workers,
            "device": device,
            "log_frequency": log_frequency,
            "unrolled": unrolled
        }
        self.callbacks = callbacks if callbacks is not None else []

    def train(self):
        switches = None
        for epoch in range(self.pdarts_epoch):

            layers = self.init_layers + self.pdarts_num_layers[epoch]
            model, criterion, optim, lr_scheduler = self.model_creator(layers)
            self.mutator = PdartsMutator(model, epoch, self.pdarts_num_to_drop, switches)

            for callback in self.callbacks:
                callback.build(model, self.mutator, self)
                callback.on_epoch_begin(epoch)

            darts_callbacks = []
            if lr_scheduler is not None:
                darts_callbacks.append(LRSchedulerCallback(lr_scheduler))

            self.trainer = DartsTrainer(model, mutator=self.mutator, loss=criterion, optimizer=optim,
                                        callbacks=darts_callbacks, **self.darts_parameters)
            logger.info("start pdarts training epoch %s...", epoch)

            self.trainer.train()

            switches = self.mutator.drop_paths()

            for callback in self.callbacks:
                callback.on_epoch_end(epoch)

    def validate(self):
        self.trainer.validate()

    def export(self, file):
        mutator_export = self.mutator.export()
        with open(file, "w") as f:
            json.dump(mutator_export, f, indent=2, sort_keys=True, cls=TorchTensorEncoder)

    def checkpoint(self):
        raise NotImplementedError("Not implemented yet")
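For orientation, a minimal usage sketch of the trainer above (not part of the original diff). The names build_darts_model, dataset_train and dataset_valid are hypothetical: the factory is assumed to return (model, criterion, optimizer, lr_scheduler) for a given layer count, as the train() loop above expects.

    from nni.algorithms.nas.pytorch.pdarts import PdartsTrainer

    def metrics(output, target):
        # hypothetical metrics callable: maps (logits, labels) to a dict
        return {"acc": (output.argmax(dim=1) == target).float().mean().item()}

    trainer = PdartsTrainer(build_darts_model,        # assumed factory: layers -> (model, criterion, optim, lr_scheduler)
                            init_layers=5,
                            metrics=metrics,
                            num_epochs=25,
                            dataset_train=dataset_train,   # assumed torch datasets
                            dataset_valid=dataset_valid,
                            batch_size=64)
    trainer.train()
    trainer.export("final_arch.json")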
nni/algorithms/nas/pytorch/proxylessnas/__init__.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from .mutator import ProxylessNasMutator
from .trainer import ProxylessNasTrainer
nni/algorithms/nas/pytorch/proxylessnas/mutator.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import math
import torch
from torch import nn as nn
from torch.nn import functional as F
import numpy as np

from nni.nas.pytorch.base_mutator import BaseMutator
from nni.nas.pytorch.mutables import LayerChoice
from .utils import detach_variable


class ArchGradientFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, binary_gates, run_func, backward_func):
        ctx.run_func = run_func
        ctx.backward_func = backward_func

        detached_x = detach_variable(x)
        with torch.enable_grad():
            output = run_func(detached_x)
        ctx.save_for_backward(detached_x, output)
        return output.data

    @staticmethod
    def backward(ctx, grad_output):
        detached_x, output = ctx.saved_tensors

        grad_x = torch.autograd.grad(output, detached_x, grad_output, only_inputs=True)
        # compute gradients w.r.t. binary_gates
        binary_grads = ctx.backward_func(detached_x.data, output.data, grad_output.data)

        return grad_x[0], binary_grads, None, None


class MixedOp(nn.Module):
    """
    This class instantiates and manages the info of one LayerChoice.
    It includes architecture weights, binary weights, and member functions
    operating on the weights.

    forward_mode:
        forward/backward mode for LayerChoice: None, two, full, and full_v2.
        For training architecture weights, we use full_v2 by default, and for training
        model weights, we use None.
    """
    forward_mode = None

    def __init__(self, mutable):
        """
        Parameters
        ----------
        mutable : LayerChoice
            A LayerChoice in user model
        """
        super(MixedOp, self).__init__()
        self.ap_path_alpha = nn.Parameter(torch.Tensor(len(mutable)))
        self.ap_path_wb = nn.Parameter(torch.Tensor(len(mutable)))
        self.ap_path_alpha.requires_grad = False
        self.ap_path_wb.requires_grad = False
        self.active_index = [0]
        self.inactive_index = None
        self.log_prob = None
        self.current_prob_over_ops = None
        self.n_choices = len(mutable)

    def get_ap_path_alpha(self):
        return self.ap_path_alpha

    def to_requires_grad(self):
        self.ap_path_alpha.requires_grad = True
        self.ap_path_wb.requires_grad = True

    def to_disable_grad(self):
        self.ap_path_alpha.requires_grad = False
        self.ap_path_wb.requires_grad = False

    def forward(self, mutable, x):
        """
        Define forward of LayerChoice. For 'full_v2', backward is also defined.
        The 'two' mode is explained in section 3.2.1 of the paper.
        The 'full_v2' mode is explained in Appendix D of the paper.

        Parameters
        ----------
        mutable : LayerChoice
            this layer's mutable
        x : tensor
            inputs of this layer, only one input is supported

        Returns
        -------
        output: tensor
            output of this layer
        """
        if MixedOp.forward_mode == 'full' or MixedOp.forward_mode == 'two':
            output = 0
            for _i in self.active_index:
                oi = self.candidate_ops[_i](x)
                output = output + self.ap_path_wb[_i] * oi
            for _i in self.inactive_index:
                oi = self.candidate_ops[_i](x)
                output = output + self.ap_path_wb[_i] * oi.detach()
        elif MixedOp.forward_mode == 'full_v2':
            def run_function(key, candidate_ops, active_id):
                def forward(_x):
                    return candidate_ops[active_id](_x)
                return forward

            def backward_function(key, candidate_ops, active_id, binary_gates):
                def backward(_x, _output, grad_output):
                    binary_grads = torch.zeros_like(binary_gates.data)
                    with torch.no_grad():
                        for k in range(len(candidate_ops)):
                            if k != active_id:
                                out_k = candidate_ops[k](_x.data)
                            else:
                                out_k = _output.data
                            grad_k = torch.sum(out_k * grad_output)
                            binary_grads[k] = grad_k
                    return binary_grads
                return backward

            output = ArchGradientFunction.apply(
                x, self.ap_path_wb, run_function(mutable.key, list(mutable), self.active_index[0]),
                backward_function(mutable.key, list(mutable), self.active_index[0], self.ap_path_wb))
        else:
            output = self.active_op(mutable)(x)
        return output

    @property
    def probs_over_ops(self):
        """
        Apply softmax on alpha to generate probability distribution

        Returns
        -------
        pytorch tensor
            probability distribution
        """
        probs = F.softmax(self.ap_path_alpha, dim=0)  # softmax to probability
        return probs

    @property
    def chosen_index(self):
        """
        choose the op with max prob

        Returns
        -------
        int
            index of the chosen one
        numpy.float32
            prob of the chosen one
        """
        probs = self.probs_over_ops.data.cpu().numpy()
        index = int(np.argmax(probs))
        return index, probs[index]

    def active_op(self, mutable):
        """
        assume only one path is active

        Returns
        -------
        PyTorch module
            the chosen operation
        """
        return mutable[self.active_index[0]]

    @property
    def active_op_index(self):
        """
        return the active op's index; the active op is sampled

        Returns
        -------
        int
            index of the active op
        """
        return self.active_index[0]

    def set_chosen_op_active(self):
        """
        set chosen index, active and inactive indexes
        """
        chosen_idx, _ = self.chosen_index
        self.active_index = [chosen_idx]
        self.inactive_index = [_i for _i in range(0, chosen_idx)] + \
                              [_i for _i in range(chosen_idx + 1, self.n_choices)]

    def binarize(self, mutable):
        """
        Sample based on alpha, and set binary weights accordingly.
        ap_path_wb is set in this function, which is why it is called binarize.

        Parameters
        ----------
        mutable : LayerChoice
            this layer's mutable
        """
        self.log_prob = None
        # reset binary gates
        self.ap_path_wb.data.zero_()
        probs = self.probs_over_ops
        if MixedOp.forward_mode == 'two':
            # sample two ops according to probs
            sample_op = torch.multinomial(probs.data, 2, replacement=False)
            probs_slice = F.softmax(torch.stack([
                self.ap_path_alpha[idx] for idx in sample_op
            ]), dim=0)
            self.current_prob_over_ops = torch.zeros_like(probs)
            for i, idx in enumerate(sample_op):
                self.current_prob_over_ops[idx] = probs_slice[i]
            # choose one to be active and the other to be inactive according to probs_slice
            c = torch.multinomial(probs_slice.data, 1)[0]  # 0 or 1
            active_op = sample_op[c].item()
            inactive_op = sample_op[1 - c].item()
            self.active_index = [active_op]
            self.inactive_index = [inactive_op]
            # set binary gate
            self.ap_path_wb.data[active_op] = 1.0
        else:
            sample = torch.multinomial(probs, 1)[0].item()
            self.active_index = [sample]
            self.inactive_index = [_i for _i in range(0, sample)] + \
                                  [_i for _i in range(sample + 1, len(mutable))]
            self.log_prob = torch.log(probs[sample])
            self.current_prob_over_ops = probs
            self.ap_path_wb.data[sample] = 1.0
        # avoid over-regularization
        for choice in mutable:
            for _, param in choice.named_parameters():
                param.grad = None

    @staticmethod
    def delta_ij(i, j):
        if i == j:
            return 1
        else:
            return 0

    def set_arch_param_grad(self, mutable):
        """
        Calculate alpha gradient for this LayerChoice.
        It is calculated using the gradient of the binary gate and the probs of the ops.
        """
        binary_grads = self.ap_path_wb.grad.data
        if self.active_op(mutable).is_zero_layer():
            self.ap_path_alpha.grad = None
            return
        if self.ap_path_alpha.grad is None:
            self.ap_path_alpha.grad = torch.zeros_like(self.ap_path_alpha.data)
        if MixedOp.forward_mode == 'two':
            involved_idx = self.active_index + self.inactive_index
            probs_slice = F.softmax(torch.stack([
                self.ap_path_alpha[idx] for idx in involved_idx
            ]), dim=0).data
            for i in range(2):
                for j in range(2):
                    origin_i = involved_idx[i]
                    origin_j = involved_idx[j]
                    self.ap_path_alpha.grad.data[origin_i] += \
                        binary_grads[origin_j] * probs_slice[j] * (MixedOp.delta_ij(i, j) - probs_slice[i])
            for _i, idx in enumerate(self.active_index):
                self.active_index[_i] = (idx, self.ap_path_alpha.data[idx].item())
            for _i, idx in enumerate(self.inactive_index):
                self.inactive_index[_i] = (idx, self.ap_path_alpha.data[idx].item())
        else:
            probs = self.probs_over_ops.data
            for i in range(self.n_choices):
                for j in range(self.n_choices):
                    self.ap_path_alpha.grad.data[i] += binary_grads[j] * probs[j] * (MixedOp.delta_ij(i, j) - probs[i])
        return

    def rescale_updated_arch_param(self):
        """
        rescale architecture weights for the 'two' mode.
        """
        if not isinstance(self.active_index[0], tuple):
            assert self.active_op.is_zero_layer()
            return
        involved_idx = [idx for idx, _ in (self.active_index + self.inactive_index)]
        old_alphas = [alpha for _, alpha in (self.active_index + self.inactive_index)]
        new_alphas = [self.ap_path_alpha.data[idx] for idx in involved_idx]

        offset = math.log(
            sum([math.exp(alpha) for alpha in new_alphas]) / sum([math.exp(alpha) for alpha in old_alphas])
        )

        for idx in involved_idx:
            self.ap_path_alpha.data[idx] -= offset


class ProxylessNasMutator(BaseMutator):
    """
    This mutator initializes and operates all the LayerChoices of the input model.
    It is for the corresponding trainer to control the training process of LayerChoices,
    coordinating with the whole training process.
    """
    def __init__(self, model):
        """
        Init a MixedOp instance for each mutable, i.e., LayerChoice,
        and register the instantiated MixedOp in the corresponding LayerChoice.
        If it is not registered in the LayerChoice, DataParallel does not work,
        because architecture weights are not included in the DataParallel model.
        When MixedOps are registered, we use ```requires_grad``` to control
        whether to calculate gradients of architecture weights.

        Parameters
        ----------
        model : pytorch model
            The model that users want to tune; it includes the search space defined with nni nas apis
        """
        super(ProxylessNasMutator, self).__init__(model)
        self._unused_modules = None
        self.mutable_list = []
        for mutable in self.undedup_mutables:
            self.mutable_list.append(mutable)
            mutable.registered_module = MixedOp(mutable)

    def on_forward_layer_choice(self, mutable, *args, **kwargs):
        """
        Callback of layer choice forward. This function defines the forward
        logic of the input mutable. So the mutable is only an interface; its real
        implementation is defined in the mutator.

        Parameters
        ----------
        mutable: LayerChoice
            forward logic of this input mutable
        args: list of torch.Tensor
            inputs of this mutable
        kwargs: dict
            inputs of this mutable

        Returns
        -------
        torch.Tensor
            output of this mutable, i.e., LayerChoice
        int
            index of the chosen op
        """
        # FIXME: return mask, to be consistent with other algorithms
        idx = mutable.registered_module.active_op_index
        return mutable.registered_module(mutable, *args, **kwargs), idx

    def reset_binary_gates(self):
        """
        For each LayerChoice, binarize binary weights
        based on alpha to only activate one op.
        It traverses all the mutables in the model to do this.
        """
        for mutable in self.undedup_mutables:
            mutable.registered_module.binarize(mutable)

    def set_chosen_op_active(self):
        """
        For each LayerChoice, set the op with highest alpha as the chosen op.
        Usually used for validation.
        """
        for mutable in self.undedup_mutables:
            mutable.registered_module.set_chosen_op_active()

    def num_arch_params(self):
        """
        The number of mutables, i.e., LayerChoice

        Returns
        -------
        int
            the number of LayerChoice in user model
        """
        return len(self.mutable_list)

    def set_arch_param_grad(self):
        """
        For each LayerChoice, calculate gradients for architecture weights, i.e., alpha
        """
        for mutable in self.undedup_mutables:
            mutable.registered_module.set_arch_param_grad(mutable)

    def get_architecture_parameters(self):
        """
        Get all the architecture parameters.

        yield
        -----
        PyTorch Parameter
            Return ap_path_alpha of the traversed mutable
        """
        for mutable in self.undedup_mutables:
            yield mutable.registered_module.get_ap_path_alpha()

    def change_forward_mode(self, mode):
        """
        Update forward mode of MixedOps, as training architecture weights and
        model weights use different forward modes.
        """
        MixedOp.forward_mode = mode

    def get_forward_mode(self):
        """
        Get forward mode of MixedOp

        Returns
        -------
        string
            the current forward mode of MixedOp
        """
        return MixedOp.forward_mode

    def rescale_updated_arch_param(self):
        """
        Rescale architecture weights in 'two' mode.
        """
        for mutable in self.undedup_mutables:
            mutable.registered_module.rescale_updated_arch_param()

    def unused_modules_off(self):
        """
        Remove unused modules for each mutable.
        The removed modules are kept in ```self._unused_modules``` to be resumed later.
        """
        self._unused_modules = []
        for mutable in self.undedup_mutables:
            mixed_op = mutable.registered_module
            unused = {}
            if self.get_forward_mode() in ['full', 'two', 'full_v2']:
                involved_index = mixed_op.active_index + mixed_op.inactive_index
            else:
                involved_index = mixed_op.active_index
            for i in range(mixed_op.n_choices):
                if i not in involved_index:
                    unused[i] = mutable[i]
                    mutable[i] = None
            self._unused_modules.append(unused)

    def unused_modules_back(self):
        """
        Resume the removed modules.
        """
        if self._unused_modules is None:
            return
        for m, unused in zip(self.mutable_list, self._unused_modules):
            for i in unused:
                m[i] = unused[i]
        self._unused_modules = None

    def arch_requires_grad(self):
        """
        Make architecture weights require gradient
        """
        for mutable in self.undedup_mutables:
            mutable.registered_module.to_requires_grad()

    def arch_disable_grad(self):
        """
        Disable gradient of architecture weights, i.e., do not
        calculate gradient for them.
        """
        for mutable in self.undedup_mutables:
            mutable.registered_module.to_disable_grad()

    def sample_final(self):
        """
        Generate the final chosen architecture.

        Returns
        -------
        dict
            the choice of each mutable, i.e., LayerChoice
        """
        result = dict()
        for mutable in self.undedup_mutables:
            assert isinstance(mutable, LayerChoice)
            index, _ = mutable.registered_module.chosen_index
            # pylint: disable=not-callable
            result[mutable.key] = F.one_hot(torch.tensor(index), num_classes=len(mutable)).view(-1).bool()
        return result
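The methods above are meant to be driven by ProxylessNasTrainer (whose diff is collapsed below). As a rough, hypothetical sketch of one architecture-weight update, assuming model, mutator, a validation batch (val_x, val_y), a loss criterion, and an arch_optimizer built over mutator.get_architecture_parameters():

    mutator.change_forward_mode('full_v2')   # architecture-training forward mode
    mutator.arch_requires_grad()
    mutator.reset_binary_gates()             # sample one active op per LayerChoice
    mutator.unused_modules_off()             # stash ops not involved in this step
    loss = criterion(model(val_x), val_y)
    loss.backward()                          # fills ap_path_wb.grad via ArchGradientFunction
    mutator.set_arch_param_grad()            # convert binary-gate grads into alpha grads
    arch_optimizer.step()
    # in 'two' mode, mutator.rescale_updated_arch_param() would also be called here
    mutator.unused_modules_back()
    mutator.arch_disable_grad()
    mutator.change_forward_mode(None)        # back to plain weight training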
nni/algorithms/nas/pytorch/proxylessnas/trainer.py
deleted 100644 → 0
(This diff is collapsed in the page view.)
nni/algorithms/nas/pytorch/proxylessnas/utils.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch
import torch.nn as nn


def detach_variable(inputs):
    """
    Detach variables

    Parameters
    ----------
    inputs : pytorch tensors
        pytorch tensors
    """
    if isinstance(inputs, tuple):
        return tuple([detach_variable(x) for x in inputs])
    else:
        x = inputs.detach()
        x.requires_grad = inputs.requires_grad
        return x


def cross_entropy_with_label_smoothing(pred, target, label_smoothing=0.1):
    """
    Parameters
    ----------
    pred : pytorch tensor
        predicted value
    target : pytorch tensor
        label
    label_smoothing : float
        the degree of label smoothing

    Returns
    -------
    pytorch tensor
        cross entropy
    """
    logsoftmax = nn.LogSoftmax()
    n_classes = pred.size(1)
    # convert to one-hot
    target = torch.unsqueeze(target, 1)
    soft_target = torch.zeros_like(pred)
    soft_target.scatter_(1, target, 1)
    # label smoothing
    soft_target = soft_target * (1 - label_smoothing) + label_smoothing / n_classes
    return torch.mean(torch.sum(-soft_target * logsoftmax(pred), 1))


def accuracy(output, target, topk=(1,)):
    """
    Computes the precision@k for the specified values of k

    Parameters
    ----------
    output : pytorch tensor
        output, e.g., predicted value
    target : pytorch tensor
        label
    topk : tuple
        specify top1 and top5

    Returns
    -------
    list
        accuracy of top1 and top5
    """
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
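A quick self-contained check of the two helpers above (illustration only, not part of the diff):

    import torch

    logits = torch.randn(8, 10)              # batch of 8 samples, 10 classes
    labels = torch.randint(0, 10, (8,))

    loss = cross_entropy_with_label_smoothing(logits, labels, label_smoothing=0.1)
    top1, top5 = accuracy(logits, labels, topk=(1, 5))
    print(loss.item(), top1.item(), top5.item())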
nni/algorithms/nas/pytorch/random/__init__.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from .mutator import RandomMutator
nni/algorithms/nas/pytorch/random/mutator.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch
import torch.nn.functional as F

from nni.nas.pytorch.mutator import Mutator
from nni.nas.pytorch.mutables import LayerChoice, InputChoice


class RandomMutator(Mutator):
    """
    Random mutator that samples a random candidate in the search space each time ``reset()``.
    It uses the random functions in PyTorch, so users can set a seed in PyTorch to ensure deterministic behavior.
    """

    def sample_search(self):
        """
        Sample a random candidate.
        """
        result = dict()
        for mutable in self.mutables:
            if isinstance(mutable, LayerChoice):
                gen_index = torch.randint(high=len(mutable), size=(1, ))
                result[mutable.key] = F.one_hot(gen_index, num_classes=len(mutable)).view(-1).bool()
            elif isinstance(mutable, InputChoice):
                if mutable.n_chosen is None:
                    result[mutable.key] = torch.randint(high=2, size=(mutable.n_candidates,)).view(-1).bool()
                else:
                    perm = torch.randperm(mutable.n_candidates)
                    mask = [i in perm[:mutable.n_chosen] for i in range(mutable.n_candidates)]
                    result[mutable.key] = torch.tensor(mask, dtype=torch.bool)  # pylint: disable=not-callable
        return result

    def sample_final(self):
        """
        Same as :meth:`sample_search`.
        """
        return self.sample_search()
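A minimal sketch of how this mutator is typically used (not part of the diff); model is assumed to be a user network containing LayerChoice/InputChoice mutables:

    from nni.algorithms.nas.pytorch.random import RandomMutator

    mutator = RandomMutator(model)   # `model` is an assumed user network with mutables
    mutator.reset()                  # samples a fresh random architecture and applies it
    arch = mutator.sample_final()    # dict: mutable key -> boolean mask of chosen candidates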
nni/algorithms/nas/pytorch/spos/__init__.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from .evolution import SPOSEvolution
from .mutator import SPOSSupernetTrainingMutator
from .trainer import SPOSSupernetTrainer
nni/algorithms/nas/pytorch/spos/evolution.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import json
import logging
import os
import re
from collections import deque

import numpy as np
from nni.tuner import Tuner
from nni.algorithms.nas.pytorch.classic_nas.mutator import LAYER_CHOICE, INPUT_CHOICE

_logger = logging.getLogger(__name__)


class SPOSEvolution(Tuner):
    """
    SPOS evolution tuner.

    Parameters
    ----------
    max_epochs : int
        Maximum number of epochs to run.
    num_select : int
        Number of survival candidates of each epoch.
    num_population : int
        Number of candidates at the start of each epoch. If candidates generated by
        crossover and mutation are not enough, the rest will be filled with random
        candidates.
    m_prob : float
        The probability of mutation.
    num_crossover : int
        Number of candidates generated by crossover in each epoch.
    num_mutation : int
        Number of candidates generated by mutation in each epoch.
    """

    def __init__(self, max_epochs=20, num_select=10, num_population=50, m_prob=0.1,
                 num_crossover=25, num_mutation=25):
        assert num_population >= num_select
        self.max_epochs = max_epochs
        self.num_select = num_select
        self.num_population = num_population
        self.m_prob = m_prob
        self.num_crossover = num_crossover
        self.num_mutation = num_mutation
        self.epoch = 0
        self.candidates = []
        self.search_space = None
        self.random_state = np.random.RandomState(0)

        # async status
        self._to_evaluate_queue = deque()
        self._sending_parameter_queue = deque()
        self._pending_result_ids = set()
        self._reward_dict = dict()
        self._id2candidate = dict()
        self._st_callback = None

    def update_search_space(self, search_space):
        """
        Handle the initialization/update event of search space.
        """
        self._search_space = search_space
        self._next_round()

    def _next_round(self):
        _logger.info("Epoch %d, generating...", self.epoch)
        if self.epoch == 0:
            self._get_random_population()
            self.export_results(self.candidates)
        else:
            best_candidates = self._select_top_candidates()
            self.export_results(best_candidates)
            if self.epoch >= self.max_epochs:
                return
            self.candidates = self._get_mutation(best_candidates) + self._get_crossover(best_candidates)
            self._get_random_population()
        self.epoch += 1

    def _random_candidate(self):
        chosen_arch = dict()
        for key, val in self._search_space.items():
            if val["_type"] == LAYER_CHOICE:
                choices = val["_value"]
                index = self.random_state.randint(len(choices))
                chosen_arch[key] = {"_value": choices[index], "_idx": index}
            elif val["_type"] == INPUT_CHOICE:
                raise NotImplementedError("Input choice is not implemented yet.")
        return chosen_arch

    def _add_to_evaluate_queue(self, cand):
        _logger.info("Generate candidate %s, adding to eval queue.", self._get_architecture_repr(cand))
        self._reward_dict[self._hashcode(cand)] = 0.
        self._to_evaluate_queue.append(cand)

    def _get_random_population(self):
        while len(self.candidates) < self.num_population:
            cand = self._random_candidate()
            if self._is_legal(cand):
                _logger.info("Random candidate generated.")
                self._add_to_evaluate_queue(cand)
                self.candidates.append(cand)

    def _get_crossover(self, best):
        result = []
        for _ in range(10 * self.num_crossover):
            cand_p1 = best[self.random_state.randint(len(best))]
            cand_p2 = best[self.random_state.randint(len(best))]
            assert cand_p1.keys() == cand_p2.keys()
            cand = {k: cand_p1[k] if self.random_state.randint(2) == 0 else cand_p2[k]
                    for k in cand_p1.keys()}
            if self._is_legal(cand):
                result.append(cand)
                self._add_to_evaluate_queue(cand)
            if len(result) >= self.num_crossover:
                break
        _logger.info("Found %d architectures with crossover.", len(result))
        return result

    def _get_mutation(self, best):
        result = []
        for _ in range(10 * self.num_mutation):
            cand = best[self.random_state.randint(len(best))].copy()
            mutation_sample = np.random.random_sample(len(cand))
            for s, k in zip(mutation_sample, cand):
                if s < self.m_prob:
                    choices = self._search_space[k]["_value"]
                    index = self.random_state.randint(len(choices))
                    cand[k] = {"_value": choices[index], "_idx": index}
            if self._is_legal(cand):
                result.append(cand)
                self._add_to_evaluate_queue(cand)
            if len(result) >= self.num_mutation:
                break
        _logger.info("Found %d architectures with mutation.", len(result))
        return result

    def _get_architecture_repr(self, cand):
        return re.sub(r"\".*?\": \{\"_idx\": (\d+), \"_value\": \".*?\"\}", r"\1",
                      self._hashcode(cand))

    def _is_legal(self, cand):
        if self._hashcode(cand) in self._reward_dict:
            return False
        return True

    def _select_top_candidates(self):
        reward_query = lambda cand: self._reward_dict[self._hashcode(cand)]
        _logger.info("All candidate rewards: %s", list(map(reward_query, self.candidates)))
        result = sorted(self.candidates, key=reward_query, reverse=True)[:self.num_select]
        _logger.info("Best candidate rewards: %s", list(map(reward_query, result)))
        return result

    @staticmethod
    def _hashcode(d):
        return json.dumps(d, sort_keys=True)

    def _bind_and_send_parameters(self):
        """
        There are two types of resources: parameter ids and candidates. This function is called at
        necessary times to bind these resources to send new trials with st_callback.
        """
        result = []
        while self._sending_parameter_queue and self._to_evaluate_queue:
            parameter_id = self._sending_parameter_queue.popleft()
            parameters = self._to_evaluate_queue.popleft()
            self._id2candidate[parameter_id] = parameters
            result.append(parameters)
            self._pending_result_ids.add(parameter_id)
            self._st_callback(parameter_id, parameters)
            _logger.info("Send parameter [%d] %s.", parameter_id, self._get_architecture_repr(parameters))
        return result

    def generate_multiple_parameters(self, parameter_id_list, **kwargs):
        """
        Callback function necessary to implement a tuner. This will put more parameter ids into the
        parameter id queue.
        """
        if "st_callback" in kwargs and self._st_callback is None:
            self._st_callback = kwargs["st_callback"]
        for parameter_id in parameter_id_list:
            self._sending_parameter_queue.append(parameter_id)
        self._bind_and_send_parameters()
        return []  # always not use this. might induce problem of over-sending

    def receive_trial_result(self, parameter_id, parameters, value, **kwargs):
        """
        Callback function. Receive a trial result.
        """
        _logger.info("Candidate %d, reported reward %f", parameter_id, value)
        self._reward_dict[self._hashcode(self._id2candidate[parameter_id])] = value

    def trial_end(self, parameter_id, success, **kwargs):
        """
        Callback function when a trial is ended and resource is released.
        """
        self._pending_result_ids.remove(parameter_id)
        if not self._pending_result_ids and not self._to_evaluate_queue:
            # a new epoch now
            self._next_round()
            assert self._st_callback is not None
            self._bind_and_send_parameters()

    def export_results(self, result):
        """
        Export a number of candidates to `checkpoints` dir.

        Parameters
        ----------
        result : dict
            Chosen architectures to be exported.
        """
        os.makedirs("checkpoints", exist_ok=True)
        for i, cand in enumerate(result):
            converted = dict()
            for cand_key, cand_val in cand.items():
                onehot = [k == cand_val["_idx"] for k in range(len(self._search_space[cand_key]["_value"]))]
                converted[cand_key] = onehot
            with open(os.path.join("checkpoints", "%03d_%03d.json" % (self.epoch, i)), "w") as fp:
                json.dump(converted, fp)
nni/algorithms/nas/pytorch/spos/mutator.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging

import numpy as np
from nni.algorithms.nas.pytorch.random import RandomMutator

_logger = logging.getLogger(__name__)


class SPOSSupernetTrainingMutator(RandomMutator):
    """
    A random mutator with a flops limit.

    Parameters
    ----------
    model : nn.Module
        PyTorch model.
    flops_func : callable
        Callable that takes a candidate from `sample_search` and returns its flops. When `flops_func`
        is None, functions related to flops will be deactivated.
    flops_lb : number
        Lower bound of flops.
    flops_ub : number
        Upper bound of flops.
    flops_bin_num : number
        Number of bins the flops interval is divided into, to ensure uniformity. A bigger number gives more
        uniform sampling, but the sampling will be slower.
    flops_sample_timeout : int
        Maximum number of attempts to sample before giving up and using a random candidate.
    """
    def __init__(self, model, flops_func=None, flops_lb=None, flops_ub=None,
                 flops_bin_num=7, flops_sample_timeout=500):
        super().__init__(model)
        self._flops_func = flops_func
        if self._flops_func is not None:
            self._flops_bin_num = flops_bin_num
            self._flops_bins = [flops_lb + (flops_ub - flops_lb) / flops_bin_num * i for i in range(flops_bin_num + 1)]
            self._flops_sample_timeout = flops_sample_timeout

    def sample_search(self):
        """
        Sample a candidate for training. When `flops_func` is not None, candidates will be sampled uniformly
        relative to flops.

        Returns
        -------
        dict
        """
        if self._flops_func is not None:
            for times in range(self._flops_sample_timeout):
                idx = np.random.randint(self._flops_bin_num)
                cand = super().sample_search()
                if self._flops_bins[idx] <= self._flops_func(cand) <= self._flops_bins[idx + 1]:
                    _logger.debug("Sampled candidate flops %f in %d times.", cand, times)
                    return cand
            _logger.warning("Failed to sample a flops-valid candidate within %d tries.", self._flops_sample_timeout)
        return super().sample_search()

    def sample_final(self):
        """
        Implemented only to satisfy the interface of Mutator.
        """
        return self.sample_search()
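A hypothetical example of the FLOPs-constrained sampling above (not part of the diff), assuming a supernet model and a helper lookup_flops that maps a sampled candidate dict to its FLOPs:

    mutator = SPOSSupernetTrainingMutator(model,
                                          flops_func=lookup_flops,  # assumed: candidate dict -> FLOPs
                                          flops_lb=290e6,
                                          flops_ub=360e6)
    cand = mutator.sample_search()   # sampled roughly uniformly across the FLOPs bins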
nni/algorithms/nas/pytorch/spos/trainer.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging

import torch

from nni.nas.pytorch.trainer import Trainer
from nni.nas.pytorch.utils import AverageMeterGroup

from .mutator import SPOSSupernetTrainingMutator

logger = logging.getLogger(__name__)


class SPOSSupernetTrainer(Trainer):
    """
    This trainer trains a supernet that can be used for evolution search.

    Parameters
    ----------
    model : nn.Module
        Model with mutables.
    mutator : nni.nas.pytorch.mutator.Mutator
        A mutator object that has been initialized with the model.
    loss : callable
        Called with logits and targets. Returns a loss tensor.
    metrics : callable
        Returns a dict that maps metrics keys to metrics data.
    optimizer : Optimizer
        Optimizer that optimizes the model.
    num_epochs : int
        Number of epochs of training.
    train_loader : iterable
        Data loader of training. Raises ``StopIteration`` when one epoch is exhausted.
    valid_loader : iterable
        Data loader of validation. Raises ``StopIteration`` when one epoch is exhausted.
    batch_size : int
        Batch size.
    workers : int
        Number of threads for data preprocessing. Not used for this trainer. May be removed in the future.
    device : torch.device
        Device object. Either ``torch.device("cuda")`` or ``torch.device("cpu")``. When ``None``, the trainer
        automatically detects GPU and selects GPU first.
    log_frequency : int
        Number of mini-batches to log metrics.
    callbacks : list of Callback
        Callbacks to plug into the trainer. See Callbacks.
    """

    def __init__(self, model, loss, metrics,
                 optimizer, num_epochs, train_loader, valid_loader,
                 mutator=None, batch_size=64, workers=4, device=None, log_frequency=None,
                 callbacks=None):
        assert torch.cuda.is_available()
        super().__init__(model, mutator if mutator is not None else SPOSSupernetTrainingMutator(model),
                         loss, metrics, optimizer, num_epochs, None, None,
                         batch_size, workers, device, log_frequency, callbacks)

        self.train_loader = train_loader
        self.valid_loader = valid_loader

    def train_one_epoch(self, epoch):
        self.model.train()
        meters = AverageMeterGroup()
        for step, (x, y) in enumerate(self.train_loader):
            x, y = x.to(self.device), y.to(self.device)
            self.optimizer.zero_grad()
            self.mutator.reset()
            logits = self.model(x)
            loss = self.loss(logits, y)
            loss.backward()
            self.optimizer.step()

            metrics = self.metrics(logits, y)
            metrics["loss"] = loss.item()
            meters.update(metrics)
            if self.log_frequency is not None and step % self.log_frequency == 0:
                logger.info("Epoch [%s/%s] Step [%s/%s]  %s", epoch + 1,
                            self.num_epochs, step + 1, len(self.train_loader), meters)

    def validate_one_epoch(self, epoch):
        self.model.eval()
        meters = AverageMeterGroup()
        with torch.no_grad():
            for step, (x, y) in enumerate(self.valid_loader):
                x, y = x.to(self.device), y.to(self.device)
                self.mutator.reset()
                logits = self.model(x)
                loss = self.loss(logits, y)
                metrics = self.metrics(logits, y)
                metrics["loss"] = loss.item()
                meters.update(metrics)
                if self.log_frequency is not None and step % self.log_frequency == 0:
                    logger.info("Epoch [%s/%s] Validation Step [%s/%s]  %s", epoch + 1,
                                self.num_epochs, step + 1, len(self.valid_loader), meters)
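A sketch of wiring the trainer above together (not part of the diff); supernet, train_loader, and valid_loader are assumed to exist, and metrics follows the callable contract documented above:

    import torch
    from torch import nn

    optimizer = torch.optim.SGD(supernet.parameters(), lr=0.1, momentum=0.9)
    trainer = SPOSSupernetTrainer(supernet,
                                  loss=nn.CrossEntropyLoss(),
                                  metrics=lambda logits, y: {"acc": (logits.argmax(1) == y).float().mean().item()},
                                  optimizer=optimizer,
                                  num_epochs=120,
                                  train_loader=train_loader,
                                  valid_loader=valid_loader)
    trainer.train()   # the base Trainer is assumed to drive train_one_epoch / validate_one_epoch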
nni/algorithms/nas/tensorflow/__init__.py
deleted 100644 → 0
(empty file)
nni/algorithms/nas/tensorflow/classic_nas/__init__.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from .mutator import get_and_apply_next_architecture
nni/algorithms/nas/tensorflow/classic_nas/mutator.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

# pylint: skip-file

import json
import logging
import os
import sys

import tensorflow as tf

import nni
from nni.runtime.env_vars import trial_env_vars
from nni.nas.tensorflow.mutables import LayerChoice, InputChoice, MutableScope
from nni.nas.tensorflow.mutator import Mutator

logger = logging.getLogger(__name__)

NNI_GEN_SEARCH_SPACE = "NNI_GEN_SEARCH_SPACE"
LAYER_CHOICE = "layer_choice"
INPUT_CHOICE = "input_choice"


def get_and_apply_next_architecture(model):
    """
    Wrapper of :class:`~nni.nas.tensorflow.classic_nas.mutator.ClassicMutator` to make it more meaningful,
    similar to ``get_next_parameter`` for HPO.

    It will generate the search space based on ``model``.
    If the env ``NNI_GEN_SEARCH_SPACE`` exists, this is in dry run mode for
    generating the search space for the experiment.
    If not, there are still two modes: one is nni experiment mode where users
    use ``nnictl`` to start an experiment. The other is standalone mode
    where users directly run the trial command; this mode chooses the first
    one(s) for each LayerChoice and InputChoice.

    Parameters
    ----------
    model : nn.Module
        User's model with search space (e.g., LayerChoice, InputChoice) embedded in it.
    """
    ClassicMutator(model)


class ClassicMutator(Mutator):
    """
    This mutator applies the architecture chosen from the tuner.
    It implements the forward function of LayerChoice and InputChoice,
    to only activate the chosen ones.

    Parameters
    ----------
    model : nn.Module
        User's model with search space (e.g., LayerChoice, InputChoice) embedded in it.
    """

    def __init__(self, model):
        super(ClassicMutator, self).__init__(model)
        self._chosen_arch = {}
        self._search_space = self._generate_search_space()
        if NNI_GEN_SEARCH_SPACE in os.environ:
            # dry run for only generating search space
            self._dump_search_space(os.environ[NNI_GEN_SEARCH_SPACE])
            sys.exit(0)

        if trial_env_vars.NNI_PLATFORM is None:
            logger.warning("This is in standalone mode, the chosen are the first one(s).")
            self._chosen_arch = self._standalone_generate_chosen()
        else:
            # get chosen arch from tuner
            self._chosen_arch = nni.get_next_parameter()
            if self._chosen_arch is None:
                if trial_env_vars.NNI_PLATFORM == "unittest":
                    # happens if NNI_PLATFORM is intentionally set, e.g., in UT
                    logger.warning("`NNI_PLATFORM` is set but `param` is None. Falling back to standalone mode.")
                    self._chosen_arch = self._standalone_generate_chosen()
                else:
                    raise RuntimeError("Chosen architecture is None. This may be a platform error.")
        self.reset()

    def _sample_layer_choice(self, mutable, idx, value, search_space_item):
        """
        Convert layer choice to tensor representation.

        Parameters
        ----------
        mutable : Mutable
        idx : int
            Number `idx` of the list will be selected.
        value : str
            The verbose representation of the selected value.
        search_space_item : list
            The list for the corresponding search space.
        """
        # doesn't support multihot for layer choice yet
        assert 0 <= idx < len(mutable) and search_space_item[idx] == value, \
            "Index '{}' in search space '{}' is not '{}'".format(idx, search_space_item, value)
        mask = tf.one_hot(idx, len(mutable))
        return tf.cast(tf.reshape(mask, [-1]), tf.bool)

    def _sample_input_choice(self, mutable, idx, value, search_space_item):
        """
        Convert input choice to tensor representation.

        Parameters
        ----------
        mutable : Mutable
        idx : int
            Number `idx` of the list will be selected.
        value : str
            The verbose representation of the selected value.
        search_space_item : list
            The list for the corresponding search space.
        """
        candidate_repr = search_space_item["candidates"]
        multihot_list = [False] * mutable.n_candidates
        for i, v in zip(idx, value):
            assert 0 <= i < mutable.n_candidates and candidate_repr[i] == v, \
                "Index '{}' in search space '{}' is not '{}'".format(i, candidate_repr, v)
            assert not multihot_list[i], "'{}' is selected twice in '{}', which is not allowed.".format(i, idx)
            multihot_list[i] = True
        return tf.cast(multihot_list, tf.bool)  # pylint: disable=not-callable

    def sample_search(self):
        """
        See :meth:`sample_final`.
        """
        return self.sample_final()

    def sample_final(self):
        """
        Convert the chosen arch and apply it on the model.
        """
        assert set(self._chosen_arch.keys()) == set(self._search_space.keys()), \
            "Unmatched keys, expected keys '{}' from search space, found '{}'.".format(self._search_space.keys(),
                                                                                       self._chosen_arch.keys())
        result = dict()
        for mutable in self.mutables:
            if isinstance(mutable, (LayerChoice, InputChoice)):
                assert mutable.key in self._chosen_arch, \
                    "Expected '{}' in chosen arch, but not found.".format(mutable.key)
                data = self._chosen_arch[mutable.key]
                assert isinstance(data, dict) and "_value" in data and "_idx" in data, \
                    "'{}' is not a valid choice.".format(data)
            if isinstance(mutable, LayerChoice):
                result[mutable.key] = self._sample_layer_choice(mutable, data["_idx"], data["_value"],
                                                                self._search_space[mutable.key]["_value"])
            elif isinstance(mutable, InputChoice):
                result[mutable.key] = self._sample_input_choice(mutable, data["_idx"], data["_value"],
                                                                self._search_space[mutable.key]["_value"])
            elif isinstance(mutable, MutableScope):
                logger.info("Mutable scope '%s' is skipped during parsing choices.", mutable.key)
            else:
                raise TypeError("Unsupported mutable type: '%s'." % type(mutable))
        return result

    def _standalone_generate_chosen(self):
        """
        Generate the chosen architecture for standalone mode,
        i.e., choose the first one(s) for LayerChoice and InputChoice.
        ::
            { key_name: {"_value": "conv1",
                         "_idx": 0} }

            { key_name: {"_value": ["in1"],
                         "_idx": [0]} }

        Returns
        -------
        dict
            the chosen architecture
        """
        chosen_arch = {}
        for key, val in self._search_space.items():
            if val["_type"] == LAYER_CHOICE:
                choices = val["_value"]
                chosen_arch[key] = {"_value": choices[0], "_idx": 0}
            elif val["_type"] == INPUT_CHOICE:
                choices = val["_value"]["candidates"]
                n_chosen = val["_value"]["n_chosen"]
                if n_chosen is None:
                    n_chosen = len(choices)
                chosen_arch[key] = {"_value": choices[:n_chosen], "_idx": list(range(n_chosen))}
            else:
                raise ValueError("Unknown key '%s' and value '%s'." % (key, val))
        return chosen_arch

    def _generate_search_space(self):
        """
        Generate search space from mutables.
        Here is the search space format:
        ::
            { key_name: {"_type": "layer_choice",
                         "_value": ["conv1", "conv2"]} }

            { key_name: {"_type": "input_choice",
                         "_value": {"candidates": ["in1", "in2"],
                                    "n_chosen": 1}} }

        Returns
        -------
        dict
            the generated search space
        """
        search_space = {}
        for mutable in self.mutables:
            # for now we only generate flattened search space
            if isinstance(mutable, LayerChoice):
                key = mutable.key
                val = mutable.names
                search_space[key] = {"_type": LAYER_CHOICE, "_value": val}
            elif isinstance(mutable, InputChoice):
                key = mutable.key
                search_space[key] = {"_type": INPUT_CHOICE,
                                     "_value": {"candidates": mutable.choose_from,
                                                "n_chosen": mutable.n_chosen}}
            elif isinstance(mutable, MutableScope):
                logger.info("Mutable scope '%s' is skipped during generating search space.", mutable.key)
            else:
                raise TypeError("Unsupported mutable type: '%s'." % type(mutable))
        return search_space

    def _dump_search_space(self, file_path):
        with open(file_path, "w") as ss_file:
            json.dump(self._search_space, ss_file, sort_keys=True, indent=2)
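Typical trial-side usage of get_and_apply_next_architecture (sketch, not part of the diff), where build_model() is an assumed function that returns a Keras model whose search space uses LayerChoice/InputChoice:

    from nni.algorithms.nas.tensorflow.classic_nas import get_and_apply_next_architecture

    model = build_model()                     # assumed: user model with mutables
    get_and_apply_next_architecture(model)    # fixes the architecture chosen by the tuner
    # ... continue with ordinary training / evaluation of the now-fixed model,
    # reporting the final metric back to NNI as in a normal trial ...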
nni/algorithms/nas/tensorflow/enas/__init__.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from .mutator import EnasMutator
from .trainer import EnasTrainer
nni/algorithms/nas/tensorflow/enas/mutator.py
deleted 100644 → 0

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

# pylint: skip-file

import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, LSTMCell, RNN
from tensorflow.keras.losses import SparseCategoricalCrossentropy, Reduction

from nni.nas.tensorflow.mutator import Mutator
from nni.nas.tensorflow.mutables import LayerChoice, InputChoice, MutableScope


class EnasMutator(Mutator):
    def __init__(self, model,
                 lstm_size=64,
                 lstm_num_layers=1,
                 tanh_constant=1.5,
                 cell_exit_extra_step=False,
                 skip_target=0.4,
                 temperature=None,
                 branch_bias=0.25,
                 entropy_reduction='sum'):
        super().__init__(model)
        self.tanh_constant = tanh_constant
        self.temperature = temperature
        self.cell_exit_extra_step = cell_exit_extra_step

        cells = [LSTMCell(units=lstm_size, use_bias=False) for _ in range(lstm_num_layers)]
        self.lstm = RNN(cells, stateful=True)
        self.g_emb = tf.random.normal((1, 1, lstm_size)) * 0.1
        self.skip_targets = tf.constant([1.0 - skip_target, skip_target])

        self.max_layer_choice = 0
        self.bias_dict = {}
        for mutable in self.mutables:
            if isinstance(mutable, LayerChoice):
                if self.max_layer_choice == 0:
                    self.max_layer_choice = len(mutable)
                assert self.max_layer_choice == len(mutable), \
                    "ENAS mutator requires all layer choice have the same number of candidates."
                if 'reduce' in mutable.key:
                    bias = []
                    for choice in mutable.choices:
                        if 'conv' in str(type(choice)).lower():
                            bias.append(branch_bias)
                        else:
                            bias.append(-branch_bias)
                    self.bias_dict[mutable.key] = tf.constant(bias)

        # exposed for trainer
        self.sample_log_prob = 0
        self.sample_entropy = 0
        self.sample_skip_penalty = 0

        # internal nn layers
        self.embedding = Embedding(self.max_layer_choice + 1, lstm_size)
        self.soft = Dense(self.max_layer_choice, use_bias=False)
        self.attn_anchor = Dense(lstm_size, use_bias=False)
        self.attn_query = Dense(lstm_size, use_bias=False)
        self.v_attn = Dense(1, use_bias=False)
        assert entropy_reduction in ['sum', 'mean'], 'Entropy reduction must be one of sum and mean.'
        self.entropy_reduction = tf.reduce_sum if entropy_reduction == 'sum' else tf.reduce_mean
        self.cross_entropy_loss = SparseCategoricalCrossentropy(from_logits=True, reduction=Reduction.NONE)

        self._first_sample = True

    def sample_search(self):
        self._initialize()
        self._sample(self.mutables)
        self._first_sample = False
        return self._choices

    def sample_final(self):
        return self.sample_search()

    def _sample(self, tree):
        mutable = tree.mutable
        if isinstance(mutable, LayerChoice) and mutable.key not in self._choices:
            self._choices[mutable.key] = self._sample_layer_choice(mutable)
        elif isinstance(mutable, InputChoice) and mutable.key not in self._choices:
            self._choices[mutable.key] = self._sample_input_choice(mutable)
        for child in tree.children:
            self._sample(child)
        if self.cell_exit_extra_step and isinstance(mutable, MutableScope) and mutable.key not in self._anchors_hid:
            self._anchors_hid[mutable.key] = self.lstm(self._inputs, 1)

    def _initialize(self):
        self._choices = {}
        self._anchors_hid = {}
        self._inputs = self.g_emb
        # seems the `input_shape` parameter of RNN does not work
        # work around it by omitting `reset_states` for the first run
        if not self._first_sample:
            self.lstm.reset_states()
        self.sample_log_prob = 0
        self.sample_entropy = 0
        self.sample_skip_penalty = 0

    def _sample_layer_choice(self, mutable):
        logit = self.soft(self.lstm(self._inputs))
        if self.temperature is not None:
            logit /= self.temperature
        if self.tanh_constant is not None:
            logit = self.tanh_constant * tf.tanh(logit)
        if mutable.key in self.bias_dict:
            logit += self.bias_dict[mutable.key]
        softmax_logit = tf.math.log(tf.nn.softmax(logit, axis=-1))
        branch_id = tf.reshape(tf.random.categorical(softmax_logit, num_samples=1), [1])
        log_prob = self.cross_entropy_loss(branch_id, logit)
        self.sample_log_prob += self.entropy_reduction(log_prob)
        entropy = log_prob * tf.math.exp(-log_prob)
        self.sample_entropy += self.entropy_reduction(entropy)
        self._inputs = tf.reshape(self.embedding(branch_id), [1, 1, -1])
        mask = tf.one_hot(branch_id, self.max_layer_choice)
        return tf.cast(tf.reshape(mask, [-1]), tf.bool)

    def _sample_input_choice(self, mutable):
        query, anchors = [], []
        for label in mutable.choose_from:
            if label not in self._anchors_hid:
                self._anchors_hid[label] = self.lstm(self._inputs)
            query.append(self.attn_anchor(self._anchors_hid[label]))
            anchors.append(self._anchors_hid[label])
        query = tf.concat(query, axis=0)
        query = tf.tanh(query + self.attn_query(anchors[-1]))
        query = self.v_attn(query)

        if self.temperature is not None:
            query /= self.temperature
        if self.tanh_constant is not None:
            query = self.tanh_constant * tf.tanh(query)

        if mutable.n_chosen is None:
            logit = tf.concat([-query, query], axis=1)
            softmax_logit = tf.math.log(tf.nn.softmax(logit, axis=-1))
            skip = tf.reshape(tf.random.categorical(softmax_logit, num_samples=1), [-1])
            skip_prob = tf.math.sigmoid(logit)
            kl = tf.reduce_sum(skip_prob * tf.math.log(skip_prob / self.skip_targets))
            self.sample_skip_penalty += kl
            log_prob = self.cross_entropy_loss(skip, logit)

            skip = tf.cast(skip, tf.float32)
            inputs = tf.tensordot(skip, tf.concat(anchors, 0), 1) / (1. + tf.reduce_sum(skip))
            self._inputs = tf.reshape(inputs, [1, 1, -1])

        else:
            assert mutable.n_chosen == 1, "Input choice must select exactly one or any in ENAS."
            logit = tf.reshape(query, [1, -1])
            softmax_logit = tf.math.log(tf.nn.softmax(logit, axis=-1))
            index = tf.reshape(tf.random.categorical(softmax_logit, num_samples=1), [-1])
            skip = tf.reshape(tf.one_hot(index, mutable.n_candidates), [-1])
            # when the size is 1, tf does not accept a tensor here, complaining the shape is wrong
            # but using a numpy array seems fine
            log_prob = self.cross_entropy_loss(logit, query.numpy())
            self._inputs = tf.reshape(anchors[index.numpy()[0]], [1, 1, -1])

        self.sample_log_prob += self.entropy_reduction(log_prob)
        entropy = log_prob * tf.exp(-log_prob)
        self.sample_entropy += self.entropy_reduction(entropy)
        assert len(skip) == mutable.n_candidates, (skip, mutable.n_candidates, mutable.n_chosen)
        return tf.cast(skip, tf.bool)
nni/algorithms/nas/tensorflow/enas/trainer.py
deleted 100644 → 0
(This diff is collapsed in the page view.)
nni/retiarii/evaluator/functional.py → nni/nas/evaluator/functional.py
File moved