Merge pull request #5036 from microsoft/promote-retiarii-to-nas

[DO NOT SQUASH] Promote retiarii to NAS

Merge pull request #5036 from microsoft/promote-retiarii-to-nas
[DO NOT SQUASH] Promote retiarii to NAS
a0fd0036 · Yuge Zhang · GitHub · d6dcb483 · bc6d8796 · a0fd0036
Unverified commit a0fd0036, authored Aug 01, 2022 by Yuge Zhang; committed by GitHub on Aug 01, 2022
20 changed files
--- a/nni/retiarii/graph.py
+++ b/nni/retiarii/graph.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-"""
-Model representation.
-"""
+# pylint: disable=wildcard-import,unused-wildcard-import

-from __future__ import annotations
-
-import abc
-import json
-from enum import Enum
-from typing import (TYPE_CHECKING, Any, Callable, Dict, Iterable, List,
-                    Optional, Set, Tuple, Type, Union, cast, overload)
-
-if TYPE_CHECKING:
-    from .mutator import Mutator
-
-from .operation import Cell, Operation, _IOPseudoOperation
-from .utils import uid
-
-__all__ = ['Evaluator', 'Model', 'ModelStatus', 'Graph', 'Node', 'Edge', 'Mutation', 'IllegalGraphError', 'MetricData']
-
-
-MetricData = Any
-"""
-Type hint for graph metrics (loss, accuracy, etc).
-"""
-
-EdgeEndpoint = Tuple['Node', Optional[int]]
-"""
-Type hint for edge's endpoint. The int indicates nodes' order.
-"""
-
-
-class Evaluator(abc.ABC):
-    """
-    Evaluator of a model. An evaluator should define where the training code is, and the configuration of
-    training code. The configuration includes basic runtime information trainer needs to know (such as number of GPUs)
-    or tune-able parameters (such as learning rate), depending on the implementation of training code.
-
-    Each config should define how it is interpreted in ``_execute()``, taking only one argument which is the mutated model class.
-    For example, functional evaluator might directly import the function and call the function.
-    """
-
-    def evaluate(self, model_cls: Union[Callable[[], Any], Any]) -> Any:
-        """To run evaluation of a model. The model could be either a concrete model or a callable returning a model.
-
-        The concrete implementation of evaluate depends on the implementation of ``_execute()`` in sub-class.
-        """
-        return self._execute(model_cls)
-
-    def __repr__(self):
-        items = ', '.join(['%s=%r' % (k, v) for k, v in self.__dict__.items()])
-        return f'{self.__class__.__name__}({items})'
-
-    @staticmethod
-    def _load(ir: Any) -> 'Evaluator':
-        evaluator_type = ir.get('type')
-        if isinstance(evaluator_type, str):
-            # for debug purposes only
-            for subclass in Evaluator.__subclasses__():
-                if subclass.__name__ == evaluator_type:
-                    evaluator_type = subclass
-                    break
-        assert issubclass(cast(type, evaluator_type), Evaluator)
-        return cast(Type[Evaluator], evaluator_type)._load(ir)
-
-    @abc.abstractmethod
-    def _dump(self) -> Any:
-        """
-        Subclass implements ``_dump`` for their own serialization.
-        They should return a dict, with a key ``type`` which equals ``self.__class__``,
-        and optionally other keys.
-        """
-        pass
-
-    @abc.abstractmethod
-    def _execute(self, model_cls: Union[Callable[[], Any], Any]) -> Any:
-        pass
-
-    @abc.abstractmethod
-    def __eq__(self, other) -> bool:
-        pass
-
-
-class Model:
-    """
-    Represents a neural network model.
-
-    During mutation, one :class:`Model` object is created for each trainable snapshot.
-    For example, consider a mutator that insert a node at an edge for each iteration.
-    In one iteration, the mutator invokes 4 primitives: add node, remove edge, add edge to head, add edge to tail.
-    These 4 primitives operates in one :class:`Model` object.
-    When they are all done the model will be set to "frozen" (trainable) status and be submitted to execution engine.
-    And then a new iteration starts, and a new :class:`Model` object is created by forking last model.
-
-    Attributes
-    ----------
-    python_object
-        Python object of base model. It will be none when the base model is not available.
-    python_class
-        Python class that base model is converted from.
-    python_init_params
-        Initialization parameters of python class.
-    status
-        See :class:`ModelStatus`.
-    root_graph
-        The outermost graph which usually takes dataset as input and feeds output to loss function.
-    graphs
-        All graphs (subgraphs) in this model.
-    evaluator
-        Model evaluator
-    history
-        Mutation history.
-        ``self`` is directly mutated from ``self.history[-1]``;
-        ``self.history[-1]`` is mutated from ``self.history[-2]``, and so on.
-        ``self.history[0]`` is the base graph.
-    metric
-        Training result of the model, or ``None`` if it's not yet trained or has failed to train.
-    intermediate_metrics
-        Intermediate training metrics. If the model is not trained, it's an empty list.
-    """
-
-    def __init__(self, _internal=False):
-        assert _internal, '`Model()` is private, use `model.fork()` instead'
-        self.model_id: int = uid('model')
-        self.python_object: Optional[Any] = None  # type is uncertain because it could differ between DL frameworks
-        self.python_class: Optional[Type] = None
-        self.python_init_params: Optional[Dict[str, Any]] = None
-
-        self.status: ModelStatus = ModelStatus.Mutating
-
-        self._root_graph_name: str = '_model'
-        self.graphs: Dict[str, Graph] = {}
-        self.evaluator: Optional[Evaluator] = None
-
-        self.history: List['Mutation'] = []
-
-        self.metric: Optional[MetricData] = None
-        self.intermediate_metrics: List[MetricData] = []
-
-    def __repr__(self):
-        return f'Model(model_id={self.model_id}, status={self.status}, graphs={list(self.graphs.keys())}, ' + \
-            f'evaluator={self.evaluator}, metric={self.metric}, intermediate_metrics={self.intermediate_metrics}, ' + \
-            f'python_class={self.python_class})'
-
-    @property
-    def root_graph(self) -> 'Graph':
-        return self.graphs[self._root_graph_name]
-
-    def fork(self) -> 'Model':
-        """
-        Create a new model which has same topology, names, and IDs to current one.
-
-        Can only be invoked on a frozen model.
-        The new model will be in `Mutating` state.
-
-        This API is used in mutator base class.
-        """
-        new_model = Model(_internal=True)
-        new_model._root_graph_name = self._root_graph_name
-        new_model.python_class = self.python_class
-        new_model.python_init_params = self.python_init_params
-        new_model.graphs = {name: graph._fork_to(new_model) for name, graph in self.graphs.items()}
-        new_model.evaluator = self.evaluator  # TODO this needs a clever copy (not deepcopy) if we need mutation
-        new_model.history = [*self.history]
-        # Note: the history is not updated. It will be updated when the model is changed, that is in mutator.
-        return new_model
-
-    @staticmethod
-    def _load(ir: Any) -> 'Model':
-        model = Model(_internal=True)
-        for graph_name, graph_data in ir.items():
-            if graph_name != '_evaluator':
-                Graph._load(model, graph_name, graph_data)._register()
-        if '_evaluator' in ir:
-            model.evaluator = Evaluator._load(ir['_evaluator'])
-        return model
-
-    def _dump(self) -> Any:
-        ret = {name: graph._dump() for name, graph in self.graphs.items()}
-        if self.evaluator is not None:
-            ret['_evaluator'] = self.evaluator._dump()
-        return ret
-
-    def get_nodes(self) -> Iterable['Node']:
-        """
-        Traverse through all the nodes.
-        """
-        for graph in self.graphs.values():
-            for node in graph.nodes:
-                yield node
-
-    def get_nodes_by_label(self, label: str) -> List['Node']:
-        """
-        Traverse all the nodes to find the matched node(s) with the given label.
-        There could be multiple nodes with the same label. Name space name can uniquely
-        identify a graph or node.
-
-        NOTE: the implementation does not support the class abstraction
-        """
-        matched_nodes = []
-        for graph in self.graphs.values():
-            nodes = graph.get_nodes_by_label(label)
-            matched_nodes.extend(nodes)
-        return matched_nodes
-
-    def get_nodes_by_type(self, type_name: str) -> List['Node']:
-        """
-        Traverse all the nodes to find the matched node(s) with the given type.
-        """
-        matched_nodes = []
-        for graph in self.graphs.values():
-            nodes = graph.get_nodes_by_type(type_name)
-            matched_nodes.extend(nodes)
-        return matched_nodes
-
-    def get_node_by_name(self, node_name: str) -> 'Node' | None:
-        """
-        Traverse all the nodes to find the matched node with the given name.
-        """
-        matched_nodes = []
-        for graph in self.graphs.values():
-            nodes = graph.get_nodes_by_name(node_name)
-            matched_nodes.extend(nodes)
-        assert len(matched_nodes) <= 1
-        if matched_nodes:
-            return matched_nodes[0]
-        else:
-            return None
-
-    def get_node_by_python_name(self, python_name: str) -> Optional['Node']:
-        """
-        Traverse all the nodes to find the matched node with the given python_name.
-        """
-        matched_nodes = []
-        for graph in self.graphs.values():
-            nodes = graph.get_nodes_by_python_name(python_name)
-            matched_nodes.extend(nodes)
-        # assert len(matched_nodes) <= 1
-        if matched_nodes:
-            return matched_nodes[0]
-        else:
-            return None
-
-    def get_cell_nodes(self) -> List['Node']:
-        matched_nodes = []
-        for graph in self.graphs.values():
-            nodes = [node for node in graph.nodes if isinstance(node.operation, Cell)]
-            matched_nodes.extend(nodes)
-        return matched_nodes
-
-
-class ModelStatus(Enum):
-    """
-    The status of model.
-
-    A model is created in `Mutating` status.
-    When the mutation is done and the model get ready to train, its status becomes `Frozen`.
-    When training started, the model's status becomes `Training`.
-    If training is successfully ended, model's `metric` attribute get set and its status becomes `Trained`.
-    If training failed, the status becomes `Failed`.
-    """
-    Mutating = "mutating"
-    Frozen = "frozen"
-    Training = "training"
-    Trained = "trained"
-    Failed = "failed"
-
-
-_InputPseudoUid = -1
-_OutputPseudoUid = -2
-
-
-class Graph:
-    """
-    Graph topology.
-
-    This class simply represents the topology, with no semantic meaning.
-    All other information like metric, non-graph functions, mutation history, etc should go to :class:`Model`.
-
-    Each graph belongs to and only belongs to one :class:`Model`.
-
-    Attributes
-    ----------
-    model
-        The model containing (and owning) this graph.
-    id
-        Unique ID in the model.
-        If two models have graphs of identical ID, they are semantically the same graph.
-        Typically this means one graph is mutated from another, or they are both mutated from one ancestor.
-    name
-        Mnemonic name of this graph. It should have an one-to-one mapping with ID.
-    input_names
-        Optional mnemonic names of input parameters.
-    output_names
-        Optional mnemonic names of output values.
-    input_node
-        Incoming node.
-    output_node
-        Output node.
-    hidden_nodes
-        Hidden nodes
-    nodes
-        All input/output/hidden nodes.
-    edges
-        Edges.
-    python_name
-        The name of torch.nn.Module, should have one-to-one mapping with items in python model.
-    """
-
-    def __init__(self, model: Model, graph_id: int, name: str = cast(str, None), _internal: bool = False):
-        assert _internal, '`Graph()` is private'
-
-        self.model: Model = model
-        self.id: int = graph_id
-        self.name: str = name or f'_generated_{graph_id}'
-
-        # `python_name` is `None` by default. It should be set after initialization if it is needed.
-        self.python_name: Optional[str] = None
-
-        self.input_node: Node = Node(self, _InputPseudoUid, '_inputs', _IOPseudoOperation('_inputs'), _internal=True)
-        self.output_node: Node = Node(self, _OutputPseudoUid, '_outputs', _IOPseudoOperation('_outputs'), _internal=True)
-        self.hidden_nodes: List[Node] = []
-
-        self.edges: List[Edge] = []
-
-    def __repr__(self):
-        return f'Graph(id={self.id}, name={self.name}, ' + \
-            f'input_names={self.input_node.operation.io_names}, ' + \
-            f'output_names={self.output_node.operation.io_names}, ' + \
-            f'num_hidden_nodes={len(self.hidden_nodes)}, num_edges={len(self.edges)})'
-
-    @property
-    def nodes(self) -> List['Node']:
-        return [self.input_node, self.output_node] + self.hidden_nodes
-
-    def _add_input(self, input_name) -> None:
-        if self.input_node.operation.io_names is None:
-            self.input_node.operation.io_names = [input_name]
-        else:
-            self.input_node.operation.io_names.append(input_name)
-
-    def _add_output(self, output_name) -> None:
-        if self.output_node.operation.io_names is None:
-            self.output_node.operation.io_names = [output_name]
-        else:
-            self.output_node.operation.io_names.append(output_name)
-
-    @overload
-    def add_node(self, name: str, operation: Operation) -> 'Node': ...
-    @overload
-    def add_node(self, name: str, type_name: str, parameters: Dict[str, Any] = cast(Dict[str, Any], None)) -> 'Node': ...
-
-    def add_node(self, name, operation_or_type, parameters=None):  # type: ignore
-        if isinstance(operation_or_type, Operation):
-            op = operation_or_type
-        else:
-            op = Operation.new(operation_or_type, cast(dict, parameters), name)
-        return Node(self, uid(), name, op, _internal=True)._register()
-
-    @overload
-    def insert_node_on_edge(self, edge: 'Edge', name: str, operation: Operation) -> 'Node': ...
-
-    @overload
-    def insert_node_on_edge(self, edge: 'Edge', name: str, type_name: str,
-                            parameters: Dict[str, Any] = cast(Dict[str, Any], None)) -> 'Node': ...
-
-    def insert_node_on_edge(self, edge, name, operation_or_type, parameters=None) -> 'Node':  # type: ignore
-        if isinstance(operation_or_type, Operation):
-            op = operation_or_type
-        else:
-            op = Operation.new(operation_or_type, cast(dict, parameters), name)
-        new_node = Node(self, uid(), name, op, _internal=True)._register()
-        # update edges
-        self.add_edge((edge.head, edge.head_slot), (new_node, None))
-        self.add_edge((new_node, None), (edge.tail, edge.tail_slot))
-        self.del_edge(edge)
-        return new_node
-
-    # mutation
-    def add_edge(self, head: EdgeEndpoint, tail: EdgeEndpoint) -> 'Edge':
-        assert head[0].graph is self and tail[0].graph is self
-        return Edge(head, tail, _internal=True)._register()
-
-    def del_edge(self, edge: 'Edge') -> None:
-        self.edges.remove(edge)
-
-    def get_node_by_name(self, name: str) -> Optional['Node']:
-        """
-        Returns the node which has specified name; or returns `None` if no node has this name.
-        """
-        found = [node for node in self.nodes if node.name == name]
-        return found[0] if found else None
-
-    def get_node_by_python_name(self, python_name: str) -> Optional['Node']:
-        """
-        Returns the node which has specified python_name; or returns `None` if no node has this python_name.
-        """
-        found = [node for node in self.nodes if node.python_name == python_name]
-        return found[0] if found else None
-
-    def get_nodes_by_type(self, operation_type: str) -> List['Node']:
-        """
-        Returns nodes whose operation is specified typed.
-        """
-        return [node for node in self.hidden_nodes if node.operation.type == operation_type]
-
-    def get_node_by_id(self, node_id: int) -> Optional['Node']:
-        """
-        Returns the node which has specified name; or returns `None` if no node has this name.
-        """
-        found = [node for node in self.nodes if node.id == node_id]
-        return found[0] if found else None
-
-    def get_nodes_by_label(self, label: str) -> List['Node']:
-        return [node for node in self.hidden_nodes if node.label == label]
-
-    def get_nodes_by_name(self, name: str) -> List['Node']:
-        return [node for node in self.hidden_nodes if node.name == name]
-
-    def get_nodes_by_python_name(self, python_name: str) -> List['Node']:
-        return [node for node in self.nodes if node.python_name == python_name]
-
-    def topo_sort(self) -> List['Node']:
-        node_to_fanin = {}
-        curr_nodes = []
-        for node in self.nodes:
-            fanin = len(node.incoming_edges)
-            node_to_fanin[node] = fanin
-            if fanin == 0:
-                curr_nodes.append(node)
-
-        sorted_nodes = []
-        while curr_nodes:
-            curr_node = curr_nodes.pop(0)
-            sorted_nodes.append(curr_node)
-            # use successor_slots because a node may connect to another node multiple times
-            # to different slots
-            for successor_slot in curr_node.successor_slots:
-                successor = successor_slot[0]
-                node_to_fanin[successor] -= 1
-                if node_to_fanin[successor] == 0:
-                    curr_nodes.append(successor)
-
-        for key in node_to_fanin:
-            assert node_to_fanin[key] == 0, '{}, fanin: {}, predecessor: {}, edges: {}, fanin: {}, keys: {}'.format(
-                key,
-                node_to_fanin[key],
-                key.predecessors[0],
-                self.edges,
-                node_to_fanin.values(),
-                node_to_fanin.keys())
-
-        return sorted_nodes
-
-    def fork(self) -> 'Graph':
-        """
-        Fork the model and returns corresponding graph in new model.
-        This shortcut might be helpful because many algorithms only cares about "stem" subgraph instead of whole model.
-        """
-        return self.model.fork().graphs[self.name]
-
-    def __eq__(self, other: object) -> bool:
-        return self is other
-
-    def _fork_to(self, model: Model, name_prefix='') -> 'Graph':
-        new_graph = Graph(model, self.id, name_prefix + self.name, _internal=True)._register()
-        # TODO: use node copy instead
-        new_graph.input_node.operation.io_names = self.input_node.operation.io_names
-        new_graph.output_node.operation.io_names = self.output_node.operation.io_names
-        new_graph.input_node.update_label(self.input_node.label)
-        new_graph.output_node.update_label(self.output_node.label)
-        new_graph.python_name = self.python_name
-
-        for node in self.hidden_nodes:
-            new_node = Node(new_graph, node.id, node.name, node.operation, _internal=True)
-            new_node.python_name = node.python_name
-            new_node.update_label(node.label)
-            new_node._register()
-
-        id_to_new_node = {node.id: node for node in new_graph.nodes}
-
-        for edge in self.edges:
-            new_head = id_to_new_node[edge.head.id]
-            new_tail = id_to_new_node[edge.tail.id]
-            Edge((new_head, edge.head_slot), (new_tail, edge.tail_slot), _internal=True)._register()
-
-        return new_graph
-
-    def _copy(self) -> 'Graph':
-        # Copy this graph inside the model.
-        # The new graph will have identical topology, but its nodes' name and ID will be different.
-        new_graph = Graph(self.model, uid(), _internal=True)._register()
-        new_graph.input_node.operation.io_names = self.input_node.operation.io_names
-        new_graph.output_node.operation.io_names = self.output_node.operation.io_names
-        new_graph.input_node.update_label(self.input_node.label)
-        new_graph.output_node.update_label(self.output_node.label)
-        new_graph.python_name = self.python_name
-
-        id_to_new_node = {}  # old node ID -> new node object
-
-        for old_node in self.hidden_nodes:
-            new_node = Node(new_graph, uid(), None, old_node.operation, _internal=True)._register()
-            new_node.python_name = old_node.python_name
-            new_node.update_label(old_node.label)
-            id_to_new_node[old_node.id] = new_node
-
-        for edge in self.edges:
-            new_head = id_to_new_node[edge.head.id]
-            new_tail = id_to_new_node[edge.tail.id]
-            Edge((new_head, edge.head_slot), (new_tail, edge.tail_slot), _internal=True)._register()
-
-        return new_graph
-
-    def _register(self) -> 'Graph':
-        self.model.graphs[self.name] = self
-        return self
-
-    def _rename_graph(self, old_name, new_name):
-        self.model.graphs[old_name].name = new_name
-        self.model.graphs[new_name] = self.model.graphs[old_name]
-        del self.model.graphs[old_name]
-
-    @staticmethod
-    def _load(model: Model, name: str, ir: Any) -> 'Graph':
-        graph = Graph(model, uid(), name, _internal=True)
-        graph.input_node.operation.io_names = ir.get('inputs')
-        graph.output_node.operation.io_names = ir.get('outputs')
-        for node_name, node_data in ir['nodes'].items():
-            Node._load(graph, node_name, node_data)._register()
-        for edge_data in ir['edges']:
-            Edge._load(graph, edge_data)._register()
-        return graph
-
-    def _dump(self) -> Any:
-        return {
-            'inputs': self.input_node.operation.io_names,
-            'outputs': self.output_node.operation.io_names,
-            'nodes': {node.name: node._dump() for node in self.hidden_nodes},
-            'edges': [edge._dump() for edge in self.edges]
-        }
-
-
-class Node:
-    """
-    An operation or an opaque subgraph inside a graph.
-
-    Each node belongs to and only belongs to one :class:`Graph`.
-    Nodes should never be created with constructor. Use :meth:`Graph.add_node` instead.
-
-    The node itself is for topology only.
-    Information of tensor calculation should all go inside ``operation`` attribute.
-
-    TODO: parameter of subgraph (cell)
-    It's easy to assign parameters on cell node, but it's hard to "use" them.
-    We need to design a way to reference stored cell parameters in inner node operations.
-    e.g. ``self.fc = Linear(self.units)``  <-  how to express ``self.units`` in IR?
-
-    Attributes
-    ----------
-    graph
-        The graph containing this node.
-    id
-        Unique ID in the model.
-        If two models have nodes with same ID, they are semantically the same node.
-    name
-        Mnemonic name. It should have an one-to-one mapping with ID.
-    python_name
-        The name of torch.nn.Module, should have one-to-one mapping with items in python model.
-    label
-        Optional. If two nodes have the same label, they are considered same by the mutator.
-    operation
-        Operation.
-    cell
-        Read only shortcut to get the referenced subgraph.
-        If this node is not a subgraph (is a primitive operation), accessing ``cell`` will raise an error.
-    predecessors
-        Predecessor nodes of this node in the graph. This is an optional mutation helper.
-    successors
-        Successor nodes of this node in the graph. This is an optional mutation helper.
-    incoming_edges
-        Incoming edges of this node in the graph. This is an optional mutation helper.
-    outgoing_edges
-        Outgoing edges of this node in the graph. This is an optional mutation helper.
-    """
-
-    def __init__(self, graph, node_id, name, operation, _internal=False):
-        self.graph: Graph = graph
-        self.id: int = node_id
-        self.name: str = name or f'_generated_{node_id}'
-        # `python_name` is `None` by default. It should be set after initialization if it is needed.
-        self.python_name: Optional[str] = None
-        # TODO: the operation is likely to be considered editable by end-user and it will be hard to debug
-        # maybe we should copy it here or make Operation class immutable, in next release
-        self.operation: Operation = operation
-        self.label: Optional[str] = None
-
-    def __repr__(self):
-        return f'Node(id={self.id}, name={self.name}, python_name={self.python_name}, label={self.label}, operation={self.operation})'
-
-    @property
-    def predecessors(self) -> List['Node']:
-        return sorted(set(edge.head for edge in self.incoming_edges), key=(lambda node: node.id))
-
-    @property
-    def successors(self) -> List['Node']:
-        return sorted(set(edge.tail for edge in self.outgoing_edges), key=(lambda node: node.id))
-
-    @property
-    def successor_slots(self) -> Set[Tuple['Node', Union[int, None]]]:
-        return set((edge.tail, edge.tail_slot) for edge in self.outgoing_edges)
-
-    @property
-    def incoming_edges(self) -> List['Edge']:
-        return [edge for edge in self.graph.edges if edge.tail is self]
-
-    @property
-    def outgoing_edges(self) -> List['Edge']:
-        return [edge for edge in self.graph.edges if edge.head is self]
-
-    @property
-    def cell(self) -> Graph:
-        assert isinstance(self.operation, Cell)
-        return self.graph.model.graphs[self.operation.parameters['cell']]
-
-    def update_label(self, label: Optional[str]) -> None:
-        self.label = label
-
-    @overload
-    def update_operation(self, operation: Operation) -> None: ...
-    @overload
-    def update_operation(self, type_name: str, parameters: Dict[str, Any] = cast(Dict[str, Any], None)) -> None: ...
-
-    def update_operation(self, operation_or_type, parameters=None):  # type: ignore
-        if isinstance(operation_or_type, Operation):
-            self.operation = operation_or_type
-        else:
-            self.operation = Operation.new(operation_or_type, cast(dict, parameters))
-
-    # mutation
-    def remove(self) -> None:
-        assert not self.incoming_edges and not self.outgoing_edges
-        self.graph.hidden_nodes.remove(self)
-
-    # mutation
-    def specialize_cell(self) -> Graph:
-        """
-        Only available if the operation is a cell.
-        Duplicate the cell template and let this node reference to newly created copy.
-        """
-        new_cell = self.cell._copy()._register()
-        self.operation = Cell(new_cell.name)
-        return new_cell
-
-    def __eq__(self, other: object) -> bool:
-        return self is other
-
-    def __hash__(self) -> int:
-        return hash(id(self))
-
-    def _register(self) -> 'Node':
-        self.graph.hidden_nodes.append(self)
-        return self
-
-    @staticmethod
-    def _load(graph: Graph, name: str, ir: Any) -> 'Node':
-        if ir['operation']['type'] == '_cell':
-            op = Cell(ir['operation']['cell_name'], ir['operation'].get('parameters', {}), attributes=ir['operation'].get('attributes', {}))
-        else:
-            op = Operation.new(ir['operation']['type'],
-                               ir['operation'].get('parameters', {}),
-                               attributes=ir['operation'].get('attributes', {}))
-        node = Node(graph, uid(), name, op)
-        if 'label' in ir:
-            node.update_label(ir['label'])
-        return node
-
-    def _dump(self) -> Any:
-        ret: Dict[str, Any] = {
-            'operation': {
-                'type': self.operation.type,
-                'parameters': self.operation.parameters,
-                'attributes': self.operation.attributes
-            }
-        }
-        if isinstance(self.operation, Cell):
-            ret['operation']['cell_name'] = self.operation.cell_name
-        if self.label is not None:
-            ret['label'] = self.label
-        if self.python_name is not None:
-            ret['python_name'] = self.python_name
-        return ret
-
-
-class Edge:
-    """
-    A tensor, or "data flow", between two nodes.
-
-    Example forward code snippet: ::
-
-        a, b, c = split(x)
-        p = concat(a, c)
-        q = sum(b, p)
-        z = relu(q)
-
-    Edges in above snippet: ::
-
-        + head: (split, 0), tail: (concat, 0)  # a in concat
-        + head: (split, 2), tail: (concat, 1)  # c in concat
-        + head: (split, 1), tail: (sum, -1 or 0)  # b in sum
-        + head: (concat, null), tail: (sum, -1 or 1)  # p in sum
-        + head: (sum, null), tail: (relu, null)  # q in relu
-
-    Attributes
-    ----------
-    graph
-        Graph.
-    head
-        Head node.
-    tail
-        Tail node.
-    head_slot
-        Index of outputs in head node.
-        If the node has only one output, this should be ``null``.
-    tail_slot
-        Index of inputs in tail node.
-        If the node has only one input, this should be ``null``.
-        If the node does not care about order, this can be ``-1``.
-    """
-
-    def __init__(self, head: EdgeEndpoint, tail: EdgeEndpoint, _internal: bool = False):
-        assert _internal, '`Edge()` is private'
-        self.graph: Graph = head[0].graph
-        self.head: Node = head[0]
-        self.tail: Node = tail[0]
-        self.head_slot: Optional[int] = head[1]
-        self.tail_slot: Optional[int] = tail[1]
-
-    def __repr__(self):
-        return f'Edge(head=({self.head}, {self.head_slot}), tail=({self.tail}, {self.tail_slot}))'
-
-    # mutation
-    def remove(self) -> None:
-        self.graph.edges.remove(self)
-
-    def _register(self) -> 'Edge':
-        self.graph.edges.append(self)
-        return self
-
-    @staticmethod
-    def _load(graph: Graph, ir: Any) -> 'Edge':
-        head = graph.get_node_by_name(ir['head'][0])
-        tail = graph.get_node_by_name(ir['tail'][0])
-        assert head is not None and tail is not None
-        return Edge((head, ir['head'][1]), (tail, ir['tail'][1]), _internal=True)
-
-    def _dump(self) -> Any:
-        return {
-            'head': [self.head.name, self.head_slot],
-            'tail': [self.tail.name, self.tail_slot]
-        }
-
-
-class Mutation:
-    """
-    An execution of mutation, which consists of four parts: a mutator, a list of decisions (choices),
-    the model that it comes from, and the model that it becomes.
-
-    In general cases, the mutation logs are not reliable and should not be replayed as the mutators can
-    be arbitrarily complex. However, for inline mutations, the labels correspond to mutator labels here,
-    this can be useful for metadata visualization and python execution mode.
-
-    Attributes
-    ----------
-    mutator
-        Mutator.
-    samples
-        Decisions/choices.
-    from_
-        Model that is comes from.
-    to
-        Model that it becomes.
-    """
-
-    def __init__(self, mutator: 'Mutator', samples: List[Any], from_: Model, to: Model):  # noqa: F821
-        self.mutator: 'Mutator' = mutator  # noqa: F821
-        self.samples: List[Any] = samples
-        self.from_: Model = from_
-        self.to: Model = to
-
-    def __repr__(self):
-        return f'Edge(mutator={self.mutator}, samples={self.samples}, from={self.from_}, to={self.to})'
-
-
-class IllegalGraphError(ValueError):
-    """
-    ``ValueError`` raised for an illegal graph.
-
-    As a debugging aid, constructing the error dumps the offending graph as JSON to
-    ``generated/debug.json`` (relative to the current working directory).
-
-    Parameters
-    ----------
-    graph
-        A :class:`Graph` (dumped through ``graph._dump()``) or an already JSON-serializable object.
-    *args
-        Forwarded unchanged to ``ValueError``.
-    """
-    def __init__(self, graph, *args):
-        self._debug_dump_graph(graph)
-        super().__init__(*args)
-
-    @staticmethod
-    def _debug_dump_graph(graph):
-        # Imported locally so the fix does not depend on the module-level import block.
-        import os
-
-        if isinstance(graph, Graph):
-            graph = graph._dump()
-        # Make sure the target directory exists; otherwise a missing ``generated/`` folder raises
-        # ``FileNotFoundError`` here and hides the graph error being reported.
-        os.makedirs('generated', exist_ok=True)
-        with open('generated/debug.json', 'w') as dump_file:
-            json.dump(graph, dump_file, indent=4)
-
-
-class DebugEvaluator(Evaluator):
-    """Placeholder evaluator: executing it does nothing and it compares equal to anything."""
-
-    @staticmethod
-    def _load(ir: Any) -> 'DebugEvaluator':
-        # ``ir`` is ignored; a fresh instance is always returned.
-        fresh = DebugEvaluator()
-        return fresh
-
-    def _dump(self) -> Any:
-        # The only entry is the class object itself, stored under ``'type'``.
-        return dict(type=DebugEvaluator)
-
-    def _execute(self, model_cls: type) -> Any:
-        # Intentionally a no-op.
-        return None
-
-    def __eq__(self, other) -> bool:
-        # Unconditionally equal, whatever ``other`` is.
-        return True
+from nni.nas.execution.common.graph import *
--- a/nni/retiarii/hub/pytorch/autoformer.py
+++ b/nni/retiarii/hub/pytorch/autoformer.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-from typing import Optional, Tuple, cast, Any, Dict
+# pylint: disable=wildcard-import,unused-wildcard-import

-import torch
-import torch.nn.functional as F
-from timm.models.layers import trunc_normal_, DropPath
-
-import nni.retiarii.nn.pytorch as nn
-from nni.retiarii import model_wrapper, basic_unit
-from nni.retiarii.nn.pytorch.api import ValueChoiceX
-from nni.retiarii.oneshot.pytorch.supermodule.operation import MixedOperation
-from nni.retiarii.oneshot.pytorch.supermodule._valuechoice_utils import traverse_all_options
-from nni.retiarii.oneshot.pytorch.supermodule._operation_utils import Slicable as _S, MaybeWeighted as _W
-
-from .utils.fixed import FixedFactory
-from .utils.pretrained import load_pretrained_weight
-
-
-class RelativePosition2D(nn.Module):
-    """
-    Learnable 2D relative position embedding.
-
-    Two tables (vertical and horizontal) of ``length * 2 + 2`` rows each are indexed by the clipped
-    row / column distance between tokens. Index 0 is reserved for the cls token.
-    """
-    def __init__(self, head_embed_dim, length=14,) -> None:
-        super().__init__()
-        self.head_embed_dim = head_embed_dim
-        self.legnth = length
-        table_rows = length * 2 + 2
-        self.embeddings_table_v = nn.Parameter(torch.randn(table_rows, head_embed_dim))
-        self.embeddings_table_h = nn.Parameter(torch.randn(table_rows, head_embed_dim))
-
-        for table in (self.embeddings_table_v, self.embeddings_table_h):
-            trunc_normal_(table, std=.02)
-
-    def forward(self, length_q, length_k):
-        """Return the summed vertical + horizontal embeddings for every (query, key) pair."""
-        # the first (cls) token takes no part in the distance computation
-        q_len = length_q - 1
-        k_len = length_k - 1
-        # create the index vectors directly on the tables' device
-        q_idx = torch.arange(q_len, device=self.embeddings_table_v.device)
-        k_idx = torch.arange(k_len, device=self.embeddings_table_v.device)
-        # row and column distance on a grid whose side is sqrt(q_len)
-        side = int(q_len ** 0.5)
-        row_delta = k_idx[None, :] // side - q_idx[:, None] // side
-        col_delta = k_idx[None, :] % side - q_idx[:, None] % side
-
-        limit = self.legnth
-
-        def to_table_index(delta):
-            # clip to [-limit, limit], then shift into [1, 2 * limit + 1]
-            shifted = torch.clamp(delta, - limit, limit) + limit + 1
-            # pad one leading row and column with 0, the index that stands for the cls token
-            return F.pad(shifted, (1, 0, 1, 0), "constant", 0).long()
-
-        index_v = to_table_index(row_delta)
-        index_h = to_table_index(col_delta)
-        # look up both tables and add the results
-        return self.embeddings_table_v[index_v] + self.embeddings_table_h[index_h]
-
-
-class RelativePositionAttention(nn.Module):
-    """
-    This class is designed to support the relative position in attention.
-    The pytorch built-in nn.MultiheadAttention() does not support relative position embedding.
-    Different from the absolute position embedding, the relative position embedding
-    encodes the relative distance between input tokens and learns the pairwise relations of them.
-    It is commonly calculated via a look-up table with learnable parameters interacting with queries
-    and keys in self-attention modules.
-
-    Parameters
-    ----------
-    embed_dim
-        Input / output feature size.
-    num_heads
-        Number of attention heads.
-    attn_drop, proj_drop
-        Dropout probabilities applied to the attention map and to the output projection.
-    qkv_bias
-        Whether the q / k / v projections have a bias.
-    qk_scale
-        Scale applied to the attention scores; falls back to ``head_dim ** -0.5`` when falsy.
-    rpe_length, rpe
-        Length passed to :class:`RelativePosition2D` and whether relative position embedding is used.
-    head_dim
-        Size of each head; falls back to ``embed_dim // num_heads`` when falsy (e.g. ``None``).
-    """
-    def __init__(
-            self, embed_dim, num_heads,
-            attn_drop=0., proj_drop=0.,
-            qkv_bias=False, qk_scale=None,
-            rpe_length=14, rpe=False,
-            head_dim=64):
-        super().__init__()
-        self.num_heads = num_heads
-        # head_dim is fixed 64 in official autoformer. set head_dim = None to use flex head dim.
-        # Resolve the fallback once and use the resolved value everywhere below: the raw argument
-        # may be None, which previously broke ``head_dim ** -0.5`` and ``head_dim * num_heads``.
-        head_dim = head_dim or (embed_dim // num_heads)
-        self.head_dim = head_dim
-        self.scale = qk_scale or head_dim ** -0.5
-
-        # Please refer to MixedMultiheadAttention for details.
-        self.q = nn.Linear(embed_dim, head_dim * num_heads, bias = qkv_bias)
-        self.k = nn.Linear(embed_dim, head_dim * num_heads, bias = qkv_bias)
-        self.v = nn.Linear(embed_dim, head_dim * num_heads, bias = qkv_bias)
-
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(head_dim * num_heads, embed_dim)
-        self.proj_drop = nn.Dropout(proj_drop)
-        self.rpe = rpe
-        if rpe:
-            self.rel_pos_embed_k = RelativePosition2D(head_dim, rpe_length)
-            self.rel_pos_embed_v = RelativePosition2D(head_dim, rpe_length)
-
-    def forward(self, x):
-        """Apply self-attention to ``x`` of shape ``(B, N, C)`` and return the projected result."""
-        B, N, _ = x.shape
-        head_dim = self.head_dim
-        # num_heads can not get from self.num_heads directly,
-        # use -1 to compute implicitly.
-        num_heads = -1
-        q = self.q(x).reshape(B, N, num_heads, head_dim).permute(0, 2, 1, 3)
-        k = self.k(x).reshape(B, N, num_heads, head_dim).permute(0, 2, 1, 3)
-        v = self.v(x).reshape(B, N, num_heads, head_dim).permute(0, 2, 1, 3)
-        num_heads = q.size(1)
-
-        attn = (q @ k.transpose(-2, -1)) * self.scale
-
-        if self.rpe:
-            # add the query / relative-key-embedding interaction to the attention scores
-            r_p_k = self.rel_pos_embed_k(N, N)
-            attn = attn + (
-                q.permute(2, 0, 1, 3).reshape(N, num_heads * B, head_dim) @ r_p_k.transpose(2, 1)
-            ).transpose(1, 0).reshape(B, num_heads, N, N) * self.scale
-
-        attn = attn.softmax(dim=-1)
-        attn = self.attn_drop(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(B, N, num_heads * head_dim)
-
-        if self.rpe:
-            attn_1 = attn.permute(2, 0, 1, 3).reshape(N, B * num_heads, N)
-            r_p_v = self.rel_pos_embed_v(N, N)
-            # The size of attention is (B, num_heads, N, N), reshape it to (N, B*num_heads, N) and do batch matmul with
-            # the relative position embedding of V (N, N, head_dim) get shape like (N, B*num_heads, head_dim). We reshape it to the
-            # same size as x (B, N, num_heads * head_dim)
-            x = x + (attn_1 @ r_p_v).transpose(1, 0).reshape(B, num_heads, N, head_dim).transpose(2, 1).reshape(B, N, num_heads * head_dim)
-
-        x = self.proj(x)
-        x = self.proj_drop(x)
-        return x
-
-
-
-class TransformerEncoderLayer(nn.Module):
-    """
-    Encoder layer built around :class:`RelativePositionAttention`.
-    The pytorch built-in nn.TransformerEncoderLayer() does not support a custom attention module.
-
-    The layer is an attention sub-layer followed by a two-layer feed-forward sub-layer. Each has a
-    residual connection and a layer norm applied before (``pre_norm=True``) or after the sub-layer.
-    """
-    def __init__(
-        self, embed_dim, num_heads, mlp_ratio=4.,
-        qkv_bias=False, qk_scale=None, rpe=False,
-        drop_rate=0., attn_drop=0., proj_drop=0., drop_path=0.,
-        pre_norm=True, rpe_length=14, head_dim=64
-    ):
-        super().__init__()
-
-        self.normalize_before = pre_norm
-        if drop_path > 0.:
-            self.drop_path = DropPath(drop_path)
-        else:
-            self.drop_path = nn.Identity()
-        self.dropout = drop_rate
-
-        attention_kwargs = dict(
-            embed_dim=embed_dim,
-            num_heads=num_heads,
-            attn_drop=attn_drop,
-            proj_drop=proj_drop,
-            rpe=rpe,
-            qkv_bias=qkv_bias,
-            qk_scale=qk_scale,
-            rpe_length=rpe_length,
-            head_dim=head_dim,
-        )
-        self.attn = RelativePositionAttention(**attention_kwargs)
-
-        self.attn_layer_norm = nn.LayerNorm(embed_dim)
-        self.ffn_layer_norm = nn.LayerNorm(embed_dim)
-
-        self.activation_fn = nn.GELU()
-
-        def hidden_width():
-            # width of the feed-forward hidden layer, evaluated afresh for each linear layer
-            return cast(int, nn.ValueChoice.to_int(embed_dim * mlp_ratio))
-
-        self.fc1 = nn.Linear(cast(int, embed_dim), hidden_width())
-        self.fc2 = nn.Linear(hidden_width(), cast(int, embed_dim))
-
-    def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
-        """Apply ``layer_norm`` only at the position (before / after) selected by ``normalize_before``."""
-        assert before ^ after
-        if not (after ^ self.normalize_before):
-            return x
-        return layer_norm(x)
-
-    def forward(self, x):
-        """
-        Args:
-            x (Tensor): input to the layer of shape `(batch, patch_num , sample_embed_dim)`
-        Returns:
-            encoded output of shape `(batch, patch_num, sample_embed_dim)`
-        """
-        # attention sub-layer
-        shortcut = x
-        x = self.maybe_layer_norm(self.attn_layer_norm, x, before=True)
-        x = self.attn(x)
-        x = F.dropout(x, p=self.dropout, training=self.training)
-        x = shortcut + self.drop_path(x)
-        x = self.maybe_layer_norm(self.attn_layer_norm, x, after=True)
-
-        # feed-forward sub-layer
-        shortcut = x
-        x = self.maybe_layer_norm(self.ffn_layer_norm, x, before=True)
-        x = self.activation_fn(self.fc1(x))
-        x = F.dropout(x, p=self.dropout, training=self.training)
-        x = self.fc2(x)
-        x = F.dropout(x, p=self.dropout, training=self.training)
-        x = shortcut + self.drop_path(x)
-        return self.maybe_layer_norm(self.ffn_layer_norm, x, after=True)
-
-
-@basic_unit
-class ClsToken(nn.Module):
-    """ Prepend a learnable class token of width ``embed_dim`` to the token sequence.
-    """
-    def __init__(self, embed_dim: int):
-        super().__init__()
-        blank_token = torch.zeros(1, 1, embed_dim)
-        self.cls_token = nn.Parameter(blank_token)
-        trunc_normal_(self.cls_token, std=.02)
-
-    def forward(self, x):
-        # repeat the single token for every sample, then put it in front along dim 1
-        batch_size = x.shape[0]
-        tokens = self.cls_token.expand(batch_size, -1, -1)
-        return torch.cat((tokens, x), dim=1)
-
-
-class MixedClsToken(MixedOperation, ClsToken):
-    """ Mixed version of :class:`ClsToken`.
-
-    Supported arguments are:
-
-    - ``embed_dim``
-
-    The class token is created with the largest candidate ``embed_dim`` and its leading
-    ``embed_dim`` entries along the last axis are sliced out at forward time.
-    """
-    bound_type = ClsToken
-    argument_list = ['embed_dim']
-
-    def super_init_argument(self, name: str, value_choice: ValueChoiceX):
-        # initialize the underlying module with the largest option
-        candidates = traverse_all_options(value_choice)
-        return max(candidates)
-
-    def forward_with_args(self, embed_dim,
-                        inputs: torch.Tensor) -> torch.Tensor:
-        width = _W(embed_dim)
-        sliced_token = _S(self.cls_token)[..., :width]
-        batch_size = inputs.shape[0]
-
-        return torch.cat((sliced_token.expand(batch_size, -1, -1), inputs), dim=1)
-
-
-@basic_unit
-class AbsPosEmbed(nn.Module):
-    """ Add a learnable absolute position embedding of shape ``(1, length, embed_dim)`` to the input.
-    """
-    def __init__(self, length: int, embed_dim: int):
-        super().__init__()
-        blank_embed = torch.zeros(1, length, embed_dim)
-        self.pos_embed = nn.Parameter(blank_embed)
-        trunc_normal_(self.pos_embed, std=.02)
-
-    def forward(self, x):
-        embedded = x + self.pos_embed
-        return embedded
-
-
-class MixedAbsPosEmbed(MixedOperation, AbsPosEmbed):
-    """ Mixed version of :class:`AbsPosEmbed`.
-
-    Supported arguments are:
-
-    - ``embed_dim``
-
-    The position embedding is created with the largest candidate ``embed_dim`` and its leading
-    ``embed_dim`` entries along the last axis are sliced out at forward time.
-    """
-    bound_type = AbsPosEmbed
-    argument_list = ['embed_dim']
-
-    def super_init_argument(self, name: str, value_choice: ValueChoiceX):
-        # initialize the underlying module with the largest option
-        candidates = traverse_all_options(value_choice)
-        return max(candidates)
-
-    def forward_with_args(self,  embed_dim,
-                        inputs: torch.Tensor) -> torch.Tensor:
-        width = _W(embed_dim)
-        sliced_embed = _S(self.pos_embed)[..., :width]
-
-        return inputs + sliced_embed
-
-
-@model_wrapper
-class AutoformerSpace(nn.Module):
-    """
-    The search space that is proposed in `Autoformer <https://arxiv.org/abs/2107.00651>`__.
-    There are four searchable variables: depth, embedding dimension, heads number and MLP ratio.
-
-    Parameters
-    ----------
-    search_embed_dim : tuple of int
-        The search space of embedding dimension.
-    search_mlp_ratio : tuple of float
-        The search space of MLP ratio.
-    search_num_heads : tuple of int
-        The search space of number of heads.
-    search_depth : tuple of int
-        The search space of depth.
-    img_size : int
-        Size of input image.
-    patch_size : int
-        Size of image patch.
-    in_chans : int
-        Number of channels of the input image.
-    num_classes : int
-        Number of classes for classifier.
-    qkv_bias : bool
-        Whether to use bias item in the qkv embedding.
-    drop_rate : float
-        Drop rate of the MLP projection in MSA and FFN.
-    attn_drop_rate : float
-        Drop rate of attention.
-    drop_path_rate : float
-        Drop path rate.
-    pre_norm : bool
-        Whether to use pre_norm. Otherwise post_norm is used.
-    global_pool : bool
-        Whether to use global pooling to generate the image representation. Otherwise the cls_token is used.
-    abs_pos : bool
-        Whether to use absolute positional embeddings.
-    qk_scale : float
-        The scaler on score map in self-attention.
-    rpe : bool
-        Whether to use relative position encoding.
-    """
-
-    def __init__(
-        self,
-        search_embed_dim: Tuple[int, ...] = (192, 216, 240),
-        search_mlp_ratio: Tuple[float, ...] = (3.0, 3.5, 4.0),
-        search_num_heads: Tuple[int, ...] = (3, 4),
-        search_depth: Tuple[int, ...] = (12, 13, 14),
-        img_size: int = 224,
-        patch_size: int = 16,
-        in_chans: int = 3,
-        num_classes: int = 1000,
-        qkv_bias: bool = False,
-        drop_rate: float = 0.,
-        attn_drop_rate: float = 0.,
-        drop_path_rate: float = 0.,
-        pre_norm: bool = True,
-        global_pool: bool = False,
-        abs_pos: bool = True,
-        qk_scale: Optional[float] = None,
-        rpe: bool = True,
-    ):
-        super().__init__()
-        # define search space parameters
-        # (per-layer choices are created for every layer up to the largest candidate depth)
-        embed_dim = nn.ValueChoice(list(search_embed_dim), label="embed_dim")
-        depth = nn.ValueChoice(list(search_depth), label="depth")
-        mlp_ratios = [nn.ValueChoice(list(search_mlp_ratio), label=f"mlp_ratio_{i}") for i in range(max(search_depth))]
-        num_heads = [nn.ValueChoice(list(search_num_heads), label=f"num_head_{i}") for i in range(max(search_depth))]
-
-        self.patch_embed = nn.Conv2d(
-            in_chans, cast(int, embed_dim),
-            kernel_size = patch_size,
-            stride = patch_size
-        )
-        self.patches_num = int((img_size // patch_size) ** 2)
-        self.global_pool = global_pool
-
-        self.cls_token = ClsToken(cast(int, embed_dim))
-        # one extra position for the class token
-        self.pos_embed = AbsPosEmbed(self.patches_num+1, cast(int, embed_dim)) if abs_pos else nn.Identity()
-
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, max(search_depth))]  # stochastic depth decay rule
-
-        self.blocks = nn.Repeat(
-            lambda index: TransformerEncoderLayer(
-                embed_dim = embed_dim, num_heads = num_heads[index], mlp_ratio=mlp_ratios[index],
-                qkv_bias = qkv_bias, drop_rate = drop_rate, attn_drop = attn_drop_rate, drop_path=dpr[index],
-                rpe_length=img_size // patch_size, qk_scale=qk_scale, rpe=rpe, pre_norm=pre_norm, head_dim = 64
-            ), depth
-        )
-
-        self.norm = nn.LayerNorm(cast(int, embed_dim)) if pre_norm else nn.Identity()
-        self.head = nn.Linear(cast(int, embed_dim), num_classes) if num_classes > 0 else nn.Identity()
-
-    @classmethod
-    def get_extra_mutation_hooks(cls):
-        """Return the mutation hooks of the mixed operations defined for this space."""
-        return [MixedAbsPosEmbed.mutate, MixedClsToken.mutate]
-
-    @classmethod
-    def load_searched_model(
-        cls, name: str,
-        pretrained: bool = False, download: bool = False, progress: bool = True
-    ) -> nn.Module:
-        """
-        Build one of the searched architectures, optionally with pretrained weights.
-
-        Parameters
-        ----------
-        name
-            One of ``autoformer-tiny``, ``autoformer-small``, ``autoformer-base``.
-        pretrained
-            Whether to load the pretrained weights into the model.
-        download, progress
-            Forwarded to ``load_pretrained_weight``.
-
-        Raises
-        ------
-        ValueError
-            If ``name`` is not a supported architecture.
-        """
-        # One entry per supported architecture. ``mlp_ratio`` / ``num_head`` hold one value per
-        # layer label, i.e. as many entries as the largest value in ``search_depth``.
-        searched = {
-            'autoformer-tiny': {
-                'embed_dim': 192,
-                'depth': 13,
-                'mlp_ratio': [3.5, 3.5, 3.0, 3.5, 3.0, 3.0, 4.0, 4.0, 3.5, 4.0, 3.5, 4.0, 3.5] + [3.0],
-                'num_head': [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3] + [3],
-                'search_embed_dim': (240, 216, 192),
-                'search_num_heads': (4, 3),
-                'search_depth': (14, 13, 12),
-            },
-            'autoformer-small': {
-                'embed_dim': 384,
-                'depth': 13,
-                'mlp_ratio': [3.0, 3.5, 3.0, 3.5, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 3.5, 4.0] + [3.0],
-                'num_head': [6, 6, 5, 7, 5, 5, 5, 6, 6, 7, 7, 6, 7] + [5],
-                'search_embed_dim': (448, 384, 320),
-                'search_num_heads': (7, 6, 5),
-                'search_depth': (14, 13, 12),
-            },
-            'autoformer-base': {
-                'embed_dim': 576,
-                'depth': 14,
-                'mlp_ratio': [3.5, 3.5, 4.0, 3.5, 4.0, 3.5, 3.5, 3.0, 4.0, 4.0, 3.0, 4.0, 3.0, 3.5] + [3.0, 3.0],
-                'num_head': [9, 9, 9, 9, 9, 10, 9, 9, 10, 9, 10, 9, 9, 10] + [8, 8],
-                'search_embed_dim': (624, 576, 528),
-                'search_num_heads': (10, 9, 8),
-                'search_depth': (16, 15, 14),
-            },
-        }
-        if name not in searched:
-            raise ValueError(f'Unsupported architecture with name: {name}')
-        spec = searched[name]
-
-        # Fixed choice for every label created in ``__init__``.
-        arch: Dict[str, Any] = {
-            'embed_dim': spec['embed_dim'],
-            'depth': spec['depth']
-        }
-        for i, (mlp_ratio, num_head) in enumerate(zip(spec['mlp_ratio'], spec['num_head'])):
-            arch[f'mlp_ratio_{i}'] = mlp_ratio
-            arch[f'num_head_{i}'] = num_head
-
-        init_kwargs = {
-            'qkv_bias': True, 'drop_rate': 0.0, 'drop_path_rate': 0.1, 'global_pool': True, 'num_classes': 1000,
-            'search_embed_dim': spec['search_embed_dim'],
-            'search_mlp_ratio': (4.0, 3.5, 3.0),
-            'search_num_heads': spec['search_num_heads'],
-            'search_depth': spec['search_depth'],
-        }
-
-        model_factory = FixedFactory(cls, arch)
-        model = model_factory(**init_kwargs)
-
-        if pretrained:
-            weight_file = load_pretrained_weight(name, download=download, progress=progress)
-            # Map to CPU so loading also works on hosts without the device the checkpoint was saved
-            # from; ``load_state_dict`` then copies the values into the model's own parameters.
-            pretrained_weights = torch.load(weight_file, map_location='cpu')
-            model.load_state_dict(pretrained_weights)
-
-        return model
-
-    def forward(self, x):
-        """Embed the image into patches, run the encoder blocks and classify."""
-        B = x.shape[0]
-        x = self.patch_embed(x)
-        # (B, C, H, W) -> (B, patches_num, C)
-        x = x.permute(0, 2, 3, 1).view(B, self.patches_num, -1)
-        x = self.cls_token(x)
-        x = self.pos_embed(x)
-        x = self.blocks(x)
-        x = self.norm(x)
-
-        if self.global_pool:
-            # average over all tokens except the leading class token
-            x = torch.mean(x[:, 1:], dim=1)
-        else:
-            # use the class token only
-            x = x[:, 0]
-
-        x = self.head(x)
-
-        return x
+from nni.nas.hub.pytorch.autoformer import *
--- a/nni/retiarii/hub/pytorch/mobilenetv3.py
+++ b/nni/retiarii/hub/pytorch/mobilenetv3.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-from functools import partial
-from typing import Tuple, Optional, Callable, Union, List, Type, cast
+# pylint: disable=wildcard-import,unused-wildcard-import

-import torch
-import nni.retiarii.nn.pytorch as nn
-from nni.retiarii import model_wrapper
-from nni.typehint import Literal
-
-from .proxylessnas import ConvBNReLU, InvertedResidual, DepthwiseSeparableConv, make_divisible, reset_parameters
-from .utils.fixed import FixedFactory
-from .utils.pretrained import load_pretrained_weight
-
-
-class SqueezeExcite(nn.Module):
-    """Squeeze-and-excite layer.
-
-    The op from ``torchvision.ops`` can't be used here because it's not (yet) properly wrapped,
-    and ValueChoice couldn't be processed.
-
-    Reference:
-
-    - https://github.com/rwightman/pytorch-image-models/blob/b7cb8d03/timm/models/efficientnet_blocks.py#L26
-    - https://github.com/d-li14/mobilenetv3.pytorch/blob/3e6938cedcbbc5ee5bc50780ea18e644702d85fc/mobilenetv3.py#L53
-    """
-
-    def __init__(self,
-                 channels: int,
-                 reduction_ratio: float = 0.25,
-                 gate_layer: Optional[Callable[..., nn.Module]] = None,
-                 activation_layer: Optional[Callable[..., nn.Module]] = None):
-        super().__init__()
-
-        # reduced width, made divisible by 8
-        squeezed = make_divisible(channels * reduction_ratio, 8)
-        # fall back to hard-sigmoid gating and ReLU activation when none is given
-        gate_factory = gate_layer if gate_layer else nn.Hardsigmoid
-        act_factory = activation_layer if activation_layer else nn.ReLU
-
-        self.conv_reduce = nn.Conv2d(channels, squeezed, 1, bias=True)
-        self.act1 = act_factory(inplace=True)
-        self.conv_expand = nn.Conv2d(squeezed, channels, 1, bias=True)
-        self.gate = gate_factory()
-
-    def forward(self, x):
-        # average over dims 2 and 3, then pass through reduce -> act -> expand -> gate
-        weights = x.mean((2, 3), keepdim=True)
-        for stage in (self.conv_reduce, self.act1, self.conv_expand, self.gate):
-            weights = stage(weights)
-        return x * weights
-
-
-def _se_or_skip(hidden_ch: int, input_ch: int, optional: bool, se_from_exp: bool, label: str) -> nn.Module:
-    """Build an SE layer, or a choice between identity and SE when ``optional`` is true.
-
-    The SE layer is sized by ``hidden_ch`` if ``se_from_exp`` is true, otherwise by ``input_ch``.
-    """
-    if se_from_exp:
-        ch = hidden_ch
-    else:
-        ch = input_ch
-
-    if not optional:
-        return SqueezeExcite(ch)
-
-    candidates = {
-        'identity': nn.Identity(),
-        'se': SqueezeExcite(ch)
-    }
-    return nn.LayerChoice(candidates, label=label)
-
-
-def _act_fn(act_alias: Literal['hswish', 'swish', 'relu']) -> Type[nn.Module]:
-    """Map an activation alias to the activation class in ``nn``.
-
-    Raises ``ValueError`` for an alias that is not one of ``hswish``, ``swish`` or ``relu``.
-    """
-    # alias -> attribute name on ``nn``; the attribute is only looked up for the matching alias
-    alias_table = (
-        ('hswish', 'Hardswish'),
-        ('swish', 'SiLU'),
-        ('relu', 'ReLU'),
-    )
-    for alias, attr_name in alias_table:
-        if act_alias == alias:
-            return getattr(nn, attr_name)
-    raise ValueError(f'Unsupported act alias: {act_alias}')
-
-
-@model_wrapper
-class MobileNetV3Space(nn.Module):
-    """
-    MobileNetV3Space implements the largest search space in `TuNAS <https://arxiv.org/abs/2008.06120>`__.
-
-    The search dimensions include widths, expand ratios, kernel sizes, SE ratio.
-    Some of them can be turned off via arguments to narrow down the search space.
-
-    Different from ProxylessNAS search space, this space is implemented with :class:`nn.ValueChoice`.
-
-    We use the following snipppet as reference.
-    https://github.com/google-research/google-research/blob/20736344591f774f4b1570af64624ed1e18d2867/tunas/mobile_search_space_v3.py#L728
-
-    We have ``num_blocks`` which equals to the length of ``self.blocks`` (the main body of the network).
-    For simplicity, the following parameter specification assumes ``num_blocks`` equals 8 (body + head).
-    If a shallower body is intended, arrays including ``base_widths``, ``squeeze_excite``, ``depth_range``,
-    ``stride``, ``activation`` should also be shortened accordingly.
-
-    Parameters
-    ----------
-    num_labels
-        Dimensions for classification head.
-    base_widths
-        Widths of each stage, from stem, to body, to head.
-        Length should be 9, i.e., ``num_blocks + 1`` (because there is a stem width in front).
-    width_multipliers
-        A range of widths multiplier to choose from. The choice is independent for each stage.
-        Or it can be a fixed float. This will be applied on ``base_widths``,
-        and we would also make sure that widths can be divided by 8.
-    expand_ratios
-        A list of expand ratios to choose from. Independent for every **block**.
-    squeeze_excite
-        Indicating whether the current stage can have an optional SE layer.
-        Expect array of length 6 for stage 0 to 5. Each element can be one of ``force``, ``optional``, ``none``.
-    depth_range
-        A range (e.g., ``(1, 4)``),
-        or a list of range (e.g., ``[(1, 3), (1, 4), (1, 4), (1, 3), (0, 2)]``).
-        If a list, the length should be 5. The depth are specified for stage 1 to 5.
-    stride
-        Stride for all stages (including stem and head). Length should be same as ``base_widths``.
-    activation
-        Activation (class) for all stages. Length is same as ``base_widths``.
-    se_from_exp
-        Calculate SE channel reduction from expanded (mid) channels.
-    dropout_rate
-        Dropout rate at classification head.
-    bn_eps
-        Epsilon of batch normalization.
-    bn_momentum
-        Momentum of batch normalization.
-    """
-
-    widths: List[Union[nn.ChoiceOf[int], int]]
-    depth_range: List[Tuple[int, int]]
-
-    def __init__(
-        self, num_labels: int = 1000,
-        base_widths: Tuple[int, ...] = (16, 16, 16, 32, 64, 128, 256, 512, 1024),
-        width_multipliers: Union[Tuple[float, ...], float] = (0.5, 0.625, 0.75, 1.0, 1.25, 1.5, 2.0),
-        expand_ratios: Tuple[float, ...] = (1., 2., 3., 4., 5., 6.),
-        squeeze_excite: Tuple[Literal['force', 'optional', 'none'], ...] = (
-            'none', 'none', 'optional', 'none', 'optional', 'optional'
-        ),
-        depth_range: Union[List[Tuple[int, int]], Tuple[int, int]] = (1, 4),
-        stride: Tuple[int, ...] = (2, 1, 2, 2, 2, 1, 2, 1, 1),
-        activation: Tuple[Literal['hswish', 'swish', 'relu'], ...] = (
-            'hswish', 'relu', 'relu', 'relu', 'hswish', 'hswish', 'hswish', 'hswish', 'hswish'
-        ),
-        se_from_exp: bool = True,
-        dropout_rate: float = 0.2,
-        bn_eps: float = 1e-3,
-        bn_momentum: float = 0.1
-    ):
-        super().__init__()
-
-        self.num_blocks = len(base_widths) - 1  # without stem, equal to len(self.blocks)
-        assert self.num_blocks >= 4
-
-        assert len(base_widths) == len(stride) == len(activation) == self.num_blocks + 1
-
-        # The final two blocks can't have SE
-        assert len(squeeze_excite) == self.num_blocks - 2 and all(se in ['force', 'optional', 'none'] for se in squeeze_excite)
-
-        # The first and final two blocks can't have variational depth
-        if isinstance(depth_range[0], int):
-            depth_range = cast(Tuple[int, int], depth_range)
-            assert len(depth_range) == 2 and depth_range[1] >= depth_range[0] >= 1
-            self.depth_range = [depth_range] * (self.num_blocks - 3)
-        else:
-            assert len(depth_range) == self.num_blocks - 3
-            self.depth_range = cast(List[Tuple[int, int]], depth_range)
-            for d in self.depth_range:
-                d = cast(Tuple[int, int], d)
-                # pylint: disable=unsubscriptable-object
-                assert len(d) == 2 and d[1] >= d[0] >= 1, f'{d} does not satisfy depth constraints'
-
-        self.widths = []
-        for i, base_width in enumerate(base_widths):
-            if isinstance(width_multipliers, float):
-                self.widths.append(make_divisible(base_width * width_multipliers, 8))
-            else:
-                self.widths.append(
-                    # According to tunas, stem and stage 0 share one width multiplier
-                    # https://github.com/google-research/google-research/blob/20736344/tunas/mobile_search_space_v3.py#L791
-                    make_divisible(
-                        nn.ValueChoice(list(width_multipliers), label=f's{max(i - 1, 0)}_width_mult') * base_width, 8
-                    )
-                )
-
-        self.expand_ratios = expand_ratios
-        self.se_from_exp = se_from_exp
-
-        # NOTE: The built-in hardswish produces slightly different output from 3rd-party implementation
-        # But I guess it doesn't really matter.
-        # https://github.com/rwightman/pytorch-image-models/blob/b7cb8d03/timm/models/layers/activations.py#L79
-
-        self.stem = ConvBNReLU(
-            3, self.widths[0],
-            nn.ValueChoice([3, 5], label=f'stem_ks'),
-            stride=stride[0], activation_layer=_act_fn(activation[0])
-        )
-
-        blocks: List[nn.Module] = [
-            # Stage 0
-            # FIXME: this should be an optional layer.
-            # https://github.com/google-research/google-research/blob/20736344/tunas/mobile_search_space_v3.py#L791
-            DepthwiseSeparableConv(
-                self.widths[0], self.widths[1],
-                nn.ValueChoice([3, 5, 7], label=f's0_i0_ks'),
-                stride=stride[1],
-                squeeze_excite=cast(Callable[[nn.MaybeChoice[int], nn.MaybeChoice[int]], nn.Module], partial(
-                    _se_or_skip, optional=squeeze_excite[0] == 'optional', se_from_exp=self.se_from_exp, label=f's0_i0_se'
-                )) if squeeze_excite[0] != 'none' else None,
-                activation_layer=_act_fn(activation[1])
-            ),
-        ]
-
-        blocks += [
-            # Stage 1-5 (by default)
-            self._make_stage(i, self.widths[i], self.widths[i + 1], squeeze_excite[i], stride[i + 1], _act_fn(activation[i + 1]))
-            for i in range(1, self.num_blocks - 2)
-        ]
-
-        # Head
-        blocks += [
-            ConvBNReLU(
-                self.widths[self.num_blocks - 2],
-                self.widths[self.num_blocks - 1],
-                kernel_size=1,
-                stride=stride[self.num_blocks - 1],
-                activation_layer=_act_fn(activation[self.num_blocks - 1])
-            ),
-            nn.AdaptiveAvgPool2d(1),
-
-            # In some implementation, this is a linear instead.
-            # Should be equivalent.
-            ConvBNReLU(
-                self.widths[self.num_blocks - 1],
-                self.widths[self.num_blocks],
-                kernel_size=1,
-                stride=stride[self.num_blocks],
-                norm_layer=nn.Identity,
-                activation_layer=_act_fn(activation[self.num_blocks])
-            )
-        ]
-
-        self.blocks = nn.Sequential(*blocks)
-
-        self.classifier = nn.Sequential(
-            nn.Dropout(dropout_rate),
-            nn.Linear(cast(int, self.widths[self.num_blocks]), num_labels),
-        )
-
-        reset_parameters(self, bn_momentum=bn_momentum, bn_eps=bn_eps)
-
-    def forward(self, x):
-        x = self.stem(x)
-        x = self.blocks(x)
-        x = x.view(x.size(0), -1)
-        x = self.classifier(x)
-        return x
-
-    def _make_stage(self, stage_idx, inp, oup, se, stride, act):
-        def layer_builder(idx):
-            exp = nn.ValueChoice(list(self.expand_ratios), label=f's{stage_idx}_i{idx}_exp')
-            ks = nn.ValueChoice([3, 5, 7], label=f's{stage_idx}_i{idx}_ks')
-            # if SE is true, assign a layer choice to SE
-            se_or_skip = cast(Callable[[nn.MaybeChoice[int], nn.MaybeChoice[int]], nn.Module], partial(
-                _se_or_skip, optional=se == 'optional', se_from_exp=self.se_from_exp, label=f's{stage_idx}_i{idx}_se'
-            )) if se != 'none' else None
-            return InvertedResidual(
-                inp if idx == 0 else oup,
-                oup, exp, ks,
-                stride=stride if idx == 0 else 1,  # only the first layer in each stage can have stride > 1
-                squeeze_excite=se_or_skip,
-                activation_layer=act,
-            )
-
-        # mutable depth
-        min_depth, max_depth = self.depth_range[stage_idx - 1]
-        if stride != 1:
-            min_depth = max(min_depth, 1)
-        return nn.Repeat(layer_builder, depth=(min_depth, max_depth), label=f's{stage_idx}_depth')
-
-    @classmethod
-    def fixed_arch(cls, arch: dict) -> FixedFactory:
-        return FixedFactory(cls, arch)
-
-    @classmethod
-    def load_searched_model(
-        cls, name: str,
-        pretrained: bool = False, download: bool = False, progress: bool = True
-    ) -> nn.Module:
-
-        init_kwargs = {}  # all default
-
-        if name == 'mobilenetv3-large-100':
-            # NOTE: Use bicsubic interpolation to evaluate this
-            # With default interpolation, it yields top-1 75.722
-            arch = {
-                'stem_ks': 3,
-                's0_i0_ks': 3,
-                's1_depth': 2,
-                's1_i0_exp': 4,
-                's1_i0_ks': 3,
-                's1_i1_exp': 3,
-                's1_i1_ks': 3,
-                's2_depth': 3,
-                's2_i0_exp': 3,
-                's2_i0_ks': 5,
-                's2_i1_exp': 3,
-                's2_i1_ks': 5,
-                's2_i2_exp': 3,
-                's2_i2_ks': 5,
-                's3_depth': 4,
-                's3_i0_exp': 6,
-                's3_i0_ks': 3,
-                's3_i1_exp': 2.5,
-                's3_i1_ks': 3,
-                's3_i2_exp': 2.3,
-                's3_i2_ks': 3,
-                's3_i3_exp': 2.3,
-                's3_i3_ks': 3,
-                's4_depth': 2,
-                's4_i0_exp': 6,
-                's4_i0_ks': 3,
-                's4_i1_exp': 6,
-                's4_i1_ks': 3,
-                's5_depth': 3,
-                's5_i0_exp': 6,
-                's5_i0_ks': 5,
-                's5_i1_exp': 6,
-                's5_i1_ks': 5,
-                's5_i2_exp': 6,
-                's5_i2_ks': 5,
-            }
-
-            init_kwargs.update(
-                base_widths=[16, 16, 24, 40, 80, 112, 160, 960, 1280],
-                expand_ratios=[1.0, 2.0, 2.3, 2.5, 3.0, 4.0, 6.0],
-                bn_eps=1e-5,
-                bn_momentum=0.1,
-                width_multipliers=1.0,
-                squeeze_excite=['none', 'none', 'force', 'none', 'force', 'force']
-            )
-
-        elif name.startswith('mobilenetv3-small-'):
-            # Evaluate with bicubic interpolation
-            multiplier = int(name.split('-')[-1]) / 100
-            widths = [16, 16, 24, 40, 48, 96, 576, 1024]
-            for i in range(7):
-                if i > 0 or multiplier >= 0.75:
-                    # fix_stem = True when multiplier < 0.75
-                    # https://github.com/rwightman/pytorch-image-models/blob/b7cb8d03/timm/models/mobilenetv3.py#L421
-                    widths[i] = make_divisible(widths[i] * multiplier, 8)
-            init_kwargs.update(
-                base_widths=widths,
-                width_multipliers=1.0,
-                expand_ratios=[3.0, 3.67, 4.0, 4.5, 6.0],
-                bn_eps=1e-05,
-                bn_momentum=0.1,
-                squeeze_excite=['force', 'none', 'force', 'force', 'force'],
-                activation=['hswish', 'relu', 'relu', 'hswish', 'hswish', 'hswish', 'hswish', 'hswish'],
-                stride=[2, 2, 2, 2, 1, 2, 1, 1],
-                depth_range=(1, 2),
-            )
-
-            arch = {
-                'stem_ks': 3,
-                's0_i0_ks': 3,
-                's1_depth': 2,
-                's1_i0_exp': 4.5,
-                's1_i0_ks': 3,
-                's1_i1_exp': 3.67,
-                's1_i1_ks': 3,
-                's2_depth': 3,
-                's2_i0_exp': 4.0,
-                's2_i0_ks': 5,
-                's2_i1_exp': 6.0,
-                's2_i1_ks': 5,
-                's2_i2_exp': 6.0,
-                's2_i2_ks': 5,
-                's3_depth': 2,
-                's3_i0_exp': 3.0,
-                's3_i0_ks': 5,
-                's3_i1_exp': 3.0,
-                's3_i1_ks': 5,
-                's4_depth': 3,
-                's4_i0_exp': 6.0,
-                's4_i0_ks': 5,
-                's4_i1_exp': 6.0,
-                's4_i1_ks': 5,
-                's4_i2_exp': 6.0,
-                's4_i2_ks': 5
-            }
-
-        elif name.startswith('cream'):
-            # https://github.com/microsoft/Cream/tree/main/Cream
-            # bilinear interpolation
-
-            level = name.split('-')[-1]
-
-            # region cream arch specification
-            if level == '014':
-                arch = {
-                    'stem_ks': 3,
-                    's0_depth': 1,
-                    's0_i0_ks': 3,
-                    's1_depth': 1,
-                    's1_i0_exp': 4.0,
-                    's1_i0_ks': 3,
-                    's2_depth': 2,
-                    's2_i0_exp': 6.0,
-                    's2_i0_ks': 5,
-                    's2_i1_exp': 6.0,
-                    's2_i1_ks': 5,
-                    's3_depth': 2,
-                    's3_i0_exp': 6.0,
-                    's3_i0_ks': 5,
-                    's3_i1_exp': 6.0,
-                    's3_i1_ks': 5,
-                    's4_depth': 1,
-                    's4_i0_exp': 6.0,
-                    's4_i0_ks': 3,
-                    's5_depth': 1,
-                    's5_i0_exp': 6.0,
-                    's5_i0_ks': 5
-                }
-            elif level == '043':
-                arch = {
-                    'stem_ks': 3,
-                    's0_depth': 1,
-                    's0_i0_ks': 3,
-                    's1_depth': 1,
-                    's1_i0_exp': 4.0,
-                    's1_i0_ks': 3,
-                    's2_depth': 2,
-                    's2_i0_exp': 6.0,
-                    's2_i0_ks': 5,
-                    's2_i1_exp': 6.0,
-                    's2_i1_ks': 3,
-                    's3_depth': 2,
-                    's3_i0_exp': 6.0,
-                    's3_i0_ks': 5,
-                    's3_i1_exp': 6.0,
-                    's3_i1_ks': 3,
-                    's4_depth': 3,
-                    's4_i0_exp': 6.0,
-                    's4_i0_ks': 5,
-                    's4_i1_exp': 6.0,
-                    's4_i1_ks': 5,
-                    's4_i2_exp': 6.0,
-                    's4_i2_ks': 5,
-                    's5_depth': 2,
-                    's5_i0_exp': 6.0,
-                    's5_i0_ks': 5,
-                    's5_i1_exp': 6.0,
-                    's5_i1_ks': 5
-                }
-            elif level == '114':
-                arch = {
-                    'stem_ks': 3,
-                    's0_depth': 1,
-                    's0_i0_ks': 3,
-                    's1_depth': 1,
-                    's1_i0_exp': 4.0,
-                    's1_i0_ks': 3,
-                    's2_depth': 2,
-                    's2_i0_exp': 6.0,
-                    's2_i0_ks': 5,
-                    's2_i1_exp': 6.0,
-                    's2_i1_ks': 5,
-                    's3_depth': 2,
-                    's3_i0_exp': 6.0,
-                    's3_i0_ks': 5,
-                    's3_i1_exp': 6.0,
-                    's3_i1_ks': 5,
-                    's4_depth': 3,
-                    's4_i0_exp': 6.0,
-                    's4_i0_ks': 5,
-                    's4_i1_exp': 6.0,
-                    's4_i1_ks': 5,
-                    's4_i2_exp': 6.0,
-                    's4_i2_ks': 5,
-                    's5_depth': 2,
-                    's5_i0_exp': 6.0,
-                    's5_i0_ks': 5,
-                    's5_i1_exp': 6.0,
-                    's5_i1_ks': 5
-                }
-            elif level == '287':
-                arch = {
-                    'stem_ks': 3,
-                    's0_depth': 1,
-                    's0_i0_ks': 3,
-                    's1_depth': 1,
-                    's1_i0_exp': 4.0,
-                    's1_i0_ks': 3,
-                    's2_depth': 2,
-                    's2_i0_exp': 6.0,
-                    's2_i0_ks': 5,
-                    's2_i1_exp': 6.0,
-                    's2_i1_ks': 5,
-                    's3_depth': 3,
-                    's3_i0_exp': 6.0,
-                    's3_i0_ks': 5,
-                    's3_i1_exp': 6.0,
-                    's3_i1_ks': 3,
-                    's3_i2_exp': 6.0,
-                    's3_i2_ks': 5,
-                    's4_depth': 4,
-                    's4_i0_exp': 6.0,
-                    's4_i0_ks': 5,
-                    's4_i1_exp': 6.0,
-                    's4_i1_ks': 5,
-                    's4_i2_exp': 6.0,
-                    's4_i2_ks': 5,
-                    's4_i3_exp': 6.0,
-                    's4_i3_ks': 5,
-                    's5_depth': 3,
-                    's5_i0_exp': 6.0,
-                    's5_i0_ks': 5,
-                    's5_i1_exp': 6.0,
-                    's5_i1_ks': 5,
-                    's5_i2_exp': 6.0,
-                    's5_i2_ks': 5
-                }
-            elif level == '481':
-                arch = {
-                    'stem_ks': 3,
-                    's0_depth': 1,
-                    's0_i0_ks': 3,
-                    's1_depth': 4,
-                    's1_i0_exp': 6.0,
-                    's1_i0_ks': 5,
-                    's1_i1_exp': 4.0,
-                    's1_i1_ks': 7,
-                    's1_i2_exp': 6.0,
-                    's1_i2_ks': 5,
-                    's1_i3_exp': 6.0,
-                    's1_i3_ks': 3,
-                    's2_depth': 4,
-                    's2_i0_exp': 6.0,
-                    's2_i0_ks': 5,
-                    's2_i1_exp': 4.0,
-                    's2_i1_ks': 5,
-                    's2_i2_exp': 6.0,
-                    's2_i2_ks': 5,
-                    's2_i3_exp': 4.0,
-                    's2_i3_ks': 3,
-                    's3_depth': 5,
-                    's3_i0_exp': 6.0,
-                    's3_i0_ks': 5,
-                    's3_i1_exp': 6.0,
-                    's3_i1_ks': 5,
-                    's3_i2_exp': 6.0,
-                    's3_i2_ks': 5,
-                    's3_i3_exp': 6.0,
-                    's3_i3_ks': 3,
-                    's3_i4_exp': 6.0,
-                    's3_i4_ks': 3,
-                    's4_depth': 4,
-                    's4_i0_exp': 6.0,
-                    's4_i0_ks': 5,
-                    's4_i1_exp': 6.0,
-                    's4_i1_ks': 5,
-                    's4_i2_exp': 6.0,
-                    's4_i2_ks': 5,
-                    's4_i3_exp': 6.0,
-                    's4_i3_ks': 5,
-                    's5_depth': 4,
-                    's5_i0_exp': 6.0,
-                    's5_i0_ks': 5,
-                    's5_i1_exp': 6.0,
-                    's5_i1_ks': 5,
-                    's5_i2_exp': 6.0,
-                    's5_i2_ks': 5,
-                    's5_i3_exp': 6.0,
-                    's5_i3_ks': 5
-                }
-            elif level == '604':
-                arch = {
-                    'stem_ks': 3,
-                    's0_depth': 1,
-                    's0_i0_ks': 3,
-                    's1_depth': 5,
-                    's1_i0_exp': 6.0,
-                    's1_i0_ks': 5,
-                    's1_i1_exp': 6.0,
-                    's1_i1_ks': 5,
-                    's1_i2_exp': 4.0,
-                    's1_i2_ks': 5,
-                    's1_i3_exp': 6.0,
-                    's1_i3_ks': 5,
-                    's1_i4_exp': 6.0,
-                    's1_i4_ks': 5,
-                    's2_depth': 5,
-                    's2_i0_exp': 6.0,
-                    's2_i0_ks': 5,
-                    's2_i1_exp': 4.0,
-                    's2_i1_ks': 5,
-                    's2_i2_exp': 6.0,
-                    's2_i2_ks': 5,
-                    's2_i3_exp': 4.0,
-                    's2_i3_ks': 5,
-                    's2_i4_exp': 6.0,
-                    's2_i4_ks': 5,
-                    's3_depth': 5,
-                    's3_i0_exp': 6.0,
-                    's3_i0_ks': 5,
-                    's3_i1_exp': 4.0,
-                    's3_i1_ks': 5,
-                    's3_i2_exp': 6.0,
-                    's3_i2_ks': 5,
-                    's3_i3_exp': 4.0,
-                    's3_i3_ks': 5,
-                    's3_i4_exp': 6.0,
-                    's3_i4_ks': 5,
-                    's4_depth': 6,
-                    's4_i0_exp': 6.0,
-                    's4_i0_ks': 5,
-                    's4_i1_exp': 6.0,
-                    's4_i1_ks': 5,
-                    's4_i2_exp': 4.0,
-                    's4_i2_ks': 5,
-                    's4_i3_exp': 4.0,
-                    's4_i3_ks': 5,
-                    's4_i4_exp': 6.0,
-                    's4_i4_ks': 5,
-                    's4_i5_exp': 6.0,
-                    's4_i5_ks': 5,
-                    's5_depth': 6,
-                    's5_i0_exp': 6.0,
-                    's5_i0_ks': 5,
-                    's5_i1_exp': 6.0,
-                    's5_i1_ks': 5,
-                    's5_i2_exp': 4.0,
-                    's5_i2_ks': 5,
-                    's5_i3_exp': 6.0,
-                    's5_i3_ks': 5,
-                    's5_i4_exp': 6.0,
-                    's5_i4_ks': 5,
-                    's5_i5_exp': 6.0,
-                    's5_i5_ks': 5
-                }
-            else:
-                raise ValueError(f'Unsupported cream model level: {level}')
-            # endregion
-
-            init_kwargs.update(
-                base_widths=[16, 16, 24, 40, 80, 96, 192, 320, 1280],
-                width_multipliers=1.0,
-                expand_ratios=[4.0, 6.0],
-                bn_eps=1e-5,
-                bn_momentum=0.1,
-                squeeze_excite=['force'] * 6,
-                activation=['swish'] * 9
-            )
-
-        else:
-            raise ValueError(f'Unsupported architecture with name: {name}')
-
-        model_factory = cls.fixed_arch(arch)
-        model = model_factory(**init_kwargs)
-
-        if pretrained:
-            weight_file = load_pretrained_weight(name, download=download, progress=progress)
-            pretrained_weights = torch.load(weight_file)
-            model.load_state_dict(pretrained_weights)
-
-        return model
+from nni.nas.hub.pytorch.mobilenetv3 import *
--- a/nni/retiarii/hub/pytorch/nasbench101.py
+++ b/nni/retiarii/hub/pytorch/nasbench101.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-import math
+# pylint: disable=wildcard-import,unused-wildcard-import

-import torch
-import torch.nn as nn
-from nni.retiarii import model_wrapper
-from nni.retiarii.nn.pytorch import NasBench101Cell
-
-
-__all__ = ['NasBench101']
-
-
-def truncated_normal_(tensor: torch.Tensor, mean: float = 0, std: float = 1):
-    # https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/15
-    size = tensor.shape
-    tmp = tensor.new_empty(size + (4,)).normal_()
-    valid = (tmp < 2) & (tmp > -2)
-    ind = valid.max(-1, keepdim=True)[1]
-    tensor.data.copy_(tmp.gather(-1, ind).squeeze(-1))
-    tensor.data.mul_(std).add_(mean)
-
-
-class ConvBNReLU(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0):
-        super(ConvBNReLU, self).__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.conv_bn_relu = nn.Sequential(
-            nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=False),
-            nn.BatchNorm2d(out_channels),
-            nn.ReLU(inplace=True)
-        )
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                fan_in = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
-                truncated_normal_(m.weight.data, mean=0., std=math.sqrt(1. / fan_in))
-            if isinstance(m, nn.BatchNorm2d):
-                m.weight.data.fill_(1)
-                m.bias.data.zero_()
-
-    def forward(self, x):
-        return self.conv_bn_relu(x)
-
-
-class Conv3x3BNReLU(ConvBNReLU):
-    def __init__(self, in_channels, out_channels):
-        super(Conv3x3BNReLU, self).__init__(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
-
-
-class Conv1x1BNReLU(ConvBNReLU):
-    def __init__(self, in_channels, out_channels):
-        super(Conv1x1BNReLU, self).__init__(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
-
-
-Projection = Conv1x1BNReLU
-
-
-@model_wrapper
-class NasBench101(nn.Module):
-    """The full search space, proposed by `NAS-Bench-101 <http://proceedings.mlr.press/v97/ying19a/ying19a.pdf>`__.
-
-    It's simply a stack of :class:`NasBench101Cell`. Operations are conv3x3, conv1x1 and maxpool respectively.
-    """
-
-    def __init__(self,
-                 stem_out_channels: int = 128,
-                 num_stacks: int = 3,
-                 num_modules_per_stack: int = 3,
-                 max_num_vertices: int = 7,
-                 max_num_edges: int = 9,
-                 num_labels: int = 10,
-                 bn_eps: float = 1e-5,
-                 bn_momentum: float = 0.003):
-        super().__init__()
-
-        op_candidates = {
-            'conv3x3-bn-relu': lambda num_features: Conv3x3BNReLU(num_features, num_features),
-            'conv1x1-bn-relu': lambda num_features: Conv1x1BNReLU(num_features, num_features),
-            'maxpool3x3': lambda num_features: nn.MaxPool2d(3, 1, 1)
-        }
-
-        # initial stem convolution
-        self.stem_conv = Conv3x3BNReLU(3, stem_out_channels)
-
-        layers = []
-        in_channels = out_channels = stem_out_channels
-        for stack_num in range(num_stacks):
-            if stack_num > 0:
-                downsample = nn.MaxPool2d(kernel_size=2, stride=2)
-                layers.append(downsample)
-                out_channels *= 2
-            for _ in range(num_modules_per_stack):
-                cell = NasBench101Cell(op_candidates, in_channels, out_channels,
-                                       lambda cin, cout: Projection(cin, cout),
-                                       max_num_vertices, max_num_edges, label='cell')
-                layers.append(cell)
-                in_channels = out_channels
-
-        self.features = nn.ModuleList(layers)
-        self.gap = nn.AdaptiveAvgPool2d(1)
-        self.classifier = nn.Linear(out_channels, num_labels)
-
-        for module in self.modules():
-            if isinstance(module, nn.BatchNorm2d):
-                module.eps = bn_eps
-                module.momentum = bn_momentum
-
-    def forward(self, x):
-        bs = x.size(0)
-        out = self.stem_conv(x)
-        for layer in self.features:
-            out = layer(out)
-        out = self.gap(out).view(bs, -1)
-        out = self.classifier(out)
-        return out
+from nni.nas.hub.pytorch.nasbench101 import *
--- a/nni/retiarii/hub/pytorch/nasbench201.py
+++ b/nni/retiarii/hub/pytorch/nasbench201.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-from typing import Callable, Dict
+# pylint: disable=wildcard-import,unused-wildcard-import

-import torch
-import torch.nn as nn
-
-from nni.retiarii import model_wrapper
-from nni.retiarii.nn.pytorch import NasBench201Cell
-
-
-__all__ = ['NasBench201']
-
-
-OPS_WITH_STRIDE = {
-    'none': lambda C_in, C_out, stride: Zero(C_in, C_out, stride),
-    'avg_pool_3x3': lambda C_in, C_out, stride: Pooling(C_in, C_out, stride, 'avg'),
-    'max_pool_3x3': lambda C_in, C_out, stride: Pooling(C_in, C_out, stride, 'max'),
-    'conv_3x3': lambda C_in, C_out, stride: ReLUConvBN(C_in, C_out, (3, 3), (stride, stride), (1, 1), (1, 1)),
-    'conv_1x1': lambda C_in, C_out, stride: ReLUConvBN(C_in, C_out, (1, 1), (stride, stride), (0, 0), (1, 1)),
-    'skip_connect': lambda C_in, C_out, stride: nn.Identity() if stride == 1 and C_in == C_out
-    else FactorizedReduce(C_in, C_out, stride),
-}
-
-PRIMITIVES = ['none', 'skip_connect', 'conv_1x1', 'conv_3x3', 'avg_pool_3x3']
-
-
-class ReLUConvBN(nn.Module):
-    def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation):
-        super(ReLUConvBN, self).__init__()
-        self.op = nn.Sequential(
-            nn.ReLU(inplace=False),
-            nn.Conv2d(C_in, C_out, kernel_size, stride=stride,
-                      padding=padding, dilation=dilation, bias=False),
-            nn.BatchNorm2d(C_out)
-        )
-
-    def forward(self, x):
-        return self.op(x)
-
-
-class SepConv(nn.Module):
-    def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation):
-        super(SepConv, self).__init__()
-        self.op = nn.Sequential(
-            nn.ReLU(inplace=False),
-            nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride,
-                      padding=padding, dilation=dilation, groups=C_in, bias=False),
-            nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
-            nn.BatchNorm2d(C_out),
-        )
-
-    def forward(self, x):
-        return self.op(x)
-
-
-class Pooling(nn.Module):
-    def __init__(self, C_in, C_out, stride, mode):
-        super(Pooling, self).__init__()
-        if C_in == C_out:
-            self.preprocess = None
-        else:
-            self.preprocess = ReLUConvBN(C_in, C_out, 1, 1, 0, 1)
-        if mode == 'avg':
-            self.op = nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False)
-        elif mode == 'max':
-            self.op = nn.MaxPool2d(3, stride=stride, padding=1)
-        else:
-            raise ValueError('Invalid mode={:} in Pooling'.format(mode))
-
-    def forward(self, x):
-        if self.preprocess:
-            x = self.preprocess(x)
-        return self.op(x)
-
-
-class Zero(nn.Module):
-    def __init__(self, C_in, C_out, stride):
-        super(Zero, self).__init__()
-        self.C_in = C_in
-        self.C_out = C_out
-        self.stride = stride
-        self.is_zero = True
-
-    def forward(self, x):
-        if self.C_in == self.C_out:
-            if self.stride == 1:
-                return x.mul(0.)
-            else:
-                return x[:, :, ::self.stride, ::self.stride].mul(0.)
-        else:
-            shape = list(x.shape)
-            shape[1] = self.C_out
-            zeros = x.new_zeros(shape, dtype=x.dtype, device=x.device)
-            return zeros
-
-
-class FactorizedReduce(nn.Module):
-    def __init__(self, C_in, C_out, stride):
-        super(FactorizedReduce, self).__init__()
-        self.stride = stride
-        self.C_in = C_in
-        self.C_out = C_out
-        self.relu = nn.ReLU(inplace=False)
-        if stride == 2:
-            C_outs = [C_out // 2, C_out - C_out // 2]
-            self.convs = nn.ModuleList()
-            for i in range(2):
-                self.convs.append(nn.Conv2d(C_in, C_outs[i], 1, stride=stride, padding=0, bias=False))
-            self.pad = nn.ConstantPad2d((0, 1, 0, 1), 0)
-        else:
-            raise ValueError('Invalid stride : {:}'.format(stride))
-        self.bn = nn.BatchNorm2d(C_out)
-
-    def forward(self, x):
-        x = self.relu(x)
-        y = self.pad(x)
-        out = torch.cat([self.convs[0](x), self.convs[1](y[:, :, 1:, 1:])], dim=1)
-        out = self.bn(out)
-        return out
-
-
-class ResNetBasicblock(nn.Module):
-    def __init__(self, inplanes, planes, stride):
-        super(ResNetBasicblock, self).__init__()
-        assert stride == 1 or stride == 2, 'invalid stride {:}'.format(stride)
-        self.conv_a = ReLUConvBN(inplanes, planes, 3, stride, 1, 1)
-        self.conv_b = ReLUConvBN(planes, planes, 3, 1, 1, 1)
-        if stride == 2:
-            self.downsample = nn.Sequential(
-                nn.AvgPool2d(kernel_size=2, stride=2, padding=0),
-                nn.Conv2d(inplanes, planes, kernel_size=1, stride=1, padding=0, bias=False))
-        elif inplanes != planes:
-            self.downsample = ReLUConvBN(inplanes, planes, 1, 1, 0, 1)
-        else:
-            self.downsample = None
-        self.in_dim = inplanes
-        self.out_dim = planes
-        self.stride = stride
-        self.num_conv = 2
-
-    def forward(self, inputs):
-        basicblock = self.conv_a(inputs)
-        basicblock = self.conv_b(basicblock)
-
-        if self.downsample is not None:
-            inputs = self.downsample(inputs)  # residual
-        return inputs + basicblock
-
-
-@model_wrapper
-class NasBench201(nn.Module):
-    """The full search space proposed by `NAS-Bench-201 <https://arxiv.org/abs/2001.00326>`__.
-
-    It's a stack of :class:`NasBench201Cell`.
-    """
-    def __init__(self,
-                 stem_out_channels: int = 16,
-                 num_modules_per_stack: int = 5,
-                 num_labels: int = 10):
-        super().__init__()
-        self.channels = C = stem_out_channels
-        self.num_modules = N = num_modules_per_stack
-        self.num_labels = num_labels
-
-        self.stem = nn.Sequential(
-            nn.Conv2d(3, C, kernel_size=3, padding=1, bias=False),
-            nn.BatchNorm2d(C)
-        )
-
-        layer_channels = [C] * N + [C * 2] + [C * 2] * N + [C * 4] + [C * 4] * N
-        layer_reductions = [False] * N + [True] + [False] * N + [True] + [False] * N
-
-        C_prev = C
-        self.cells = nn.ModuleList()
-        for C_curr, reduction in zip(layer_channels, layer_reductions):
-            if reduction:
-                cell = ResNetBasicblock(C_prev, C_curr, 2)
-            else:
-                ops: Dict[str, Callable[[int, int], nn.Module]] = {
-                    prim: lambda C_in, C_out: OPS_WITH_STRIDE[prim](C_in, C_out, 1) for prim in PRIMITIVES
-                }
-                cell = NasBench201Cell(ops, C_prev, C_curr, label='cell')
-            self.cells.append(cell)
-            C_prev = C_curr
-
-        self.lastact = nn.Sequential(
-            nn.BatchNorm2d(C_prev),
-            nn.ReLU(inplace=True)
-        )
-        self.global_pooling = nn.AdaptiveAvgPool2d(1)
-        self.classifier = nn.Linear(C_prev, self.num_labels)
-
-    def forward(self, inputs):
-        feature = self.stem(inputs)
-        for cell in self.cells:
-            feature = cell(feature)
-
-        out = self.lastact(feature)
-        out = self.global_pooling(out)
-        out = out.view(out.size(0), -1)
-        logits = self.classifier(out)
-
-        return logits
+from nni.nas.hub.pytorch.nasbench201 import *
--- a/nni/retiarii/hub/pytorch/nasnet.py
+++ b/nni/retiarii/hub/pytorch/nasnet.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-"""File containing NASNet-series search space.
+# pylint: disable=wildcard-import,unused-wildcard-import

-The implementation is based on NDS.
-It's called ``nasnet.py`` simply because NASNet is the first to propose such structure.
-"""
-
-from collections import OrderedDict
-from functools import partial
-from typing import Tuple, List, Union, Iterable, Dict, Callable, Optional, cast
-
-try:
-    from typing import Literal
-except ImportError:
-    from typing_extensions import Literal
-
-import torch
-
-import nni.retiarii.nn.pytorch as nn
-from nni.retiarii import model_wrapper
-
-from nni.retiarii.oneshot.pytorch.supermodule.sampling import PathSamplingRepeat
-from nni.retiarii.oneshot.pytorch.supermodule.differentiable import DifferentiableMixedRepeat
-
-from .utils.fixed import FixedFactory
-from .utils.pretrained import load_pretrained_weight
-
-
-# the following are NAS operations from
-# https://github.com/facebookresearch/unnas/blob/main/pycls/models/nas/operations.py
-
-OPS = {
-    'none': lambda C, stride, affine:
-        Zero(stride),
-    'avg_pool_2x2': lambda C, stride, affine:
-        nn.AvgPool2d(2, stride=stride, padding=0, count_include_pad=False),
-    'avg_pool_3x3': lambda C, stride, affine:
-        nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False),
-    'avg_pool_5x5': lambda C, stride, affine:
-        nn.AvgPool2d(5, stride=stride, padding=2, count_include_pad=False),
-    'max_pool_2x2': lambda C, stride, affine:
-        nn.MaxPool2d(2, stride=stride, padding=0),
-    'max_pool_3x3': lambda C, stride, affine:
-        nn.MaxPool2d(3, stride=stride, padding=1),
-    'max_pool_5x5': lambda C, stride, affine:
-        nn.MaxPool2d(5, stride=stride, padding=2),
-    'max_pool_7x7': lambda C, stride, affine:
-        nn.MaxPool2d(7, stride=stride, padding=3),
-    'skip_connect': lambda C, stride, affine:
-        nn.Identity() if stride == 1 else FactorizedReduce(C, C, affine=affine),
-    'conv_1x1': lambda C, stride, affine:
-        nn.Sequential(
-            nn.ReLU(inplace=False),
-            nn.Conv2d(C, C, 1, stride=stride, padding=0, bias=False),
-            nn.BatchNorm2d(C, affine=affine)
-        ),
-    'conv_3x3': lambda C, stride, affine:
-        nn.Sequential(
-            nn.ReLU(inplace=False),
-            nn.Conv2d(C, C, 3, stride=stride, padding=1, bias=False),
-            nn.BatchNorm2d(C, affine=affine)
-        ),
-    'sep_conv_3x3': lambda C, stride, affine:
-        SepConv(C, C, 3, stride, 1, affine=affine),
-    'sep_conv_5x5': lambda C, stride, affine:
-        SepConv(C, C, 5, stride, 2, affine=affine),
-    'sep_conv_7x7': lambda C, stride, affine:
-        SepConv(C, C, 7, stride, 3, affine=affine),
-    'dil_conv_3x3': lambda C, stride, affine:
-        DilConv(C, C, 3, stride, 2, 2, affine=affine),
-    'dil_conv_5x5': lambda C, stride, affine:
-        DilConv(C, C, 5, stride, 4, 2, affine=affine),
-    'dil_sep_conv_3x3': lambda C, stride, affine:
-        DilSepConv(C, C, 3, stride, 2, 2, affine=affine),
-    'conv_3x1_1x3': lambda C, stride, affine:
-        nn.Sequential(
-            nn.ReLU(inplace=False),
-            nn.Conv2d(C, C, (1, 3), stride=(1, stride), padding=(0, 1), bias=False),
-            nn.Conv2d(C, C, (3, 1), stride=(stride, 1), padding=(1, 0), bias=False),
-            nn.BatchNorm2d(C, affine=affine)
-        ),
-    'conv_7x1_1x7': lambda C, stride, affine:
-        nn.Sequential(
-            nn.ReLU(inplace=False),
-            nn.Conv2d(C, C, (1, 7), stride=(1, stride), padding=(0, 3), bias=False),
-            nn.Conv2d(C, C, (7, 1), stride=(stride, 1), padding=(3, 0), bias=False),
-            nn.BatchNorm2d(C, affine=affine)
-        ),
-}
-
-
-class ReLUConvBN(nn.Sequential):
-
-    def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
-        super().__init__(
-            nn.ReLU(inplace=False),
-            nn.Conv2d(
-                C_in, C_out, kernel_size, stride=stride,
-                padding=padding, bias=False
-            ),
-            nn.BatchNorm2d(C_out, affine=affine)
-        )
-
-
-class DilConv(nn.Sequential):
-
-    def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True):
-        super().__init__(
-            nn.ReLU(inplace=False),
-            nn.Conv2d(
-                C_in, C_in, kernel_size=kernel_size, stride=stride,
-                padding=padding, dilation=dilation, groups=C_in, bias=False
-            ),
-            nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
-            nn.BatchNorm2d(C_out, affine=affine),
-        )
-
-
-class SepConv(nn.Sequential):
-
-    def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
-        super().__init__(
-            nn.ReLU(inplace=False),
-            nn.Conv2d(
-                C_in, C_in, kernel_size=kernel_size, stride=stride,
-                padding=padding, groups=C_in, bias=False
-            ),
-            nn.Conv2d(C_in, C_in, kernel_size=1, padding=0, bias=False),
-            nn.BatchNorm2d(C_in, affine=affine),
-            nn.ReLU(inplace=False),
-            nn.Conv2d(
-                C_in, C_in, kernel_size=kernel_size, stride=1,
-                padding=padding, groups=C_in, bias=False
-            ),
-            nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
-            nn.BatchNorm2d(C_out, affine=affine),
-        )
-
-
-class DilSepConv(nn.Sequential):
-
-    def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True):
-        super().__init__(
-            nn.ReLU(inplace=False),
-            nn.Conv2d(
-                C_in, C_in, kernel_size=kernel_size, stride=stride,
-                padding=padding, dilation=dilation, groups=C_in, bias=False
-            ),
-            nn.Conv2d(C_in, C_in, kernel_size=1, padding=0, bias=False),
-            nn.BatchNorm2d(C_in, affine=affine),
-            nn.ReLU(inplace=False),
-            nn.Conv2d(
-                C_in, C_in, kernel_size=kernel_size, stride=1,
-                padding=padding, dilation=dilation, groups=C_in, bias=False
-            ),
-            nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
-            nn.BatchNorm2d(C_out, affine=affine),
-        )
-
-
-class Zero(nn.Module):
-
-    def __init__(self, stride):
-        super().__init__()
-        self.stride = stride
-
-    def forward(self, x):
-        if self.stride == 1:
-            return x.mul(0.)
-        return x[:, :, ::self.stride, ::self.stride].mul(0.)
-
-
-class FactorizedReduce(nn.Module):
-
-    def __init__(self, C_in, C_out, affine=True):
-        super().__init__()
-        if isinstance(C_out, int):
-            assert C_out % 2 == 0
-        else:   # is a value choice
-            assert all(c % 2 == 0 for c in C_out.all_options())
-        self.relu = nn.ReLU(inplace=False)
-        self.conv_1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
-        self.conv_2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
-        self.bn = nn.BatchNorm2d(C_out, affine=affine)
-        self.pad = nn.ConstantPad2d((0, 1, 0, 1), 0)
-
-    def forward(self, x):
-        x = self.relu(x)
-        y = self.pad(x)
-        out = torch.cat([self.conv_1(x), self.conv_2(y[:, :, 1:, 1:])], dim=1)
-        out = self.bn(out)
-        return out
-
-
-class DropPath_(nn.Module):
-    # https://github.com/khanrc/pt.darts/blob/0.1/models/ops.py
-    def __init__(self, drop_prob=0.):
-        super().__init__()
-        self.drop_prob = drop_prob
-
-    def forward(self, x):
-        if self.training and self.drop_prob > 0.:
-            keep_prob = 1. - self.drop_prob
-            mask = torch.zeros((x.size(0), 1, 1, 1), dtype=torch.float, device=x.device).bernoulli_(keep_prob)
-            return x.div(keep_prob).mul(mask)
-        return x
-
-
-class AuxiliaryHead(nn.Module):
-    def __init__(self, C: int, num_labels: int, dataset: Literal['imagenet', 'cifar']):
-        super().__init__()
-        if dataset == 'imagenet':
-            # assuming input size 14x14
-            stride = 2
-        elif dataset == 'cifar':
-            stride = 3
-
-        self.features = nn.Sequential(
-            nn.ReLU(inplace=True),
-            nn.AvgPool2d(5, stride=stride, padding=0, count_include_pad=False),
-            nn.Conv2d(C, 128, 1, bias=False),
-            nn.BatchNorm2d(128),
-            nn.ReLU(inplace=True),
-            nn.Conv2d(128, 768, 2, bias=False),
-            nn.BatchNorm2d(768),
-            nn.ReLU(inplace=True)
-        )
-        self.classifier = nn.Linear(768, num_labels)
-
-    def forward(self, x):
-        x = self.features(x)
-        x = self.classifier(x.view(x.size(0), -1))
-        return x
-
-
-class SequentialBreakdown(nn.Sequential):
-    """Return all layers of a sequential."""
-
-    def __init__(self, sequential: nn.Sequential):
-        super().__init__(OrderedDict(sequential.named_children()))
-
-    def forward(self, inputs):
-        result = []
-        for module in self:
-            inputs = module(inputs)
-            result.append(inputs)
-        return result
-
-
-class CellPreprocessor(nn.Module):
-    """
-    Aligning the shape of predecessors.
-
-    If the last cell is a reduction cell, ``pre0`` should be ``FactorizedReduce`` instead of ``ReLUConvBN``.
-    See :class:`CellBuilder` on how to calculate those channel numbers.
-    """
-
-    def __init__(self, C_pprev: nn.MaybeChoice[int], C_prev: nn.MaybeChoice[int], C: nn.MaybeChoice[int], last_cell_reduce: bool) -> None:
-        super().__init__()
-
-        if last_cell_reduce:
-            self.pre0 = FactorizedReduce(cast(int, C_pprev), cast(int, C))
-        else:
-            self.pre0 = ReLUConvBN(cast(int, C_pprev), cast(int, C), 1, 1, 0)
-        self.pre1 = ReLUConvBN(cast(int, C_prev), cast(int, C), 1, 1, 0)
-
-    def forward(self, cells):
-        assert len(cells) == 2
-        pprev, prev = cells
-        pprev = self.pre0(pprev)
-        prev = self.pre1(prev)
-
-        return [pprev, prev]
-
-
-class CellPostprocessor(nn.Module):
-    """
-    The cell outputs previous cell + this cell, so that cells can be directly chained.
-    """
-
-    def forward(self, this_cell, previous_cells):
-        return [previous_cells[-1], this_cell]
-
-
-class CellBuilder:
-    """The cell builder is used in Repeat.
-    Builds an cell each time it's "called".
-    Note that the builder is ephemeral, it can only be called once for every index.
-    """
-
-    def __init__(self, op_candidates: List[str],
-                 C_prev_in: nn.MaybeChoice[int],
-                 C_in: nn.MaybeChoice[int],
-                 C: nn.MaybeChoice[int],
-                 num_nodes: int,
-                 merge_op: Literal['all', 'loose_end'],
-                 first_cell_reduce: bool, last_cell_reduce: bool):
-        self.C_prev_in = C_prev_in      # This is the out channels of the cell before last cell.
-        self.C_in = C_in                # This is the out channesl of last cell.
-        self.C = C                      # This is NOT C_out of this stage, instead, C_out = C * len(cell.output_node_indices)
-        self.op_candidates = op_candidates
-        self.num_nodes = num_nodes
-        self.merge_op: Literal['all', 'loose_end'] = merge_op
-        self.first_cell_reduce = first_cell_reduce
-        self.last_cell_reduce = last_cell_reduce
-        self._expect_idx = 0
-
-        # It takes an index that is the index in the repeat.
-        # Number of predecessors for each cell is fixed to 2.
-        self.num_predecessors = 2
-
-        # Number of ops per node is fixed to 2.
-        self.num_ops_per_node = 2
-
-    def op_factory(self, node_index: int, op_index: int, input_index: Optional[int], *,
-                   op: str, channels: int, is_reduction_cell: bool):
-        if is_reduction_cell and (
-            input_index is None or input_index < self.num_predecessors
-        ):  # could be none when constructing search sapce
-            stride = 2
-        else:
-            stride = 1
-        return OPS[op](channels, stride, True)
-
-    def __call__(self, repeat_idx: int):
-        if self._expect_idx != repeat_idx:
-            raise ValueError(f'Expect index {self._expect_idx}, found {repeat_idx}')
-
-        # Reduction cell means stride = 2 and channel multiplied by 2.
-        is_reduction_cell = repeat_idx == 0 and self.first_cell_reduce
-
-        # self.C_prev_in, self.C_in, self.last_cell_reduce are updated after each cell is built.
-        preprocessor = CellPreprocessor(self.C_prev_in, self.C_in, self.C, self.last_cell_reduce)
-
-        ops_factory: Dict[str, Callable[[int, int, Optional[int]], nn.Module]] = {}
-        for op in self.op_candidates:
-            ops_factory[op] = partial(self.op_factory, op=op, channels=cast(int, self.C), is_reduction_cell=is_reduction_cell)
-
-        cell = nn.Cell(ops_factory, self.num_nodes, self.num_ops_per_node, self.num_predecessors, self.merge_op,
-                       preprocessor=preprocessor, postprocessor=CellPostprocessor(),
-                       label='reduce' if is_reduction_cell else 'normal')
-
-        # update state
-        self.C_prev_in = self.C_in
-        self.C_in = self.C * len(cell.output_node_indices)
-        self.last_cell_reduce = is_reduction_cell
-        self._expect_idx += 1
-
-        return cell
-
-
-class NDSStage(nn.Repeat):
-    """This class defines NDSStage, a special type of Repeat, for isinstance check, and shape alignment.
-
-    In NDS, we can't simply use Repeat to stack the blocks,
-    because the output shape of each stacked block can be different.
-    This is a problem for one-shot strategy because they assume every possible candidate
-    should return values of the same shape.
-
-    Therefore, we need :class:`NDSStagePathSampling` and :class:`NDSStageDifferentiable`
-    to manually align the shapes -- specifically, to transform the first block in each stage.
-
-    This is not required though, when depth is not changing, or the mutable depth causes no problem
-    (e.g., when the minimum depth is large enough).
-
-    .. attention::
-
-       Assumption: Loose end is treated as all in ``merge_op`` (the case in one-shot),
-       which enforces reduction cell and normal cells in the same stage to have the exact same output shape.
-    """
-
-    estimated_out_channels_prev: int
-    """Output channels of cells in last stage."""
-
-    estimated_out_channels: int
-    """Output channels of this stage. It's **estimated** because it assumes ``all`` as ``merge_op``."""
-
-    downsampling: bool
-    """This stage has downsampling"""
-
-    def first_cell_transformation_factory(self) -> Optional[nn.Module]:
-        """To make the "previous cell" in first cell's output have the same shape as cells in this stage."""
-        if self.downsampling:
-            return FactorizedReduce(self.estimated_out_channels_prev, self.estimated_out_channels)
-        elif self.estimated_out_channels_prev is not self.estimated_out_channels:
-            # Can't use != here, ValueChoice doesn't support
-            return ReLUConvBN(self.estimated_out_channels_prev, self.estimated_out_channels, 1, 1, 0)
-        return None
-
-
-class NDSStagePathSampling(PathSamplingRepeat):
-    """The path-sampling implementation (for one-shot) of each NDS stage if depth is mutating."""
-    @classmethod
-    def mutate(cls, module, name, memo, mutate_kwargs):
-        if isinstance(module, NDSStage) and isinstance(module.depth_choice, nn.api.ValueChoiceX):
-            return cls(
-                module.first_cell_transformation_factory(),
-                cast(List[nn.Module], module.blocks),
-                module.depth_choice
-            )
-
-    def __init__(self, first_cell_transformation: Optional[nn.Module], *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.first_cell_transformation = first_cell_transformation
-
-    def reduction(self, items: List[Tuple[torch.Tensor, torch.Tensor]], sampled: List[int]) -> Tuple[torch.Tensor, torch.Tensor]:
-        if 1 not in sampled or self.first_cell_transformation is None:
-            return super().reduction(items, sampled)
-        # items[0] must be the result of first cell
-        assert len(items[0]) == 2
-        # Only apply the transformation on "prev" output.
-        items[0] = (self.first_cell_transformation(items[0][0]), items[0][1])
-        return super().reduction(items, sampled)
-
-
-class NDSStageDifferentiable(DifferentiableMixedRepeat):
-    """The differentiable implementation (for one-shot) of each NDS stage if depth is mutating."""
-    @classmethod
-    def mutate(cls, module, name, memo, mutate_kwargs):
-        if isinstance(module, NDSStage) and isinstance(module.depth_choice, nn.api.ValueChoiceX):
-            # Only interesting when depth is mutable
-            softmax = mutate_kwargs.get('softmax', nn.Softmax(-1))
-            return cls(
-                module.first_cell_transformation_factory(),
-                cast(List[nn.Module], module.blocks),
-                module.depth_choice,
-                softmax,
-                memo
-            )
-
-    def __init__(self, first_cell_transformation: Optional[nn.Module], *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.first_cell_transformation = first_cell_transformation
-
-    def reduction(
-        self, items: List[Tuple[torch.Tensor, torch.Tensor]], weights: List[float], depths: List[int]
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        if 1 not in depths or self.first_cell_transformation is None:
-            return super().reduction(items, weights, depths)
-        # Same as NDSStagePathSampling
-        assert len(items[0]) == 2
-        items[0] = (self.first_cell_transformation(items[0][0]), items[0][1])
-        return super().reduction(items, weights, depths)
-
-
-_INIT_PARAMETER_DOCS = """
-
-    Parameters
-    ----------
-    width : int or tuple of int
-        A fixed initial width or a tuple of widths to choose from.
-    num_cells : int or tuple of int
-        A fixed number of cells (depths) to stack, or a tuple of depths to choose from.
-    dataset : "cifar" | "imagenet"
-        The essential differences are in "stem" cells, i.e., how they process the raw image input.
-        Choosing "imagenet" means more downsampling at the beginning of the network.
-    auxiliary_loss : bool
-        If true, another auxiliary classification head will produce the another prediction.
-        This makes the output of network two logits in the training phase.
-
-"""
-
-
-class NDS(nn.Module):
-    __doc__ = """
-    The unified version of NASNet search space.
-
-    We follow the implementation in
-    `unnas <https://github.com/facebookresearch/unnas/blob/main/pycls/models/nas/nas.py>`__.
-    See `On Network Design Spaces for Visual Recognition <https://arxiv.org/abs/1905.13214>`__ for details.
-
-    Different NAS papers usually differ in the way that they specify ``op_candidates`` and ``merge_op``.
-    ``dataset`` here is to give a hint about input resolution, so as to create reasonable stem and auxiliary heads.
-
-    NDS has a speciality that it has mutable depths/widths.
-    This is implemented by accepting a list of int as ``num_cells`` / ``width``.
-    """ + _INIT_PARAMETER_DOCS + """
-    op_candidates : list of str
-        List of operator candidates. Must be from ``OPS``.
-    merge_op : ``all`` or ``loose_end``
-        See :class:`~nni.retiarii.nn.pytorch.Cell`.
-    num_nodes_per_cell : int
-        See :class:`~nni.retiarii.nn.pytorch.Cell`.
-    """
-
-    def __init__(self,
-                 op_candidates: List[str],
-                 merge_op: Literal['all', 'loose_end'] = 'all',
-                 num_nodes_per_cell: int = 4,
-                 width: Union[Tuple[int, ...], int] = 16,
-                 num_cells: Union[Tuple[int, ...], int] = 20,
-                 dataset: Literal['cifar', 'imagenet'] = 'imagenet',
-                 auxiliary_loss: bool = False):
-        super().__init__()
-
-        self.dataset = dataset
-        self.num_labels = 10 if dataset == 'cifar' else 1000
-        self.auxiliary_loss = auxiliary_loss
-
-        # preprocess the specified width and depth
-        if isinstance(width, Iterable):
-            C = nn.ValueChoice(list(width), label='width')
-        else:
-            C = width
-
-        self.num_cells: nn.MaybeChoice[int] = cast(int, num_cells)
-        if isinstance(num_cells, Iterable):
-            self.num_cells = nn.ValueChoice(list(num_cells), label='depth')
-        num_cells_per_stage = [(i + 1) * self.num_cells // 3 - i * self.num_cells // 3 for i in range(3)]
-
-        # auxiliary head is different for network targetted at different datasets
-        if dataset == 'imagenet':
-            self.stem0 = nn.Sequential(
-                nn.Conv2d(3, cast(int, C // 2), kernel_size=3, stride=2, padding=1, bias=False),
-                nn.BatchNorm2d(cast(int, C // 2)),
-                nn.ReLU(inplace=True),
-                nn.Conv2d(cast(int, C // 2), cast(int, C), 3, stride=2, padding=1, bias=False),
-                nn.BatchNorm2d(C),
-            )
-            self.stem1 = nn.Sequential(
-                nn.ReLU(inplace=True),
-                nn.Conv2d(cast(int, C), cast(int, C), 3, stride=2, padding=1, bias=False),
-                nn.BatchNorm2d(C),
-            )
-            C_pprev = C_prev = C_curr = C
-            last_cell_reduce = True
-        elif dataset == 'cifar':
-            self.stem = nn.Sequential(
-                nn.Conv2d(3, cast(int, 3 * C), 3, padding=1, bias=False),
-                nn.BatchNorm2d(cast(int, 3 * C))
-            )
-            C_pprev = C_prev = 3 * C
-            C_curr = C
-            last_cell_reduce = False
-        else:
-            raise ValueError(f'Unsupported dataset: {dataset}')
-
-        self.stages = nn.ModuleList()
-        for stage_idx in range(3):
-            if stage_idx > 0:
-                C_curr *= 2
-            # For a stage, we get C_in, C_curr, and C_out.
-            # C_in is only used in the first cell.
-            # C_curr is number of channels for each operator in current stage.
-            # C_out is usually `C * num_nodes_per_cell` because of concat operator.
-            cell_builder = CellBuilder(op_candidates, C_pprev, C_prev, C_curr, num_nodes_per_cell,
-                                       merge_op, stage_idx > 0, last_cell_reduce)
-            stage: Union[NDSStage, nn.Sequential] = NDSStage(cell_builder, num_cells_per_stage[stage_idx])
-
-            if isinstance(stage, NDSStage):
-                stage.estimated_out_channels_prev = cast(int, C_prev)
-                stage.estimated_out_channels = cast(int, C_curr * num_nodes_per_cell)
-                stage.downsampling = stage_idx > 0
-
-            self.stages.append(stage)
-
-            # NOTE: output_node_indices will be computed on-the-fly in trial code.
-            # When constructing model space, it's just all the nodes in the cell,
-            # which happens to be the case of one-shot supernet.
-
-            # C_pprev is output channel number of last second cell among all the cells already built.
-            if len(stage) > 1:
-                # Contains more than one cell
-                C_pprev = len(cast(nn.Cell, stage[-2]).output_node_indices) * C_curr
-            else:
-                # Look up in the out channels of last stage.
-                C_pprev = C_prev
-
-            # This was originally,
-            # C_prev = num_nodes_per_cell * C_curr.
-            # but due to loose end, it becomes,
-            C_prev = len(cast(nn.Cell, stage[-1]).output_node_indices) * C_curr
-
-            # Useful in aligning the pprev and prev cell.
-            last_cell_reduce = cell_builder.last_cell_reduce
-
-            if stage_idx == 2:
-                C_to_auxiliary = C_prev
-
-        if auxiliary_loss:
-            assert isinstance(self.stages[2], nn.Sequential), 'Auxiliary loss can only be enabled in retrain mode.'
-            self.stages[2] = SequentialBreakdown(cast(nn.Sequential, self.stages[2]))
-            self.auxiliary_head = AuxiliaryHead(C_to_auxiliary, self.num_labels, dataset=self.dataset)  # type: ignore
-
-        self.global_pooling = nn.AdaptiveAvgPool2d((1, 1))
-        self.classifier = nn.Linear(cast(int, C_prev), self.num_labels)
-
-    def forward(self, inputs):
-        if self.dataset == 'imagenet':
-            s0 = self.stem0(inputs)
-            s1 = self.stem1(s0)
-        else:
-            s0 = s1 = self.stem(inputs)
-
-        for stage_idx, stage in enumerate(self.stages):
-            if stage_idx == 2 and self.auxiliary_loss:
-                s = list(stage([s0, s1]).values())
-                s0, s1 = s[-1]
-                if self.training:
-                    # auxiliary loss is attached to the first cell of the last stage.
-                    logits_aux = self.auxiliary_head(s[0][1])
-            else:
-                s0, s1 = stage([s0, s1])
-
-        out = self.global_pooling(s1)
-        logits = self.classifier(out.view(out.size(0), -1))
-        if self.training and self.auxiliary_loss:
-            return logits, logits_aux  # type: ignore
-        else:
-            return logits
-
-    def set_drop_path_prob(self, drop_prob):
-        """
-        Set the drop probability of Drop-path in the network.
-        Reference: `FractalNet: Ultra-Deep Neural Networks without Residuals <https://arxiv.org/pdf/1605.07648v4.pdf>`__.
-        """
-        for module in self.modules():
-            if isinstance(module, DropPath_):
-                module.drop_prob = drop_prob
-
-    @classmethod
-    def fixed_arch(cls, arch: dict) -> FixedFactory:
-        return FixedFactory(cls, arch)
-
-
-@model_wrapper
-class NASNet(NDS):
-    __doc__ = """
-    Search space proposed in `Learning Transferable Architectures for Scalable Image Recognition <https://arxiv.org/abs/1707.07012>`__.
-
-    It is built upon :class:`~nni.retiarii.nn.pytorch.Cell`, and implemented based on :class:`~NDS`.
-    Its operator candidates are :attribute:`~NASNet.NASNET_OPS`.
-    It has 5 nodes per cell, and the output is concatenation of nodes not used as input to other nodes.
-    """ + _INIT_PARAMETER_DOCS
-
-    NASNET_OPS = [
-        'skip_connect',
-        'conv_3x1_1x3',
-        'conv_7x1_1x7',
-        'dil_conv_3x3',
-        'avg_pool_3x3',
-        'max_pool_3x3',
-        'max_pool_5x5',
-        'max_pool_7x7',
-        'conv_1x1',
-        'conv_3x3',
-        'sep_conv_3x3',
-        'sep_conv_5x5',
-        'sep_conv_7x7',
-    ]
-
-    def __init__(self,
-                 width: Union[Tuple[int, ...], int] = (16, 24, 32),
-                 num_cells: Union[Tuple[int, ...], int] = (4, 8, 12, 16, 20),
-                 dataset: Literal['cifar', 'imagenet'] = 'cifar',
-                 auxiliary_loss: bool = False):
-        super().__init__(self.NASNET_OPS,
-                         merge_op='loose_end',
-                         num_nodes_per_cell=5,
-                         width=width,
-                         num_cells=num_cells,
-                         dataset=dataset,
-                         auxiliary_loss=auxiliary_loss)
-
-
-@model_wrapper
-class ENAS(NDS):
-    __doc__ = """Search space proposed in `Efficient neural architecture search via parameter sharing <https://arxiv.org/abs/1802.03268>`__.
-
-    It is built upon :class:`~nni.retiarii.nn.pytorch.Cell`, and implemented based on :class:`~NDS`.
-    Its operator candidates are :attribute:`~ENAS.ENAS_OPS`.
-    It has 5 nodes per cell, and the output is concatenation of nodes not used as input to other nodes.
-    """ + _INIT_PARAMETER_DOCS
-
-    ENAS_OPS = [
-        'skip_connect',
-        'sep_conv_3x3',
-        'sep_conv_5x5',
-        'avg_pool_3x3',
-        'max_pool_3x3',
-    ]
-
-    def __init__(self,
-                 width: Union[Tuple[int, ...], int] = (16, 24, 32),
-                 num_cells: Union[Tuple[int, ...], int] = (4, 8, 12, 16, 20),
-                 dataset: Literal['cifar', 'imagenet'] = 'cifar',
-                 auxiliary_loss: bool = False):
-        super().__init__(self.ENAS_OPS,
-                         merge_op='loose_end',
-                         num_nodes_per_cell=5,
-                         width=width,
-                         num_cells=num_cells,
-                         dataset=dataset,
-                         auxiliary_loss=auxiliary_loss)
-
-
-@model_wrapper
-class AmoebaNet(NDS):
-    __doc__ = """Search space proposed in
-    `Regularized evolution for image classifier architecture search <https://arxiv.org/abs/1802.01548>`__.
-
-    It is built upon :class:`~nni.retiarii.nn.pytorch.Cell`, and implemented based on :class:`~NDS`.
-    Its operator candidates are :attribute:`~AmoebaNet.AMOEBA_OPS`.
-    It has 5 nodes per cell, and the output is concatenation of nodes not used as input to other nodes.
-    """ + _INIT_PARAMETER_DOCS
-
-    AMOEBA_OPS = [
-        'skip_connect',
-        'sep_conv_3x3',
-        'sep_conv_5x5',
-        'sep_conv_7x7',
-        'avg_pool_3x3',
-        'max_pool_3x3',
-        'dil_sep_conv_3x3',
-        'conv_7x1_1x7',
-    ]
-
-    def __init__(self,
-                 width: Union[Tuple[int, ...], int] = (16, 24, 32),
-                 num_cells: Union[Tuple[int, ...], int] = (4, 8, 12, 16, 20),
-                 dataset: Literal['cifar', 'imagenet'] = 'cifar',
-                 auxiliary_loss: bool = False):
-
-        super().__init__(self.AMOEBA_OPS,
-                         merge_op='loose_end',
-                         num_nodes_per_cell=5,
-                         width=width,
-                         num_cells=num_cells,
-                         dataset=dataset,
-                         auxiliary_loss=auxiliary_loss)
-
-
-@model_wrapper
-class PNAS(NDS):
-    __doc__ = """Search space proposed in
-    `Progressive neural architecture search <https://arxiv.org/abs/1712.00559>`__.
-
-    It is built upon :class:`~nni.retiarii.nn.pytorch.Cell`, and implemented based on :class:`~NDS`.
-    Its operator candidates are :attribute:`~PNAS.PNAS_OPS`.
-    It has 5 nodes per cell, and the output is concatenation of all nodes in the cell.
-    """ + _INIT_PARAMETER_DOCS
-
-    PNAS_OPS = [
-        'sep_conv_3x3',
-        'sep_conv_5x5',
-        'sep_conv_7x7',
-        'conv_7x1_1x7',
-        'skip_connect',
-        'avg_pool_3x3',
-        'max_pool_3x3',
-        'dil_conv_3x3',
-    ]
-
-    def __init__(self,
-                 width: Union[Tuple[int, ...], int] = (16, 24, 32),
-                 num_cells: Union[Tuple[int, ...], int] = (4, 8, 12, 16, 20),
-                 dataset: Literal['cifar', 'imagenet'] = 'cifar',
-                 auxiliary_loss: bool = False):
-        super().__init__(self.PNAS_OPS,
-                         merge_op='all',
-                         num_nodes_per_cell=5,
-                         width=width,
-                         num_cells=num_cells,
-                         dataset=dataset,
-                         auxiliary_loss=auxiliary_loss)
-
-
-@model_wrapper
-class DARTS(NDS):
-    __doc__ = """Search space proposed in `Darts: Differentiable architecture search <https://arxiv.org/abs/1806.09055>`__.
-
-    It is built upon :class:`~nni.retiarii.nn.pytorch.Cell`, and implemented based on :class:`~NDS`.
-    Its operator candidates are :attribute:`~DARTS.DARTS_OPS`.
-    It has 4 nodes per cell, and the output is concatenation of all nodes in the cell.
-    """ + _INIT_PARAMETER_DOCS
-
-    DARTS_OPS = [
-        'none',
-        'max_pool_3x3',
-        'avg_pool_3x3',
-        'skip_connect',
-        'sep_conv_3x3',
-        'sep_conv_5x5',
-        'dil_conv_3x3',
-        'dil_conv_5x5',
-    ]
-
-    def __init__(self,
-                 width: Union[Tuple[int, ...], int] = (16, 24, 32),
-                 num_cells: Union[Tuple[int, ...], int] = (4, 8, 12, 16, 20),
-                 dataset: Literal['cifar', 'imagenet'] = 'cifar',
-                 auxiliary_loss: bool = False):
-        super().__init__(self.DARTS_OPS,
-                         merge_op='all',
-                         num_nodes_per_cell=4,
-                         width=width,
-                         num_cells=num_cells,
-                         dataset=dataset,
-                         auxiliary_loss=auxiliary_loss)
-
-    @classmethod
-    def load_searched_model(
-        cls, name: str,
-        pretrained: bool = False, download: bool = False, progress: bool = True
-    ) -> nn.Module:
-
-        init_kwargs = {}  # all default
-
-        if name == 'darts-v2':
-            init_kwargs.update(
-                num_cells=20,
-                width=36,
-            )
-            arch = {
-                'normal/op_2_0': 'sep_conv_3x3',
-                'normal/op_2_1': 'sep_conv_3x3',
-                'normal/input_2_0': 0,
-                'normal/input_2_1': 1,
-                'normal/op_3_0': 'sep_conv_3x3',
-                'normal/op_3_1': 'sep_conv_3x3',
-                'normal/input_3_0': 0,
-                'normal/input_3_1': 1,
-                'normal/op_4_0': 'sep_conv_3x3',
-                'normal/op_4_1': 'skip_connect',
-                'normal/input_4_0': 1,
-                'normal/input_4_1': 0,
-                'normal/op_5_0': 'skip_connect',
-                'normal/op_5_1': 'dil_conv_3x3',
-                'normal/input_5_0': 0,
-                'normal/input_5_1': 2,
-                'reduce/op_2_0': 'max_pool_3x3',
-                'reduce/op_2_1': 'max_pool_3x3',
-                'reduce/input_2_0': 0,
-                'reduce/input_2_1': 1,
-                'reduce/op_3_0': 'skip_connect',
-                'reduce/op_3_1': 'max_pool_3x3',
-                'reduce/input_3_0': 2,
-                'reduce/input_3_1': 1,
-                'reduce/op_4_0': 'max_pool_3x3',
-                'reduce/op_4_1': 'skip_connect',
-                'reduce/input_4_0': 0,
-                'reduce/input_4_1': 2,
-                'reduce/op_5_0': 'skip_connect',
-                'reduce/op_5_1': 'max_pool_3x3',
-                'reduce/input_5_0': 2,
-                'reduce/input_5_1': 1
-            }
-
-        else:
-            raise ValueError(f'Unsupported architecture with name: {name}')
-
-        model_factory = cls.fixed_arch(arch)
-        model = model_factory(**init_kwargs)
-
-        if pretrained:
-            weight_file = load_pretrained_weight(name, download=download, progress=progress)
-            pretrained_weights = torch.load(weight_file)
-            model.load_state_dict(pretrained_weights)
-
-        return model
+from nni.nas.hub.pytorch.nasnet import *
--- a/nni/retiarii/hub/pytorch/proxylessnas.py
+++ b/nni/retiarii/hub/pytorch/proxylessnas.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-import math
-from typing import Optional, Callable, List, Tuple, Iterator, Union, cast, overload
+# pylint: disable=wildcard-import,unused-wildcard-import

-import torch
-import nni.retiarii.nn.pytorch as nn
-from nni.retiarii import model_wrapper
-
-from .utils.fixed import FixedFactory
-from .utils.pretrained import load_pretrained_weight
-
-
-@overload
-def make_divisible(v: Union[int, float], divisor, min_val=None) -> int:
-    ...
-
-
-@overload
-def make_divisible(v: Union[nn.ChoiceOf[int], nn.ChoiceOf[float]], divisor, min_val=None) -> nn.ChoiceOf[int]:
-    ...
-
-
-def make_divisible(v: Union[nn.ChoiceOf[int], nn.ChoiceOf[float], int, float], divisor, min_val=None) -> nn.MaybeChoice[int]:
-    """
-    This function is taken from the original tf repo.
-    It ensures that all layers have a channel number that is divisible by 8
-    It can be seen here:
-    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
-    """
-    if min_val is None:
-        min_val = divisor
-    # This should work for both value choices and constants.
-    new_v = nn.ValueChoice.max(min_val, round(v + divisor // 2) // divisor * divisor)
-    # Make sure that round down does not go down by more than 10%.
-    return nn.ValueChoice.condition(new_v < 0.9 * v, new_v + divisor, new_v)
-
-
-def simplify_sequential(sequentials: List[nn.Module]) -> Iterator[nn.Module]:
-    """
-    Flatten the sequential blocks so that the hierarchy looks better.
-    Eliminate identity modules automatically.
-    """
-    for module in sequentials:
-        if isinstance(module, nn.Sequential):
-            for submodule in module.children():
-                # no recursive expansion
-                if not isinstance(submodule, nn.Identity):
-                    yield submodule
-        else:
-            if not isinstance(module, nn.Identity):
-                yield module
-
-
-class ConvBNReLU(nn.Sequential):
-    """
-    A convolution followed by a normalization layer and an activation layer, packed as one sequence.
-
-    ``norm_layer`` falls back to ``nn.BatchNorm2d`` and ``activation_layer`` to ``nn.ReLU6`` when omitted.
-    Identity layers are stripped from the resulting sequence.
-    """
-
-    def __init__(
-        self,
-        in_channels: nn.MaybeChoice[int],
-        out_channels: nn.MaybeChoice[int],
-        kernel_size: nn.MaybeChoice[int] = 3,
-        stride: int = 1,
-        groups: nn.MaybeChoice[int] = 1,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
-        activation_layer: Optional[Callable[..., nn.Module]] = None,
-        dilation: int = 1,
-    ) -> None:
-        conv_padding = (kernel_size - 1) // 2 * dilation
-        norm_factory = nn.BatchNorm2d if norm_layer is None else norm_layer
-        act_factory = nn.ReLU6 if activation_layer is None else activation_layer
-
-        # The normalization module is created first because the convolution only carries a bias
-        # when no real normalization follows it.
-        # https://github.com/google-research/google-research/blob/20736344/tunas/rematlib/mobile_model_v3.py#L194
-        normalization = norm_factory(cast(int, out_channels))
-        convolution = nn.Conv2d(
-            cast(int, in_channels),
-            cast(int, out_channels),
-            cast(int, kernel_size),
-            stride,
-            cast(int, conv_padding),
-            dilation=dilation,
-            groups=cast(int, groups),
-            bias=isinstance(normalization, nn.Identity)
-        )
-        # Some PyTorch implementations insert an SE module at this position to reproduce the paper faithfully.
-        # Here the more accepted approach is followed and SE is kept outside of this block.
-        # Reference: https://github.com/d-li14/mobilenetv3.pytorch/issues/18
-        activation = act_factory(inplace=True)
-
-        # The normalization entry is listed regardless of being a batchnorm or an identity;
-        # identities are removed by ``simplify_sequential``.
-        super().__init__(*simplify_sequential([convolution, normalization, activation]))
-
-
-class DepthwiseSeparableConv(nn.Sequential):
-    """
-    Depth-wise convolution, optional squeeze-and-excite, then a linear point-wise convolution.
-
-    In the original MobileNetV2 implementation, this is InvertedResidual when expand ratio = 1.
-    A residual connection is added when ``stride == 1`` and ``in_channels is out_channels``.
-
-    References:
-
-    - https://github.com/rwightman/pytorch-image-models/blob/b7cb8d03/timm/models/efficientnet_blocks.py#L90
-    - https://github.com/google-research/google-research/blob/20736344/tunas/rematlib/mobile_model_v3.py#L433
-    - https://github.com/ultmaster/AceNAS/blob/46c8895f/searchspace/proxylessnas/utils.py#L100
-    """
-
-    def __init__(
-        self,
-        in_channels: nn.MaybeChoice[int],
-        out_channels: nn.MaybeChoice[int],
-        kernel_size: nn.MaybeChoice[int] = 3,
-        stride: int = 1,
-        squeeze_excite: Optional[Callable[[nn.MaybeChoice[int], nn.MaybeChoice[int]], nn.Module]] = None,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
-        activation_layer: Optional[Callable[..., nn.Module]] = None,
-    ) -> None:
-        # dw: one group per input channel
-        depthwise = ConvBNReLU(in_channels, in_channels, stride=stride, kernel_size=kernel_size, groups=in_channels,
-                               norm_layer=norm_layer, activation_layer=activation_layer)
-        # optional se; the identity placeholder is removed by ``simplify_sequential``
-        excitation = squeeze_excite(in_channels, in_channels) if squeeze_excite else nn.Identity()
-        # pw-linear: no activation after the projection
-        pointwise = ConvBNReLU(in_channels, out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=nn.Identity)
-
-        super().__init__(*simplify_sequential([depthwise, excitation, pointwise]))
-        # NOTE: identity ("is") rather than equality ("==") is tested so that no new value choice is created.
-        self.has_skip = stride == 1 and in_channels is out_channels
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        if not self.has_skip:
-            return super().forward(x)
-        return x + super().forward(x)
-
-
-class InvertedResidual(nn.Sequential):
-    """
-    An Inverted Residual Block, sometimes called an MBConv Block, is a type of residual block used for image models
-    that uses an inverted structure for efficiency reasons.
-
-    It was originally proposed for the `MobileNetV2 <https://arxiv.org/abs/1801.04381>`__ CNN architecture.
-    It has since been reused for several mobile-optimized CNNs.
-    It follows a narrow -> wide -> narrow approach, hence the inversion.
-    It first widens with a 1x1 convolution, then uses a 3x3 depthwise convolution (which greatly reduces the number of parameters),
-    then a 1x1 convolution is used to reduce the number of channels so input and output can be added.
-
-    This implementation is sort of a mixture between:
-
-    - https://github.com/google-research/google-research/blob/20736344/tunas/rematlib/mobile_model_v3.py#L453
-    - https://github.com/rwightman/pytorch-image-models/blob/b7cb8d03/timm/models/efficientnet_blocks.py#L134
-
-    The layers are assembled as: point-wise conv, depth-wise conv, optional squeeze-and-excite, linear point-wise conv.
-    The hidden width is ``make_divisible(in_channels * expand_ratio, 8)``.
-    When ``squeeze_excite`` is given, it is called as ``squeeze_excite(hidden_ch, in_channels)``.
-    ``forward`` adds the input to the output when ``stride == 1`` and ``in_channels is out_channels``.
-    """
-
-    def __init__(
-        self,
-        in_channels: nn.MaybeChoice[int],
-        out_channels: nn.MaybeChoice[int],
-        expand_ratio: nn.MaybeChoice[float],
-        kernel_size: nn.MaybeChoice[int] = 3,
-        stride: int = 1,
-        squeeze_excite: Optional[Callable[[nn.MaybeChoice[int], nn.MaybeChoice[int]], nn.Module]] = None,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
-        activation_layer: Optional[Callable[..., nn.Module]] = None,
-    ) -> None:
-        # NOTE(review): ``super().__init__`` is called a second time at the end of this method with the final layers.
-        # Confirm that the attributes assigned in between are meant to survive that second call.
-        super().__init__()
-        self.stride = stride
-        self.out_channels = out_channels
-        # Only strides 1 and 2 are accepted.
-        assert stride in [1, 2]
-
-        # Width of the expanded (hidden) representation, aligned with divisor 8.
-        hidden_ch = cast(int, make_divisible(in_channels * expand_ratio, 8))
-
-        # NOTE: this equivalence check (==) does NOT work for ValueChoice, need to use "is"
-        self.has_skip = stride == 1 and in_channels is out_channels
-
-        layers: List[nn.Module] = [
-            # point-wise convolution
-            # NOTE: some papers omit this point-wise convolution when stride = 1.
-            # In our implementation, if this pw convolution is intended to be omitted,
-            # please use SepConv (presumably :class:`DepthwiseSeparableConv` -- TODO confirm) instead.
-            ConvBNReLU(in_channels, hidden_ch, kernel_size=1,
-                       norm_layer=norm_layer, activation_layer=activation_layer),
-            # depth-wise
-            ConvBNReLU(hidden_ch, hidden_ch, stride=stride, kernel_size=kernel_size, groups=hidden_ch,
-                       norm_layer=norm_layer, activation_layer=activation_layer),
-            # SE
-            squeeze_excite(
-                cast(int, hidden_ch),
-                cast(int, in_channels)
-            ) if squeeze_excite is not None else nn.Identity(),
-            # pw-linear
-            ConvBNReLU(hidden_ch, out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=nn.Identity),
-        ]
-
-        # Nested sequentials are flattened by one level and identity placeholders are dropped.
-        super().__init__(*simplify_sequential(layers))
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Run the layers in order; add the input to the result when ``has_skip`` is set."""
-        if self.has_skip:
-            return x + super().forward(x)
-        else:
-            return super().forward(x)
-
-
-def inverted_residual_choice_builder(
-    expand_ratios: List[int],
-    kernel_sizes: List[int],
-    downsample: bool,
-    stage_input_width: int,
-    stage_output_width: int,
-    label: str
-):
-    """
-    Create a factory that builds the layer choice of the ``index``-th block in a stage.
-
-    Each produced layer choice holds one :class:`InvertedResidual` per (expand ratio, kernel size)
-    combination, keyed ``k{kernel_size}e{exp_ratio}`` and labeled ``{label}_i{index}``.
-    """
-    def builder(index):
-        # Only the first layer in a stage reshapes the width and (when requested) downsamples.
-        leads_stage = index == 0
-        inp = stage_input_width if leads_stage else stage_output_width
-        stride = 2 if leads_stage and downsample else 1
-        oup = stage_output_width
-
-        candidates = {
-            f'k{kernel_size}e{exp_ratio}': InvertedResidual(inp, oup, exp_ratio, kernel_size, stride)
-            for exp_ratio in expand_ratios
-            for kernel_size in kernel_sizes
-        }
-
-        # A ValueChoice-based implementation is possible, but LayerChoice is used
-        # to stay aligned with the intention of the original ProxylessNAS.
-        return nn.LayerChoice(candidates, label=f'{label}_i{index}')
-
-    return builder
-
-
-@model_wrapper
-class ProxylessNAS(nn.Module):
-    """
-    The search space proposed by `ProxylessNAS <https://arxiv.org/abs/1812.00332>`__.
-
-    Following the official implementation, the inverted residual with kernel size / expand ratio variations in each layer
-    is implemented with a :class:`nn.LayerChoice` with all-combination candidates. That means,
-    when used in weight sharing, these candidates will be treated as separate layers, and won't be fine-grained shared.
-    We note that :class:`MobileNetV3Space` is different in this perspective.
-
-    This space can be implemented as part of :class:`MobileNetV3Space`, but we separate those following conventions.
-
-    Parameters
-    ----------
-    num_labels : int
-        Output size of the final linear classifier. Default: 1000.
-    base_widths : tuple of int
-        Exactly 9 channel widths. Each one is multiplied by ``width_mult`` and aligned with ``make_divisible(..., 8)``.
-    dropout_rate : float
-        Probability used by the dropout layer in front of the classifier. Default: 0.
-    width_mult : float
-        Multiplier applied to every entry of ``base_widths``. Default: 1.0.
-    bn_eps : float
-        Forwarded to :func:`reset_parameters` as ``bn_eps``. Default: 1e-3.
-    bn_momentum : float
-        Forwarded to :func:`reset_parameters` as ``bn_momentum``. Default: 0.1.
-    """
-
-    def __init__(self, num_labels: int = 1000,
-                 base_widths: Tuple[int, ...] = (32, 16, 32, 40, 80, 96, 192, 320, 1280),
-                 dropout_rate: float = 0.,
-                 width_mult: float = 1.0,
-                 bn_eps: float = 1e-3,
-                 bn_momentum: float = 0.1):
-
-        super().__init__()
-
-        assert len(base_widths) == 9
-        # include the last stage info widths here
-        widths = [make_divisible(width * width_mult, 8) for width in base_widths]
-        # Indexed by stage number below; only entries 2..7 are read in this constructor.
-        downsamples = [True, False, True, True, True, False, True, False]
-
-        self.num_labels = num_labels
-        self.dropout_rate = dropout_rate
-        self.bn_eps = bn_eps
-        self.bn_momentum = bn_momentum
-
-        self.stem = ConvBNReLU(3, widths[0], stride=2, norm_layer=nn.BatchNorm2d)
-
-        blocks: List[nn.Module] = [
-            # first stage is fixed
-            DepthwiseSeparableConv(widths[0], widths[1], kernel_size=3, stride=1)
-        ]
-
-        # https://github.com/ultmaster/AceNAS/blob/46c8895fd8a05ffbc61a6b44f1e813f64b4f66b7/searchspace/proxylessnas/__init__.py#L21
-        for stage in range(2, 8):
-            # Rather than returning a fixed module here,
-            # we return a builder that dynamically creates module for different `repeat_idx`.
-            builder = inverted_residual_choice_builder(
-                [3, 6], [3, 5, 7], downsamples[stage], widths[stage - 1], widths[stage], f's{stage}')
-            if stage < 7:
-                # Stages 2..6 have a searchable depth, labeled ``s{stage}_depth``.
-                blocks.append(nn.Repeat(builder, (1, 4), label=f's{stage}_depth'))
-            else:
-                # No mutation for depth in the last stage.
-                # Directly call builder to initiate one block
-                blocks.append(builder(0))
-
-        self.blocks = nn.Sequential(*blocks)
-
-        # final layers
-        self.feature_mix_layer = ConvBNReLU(widths[7], widths[8], kernel_size=1, norm_layer=nn.BatchNorm2d)
-        self.global_avg_pooling = nn.AdaptiveAvgPool2d(1)
-        self.dropout_layer = nn.Dropout(dropout_rate)
-        self.classifier = nn.Linear(widths[-1], num_labels)
-
-        # Re-initialize the weights and overwrite momentum / eps of the 2D batch norms.
-        reset_parameters(self, bn_momentum=bn_momentum, bn_eps=bn_eps)
-
-    def forward(self, x):
-        """Apply stem, blocks, feature mix layer, global average pooling, dropout and classifier in that order."""
-        x = self.stem(x)
-        x = self.blocks(x)
-        x = self.feature_mix_layer(x)
-        x = self.global_avg_pooling(x)
-        x = x.view(x.size(0), -1)  # flatten
-        x = self.dropout_layer(x)
-        x = self.classifier(x)
-        return x
-
-    def no_weight_decay(self):
-        """Return the parameter names of the classifier, or an empty set when there is no ``classifier`` attribute."""
-        # this is useful for timm optimizer
-        # no regularizer to linear layer
-        if hasattr(self, 'classifier'):
-            return {'classifier.weight', 'classifier.bias'}
-        return set()
-
-    @classmethod
-    def fixed_arch(cls, arch: dict) -> FixedFactory:
-        """Wrap this class and ``arch`` into a :class:`FixedFactory`."""
-        return FixedFactory(cls, arch)
-
-    @classmethod
-    def load_searched_model(
-        cls, name: str,
-        pretrained: bool = False, download: bool = False, progress: bool = True
-    ) -> nn.Module:
-        """
-        Build one of the predefined searched architectures.
-
-        Parameters
-        ----------
-        name : str
-            One of ``acenas-m1``, ``acenas-m2``, ``acenas-m3``, ``proxyless-cpu``, ``proxyless-gpu``, ``proxyless-mobile``.
-        pretrained : bool
-            If true, a weight file is obtained with ``load_pretrained_weight`` and loaded into the model.
-        download : bool
-            Forwarded to ``load_pretrained_weight``.
-        progress : bool
-            Forwarded to ``load_pretrained_weight``.
-
-        Raises
-        ------
-        ValueError
-            If ``name`` is not one of the supported names.
-        """
-
-        init_kwargs = {}  # all default
-
-        if name == 'acenas-m1':
-            arch = {
-                's2_depth': 2,
-                's2_i0': 'k3e6',
-                's2_i1': 'k3e3',
-                's3_depth': 3,
-                's3_i0': 'k5e3',
-                's3_i1': 'k3e3',
-                's3_i2': 'k5e3',
-                's4_depth': 2,
-                's4_i0': 'k3e6',
-                's4_i1': 'k5e3',
-                's5_depth': 4,
-                's5_i0': 'k7e6',
-                's5_i1': 'k3e6',
-                's5_i2': 'k3e6',
-                's5_i3': 'k7e3',
-                's6_depth': 4,
-                's6_i0': 'k7e6',
-                's6_i1': 'k7e6',
-                's6_i2': 'k7e3',
-                's6_i3': 'k7e3',
-                's7_depth': 1,
-                's7_i0': 'k7e6'
-            }
-
-        elif name == 'acenas-m2':
-            arch = {
-                's2_depth': 1,
-                's2_i0': 'k5e3',
-                's3_depth': 3,
-                's3_i0': 'k3e6',
-                's3_i1': 'k3e3',
-                's3_i2': 'k5e3',
-                's4_depth': 2,
-                's4_i0': 'k7e6',
-                's4_i1': 'k5e6',
-                's5_depth': 4,
-                's5_i0': 'k5e6',
-                's5_i1': 'k5e3',
-                's5_i2': 'k5e6',
-                's5_i3': 'k3e6',
-                's6_depth': 4,
-                's6_i0': 'k7e6',
-                's6_i1': 'k5e6',
-                's6_i2': 'k5e3',
-                's6_i3': 'k5e6',
-                's7_depth': 1,
-                's7_i0': 'k7e6'
-            }
-
-        elif name == 'acenas-m3':
-            arch = {
-                's2_depth': 2,
-                's2_i0': 'k3e3',
-                's2_i1': 'k3e6',
-                's3_depth': 2,
-                's3_i0': 'k5e3',
-                's3_i1': 'k3e3',
-                's4_depth': 3,
-                's4_i0': 'k5e6',
-                's4_i1': 'k7e6',
-                's4_i2': 'k3e6',
-                's5_depth': 4,
-                's5_i0': 'k7e6',
-                's5_i1': 'k7e3',
-                's5_i2': 'k7e3',
-                's5_i3': 'k5e3',
-                's6_depth': 4,
-                's6_i0': 'k7e6',
-                's6_i1': 'k7e3',
-                's6_i2': 'k7e6',
-                's6_i3': 'k3e3',
-                's7_depth': 1,
-                's7_i0': 'k5e6'
-            }
-
-        elif name == 'proxyless-cpu':
-            arch = {
-                's2_depth': 4,
-                's2_i0': 'k3e6',
-                's2_i1': 'k3e3',
-                's2_i2': 'k3e3',
-                's2_i3': 'k3e3',
-                's3_depth': 4,
-                's3_i0': 'k3e6',
-                's3_i1': 'k3e3',
-                's3_i2': 'k3e3',
-                's3_i3': 'k5e3',
-                's4_depth': 2,
-                's4_i0': 'k3e6',
-                's4_i1': 'k3e3',
-                's5_depth': 4,
-                's5_i0': 'k5e6',
-                's5_i1': 'k3e3',
-                's5_i2': 'k3e3',
-                's5_i3': 'k3e3',
-                's6_depth': 4,
-                's6_i0': 'k5e6',
-                's6_i1': 'k5e3',
-                's6_i2': 'k5e3',
-                's6_i3': 'k3e3',
-                's7_depth': 1,
-                's7_i0': 'k5e6'
-            }
-
-            # This architecture overrides the default base widths.
-            init_kwargs['base_widths'] = [40, 24, 32, 48, 88, 104, 216, 360, 1432]
-
-        elif name == 'proxyless-gpu':
-            arch = {
-                's2_depth': 1,
-                's2_i0': 'k5e3',
-                's3_depth': 2,
-                's3_i0': 'k7e3',
-                's3_i1': 'k3e3',
-                's4_depth': 2,
-                's4_i0': 'k7e6',
-                's4_i1': 'k5e3',
-                's5_depth': 3,
-                's5_i0': 'k5e6',
-                's5_i1': 'k3e3',
-                's5_i2': 'k5e3',
-                's6_depth': 4,
-                's6_i0': 'k7e6',
-                's6_i1': 'k7e6',
-                's6_i2': 'k7e6',
-                's6_i3': 'k5e6',
-                's7_depth': 1,
-                's7_i0': 'k7e6'
-            }
-
-            # This architecture overrides the default base widths.
-            init_kwargs['base_widths'] = [40, 24, 32, 56, 112, 128, 256, 432, 1728]
-
-        elif name == 'proxyless-mobile':
-            arch = {
-                's2_depth': 2,
-                's2_i0': 'k5e3',
-                's2_i1': 'k3e3',
-                's3_depth': 4,
-                's3_i0': 'k7e3',
-                's3_i1': 'k3e3',
-                's3_i2': 'k5e3',
-                's3_i3': 'k5e3',
-                's4_depth': 4,
-                's4_i0': 'k7e6',
-                's4_i1': 'k5e3',
-                's4_i2': 'k5e3',
-                's4_i3': 'k5e3',
-                's5_depth': 4,
-                's5_i0': 'k5e6',
-                's5_i1': 'k5e3',
-                's5_i2': 'k5e3',
-                's5_i3': 'k5e3',
-                's6_depth': 4,
-                's6_i0': 'k7e6',
-                's6_i1': 'k7e6',
-                's6_i2': 'k7e3',
-                's6_i3': 'k7e3',
-                's7_depth': 1,
-                's7_i0': 'k7e6'
-            }
-
-        else:
-            raise ValueError(f'Unsupported architecture with name: {name}')
-
-        model_factory = cls.fixed_arch(arch)
-        model = model_factory(**init_kwargs)
-
-        if pretrained:
-            weight_file = load_pretrained_weight(name, download=download, progress=progress)
-            # NOTE(review): ``torch.load`` is called without ``map_location``;
-            # confirm the published checkpoints can be loaded on CPU-only hosts.
-            pretrained_weights = torch.load(weight_file)
-            model.load_state_dict(pretrained_weights)
-
-        return model
-
-
-def reset_parameters(model, model_init='he_fout', init_div_groups=False,
-                     bn_momentum=0.1, bn_eps=1e-5):
-    """
-    Re-initialize every submodule of ``model`` in place.
-
-    - ``nn.Conv2d``: weights drawn from a zero-mean normal with std ``sqrt(2 / n)``, where ``n`` is
-      the kernel area times the output channels (``he_fout``) or the input channels (``he_fin``),
-      optionally divided by the number of groups. Any other ``model_init`` raises ``NotImplementedError``.
-    - ``nn.BatchNorm2d``: weight set to 1, bias to 0, momentum and eps overwritten.
-    - ``nn.Linear``: weights drawn from a normal with std 0.01, bias (if any) set to 0.
-    - ``nn.BatchNorm1d``: weight set to 1, bias to 0.
-    """
-    for module in model.modules():
-        if isinstance(module, nn.Conv2d):
-            if model_init == 'he_fout':
-                channels = module.out_channels
-            elif model_init == 'he_fin':
-                channels = module.in_channels
-            else:
-                raise NotImplementedError
-            fan = module.kernel_size[0] * module.kernel_size[1] * channels
-            if init_div_groups:
-                fan /= module.groups
-            module.weight.data.normal_(0, math.sqrt(2. / fan))
-            continue
-
-        if isinstance(module, nn.BatchNorm2d):
-            module.weight.data.fill_(1)
-            module.bias.data.zero_()
-            module.momentum = bn_momentum
-            module.eps = bn_eps
-            continue
-
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(0, 0.01)
-            if module.bias is not None:
-                module.bias.data.zero_()
-            continue
-
-        if isinstance(module, nn.BatchNorm1d):
-            # Momentum and eps are left as they are for 1D batch norm.
-            module.weight.data.fill_(1)
-            module.bias.data.zero_()
+from nni.nas.hub.pytorch.proxylessnas import *
--- a/nni/retiarii/hub/pytorch/shufflenet.py
+++ b/nni/retiarii/hub/pytorch/shufflenet.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-from typing import cast
+# pylint: disable=wildcard-import,unused-wildcard-import

-import torch
-import nni.retiarii.nn.pytorch as nn
-from nni.retiarii import model_wrapper
-
-from .utils.fixed import FixedFactory
-from .utils.pretrained import load_pretrained_weight
-
-
-class ShuffleNetBlock(nn.Module):
-    """
-    Describe the basic building block of shuffle net, as described in
-    `ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices <https://arxiv.org/pdf/1707.01083.pdf>`__.
-
-    When stride = 1, the input is split in two by a channel shuffle and the main branch works on ``in_channels // 2`` channels.
-    When stride = 2, both the main branch and the projection branch work on all ``in_channels`` channels.
-    The outputs of the two branches are concatenated along dimension 1.
-
-    ``sequence`` is a string of ``"p"`` (point-wise conv) and ``"d"`` (depth-wise conv) tokens describing the main branch;
-    its last token must be ``"p"``.
-    """
-
-    def __init__(self, in_channels: int, out_channels: int, mid_channels: nn.MaybeChoice[int], *,
-                 kernel_size: int, stride: int, sequence: str = "pdp", affine: bool = True):
-        super().__init__()
-        assert stride in [1, 2]
-        assert kernel_size in [3, 5, 7]
-        # Number of channels fed into each branch.
-        self.channels = in_channels // 2 if stride == 1 else in_channels
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.mid_channels = mid_channels
-        self.kernel_size = kernel_size
-        self.stride = stride
-        self.pad = kernel_size // 2
-        # The main branch produces whatever is left of ``out_channels`` after the other branch's ``self.channels``.
-        self.oup_main = out_channels - self.channels
-        self.affine = affine
-        assert self.oup_main > 0
-
-        self.branch_main = nn.Sequential(*self._decode_point_depth_conv(sequence))
-
-        if stride == 2:
-            self.branch_proj = nn.Sequential(
-                # dw
-                nn.Conv2d(self.channels, self.channels, kernel_size, stride, self.pad,
-                          groups=self.channels, bias=False),
-                nn.BatchNorm2d(self.channels, affine=affine),
-                # pw-linear
-                nn.Conv2d(self.channels, self.channels, 1, 1, 0, bias=False),
-                nn.BatchNorm2d(self.channels, affine=affine),
-                nn.ReLU(inplace=True)
-            )
-        else:
-            # empty block to be compatible with torchscript
-            self.branch_proj = nn.Sequential()
-
-    def forward(self, x):
-        """Concatenate the projection (or pass-through) half with the output of the main branch along dimension 1."""
-        if self.stride == 2:
-            # Both branches receive the whole input.
-            x_proj, x = self.branch_proj(x), x
-        else:
-            # One half is passed through untouched, the other half goes into the main branch.
-            x_proj, x = self._channel_shuffle(x)
-        return torch.cat((x_proj, self.branch_main(x)), 1)
-
-    def _decode_point_depth_conv(self, sequence):
-        """
-        Translate ``sequence`` into the list of layers of the main branch.
-
-        ``"d"`` adds a depth-wise conv and a batch norm; ``"p"`` adds a 1x1 conv, a batch norm and a ReLU.
-        The last conv outputs ``self.oup_main`` channels; the first point-wise conv (when it is not the last token)
-        outputs ``self.mid_channels``. Only the first depth-wise conv uses ``self.stride``.
-        """
-        result = []
-        first_depth = first_point = True
-        # ``pc`` is the input width of the current conv, ``c`` its output width.
-        pc: int = self.channels
-        c: int = self.channels
-        for i, token in enumerate(sequence):
-            # compute output channels of this conv
-            if i + 1 == len(sequence):
-                assert token == "p", "Last conv must be point-wise conv."
-                c = self.oup_main
-            elif token == "p" and first_point:
-                c = cast(int, self.mid_channels)
-            if token == "d":
-                # depth-wise conv
-                if isinstance(pc, int) and isinstance(c, int):
-                    # check can only be done for static channels
-                    assert pc == c, "Depth-wise conv must not change channels."
-                result.append(nn.Conv2d(pc, c, self.kernel_size, self.stride if first_depth else 1, self.pad,
-                                        groups=c, bias=False))
-                result.append(nn.BatchNorm2d(c, affine=self.affine))
-                first_depth = False
-            elif token == "p":
-                # point-wise conv
-                result.append(nn.Conv2d(pc, c, 1, 1, 0, bias=False))
-                result.append(nn.BatchNorm2d(c, affine=self.affine))
-                result.append(nn.ReLU(inplace=True))
-                first_point = False
-            else:
-                raise ValueError("Conv sequence must be d and p.")
-            # The output width of this conv is the input width of the next one.
-            pc = c
-        return result
-
-    def _channel_shuffle(self, x):
-        """Split a 4-dimensional input into two tensors, each with half of the channels, via reshape and permute."""
-        bs, num_channels, height, width = x.size()
-        # NOTE: this line is commented for torchscript
-        # assert (num_channels % 4 == 0)
-        x = x.reshape(bs * num_channels // 2, 2, height * width)
-        x = x.permute(1, 0, 2)
-        x = x.reshape(2, -1, num_channels // 2, height, width)
-        return x[0], x[1]
-
-
-class ShuffleXceptionBlock(ShuffleNetBlock):
-    """
-    A :class:`ShuffleNetBlock` fixed to kernel size 3 and the conv sequence ``"dpdpdp"``.
-
-    This is the ``choice_x`` version of shuffle net block, described in
-    `Single Path One-shot <https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123610528.pdf>`__.
-    """
-
-    def __init__(self, in_channels: int, out_channels: int, mid_channels: nn.MaybeChoice[int], *, stride: int, affine: bool = True):
-        # The only settings that distinguish this block from its parent.
-        fixed_options = dict(kernel_size=3, sequence="dpdpdp")
-        super().__init__(in_channels, out_channels, mid_channels, stride=stride, affine=affine, **fixed_options)
-
-
-@model_wrapper
-class ShuffleNetSpace(nn.Module):
-    """
-    The search space proposed in `Single Path One-shot <https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123610528.pdf>`__.
-
-    The basic building block design is inspired by a state-of-the-art manually-designed network --
-    `ShuffleNetV2 <https://openaccess.thecvf.com/content_ECCV_2018/html/Ningning_Light-weight_CNN_Architecture_ECCV_2018_paper.html>`__.
-    There are 20 choice blocks in total. Each choice block has 4 candidates, namely ``choice_3``, ``choice_5``,
-    ``choice_7`` and ``choice_x`` respectively (keyed ``k3``, ``k5``, ``k7`` and ``xcep`` in this implementation).
-    They differ in kernel sizes and the number of depthwise convolutions.
-    The size of the search space is :math:`4^{20}`.
-
-    Parameters
-    ----------
-    num_labels : int
-        Number of classes for the classification head. Default: 1000.
-    channel_search : bool
-        If true, for each building block, the number of ``mid_channels``
-        (output channels of the first 1x1 conv in each building block) varies from 0.2x to 1.6x (quantized to multiple of 0.2).
-        Here, "k-x" means k times the number of default channels.
-        Otherwise, 1.0x is used by default. Default: false.
-    affine : bool
-        Apply affine to all batch norm. Default: true.
-    """
-
-    def __init__(self,
-                 num_labels: int = 1000,
-                 channel_search: bool = False,
-                 affine: bool = True):
-        super().__init__()
-
-        self.num_labels = num_labels
-        self.channel_search = channel_search
-        self.affine = affine
-
-        # the block number in each stage. 4 stages in total. 20 blocks in total.
-        self.stage_repeats = [4, 4, 8, 4]
-
-        # output channels for all stages, including the very first layer and the very last layer
-        self.stage_out_channels = [-1, 16, 64, 160, 320, 640, 1024]
-
-        # building first layer
-        out_channels = self.stage_out_channels[1]
-        self.first_conv = nn.Sequential(
-            nn.Conv2d(3, out_channels, 3, 2, 1, bias=False),
-            nn.BatchNorm2d(out_channels),
-            nn.ReLU(inplace=True),
-        )
-
-        feature_blocks = []
-
-        global_block_idx = 0
-        for stage_idx, num_repeat in enumerate(self.stage_repeats):
-            for block_idx in range(num_repeat):
-                # count global index to give names to choices
-                global_block_idx += 1
-
-                # get ready for input and output
-                in_channels = out_channels
-                out_channels = self.stage_out_channels[stage_idx + 2]
-                # Only the first block of every stage uses stride 2.
-                stride = 2 if block_idx == 0 else 1
-
-                # mid channels can be searched
-                base_mid_channels = out_channels // 2
-                if self.channel_search:
-                    # Candidates are 0.2x, 0.4x, ..., 1.6x of the base width, truncated to int.
-                    k_choice_list = [int(base_mid_channels * (.2 * k)) for k in range(1, 9)]
-                    mid_channels = nn.ValueChoice(k_choice_list, label=f'channel_{global_block_idx}')
-                else:
-                    mid_channels = int(base_mid_channels)
-
-                mid_channels = cast(nn.MaybeChoice[int], mid_channels)
-
-                # All four candidates of a block share the same ``mid_channels`` object.
-                choice_block = nn.LayerChoice(dict(
-                    k3=ShuffleNetBlock(in_channels, out_channels, mid_channels=mid_channels, kernel_size=3, stride=stride, affine=affine),
-                    k5=ShuffleNetBlock(in_channels, out_channels, mid_channels=mid_channels, kernel_size=5, stride=stride, affine=affine),
-                    k7=ShuffleNetBlock(in_channels, out_channels, mid_channels=mid_channels, kernel_size=7, stride=stride, affine=affine),
-                    xcep=ShuffleXceptionBlock(in_channels, out_channels, mid_channels=mid_channels, stride=stride, affine=affine)
-                ), label=f'layer_{global_block_idx}')
-                feature_blocks.append(choice_block)
-
-        self.features = nn.Sequential(*feature_blocks)
-
-        # final layers
-        last_conv_channels = self.stage_out_channels[-1]
-        self.conv_last = nn.Sequential(
-            nn.Conv2d(out_channels, last_conv_channels, 1, 1, 0, bias=False),
-            nn.BatchNorm2d(last_conv_channels, affine=affine),
-            nn.ReLU(inplace=True),
-        )
-        self.globalpool = nn.AdaptiveAvgPool2d((1, 1))
-        self.dropout = nn.Dropout(0.1)
-        self.classifier = nn.Sequential(
-            nn.Linear(last_conv_channels, num_labels, bias=False),
-        )
-
-        self._initialize_weights()
-
-    def forward(self, x):
-        """Apply first conv, choice blocks, last conv, global pooling, dropout and classifier in that order."""
-        x = self.first_conv(x)
-        x = self.features(x)
-        x = self.conv_last(x)
-
-        x = self.globalpool(x)
-
-        x = self.dropout(x)
-        # Flatten to (-1, number of channels of the last conv) before the linear classifier.
-        x = x.contiguous().view(-1, self.stage_out_channels[-1])
-        x = self.classifier(x)
-        return x
-
-    def _initialize_weights(self):
-        """Initialize conv, batch norm and linear submodules in place; the scheme depends on module type and name."""
-        for name, m in self.named_modules():
-            if isinstance(m, nn.Conv2d):
-                # Convs whose qualified name contains "first" get a fixed std; the others scale with ``weight.shape[1]``.
-                if 'first' in name:
-                    torch.nn.init.normal_(m.weight, 0, 0.01)
-                else:
-                    torch.nn.init.normal_(m.weight, 0, 1.0 / m.weight.shape[1])
-                if m.bias is not None:
-                    torch.nn.init.constant_(m.bias, 0)
-            elif isinstance(m, nn.BatchNorm2d):
-                if m.weight is not None:
-                    torch.nn.init.constant_(m.weight, 1)
-                if m.bias is not None:
-                    torch.nn.init.constant_(m.bias, 0.0001)
-                if m.running_mean is not None:
-                    torch.nn.init.constant_(m.running_mean, 0)
-            elif isinstance(m, nn.BatchNorm1d):
-                if m.weight is not None:
-                    torch.nn.init.constant_(m.weight, 1)
-                if m.bias is not None:
-                    torch.nn.init.constant_(m.bias, 0.0001)
-                if m.running_mean is not None:
-                    torch.nn.init.constant_(m.running_mean, 0)
-            elif isinstance(m, nn.Linear):
-                torch.nn.init.normal_(m.weight, 0, 0.01)
-                if m.bias is not None:
-                    torch.nn.init.constant_(m.bias, 0)
-
-    @classmethod
-    def fixed_arch(cls, arch: dict) -> FixedFactory:
-        """Wrap this class and ``arch`` into a :class:`FixedFactory`."""
-        return FixedFactory(cls, arch)
-
-    @classmethod
-    def load_searched_model(
-        cls, name: str,
-        pretrained: bool = False, download: bool = False, progress: bool = True
-    ) -> nn.Module:
-        """
-        Build a predefined searched architecture. Only ``name='spos'`` is supported; other names raise ``ValueError``.
-
-        When ``pretrained`` is true, a weight file is obtained with ``load_pretrained_weight``
-        (``download`` and ``progress`` are forwarded to it) and loaded into the model.
-        """
-        if name == 'spos':
-            # NOTE: Need BGR tensor, with no normalization
-            # https://github.com/ultmaster/spacehub-conversion/blob/371a4fd6646b4e11eda3f61187f7c9a1d484b1ca/cutils.py#L63
-            arch = {
-                'layer_1': 'k7',
-                'layer_2': 'k5',
-                'layer_3': 'k3',
-                'layer_4': 'k5',
-                'layer_5': 'k7',
-                'layer_6': 'k3',
-                'layer_7': 'k7',
-                'layer_8': 'k3',
-                'layer_9': 'k7',
-                'layer_10': 'k3',
-                'layer_11': 'k7',
-                'layer_12': 'xcep',
-                'layer_13': 'k3',
-                'layer_14': 'k3',
-                'layer_15': 'k3',
-                'layer_16': 'k3',
-                'layer_17': 'xcep',
-                'layer_18': 'k7',
-                'layer_19': 'xcep',
-                'layer_20': 'xcep'
-            }
-
-        else:
-            raise ValueError(f'Unsupported architecture with name: {name}')
-
-        model_factory = cls.fixed_arch(arch)
-        model = model_factory()
-
-        if pretrained:
-            weight_file = load_pretrained_weight(name, download=download, progress=progress)
-            # NOTE(review): ``torch.load`` is called without ``map_location``;
-            # confirm the published checkpoint can be loaded on CPU-only hosts.
-            pretrained_weights = torch.load(weight_file)
-            model.load_state_dict(pretrained_weights)
-
-        return model
+from nni.nas.hub.pytorch.shufflenet import *
--- a/nni/retiarii/hub/pytorch/utils/fixed.py
+++ b/nni/retiarii/hub/pytorch/utils/fixed.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-"""This file should be merged to nni/retiarii/fixed.py"""
+# pylint: disable=wildcard-import,unused-wildcard-import

-from typing import Type
-
-from nni.retiarii.utils import ContextStack
-
-
-class FixedFactory:
-    """Pair a model space class with a fixed architecture, so that calling the pair creates a fixed model.
-
-    Calling the factory instantiates the class inside a ``ContextStack('fixed', arch)`` context,
-    forwarding all positional and keyword arguments to the class.
-
-    Examples
-    --------
-    >>> factory = FixedFactory(ModelSpaceClass, {"choice1": 3})
-    >>> model = factory(channels=16, classes=10)
-    """
-
-    # TODO: mutations on ``init_args`` and ``init_kwargs`` themselves are not supported.
-
-    def __init__(self, cls: Type, arch: dict):
-        self.cls, self.arch = cls, arch
-
-    def __call__(self, *init_args, **init_kwargs):
-        model_cls, fixed_arch = self.cls, self.arch
-        with ContextStack('fixed', fixed_arch):
-            return model_cls(*init_args, **init_kwargs)
-
-    def __repr__(self):
-        return f'FixedFactory(class={self.cls}, arch={self.arch})'
+from nni.nas.hub.pytorch.utils.fixed import *
--- a/nni/retiarii/hub/pytorch/utils/pretrained.py
+++ b/nni/retiarii/hub/pytorch/utils/pretrained.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-"""
-Weights available in this file are processed with scripts in https://github.com/ultmaster/spacehub-conversion,
-and uploaded with :func:`nni.common.blob_utils.upload_file`.
-"""
+# pylint: disable=wildcard-import,unused-wildcard-import

-import os
-
-from nni.common.blob_utils import NNI_BLOB, nni_cache_home, load_or_download_file
-
-
-# Maps the name of a searched model to the URL of its pretrained weight file.
-# ``load_pretrained_weight`` uses the last path component of the URL as the local file name.
-PRETRAINED_WEIGHT_URLS = {
-    # proxylessnas
-    'acenas-m1': f'{NNI_BLOB}/nashub/acenas-m1-e215f1b8.pth',
-    'acenas-m2': f'{NNI_BLOB}/nashub/acenas-m2-a8ee9e8f.pth',
-    'acenas-m3': f'{NNI_BLOB}/nashub/acenas-m3-66a5ed7b.pth',
-    'proxyless-cpu': f'{NNI_BLOB}/nashub/proxyless-cpu-2df03430.pth',
-    'proxyless-gpu': f'{NNI_BLOB}/nashub/proxyless-gpu-dbe6dd15.pth',
-    'proxyless-mobile': f'{NNI_BLOB}/nashub/proxyless-mobile-8668a978.pth',
-
-    # mobilenetv3
-    'mobilenetv3-large-100': f'{NNI_BLOB}/nashub/mobilenetv3-large-100-420e040a.pth',
-    'mobilenetv3-small-050': f'{NNI_BLOB}/nashub/mobilenetv3-small-050-05cb7a80.pth',
-    'mobilenetv3-small-075': f'{NNI_BLOB}/nashub/mobilenetv3-small-075-c87d8acb.pth',
-    'mobilenetv3-small-100': f'{NNI_BLOB}/nashub/mobilenetv3-small-100-8332faac.pth',
-    'cream-014': f'{NNI_BLOB}/nashub/cream-014-060aea24.pth',
-    'cream-043': f'{NNI_BLOB}/nashub/cream-043-bec949e1.pth',
-    'cream-114': f'{NNI_BLOB}/nashub/cream-114-fc272590.pth',
-    'cream-287': f'{NNI_BLOB}/nashub/cream-287-a0fcba33.pth',
-    'cream-481': f'{NNI_BLOB}/nashub/cream-481-d85779b6.pth',
-    'cream-604': f'{NNI_BLOB}/nashub/cream-604-9ee425f7.pth',
-
-    # nasnet
-    'darts-v2': f'{NNI_BLOB}/nashub/darts-v2-5465b0d2.pth',
-
-    # spos
-    'spos': f'{NNI_BLOB}/nashub/spos-0b17f6fc.pth',
-
-    # autoformer
-    'autoformer-tiny': f'{NNI_BLOB}/nashub/autoformer-searched-tiny-1e90ebc1.pth',
-    'autoformer-small': f'{NNI_BLOB}/nashub/autoformer-searched-small-4bc5d4e5.pth',
-    'autoformer-base': f'{NNI_BLOB}/nashub/autoformer-searched-base-c417590a.pth'
-}
-
-
-def load_pretrained_weight(name: str, **kwargs) -> str:
-    if name not in PRETRAINED_WEIGHT_URLS:
-        raise ValueError(f'"{name}" do not have a valid pretrained weight file.')
-    url = PRETRAINED_WEIGHT_URLS[name]
-
-    local_path = os.path.join(nni_cache_home(), 'nashub', url.split('/')[-1])
-    load_or_download_file(local_path, url, **kwargs)
-    return local_path
+from nni.nas.hub.pytorch.utils.pretrained import *
--- a/nni/retiarii/integration.py
+++ b/nni/retiarii/integration.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-import logging
-import os
-from typing import Any, Callable, Optional, Dict, List, Tuple
+# pylint: disable=wildcard-import,unused-wildcard-import

-import nni
-from nni.common.serializer import PayloadTooLarge
-from nni.common.version import version_dump
-from nni.runtime.msg_dispatcher_base import MsgDispatcherBase
-from nni.runtime.tuner_command_channel import CommandType
-from nni.utils import MetricType
-
-from .graph import MetricData
-from .integration_api import register_advisor
-
-_logger = logging.getLogger(__name__)
-
-
-class RetiariiAdvisor(MsgDispatcherBase):
-    """
-    The class is to connect Retiarii components to NNI backend.
-    It can be considered as a Python wrapper of NNI manager.
-
-    It will function as the main thread when running a Retiarii experiment through NNI.
-    Strategy will be launched as its thread, who will call APIs in execution engine. Execution
-    engine will then find the advisor singleton and send payloads to advisor.
-
-    When metrics are sent back, advisor will first receive the payloads, who will call the callback
-    function (that is a member function in graph listener).
-
-    The conversion advisor provides are minimum. It is only a send/receive module, and execution engine
-    needs to handle all the rest.
-
-    Attributes
-    ----------
-    send_trial_callback
-
-    request_trial_jobs_callback
-
-    trial_end_callback
-
-    intermediate_metric_callback
-
-    final_metric_callback
-    """
-
-    def __init__(self, url: str):
-        super().__init__(url)
-        register_advisor(self)  # register the current advisor as the "global only" advisor
-        self.search_space = None
-
-        self.send_trial_callback: Optional[Callable[[dict], None]] = None
-        self.request_trial_jobs_callback: Optional[Callable[[int], None]] = None
-        self.trial_end_callback: Optional[Callable[[int, bool], None]] = None
-        self.intermediate_metric_callback: Optional[Callable[[int, MetricData], None]] = None
-        self.final_metric_callback: Optional[Callable[[int, MetricData], None]] = None
-
-        self.parameters_count = 0
-
-        # Sometimes messages arrive first before the callbacks get registered.
-        # Or in case that we allow engine to be absent during the experiment.
-        # Here we need to store the messages and invoke them later.
-        self.call_queue: List[Tuple[str, list]] = []
-
-    def register_callbacks(self, callbacks: Dict[str, Callable[..., None]]):
-        """
-        Register callbacks for NNI backend.
-
-        Parameters
-        ----------
-        callbacks
-            A dictionary of callbacks.
-            The key is the name of the callback. The value is the callback function.
-        """
-        self.send_trial_callback = callbacks.get('send_trial')
-        self.request_trial_jobs_callback = callbacks.get('request_trial_jobs')
-        self.trial_end_callback = callbacks.get('trial_end')
-        self.intermediate_metric_callback = callbacks.get('intermediate_metric')
-        self.final_metric_callback = callbacks.get('final_metric')
-
-        self.process_queued_callbacks()
-
-    def process_queued_callbacks(self) -> None:
-        """
-        Process callbacks in queue.
-        Consume the messages that haven't been handled previously.
-        """
-        processed_idx = []
-        for queue_idx, (call_name, call_args) in enumerate(self.call_queue):
-            if call_name == 'send_trial' and self.send_trial_callback is not None:
-                self.send_trial_callback(*call_args)  # pylint: disable=not-callable
-                processed_idx.append(queue_idx)
-            if call_name == 'request_trial_jobs' and self.request_trial_jobs_callback is not None:
-                self.request_trial_jobs_callback(*call_args)  # pylint: disable=not-callable
-                processed_idx.append(queue_idx)
-            if call_name == 'trial_end' and self.trial_end_callback is not None:
-                self.trial_end_callback(*call_args)  # pylint: disable=not-callable
-                processed_idx.append(queue_idx)
-            if call_name == 'intermediate_metric' and self.intermediate_metric_callback is not None:
-                self.intermediate_metric_callback(*call_args)  # pylint: disable=not-callable
-                processed_idx.append(queue_idx)
-            if call_name == 'final_metric' and self.final_metric_callback is not None:
-                self.final_metric_callback(*call_args)  # pylint: disable=not-callable
-                processed_idx.append(queue_idx)
-
-        # Remove processed messages
-        for idx in reversed(processed_idx):
-            self.call_queue.pop(idx)
-
-    def invoke_callback(self, name: str, *args: Any) -> None:
-        """
-        Invoke callback.
-        """
-        self.call_queue.append((name, list(args)))
-        self.process_queued_callbacks()
-
-    def handle_initialize(self, data):
-        """callback for initializing the advisor
-        Parameters
-        ----------
-        data: dict
-            search space
-        """
-        self.handle_update_search_space(data)
-        self.send(CommandType.Initialized, '')
-
-    def _validate_placement_constraint(self, placement_constraint):
-        if placement_constraint is None:
-            raise ValueError('placement_constraint is None')
-        if not 'type' in placement_constraint:
-            raise ValueError('placement_constraint must have `type`')
-        if not 'gpus' in placement_constraint:
-            raise ValueError('placement_constraint must have `gpus`')
-        if placement_constraint['type'] not in ['None', 'GPUNumber', 'Device']:
-            raise ValueError('placement_constraint.type must be either `None`,. `GPUNumber` or `Device`')
-        if placement_constraint['type'] == 'None' and len(placement_constraint['gpus']) > 0:
-            raise ValueError('placement_constraint.gpus must be an empty list when type == None')
-        if placement_constraint['type'] == 'GPUNumber':
-            if len(placement_constraint['gpus']) != 1:
-                raise ValueError('placement_constraint.gpus currently only support one host when type == GPUNumber')
-            for e in placement_constraint['gpus']:
-                if not isinstance(e, int):
-                    raise ValueError('placement_constraint.gpus must be a list of number when type == GPUNumber')
-        if placement_constraint['type'] == 'Device':
-            for e in placement_constraint['gpus']:
-                if not isinstance(e, tuple):
-                    raise ValueError('placement_constraint.gpus must be a list of tuple when type == Device')
-                if not (len(e) == 2 and isinstance(e[0], str) and isinstance(e[1], int)):
-                    raise ValueError('placement_constraint.gpus`s tuple must be (str, int)')
-
-    def send_trial(self, parameters, placement_constraint=None):
-        """
-        Send parameters to NNI.
-
-        Parameters
-        ----------
-        parameters : Any
-            Any payload.
-
-        Returns
-        -------
-        int
-            Parameter ID that is assigned to this parameter,
-            which will be used for identification in future.
-        """
-        self.parameters_count += 1
-        if placement_constraint is None:
-            placement_constraint = {
-                'type': 'None',
-                'gpus': []
-            }
-        self._validate_placement_constraint(placement_constraint)
-        new_trial = {
-            'parameter_id': self.parameters_count,
-            'parameters': parameters,
-            'parameter_source': 'algorithm',
-            'placement_constraint': placement_constraint,
-            'version_info': version_dump()
-        }
-        _logger.debug('New trial sent: %s', new_trial)
-
-        try:
-            send_payload = nni.dump(new_trial, pickle_size_limit=int(os.getenv('PICKLE_SIZE_LIMIT', 64 * 1024)))
-        except PayloadTooLarge:
-            raise ValueError(
-                'Serialization failed when trying to dump the model because payload too large (larger than 64 KB). '
-                'This is usually caused by pickling large objects (like datasets) by mistake. '
-                'See the full error traceback for details and https://nni.readthedocs.io/en/stable/NAS/Serialization.html '
-                'for how to resolve such issue. '
-            )
-
-        # trial parameters can be super large, disable pickle size limit here
-        # nevertheless, there could still be blocked by pipe / nni-manager
-        self.send(CommandType.NewTrialJob, send_payload)
-
-        self.invoke_callback('send_trial', parameters)
-        return self.parameters_count
-
-    def mark_experiment_as_ending(self):
-        self.send(CommandType.NoMoreTrialJobs, '')
-
-    def handle_request_trial_jobs(self, num_trials):
-        _logger.debug('Request trial jobs: %s', num_trials)
-        self.invoke_callback('request_trial_jobs', num_trials)
-
-    def handle_update_search_space(self, data):
-        _logger.debug('Received search space: %s', data)
-        self.search_space = data
-
-    def handle_trial_end(self, data):
-        _logger.debug('Trial end: %s', data)
-        self.invoke_callback('trial_end', nni.load(data['hyper_params'])['parameter_id'], data['event'] == 'SUCCEEDED')
-
-    def handle_report_metric_data(self, data):
-        _logger.debug('Metric reported: %s', data)
-        if data['type'] == MetricType.REQUEST_PARAMETER:
-            raise ValueError('Request parameter not supported')
-        elif data['type'] == MetricType.PERIODICAL:
-            self.invoke_callback('intermediate_metric', data['parameter_id'], self._process_value(data['value']))
-        elif data['type'] == MetricType.FINAL:
-            self.invoke_callback('final_metric', data['parameter_id'], self._process_value(data['value']))
-
-    @staticmethod
-    def _process_value(value) -> Any:  # hopefully a float
-        value = nni.load(value)
-        if isinstance(value, dict):
-            if 'default' in value:
-                return value['default']
-            else:
-                return value
-        return value
+from nni.nas.execution.common.integration import *
--- a/nni/retiarii/integration_api.py
+++ b/nni/retiarii/integration_api.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-import warnings
-from typing import NewType, Any
+# pylint: disable=wildcard-import,unused-wildcard-import

-import nni
-from nni.common.version import version_check
-
-# NOTE: this is only for passing flake8, we cannot import RetiariiAdvisor
-# because it would induce cycled import
-RetiariiAdvisor = NewType('RetiariiAdvisor', Any)
-
-_advisor = None  # type is RetiariiAdvisor
-
-
-def get_advisor():
-    # return type: RetiariiAdvisor
-    global _advisor
-    assert _advisor is not None
-    return _advisor
-
-
-def register_advisor(advisor):
-    # type of advisor: RetiariiAdvisor
-    global _advisor
-    if _advisor is not None:
-        warnings.warn('Advisor is already set.'
-                      'You should avoid instantiating RetiariiExperiment twice in one proces.'
-                      'If you are running in a Jupyter notebook, please restart the kernel.')
-    _advisor = advisor
-
-
-def send_trial(parameters: dict, placement_constraint=None) -> int:
-    """
-    Send a new trial. Executed on tuner end.
-    Return a ID that is the unique identifier for this trial.
-    """
-    return get_advisor().send_trial(parameters, placement_constraint)
-
-def receive_trial_parameters() -> dict:
-    """
-    Received a new trial. Executed on trial end.
-    Reload with our json loads because NNI didn't use Retiarii serializer to load the data.
-    """
-    params = nni.get_next_parameter()
-
-    # version check, optional
-    raw_params = nni.trial._params
-    if raw_params is not None and 'version_info' in raw_params:
-        version_check(raw_params['version_info'])
-    else:
-        warnings.warn('Version check failed because `version_info` is not found.')
-
-    return params
-
-
-def get_experiment_id() -> str:
-    return nni.get_experiment_id()
+from nni.nas.execution.common.integration_api import *
--- a/nni/retiarii/mutator.py
+++ b/nni/retiarii/mutator.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-import warnings
-from typing import (Any, Iterable, List, Optional, Tuple, cast)
+# pylint: disable=wildcard-import,unused-wildcard-import

-from .graph import Model, Mutation, ModelStatus
-
-
-__all__ = ['Sampler', 'Mutator', 'InvalidMutation']
-
-
-Choice = Any
-
-
-class Sampler:
-    """
-    Handles `Mutator.choice()` calls.
-    """
-
-    def choice(self, candidates: List[Choice], mutator: 'Mutator', model: Model, index: int) -> Choice:
-        raise NotImplementedError()
-
-    def mutation_start(self, mutator: 'Mutator', model: Model) -> None:
-        pass
-
-    def mutation_end(self, mutator: 'Mutator', model: Model) -> None:
-        pass
-
-
-class Mutator:
-    """
-    Mutates graphs in model to generate new model.
-    `Mutator` class will be used in two places:
-
-    1. Inherit `Mutator` to implement graph mutation logic.
-    2. Use `Mutator` subclass to implement NAS strategy.
-
-    In scenario 1, the subclass should implement `Mutator.mutate()` interface with `Mutator.choice()`.
-    In scenario 2, strategy should use constructor or `Mutator.bind_sampler()` to initialize subclass,
-    and then use `Mutator.apply()` to mutate model.
-    For certain mutator subclasses, strategy or sampler can use `Mutator.dry_run()` to predict choice candidates.
-    # Method names are open for discussion.
-
-    If mutator has a label, in most cases, it means that this mutator is applied to nodes with this label.
-    """
-
-    def __init__(self, sampler: Optional[Sampler] = None, label: str = cast(str, None)):
-        self.sampler: Optional[Sampler] = sampler
-        if label is None:
-            warnings.warn('Each mutator should have an explicit label. Mutator without label is deprecated.', DeprecationWarning)
-        self.label: str = label
-        self._cur_model: Optional[Model] = None
-        self._cur_choice_idx: Optional[int] = None
-
-    def bind_sampler(self, sampler: Sampler) -> 'Mutator':
-        """
-        Set the sampler which will handle `Mutator.choice` calls.
-        """
-        self.sampler = sampler
-        return self
-
-    def apply(self, model: Model) -> Model:
-        """
-        Apply this mutator on a model.
-        Returns mutated model.
-        The model will be copied before mutation and the original model will not be modified.
-        """
-        assert self.sampler is not None
-        copy = model.fork()
-        self._cur_model = copy
-        self._cur_choice_idx = 0
-        self._cur_samples = []
-        self.sampler.mutation_start(self, copy)
-        self.mutate(copy)
-        self.sampler.mutation_end(self, copy)
-        copy.history.append(Mutation(self, self._cur_samples, model, copy))
-        copy.status = ModelStatus.Frozen
-        self._cur_model = None
-        self._cur_choice_idx = None
-        return copy
-
-    def dry_run(self, model: Model) -> Tuple[List[List[Choice]], Model]:
-        """
-        Dry run mutator on a model to collect choice candidates.
-        If you invoke this method multiple times on same or different models,
-        it may or may not return identical results, depending on how the subclass implements `Mutator.mutate()`.
-        """
-        sampler_backup = self.sampler
-        recorder = _RecorderSampler()
-        self.sampler = recorder
-        new_model = self.apply(model)
-        self.sampler = sampler_backup
-        return recorder.recorded_candidates, new_model
-
-    def mutate(self, model: Model) -> None:
-        """
-        Abstract method to be implemented by subclass.
-        Mutate a model in place.
-        """
-        raise NotImplementedError()
-
-    def choice(self, candidates: Iterable[Choice]) -> Choice:
-        """
-        Ask sampler to make a choice.
-        """
-        assert self.sampler is not None and self._cur_model is not None and self._cur_choice_idx is not None
-        ret = self.sampler.choice(list(candidates), self, self._cur_model, self._cur_choice_idx)
-        self._cur_samples.append(ret)
-        self._cur_choice_idx += 1
-        return ret
-
-
-class _RecorderSampler(Sampler):
-    def __init__(self):
-        self.recorded_candidates: List[List[Choice]] = []
-
-    def choice(self, candidates: List[Choice], *args) -> Choice:
-        self.recorded_candidates.append(candidates)
-        return candidates[0]
-
-
-class InvalidMutation(Exception):
-    pass
+from nni.nas.mutable.mutator import *
--- a/nni/retiarii/nn/pytorch/api.py
+++ b/nni/retiarii/nn/pytorch/api.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-import itertools
-import math
-import operator
-import warnings
-from typing import (Any, Callable, Dict, Generic, Iterable, Iterator, List,
-                    NoReturn, Optional, Sequence, SupportsRound, TypeVar,
-                    Union, cast)
+# pylint: disable=wildcard-import,unused-wildcard-import

-import torch
-import torch.nn as nn
-from nni.common.hpo_utils import ParameterSpec
-from nni.common.serializer import Translatable
-from nni.retiarii.serializer import basic_unit
-from nni.retiarii.utils import (STATE_DICT_PY_MAPPING_PARTIAL, ModelNamespace,
-                                NoContextError)
-
-from .mutation_utils import Mutable, generate_new_label, get_fixed_value
-
-__all__ = [
-    # APIs
-    'LayerChoice',
-    'InputChoice',
-    'ValueChoice',
-    'ModelParameterChoice',
-    'Placeholder',
-
-    # Fixed module
-    'ChosenInputs',
-
-    # Type utils
-    'ReductionType',
-    'MaybeChoice',
-    'ChoiceOf',
-]
-
-
-class LayerChoice(Mutable):
-    """
-    Layer choice selects one of the ``candidates``, then apply it on inputs and return results.
-
-    It allows users to put several candidate operations (e.g., PyTorch modules), one of them is chosen in each explored model.
-
-    *New in v2.2:* Layer choice can be nested.
-
-    Parameters
-    ----------
-    candidates : list of nn.Module or OrderedDict
-        A module list to be selected from.
-    prior : list of float
-        Prior distribution used in random sampling.
-    label : str
-        Identifier of the layer choice.
-
-    Attributes
-    ----------
-    length : int
-        Deprecated. Number of ops to choose from. ``len(layer_choice)`` is recommended.
-    names : list of str
-        Names of candidates.
-    choices : list of Module
-        Deprecated. A list of all candidate modules in the layer choice module.
-        ``list(layer_choice)`` is recommended, which will serve the same purpose.
-
-    Examples
-    --------
-
-    ::
-
-        # import nni.retiarii.nn.pytorch as nn
-        # declared in `__init__` method
-        self.layer = nn.LayerChoice([
-            ops.PoolBN('max', channels, 3, stride, 1),
-            ops.SepConv(channels, channels, 3, stride, 1),
-            nn.Identity()
-        ])
-        # invoked in `forward` method
-        out = self.layer(x)
-
-    Notes
-    -----
-    ``candidates`` can be a list of modules or a ordered dict of named modules, for example,
-
-    .. code-block:: python
-
-        self.op_choice = LayerChoice(OrderedDict([
-            ("conv3x3", nn.Conv2d(3, 16, 128)),
-            ("conv5x5", nn.Conv2d(5, 16, 128)),
-            ("conv7x7", nn.Conv2d(7, 16, 128))
-        ]))
-
-    Elements in layer choice can be modified or deleted. Use ``del self.op_choice["conv5x5"]`` or
-    ``self.op_choice[1] = nn.Conv3d(...)``. Adding more choices is not supported yet.
-    """
-
-    # FIXME: prior is designed but not supported yet
-
-    @classmethod
-    def create_fixed_module(cls, candidates: Union[Dict[str, nn.Module], List[nn.Module]], *,
-                            label: Optional[str] = None, **kwargs):
-        chosen = get_fixed_value(label)
-        if isinstance(candidates, list):
-            result = candidates[int(chosen)]
-        else:
-            result = candidates[chosen]
-
-        # map the named hierarchies to support weight inheritance for python engine
-        if hasattr(result, STATE_DICT_PY_MAPPING_PARTIAL):
-            # handle cases where layer choices are nested
-            # already has a mapping, will merge with it
-            prev_mapping = getattr(result, STATE_DICT_PY_MAPPING_PARTIAL)
-            setattr(result, STATE_DICT_PY_MAPPING_PARTIAL, {k: f'{chosen}.{v}' for k, v in prev_mapping.items()})
-        else:
-            # "result" needs to know where to map itself.
-            # Ideally, we should put a _mapping_ in the module where "result" is located,
-            # but it's impossible to put mapping into parent module here.
-            setattr(result, STATE_DICT_PY_MAPPING_PARTIAL, {'__self__': str(chosen)})
-        return result
-
-    def __init__(self, candidates: Union[Dict[str, nn.Module], List[nn.Module]], *,
-                 prior: Optional[List[float]] = None, label: Optional[str] = None, **kwargs):
-        super(LayerChoice, self).__init__()
-        if 'key' in kwargs:
-            warnings.warn(f'"key" is deprecated. Assuming label.')
-            label = kwargs['key']
-        if 'return_mask' in kwargs:
-            warnings.warn(f'"return_mask" is deprecated. Ignoring...')
-        if 'reduction' in kwargs:
-            warnings.warn(f'"reduction" is deprecated. Ignoring...')
-        self.candidates = candidates
-        self.prior = prior or [1 / len(candidates) for _ in range(len(candidates))]
-        assert abs(sum(self.prior) - 1) < 1e-5, 'Sum of prior distribution is not 1.'
-        self._label = generate_new_label(label)
-
-        self.names = []
-        if isinstance(candidates, dict):
-            for name, module in candidates.items():
-                assert name not in ["length", "reduction", "return_mask", "_key", "key", "names"], \
-                    "Please don't use a reserved name '{}' for your module.".format(name)
-                self.add_module(name, module)
-                self.names.append(name)
-        elif isinstance(candidates, list):
-            for i, module in enumerate(candidates):
-                self.add_module(str(i), module)
-                self.names.append(str(i))
-        else:
-            raise TypeError("Unsupported candidates type: {}".format(type(candidates)))
-        self._first_module = cast(nn.Module, self._modules[self.names[0]])  # to make the dummy forward meaningful
-
-    @property
-    def label(self):
-        return self._label
-
-    def __getitem__(self, idx: Union[int, str]) -> nn.Module:
-        if isinstance(idx, str):
-            return cast(nn.Module, self._modules[idx])
-        return cast(nn.Module, list(self)[idx])
-
-    def __setitem__(self, idx, module):
-        key = idx if isinstance(idx, str) else self.names[idx]
-        return setattr(self, key, module)
-
-    def __delitem__(self, idx):
-        if isinstance(idx, slice):
-            for key in self.names[idx]:
-                delattr(self, key)
-        else:
-            if isinstance(idx, str):
-                key, idx = idx, self.names.index(idx)
-            else:
-                key = self.names[idx]
-            delattr(self, key)
-        del self.names[idx]
-
-    def __len__(self):
-        return len(self.names)
-
-    def __iter__(self):
-        return map(lambda name: self._modules[name], self.names)
-
-    def forward(self, x):
-        """
-        The forward of layer choice is simply running the first candidate module.
-        It shouldn't be called directly by users in most cases.
-        """
-        warnings.warn('You should not run forward of this module directly.')
-        return self._first_module(x)
-
-    def __repr__(self):
-        return f'LayerChoice({self.candidates}, label={repr(self.label)})'
-
-
-try:
-    from typing import Literal
-except ImportError:
-    from typing_extensions import Literal
-
-ReductionType = Literal['mean', 'concat', 'sum', 'none']
-
-
-class InputChoice(Mutable):
-    """
-    Input choice selects ``n_chosen`` inputs from ``choose_from`` (contains ``n_candidates`` keys).
-
-    It is mainly for choosing (or trying) different connections. It takes several tensors and chooses ``n_chosen`` tensors from them.
-    When specific inputs are chosen, ``InputChoice`` will become :class:`ChosenInputs`.
-
-    Use ``reduction`` to specify how chosen inputs are reduced into one output. A few options are:
-
-    * ``none``: do nothing and return the list directly.
-    * ``sum``: summing all the chosen inputs.
-    * ``mean``: taking the average of all chosen inputs.
-    * ``concat``: concatenate all chosen inputs at dimension 1.
-
-    We don't support customizing reduction yet.
-
-    Parameters
-    ----------
-    n_candidates : int
-        Number of inputs to choose from. It is required.
-    n_chosen : int
-        Recommended inputs to choose. If None, mutator is instructed to select any.
-    reduction : str
-        ``mean``, ``concat``, ``sum`` or ``none``.
-    prior : list of float
-        Prior distribution used in random sampling.
-    label : str
-        Identifier of the input choice.
-
-    Examples
-    --------
-    ::
-
-        # import nni.retiarii.nn.pytorch as nn
-        # declared in `__init__` method
-        self.input_switch = nn.InputChoice(n_chosen=1)
-        # invoked in `forward` method, choose one from the three
-        out = self.input_switch([tensor1, tensor2, tensor3])
-    """
-
-    @classmethod
-    def create_fixed_module(cls, n_candidates: int, n_chosen: Optional[int] = 1,
-                            reduction: ReductionType = 'sum', *,
-                            prior: Optional[List[float]] = None, label: Optional[str] = None, **kwargs):
-        return ChosenInputs(get_fixed_value(label), reduction=reduction)
-
-    def __init__(self, n_candidates: int, n_chosen: Optional[int] = 1,
-                 reduction: str = 'sum', *,
-                 prior: Optional[List[float]] = None, label: Optional[str] = None, **kwargs):
-        super(InputChoice, self).__init__()
-        if 'key' in kwargs:
-            warnings.warn(f'"key" is deprecated. Assuming label.')
-            label = kwargs['key']
-        if 'return_mask' in kwargs:
-            warnings.warn(f'"return_mask" is deprecated. Ignoring...')
-        if 'choose_from' in kwargs:
-            warnings.warn(f'"reduction" is deprecated. Ignoring...')
-        self.n_candidates = n_candidates
-        self.n_chosen = n_chosen
-        self.reduction = reduction
-        self.prior = prior or [1 / n_candidates for _ in range(n_candidates)]
-        assert self.reduction in ['mean', 'concat', 'sum', 'none']
-        self._label = generate_new_label(label)
-
-    @property
-    def label(self):
-        return self._label
-
-    def forward(self, candidate_inputs: List[torch.Tensor]) -> torch.Tensor:
-        """
-        The forward of input choice is simply the first item of ``candidate_inputs``.
-        It shouldn't be called directly by users in most cases.
-        """
-        warnings.warn('You should not run forward of this module directly.')
-        return candidate_inputs[0]
-
-    def __repr__(self):
-        return f'InputChoice(n_candidates={self.n_candidates}, n_chosen={self.n_chosen}, ' \
-            f'reduction={repr(self.reduction)}, label={repr(self.label)})'
-
-
-class ChosenInputs(nn.Module):
-    """
-    A module that chooses from a tensor list and outputs a reduced tensor.
-    The already-chosen version of InputChoice.
-
-    When forward, ``chosen`` will be used to select inputs from ``candidate_inputs``,
-    and ``reduction`` will be used to choose from those inputs to form a tensor.
-
-    Attributes
-    ----------
-    chosen : list of int
-        Indices of chosen inputs.
-    reduction : ``mean`` | ``concat`` | ``sum`` | ``none``
-        How to reduce the inputs when multiple are selected.
-    """
-
-    def __init__(self, chosen: Union[List[int], int], reduction: ReductionType):
-        super().__init__()
-        self.chosen = chosen if isinstance(chosen, list) else [chosen]
-        self.reduction = reduction
-
-    def forward(self, candidate_inputs):
-        """
-        Compute the reduced input based on ``chosen`` and ``reduction``.
-        """
-        return self._tensor_reduction(self.reduction, [candidate_inputs[i] for i in self.chosen])
-
-    def _tensor_reduction(self, reduction_type, tensor_list):
-        if reduction_type == 'none':
-            return tensor_list
-        if not tensor_list:
-            return None  # empty. return None for now
-        if len(tensor_list) == 1:
-            return tensor_list[0]
-        if reduction_type == 'sum':
-            return sum(tensor_list)
-        if reduction_type == 'mean':
-            return sum(tensor_list) / len(tensor_list)
-        if reduction_type == 'concat':
-            return torch.cat(tensor_list, dim=1)
-        raise ValueError(f'Unrecognized reduction policy: "{reduction_type}"')
-
-
-# the code in ValueChoice can be generated with this codegen
-# this is not done online because I want to have type-hint supports
-# $ python -c "from nni.retiarii.nn.pytorch.api import _valuechoice_codegen; _valuechoice_codegen(_internal=True)"
-def _valuechoice_codegen(*, _internal: bool = False):
-    """Print the source code of the operator overloads of ``ValueChoiceX`` to stdout.
-
-    For every operator in ``MAPPING`` a ``__op__`` method is printed; binary operators
-    other than the comparisons additionally get a reflected ``__rop__`` method.
-
-    Parameters
-    ----------
-    _internal : bool
-        Must be set to ``True``. Guards against accidental use of this internal helper.
-
-    Raises
-    ------
-    RuntimeError
-        If ``_internal`` is not true.
-    """
-    if not _internal:
-        raise RuntimeError("This method is set to be internal. Please don't use it directly.")
-    MAPPING = {
-        # unary
-        'neg': '-', 'pos': '+', 'invert': '~',
-        # binary
-        'add': '+', 'sub': '-', 'mul': '*', 'matmul': '@',
-        # true division is spelled ``/`` and floor division ``//``
-        'truediv': '/', 'floordiv': '//', 'mod': '%',
-        'lshift': '<<', 'rshift': '>>',
-        'and': '&', 'xor': '^', 'or': '|',
-        # no reverse
-        'lt': '<', 'le': '<=', 'eq': '==',
-        'ne': '!=', 'ge': '>=', 'gt': '>',
-        # NOTE
-        # Currently we don't support operators like __contains__ (b in a),
-        # Might support them in future when we actually need them.
-    }
-    UNARY = ('neg', 'pos', 'invert')
-    NO_REVERSE = ('lt', 'le', 'eq', 'ne', 'ge', 'gt')
-
-    binary_template = """    def __{op}__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.{opt}, '{{}} {sym} {{}}', [self, other])"""
-
-    binary_r_template = """    def __r{op}__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.{opt}, '{{}} {sym} {{}}', [other, self])"""
-
-    unary_template = """    def __{op}__(self: 'ChoiceOf[_value]') -> 'ChoiceOf[_value]':
-        return cast(ChoiceOf[_value], ValueChoiceX(operator.{op}, '{sym}{{}}', [self]))"""
-
-    for op, sym in MAPPING.items():
-        if op in UNARY:
-            print(unary_template.format(op=op, sym=sym) + '\n')
-            continue
-        # ``and`` / ``or`` are keywords, the functions in ``operator`` carry a trailing underscore
-        opt = op + '_' if op in ('and', 'or') else op
-        print(binary_template.format(op=op, opt=opt, sym=sym) + '\n')
-        if op not in NO_REVERSE:
-            print(binary_r_template.format(op=op, opt=opt, sym=sym) + '\n')
-
-
-# Type variables for the type hints in this module:
-# ``_func`` annotates the function passed through ``_valuechoice_staticmethod_helper``,
-# ``_cand`` is the generic parameter of ``ValueChoiceX`` (the type of an evaluated value),
-# ``_value`` appears in the signatures of the helpers and operators of ``ValueChoiceX``.
-_func = TypeVar('_func')
-_cand = TypeVar('_cand')
-_value = TypeVar('_value')
-
-
-def _valuechoice_staticmethod_helper(orig_func: _func) -> _func:
-    """Append a note about lazy evaluation to the docstring of ``orig_func``.
-
-    The very same function object is returned; only its ``__doc__`` is extended.
-    Functions without a docstring are returned untouched.
-    """
-    if orig_func.__doc__ is not None:
-        orig_func.__doc__ += """
-        Notes
-        -----
-        This function performs lazy evaluation.
-        Only the expression will be recorded when the function is called.
-        The real evaluation happens when the inner value choice has determined its final decision.
-        If no value choice is contained in the parameter list, the evaluation will be immediate."""
-    return orig_func
-
-
-class ValueChoiceX(Generic[_cand], Translatable, nn.Module):
-    """Internal API. Implementation note:
-
-    The transformed (X) version of value choice.
-    It can be the result of composition (transformation) of one or several value choices. For example,
-
-    .. code-block:: python
-
-        nn.ValueChoice([1, 2]) + nn.ValueChoice([3, 4]) + 5
-
-    The instance of base class cannot be created directly. Instead, they should be only the result of transformation of value choice.
-    Therefore, there is no need to implement ``create_fixed_module`` in this class, because,
-    1. For python-engine, value choice itself has create fixed module. Consequently, the transformation is born to be fixed.
-    2. For graph-engine, it uses evaluate to calculate the result.
-
-    Potentially, we have to implement the evaluation logic in oneshot algorithms. I believe we can postpone the discussion till then.
-
-    This class is implemented as a ``nn.Module`` so that it can be scanned by python engine / torchscript.
-
-    Parameters
-    ----------
-    function : callable
-        Applied to the evaluated ``arguments`` to compute the value of this expression.
-    repr_template : str
-        ``str.format`` template with one placeholder per argument, used by ``__repr__``.
-    arguments : list
-        Operands of ``function``. At least one of them must be a ``ValueChoiceX``;
-        the others are treated as constants.
-    dry_run : bool
-        Whether to evaluate the expression once at construction time as a sanity check.
-    """
-
-    def __init__(self, function: Callable[..., _cand] = cast(Callable[..., _cand], None),
-                 repr_template: str = cast(str, None),
-                 arguments: List[Any] = cast('List[MaybeChoice[_cand]]', None),
-                 dry_run: bool = True):
-        super().__init__()
-
-        if function is None:
-            # this case is a hack for ValueChoice subclass
-            # it will reach here only because ``__init__`` in ``nn.Module`` is useful.
-            return
-
-        self.function = function
-        self.repr_template = repr_template
-        self.arguments = arguments
-
-        assert any(isinstance(arg, ValueChoiceX) for arg in self.arguments)
-
-        if dry_run:
-            # for sanity check
-            self.dry_run()
-
-    def forward(self) -> None:
-        """Always raises ``RuntimeError``: a composed value choice is not meant to be called."""
-        raise RuntimeError('You should never call forward of the composition of a value-choice.')
-
-    def inner_choices(self) -> Iterable['ValueChoice']:
-        """
-        Return a generator of all leaf value choices.
-        Useful for composition of value choices.
-        No deduplication on labels. Mutators should take care.
-        """
-        for arg in self.arguments:
-            if isinstance(arg, ValueChoiceX):
-                yield from arg.inner_choices()
-
-    def dry_run(self) -> _cand:
-        """
-        Dry run the value choice to get one of its possible evaluation results.
-        """
-        # values are not used
-        return self._evaluate(iter([]), True)
-
-    def all_options(self) -> Iterable[_cand]:
-        """Explore all possibilities of a value choice.
-
-        Leaf choices sharing a label are treated as one decision.
-        ``ValueError`` is raised if two leaves share a label but differ in candidates.
-        """
-        # Record all inner choices: label -> candidates, no duplicates.
-        dedup_inner_choices: Dict[str, List[_cand]] = {}
-        # All labels of leaf nodes on tree, possibly duplicates.
-        all_labels: List[str] = []
-
-        for choice in self.inner_choices():
-            all_labels.append(choice.label)
-            if choice.label in dedup_inner_choices:
-                if choice.candidates != dedup_inner_choices[choice.label]:
-                    # check for choice with the same label
-                    raise ValueError(f'"{choice.candidates}" is not equal to "{dedup_inner_choices[choice.label]}", '
-                                     f'but they share the same label: {choice.label}')
-            else:
-                dedup_inner_choices[choice.label] = choice.candidates
-
-        dedup_labels, dedup_candidates = list(dedup_inner_choices.keys()), list(dedup_inner_choices.values())
-
-        for chosen in itertools.product(*dedup_candidates):
-            chosen = dict(zip(dedup_labels, chosen))
-            # expand back to one value per leaf, in the order of ``inner_choices()``
-            yield self.evaluate([chosen[label] for label in all_labels])
-
-    def evaluate(self, values: Iterable[_cand]) -> _cand:
-        """
-        Evaluate the result of this group.
-        ``values`` should in the same order of ``inner_choices()``.
-        """
-        return self._evaluate(iter(values), False)
-
-    def _evaluate(self, values: Iterator[_cand], dry_run: bool = False) -> _cand:
-        """Recursively evaluate the arguments, then apply ``self.function`` to them.
-
-        ``values`` is one shared iterator; every leaf value choice consumes one item from it.
-        """
-        # "values" iterates in the recursion
-        eval_args = []
-        for arg in self.arguments:
-            if isinstance(arg, ValueChoiceX):
-                # recursive evaluation
-                eval_args.append(arg._evaluate(values, dry_run))
-                # the recursion will stop when it hits a leaf node (value choice)
-                # the implementation is in `ValueChoice`
-            else:
-                # constant value
-                eval_args.append(arg)
-        return self.function(*eval_args)
-
-    def _translate(self):
-        """
-        Try to behave like one of its candidates when used in ``basic_unit``.
-        """
-        return self.dry_run()
-
-    def __repr__(self) -> str:
-        reprs = []
-        for arg in self.arguments:
-            if isinstance(arg, ValueChoiceX) and not isinstance(arg, ValueChoice):
-                reprs.append('(' + repr(arg) + ')')  # add parenthesis for operator priority
-            else:
-                reprs.append(repr(arg))
-        return self.repr_template.format(*reprs)
-
-    # the following are a series of methods to create "ValueChoiceX"
-    # which is a transformed version of value choice
-    # https://docs.python.org/3/reference/datamodel.html#special-method-names
-
-    # Special operators that can be useful in place of built-in conditional operators.
-    @staticmethod
-    @_valuechoice_staticmethod_helper
-    def to_int(obj: 'MaybeChoice[Any]') -> 'MaybeChoice[int]':
-        """
-        Convert a ``ValueChoice`` to an integer.
-        """
-        if isinstance(obj, ValueChoiceX):
-            return ValueChoiceX(int, 'int({})', [obj])
-        return int(obj)
-
-    @staticmethod
-    @_valuechoice_staticmethod_helper
-    def to_float(obj: 'MaybeChoice[Any]') -> 'MaybeChoice[float]':
-        """
-        Convert a ``ValueChoice`` to a float.
-        """
-        if isinstance(obj, ValueChoiceX):
-            return ValueChoiceX(float, 'float({})', [obj])
-        return float(obj)
-
-    @staticmethod
-    @_valuechoice_staticmethod_helper
-    def condition(pred: 'MaybeChoice[bool]',
-                  true: 'MaybeChoice[_value]',
-                  false: 'MaybeChoice[_value]') -> 'MaybeChoice[_value]':
-        """
-        Return ``true`` if the predicate ``pred`` is true else ``false``.
-
-        Examples
-        --------
-        >>> ValueChoice.condition(ValueChoice([1, 2]) > ValueChoice([0, 3]), 2, 1)
-        """
-        if any(isinstance(obj, ValueChoiceX) for obj in [pred, true, false]):
-            # argument order follows the template: "<true> if <pred> else <false>"
-            return ValueChoiceX(lambda t, c, f: t if c else f, '{} if {} else {}', [true, pred, false])
-        return true if pred else false
-
-    @staticmethod
-    @_valuechoice_staticmethod_helper
-    def max(arg0: Union[Iterable['MaybeChoice[_value]'], 'MaybeChoice[_value]'],
-            *args: 'MaybeChoice[_value]') -> 'MaybeChoice[_value]':
-        """
-        Returns the maximum value from a list of value choices.
-        The usage should be similar to Python's built-in ``max``,
-        where the parameters could be an iterable, or at least two arguments.
-        """
-        if not args:
-            if not isinstance(arg0, Iterable):
-                raise TypeError('Expect more than one items to compare max')
-            return cast(MaybeChoice[_value], ValueChoiceX.max(*list(arg0)))
-        # With two or more positional arguments, every argument is an item to compare,
-        # even when the first one happens to be iterable (same as the built-in ``max``).
-        lst = [arg0, *args]
-        if any(isinstance(obj, ValueChoiceX) for obj in lst):
-            # one placeholder per argument, so that ``repr`` shows all of them
-            return ValueChoiceX(max, 'max(' + ', '.join(['{}'] * len(lst)) + ')', lst)
-        return max(cast(Any, lst))
-
-    @staticmethod
-    @_valuechoice_staticmethod_helper
-    def min(arg0: Union[Iterable['MaybeChoice[_value]'], 'MaybeChoice[_value]'],
-            *args: 'MaybeChoice[_value]') -> 'MaybeChoice[_value]':
-        """
-        Returns the minimum value from a list of value choices.
-        The usage should be similar to Python's built-in ``min``,
-        where the parameters could be an iterable, or at least two arguments.
-        """
-        if not args:
-            if not isinstance(arg0, Iterable):
-                raise TypeError('Expect more than one items to compare min')
-            return cast(MaybeChoice[_value], ValueChoiceX.min(*list(arg0)))
-        # With two or more positional arguments, every argument is an item to compare,
-        # even when the first one happens to be iterable (same as the built-in ``min``).
-        lst = [arg0, *args]
-        if any(isinstance(obj, ValueChoiceX) for obj in lst):
-            # one placeholder per argument, so that ``repr`` shows all of them
-            return ValueChoiceX(min, 'min(' + ', '.join(['{}'] * len(lst)) + ')', lst)
-        return min(cast(Any, lst))
-
-    def __hash__(self):
-        # this is required because we have implemented ``__eq__``
-        return id(self)
-
-    # NOTE:
-    # Write operations are not supported. Reasons follow:
-    # - Semantics are not clear. It can be applied to "all" the inner candidates, or only the chosen one.
-    # - Implementation effort is too huge.
-    # As a result, inplace operators like +=, *=, magic methods like `__getattr__` are not included in this list.
-
-    def __getitem__(self: 'ChoiceOf[Any]', key: Any) -> 'ChoiceOf[Any]':
-        return ValueChoiceX(lambda x, y: x[y], '{}[{}]', [self, key])
-
-    # region implement int, float, round, trunc, floor, ceil
-    # because I believe sometimes we need them to calculate #channels
-    # `__int__` and `__float__` are not supported because `__int__` is required to return int.
-    def __round__(self: 'ChoiceOf[SupportsRound[_value]]',
-                  ndigits: Optional['MaybeChoice[int]'] = None) -> 'ChoiceOf[Union[int, SupportsRound[_value]]]':
-        if ndigits is not None:
-            return cast(ChoiceOf[Union[int, SupportsRound[_value]]], ValueChoiceX(round, 'round({}, {})', [self, ndigits]))
-        return cast(ChoiceOf[Union[int, SupportsRound[_value]]], ValueChoiceX(round, 'round({})', [self]))
-
-    def __trunc__(self) -> NoReturn:
-        raise RuntimeError("Try to use `ValueChoice.to_int()` instead of `math.trunc()` on value choices.")
-
-    def __floor__(self: 'ChoiceOf[Any]') -> 'ChoiceOf[int]':
-        return ValueChoiceX(math.floor, 'math.floor({})', [self])
-
-    def __ceil__(self: 'ChoiceOf[Any]') -> 'ChoiceOf[int]':
-        return ValueChoiceX(math.ceil, 'math.ceil({})', [self])
-
-    def __index__(self) -> NoReturn:
-        # https://docs.python.org/3/reference/datamodel.html#object.__index__
-        raise RuntimeError("`__index__` is not allowed on ValueChoice, which means you can't "
-                           "use int(), float(), complex(), range() on a ValueChoice. "
-                           "To cast the type of ValueChoice, please try `ValueChoice.to_int()` or `ValueChoice.to_float()`.")
-
-    def __bool__(self) -> NoReturn:
-        raise RuntimeError('Cannot use bool() on ValueChoice. That means, using ValueChoice in a if-clause is illegal. '
-                           'Please try methods like `ValueChoice.max(a, b)` to see whether that meets your needs.')
-    # endregion
-
-    # region the following code is generated with codegen (see above)
-    # Annotated with "region" because I want to collapse them in vscode
-    def __neg__(self: 'ChoiceOf[_value]') -> 'ChoiceOf[_value]':
-        return cast(ChoiceOf[_value], ValueChoiceX(operator.neg, '-{}', [self]))
-
-    def __pos__(self: 'ChoiceOf[_value]') -> 'ChoiceOf[_value]':
-        return cast(ChoiceOf[_value], ValueChoiceX(operator.pos, '+{}', [self]))
-
-    def __invert__(self: 'ChoiceOf[_value]') -> 'ChoiceOf[_value]':
-        return cast(ChoiceOf[_value], ValueChoiceX(operator.invert, '~{}', [self]))
-
-    def __add__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.add, '{} + {}', [self, other])
-
-    def __radd__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.add, '{} + {}', [other, self])
-
-    def __sub__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.sub, '{} - {}', [self, other])
-
-    def __rsub__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.sub, '{} - {}', [other, self])
-
-    def __mul__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.mul, '{} * {}', [self, other])
-
-    def __rmul__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.mul, '{} * {}', [other, self])
-
-    def __matmul__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.matmul, '{} @ {}', [self, other])
-
-    def __rmatmul__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.matmul, '{} @ {}', [other, self])
-
-    def __truediv__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.truediv, '{} / {}', [self, other])
-
-    def __rtruediv__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.truediv, '{} / {}', [other, self])
-
-    def __floordiv__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.floordiv, '{} // {}', [self, other])
-
-    def __rfloordiv__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.floordiv, '{} // {}', [other, self])
-
-    def __mod__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.mod, '{} % {}', [self, other])
-
-    def __rmod__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.mod, '{} % {}', [other, self])
-
-    def __lshift__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.lshift, '{} << {}', [self, other])
-
-    def __rlshift__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.lshift, '{} << {}', [other, self])
-
-    def __rshift__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.rshift, '{} >> {}', [self, other])
-
-    def __rrshift__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.rshift, '{} >> {}', [other, self])
-
-    def __and__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.and_, '{} & {}', [self, other])
-
-    def __rand__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.and_, '{} & {}', [other, self])
-
-    def __xor__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.xor, '{} ^ {}', [self, other])
-
-    def __rxor__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.xor, '{} ^ {}', [other, self])
-
-    def __or__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.or_, '{} | {}', [self, other])
-
-    def __ror__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.or_, '{} | {}', [other, self])
-
-    def __lt__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.lt, '{} < {}', [self, other])
-
-    def __le__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.le, '{} <= {}', [self, other])
-
-    def __eq__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.eq, '{} == {}', [self, other])
-
-    def __ne__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.ne, '{} != {}', [self, other])
-
-    def __ge__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.ge, '{} >= {}', [self, other])
-
-    def __gt__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(operator.gt, '{} > {}', [self, other])
-    # endregion
-
-    # __pow__, __divmod__, __abs__ are special ones.
-    # Not easy to cover those cases with codegen.
-    def __pow__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]', modulo: Optional['MaybeChoice[Any]'] = None) -> 'ChoiceOf[Any]':
-        if modulo is not None:
-            return ValueChoiceX(pow, 'pow({}, {}, {})', [self, other, modulo])
-        return ValueChoiceX(lambda a, b: a ** b, '{} ** {}', [self, other])
-
-    def __rpow__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]', modulo: Optional['MaybeChoice[Any]'] = None) -> 'ChoiceOf[Any]':
-        if modulo is not None:
-            return ValueChoiceX(pow, 'pow({}, {}, {})', [other, self, modulo])
-        return ValueChoiceX(lambda a, b: a ** b, '{} ** {}', [other, self])
-
-    def __divmod__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(divmod, 'divmod({}, {})', [self, other])
-
-    def __rdivmod__(self: 'ChoiceOf[Any]', other: 'MaybeChoice[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(divmod, 'divmod({}, {})', [other, self])
-
-    def __abs__(self: 'ChoiceOf[Any]') -> 'ChoiceOf[Any]':
-        return ValueChoiceX(abs, 'abs({})', [self])
-
-
-# Aliases used in the annotations and casts of ``ValueChoiceX``.
-# ``ChoiceOf`` is just another name for ``ValueChoiceX``.
-ChoiceOf = ValueChoiceX
-# ``MaybeChoice[T]`` is either a ``ValueChoiceX[T]`` or a plain ``T``.
-MaybeChoice = Union[ValueChoiceX[_cand], _cand]
-
-
-class ValueChoice(ValueChoiceX[_cand], Mutable):
-    """
-    ValueChoice is to choose one from ``candidates``. The most common use cases are:
-
-    * Used as input arguments of :class:`~nni.retiarii.basic_unit`
-      (i.e., modules in ``nni.retiarii.nn.pytorch`` and user-defined modules decorated with ``@basic_unit``).
-    * Used as input arguments of evaluator (*new in v2.7*).
-
-    It can be used in parameters of operators (i.e., a sub-module of the model): ::
-
-        class Net(nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.conv = nn.Conv2d(3, nn.ValueChoice([32, 64]), kernel_size=nn.ValueChoice([3, 5, 7]))
-
-            def forward(self, x):
-                return self.conv(x)
-
-    Or evaluator (only if the evaluator is :doc:`traceable </nas/serialization>`, e.g.,
-    :class:`FunctionalEvaluator <nni.retiarii.evaluator.FunctionalEvaluator>`): ::
-
-        def train_and_evaluate(model_cls, learning_rate):
-            ...
-
-        self.evaluator = FunctionalEvaluator(train_and_evaluate, learning_rate=nn.ValueChoice([1e-3, 1e-2, 1e-1]))
-
-    Value choices supports arithmetic operators, which is particularly useful when searching for a network width multiplier: ::
-
-        # init
-        scale = nn.ValueChoice([1.0, 1.5, 2.0])
-        self.conv1 = nn.Conv2d(3, round(scale * 16))
-        self.conv2 = nn.Conv2d(round(scale * 16), round(scale * 64))
-        self.conv3 = nn.Conv2d(round(scale * 64), round(scale * 256))
-
-        # forward
-        return self.conv3(self.conv2(self.conv1(x)))
-
-    Or when kernel size and padding are coupled so as to keep the output size constant: ::
-
-        # init
-        ks = nn.ValueChoice([3, 5, 7])
-        self.conv = nn.Conv2d(3, 16, kernel_size=ks, padding=(ks - 1) // 2)
-
-        # forward
-        return self.conv(x)
-
-    Or when several layers are concatenated for a final layer. ::
-
-        # init
-        self.linear1 = nn.Linear(3, nn.ValueChoice([1, 2, 3], label='a'))
-        self.linear2 = nn.Linear(3, nn.ValueChoice([4, 5, 6], label='b'))
-        self.final = nn.Linear(nn.ValueChoice([1, 2, 3], label='a') + nn.ValueChoice([4, 5, 6], label='b'), 2)
-
-        # forward
-        return self.final(torch.cat([self.linear1(x), self.linear2(x)], 1))
-
-    Some advanced operators are also provided, such as :meth:`ValueChoice.max` and :meth:`ValueChoice.condition`.
-
-    .. tip::
-
-        All the APIs have an optional argument called ``label``,
-        mutations with the same label will share the same choice. A typical example is, ::
-
-            self.net = nn.Sequential(
-                nn.Linear(10, nn.ValueChoice([32, 64, 128], label='hidden_dim')),
-                nn.Linear(nn.ValueChoice([32, 64, 128], label='hidden_dim'), 3)
-            )
-
-        Sharing the same value choice instance has the similar effect. ::
-
-            class Net(nn.Module):
-                def __init__(self):
-                    super().__init__()
-                    hidden_dim = nn.ValueChoice([128, 512])
-                    self.fc = nn.Sequential(
-                        nn.Linear(64, hidden_dim),
-                        nn.Linear(hidden_dim, 10)
-                    )
-
-    .. warning::
-
-        It looks as if a specific candidate has been chosen (e.g., how it looks like when you can put ``ValueChoice``
-        as a parameter of ``nn.Conv2d``), but in fact it's a syntax sugar because the basic units and evaluators
-        do all the underlying works. That means, you cannot assume that ``ValueChoice`` can be used in the same way
-        as its candidates. For example, the following usage will NOT work: ::
-
-            self.blocks = []
-            for i in range(nn.ValueChoice([1, 2, 3])):
-                self.blocks.append(Block())
-
-            # NOTE: instead you should probably write
-            # self.blocks = nn.Repeat(Block(), (1, 3))
-
-    Another use case is to initialize the values to choose from in init and call the module in forward to get the chosen value.
-    Usually, this is used to pass a mutable value to a functional API like ``torch.xxx`` or ``nn.functional.xxx``.
-    For example, ::
-
-        class Net(nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.dropout_rate = nn.ValueChoice([0., 1.])
-
-            def forward(self, x):
-                return F.dropout(x, self.dropout_rate())
-
-    Parameters
-    ----------
-    candidates : list
-        List of values to choose from.
-    prior : list of float
-        Prior distribution to sample from.
-    label : str
-        Identifier of the value choice.
-    """
-
-    # FIXME: prior is designed but not supported yet
-
-    @classmethod
-    def create_fixed_module(cls, candidates: List[_cand], *, label: Optional[str] = None, **kwargs):
-        """Return the value fixed for ``label``, after checking that it is one of ``candidates``."""
-        value = get_fixed_value(label)
-        if value not in candidates:
-            raise ValueError(f'Value {value} does not belong to the candidates: {candidates}.')
-        return value
-
-    def __init__(self, candidates: List[_cand], *, prior: Optional[List[float]] = None, label: Optional[str] = None):
-        super().__init__()  # type: ignore
-        self.candidates = candidates
-        # uniform distribution over the candidates unless a prior is given
-        self.prior = prior or [1 / len(candidates) for _ in range(len(candidates))]
-        assert abs(sum(self.prior) - 1) < 1e-5, 'Sum of prior distribution is not 1.'
-        self._label = generate_new_label(label)
-
-    @property
-    def label(self):
-        return self._label
-
-    def forward(self):
-        """
-        The forward of value choice is simply the first value of ``candidates``.
-        It shouldn't be called directly by users in most cases.
-        """
-        warnings.warn('You should not run forward of this module directly.')
-        return self.candidates[0]
-
-    def inner_choices(self) -> Iterable['ValueChoice']:
-        # yield self because self is the only value choice here
-        yield self
-
-    def dry_run(self) -> _cand:
-        """Return the first candidate."""
-        return self.candidates[0]
-
-    def _evaluate(self, values: Iterator[_cand], dry_run: bool = False) -> _cand:
-        """Leaf of the recursive evaluation.
-
-        In a dry run the first candidate is returned and ``values`` is left untouched.
-        Otherwise the next item of ``values`` is consumed and returned.
-        ``ValueError`` is raised when ``values`` is exhausted or the item is not a candidate.
-        """
-        if dry_run:
-            return self.candidates[0]
-        try:
-            value = next(values)
-        except StopIteration as err:
-            raise ValueError(f'Value list {values} is exhausted when trying to get a chosen value of {self}.') from err
-        if value not in self.candidates:
-            raise ValueError(f'Value {value} does not belong to the candidates of {self}.')
-        return value
-
-    def __repr__(self):
-        return f'ValueChoice({self.candidates}, label={repr(self.label)})'
-
-
-# Type of a single candidate value; used in the annotations of ``ModelParameterChoice``.
-ValueType = TypeVar('ValueType')
-
-
-class ModelParameterChoice:
-    """ModelParameterChoice chooses one hyper-parameter from ``candidates``.
-
-    .. attention::
-
-       This API is internal, and does not guarantee forward-compatibility.
-
-    It's quite similar to :class:`ValueChoice`, but unlike :class:`ValueChoice`,
-    it always returns a fixed value, even at the construction of base model.
-
-    This makes it highly flexible (e.g., can be used in for-loop, if-condition, as argument of any function). For example: ::
-
-        self.has_auxiliary_head = ModelParameterChoice([False, True])
-        # this will raise error if you use `ValueChoice`
-        if self.has_auxiliary_head is True:  # or self.has_auxiliary_head
-            self.auxiliary_head = Head()
-        else:
-            self.auxiliary_head = None
-        print(type(self.has_auxiliary_head))  # <class 'bool'>
-
-    The working mechanism of :class:`ModelParameterChoice` is that, it registers itself
-    in the ``model_wrapper``, as a hyper-parameter of the model, and then returns the value specified with ``default``.
-    At base model construction, the default value will be used (as a mocked hyper-parameter).
-    In trial, the hyper-parameter selected by strategy will be used.
-
-    Although flexible, we still recommend using :class:`ValueChoice` in favor of :class:`ModelParameterChoice`,
-    because information are lost when using :class:`ModelParameterChoice` in exchange of its flexibility,
-    making it incompatible with one-shot strategies and non-python execution engines.
-
-    .. warning::
-
-        :class:`ModelParameterChoice` can NOT be nested.
-
-    .. tip::
-
-        Although called :class:`ModelParameterChoice`, it's meant to tune hyper-parameter of architecture.
-        It's NOT used to tune model-training hyper-parameters like ``learning_rate``.
-        If you need to tune ``learning_rate``, please use :class:`ValueChoice` on arguments of :class:`nni.retiarii.Evaluator`.
-
-    Parameters
-    ----------
-    candidates : list of any
-        List of values to choose from.
-    prior : list of float
-        Prior distribution to sample from. Currently has no effect.
-    default : Callable[[List[Any]], Any] or Any
-        Function that selects one from ``candidates``, or a candidate.
-        Use :meth:`ModelParameterChoice.FIRST` or :meth:`ModelParameterChoice.LAST` to take the first or last item.
-        Default: :meth:`ModelParameterChoice.FIRST`
-    label : str
-        Identifier of the value choice.
-
-    Warnings
-    --------
-    :class:`ModelParameterChoice` is incompatible with one-shot strategies and non-python execution engines.
-
-    Sometimes, the same search space implemented **without** :class:`ModelParameterChoice` can be simpler, and explored
-    with more types of search strategies. For example, the following usages are equivalent: ::
-
-        # with ModelParameterChoice
-        depth = nn.ModelParameterChoice(list(range(3, 10)))
-        blocks = []
-        for i in range(depth):
-            blocks.append(Block())
-
-        # w/o ModelParameterChoice
-        blocks = Repeat(Block(), (3, 9))
-
-    Examples
-    --------
-    Get a dynamic-shaped parameter. Because ``torch.zeros`` is not a basic unit, we can't use :class:`ValueChoice` on it.
-
-    >>> parameter_dim = nn.ModelParameterChoice([64, 128, 256])
-    >>> self.token = nn.Parameter(torch.zeros(1, parameter_dim, 32, 32))
-    """
-
-    # FIXME: fix signature in docs
-
-    # FIXME: prior is designed but not supported yet
-
-    def __new__(cls, candidates: List[ValueType], *,
-                prior: Optional[List[float]] = None,
-                default: Optional[Union[Callable[[List[ValueType]], ValueType], ValueType]] = None,
-                label: Optional[str] = None) -> ValueType:
-        # Actually, creating a `ModelParameterChoice` never creates one.
-        # It always return a fixed value, and register a ParameterSpec
-
-        if default is None:
-            default = cls.FIRST
-
-        try:
-            return cls.create_fixed_module(candidates, label=label)
-        except NoContextError:
-            # no fixed value available: fall back to the default and register the parameter
-            return cls.create_default(candidates, default, label)
-
-    @staticmethod
-    def create_default(candidates: List[ValueType],
-                       default: Union[Callable[[List[ValueType]], ValueType], ValueType],
-                       label: Optional[str]) -> ValueType:
-        """Resolve the default value and register a ``ParameterSpec`` for this choice.
-
-        ``default`` is returned as-is when it is one of ``candidates``.
-        Otherwise it is called with ``candidates`` and its result is returned.
-
-        Raises
-        ------
-        TypeError
-            If ``default`` is neither a candidate nor callable.
-        """
-        if default not in candidates:
-            # not a candidate itself, so it has to be a selector function
-            if not callable(default):
-                raise TypeError("`default` is not in `candidates`, and it's also not callable.")
-            # errors raised inside the selector propagate unchanged
-            default = cast(Callable[[List[ValueType]], ValueType], default)(candidates)
-
-        default = cast(ValueType, default)
-
-        label = generate_new_label(label)
-        parameter_spec = ParameterSpec(
-            label,          # name
-            'choice',       # TODO: support more types
-            candidates,     # value
-            (label,),       # we don't have nested now
-            True,           # yes, categorical
-        )
-
-        # there could be duplicates. Dedup is done in mutator
-        ModelNamespace.current_context().parameter_specs.append(parameter_spec)
-
-        return default
-
-    @classmethod
-    def create_fixed_module(cls, candidates: List[ValueType], *, label: Optional[str] = None, **kwargs) -> ValueType:
-        """Return the value fixed for ``label``, after checking that it is one of ``candidates``."""
-        # same as ValueChoice
-        value = get_fixed_value(label)
-        if value not in candidates:
-            raise ValueError(f'Value {value} does not belong to the candidates: {candidates}.')
-        return value
-
-    @staticmethod
-    def FIRST(sequence: Sequence[ValueType]) -> ValueType:
-        """Get the first item of sequence. Useful in ``default`` argument."""
-        return sequence[0]
-
-    @staticmethod
-    def LAST(sequence: Sequence[ValueType]) -> ValueType:
-        """Get the last item of sequence. Useful in ``default`` argument."""
-        return sequence[-1]
-
-
-@basic_unit
-class Placeholder(nn.Module):
-    """
-    The API that creates an empty module for later mutations.
-    For advanced usages only.
-    """
-
-    def __init__(self, label, **related_info):
-        self.label = label
-        self.related_info = related_info
-        super().__init__()
-
-    def forward(self, x):
-        """
-        Forward of placeholder is not meaningful.
-        It returns input directly.
-        """
-        return x
+from nni.nas.nn.pytorch.choice import *
--- a/nni/retiarii/nn/pytorch/cell.py
+++ b/nni/retiarii/nn/pytorch/cell.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-import copy
-import warnings
-from typing import Callable, Dict, List, Union, Optional, Tuple, Sequence, cast
-try:
-    from typing import Literal
-except ImportError:
-    from typing_extensions import Literal
+# pylint: disable=wildcard-import,unused-wildcard-import

-import torch
-import torch.nn as nn
-
-from .api import ChosenInputs, LayerChoice, InputChoice
-from .nn import ModuleList  # pylint: disable=no-name-in-module
-from .mutation_utils import generate_new_label
-
-
-class _ListIdentity(nn.Identity):
-    # workaround for torchscript
-    def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]:
-        return x
-
-
-class _DefaultPostprocessor(nn.Module):
-    # this is also a workaround for torchscript
-
-    def forward(self, this_cell: torch.Tensor, prev_cell: List[torch.Tensor]) -> torch.Tensor:
-        return this_cell
-
-
-CellOpFactory = Callable[[int, int, Optional[int]], nn.Module]
-
-
-def create_cell_op_candidates(
-    op_candidates, node_index, op_index, chosen
-) -> Tuple[Dict[str, nn.Module], bool]:
-    has_factory = False
-
-    # convert the complex type into the type that is acceptable to LayerChoice
-    def convert_single_op(op):
-        nonlocal has_factory
-
-        if isinstance(op, nn.Module):
-            return copy.deepcopy(op)
-        elif callable(op):
-            # Yes! It's using factory to create operations now.
-            has_factory = True
-            # FIXME: I don't know how to check whether we are in graph engine.
-            return op(node_index, op_index, chosen)
-        else:
-            raise TypeError(f'Unrecognized type {type(op)} for op {op}')
-
-    if isinstance(op_candidates, list):
-        res = {str(i): convert_single_op(op) for i, op in enumerate(op_candidates)}
-    elif isinstance(op_candidates, dict):
-        res = {key: convert_single_op(op) for key, op in op_candidates.items()}
-    elif callable(op_candidates):
-        warnings.warn(f'Directly passing a callable into Cell is deprecated. Please consider migrating to list or dict.',
-                      DeprecationWarning)
-        res = op_candidates()
-        has_factory = True
-    else:
-        raise TypeError(f'Unrecognized type {type(op_candidates)} for {op_candidates}')
-
-    return res, has_factory
-
-
-def preprocess_cell_inputs(num_predecessors: int, *inputs: Union[List[torch.Tensor], torch.Tensor]) -> List[torch.Tensor]:
-    if len(inputs) == 1 and isinstance(inputs[0], list):
-        processed_inputs = list(inputs[0])  # shallow copy
-    else:
-        processed_inputs = cast(List[torch.Tensor], list(inputs))
-    assert len(processed_inputs) == num_predecessors, 'The number of inputs must be equal to `num_predecessors`.'
-    return processed_inputs
-
-class Cell(nn.Module):
-    """
-    Cell structure that is popularly used in NAS literature.
-
-    Find the details in:
-
-    * `Neural Architecture Search with Reinforcement Learning <https://arxiv.org/abs/1611.01578>`__.
-    * `Learning Transferable Architectures for Scalable Image Recognition <https://arxiv.org/abs/1707.07012>`__.
-    * `DARTS: Differentiable Architecture Search <https://arxiv.org/abs/1806.09055>`__
-
-    `On Network Design Spaces for Visual Recognition <https://arxiv.org/abs/1905.13214>`__
-    is a good summary of how this structure works in practice.
-
-    A cell consists of multiple "nodes". Each node is a sum of multiple operators. Each operator is chosen from
-    ``op_candidates``, and takes one input from previous nodes and predecessors. Predecessor means the input of cell.
-    The output of cell is the concatenation of some of the nodes in the cell (by default all the nodes).
-
-    Two examples of searched cells are illustrated in the figure below.
-    In these two cells, ``op_candidates`` are series of convolutions and pooling operations.
-    ``num_ops_per_node`` is set to 2. ``num_nodes`` is set to 5. ``merge_op`` is ``loose_end``.
-    Assuming nodes are enumerated from bottom to top, left to right,
-    ``output_node_indices`` for the normal cell is ``[2, 3, 4, 5, 6]``.
-    For the reduction cell, it's ``[4, 5, 6]``.
-    Please take a look at this
-    `review article <https://sh-tsang.medium.com/review-nasnet-neural-architecture-search-network-image-classification-23139ea0425d>`__
-    if you are interested in details.
-
-    .. image:: ../../../img/nasnet_cell.png
-       :width: 900
-       :align: center
-
-    Here is a glossary table, which could help better understand the terms used above:
-
-    .. list-table::
-        :widths: 25 75
-        :header-rows: 1
-
-        * - Name
-          - Brief Description
-        * - Cell
-          - A cell consists of ``num_nodes`` nodes.
-        * - Node
-          - A node is the **sum** of ``num_ops_per_node`` operators.
-        * - Operator
-          - Each operator is independently chosen from a list of user-specified candidate operators.
-        * - Operator's input
-          - Each operator has one input, chosen from previous nodes as well as predecessors.
-        * - Predecessors
-          - Input of cell. A cell can have multiple predecessors. Predecessors are sent to *preprocessor* for preprocessing.
-        * - Cell's output
-          - Output of cell. Usually concatenation of some nodes (possibly all nodes) in the cell. Cell's output,
-            along with predecessors, are sent to *postprocessor* for postprocessing.
-        * - Preprocessor
-          - Extra preprocessing to predecessors. Usually used in shape alignment (e.g., predecessors have different shapes).
-            By default, do nothing.
-        * - Postprocessor
-          - Extra postprocessing for cell's output. Usually used to chain cells with multiple Predecessors
-            (e.g., the next cell wants to have the outputs of both this cell and previous cell as its input).
-            By default, directly use this cell's output.
-
-    .. tip::
-
-        It's highly recommended to make the candidate operators have an output of the same shape as input.
-        This is because, there can be dynamic connections within cell. If there's shape change within operations,
-        the input shape of the subsequent operation becomes unknown.
-        In addition, the final concatenation could have shape mismatch issues.
-
-    Parameters
-    ----------
-    op_candidates : list of module or function, or dict
-        A list of modules to choose from, or a function that accepts current index and optionally its input index, and returns a module.
-        For example, (2, 3, 0) means the 3rd op in the 2nd node, accepts the 0th node as input.
-        The indices are enumerated from 0 for all nodes, including predecessors.
-        When first created, the input index is ``None``, meaning unknown.
-        Note that in graph execution engine, support of function in ``op_candidates`` is limited.
-        Please also note that, to make :class:`Cell` work with one-shot strategy,
-        ``op_candidates``, in case it's a callable, should not depend on the second input argument,
-        i.e., ``op_index`` in current node.
-    num_nodes : int
-        Number of nodes in the cell.
-    num_ops_per_node: int
-        Number of operators in each node. The output of each node is the sum of all operators in the node. Default: 1.
-    num_predecessors : int
-        Number of inputs of the cell. The input to forward should be a list of tensors. Default: 1.
-    merge_op : "all", or "loose_end"
-        If "all", all the nodes (except predecessors) will be concatenated as the cell's output, in which case, ``output_node_indices``
-        will be ``list(range(num_predecessors, num_predecessors + num_nodes))``.
-        If "loose_end", only the nodes that have never been used as other nodes' inputs will be concatenated to the output.
-        Predecessors are not considered when calculating unused nodes.
-        Details can be found in `NDS paper <https://arxiv.org/abs/1905.13214>`__. Default: all.
-    preprocessor : callable
-        Override this if some extra transformation on cell's input is intended.
-        It should be a callable (``nn.Module`` is also acceptable) that takes a list of tensors which are predecessors,
-        and outputs a list of tensors, with the same length as input.
-        By default, it does nothing to the input.
-    postprocessor : callable
-        Override this if customization on the output of the cell is intended.
-        It should be a callable that takes the output of this cell, and a list which are predecessors.
-        Its return type should be either one tensor, or a tuple of tensors.
-        The return value of postprocessor is the return value of the cell's forward.
-        By default, it returns only the output of the current cell.
-    concat_dim : int
-        The result will be a concatenation of several nodes on this dim. Default: 1.
-    label : str
-        Identifier of the cell. Cell sharing the same label will semantically share the same choice.
-
-    Examples
-    --------
-    Choose between conv2d and maxpool2d.
-    The cell has 4 nodes, 1 op per node, and 2 predecessors.
-
-    >>> cell = nn.Cell([nn.Conv2d(32, 32, 3, padding=1), nn.MaxPool2d(3, padding=1)], 4, 1, 2)
-
-    In forward:
-
-    >>> cell([input1, input2])
-
-    The "list bracket" can be omitted:
-
-    >>> cell(only_input)                    # only one input
-    >>> cell(tensor1, tensor2, tensor3)     # multiple inputs
-
-    Use ``merge_op`` to specify how to construct the output.
-    The output will then have dynamic shape, depending on which input has been used in the cell.
-
-    >>> cell = nn.Cell([nn.Conv2d(32, 32, 3), nn.MaxPool2d(3)], 4, 1, 2, merge_op='loose_end')
-    >>> cell_out_channels = len(cell.output_node_indices) * 32
-
-    The op candidates can be callable that accepts node index in cell, op index in node, and input index.
-
-    >>> cell = nn.Cell([
-    ...     lambda node_index, op_index, input_index: nn.Conv2d(32, 32, 3, stride=2 if input_index < 1 else 1),
-    ... ], 4, 1, 2)
-
-    Preprocessor example: ::
-
-        class Preprocessor(nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.conv1 = nn.Conv2d(16, 32, 1)
-                self.conv2 = nn.Conv2d(64, 32, 1)
-
-            def forward(self, x):
-                return [self.conv1(x[0]), self.conv2(x[1])]
-
-        cell = nn.Cell([nn.Conv2d(32, 32, 3), nn.MaxPool2d(3)], 4, 1, 2, preprocessor=Preprocessor())
-        cell([torch.randn(1, 16, 48, 48), torch.randn(1, 64, 48, 48)])  # the two inputs will be sent to conv1 and conv2 respectively
-
-    Warnings
-    --------
-    :class:`Cell` is not supported in :ref:`graph-based execution engine <graph-based-execution-engine>`.
-
-    Attributes
-    ----------
-    output_node_indices : list of int
-        An attribute that contains indices of the nodes concatenated to the output (a list of integers).
-
-        When the cell is first instantiated in the base model, or when ``merge_op`` is ``all``,
-        ``output_node_indices`` must be ``range(num_predecessors, num_predecessors + num_nodes)``.
-
-        When ``merge_op`` is ``loose_end``, ``output_node_indices`` is useful to compute the shape of this cell's output,
-        because the output shape depends on the connection in the cell, and which nodes are "loose ends" depends on mutation.
-
-    op_candidates_factory : CellOpFactory or None
-        If the operations are created with a factory (callable), this is to be set with the factory.
-        One-shot algorithms will use this to make each node a cartesian product of operations and inputs.
-    """
-
-    def __init__(self,
-                 op_candidates: Union[
-                     Callable[[], List[nn.Module]],
-                     List[nn.Module],
-                     List[CellOpFactory],
-                     Dict[str, nn.Module],
-                     Dict[str, CellOpFactory]
-                 ],
-                 num_nodes: int,
-                 num_ops_per_node: int = 1,
-                 num_predecessors: int = 1,
-                 merge_op: Literal['all', 'loose_end'] = 'all',
-                 preprocessor: Optional[Callable[[List[torch.Tensor]], List[torch.Tensor]]] = None,
-                 postprocessor: Optional[Callable[[torch.Tensor, List[torch.Tensor]],
-                                         Union[Tuple[torch.Tensor, ...], torch.Tensor]]] = None,
-                 concat_dim: int = 1,
-                 *,
-                 label: Optional[str] = None):
-        super().__init__()
-        self._label = generate_new_label(label)
-
-        # modules are created in "natural" order
-        # first create preprocessor
-        self.preprocessor = preprocessor or _ListIdentity()
-        # then create intermediate ops
-        self.ops = ModuleList()
-        self.inputs = ModuleList()
-        # finally postprocessor
-        self.postprocessor = postprocessor or _DefaultPostprocessor()
-
-        self.num_nodes = num_nodes
-        self.num_ops_per_node = num_ops_per_node
-        self.num_predecessors = num_predecessors
-        assert merge_op in ['all', 'loose_end']
-        self.merge_op = merge_op
-        self.output_node_indices = list(range(num_predecessors, num_predecessors + num_nodes))
-
-        self.concat_dim = concat_dim
-
-        self.op_candidates_factory: Union[List[CellOpFactory], Dict[str, CellOpFactory], None] = None  # set later
-
-        # fill-in the missing modules
-        self._create_modules(op_candidates)
-
-    def _create_modules(self, op_candidates):
-        for i in range(self.num_predecessors, self.num_nodes + self.num_predecessors):
-            self.ops.append(ModuleList())
-            self.inputs.append(ModuleList())
-            for k in range(self.num_ops_per_node):
-                inp = InputChoice(i, 1, label=f'{self.label}/input_{i}_{k}')
-                chosen = None
-
-                if isinstance(inp, ChosenInputs):
-                    # now we are in the fixed mode
-                    # the length of chosen should be 1
-                    chosen = inp.chosen[0]
-                    if self.merge_op == 'loose_end' and chosen in self.output_node_indices:
-                        # remove it from concat indices
-                        self.output_node_indices.remove(chosen)
-
-                # this is needed because op_candidates can be very complex
-                # see the type annotation and docs for details
-                ops, has_factory = create_cell_op_candidates(op_candidates, i, k, chosen)
-                if has_factory:
-                    self.op_candidates_factory = op_candidates
-
-                # though it's layer choice and input choice here, in fixed mode, the chosen module will be created.
-                cast(ModuleList, self.ops[-1]).append(LayerChoice(ops, label=f'{self.label}/op_{i}_{k}'))
-                cast(ModuleList, self.inputs[-1]).append(inp)
-
-    @property
-    def label(self):
-        return self._label
-
-    def forward(self, *inputs: Union[List[torch.Tensor], torch.Tensor]) -> Union[Tuple[torch.Tensor, ...], torch.Tensor]:
-        """Forward propagation of cell.
-
-        Parameters
-        ----------
-        inputs
-            Can be a list of tensors, or several tensors.
-            The length should be equal to ``num_predecessors``.
-
-        Returns
-        -------
-        Tuple[torch.Tensor] | torch.Tensor
-            The return type depends on the output of ``postprocessor``.
-            By default, it's the output of ``merge_op``, which is a concatenation (on ``concat_dim``)
-            of some of (possibly all) the nodes' outputs in the cell.
-        """
-        processed_inputs: List[torch.Tensor] = preprocess_cell_inputs(self.num_predecessors, *inputs)
-        states: List[torch.Tensor] = self.preprocessor(processed_inputs)
-        for ops, inps in zip(
-            cast(Sequence[Sequence[LayerChoice]], self.ops),
-            cast(Sequence[Sequence[InputChoice]], self.inputs)
-        ):
-            current_state = []
-            for op, inp in zip(ops, inps):
-                current_state.append(op(inp(states)))
-            current_state = torch.sum(torch.stack(current_state), 0)
-            states.append(current_state)
-        if self.merge_op == 'all':
-            # a special case for graph engine
-            this_cell = torch.cat(states[self.num_predecessors:], self.concat_dim)
-        else:
-            this_cell = torch.cat([states[k] for k in self.output_node_indices], self.concat_dim)
-        return self.postprocessor(this_cell, processed_inputs)
+from nni.nas.nn.pytorch.cell import *
--- a/nni/retiarii/nn/pytorch/component.py
+++ b/nni/retiarii/nn/pytorch/component.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-import copy
-import warnings
-from collections import OrderedDict
-from typing import Callable, List, Dict, Union, Tuple, Optional
+# pylint: disable=wildcard-import,unused-wildcard-import,unused-import

-import torch
-import torch.nn as nn
-
-from nni.retiarii.utils import NoContextError, STATE_DICT_PY_MAPPING_PARTIAL
-
-from .api import LayerChoice, ValueChoice, ValueChoiceX, ChoiceOf
-from .cell import Cell
-from .nasbench101 import NasBench101Cell, NasBench101Mutator
-from .mutation_utils import Mutable, generate_new_label, get_fixed_value
-
-
-__all__ = ['Repeat', 'Cell', 'NasBench101Cell', 'NasBench101Mutator', 'NasBench201Cell']
-
-
-class Repeat(Mutable):
-    """
-    Repeat a block by a variable number of times.
-
-    Parameters
-    ----------
-    blocks : function, list of function, module or list of module
-        The block to be repeated. If not a list, it will be replicated (**deep-copied**) into a list.
-        If a list, it should be of length ``max_depth``, the modules will be instantiated in order and a prefix will be taken.
-        If a function, it will be called (the argument is the index) to instantiate a module.
-        Otherwise the module will be deep-copied.
-    depth : int or tuple of int
-        If one number, the block will be repeated by a fixed number of times. If a tuple, it should be (min, max),
-        meaning that the block will be repeated at least ``min`` times and at most ``max`` times.
-        If a ValueChoice, it should choose from a series of positive integers.
-
-        .. versionadded:: 2.8
-
-           Minimum depth can be 0. But this feature is NOT supported on graph engine.
-
-    Examples
-    --------
-    Block() will be deep copied and repeated 3 times. ::
-
-        self.blocks = nn.Repeat(Block(), 3)
-
-    Block() will be repeated 1, 2, or 3 times. ::
-
-        self.blocks = nn.Repeat(Block(), (1, 3))
-
-    Can be used together with layer choice.
-    With deep copy, the 3 layers will have the same label, thus share the choice. ::
-
-        self.blocks = nn.Repeat(nn.LayerChoice([...]), (1, 3))
-
-    To make the three layer choices independent,
-    we need a factory function that accepts index (0, 1, 2, ...) and returns the module of the ``index``-th layer. ::
-
-        self.blocks = nn.Repeat(lambda index: nn.LayerChoice([...], label=f'layer{index}'), (1, 3))
-
-    Depth can be a ValueChoice to support arbitrary depth candidate list. ::
-
-        self.blocks = nn.Repeat(Block(), nn.ValueChoice([1, 3, 5]))
-    """
-
-    @classmethod
-    def create_fixed_module(cls,
-                            blocks: Union[Callable[[int], nn.Module],
-                                          List[Callable[[int], nn.Module]],
-                                          nn.Module,
-                                          List[nn.Module]],
-                            depth: Union[int, Tuple[int, int], ChoiceOf[int]], *, label: Optional[str] = None):
-        if isinstance(depth, tuple):
-            # we can't create a value choice here,
-            # otherwise we will have two value choices, one created here, another in init.
-            depth = get_fixed_value(label)
-
-        if isinstance(depth, int):
-            # if depth is a valuechoice, it should be already an int
-            result = nn.Sequential(*cls._replicate_and_instantiate(blocks, depth))
-
-            if hasattr(result, STATE_DICT_PY_MAPPING_PARTIAL):
-                # already has a mapping, will merge with it
-                prev_mapping = getattr(result, STATE_DICT_PY_MAPPING_PARTIAL)
-                setattr(result, STATE_DICT_PY_MAPPING_PARTIAL, {k: f'blocks.{v}' for k, v in prev_mapping.items()})
-            else:
-                setattr(result, STATE_DICT_PY_MAPPING_PARTIAL, {'__self__': 'blocks'})
-
-            return result
-
-        raise NoContextError(f'Not in fixed mode, or {depth} not an integer.')
-
-    def __init__(self,
-                 blocks: Union[Callable[[int], nn.Module],
-                               List[Callable[[int], nn.Module]],
-                               nn.Module,
-                               List[nn.Module]],
-                 depth: Union[int, Tuple[int, int], ChoiceOf[int]], *, label: Optional[str] = None):
-        super().__init__()
-
-        self._label = None  # by default, no label
-
-        if isinstance(depth, ValueChoiceX):
-            if label is not None:
-                warnings.warn(
-                    'In repeat, `depth` is already a ValueChoice, but `label` is still set. It will be ignored.',
-                    RuntimeWarning
-                )
-            self.depth_choice: Union[int, ChoiceOf[int]] = depth
-            all_values = list(self.depth_choice.all_options())
-            self.min_depth = min(all_values)
-            self.max_depth = max(all_values)
-
-            if isinstance(depth, ValueChoice):
-                self._label = depth.label  # if a leaf node
-
-        elif isinstance(depth, tuple):
-            self.min_depth = depth if isinstance(depth, int) else depth[0]
-            self.max_depth = depth if isinstance(depth, int) else depth[1]
-            self.depth_choice: Union[int, ChoiceOf[int]] = ValueChoice(list(range(self.min_depth, self.max_depth + 1)), label=label)
-            self._label = self.depth_choice.label
-
-        elif isinstance(depth, int):
-            self.min_depth = self.max_depth = depth
-            self.depth_choice: Union[int, ChoiceOf[int]] = depth
-        else:
-            raise TypeError(f'Unsupported "depth" type: {type(depth)}')
-        assert self.max_depth >= self.min_depth >= 0 and self.max_depth >= 1, f'Depth of {self.min_depth} to {self.max_depth} is invalid.'
-        self.blocks = nn.ModuleList(self._replicate_and_instantiate(blocks, self.max_depth))
-
-    @property
-    def label(self) -> Optional[str]:
-        return self._label
-
-    def forward(self, x):
-        for block in self.blocks:
-            x = block(x)
-        return x
-
-    @staticmethod
-    def _replicate_and_instantiate(blocks, repeat):
-        if not isinstance(blocks, list):
-            if isinstance(blocks, nn.Module):
-                blocks = [blocks if i == 0 else copy.deepcopy(blocks) for i in range(repeat)]
-            else:
-                blocks = [blocks for _ in range(repeat)]
-        assert repeat <= len(blocks), f'Not enough blocks to be used. {repeat} expected, only found {len(blocks)}.'
-        if repeat < len(blocks):
-            blocks = blocks[:repeat]
-        if len(blocks) > 0 and not isinstance(blocks[0], nn.Module):
-            blocks = [b(i) for i, b in enumerate(blocks)]
-        return blocks
-
-    def __getitem__(self, index):
-        # shortcut for blocks[index]
-        return self.blocks[index]
-
-    def __len__(self):
-        return self.max_depth
-
-
-class NasBench201Cell(nn.Module):
-    """
-    Cell structure that is proposed in NAS-Bench-201.
-
-    Proposed by `NAS-Bench-201: Extending the Scope of Reproducible Neural Architecture Search <https://arxiv.org/abs/2001.00326>`__.
-
-    This cell is a densely connected DAG with ``num_tensors`` nodes, where each node is a tensor.
-    For every i < j, there is an edge from i-th node to j-th node.
-    Each edge in this DAG is associated with an operation transforming the hidden state from the source node
-    to the target node. All possible operations are selected from a predefined operation set, defined in ``op_candidates``.
-    Each of the ``op_candidates`` should be a callable that accepts input dimension and output dimension,
-    and returns a ``Module``.
-
-    Input of this cell should be of shape :math:`[N, C_{in}, *]`, while output should be :math:`[N, C_{out}, *]`.
-
-    The space size of this cell would be :math:`|op|^{N(N-1)/2}`, where :math:`|op|` is the number of operation candidates,
-    and :math:`N` is defined by ``num_tensors``.
-
-    Parameters
-    ----------
-    op_candidates : list of callable
-        Operation candidates. Each should be a function that accepts input feature and output feature, and returns an ``nn.Module``.
-    in_features : int
-        Input dimension of cell.
-    out_features : int
-        Output dimension of cell.
-    num_tensors : int
-        Number of tensors in the cell (input included). Default: 4
-    label : str
-        Identifier of the cell. Cell sharing the same label will semantically share the same choice.
-    """
-
-    @staticmethod
-    def _make_dict(x):
-        if isinstance(x, list):
-            return OrderedDict([(str(i), t) for i, t in enumerate(x)])
-        return OrderedDict(x)
-
-    def __init__(self, op_candidates: Union[Dict[str, Callable[[int, int], nn.Module]], List[Callable[[int, int], nn.Module]]],
-                 in_features: int, out_features: int, num_tensors: int = 4,
-                 label: Optional[str] = None):
-        super().__init__()
-        self._label = generate_new_label(label)
-
-        self.layers = nn.ModuleList()
-        self.in_features = in_features
-        self.out_features = out_features
-        self.num_tensors = num_tensors
-
-        op_candidates = self._make_dict(op_candidates)
-
-        for tid in range(1, num_tensors):
-            node_ops = nn.ModuleList()
-            for j in range(tid):
-                inp = in_features if j == 0 else out_features
-                op_choices = OrderedDict([(key, cls(inp, out_features))
-                                          for key, cls in op_candidates.items()])
-                node_ops.append(LayerChoice(op_choices, label=f'{self._label}__{j}_{tid}'))  # put __ here to be compatible with base engine
-            self.layers.append(node_ops)
-
-    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
-        """
-        Each new tensor is the sum of the edge operations applied to all earlier tensors; the last tensor is returned.
-        It shouldn't be called directly by users in most cases.
-        """
-        tensors: List[torch.Tensor] = [inputs]
-        for layer in self.layers:
-            current_tensor: List[torch.Tensor] = []
-            for i, op in enumerate(layer):  # type: ignore
-                current_tensor.append(op(tensors[i]))  # type: ignore
-            tensors.append(torch.sum(torch.stack(current_tensor), 0))
-        return tensors[-1]
+from nni.nas.nn.pytorch.repeat import Repeat
+from nni.nas.nn.pytorch.cell import Cell
+from nni.nas.hub.pytorch.modules import NasBench101Cell, NasBench101Mutator, NasBench201Cell
--- a/nni/retiarii/nn/pytorch/hypermodule.py
+++ b/nni/retiarii/nn/pytorch/hypermodule.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-from __future__ import annotations
+# pylint: disable=wildcard-import,unused-wildcard-import,unused-import

-from packaging.version import Version
-import torch
-import torch.nn as nn
-
-from nni.retiarii.serializer import basic_unit
-
-from .api import LayerChoice
-from .mutation_utils import generate_new_label
-
-__all__ = ['AutoActivation']
-
-TorchVersion = '1.5.0'
-
-# ============== unary function modules ==============
-
-@basic_unit
-class UnaryIdentity(nn.Module):
-    def forward(self, x):
-        return x
-
-@basic_unit
-class UnaryNegative(nn.Module):
-    def forward(self, x):
-        return -x
-
-@basic_unit
-class UnaryAbs(nn.Module):
-    def forward(self, x):
-        return torch.abs(x)
-
-@basic_unit
-class UnarySquare(nn.Module):
-    def forward(self, x):
-        return torch.square(x)
-
-@basic_unit
-class UnaryPow(nn.Module):
-    def forward(self, x):
-        return torch.pow(x, 3)
-
-@basic_unit
-class UnarySqrt(nn.Module):
-    def forward(self, x):
-        return torch.sqrt(x)
-
-@basic_unit
-class UnaryMul(nn.Module):
-    def __init__(self):
-        super().__init__()
-        # element-wise for now, will change to per-channel trainable parameter
-        self.beta = torch.nn.Parameter(torch.tensor(1, dtype=torch.float32)) # pylint: disable=not-callable
-    def forward(self, x):
-        return x * self.beta
-
-@basic_unit
-class UnaryAdd(nn.Module):
-    def __init__(self):
-        super().__init__()
-        # element-wise for now, will change to per-channel trainable parameter
-        self.beta = torch.nn.Parameter(torch.tensor(1, dtype=torch.float32)) # pylint: disable=not-callable
-    def forward(self, x):
-        return x + self.beta
-
-@basic_unit
-class UnaryLogAbs(nn.Module):
-    def forward(self, x):
-        return torch.log(torch.abs(x) + 1e-7)
-
-@basic_unit
-class UnaryExp(nn.Module):
-    def forward(self, x):
-        return torch.exp(x)
-
-@basic_unit
-class UnarySin(nn.Module):
-    def forward(self, x):
-        return torch.sin(x)
-
-@basic_unit
-class UnaryCos(nn.Module):
-    def forward(self, x):
-        return torch.cos(x)
-
-@basic_unit
-class UnarySinh(nn.Module):
-    def forward(self, x):
-        return torch.sinh(x)
-
-@basic_unit
-class UnaryCosh(nn.Module):
-    def forward(self, x):
-        return torch.cosh(x)
-
-@basic_unit
-class UnaryTanh(nn.Module):
-    def forward(self, x):
-        return torch.tanh(x)
-
-if not Version(torch.__version__) >= Version(TorchVersion):
-    @basic_unit
-    class UnaryAsinh(nn.Module):
-        def forward(self, x):
-            return torch.asinh(x)
-
-@basic_unit
-class UnaryAtan(nn.Module):
-    def forward(self, x):
-        return torch.atan(x)
-
-if not Version(torch.__version__) >= Version(TorchVersion):
-    @basic_unit
-    class UnarySinc(nn.Module):
-        def forward(self, x):
-            return torch.sinc(x)
-
-@basic_unit
-class UnaryMax(nn.Module):
-    def forward(self, x):
-        return torch.max(x, torch.zeros_like(x))
-
-@basic_unit
-class UnaryMin(nn.Module):
-    def forward(self, x):
-        return torch.min(x, torch.zeros_like(x))
-
-@basic_unit
-class UnarySigmoid(nn.Module):
-    def forward(self, x):
-        return torch.sigmoid(x)
-
-@basic_unit
-class UnaryLogExp(nn.Module):
-    """Unary candidate op computing ``log(1 + exp(x))``."""
-    def forward(self, x):
-        # Written out literally; ``exp`` is evaluated on ``x`` before 1 is added.
-        return torch.log(1 + torch.exp(x))
-
-@basic_unit
-class UnaryExpSquare(nn.Module):
-    """Unary candidate op computing ``exp(-x ** 2)``."""
-    def forward(self, x):
-        return torch.exp(-torch.square(x))
-
-@basic_unit
-class UnaryErf(nn.Module):
-    """Unary candidate op computing ``torch.erf(x)``."""
-    def forward(self, x):
-        return torch.erf(x)
-
-# Class names (as strings) of the unary candidates; ``AutoActivation`` instantiates
-# each name to build the options of its unary layer choices.
-unary_modules = ['UnaryIdentity', 'UnaryNegative', 'UnaryAbs', 'UnarySquare', 'UnaryPow',
-    'UnarySqrt', 'UnaryMul', 'UnaryAdd', 'UnaryLogAbs', 'UnaryExp', 'UnarySin', 'UnaryCos',
-    'UnarySinh', 'UnaryCosh', 'UnaryTanh', 'UnaryAtan', 'UnaryMax',
-    'UnaryMin', 'UnarySigmoid', 'UnaryLogExp', 'UnaryExpSquare', 'UnaryErf']
-
-# Same gate as the one guarding the ``UnaryAsinh`` / ``UnarySinc`` class definitions,
-# so the two names are only listed when those classes exist.
-if not Version(torch.__version__) >= Version(TorchVersion):
-    unary_modules.append('UnaryAsinh')
-    unary_modules.append('UnarySinc')
-
-# ============== binary function modules ==============
-
-@basic_unit
-class BinaryAdd(nn.Module):
-    """Binary candidate op returning ``x[0] + x[1]``."""
-    def forward(self, x):
-        # ``x`` carries both operands, read at index 0 and index 1.
-        return x[0] + x[1]
-
-@basic_unit
-class BinaryMul(nn.Module):
-    """Binary candidate op returning ``x[0] * x[1]``."""
-    def forward(self, x):
-        # ``x`` carries both operands, read at index 0 and index 1.
-        return x[0] * x[1]
-
-@basic_unit
-class BinaryMinus(nn.Module):
-    """Binary candidate op returning ``x[0] - x[1]``."""
-    def forward(self, x):
-        # ``x`` carries both operands, read at index 0 and index 1.
-        return x[0] - x[1]
-
-@basic_unit
-class BinaryDivide(nn.Module):
-    """Binary candidate op returning ``x[0] / (x[1] + 1e-7)``."""
-    def forward(self, x):
-        # 1e-7 is added to the divisor, so a divisor of exactly 0 does not divide by zero.
-        return x[0] / (x[1] + 1e-7)
-
-@basic_unit
-class BinaryMax(nn.Module):
-    """Binary candidate op returning ``torch.max(x[0], x[1])``."""
-    def forward(self, x):
-        # ``x`` carries both operands, read at index 0 and index 1.
-        return torch.max(x[0], x[1])
-
-@basic_unit
-class BinaryMin(nn.Module):
-    """Binary candidate op returning ``torch.min(x[0], x[1])``."""
-    def forward(self, x):
-        # ``x`` carries both operands, read at index 0 and index 1.
-        return torch.min(x[0], x[1])
-
-@basic_unit
-class BinarySigmoid(nn.Module):
-    """Binary candidate op returning ``sigmoid(x[0]) * x[1]``."""
-    def forward(self, x):
-        # Only the first operand goes through the sigmoid; the second is used as-is.
-        return torch.sigmoid(x[0]) * x[1]
-
-@basic_unit
-class BinaryExpSquare(nn.Module):
-    """Binary candidate op returning ``exp(-beta * (x[0] - x[1]) ** 2)`` with a learnable ``beta``."""
-    def __init__(self):
-        super().__init__()
-        # ``beta`` is a zero-dimensional float32 parameter initialised to 1.
-        self.beta = torch.nn.Parameter(torch.tensor(1, dtype=torch.float32)) # pylint: disable=not-callable
-    def forward(self, x):
-        return torch.exp(-self.beta * torch.square(x[0] - x[1]))
-
-@basic_unit
-class BinaryExpAbs(nn.Module):
-    """Binary candidate op returning ``exp(-beta * |x[0] - x[1]|)`` with a learnable ``beta``."""
-    def __init__(self):
-        super().__init__()
-        # ``beta`` is a zero-dimensional float32 parameter initialised to 1.
-        self.beta = torch.nn.Parameter(torch.tensor(1, dtype=torch.float32)) # pylint: disable=not-callable
-    def forward(self, x):
-        return torch.exp(-self.beta * torch.abs(x[0] - x[1]))
-
-@basic_unit
-class BinaryParamAdd(nn.Module):
-    """Binary candidate op returning ``beta * x[0] + (1 - beta) * x[1]`` with a learnable ``beta``."""
-    def __init__(self):
-        super().__init__()
-        # ``beta`` is a zero-dimensional float32 parameter initialised to 1,
-        # so the initial output equals ``x[0]``.
-        self.beta = torch.nn.Parameter(torch.tensor(1, dtype=torch.float32)) # pylint: disable=not-callable
-    def forward(self, x):
-        return self.beta * x[0] + (1 - self.beta) * x[1]
-
-# Class names (as strings) of the binary candidates; ``AutoActivation`` instantiates
-# each name to build the options of its binary layer choices.
-binary_modules = ['BinaryAdd', 'BinaryMul', 'BinaryMinus', 'BinaryDivide', 'BinaryMax',
-    'BinaryMin', 'BinarySigmoid', 'BinaryExpSquare', 'BinaryExpAbs', 'BinaryParamAdd']
-
-
-class AutoActivation(nn.Module):
-    """
-    This module is an implementation of the paper `Searching for Activation Functions <https://arxiv.org/abs/1710.05941>`__.
-
-    A first unary layer choice is applied to the input. Each of the ``unit_num`` core units
-    then applies its own unary layer choice to the original input and merges the result with
-    the running output through a binary layer choice.
-
-    Parameters
-    ----------
-    unit_num : int
-        the number of core units
-    label : str or None
-        Prefix used for the labels of the inner layer choices
-        (``{label}__unary_{i}`` and ``{label}__binary_{i}``). It is passed through
-        ``generate_new_label`` before use.
-
-    Notes
-    -----
-    Current `beta` is not per-channel parameter.
-    """
-    def __init__(self, unit_num: int = 1, label: str | None = None):
-        super().__init__()
-        self._label = generate_new_label(label)
-        self.unaries = nn.ModuleList()
-        self.binaries = nn.ModuleList()
-        self.first_unary = LayerChoice(self._instantiate(unary_modules), label=f'{self.label}__unary_0')
-        for i in range(unit_num):
-            # index 0 is taken by ``first_unary``, hence the ``i + 1`` in the label
-            one_unary = LayerChoice(self._instantiate(unary_modules), label=f'{self.label}__unary_{i+1}')
-            self.unaries.append(one_unary)
-        for i in range(unit_num):
-            one_binary = LayerChoice(self._instantiate(binary_modules), label=f'{self.label}__binary_{i}')
-            self.binaries.append(one_binary)
-
-    @staticmethod
-    def _instantiate(names):
-        """Create one fresh instance of every class named in ``names``, in order."""
-        # Look the class names up in this module's namespace rather than ``eval``-ing
-        # a formatted string: same result for plain class names, no code evaluation.
-        namespace = globals()
-        return [namespace[name]() for name in names]
-
-    @property
-    def label(self):
-        """Label prefix shared by all inner layer choices."""
-        return self._label
-
-    def forward(self, x):
-        """Apply the first unary choice, then fold every (unary, binary) unit over the result."""
-        out = self.first_unary(x)
-        for unary, binary in zip(self.unaries, self.binaries):
-            # Binary ops read their two operands at index 0 and 1 of the stacked tensor.
-            out = binary(torch.stack([out, unary(x)]))
-        return out
+from nni.nas.hub.pytorch.modules import AutoActivation
--- a/nni/retiarii/nn/pytorch/mutation_utils.py
+++ b/nni/retiarii/nn/pytorch/mutation_utils.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-from typing import Any, Optional, Tuple, Union
+# pylint: disable=wildcard-import,unused-wildcard-import

-import torch.nn as nn
-from nni.retiarii.utils import NoContextError, ModelNamespace, get_current_context
-
-
-class Mutable(nn.Module):
-    """
-    Shared base class for PyTorch mutables; currently an implementation trick only.
-
-    It may later become the common base for layer choice, input choice and friends, but it is
-    not an interface: it merely bundles class/instance helpers. API developers should not rely
-    on ``isinstance(module, Mutable)`` to detect mutable modules until the design is finalized.
-    """
-
-    def __new__(cls, *args, **kwargs):
-        # With constructor arguments, first try to build the fixed (concrete) module.
-        if args or kwargs:
-            try:
-                return cls.create_fixed_module(*args, **kwargs)
-            except NoContextError:
-                # no fixed context available: fall through to a regular instance
-                pass
-        # Argument-less construction happens during copy/deepcopy;
-        # attributes are filled in afterwards through ``__dict__``.
-        return super().__new__(cls)
-
-    @classmethod
-    def create_fixed_module(cls, *args, **kwargs) -> Union[nn.Module, Any]:
-        """
-        Build a fixed module from the fixed dict.
-
-        When running inside a trial this is expected to succeed and return a concrete module
-        in place of the mutable. Implementations raise ``NoContextError`` when that is not possible.
-        """
-        raise NotImplementedError
-
-
-def generate_new_label(label: Optional[str]):
-    """Return ``label`` unchanged, or a fresh one from ``ModelNamespace.next_label()`` when it is ``None``."""
-    return ModelNamespace.next_label() if label is None else label
-
-
-def get_fixed_value(label: Optional[str]) -> Any:
-    """
-    Look up the value stored for ``label`` in the current ``'fixed'`` context.
-
-    ``label`` is first resolved through ``generate_new_label``, so ``None`` is replaced by a
-    generated label before the lookup.
-
-    Raises
-    ------
-    KeyError
-        If the resolved label is not present in the fixed context.
-    """
-    fixed = get_current_context('fixed')
-    resolved_label = generate_new_label(label)
-    try:
-        return fixed[resolved_label]
-    except KeyError:
-        # Report the label that was actually looked up (not a possibly-``None`` argument).
-        raise KeyError(f'Fixed context with {resolved_label} not found. Existing values are: {fixed}') from None
-
-
-def get_fixed_dict(label_prefix: Optional[str]) -> Tuple[str, Any]:
-    """
-    Collect every entry of the current ``'fixed'`` context whose key starts with ``label_prefix + '/'``.
-
-    ``label_prefix`` is first resolved through ``generate_new_label``.
-
-    Returns
-    -------
-    tuple
-        The resolved prefix and the dict of matching entries.
-
-    Raises
-    ------
-    KeyError
-        If no key in the fixed context starts with the resolved prefix.
-    """
-    fixed = get_current_context('fixed')
-    try:
-        label_prefix = generate_new_label(label_prefix)
-        # Keep the full context in ``fixed`` so the error below can list what exists.
-        matched = {k: v for k, v in fixed.items() if k.startswith(label_prefix + '/')}
-        if not matched:
-            raise KeyError
-        return label_prefix, matched
-    except KeyError:
-        raise KeyError(f'Fixed context with prefix {label_prefix} not found. Existing values are: {fixed}') from None
+from nni.nas.nn.pytorch.mutation_utils import *
--- a/nni/retiarii/nn/pytorch/mutator.py
+++ b/nni/retiarii/nn/pytorch/mutator.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-import inspect
-from typing import Any, List, Optional, Tuple, Dict, Iterator, Iterable, cast
+# pylint: disable=wildcard-import,unused-wildcard-import

-import torch.nn as nn
-
-from nni.common.serializer import is_traceable, is_wrapped_with_trace
-from nni.retiarii.graph import Cell, Graph, Model, ModelStatus, Node, Evaluator
-from nni.retiarii.mutator import Mutator
-from nni.retiarii.serializer import is_basic_unit, is_model_wrapped
-from nni.retiarii.utils import ModelNamespace, uid
-
-from .api import LayerChoice, InputChoice, ValueChoice, ValueChoiceX, Placeholder
-from .component import NasBench101Cell, NasBench101Mutator
-
-
-class LayerChoiceMutator(Mutator):
-    """Mutator that picks one candidate for all layer-choice nodes sharing a label."""
-
-    def __init__(self, nodes: List[Node]):
-        # the label is taken from the first node of the group
-        super().__init__(label=nodes[0].operation.parameters['label'])
-        self.nodes = nodes
-
-    def mutate(self, model):
-        picked_name = self.choice(self.nodes[0].operation.parameters['candidates'])
-        for node in self.nodes:
-            # Every layer choice maps to a cell graph that is left unconnected in the base graph.
-            # The connections are created here, so an already-mutated model must not be mutated
-            # again; everything is derived from the original base graph.
-            cell_graph = model.graphs[cast(Cell, node.operation).cell_name]
-            picked = cell_graph.get_node_by_name(picked_name)
-            assert picked is not None
-            # wire: cell input -> picked candidate -> cell output
-            cell_graph.add_edge((cell_graph.input_node, 0), (picked, None))
-            cell_graph.add_edge((picked, None), (cell_graph.output_node, None))
-            base_cell = cast(Cell, node.operation)
-            node_in_model = cast(Node, model.get_node_by_name(node.name))
-            node_in_model.update_operation(Cell(base_cell.cell_name))
-
-            # Drop the candidates that were not picked. Iterate over a snapshot because
-            # removing while iterating the live collection causes issues.
-            for candidate in list(cell_graph.hidden_nodes):
-                if candidate.name != picked.name:
-                    candidate.remove()
-
-
-class InputChoiceMutator(Mutator):
-    """Mutator that samples the chosen input indices for all input-choice nodes sharing a label."""
-
-    def __init__(self, nodes: List[Node]):
-        # the label is taken from the first node of the group
-        super().__init__(label=nodes[0].operation.parameters['label'])
-        self.nodes = nodes
-
-    def mutate(self, model):
-        n_candidates = self.nodes[0].operation.parameters['n_candidates']
-        n_chosen = self.nodes[0].operation.parameters['n_chosen']
-        indices = list(range(n_candidates))
-        if n_chosen is not None:
-            # a fixed number of picks, each drawn from all indices
-            chosen = [self.choice(indices) for _ in range(n_chosen)]
-        else:
-            # one keep/drop decision per candidate index
-            chosen = []
-            for index in indices:
-                if self.choice([False, True]):
-                    chosen.append(index)
-            # FIXME This is a hack to make choice align with the previous format
-            self._cur_samples = chosen
-        for node in self.nodes:
-            node_in_model = cast(Node, model.get_node_by_name(node.name))
-            chosen_params = {'chosen': chosen, 'reduction': node.operation.parameters['reduction']}
-            node_in_model.update_operation('__torch__.nni.retiarii.nn.pytorch.ChosenInputs', chosen_params)
-
-
-class ValueChoiceMutator(Mutator):
-    """Mutator that replaces every value-choice node of one label with the sampled constant."""
-
-    def __init__(self, nodes: List[Node], candidates: List[Any]):
-        # the first node serves as the representative for the label
-        group_label = nodes[0].operation.parameters['label']
-        super().__init__(label=group_label)
-        self.nodes = nodes
-        self.candidates = candidates
-
-    def mutate(self, model):
-        value = self.choice(self.candidates)
-        type_name = type(value).__name__
-        # Transformations need no handling here: they happen naturally in the forward loop.
-        for node in self.nodes:
-            node_in_model = cast(Node, model.get_node_by_name(node.name))
-            node_in_model.update_operation('prim::Constant', {'type': type_name, 'value': value})
-
-
-class ParameterChoiceLeafMutator(Mutator):
-    """Records one sampled value for a leaf ``ValueChoice`` label without touching the model."""
-    # mutate the leaf node (i.e., ValueChoice) of parameter choices
-    # should be used together with ParameterChoiceMutator
-
-    def __init__(self, candidates: List[Any], label: str):
-        super().__init__(label=label)
-        # values this leaf value choice may take
-        self.candidates = candidates
-
-    def mutate(self, model: Model) -> None:
-        # leave a record here
-        # real mutations will be done in ParameterChoiceMutator
-        # (the returned sample is discarded; ``model`` is not modified here)
-        self.choice(self.candidates)
-
-
-class ParameterChoiceMutator(Mutator):
-    """
-    Applies value choices that are used as parameters of basic units.
-
-    This is an "empty shell": it samples nothing itself. It reads the decisions recorded by the
-    ``ParameterChoiceLeafMutator`` instances that ran before it and computes every parameter value.
-    """
-
-    def __init__(self, nodes: List[Tuple[Node, str]]):
-        super().__init__()
-
-        self.nodes = nodes
-
-    def mutate(self, model: Model) -> None:
-        # label -> sampled value, e.g. {"label1": "cat", "label2": 123}
-        decisions = {
-            record.mutator.label: record.samples[0]
-            for record in model.history
-            if isinstance(record.mutator, ParameterChoiceLeafMutator)
-        }
-
-        for node, argname in self.nodes:
-            # ``argname`` locates the argument,
-            # e.g., Conv2d(out_channels=nn.ValueChoice([1, 2, 3])) => argname = "out_channels"
-            composed_choice: ValueChoiceX = node.operation.parameters[argname]
-
-            # gather the decided value of every leaf in the ValueChoiceX computation graph
-            leaf_values = [decisions[leaf.label] for leaf in composed_choice.inner_choices()]
-            evaluated = composed_choice.evaluate(leaf_values)
-
-            # write the result back through the graph mutation primitives
-            node_in_model = cast(Node, model.get_node_by_name(node.name))
-            node_in_model.update_operation(
-                node_in_model.operation.type,
-                {**node_in_model.operation.parameters, argname: evaluated}
-            )
-
-
-class RepeatMutator(Mutator):
-    """Mutator that truncates the chain of repeated blocks to the chosen depth."""
-
-    def __init__(self, nodes: List[Node]):
-        # ``nodes`` is a subgraph consisting of repeated blocks.
-        super().__init__(label=nodes[0].operation.parameters['label'])
-        self.nodes = nodes
-
-    def _retrieve_chain_from_graph(self, graph: Graph) -> List[Node]:
-        """Walk from the graph input to its output and return the nodes strictly in between."""
-        visited: List[Node] = []
-        current = graph.input_node
-        while current != graph.output_node:
-            if current != graph.input_node:
-                visited.append(current)
-            successors = current.successors
-            assert len(successors) == 1, f'This graph is an illegal chain. {current} has output {successors}.'
-            current = successors[0]
-        return visited
-
-    def mutate(self, model):
-        for node in self.nodes:
-            # Similar to layer choice: locate the cell graph attached to each node.
-            cell_graph: Graph = model.graphs[cast(Cell, node.operation).cell_name]
-            chain = self._retrieve_chain_from_graph(cell_graph)
-            # ``depth`` is a value choice in the base model,
-            # but a ParameterChoiceMutator has already resolved it by now.
-            node_in_model = cast(Node, model.get_node_by_name(node.name))
-            chosen_depth: int = node_in_model.operation.parameters['depth']
-            # detach the last kept block from its successor and connect it to the output
-            for edge in chain[chosen_depth - 1].outgoing_edges:
-                edge.remove()
-            cell_graph.add_edge((chain[chosen_depth - 1], None), (cell_graph.output_node, None))
-            # discard all blocks beyond the chosen depth
-            for surplus in chain[chosen_depth:]:
-                for edge in surplus.outgoing_edges:
-                    edge.remove()
-                surplus.remove()
-
-            # Replace the operation with a bare cell to delete the unused parameters.
-            refreshed = cast(Node, model.get_node_by_name(node.name))
-            base_cell = cast(Cell, node.operation)
-            refreshed.update_operation(Cell(base_cell.cell_name))
-
-
-def process_inline_mutation(model: Model) -> Optional[List[Mutator]]:
-    """
-    Build the list of mutators implied by the inline mutation primitives found in ``model``.
-
-    Mutators are appended in this order: input choices, value choices, one leaf mutator per
-    parameter-choice label followed by a single ``ParameterChoiceMutator``, layer choices,
-    and finally repeats.
-
-    Returns
-    -------
-    list of Mutator or None
-        The collected mutators, or ``None`` when no mutation primitive was found.
-    """
-    applied_mutators = []
-
-    # Input choices, grouped by label: one mutator per label.
-    ic_nodes = _group_by_label(model.get_nodes_by_type('__torch__.nni.retiarii.nn.pytorch.api.InputChoice'))
-    for node_list in ic_nodes:
-        assert _is_all_equal(map(lambda node: node.operation.parameters['n_candidates'], node_list)) and \
-            _is_all_equal(map(lambda node: node.operation.parameters['n_chosen'], node_list)), \
-            'Input choice with the same label must have the same number of candidates.'
-        mutator = InputChoiceMutator(node_list)
-        applied_mutators.append(mutator)
-
-    # Value choices that appear as graph nodes, grouped by label.
-    vc_nodes = _group_by_label(model.get_nodes_by_type('__torch__.nni.retiarii.nn.pytorch.api.ValueChoice'))
-    for node_list in vc_nodes:
-        assert _is_all_equal(map(lambda node: node.operation.parameters['candidates'], node_list)), \
-            'Value choice with the same label must have the same candidates.'
-        mutator = ValueChoiceMutator(node_list, node_list[0].operation.parameters['candidates'])
-        applied_mutators.append(mutator)
-
-    # `pc_nodes` are arguments of basic units. They can be compositions.
-    pc_nodes: List[Tuple[Node, str, ValueChoiceX]] = []
-    for node in model.get_nodes():
-        # arguments used in operators like Conv2d
-        # argument `valuechoice` used in generated repeat cell
-        for name, choice in node.operation.parameters.items():
-            if isinstance(choice, ValueChoiceX):
-                # e.g., (conv_node, "out_channels", ValueChoice([1, 3]))
-                pc_nodes.append((node, name, choice))
-
-    # Break `pc_nodes` down to leaf value choices. They should be what we want to sample.
-    leaf_value_choices: Dict[str, List[Any]] = {}
-    for _, __, choice in pc_nodes:
-        for inner_choice in choice.inner_choices():
-            if inner_choice.label not in leaf_value_choices:
-                leaf_value_choices[inner_choice.label] = inner_choice.candidates
-            else:
-                # a label seen before must come with an identical candidate list
-                assert leaf_value_choices[inner_choice.label] == inner_choice.candidates, \
-                    'Value choice with the same label must have the same candidates, but found ' \
-                    f'{leaf_value_choices[inner_choice.label]} vs. {inner_choice.candidates}'
-
-    for label, candidates in leaf_value_choices.items():
-        applied_mutators.append(ParameterChoiceLeafMutator(candidates, label))
-
-    # in the end, add another parameter choice mutator for "real" mutations
-    if pc_nodes:
-        applied_mutators.append(ParameterChoiceMutator([(node, name) for node, name, _ in pc_nodes]))
-
-    # apply layer choice at last as it will delete some nodes
-    lc_nodes = _group_by_label(filter(lambda d: d.operation.parameters.get('mutation') == 'layerchoice',
-                                      model.get_nodes_by_type('_cell')))
-    for node_list in lc_nodes:
-        assert _is_all_equal(map(lambda node: len(node.operation.parameters['candidates']), node_list)), \
-            'Layer choice with the same label must have the same number of candidates.'
-        mutator = LayerChoiceMutator(node_list)
-        applied_mutators.append(mutator)
-
-    # Repeat cells come after layer choices; they are also `_cell` nodes, tagged 'repeat'.
-    repeat_nodes = _group_by_label(filter(lambda d: d.operation.parameters.get('mutation') == 'repeat',
-                                          model.get_nodes_by_type('_cell')))
-    for node_list in repeat_nodes:
-        # this check is not completely reliable, because it only checks max and min
-        assert _is_all_equal(map(lambda node: node.operation.parameters['max_depth'], node_list)) and \
-            _is_all_equal(map(lambda node: node.operation.parameters['min_depth'], node_list)), \
-            'Repeat with the same label must have the same candidates.'
-        mutator = RepeatMutator(node_list)
-        applied_mutators.append(mutator)
-
-    # An empty list is reported as ``None`` rather than ``[]``.
-    if applied_mutators:
-        return applied_mutators
-    return None
-
-
-# The following are written for pure-python mode
-
-
-class ManyChooseManyMutator(Mutator):
-    """
-    Label-based chooser. It only records samples; the model itself is left untouched.
-    """
-
-    def __init__(self, label: str):
-        super().__init__(label=label)
-
-    @staticmethod
-    def candidates(node):
-        """Return the candidate list of ``node``: indices if it declares ``n_candidates``, else its ``candidates``."""
-        params = node.operation.parameters
-        if 'n_candidates' not in params:
-            return params['candidates']
-        return list(range(params['n_candidates']))
-
-    @staticmethod
-    def number_of_chosen(node):
-        """Return the node's ``n_chosen`` parameter, defaulting to 1 when it declares none."""
-        params = node.operation.parameters
-        return params['n_chosen'] if 'n_chosen' in params else 1
-
-    def mutate(self, model: Model) -> None:
-        # This mutation has no effect on the model, but is recorded in the mutation history.
-        # Only the first node carrying this label is consulted.
-        node = next(iter(model.get_nodes_by_label(self.label)), None)
-        if node is None:
-            return
-        n_chosen = self.number_of_chosen(node)
-        if n_chosen is not None:
-            for _ in range(n_chosen):
-                self.choice(self.candidates(node))
-            return
-        kept = [i for i in self.candidates(node) if self.choice([False, True])]
-        # FIXME This is a hack to make choice align with the previous format
-        # For example, it will convert [False, True, True] into [1, 2].
-        self._cur_samples = kept
-
-
-def extract_mutation_from_pt_module(pytorch_model: nn.Module) -> Tuple[Model, Optional[List[Mutator]]]:
-    """
-    Scan a PyTorch module for mutation primitives and describe them as a frozen ``Model`` plus mutators.
-
-    Every primitive found is registered as a node (carrying its label) in a graph named ``_model``;
-    nodes are then grouped by label and type, and one mutator is created per group.
-
-    Returns
-    -------
-    tuple
-        The model and its mutators; the mutator entry is ``None`` when the graph has no hidden nodes.
-
-    Raises
-    ------
-    ValueError
-        If the model class takes init parameters but the instance is not model-wrapped.
-    NotImplementedError
-        If a ``Placeholder`` module is encountered.
-    """
-    model = Model(_internal=True)
-    graph = Graph(model, uid(), '_model', _internal=True)._register()
-    model.python_class = pytorch_model.__class__
-    # more than one parameter means ``__init__`` takes something besides ``self``
-    if len(inspect.signature(model.python_class.__init__).parameters) > 1:
-        if not is_model_wrapped(pytorch_model):
-            raise ValueError('Please annotate the model with @model_wrapper decorator in python execution mode '
-                             'if your model has init parameters.')
-        model.python_init_params = cast(dict, pytorch_model.trace_kwargs)
-    else:
-        model.python_init_params = {}
-
-    # hyper-parameter choice
-    namespace: ModelNamespace = cast(ModelNamespace, pytorch_model._model_namespace)
-    for param_spec in namespace.parameter_specs:
-        assert param_spec.categorical and param_spec.type == 'choice'
-        node = graph.add_node(f'param_spec_{param_spec.name}', 'ModelParameterChoice', {'candidates': param_spec.values})
-        node.label = param_spec.name
-
-    for name, module in pytorch_model.named_modules():
-        # tricky case: value choice that serves as parameters are stored in traced arguments
-        if is_basic_unit(module):
-            trace_kwargs = cast(Dict[str, Any], module.trace_kwargs)
-            for key, value in trace_kwargs.items():
-                if isinstance(value, ValueChoiceX):
-                    # one node per leaf choice of the (possibly composed) value choice
-                    for i, choice in enumerate(value.inner_choices()):
-                        node = graph.add_node(f'{name}.init.{key}.{i}', 'ValueChoice', {'candidates': choice.candidates})
-                        node.label = choice.label
-
-        if isinstance(module, (LayerChoice, InputChoice, ValueChoice)):
-            # TODO: check the label of module and warn if it's auto-generated
-            pass
-        # The checks below are independent ``if``s, not an ``elif`` chain: each is tested for every module.
-        if isinstance(module, LayerChoice):
-            node = graph.add_node(name, 'LayerChoice', {'candidates': module.names})
-            node.label = module.label
-        if isinstance(module, InputChoice):
-            node = graph.add_node(name, 'InputChoice',
-                                  {'n_candidates': module.n_candidates, 'n_chosen': module.n_chosen})
-            node.label = module.label
-        if isinstance(module, ValueChoiceX):
-            for i, choice in enumerate(module.inner_choices()):
-                node = graph.add_node(f'{name}.{i}', 'ValueChoice', {'candidates': choice.candidates})
-                node.label = choice.label
-        if isinstance(module, NasBench101Cell):
-            node = graph.add_node(name, 'NasBench101Cell', {
-                'max_num_edges': module.max_num_edges
-            })
-            node.label = module.label
-        if isinstance(module, Placeholder):
-            raise NotImplementedError('Placeholder is not supported in python execution mode.')
-
-    model.status = ModelStatus.Frozen
-    # nothing was registered above: the model has no mutation primitives
-    if not graph.hidden_nodes:
-        return model, None
-
-    mutators = []
-    mutators_final = []
-    for nodes in _group_by_label_and_type(graph.hidden_nodes):
-        label = nodes[0].label
-        assert label is not None, f'label of {nodes[0]} can not be None.'
-        assert _is_all_equal(map(lambda n: n.operation.type, nodes)), \
-            f'Node with label "{label}" does not all have the same type.'
-        assert _is_all_equal(map(lambda n: n.operation.parameters, nodes)), \
-            f'Node with label "{label}" does not agree on parameters.'
-        if nodes[0].operation.type == 'NasBench101Cell':
-            # The mutation of Nas-bench-101 is special, and has to be done lastly.
-            mutators_final.append(NasBench101Mutator(label))
-        else:
-            mutators.append(ManyChooseManyMutator(label))
-    # regular mutators first, NAS-Bench-101 mutators at the end
-    return model, mutators + mutators_final
-
-
-# mutations for evaluator
-
-class EvaluatorValueChoiceLeafMutator(Mutator):
-    """Records one sampled value for a leaf value choice used in the evaluator's arguments."""
-    # see "ParameterChoiceLeafMutator"
-    # works in the same way
-
-    def __init__(self, candidates: List[Any], label: str):
-        super().__init__(label=label)
-        # values this leaf value choice may take
-        self.candidates = candidates
-
-    def mutate(self, model: Model) -> None:
-        # leave a record here
-        # real mutations will be done in EvaluatorValueChoiceMutator,
-        # which reads these records back from ``model.history``
-        self.choice(self.candidates)
-
-
-class EvaluatorValueChoiceMutator(Mutator):
-    """
-    Applies the recorded evaluator value-choice decisions to ``model.evaluator``.
-
-    Works like ``ParameterChoiceMutator``; a single instance is enough per model/evaluator.
-    """
-
-    def _mutate_traceable_object(self, obj: Any, value_choice_decisions: Dict[str, Any]) -> Any:
-        """Return ``obj`` with its value-choice arguments resolved, or ``obj`` itself if nothing changed."""
-        if not _is_traceable_object(obj):
-            return obj
-
-        replacements = {}
-
-        # For every argument that is a composition of value choices, look up the decision
-        # of each leaf value choice and compute the final argument value.
-        for arg_name, arg_value in obj.trace_kwargs.items():
-            if isinstance(arg_value, ValueChoiceX):
-                leaf_values = [value_choice_decisions[leaf.label] for leaf in arg_value.inner_choices()]
-                replacements[arg_name] = arg_value.evaluate(leaf_values)
-            elif is_traceable(arg_value):
-                # descend into nested traceable arguments
-                nested = self._mutate_traceable_object(arg_value, value_choice_decisions)
-                if nested is not arg_value:  # only record it when something was mutated
-                    replacements[arg_name] = nested
-
-        if not replacements:
-            return obj
-
-        clone = obj.trace_copy()                    # work on a copy
-        clone.trace_kwargs.update(replacements)     # apply the resolved arguments
-        return clone.get()                          # instantiate the full mutated object
-
-    def mutate(self, model: Model) -> None:
-        decisions = {
-            record.mutator.label: record.samples[0]
-            for record in model.history
-            if isinstance(record.mutator, EvaluatorValueChoiceLeafMutator)
-        }
-
-        model.evaluator = self._mutate_traceable_object(model.evaluator, decisions)
-
-
-def process_evaluator_mutations(evaluator: Evaluator, existing_mutators: List[Mutator]) -> List[Mutator]:
-    """
-    Create the mutators for every value choice found in the (nested) trace kwargs of ``evaluator``.
-
-    Parameters
-    ----------
-    evaluator
-        The evaluator to inspect. If it is not a traceable object, no mutators are created.
-    existing_mutators
-        Mutators already present, e.g. those generated from the model. Their labels must not
-        collide with the labels of the evaluator's value choices.
-
-    Returns
-    -------
-    list of Mutator
-        One ``EvaluatorValueChoiceLeafMutator`` per distinct label, followed by a single
-        ``EvaluatorValueChoiceMutator`` that applies the decisions; empty if there is nothing to mutate.
-
-    Raises
-    ------
-    ValueError
-        If a label is shared with ``existing_mutators``, or if two evaluator value choices
-        share a label but differ in their candidates.
-    """
-    if not _is_traceable_object(evaluator):
-        return []
-    # label -> candidate list of the value choice carrying that label
-    mutator_candidates = {}
-    for param in _expand_nested_trace_kwargs(evaluator):
-        if not isinstance(param, ValueChoiceX):
-            continue
-        for choice in param.inner_choices():
-            # sharing a choice between model and evaluator is rejected outright
-            if any(mutator.label == choice.label for mutator in existing_mutators):
-                raise ValueError(
-                    f'Found duplicated labels “{choice.label}”. When two value choices have the same name, '
-                    'they would share choices. However, sharing choices between model and evaluator is not supported.'
-                )
-            # duplicate labels inside the evaluator are merged, provided the candidates agree
-            if choice.label in mutator_candidates and mutator_candidates[choice.label] != choice.candidates:
-                raise ValueError(
-                    f'Duplicate labels for evaluator ValueChoice {choice.label}. They should share choices. '
-                    f'But their candidate list is not equal: {mutator_candidates[choice.label]} vs. {choice.candidates}'
-                )
-            mutator_candidates[choice.label] = choice.candidates
-    mutators = [
-        EvaluatorValueChoiceLeafMutator(candidates, label)
-        for label, candidates in mutator_candidates.items()
-    ]
-    if mutators:
-        # one last mutator to actually apply the mutations
-        mutators.append(EvaluatorValueChoiceMutator())
-    return mutators
-
-
-# the following are written for one-shot mode
-# they shouldn't technically belong here, but all other engines are written here
-# let's refactor later
-
-def process_oneshot_mutations(base_model: nn.Module, evaluator: Evaluator):
-    """Wrap ``base_model`` in an internal ``Model`` and return it with an empty mutator list."""
-    # Wrapping a `base_model` and `evaluator` into a graph.Model is not intuitive at all
-    # (actually very hacky), but it is the interface strategies require.
-    wrapper = Model(_internal=True)
-    wrapper.python_object = base_model
-    # The evaluator is deliberately not set here: it is assigned after this function returns.
-
-    return wrapper, []
-
-
-# utility functions
-
-
-def _is_all_equal(lst):
-    """
-    Return ``True`` when every pair of consecutive items in ``lst`` compares equal.
-
-    ``lst`` may be any iterable (it is consumed once). Empty and single-item inputs are
-    trivially all-equal. ``None`` is treated as an ordinary value, so ``[None, 1]`` is
-    reported as not all-equal.
-    """
-    # A private sentinel marks "no previous item yet"; ``None`` cannot be used for that
-    # because it is a legitimate item (e.g. an ``n_chosen`` of ``None``).
-    unset = object()
-    previous = unset
-    for item in lst:
-        if previous is not unset and previous != item:
-            return False
-        previous = item
-    return True
-
-
-def _group_by_label_and_type(nodes: Iterable[Node]) -> List[List[Node]]:
-    """Bucket ``nodes`` by ``(label, operation type)``; groups keep first-seen order."""
-    buckets = {}
-    for node in nodes:
-        buckets.setdefault((node.label, node.operation.type), []).append(node)
-    return [group for group in buckets.values()]
-
-
-def _group_by_label(nodes: Iterable[Node]) -> List[List[Node]]:
-    """Bucket ``nodes`` by the ``'label'`` entry of their operation parameters; groups keep first-seen order."""
-    buckets = {}
-    for node in nodes:
-        buckets.setdefault(node.operation.parameters['label'], []).append(node)
-    return [group for group in buckets.values()]
-
-
-def _expand_nested_trace_kwargs(obj: Any) -> Iterator[Any]:
-    """Yield each value of ``obj.trace_kwargs``, descending into values that are traceable objects themselves."""
-    if not _is_traceable_object(obj):
-        return
-    for value in obj.trace_kwargs.values():
-        # the value itself comes first, then whatever is nested inside it
-        yield value
-        yield from _expand_nested_trace_kwargs(value)
-
-
-def _is_traceable_object(obj: Any) -> bool:
-    """Tell whether ``obj`` passes ``is_traceable`` without passing ``is_wrapped_with_trace``."""
-    # Is it a traceable "object" (not class)?
-    return is_traceable(obj) and not is_wrapped_with_trace(obj)
+from nni.nas.nn.pytorch.mutator import *
--- a/nni/retiarii/nn/pytorch/nasbench101.py
+++ b/nni/retiarii/nn/pytorch/nasbench101.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-import logging
-from collections import OrderedDict
-from typing import Callable, List, Optional, Union, Dict, Tuple, cast
+# pylint: disable=wildcard-import,unused-wildcard-import

-import numpy as np
-import torch
-import torch.nn as nn
-
-from nni.retiarii.mutator import InvalidMutation, Mutator
-from nni.retiarii.graph import Model
-from .api import InputChoice, ValueChoice, LayerChoice
-from .mutation_utils import Mutable, generate_new_label, get_fixed_dict
-
-_logger = logging.getLogger(__name__)
-
-
-def compute_vertex_channels(input_channels, output_channels, matrix):
-    """
-    This is (almost) copied from the original NAS-Bench-101 implementation.
-
-    Computes the number of channels at every vertex.
-
-    Given the input channels and output channels, this calculates the number of channels at each interior vertex.
-    Interior vertices have the same number of channels as the max of the channels of the vertices it feeds into.
-    The output channels are divided amongst the vertices that are directly connected to it.
-    When the division is not even, some vertices may receive an extra channel to compensate.
-
-    Parameters
-    ----------
-    input_channels : int
-        input channels count.
-    output_channels : int
-        output channel count.
-    matrix : np.ndarray
-        adjacency matrix for the module (pruned by model_spec).
-
-    Returns
-    -------
-    list of int
-        list of channel counts, in order of the vertices.
-    """
-
-    num_vertices = np.shape(matrix)[0]
-
-    # Vertex 0 is the input and the last vertex is the output; both counts are given.
-    vertex_channels = [0] * num_vertices
-    vertex_channels[0] = input_channels
-    vertex_channels[num_vertices - 1] = output_channels
-
-    if num_vertices == 2:
-        # Edge case where module only has input and output vertices
-        return vertex_channels
-
-    # Compute the in-degree ignoring input, axis 0 is the src vertex and axis 1 is
-    # the dst vertex. Summing over 0 gives the in-degree count of each vertex.
-    in_degree = np.sum(matrix[1:], axis=0)
-    # Even share of the output channels per vertex feeding the output, plus the leftover.
-    interior_channels = output_channels // in_degree[num_vertices - 1]
-    correction = output_channels % in_degree[num_vertices - 1]  # Remainder to add
-
-    # Set channels of vertices that flow directly to output
-    for v in range(1, num_vertices - 1):
-        if matrix[v, num_vertices - 1]:
-            vertex_channels[v] = interior_channels
-            # hand out the remainder one channel at a time, lowest vertex index first
-            if correction:
-                vertex_channels[v] += 1
-                correction -= 1
-
-    # Set channels for all other vertices to the max of the out edges, going backwards.
-    # (num_vertices - 2) index skipped because it only connects to output.
-    for v in range(num_vertices - 3, 0, -1):
-        if not matrix[v, num_vertices - 1]:
-            for dst in range(v + 1, num_vertices - 1):
-                if matrix[v, dst]:
-                    vertex_channels[v] = max(vertex_channels[v], vertex_channels[dst])
-        assert vertex_channels[v] > 0
-
-    _logger.debug('vertex_channels: %s', str(vertex_channels))
-
-    # Sanity check, verify that channels never increase and final channels add up.
-    final_fan_in = 0
-    for v in range(1, num_vertices - 1):
-        if matrix[v, num_vertices - 1]:
-            final_fan_in += vertex_channels[v]
-        for dst in range(v + 1, num_vertices - 1):
-            if matrix[v, dst]:
-                assert vertex_channels[v] >= vertex_channels[dst]
-    assert final_fan_in == output_channels or num_vertices == 2
-    # num_vertices == 2 means only input/output nodes, so 0 fan-in
-
-    return vertex_channels
-
-
-def prune(matrix, ops) -> Tuple[np.ndarray, List[Union[str, Callable[[int], nn.Module]]]]:
-    """
-    Prune the extraneous parts of the graph.
-
-    General procedure:
-
-    1. Remove parts of graph not connected to input.
-    2. Remove parts of graph not connected to output.
-    3. Reorder the vertices so that they are consecutive after steps 1 and 2.
-
-    These 3 steps can be combined by deleting the rows and columns of the
-    vertices that are not reachable from both the input and output (in reverse).
-    """
-    num_vertices = np.shape(matrix)[0]
-
-    # calculate the connection matrix within V number of steps.
-    connections = np.linalg.matrix_power(matrix + np.eye(num_vertices), num_vertices)
-
-    visited_from_input = set([i for i in range(num_vertices) if connections[0, i]])
-    visited_from_output = set([i for i in range(num_vertices) if connections[i, -1]])
-
-    # Any vertex that isn't connected to both input and output is extraneous to the computation graph.
-    extraneous = set(range(num_vertices)).difference(
-        visited_from_input.intersection(visited_from_output))
-
-    if len(extraneous) > num_vertices - 2:
-        raise InvalidMutation('Non-extraneous graph is less than 2 vertices, '
-                              'the input is not connected to the output and the spec is invalid.')
-
-    matrix = np.delete(matrix, list(extraneous), axis=0)
-    matrix = np.delete(matrix, list(extraneous), axis=1)
-    for index in sorted(extraneous, reverse=True):
-        del ops[index]
-    return matrix, ops
-
-
-def truncate(inputs, channels):
-    input_channels = inputs.size(1)
-    if input_channels < channels:
-        raise ValueError('input channel < output channels for truncate')
-    elif input_channels == channels:
-        return inputs   # No truncation necessary
-    else:
-        # Truncation should only be necessary when channel division leads to
-        # vertices with +1 channels. The input vertex should always be projected to
-        # the minimum channel count.
-        assert input_channels - channels == 1
-        return inputs[:, :channels]
-
-
-class _NasBench101CellFixed(nn.Module):
-    """
-    The fixed version of NAS-Bench-101 Cell, used in python-version execution engine.
-    """
-
-    def __init__(self, operations: List[Callable[[int], nn.Module]],
-                 adjacency_list: List[List[int]],
-                 in_features: int, out_features: int, num_nodes: int,
-                 projection: Callable[[int, int], nn.Module]):
-        super().__init__()
-
-        assert num_nodes == len(operations) + 2 == len(adjacency_list) + 1
-
-        raw_operations: List[Union[str, Callable[[int], nn.Module]]] = list(operations)
-        del operations  # operations is no longer needed. Delete it to avoid misuse
-
-        # add pseudo nodes
-        raw_operations.insert(0, 'IN')
-        raw_operations.append('OUT')
-
-        self.connection_matrix = self.build_connection_matrix(adjacency_list, num_nodes)
-        del num_nodes  # raw number of nodes is no longer used
-
-        self.connection_matrix, self.operations = prune(self.connection_matrix, raw_operations)
-
-        self.hidden_features = compute_vertex_channels(in_features, out_features, self.connection_matrix)
-
-        self.num_nodes = len(self.connection_matrix)
-        self.in_features = in_features
-        self.out_features = out_features
-        _logger.info('Prund number of nodes: %d', self.num_nodes)
-        _logger.info('Pruned connection matrix: %s', str(self.connection_matrix))
-
-        self.projections = nn.ModuleList([nn.Identity()])
-        self.ops = nn.ModuleList([nn.Identity()])
-        for i in range(1, self.num_nodes):
-            self.projections.append(projection(in_features, self.hidden_features[i]))
-
-        for i in range(1, self.num_nodes - 1):
-            operation = cast(Callable[[int], nn.Module], self.operations[i])
-            self.ops.append(operation(self.hidden_features[i]))
-
-    @staticmethod
-    def build_connection_matrix(adjacency_list, num_nodes):
-        adjacency_list = [[]] + adjacency_list  # add adjacency for first node
-        connections = np.zeros((num_nodes, num_nodes), dtype='int')
-        for i, lst in enumerate(adjacency_list):
-            assert all([0 <= k < i for k in lst])
-            for k in lst:
-                connections[k, i] = 1
-        return connections
-
-    def forward(self, inputs):
-        tensors = [inputs]
-        for t in range(1, self.num_nodes - 1):
-
-            # Create interior connections, truncating if necessary
-            add_in = [truncate(tensors[src], self.hidden_features[t])
-                      for src in range(1, t) if self.connection_matrix[src, t]]
-
-            # Create add connection from projected input
-            if self.connection_matrix[0, t]:
-                add_in.append(self.projections[t](tensors[0]))
-
-            if len(add_in) == 1:
-                vertex_input = add_in[0]
-            else:
-                vertex_input = sum(add_in)
-
-            # Perform op at vertex t
-            vertex_out = self.ops[t](vertex_input)
-            tensors.append(vertex_out)
-
-        # Construct final output tensor by concatenating all fan-in and adding input.
-        if np.sum(self.connection_matrix[:, -1]) == 1:
-            src = np.where(self.connection_matrix[:, -1] == 1)[0][0]
-            return self.projections[-1](tensors[0]) if src == 0 else tensors[src]
-
-        outputs = torch.cat([tensors[src] for src in range(1, self.num_nodes - 1) if self.connection_matrix[src, -1]], 1)
-        if self.connection_matrix[0, -1]:
-            outputs += self.projections[-1](tensors[0])
-        assert outputs.size(1) == self.out_features
-        return outputs
-
-
-class NasBench101Cell(Mutable):
-    """
-    Cell structure that is proposed in NAS-Bench-101.
-
-    Proposed by `NAS-Bench-101: Towards Reproducible Neural Architecture Search <http://proceedings.mlr.press/v97/ying19a/ying19a.pdf>`__.
-
-    This cell is usually used in evaluation of NAS algorithms because there is a "comprehensive analysis" of this search space
-    available, which includes a full architecture-dataset that "maps 423k unique architectures to metrics
-    including run time and accuracy". You can also use the space in your own space design, in which scenario it should be possible
-    to leverage results in the benchmark to narrow the huge space down to a few efficient architectures.
-
-    The space of this cell architecture consists of all possible directed acyclic graphs on no more than ``max_num_nodes`` nodes,
-    where each possible node (other than IN and OUT) has one of ``op_candidates``, representing the corresponding operation.
-    Edges connecting the nodes can be no more than ``max_num_edges``.
-    To align with the paper settings, the two vertices specially labeled as operations IN and OUT are also counted into
-    ``max_num_nodes`` in our implementation. The default value of ``max_num_nodes`` is 7 and that of ``max_num_edges`` is 9.
-
-    Input of this cell should be of shape :math:`[N, C_{in}, *]`, while output should be :math:`[N, C_{out}, *]`. The shape
-    of each hidden node will first be computed automatically, depending on the cell structure. Each of the ``op_candidates``
-    should be a callable that accepts computed ``num_features`` and returns a ``Module``. For example,
-
-    .. code-block:: python
-
-        def conv_bn_relu(num_features):
-            return nn.Sequential(
-                nn.Conv2d(num_features, num_features, 1),
-                nn.BatchNorm2d(num_features),
-                nn.ReLU()
-            )
-
-    The output of each node is the result of applying its operation to the sum of its input nodes, except for the last node
-    (output node), whose output is the concatenation of its input *hidden* nodes, plus the *IN* node (if IN and OUT are connected).
-
-    When input tensor is added with any other tensor, there could be shape mismatch. Therefore, a projection transformation
-    is needed to transform the input tensor. In the paper, this is simply a Conv1x1 followed by BN and ReLU. The ``projection``
-    parameter accepts ``in_features`` and ``out_features`` and returns a ``Module``. This parameter has no default value,
-    as we hold no assumption that users are dealing with images. An example for this parameter is,
-
-    .. code-block:: python
-
-        def projection_fn(in_features, out_features):
-            return nn.Conv2d(in_features, out_features, 1)
-
-    Parameters
-    ----------
-    op_candidates : list of callable
-        Operation candidates. Each should be a function that accepts the number of features and returns an nn.Module.
-    in_features : int
-        Input dimension of cell.
-    out_features : int
-        Output dimension of cell.
-    projection : callable
-        Projection module that is used to preprocess the input tensor of the whole cell.
-        A callable that accepts the input and output feature counts and returns an nn.Module.
-    max_num_nodes : int
-        Maximum number of nodes in the cell, input and output included. At least 2. Default: 7.
-    max_num_edges : int
-        Maximum number of edges in the cell. Default: 9.
-    label : str
-        Identifier of the cell. Cell sharing the same label will semantically share the same choice.
-
-    Warnings
-    --------
-    :class:`NasBench101Cell` is not supported in :ref:`graph-based execution engine <graph-based-execution-engine>`.
-    """
-
-    @staticmethod
-    def _make_dict(x):
-        if isinstance(x, list):
-            return OrderedDict([(str(i), t) for i, t in enumerate(x)])
-        return OrderedDict(x)
-
-    @classmethod
-    def create_fixed_module(cls, op_candidates: Union[Dict[str, Callable[[int], nn.Module]], List[Callable[[int], nn.Module]]],
-                            in_features: int, out_features: int, projection: Callable[[int, int], nn.Module],
-                            max_num_nodes: int = 7, max_num_edges: int = 9, label: Optional[str] = None):
-        def make_list(x): return x if isinstance(x, list) else [x]
-
-        label, selected = get_fixed_dict(label)
-        op_candidates = cls._make_dict(op_candidates)
-        num_nodes = selected[f'{label}/num_nodes']
-        adjacency_list = [make_list(selected[f'{label}/input{i}']) for i in range(1, num_nodes)]
-        if sum([len(e) for e in adjacency_list]) > max_num_edges:
-            raise InvalidMutation(f'Expected {max_num_edges} edges, found: {adjacency_list}')
-        return _NasBench101CellFixed(
-            [op_candidates[selected[f'{label}/op{i}']] for i in range(1, num_nodes - 1)],
-            adjacency_list, in_features, out_features, num_nodes, projection)
-
-        # FIXME: weight inheritance on nasbench101 is not supported yet
-
-    def __init__(self, op_candidates: Union[Dict[str, Callable[[int], nn.Module]], List[Callable[[int], nn.Module]]],
-                 in_features: int, out_features: int, projection: Callable[[int, int], nn.Module],
-                 max_num_nodes: int = 7, max_num_edges: int = 9, label: Optional[str] = None):
-
-        super().__init__()
-        self._label = generate_new_label(label)
-        num_vertices_prior = [2 ** i for i in range(2, max_num_nodes + 1)]
-        num_vertices_prior = (np.array(num_vertices_prior) / sum(num_vertices_prior)).tolist()
-        self.num_nodes = ValueChoice(list(range(2, max_num_nodes + 1)),
-                                     prior=num_vertices_prior,
-                                     label=f'{self._label}/num_nodes')
-        self.max_num_nodes = max_num_nodes
-        self.max_num_edges = max_num_edges
-
-        op_candidates = self._make_dict(op_candidates)
-
-        # this is only for input validation and instantiating enough layer choice and input choice
-        self.hidden_features = out_features
-
-        self.projections = nn.ModuleList([nn.Identity()])
-        self.ops = nn.ModuleList([nn.Identity()])
-        self.inputs = nn.ModuleList([nn.Identity()])
-        for _ in range(1, max_num_nodes):
-            self.projections.append(projection(in_features, self.hidden_features))
-        for i in range(1, max_num_nodes):
-            if i < max_num_nodes - 1:
-                self.ops.append(LayerChoice(OrderedDict([(k, op(self.hidden_features)) for k, op in op_candidates.items()]),
-                                            label=f'{self._label}/op{i}'))
-            self.inputs.append(InputChoice(i, None, label=f'{self._label}/input{i}'))
-
-    @property
-    def label(self):
-        return self._label
-
-    def forward(self, x):
-        """
-        The forward of input choice is simply selecting first on all choices.
-        It shouldn't be called directly by users in most cases.
-        """
-        tensors = [x]
-        for i in range(1, self.max_num_nodes):
-            node_input = self.inputs[i]([self.projections[i](tensors[0])] + [t for t in tensors[1:]])
-            if i < self.max_num_nodes - 1:
-                node_output = self.ops[i](node_input)
-            else:
-                node_output = node_input
-            tensors.append(node_output)
-        return tensors[-1]
-
-
-class NasBench101Mutator(Mutator):
-    # for validation purposes
-    # for python execution engine
-
-    def __init__(self, label: str):
-        super().__init__(label=label)
-
-    @staticmethod
-    def candidates(node):
-        if 'n_candidates' in node.operation.parameters:
-            return list(range(node.operation.parameters['n_candidates']))
-        else:
-            return node.operation.parameters['candidates']
-
-    @staticmethod
-    def number_of_chosen(node):
-        if 'n_chosen' in node.operation.parameters:
-            return node.operation.parameters['n_chosen']
-        return 1
-
-    def mutate(self, model: Model):
-        max_num_edges = cast(int, None)
-        for node in model.get_nodes_by_label(self.label):
-            max_num_edges = node.operation.parameters['max_num_edges']
-            break
-        assert max_num_edges is not None
-        mutation_dict = {mut.mutator.label: mut.samples for mut in model.history}
-        num_nodes = mutation_dict[f'{self.label}/num_nodes'][0]
-        adjacency_list = [mutation_dict[f'{self.label}/input{i}'] for i in range(1, num_nodes)]
-        if sum([len(e) for e in adjacency_list]) > max_num_edges:
-            raise InvalidMutation(f'Expected {max_num_edges} edges, found: {adjacency_list}')
-        matrix = _NasBench101CellFixed.build_connection_matrix(adjacency_list, num_nodes)
-
-        operations = ['IN'] + [mutation_dict[f'{self.label}/op{i}'][0] for i in range(1, num_nodes - 1)] + ['OUT']
-        assert len(operations) == len(matrix)
-        matrix, operations = prune(matrix, operations)  # possible to raise InvalidMutation inside
-
-        # NOTE: a hack to maintain a clean copy of what nasbench101 cell looks like
-        self._cur_samples = {}
-        for i in range(1, len(matrix)):
-            if i + 1 < len(matrix):
-                self._cur_samples[f'op{i}'] = operations[i]
-            self._cur_samples[f'input{i}'] = [k for k in range(i) if matrix[k, i]]
-        self._cur_samples = [self._cur_samples]  # by design, _cur_samples is a list of samples
-
-    def dry_run(self, model):
-        return [], model
+from nni.nas.hub.pytorch.modules.nasbench101 import *