Unverified Commit f9bbd8d0 authored by Yuge Zhang's avatar Yuge Zhang Committed by GitHub
Browse files

[Retiarii] Search space hub (#4524)

parent 9fde0e8e
...@@ -8,3 +8,4 @@ _generated_model_*.py ...@@ -8,3 +8,4 @@ _generated_model_*.py
_generated_model _generated_model
generated generated
lightning_logs lightning_logs
model.onnx
...@@ -177,7 +177,6 @@ class _SupervisedLearningModule(LightningModule): ...@@ -177,7 +177,6 @@ class _SupervisedLearningModule(LightningModule):
self.export_onnx = Path(export_onnx) self.export_onnx = Path(export_onnx)
else: else:
self.export_onnx = None self.export_onnx = None
self._already_exported = False
def forward(self, x): def forward(self, x):
y_hat = self.model(x) y_hat = self.model(x)
...@@ -196,12 +195,12 @@ class _SupervisedLearningModule(LightningModule): ...@@ -196,12 +195,12 @@ class _SupervisedLearningModule(LightningModule):
x, y = batch x, y = batch
y_hat = self(x) y_hat = self(x)
if not self._already_exported: if self.export_onnx is not None:
try: try:
self.to_onnx(self.export_onnx, x, export_params=True) self.to_onnx(self.export_onnx, x, export_params=True)
except RuntimeError as e: except RuntimeError as e:
warnings.warn(f'ONNX conversion failed. As a result, you might not be able to use visualization. Error message: {e}') warnings.warn(f'ONNX conversion failed. As a result, you might not be able to use visualization. Error message: {e}')
self._already_exported = True self.export_onnx = None
self.log('val_loss', self.criterion(y_hat, y), prog_bar=True) self.log('val_loss', self.criterion(y_hat, y), prog_bar=True)
for name, metric in self.metrics.items(): for name, metric in self.metrics.items():
......
This README will be deleted once this hub is stabilized, after which we will promote it in the documentation.
## Why
We hereby provide a series of state-of-the-art search spaces, each of which is a PyTorch model + mutations + a training recipe.
For further motivations and plans, please see https://github.com/microsoft/nni/issues/4249.
## Reproduction Roadmap
1. Runnable
2. Load checkpoint of searched architecture and evaluate
3. Reproduce searched architecture
4. Runnable with built-in algos
5. Reproduce result with at least one algo
| | 1 | 2 | 3 | 4 | 5 |
|------------------------|--------|--------|--------|--------|--------|
| NasBench101 | Y | | | | |
| NasBench201 | Y | | | | |
| NASNet | Y | | | | |
| ENAS | Y | | | | |
| AmoebaNet | Y | | | | |
| PNAS | Y | | | | |
| DARTS | Y | | | | |
| ProxylessNAS | Y | | | | |
| MobileNetV3Space | Y | | | | |
| ShuffleNetSpace | Y | | | | |
| ShuffleNetSpace (ch) | Y | | | | |
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from .mobilenetv3 import MobileNetV3Space
from .nasbench101 import NasBench101
from .nasbench201 import NasBench201
from .nasnet import NDS, NASNet, ENAS, AmoebaNet, PNAS, DARTS
from .proxylessnas import ProxylessNAS
from .shufflenet import ShuffleNetSpace
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from typing import Tuple, Optional, Callable
import nni.retiarii.nn.pytorch as nn
from nni.retiarii import model_wrapper
from .proxylessnas import ConvBNReLU, InvertedResidual, SeparableConv, make_divisible, reset_parameters
class h_sigmoid(nn.Module):
    """Hard sigmoid: ``relu6(x + 3) / 6``, a piecewise-linear approximation of sigmoid."""

    def __init__(self, inplace=True):
        super().__init__()
        self.relu = nn.ReLU6(inplace=inplace)

    def forward(self, x):
        clipped = self.relu(x + 3)
        return clipped / 6
class h_swish(nn.Module):
    """Hard swish activation: ``x * h_sigmoid(x)``."""

    def __init__(self, inplace=True):
        super().__init__()
        self.sigmoid = h_sigmoid(inplace=inplace)

    def forward(self, x):
        gate = self.sigmoid(x)
        return x * gate
class SELayer(nn.Module):
    """Squeeze-and-excite layer.

    Globally average-pools the feature map, feeds the channel descriptor through a
    two-layer bottleneck MLP, and rescales the input channel-wise with the result.
    """

    def __init__(self,
                 channels: int,
                 reduction: int = 4,
                 activation_layer: Optional[Callable[..., nn.Module]] = None):
        super().__init__()
        if activation_layer is None:
            activation_layer = nn.Sigmoid
        # Hidden width of the bottleneck, rounded to a multiple of 8.
        hidden = make_divisible(channels // reduction, 8)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channels, hidden),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, channels),
            activation_layer()
        )

    def forward(self, x):
        batch, chan, _, _ = x.size()
        scale = self.avg_pool(x).view(batch, chan)
        scale = self.fc(scale).view(batch, chan, 1, 1)
        return x * scale
@model_wrapper
class MobileNetV3Space(nn.Module):
    """
    MobileNetV3Space implements the largest search space in `TuNAS <https://arxiv.org/abs/2008.06120>`__.

    The search dimensions include widths, expand ratios, kernel sizes, SE ratio.
    Some of them can be turned off via arguments to narrow down the search space.

    Different from ProxylessNAS search space, this space is implemented with :class:`nn.ValueChoice`.
    We use the following snippet as reference.
    https://github.com/google-research/google-research/blob/20736344591f774f4b1570af64624ed1e18d2867/tunas/mobile_search_space_v3.py#L728
    """

    def __init__(self, num_labels: int = 1000,
                 base_widths: Tuple[int, ...] = (16, 16, 32, 64, 128, 256, 512, 1024),
                 width_multipliers: Tuple[float, ...] = (0.5, 0.625, 0.75, 1.0, 1.25, 1.5, 2.0),
                 expand_ratios: Tuple[int, ...] = (1, 2, 3, 4, 5, 6),
                 dropout_rate: float = 0.2,
                 bn_eps: float = 1e-3,
                 bn_momentum: float = 0.1):
        super().__init__()

        # One mutable width per stage: each candidate is the base width scaled by a
        # multiplier and rounded to a multiple of 8.
        self.widths = [
            nn.ValueChoice([make_divisible(base_width * mult, 8) for mult in width_multipliers], label=f'width_{i}')
            for i, base_width in enumerate(base_widths)
        ]
        self.expand_ratios = expand_ratios

        blocks = [
            # Stem
            ConvBNReLU(
                3, self.widths[0],
                nn.ValueChoice([3, 5], label='ks_0'),
                stride=2, activation_layer=h_swish
            ),
            SeparableConv(self.widths[0], self.widths[0], activation_layer=nn.ReLU),
        ]

        # counting for kernel sizes and expand ratios
        self.layer_count = 2

        blocks += [
            # Body
            self._make_stage(1, self.widths[0], self.widths[1], False, 2, nn.ReLU),
            self._make_stage(2, self.widths[1], self.widths[2], True, 2, nn.ReLU),
            self._make_stage(1, self.widths[2], self.widths[3], False, 2, h_swish),
            self._make_stage(1, self.widths[3], self.widths[4], True, 1, h_swish),
            self._make_stage(1, self.widths[4], self.widths[5], True, 2, h_swish),
        ]

        # Head
        blocks += [
            ConvBNReLU(self.widths[5], self.widths[6], 1, 1, activation_layer=h_swish),
            nn.AdaptiveAvgPool2d(1),
            ConvBNReLU(self.widths[6], self.widths[7], 1, 1, norm_layer=nn.Identity, activation_layer=h_swish),
        ]

        self.blocks = nn.Sequential(*blocks)

        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(self.widths[7], num_labels),
        )

        reset_parameters(self, bn_momentum=bn_momentum, bn_eps=bn_eps)

    def forward(self, x):
        x = self.blocks(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

    def _make_stage(self, stage_idx, inp, oup, se, stride, act):
        """Build one body stage: 4 inverted-residual layers wrapped in a mutable-depth Repeat."""
        # initialize them first because they are related to layer_count.
        exp, ks, se_blocks = [], [], []
        for _ in range(4):
            exp.append(nn.ValueChoice(list(self.expand_ratios), label=f'exp_{self.layer_count}'))
            ks.append(nn.ValueChoice([3, 5, 7], label=f'ks_{self.layer_count}'))
            if se:
                # if SE is true, assign a layer choice to SE.
                # FIX: the label must be captured *now*, as a default argument.
                # A plain closure would read ``self.layer_count`` only when the
                # lambda is invoked (inside InvertedResidual, after the loop has
                # advanced the counter), so every SE block in this stage would
                # share one identical, wrong label.
                se_blocks.append(
                    lambda hidden_ch, label=f'se_{self.layer_count}':
                        nn.LayerChoice([nn.Identity(), SELayer(hidden_ch)], label=label)
                )
            else:
                se_blocks.append(None)
            self.layer_count += 1

        blocks = [
            # stride = 2
            InvertedResidual(inp, oup, exp[0], ks[0],
                             stride, squeeze_and_excite=se_blocks[0], activation_layer=act),
            # stride = 1, residual connection should be automatically enabled
            InvertedResidual(oup, oup, exp[1], ks[1], squeeze_and_excite=se_blocks[1], activation_layer=act),
            InvertedResidual(oup, oup, exp[2], ks[2], squeeze_and_excite=se_blocks[2], activation_layer=act),
            InvertedResidual(oup, oup, exp[3], ks[3], squeeze_and_excite=se_blocks[3], activation_layer=act)
        ]

        # mutable depth
        return nn.Repeat(blocks, depth=(1, 4), label=f'depth_{stage_idx}')
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import math
import torch.nn as nn
from nni.retiarii import model_wrapper
from nni.retiarii.nn.pytorch import NasBench101Cell
__all__ = ['NasBench101']
def truncated_normal_(tensor, mean=0, std=1):
    """In-place truncated-normal init: sample 4 candidates per entry, keep one within 2 std.

    https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/15
    """
    shape = tensor.shape
    candidates = tensor.new_empty(shape + (4,)).normal_()
    within_bounds = (candidates < 2) & (candidates > -2)
    chosen_idx = within_bounds.max(-1, keepdim=True)[1]
    tensor.data.copy_(candidates.gather(-1, chosen_idx).squeeze(-1))
    tensor.data.mul_(std).add_(mean)
class ConvBNReLU(nn.Module):
    """Conv2d -> BatchNorm2d -> ReLU, with NAS-Bench-101's truncated-normal initialization."""

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.conv_bn_relu = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )
        self.reset_parameters()

    def reset_parameters(self):
        """Re-init conv weights with a truncated normal and BN with weight=1, bias=0."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                fan_in = module.kernel_size[0] * module.kernel_size[1] * module.in_channels
                truncated_normal_(module.weight.data, mean=0., std=math.sqrt(1. / fan_in))
            if isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()

    def forward(self, x):
        return self.conv_bn_relu(x)
class Conv3x3BNReLU(ConvBNReLU):
    """3x3 variant of :class:`ConvBNReLU` (stride 1, 'same' padding)."""

    def __init__(self, in_channels, out_channels):
        super().__init__(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
class Conv1x1BNReLU(ConvBNReLU):
    """1x1 (pointwise) variant of :class:`ConvBNReLU`."""

    def __init__(self, in_channels, out_channels):
        super().__init__(in_channels, out_channels, kernel_size=1, stride=1, padding=0)


# NAS-Bench-101 uses a 1x1 conv-bn-relu as its channel projection operation.
Projection = Conv1x1BNReLU
@model_wrapper
class NasBench101(nn.Module):
    """The full search space, proposed by `NAS-Bench-101 <http://proceedings.mlr.press/v97/ying19a/ying19a.pdf>`__.

    It's simply a stack of :class:`NasBench101Cell`. Operations are conv3x3, conv1x1 and maxpool respectively.
    """

    def __init__(self,
                 stem_out_channels: int = 128,
                 num_stacks: int = 3,
                 num_modules_per_stack: int = 3,
                 max_num_vertices: int = 7,
                 max_num_edges: int = 9,
                 num_labels: int = 10,
                 bn_eps: float = 1e-5,
                 bn_momentum: float = 0.003):
        super().__init__()
        # FIX: remember the BN hyper-parameters so ``reset_parameters`` can
        # re-apply them; the original never stored them (see below).
        self.bn_eps = bn_eps
        self.bn_momentum = bn_momentum

        op_candidates = {
            'conv3x3-bn-relu': lambda num_features: Conv3x3BNReLU(num_features, num_features),
            'conv1x1-bn-relu': lambda num_features: Conv1x1BNReLU(num_features, num_features),
            'maxpool3x3': lambda num_features: nn.MaxPool2d(3, 1, 1)
        }

        # initial stem convolution
        self.stem_conv = Conv3x3BNReLU(3, stem_out_channels)

        layers = []
        in_channels = out_channels = stem_out_channels
        for stack_num in range(num_stacks):
            # Every stack after the first starts with 2x spatial downsampling
            # and doubles the channel count.
            if stack_num > 0:
                downsample = nn.MaxPool2d(kernel_size=2, stride=2)
                layers.append(downsample)
                out_channels *= 2
            for _ in range(num_modules_per_stack):
                # All cells share ``label='cell'``, i.e. one architecture choice
                # is replicated throughout the network.
                cell = NasBench101Cell(op_candidates, in_channels, out_channels,
                                       lambda cin, cout: Projection(cin, cout),
                                       max_num_vertices, max_num_edges, label='cell')
                layers.append(cell)
                in_channels = out_channels

        self.features = nn.ModuleList(layers)
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Linear(out_channels, num_labels)

        for module in self.modules():
            if isinstance(module, nn.BatchNorm2d):
                module.eps = bn_eps
                module.momentum = bn_momentum

    def forward(self, x):
        bs = x.size(0)
        out = self.stem_conv(x)
        for layer in self.features:
            out = layer(out)
        out = self.gap(out).view(bs, -1)
        out = self.classifier(out)
        return out

    def reset_parameters(self):
        """Re-apply the BN hyper-parameters to every BatchNorm layer.

        FIX: the original read ``self.config.bn_eps`` / ``self.config.bn_momentum``,
        but no ``config`` attribute was ever set anywhere in the class, so calling
        this method always raised ``AttributeError``. The values are now stored on
        ``self`` in ``__init__`` and read from there.
        """
        for module in self.modules():
            if isinstance(module, nn.BatchNorm2d):
                module.eps = self.bn_eps
                module.momentum = self.bn_momentum
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch
import torch.nn as nn
from nni.retiarii import model_wrapper
from nni.retiarii.nn.pytorch import NasBench201Cell
__all__ = ['NasBench201']
# Candidate operations for NAS-Bench-201 cells, keyed by name.
# Each factory maps (C_in, C_out, stride) to a module.
OPS_WITH_STRIDE = {
    'none': lambda C_in, C_out, stride: Zero(C_in, C_out, stride),
    'avg_pool_3x3': lambda C_in, C_out, stride: Pooling(C_in, C_out, stride, 'avg'),
    'max_pool_3x3': lambda C_in, C_out, stride: Pooling(C_in, C_out, stride, 'max'),
    'conv_3x3': lambda C_in, C_out, stride: ReLUConvBN(C_in, C_out, (3, 3), (stride, stride), (1, 1), (1, 1)),
    'conv_1x1': lambda C_in, C_out, stride: ReLUConvBN(C_in, C_out, (1, 1), (stride, stride), (0, 0), (1, 1)),
    # Identity when shapes already match; otherwise a factorized reduction.
    'skip_connect': lambda C_in, C_out, stride: nn.Identity() if stride == 1 and C_in == C_out
    else FactorizedReduce(C_in, C_out, stride),
}

# The 5 primitives actually searched over by the NasBench201 space below.
PRIMITIVES = ['none', 'skip_connect', 'conv_1x1', 'conv_3x3', 'avg_pool_3x3']
class ReLUConvBN(nn.Module):
    """ReLU -> Conv2d -> BatchNorm2d, the standard NAS-Bench-201 conv block."""

    def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation):
        super().__init__()
        layers = [
            nn.ReLU(inplace=False),
            nn.Conv2d(C_in, C_out, kernel_size, stride=stride,
                      padding=padding, dilation=dilation, bias=False),
            nn.BatchNorm2d(C_out),
        ]
        self.op = nn.Sequential(*layers)

    def forward(self, x):
        return self.op(x)
class SepConv(nn.Module):
    """ReLU -> depthwise conv -> pointwise conv -> BatchNorm (one separable pass)."""

    def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation):
        super().__init__()
        layers = [
            nn.ReLU(inplace=False),
            # Depthwise: one filter per input channel (groups=C_in).
            nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride,
                      padding=padding, dilation=dilation, groups=C_in, bias=False),
            # Pointwise 1x1 mixes channels and projects to C_out.
            nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
            nn.BatchNorm2d(C_out),
        ]
        self.op = nn.Sequential(*layers)

    def forward(self, x):
        return self.op(x)
class Pooling(nn.Module):
    """3x3 average or max pooling, with an optional 1x1 ReLUConvBN to align channels."""

    def __init__(self, C_in, C_out, stride, mode):
        super().__init__()
        # Only insert a preprocessing conv when the channel counts differ.
        self.preprocess = None if C_in == C_out else ReLUConvBN(C_in, C_out, 1, 1, 0, 1)
        if mode == 'avg':
            self.op = nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False)
        elif mode == 'max':
            self.op = nn.MaxPool2d(3, stride=stride, padding=1)
        else:
            raise ValueError('Invalid mode={:} in Pooling'.format(mode))

    def forward(self, x):
        out = x if self.preprocess is None else self.preprocess(x)
        return self.op(out)
class Zero(nn.Module):
    """The 'none' op: outputs all zeros, optionally changing channel count / stride."""

    def __init__(self, C_in, C_out, stride):
        super().__init__()
        self.C_in = C_in
        self.C_out = C_out
        self.stride = stride
        self.is_zero = True

    def forward(self, x):
        if self.C_in != self.C_out:
            # Channel counts differ: allocate a fresh zero tensor with the
            # target channel count (spatial dims kept as-is, matching original).
            shape = list(x.shape)
            shape[1] = self.C_out
            return x.new_zeros(shape, dtype=x.dtype, device=x.device)
        if self.stride == 1:
            return x.mul(0.)
        # Same channels but strided: subsample spatially, then zero out.
        return x[:, :, ::self.stride, ::self.stride].mul(0.)
class FactorizedReduce(nn.Module):
    """Halve the spatial resolution with two parallel, offset 1x1 convs, then concat + BN."""

    def __init__(self, C_in, C_out, stride):
        super().__init__()
        self.stride = stride
        self.C_in = C_in
        self.C_out = C_out
        self.relu = nn.ReLU(inplace=False)
        if stride != 2:
            raise ValueError('Invalid stride : {:}'.format(stride))
        # Split the output channels between the two branches (handles odd C_out).
        C_outs = [C_out // 2, C_out - C_out // 2]
        self.convs = nn.ModuleList(
            nn.Conv2d(C_in, c, 1, stride=stride, padding=0, bias=False) for c in C_outs
        )
        self.pad = nn.ConstantPad2d((0, 1, 0, 1), 0)
        self.bn = nn.BatchNorm2d(C_out)

    def forward(self, x):
        x = self.relu(x)
        # The second branch sees the input shifted by one pixel (pad then crop).
        shifted = self.pad(x)[:, :, 1:, 1:]
        out = torch.cat([self.convs[0](x), self.convs[1](shifted)], dim=1)
        return self.bn(out)
class ResNetBasicblock(nn.Module):
    """Standard residual basic block, used as the fixed reduction block between stages."""

    def __init__(self, inplanes, planes, stride):
        super().__init__()
        assert stride in (1, 2), 'invalid stride {:}'.format(stride)
        self.conv_a = ReLUConvBN(inplanes, planes, 3, stride, 1, 1)
        self.conv_b = ReLUConvBN(planes, planes, 3, 1, 1, 1)
        # Shortcut: downsample spatially when strided, 1x1 conv when only the
        # channel counts differ, identity otherwise.
        if stride == 2:
            self.downsample = nn.Sequential(
                nn.AvgPool2d(kernel_size=2, stride=2, padding=0),
                nn.Conv2d(inplanes, planes, kernel_size=1, stride=1, padding=0, bias=False))
        elif inplanes != planes:
            self.downsample = ReLUConvBN(inplanes, planes, 1, 1, 0, 1)
        else:
            self.downsample = None
        self.in_dim = inplanes
        self.out_dim = planes
        self.stride = stride
        self.num_conv = 2

    def forward(self, inputs):
        residual = inputs if self.downsample is None else self.downsample(inputs)
        out = self.conv_b(self.conv_a(inputs))
        return residual + out
@model_wrapper
class NasBench201(nn.Module):
    """The full search space proposed by `NAS-Bench-201 <https://arxiv.org/abs/2001.00326>`__.

    It's a stack of :class:`NasBench201Cell`: three stages of N cells each,
    separated by two fixed stride-2 residual blocks.
    """

    def __init__(self,
                 stem_out_channels: int = 16,
                 num_modules_per_stack: int = 5,
                 num_labels: int = 10):
        super().__init__()
        self.channels = C = stem_out_channels
        self.num_modules = N = num_modules_per_stack
        self.num_labels = num_labels
        self.stem = nn.Sequential(
            nn.Conv2d(3, C, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(C)
        )
        # Channel / reduction schedule: N cells at C, reduce, N at 2C, reduce, N at 4C.
        layer_channels = [C] * N + [C * 2] + [C * 2] * N + [C * 4] + [C * 4] * N
        layer_reductions = [False] * N + [True] + [False] * N + [True] + [False] * N

        C_prev = C
        self.cells = nn.ModuleList()
        for C_curr, reduction in zip(layer_channels, layer_reductions):
            if reduction:
                cell = ResNetBasicblock(C_prev, C_curr, 2)
            else:
                # FIX: ``prim`` must be bound as a default argument. The original
                # lambdas closed over the comprehension variable ``prim``, which is
                # looked up only when the factory is *called* — after the
                # comprehension has finished — so every factory built the last
                # primitive in PRIMITIVES instead of its own.
                cell = NasBench201Cell({prim: lambda C_in, C_out, prim=prim: OPS_WITH_STRIDE[prim](C_in, C_out, 1) for prim in PRIMITIVES},
                                       C_prev, C_curr, label='cell')
            self.cells.append(cell)
            C_prev = C_curr

        self.lastact = nn.Sequential(
            nn.BatchNorm2d(C_prev),
            nn.ReLU(inplace=True)
        )
        self.global_pooling = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Linear(C_prev, self.num_labels)

    def forward(self, inputs):
        feature = self.stem(inputs)
        for cell in self.cells:
            feature = cell(feature)

        out = self.lastact(feature)
        out = self.global_pooling(out)
        out = out.view(out.size(0), -1)
        logits = self.classifier(out)

        return logits
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""File containing NASNet-series search space.
The implementation is based on NDS.
It's called ``nasnet.py`` simply because NASNet is the first to propose such structure.
"""
from collections import OrderedDict
from typing import Tuple, List, Union, Iterable, Dict, Callable
try:
from typing import Literal
except ImportError:
from typing_extensions import Literal
import torch
import nni.retiarii.nn.pytorch as nn
from nni.retiarii import model_wrapper
# the following are NAS operations from
# https://github.com/facebookresearch/unnas/blob/main/pycls/models/nas/operations.py
# Each entry maps an op name to ``lambda C, stride, affine -> nn.Module``.
# ``C`` is both the input and output channel count, ``stride`` is the spatial
# stride (2 inside reduction cells), and ``affine`` toggles learnable
# BatchNorm parameters.
OPS = {
    'none': lambda C, stride, affine:
        Zero(stride),
    'avg_pool_2x2': lambda C, stride, affine:
        nn.AvgPool2d(2, stride=stride, padding=0, count_include_pad=False),
    'avg_pool_3x3': lambda C, stride, affine:
        nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False),
    'avg_pool_5x5': lambda C, stride, affine:
        nn.AvgPool2d(5, stride=stride, padding=2, count_include_pad=False),
    'max_pool_2x2': lambda C, stride, affine:
        nn.MaxPool2d(2, stride=stride, padding=0),
    'max_pool_3x3': lambda C, stride, affine:
        nn.MaxPool2d(3, stride=stride, padding=1),
    'max_pool_5x5': lambda C, stride, affine:
        nn.MaxPool2d(5, stride=stride, padding=2),
    'max_pool_7x7': lambda C, stride, affine:
        nn.MaxPool2d(7, stride=stride, padding=3),
    # Identity when no downsampling is needed, factorized reduction otherwise.
    'skip_connect': lambda C, stride, affine:
        nn.Identity() if stride == 1 else FactorizedReduce(C, C, affine=affine),
    'conv_1x1': lambda C, stride, affine:
        nn.Sequential(
            nn.ReLU(inplace=False),
            nn.Conv2d(C, C, 1, stride=stride, padding=0, bias=False),
            nn.BatchNorm2d(C, affine=affine)
        ),
    'conv_3x3': lambda C, stride, affine:
        nn.Sequential(
            nn.ReLU(inplace=False),
            nn.Conv2d(C, C, 3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(C, affine=affine)
        ),
    'sep_conv_3x3': lambda C, stride, affine:
        SepConv(C, C, 3, stride, 1, affine=affine),
    'sep_conv_5x5': lambda C, stride, affine:
        SepConv(C, C, 5, stride, 2, affine=affine),
    'sep_conv_7x7': lambda C, stride, affine:
        SepConv(C, C, 7, stride, 3, affine=affine),
    'dil_conv_3x3': lambda C, stride, affine:
        DilConv(C, C, 3, stride, 2, 2, affine=affine),
    'dil_conv_5x5': lambda C, stride, affine:
        DilConv(C, C, 5, stride, 4, 2, affine=affine),
    'dil_sep_conv_3x3': lambda C, stride, affine:
        DilSepConv(C, C, 3, stride, 2, 2, affine=affine),
    # Factorized asymmetric convolutions (1xN then Nx1).
    'conv_3x1_1x3': lambda C, stride, affine:
        nn.Sequential(
            nn.ReLU(inplace=False),
            nn.Conv2d(C, C, (1, 3), stride=(1, stride), padding=(0, 1), bias=False),
            nn.Conv2d(C, C, (3, 1), stride=(stride, 1), padding=(1, 0), bias=False),
            nn.BatchNorm2d(C, affine=affine)
        ),
    'conv_7x1_1x7': lambda C, stride, affine:
        nn.Sequential(
            nn.ReLU(inplace=False),
            nn.Conv2d(C, C, (1, 7), stride=(1, stride), padding=(0, 3), bias=False),
            nn.Conv2d(C, C, (7, 1), stride=(stride, 1), padding=(3, 0), bias=False),
            nn.BatchNorm2d(C, affine=affine)
        ),
}
class ReLUConvBN(nn.Sequential):
    """ReLU -> Conv2d -> BatchNorm2d as a plain ``nn.Sequential``."""

    def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
        relu = nn.ReLU(inplace=False)
        conv = nn.Conv2d(C_in, C_out, kernel_size, stride=stride,
                         padding=padding, bias=False)
        bn = nn.BatchNorm2d(C_out, affine=affine)
        super().__init__(relu, conv, bn)
class DilConv(nn.Sequential):
    """Dilated separable conv: ReLU -> dilated depthwise -> pointwise -> BN."""

    def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True):
        stages = [
            nn.ReLU(inplace=False),
            # Depthwise (groups=C_in), dilated.
            nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride,
                      padding=padding, dilation=dilation, groups=C_in, bias=False),
            # Pointwise projection to C_out.
            nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
            nn.BatchNorm2d(C_out, affine=affine),
        ]
        super().__init__(*stages)
class SepConv(nn.Sequential):
    """Separable conv applied twice: (ReLU -> depthwise -> pointwise -> BN) x 2.

    The first depthwise conv carries the stride; the second always uses stride 1.
    """

    def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
        layers = []
        # First pass keeps C_in channels and applies the stride.
        layers += [
            nn.ReLU(inplace=False),
            nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride,
                      padding=padding, groups=C_in, bias=False),
            nn.Conv2d(C_in, C_in, kernel_size=1, padding=0, bias=False),
            nn.BatchNorm2d(C_in, affine=affine),
        ]
        # Second pass projects to C_out at stride 1.
        layers += [
            nn.ReLU(inplace=False),
            nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=1,
                      padding=padding, groups=C_in, bias=False),
            nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
            nn.BatchNorm2d(C_out, affine=affine),
        ]
        super().__init__(*layers)
class DilSepConv(nn.Sequential):
    """Dilated separable conv applied twice: (ReLU -> dilated depthwise -> pointwise -> BN) x 2."""

    def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True):
        layers = []
        # First pass: strided, keeps C_in channels.
        layers += [
            nn.ReLU(inplace=False),
            nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride,
                      padding=padding, dilation=dilation, groups=C_in, bias=False),
            nn.Conv2d(C_in, C_in, kernel_size=1, padding=0, bias=False),
            nn.BatchNorm2d(C_in, affine=affine),
        ]
        # Second pass: stride 1, projects to C_out.
        layers += [
            nn.ReLU(inplace=False),
            nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=1,
                      padding=padding, dilation=dilation, groups=C_in, bias=False),
            nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
            nn.BatchNorm2d(C_out, affine=affine),
        ]
        super().__init__(*layers)
class Zero(nn.Module):
    """The 'none' op: zeros with the input's channel count, spatially strided."""

    def __init__(self, stride):
        super().__init__()
        self.stride = stride

    def forward(self, x):
        if self.stride != 1:
            x = x[:, :, ::self.stride, ::self.stride]
        return x.mul(0.)
class FactorizedReduce(nn.Module):
    """Halve spatial size via two offset stride-2 1x1 convs whose outputs are concatenated."""

    def __init__(self, C_in, C_out, affine=True):
        super().__init__()
        # Each branch produces exactly C_out // 2 channels, so C_out must be even.
        if isinstance(C_out, int):
            assert C_out % 2 == 0
        else:  # is a value choice
            assert all(c % 2 == 0 for c in C_out.all_options())
        self.relu = nn.ReLU(inplace=False)
        self.conv_1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
        self.conv_2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
        self.bn = nn.BatchNorm2d(C_out, affine=affine)
        self.pad = nn.ConstantPad2d((0, 1, 0, 1), 0)

    def forward(self, x):
        activated = self.relu(x)
        # Second branch sees the input shifted by one pixel (pad then crop).
        shifted = self.pad(activated)[:, :, 1:, 1:]
        out = torch.cat([self.conv_1(activated), self.conv_2(shifted)], dim=1)
        return self.bn(out)
class DropPath_(nn.Module):
    """Drop-path (stochastic depth) regularization.

    https://github.com/khanrc/pt.darts/blob/0.1/models/ops.py
    During training, each sample in the batch is zeroed with probability
    ``drop_prob`` and survivors are rescaled by ``1 / (1 - drop_prob)``;
    at eval time this is the identity.
    """

    def __init__(self, drop_prob=0.):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        if not self.training or self.drop_prob <= 0.:
            return x
        keep_prob = 1. - self.drop_prob
        mask = torch.zeros((x.size(0), 1, 1, 1), dtype=torch.float, device=x.device).bernoulli_(keep_prob)
        return x.div(keep_prob).mul(mask)
class AuxiliaryHead(nn.Module):
    """Auxiliary classifier attached to an intermediate feature map."""

    def __init__(self, C: int, num_labels: int, dataset: Literal['imagenet', 'cifar']):
        super().__init__()
        # The pooling stride differs because the feature map fed to the head
        # has a different spatial size on the two datasets.
        if dataset == 'imagenet':
            # assuming input size 14x14
            stride = 2
        elif dataset == 'cifar':
            stride = 3

        self.features = nn.Sequential(
            nn.ReLU(inplace=True),
            nn.AvgPool2d(5, stride=stride, padding=0, count_include_pad=False),
            nn.Conv2d(C, 128, 1, bias=False),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 768, 2, bias=False),
            nn.BatchNorm2d(768),
            nn.ReLU(inplace=True)
        )
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, x):
        feats = self.features(x)
        return self.classifier(feats.view(feats.size(0), -1))
class SequentialBreakdown(nn.Sequential):
    """Wrap an ``nn.Sequential`` so that forward returns every intermediate output as a list."""

    def __init__(self, sequential: nn.Sequential):
        super().__init__(OrderedDict(sequential.named_children()))

    def forward(self, inputs):
        outputs = []
        current = inputs
        for module in self:
            current = module(current)
            outputs.append(current)
        return outputs
class CellPreprocessor(nn.Module):
    """
    Aligning the shape of predecessors.

    If the last cell is a reduction cell, ``pre0`` should be ``FactorizedReduce`` instead of ``ReLUConvBN``.
    See :class:`CellBuilder` on how to calculate those channel numbers.
    """

    def __init__(self, C_pprev: int, C_prev: int, C: int, last_cell_reduce: bool) -> None:
        super().__init__()
        # pre0 handles the cell-before-last; it must also downsample spatially
        # when the last cell halved the resolution.
        self.pre0 = FactorizedReduce(C_pprev, C) if last_cell_reduce else ReLUConvBN(C_pprev, C, 1, 1, 0)
        self.pre1 = ReLUConvBN(C_prev, C, 1, 1, 0)

    def forward(self, cells):
        assert len(cells) == 2
        pprev, prev = cells
        return [self.pre0(pprev), self.pre1(prev)]
class CellPostprocessor(nn.Module):
    """
    The cell outputs previous cell + this cell, so that cells can be directly chained.
    """

    def forward(self, this_cell, previous_cells):
        # The new "previous two" are the old previous cell and the cell just built.
        prev = previous_cells[-1]
        return [prev, this_cell]
class CellBuilder:
    """The cell builder is used in Repeat.
    Builds a cell each time it's "called".
    Note that the builder is ephemeral, it can only be called once for every index.
    """

    def __init__(self, op_candidates: List[str], C_prev_in: int, C_in: int, C: int,
                 num_nodes: int, merge_op: Literal['all', 'loose_end'],
                 first_cell_reduce: bool, last_cell_reduce: bool):
        self.C_prev_in = C_prev_in      # This is the out channels of the cell before last cell.
        self.C_in = C_in                # This is the out channels of last cell.
        self.C = C                      # This is NOT C_out of this stage, instead, C_out = C * len(cell.output_node_indices)
        self.op_candidates = op_candidates
        self.num_nodes = num_nodes
        self.merge_op = merge_op
        self.first_cell_reduce = first_cell_reduce
        self.last_cell_reduce = last_cell_reduce
        self._expect_idx = 0

    def __call__(self, repeat_idx: int):
        if self._expect_idx != repeat_idx:
            raise ValueError(f'Expect index {self._expect_idx}, found {repeat_idx}')

        # It takes an index that is the index in the repeat.
        # Number of predecessors for each cell is fixed to 2.
        num_predecessors = 2
        # Number of ops per node is fixed to 2.
        num_ops_per_node = 2

        # Reduction cell means stride = 2 and channel multiplied by 2.
        is_reduction_cell = repeat_idx == 0 and self.first_cell_reduce

        # self.C_prev_in, self.C_in, self.last_cell_reduce are updated after each cell is built.
        preprocessor = CellPreprocessor(self.C_prev_in, self.C_in, self.C, self.last_cell_reduce)

        ops_factory: Dict[str, Callable[[int, int, int], nn.Module]] = {
            op:  # make final chosen ops named with their aliases
            # FIX: ``op`` must be bound as a default argument. A plain closure
            # looks ``op`` up only when the factory is invoked (after the
            # comprehension has finished), so every factory would otherwise
            # build the *last* candidate operation instead of its own.
            lambda node_index, op_index, input_index, op=op:
                OPS[op](self.C, 2 if is_reduction_cell and (
                    input_index is None or input_index < num_predecessors  # could be none when constructing search space
                ) else 1, True)
            for op in self.op_candidates
        }

        cell = nn.Cell(ops_factory, self.num_nodes, num_ops_per_node, num_predecessors, self.merge_op,
                       preprocessor=preprocessor, postprocessor=CellPostprocessor(),
                       label='reduce' if is_reduction_cell else 'normal')

        # update state
        self.C_prev_in = self.C_in
        self.C_in = self.C * len(cell.output_node_indices)
        self.last_cell_reduce = is_reduction_cell
        self._expect_idx += 1

        return cell
_INIT_PARAMETER_DOCS = """
Parameters
----------
width : int or tuple of int
A fixed initial width or a tuple of widths to choose from.
num_cells : int or tuple of int
A fixed number of cells (depths) to stack, or a tuple of depths to choose from.
dataset : "cifar" | "imagenet"
The essential differences are in "stem" cells, i.e., how they process the raw image input.
Choosing "imagenet" means more downsampling at the beginning of the network.
auxiliary_loss : bool
If true, another auxiliary classification head will produce the another prediction.
This makes the output of network two logits in the training phase.
"""
class NDS(nn.Module):
"""
The unified version of NASNet search space.
We follow the implementation in
`unnas <https://github.com/facebookresearch/unnas/blob/main/pycls/models/nas/nas.py>`__.
See `On Network Design Spaces for Visual Recognition <https://arxiv.org/abs/1905.13214>`__ for details.
Different NAS papers usually differ in the way that they specify ``op_candidates`` and ``merge_op``.
``dataset`` here is to give a hint about input resolution, so as to create reasonable stem and auxiliary heads.
NDS has a speciality that it has mutable depths/widths.
This is implemented by accepting a list of int as ``num_cells`` / ``width``.
""" + _INIT_PARAMETER_DOCS + """
op_candidates : list of str
List of operator candidates. Must be from ``OPS``.
merge_op : ``all`` or ``loose_end``
See :class:`~nni.retiarii.nn.pytorch.Cell`.
num_nodes_per_cell : int
See :class:`~nni.retiarii.nn.pytorch.Cell`.
"""
def __init__(self,
op_candidates: List[str],
merge_op: Literal['all', 'loose_end'] = 'all',
num_nodes_per_cell: int = 4,
width: Union[Tuple[int], int] = 16,
num_cells: Union[Tuple[int], int] = 20,
dataset: Literal['cifar', 'imagenet'] = 'imagenet',
auxiliary_loss: bool = False):
super().__init__()
self.dataset = dataset
self.num_labels = 10 if dataset == 'cifar' else 1000
self.auxiliary_loss = auxiliary_loss
# preprocess the specified width and depth
if isinstance(width, Iterable):
C = nn.ValueChoice(list(width), label='width')
else:
C = width
if isinstance(num_cells, Iterable):
num_cells = nn.ValueChoice(list(num_cells), label='depth')
num_cells_per_stage = [i * num_cells // 3 - (i - 1) * num_cells // 3 for i in range(3)]
# auxiliary head is different for network targetted at different datasets
if dataset == 'imagenet':
self.stem0 = nn.Sequential(
nn.Conv2d(3, C // 2, kernel_size=3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(C // 2),
nn.ReLU(inplace=True),
nn.Conv2d(C // 2, C, 3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(C),
)
self.stem1 = nn.Sequential(
nn.ReLU(inplace=True),
nn.Conv2d(C, C, 3, stride=2, padding=1, bias=False),
nn.BatchNorm2d(C),
)
C_pprev = C_prev = C_curr = C
last_cell_reduce = True
elif dataset == 'cifar':
self.stem = nn.Sequential(
nn.Conv2d(3, 3 * C, 3, padding=1, bias=False),
nn.BatchNorm2d(3 * C)
)
C_pprev = C_prev = 3 * C
C_curr = C
last_cell_reduce = False
self.stages = nn.ModuleList()
for stage_idx in range(3):
if stage_idx > 0:
C_curr *= 2
# For a stage, we get C_in, C_curr, and C_out.
# C_in is only used in the first cell.
# C_curr is number of channels for each operator in current stage.
# C_out is usually `C * num_nodes_per_cell` because of concat operator.
cell_builder = CellBuilder(op_candidates, C_pprev, C_prev, C_curr, num_nodes_per_cell,
merge_op, stage_idx > 0, last_cell_reduce)
stage = nn.Repeat(cell_builder, num_cells_per_stage[stage_idx])
self.stages.append(stage)
# C_pprev is output channel number of last second cell among all the cells already built.
if len(stage) > 1:
# Contains more than one cell
C_pprev = len(stage[-2].output_node_indices) * C_curr
else:
# Look up in the out channels of last stage.
C_pprev = C_prev
# This was originally,
# C_prev = num_nodes_per_cell * C_curr.
# but due to loose end, it becomes,
C_prev = len(stage[-1].output_node_indices) * C_curr
# Useful in aligning the pprev and prev cell.
last_cell_reduce = cell_builder.last_cell_reduce
if stage_idx == 2:
C_to_auxiliary = C_prev
if auxiliary_loss:
assert isinstance(self.stages[2], nn.Sequential), 'Auxiliary loss can only be enabled in retrain mode.'
self.stages[2] = SequentialBreakdown(self.stages[2])
self.auxiliary_head = AuxiliaryHead(C_to_auxiliary, self.num_labels, dataset=self.dataset)
self.global_pooling = nn.AdaptiveAvgPool2d((1, 1))
self.classifier = nn.Linear(C_prev, self.num_labels)
def forward(self, inputs):
if self.dataset == 'imagenet':
s0 = self.stem0(inputs)
s1 = self.stem1(s0)
else:
s0 = s1 = self.stem(inputs)
for stage_idx, stage in enumerate(self.stages):
if stage_idx == 2 and self.auxiliary_loss:
s = list(stage([s0, s1]).values())
s0, s1 = s[-1]
if self.training:
# auxiliary loss is attached to the first cell of the last stage.
logits_aux = self.auxiliary_head(s[0][1])
else:
s0, s1 = stage([s0, s1])
out = self.global_pooling(s1)
logits = self.classifier(out.view(out.size(0), -1))
if self.training and self.auxiliary_loss:
return logits, logits_aux
else:
return logits
def set_drop_path_prob(self, drop_prob):
    """
    Set the drop probability of Drop-path in the network.

    Reference: `FractalNet: Ultra-Deep Neural Networks without Residuals <https://arxiv.org/pdf/1605.07648v4.pdf>`__.
    """
    # Walk every submodule and update the probability on each DropPath_ layer.
    drop_path_layers = (m for m in self.modules() if isinstance(m, DropPath_))
    for layer in drop_path_layers:
        layer.drop_prob = drop_prob
@model_wrapper
class NASNet(NDS):
    # Fix: the Sphinx role was written ``:attribute:`` which is not a valid
    # cross-reference role (the Python domain role is ``:attr:``), so docs
    # generation could not resolve the link.
    __doc__ = """
    Search space proposed in `Learning Transferable Architectures for Scalable Image Recognition <https://arxiv.org/abs/1707.07012>`__.

    It is built upon :class:`~nni.retiarii.nn.pytorch.Cell`, and implemented based on :class:`~NDS`.
    Its operator candidates are :attr:`~NASNet.NASNET_OPS`.
    It has 5 nodes per cell, and the output is concatenation of nodes not used as input to other nodes.
    """ + _INIT_PARAMETER_DOCS

    # Operator candidates as listed in the NASNet paper.
    NASNET_OPS = [
        'skip_connect',
        'conv_3x1_1x3',
        'conv_7x1_1x7',
        'dil_conv_3x3',
        'avg_pool_3x3',
        'max_pool_3x3',
        'max_pool_5x5',
        'max_pool_7x7',
        'conv_1x1',
        'conv_3x3',
        'sep_conv_3x3',
        'sep_conv_5x5',
        'sep_conv_7x7',
    ]

    def __init__(self,
                 width: Union[Tuple[int], int] = (16, 24, 32),
                 num_cells: Union[Tuple[int], int] = (4, 8, 12, 16, 20),
                 dataset: Literal['cifar', 'imagenet'] = 'cifar',
                 auxiliary_loss: bool = False):
        # 'loose_end' concatenates only nodes not consumed by other nodes.
        super().__init__(self.NASNET_OPS,
                         merge_op='loose_end',
                         num_nodes_per_cell=5,
                         width=width,
                         num_cells=num_cells,
                         dataset=dataset,
                         auxiliary_loss=auxiliary_loss)
@model_wrapper
class ENAS(NDS):
    # Fix: ``:attribute:`` is not a valid Sphinx role; the Python-domain role
    # is ``:attr:``.
    __doc__ = """Search space proposed in `Efficient neural architecture search via parameter sharing <https://arxiv.org/abs/1802.03268>`__.

    It is built upon :class:`~nni.retiarii.nn.pytorch.Cell`, and implemented based on :class:`~NDS`.
    Its operator candidates are :attr:`~ENAS.ENAS_OPS`.
    It has 5 nodes per cell, and the output is concatenation of nodes not used as input to other nodes.
    """ + _INIT_PARAMETER_DOCS

    # Operator candidates as listed in the ENAS paper.
    ENAS_OPS = [
        'skip_connect',
        'sep_conv_3x3',
        'sep_conv_5x5',
        'avg_pool_3x3',
        'max_pool_3x3',
    ]

    def __init__(self,
                 width: Union[Tuple[int], int] = (16, 24, 32),
                 num_cells: Union[Tuple[int], int] = (4, 8, 12, 16, 20),
                 dataset: Literal['cifar', 'imagenet'] = 'cifar',
                 auxiliary_loss: bool = False):
        # 'loose_end' concatenates only nodes not consumed by other nodes.
        super().__init__(self.ENAS_OPS,
                         merge_op='loose_end',
                         num_nodes_per_cell=5,
                         width=width,
                         num_cells=num_cells,
                         dataset=dataset,
                         auxiliary_loss=auxiliary_loss)
@model_wrapper
class AmoebaNet(NDS):
    # Fix: ``:attribute:`` is not a valid Sphinx role; the Python-domain role
    # is ``:attr:``.
    __doc__ = """Search space proposed in
    `Regularized evolution for image classifier architecture search <https://arxiv.org/abs/1802.01548>`__.

    It is built upon :class:`~nni.retiarii.nn.pytorch.Cell`, and implemented based on :class:`~NDS`.
    Its operator candidates are :attr:`~AmoebaNet.AMOEBA_OPS`.
    It has 5 nodes per cell, and the output is concatenation of nodes not used as input to other nodes.
    """ + _INIT_PARAMETER_DOCS

    # Operator candidates as listed in the AmoebaNet paper.
    AMOEBA_OPS = [
        'skip_connect',
        'sep_conv_3x3',
        'sep_conv_5x5',
        'sep_conv_7x7',
        'avg_pool_3x3',
        'max_pool_3x3',
        'dil_sep_conv_3x3',
        'conv_7x1_1x7',
    ]

    def __init__(self,
                 width: Union[Tuple[int], int] = (16, 24, 32),
                 num_cells: Union[Tuple[int], int] = (4, 8, 12, 16, 20),
                 dataset: Literal['cifar', 'imagenet'] = 'cifar',
                 auxiliary_loss: bool = False):
        # 'loose_end' concatenates only nodes not consumed by other nodes.
        super().__init__(self.AMOEBA_OPS,
                         merge_op='loose_end',
                         num_nodes_per_cell=5,
                         width=width,
                         num_cells=num_cells,
                         dataset=dataset,
                         auxiliary_loss=auxiliary_loss)
@model_wrapper
class PNAS(NDS):
    # Fix: ``:attribute:`` is not a valid Sphinx role; the Python-domain role
    # is ``:attr:``.
    __doc__ = """Search space proposed in
    `Progressive neural architecture search <https://arxiv.org/abs/1712.00559>`__.

    It is built upon :class:`~nni.retiarii.nn.pytorch.Cell`, and implemented based on :class:`~NDS`.
    Its operator candidates are :attr:`~PNAS.PNAS_OPS`.
    It has 5 nodes per cell, and the output is concatenation of all nodes in the cell.
    """ + _INIT_PARAMETER_DOCS

    # Operator candidates as listed in the PNAS paper.
    PNAS_OPS = [
        'sep_conv_3x3',
        'sep_conv_5x5',
        'sep_conv_7x7',
        'conv_7x1_1x7',
        'skip_connect',
        'avg_pool_3x3',
        'max_pool_3x3',
        'dil_conv_3x3',
    ]

    def __init__(self,
                 width: Union[Tuple[int], int] = (16, 24, 32),
                 num_cells: Union[Tuple[int], int] = (4, 8, 12, 16, 20),
                 dataset: Literal['cifar', 'imagenet'] = 'cifar',
                 auxiliary_loss: bool = False):
        # 'all' concatenates every node in the cell (unlike 'loose_end').
        super().__init__(self.PNAS_OPS,
                         merge_op='all',
                         num_nodes_per_cell=5,
                         width=width,
                         num_cells=num_cells,
                         dataset=dataset,
                         auxiliary_loss=auxiliary_loss)
@model_wrapper
class DARTS(NDS):
    # Fix: ``:attribute:`` is not a valid Sphinx role; the Python-domain role
    # is ``:attr:``.
    __doc__ = """Search space proposed in `Darts: Differentiable architecture search <https://arxiv.org/abs/1806.09055>`__.

    It is built upon :class:`~nni.retiarii.nn.pytorch.Cell`, and implemented based on :class:`~NDS`.
    Its operator candidates are :attr:`~DARTS.DARTS_OPS`.
    It has 4 nodes per cell, and the output is concatenation of all nodes in the cell.
    """ + _INIT_PARAMETER_DOCS

    # Operator candidates as listed in the DARTS paper.
    DARTS_OPS = [
        'none',
        'max_pool_3x3',
        'avg_pool_3x3',
        'skip_connect',
        'sep_conv_3x3',
        'sep_conv_5x5',
        'dil_conv_3x3',
        'dil_conv_5x5',
    ]

    def __init__(self,
                 width: Union[Tuple[int], int] = (16, 24, 32),
                 num_cells: Union[Tuple[int], int] = (4, 8, 12, 16, 20),
                 dataset: Literal['cifar', 'imagenet'] = 'cifar',
                 auxiliary_loss: bool = False):
        # 'all' concatenates every node; DARTS uses 4 nodes per cell.
        super().__init__(self.DARTS_OPS,
                         merge_op='all',
                         num_nodes_per_cell=4,
                         width=width,
                         num_cells=num_cells,
                         dataset=dataset,
                         auxiliary_loss=auxiliary_loss)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import math
from typing import Optional, Callable, List, Tuple
import torch
import nni.retiarii.nn.pytorch as nn
from nni.retiarii import model_wrapper
def make_divisible(v, divisor, min_val=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    """
    # Default lower bound is the divisor itself.
    if min_val is None:
        min_val = divisor
    # This should work for both value choices and constants.
    # Round to the nearest multiple of `divisor`, but never below `min_val`.
    # NOTE(review): `nn.ValueChoice.max` / `.condition` presumably mirror the
    # builtin max / ternary for plain ints while staying symbolic for
    # ValueChoice inputs — confirm against the ValueChoice API.
    new_v = nn.ValueChoice.max(min_val, round(v + divisor // 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    return nn.ValueChoice.condition(new_v < 0.9 * v, new_v + divisor, new_v)
class ConvBNReLU(nn.Sequential):
    """
    The template for a conv-bn-relu block.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        groups: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        activation_layer: Optional[Callable[..., nn.Module]] = None,
        dilation: int = 1,
    ) -> None:
        # "same"-style padding for odd kernels, scaled by dilation.
        padding = (kernel_size - 1) // 2 * dilation
        # Defaults: BatchNorm2d for normalization, ReLU6 for activation.
        norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer
        activation_layer = nn.ReLU6 if activation_layer is None else activation_layer
        conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding,
                         dilation=dilation, groups=groups, bias=False)
        super().__init__(conv, norm_layer(out_channels), activation_layer(inplace=True))
        self.out_channels = out_channels
class SeparableConv(nn.Sequential):
    """
    In the original MobileNetV2 implementation, this is InvertedResidual when expand ratio = 1.
    Residual connection is added if input and output shape are the same.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        activation_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        # Depth-wise conv keeps channel count; point-wise linear projects it.
        depthwise = ConvBNReLU(in_channels, in_channels, stride=stride, kernel_size=kernel_size,
                               groups=in_channels, norm_layer=norm_layer,
                               activation_layer=activation_layer)
        pointwise_linear = ConvBNReLU(in_channels, out_channels, kernel_size=1,
                                      norm_layer=norm_layer, activation_layer=nn.Identity)
        super().__init__(depthwise, pointwise_linear)
        # Shortcut only when spatial size and channel count are both preserved.
        self.residual_connection = stride == 1 and in_channels == out_channels

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = super().forward(x)
        return x + out if self.residual_connection else out
class InvertedResidual(nn.Sequential):
    """
    An Inverted Residual Block, sometimes called an MBConv Block, is a type of residual block used for image models
    that uses an inverted structure for efficiency reasons.

    It was originally proposed for the `MobileNetV2 <https://arxiv.org/abs/1801.04381>`__ CNN architecture.
    It has since been reused for several mobile-optimized CNNs.
    It follows a narrow -> wide -> narrow approach, hence the inversion.
    It first widens with a 1x1 convolution, then uses a 3x3 depthwise convolution (which greatly reduces the number of parameters),
    then a 1x1 convolution is used to reduce the number of channels so input and output can be added.

    Follow implementation of:
    https://github.com/google-research/google-research/blob/20736344591f774f4b1570af64624ed1e18d2867/tunas/rematlib/mobile_model_v3.py#L453
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        expand_ratio: int,
        kernel_size: int = 3,
        stride: int = 1,
        squeeze_and_excite: Optional[Callable[[int], nn.Module]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        activation_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        # Empty init first so plain attributes can be set before the layer
        # list is assembled; super().__init__ is invoked again with the layers
        # at the end of this method.
        super().__init__()
        self.stride = stride
        self.out_channels = out_channels
        assert stride in [1, 2]

        # Expanded ("hidden") width; to_int keeps this symbolic when
        # expand_ratio or in_channels is a ValueChoice.
        hidden_ch = nn.ValueChoice.to_int(round(in_channels * expand_ratio))

        # FIXME: check whether this equal works
        # Residual connection is added here stride = 1 and input channels and output channels are the same.
        self.residual_connection = stride == 1 and in_channels == out_channels

        layers: List[nn.Module] = [
            # point-wise convolution
            # NOTE: some paper omit this point-wise convolution when stride = 1.
            # In our implementation, if this pw convolution is intended to be omitted,
            # please use SepConv instead.
            ConvBNReLU(in_channels, hidden_ch, kernel_size=1,
                       norm_layer=norm_layer, activation_layer=activation_layer),
            # depth-wise
            ConvBNReLU(hidden_ch, hidden_ch, stride=stride, kernel_size=kernel_size, groups=hidden_ch,
                       norm_layer=norm_layer, activation_layer=activation_layer)
        ]

        if squeeze_and_excite:
            # Optional squeeze-and-excite module on the expanded channels.
            layers.append(squeeze_and_excite(hidden_ch))

        layers += [
            # pw-linear
            ConvBNReLU(hidden_ch, out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=nn.Identity)
        ]

        super().__init__(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Add the shortcut only when shapes match (see residual_connection above).
        if self.residual_connection:
            return x + super().forward(x)
        else:
            return super().forward(x)
def inverted_residual_choice_builder(
    expand_ratios: List[int],
    kernel_sizes: List[int],
    downsample: bool,
    stage_input_width: int,
    stage_output_width: int,
    label: str
):
    """Return a per-repeat factory producing a LayerChoice over InvertedResidual variants."""

    def builder(index):
        first_in_stage = index == 0
        # Only the first block of a stage changes width (and possibly resolution).
        in_width = stage_input_width if first_in_stage else stage_output_width
        stride = 2 if (first_in_stage and downsample) else 1

        # It can be implemented with ValueChoice, but we use LayerChoice here
        # to be aligned with the intention of the original ProxylessNAS.
        candidates = {
            f'k{ks}e{er}': InvertedResidual(in_width, stage_output_width, er, ks, stride)
            for er in expand_ratios
            for ks in kernel_sizes
        }
        return nn.LayerChoice(candidates, label=f'{label}_i{index}')

    return builder
@model_wrapper
class ProxylessNAS(nn.Module):
    """
    The search space proposed by `ProxylessNAS <https://arxiv.org/abs/1812.00332>`__.

    Following the official implementation, the inverted residual with kernel size / expand ratio variations in each layer
    is implemented with a :class:`nn.LayerChoice` with all-combination candidates. That means,
    when used in weight sharing, these candidates will be treated as separate layers, and won't be fine-grained shared.
    We note that ``MobileNetV3Space`` is different in this perspective.
    """

    def __init__(self, num_labels: int = 1000,
                 base_widths: Tuple[int, ...] = (32, 16, 32, 40, 80, 96, 192, 320, 1280),
                 dropout_rate: float = 0.,
                 width_mult: float = 1.0,
                 bn_eps: float = 1e-3,
                 bn_momentum: float = 0.1):
        super().__init__()

        # 9 entries: stem, 7 stages, and the final feature-mix width.
        assert len(base_widths) == 9
        # include the last stage info widths here
        # Scale by width multiplier, rounded to a multiple of 8.
        widths = [make_divisible(width * width_mult, 8) for width in base_widths]
        # Whether each stage halves the spatial resolution.
        downsamples = [True, False, True, True, True, False, True, False]

        self.num_labels = num_labels
        self.dropout_rate = dropout_rate
        self.bn_eps = bn_eps
        self.bn_momentum = bn_momentum

        # Stem: stride-2 3x3 conv.
        self.first_conv = ConvBNReLU(3, widths[0], stride=2, norm_layer=nn.BatchNorm2d)

        blocks = [
            # first stage is fixed
            SeparableConv(widths[0], widths[1], kernel_size=3, stride=1)
        ]

        # https://github.com/ultmaster/AceNAS/blob/46c8895fd8a05ffbc61a6b44f1e813f64b4f66b7/searchspace/proxylessnas/__init__.py#L21
        for stage in range(2, 8):
            # Rather than returning a fixed module here,
            # we return a builder that dynamically creates module for different `repeat_idx`.
            builder = inverted_residual_choice_builder(
                [3, 6], [3, 5, 7], downsamples[stage], widths[stage - 1], widths[stage], f's{stage}')
            if stage < 6:
                # Searchable depth: 1 to 4 repeats per stage.
                blocks.append(nn.Repeat(builder, (1, 4), label=f's{stage}_depth'))
            else:
                # No mutation for depth in the last stage.
                # Directly call builder to initiate one block
                blocks.append(builder(0))

        self.blocks = nn.Sequential(*blocks)

        # final layers
        self.feature_mix_layer = ConvBNReLU(widths[7], widths[8], kernel_size=1, norm_layer=nn.BatchNorm2d)
        self.global_avg_pooling = nn.AdaptiveAvgPool2d(1)
        self.dropout_layer = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(widths[-1], num_labels)

        # He-style init for convs; also applies the requested BN hyper-params.
        reset_parameters(self, bn_momentum=bn_momentum, bn_eps=bn_eps)

    def forward(self, x):
        x = self.first_conv(x)
        x = self.blocks(x)
        x = self.feature_mix_layer(x)
        x = self.global_avg_pooling(x)
        x = x.view(x.size(0), -1)  # flatten
        x = self.dropout_layer(x)
        x = self.classifier(x)
        return x

    def no_weight_decay(self):
        # this is useful for timm optimizer
        # no regularizer to linear layer
        if hasattr(self, 'classifier'):
            return {'classifier.weight', 'classifier.bias'}
        return set()
def reset_parameters(model, model_init='he_fout', init_div_groups=False,
                     bn_momentum=0.1, bn_eps=1e-5):
    """Re-initialize weights of ``model`` in place.

    Convs get He init ('he_fout' fan-out or 'he_fin' fan-in, optionally divided
    by groups); batch-norm weights/biases are reset to 1/0 (2d norms also get
    the given momentum/eps); linear layers get N(0, 0.01) weights and zero bias.
    Raises NotImplementedError for an unknown ``model_init``.
    """
    for module in model.modules():
        if isinstance(module, nn.Conv2d):
            if model_init == 'he_fout':
                fan = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
            elif model_init == 'he_fin':
                fan = module.kernel_size[0] * module.kernel_size[1] * module.in_channels
            else:
                raise NotImplementedError
            if init_div_groups:
                fan /= module.groups
            module.weight.data.normal_(0, math.sqrt(2. / fan))
        elif isinstance(module, nn.BatchNorm2d):
            module.weight.data.fill_(1)
            module.bias.data.zero_()
            module.momentum = bn_momentum
            module.eps = bn_eps
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(0, 0.01)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.BatchNorm1d):
            # NOTE: 1d norms only get their affine params reset, not momentum/eps.
            module.weight.data.fill_(1)
            module.bias.data.zero_()
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import torch
import nni.retiarii.nn.pytorch as nn
from nni.retiarii import model_wrapper
class ShuffleNetBlock(nn.Module):
    """
    Describe the basic building block of shuffle net, as described in
    `ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices <https://arxiv.org/pdf/1707.01083.pdf>`__.

    When stride = 1, the block expects an input with ``2 * input channels``. Otherwise input channels.
    """

    def __init__(self, in_channels: int, out_channels: int, mid_channels: int, *,
                 kernel_size: int, stride: int, sequence: str = "pdp", affine: bool = True):
        super().__init__()
        assert stride in [1, 2]
        assert kernel_size in [3, 5, 7]
        # Channels on the main branch: with stride 1 the channel shuffle splits
        # the input in half; with stride 2 both branches see the whole input.
        self.channels = in_channels // 2 if stride == 1 else in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.mid_channels = mid_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.pad = kernel_size // 2
        # Main-branch output channels: total output minus the shortcut branch.
        self.oup_main = out_channels - self.channels
        self.affine = affine
        assert self.oup_main > 0

        self.branch_main = nn.Sequential(*self._decode_point_depth_conv(sequence))

        if stride == 2:
            # Projection branch downsamples the shortcut (dw conv + pw-linear).
            self.branch_proj = nn.Sequential(
                # dw
                nn.Conv2d(self.channels, self.channels, kernel_size, stride, self.pad,
                          groups=self.channels, bias=False),
                nn.BatchNorm2d(self.channels, affine=affine),
                # pw-linear
                nn.Conv2d(self.channels, self.channels, 1, 1, 0, bias=False),
                nn.BatchNorm2d(self.channels, affine=affine),
                nn.ReLU(inplace=True)
            )
        else:
            # empty block to be compatible with torchscript
            self.branch_proj = nn.Sequential()

    def forward(self, x):
        if self.stride == 2:
            x_proj, x = self.branch_proj(x), x
        else:
            # Shuffle then split: one half goes straight through, the other
            # through the main branch.
            x_proj, x = self._channel_shuffle(x)
        return torch.cat((x_proj, self.branch_main(x)), 1)

    def _decode_point_depth_conv(self, sequence):
        # Build the main-branch layers from a string of 'p' (point-wise) and
        # 'd' (depth-wise) tokens, e.g. "pdp" or "dpdpdp".
        result = []
        first_depth = first_point = True
        pc = c = self.channels
        for i, token in enumerate(sequence):
            # compute output channels of this conv
            if i + 1 == len(sequence):
                assert token == "p", "Last conv must be point-wise conv."
                c = self.oup_main
            elif token == "p" and first_point:
                c = self.mid_channels
            if token == "d":
                # depth-wise conv
                if isinstance(pc, int) and isinstance(c, int):
                    # check can only be done for static channels
                    assert pc == c, "Depth-wise conv must not change channels."
                # Only the first depth-wise conv carries the block's stride.
                result.append(nn.Conv2d(pc, c, self.kernel_size, self.stride if first_depth else 1, self.pad,
                                        groups=c, bias=False))
                result.append(nn.BatchNorm2d(c, affine=self.affine))
                first_depth = False
            elif token == "p":
                # point-wise conv
                result.append(nn.Conv2d(pc, c, 1, 1, 0, bias=False))
                result.append(nn.BatchNorm2d(c, affine=self.affine))
                result.append(nn.ReLU(inplace=True))
                first_point = False
            else:
                raise ValueError("Conv sequence must be d and p.")
            pc = c
        return result

    def _channel_shuffle(self, x):
        # Interleave channels pairwise, then split into two halves of
        # num_channels // 2 each (returned as a tuple).
        bs, num_channels, height, width = x.size()
        # NOTE: this line is commented for torchscript
        # assert (num_channels % 4 == 0)
        x = x.reshape(bs * num_channels // 2, 2, height * width)
        x = x.permute(1, 0, 2)
        x = x.reshape(2, -1, num_channels // 2, height, width)
        return x[0], x[1]
class ShuffleXceptionBlock(ShuffleNetBlock):
    """
    The ``choice_x`` version of shuffle net block, described in
    `Single Path One-shot <https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123610528.pdf>`__.
    """

    def __init__(self, in_channels: int, out_channels: int, mid_channels: int, *, stride: int, affine: bool = True):
        # Three depth-wise/point-wise pairs with a fixed 3x3 kernel.
        super().__init__(
            in_channels,
            out_channels,
            mid_channels,
            kernel_size=3,
            stride=stride,
            sequence="dpdpdp",
            affine=affine,
        )
@model_wrapper
class ShuffleNetSpace(nn.Module):
    """
    The search space proposed in `Single Path One-shot <https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123610528.pdf>`__.

    The basic building block design is inspired by a state-of-the-art manually-designed network --
    `ShuffleNetV2 <https://openaccess.thecvf.com/content_ECCV_2018/html/Ningning_Light-weight_CNN_Architecture_ECCV_2018_paper.html>`__.
    There are 20 choice blocks in total. Each choice block has 4 candidates, namely ``choice 3``, ``choice 5``,
    ``choice_7`` and ``choice_x`` respectively. They differ in kernel sizes and the number of depthwise convolutions.
    The size of the search space is :math:`4^{20}`.

    Parameters
    ----------
    num_labels : int
        Number of classes for the classification head. Default: 1000.
    channel_search : bool
        If true, for each building block, the number of ``mid_channels``
        (output channels of the first 1x1 conv in each building block) varies from 0.2x to 1.6x (quantized to multiple of 0.2).
        Here, "k-x" means k times the number of default channels.
        Otherwise, 1.0x is used by default. Default: false.
    affine : bool
        Apply affine to all batch norm. Default: false.
    """

    def __init__(self,
                 num_labels: int = 1000,
                 channel_search: bool = False,
                 affine: bool = False):
        super().__init__()

        self.num_labels = num_labels
        self.channel_search = channel_search
        self.affine = affine

        # the block number in each stage. 4 stages in total. 20 blocks in total.
        self.stage_repeats = [4, 4, 8, 4]

        # output channels for all stages, including the very first layer and the very last layer
        self.stage_out_channels = [-1, 16, 64, 160, 320, 640, 1024]

        # building first layer
        out_channels = self.stage_out_channels[1]
        self.first_conv = nn.Sequential(
            nn.Conv2d(3, out_channels, 3, 2, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

        self.features = []

        global_block_idx = 0
        for stage_idx, num_repeat in enumerate(self.stage_repeats):
            for block_idx in range(num_repeat):
                # count global index to give names to choices
                global_block_idx += 1

                # get ready for input and output
                in_channels = out_channels
                out_channels = self.stage_out_channels[stage_idx + 2]
                # Only the first block of each stage downsamples.
                stride = 2 if block_idx == 0 else 1

                # mid channels can be searched
                base_mid_channels = out_channels // 2
                if self.channel_search:
                    # Candidate widths: 0.2x to 1.6x of the base, step 0.2.
                    k_choice_list = [int(base_mid_channels * (.2 * k)) for k in range(1, 9)]
                    mid_channels = nn.ValueChoice(k_choice_list, label=f'channel_{global_block_idx}')
                else:
                    mid_channels = int(base_mid_channels)

                # The four candidates: kernel 3/5/7 ShuffleNet blocks plus the
                # Xception-style block.
                choice_block = nn.LayerChoice([
                    ShuffleNetBlock(in_channels, out_channels, mid_channels=mid_channels, kernel_size=3, stride=stride, affine=affine),
                    ShuffleNetBlock(in_channels, out_channels, mid_channels=mid_channels, kernel_size=5, stride=stride, affine=affine),
                    ShuffleNetBlock(in_channels, out_channels, mid_channels=mid_channels, kernel_size=7, stride=stride, affine=affine),
                    ShuffleXceptionBlock(in_channels, out_channels, mid_channels=mid_channels, stride=stride, affine=affine)
                ], label=f'layer_{global_block_idx}')
                self.features.append(choice_block)

        self.features = nn.Sequential(*self.features)

        # final layers
        last_conv_channels = self.stage_out_channels[-1]
        self.conv_last = nn.Sequential(
            nn.Conv2d(out_channels, last_conv_channels, 1, 1, 0, bias=False),
            nn.BatchNorm2d(last_conv_channels, affine=affine),
            nn.ReLU(inplace=True),
        )
        self.globalpool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Sequential(
            nn.Linear(last_conv_channels, num_labels, bias=False),
        )
        self._initialize_weights()

    def forward(self, x):
        x = self.first_conv(x)
        x = self.features(x)
        x = self.conv_last(x)
        x = self.globalpool(x)
        x = self.dropout(x)
        # Flatten; batch dim inferred from the fixed final channel count.
        x = x.contiguous().view(-1, self.stage_out_channels[-1])
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        # Name-aware init: the stem conv uses a fixed std, later convs scale
        # by fan-in; BN biases get a small positive constant.
        for name, m in self.named_modules():
            if isinstance(m, nn.Conv2d):
                if 'first' in name:
                    torch.nn.init.normal_(m.weight, 0, 0.01)
                else:
                    torch.nn.init.normal_(m.weight, 0, 1.0 / m.weight.shape[1])
                if m.bias is not None:
                    torch.nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                if m.weight is not None:
                    torch.nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    torch.nn.init.constant_(m.bias, 0.0001)
                torch.nn.init.constant_(m.running_mean, 0)
            elif isinstance(m, nn.BatchNorm1d):
                if m.weight is not None:
                    torch.nn.init.constant_(m.weight, 1)
                if m.bias is not None:
                    torch.nn.init.constant_(m.bias, 0.0001)
                torch.nn.init.constant_(m.running_mean, 0)
            elif isinstance(m, nn.Linear):
                torch.nn.init.normal_(m.weight, 0, 0.01)
                if m.bias is not None:
                    torch.nn.init.constant_(m.bias, 0)
...@@ -388,7 +388,7 @@ def _valuechoice_staticmethod_helper(orig_func): ...@@ -388,7 +388,7 @@ def _valuechoice_staticmethod_helper(orig_func):
return orig_func return orig_func
class ValueChoiceX(Translatable): class ValueChoiceX(Translatable, nn.Module):
"""Internal API. Implementation note: """Internal API. Implementation note:
The transformed (X) version of value choice. The transformed (X) version of value choice.
...@@ -404,6 +404,8 @@ class ValueChoiceX(Translatable): ...@@ -404,6 +404,8 @@ class ValueChoiceX(Translatable):
2. For graph-engine, it uses evaluate to calculate the result. 2. For graph-engine, it uses evaluate to calculate the result.
Potentially, we have to implement the evaluation logic in oneshot algorithms. I believe we can postpone the discussion till then. Potentially, we have to implement the evaluation logic in oneshot algorithms. I believe we can postpone the discussion till then.
This class is implemented as a ``nn.Module`` so that it can be scanned by python engine / torchscript.
""" """
def __init__(self, function: Callable[..., Any], repr_template: str, arguments: List[Any], dry_run: bool = True): def __init__(self, function: Callable[..., Any], repr_template: str, arguments: List[Any], dry_run: bool = True):
...@@ -424,6 +426,9 @@ class ValueChoiceX(Translatable): ...@@ -424,6 +426,9 @@ class ValueChoiceX(Translatable):
# for sanity check # for sanity check
self.dry_run() self.dry_run()
def forward(self) -> None:
raise RuntimeError('You should never call forward of the composition of a value-choice.')
def inner_choices(self) -> Iterable['ValueChoice']: def inner_choices(self) -> Iterable['ValueChoice']:
""" """
Return an iterable of all leaf value choices. Return an iterable of all leaf value choices.
......
...@@ -93,6 +93,8 @@ class Repeat(Mutable): ...@@ -93,6 +93,8 @@ class Repeat(Mutable):
depth: Union[int, Tuple[int, int]], *, label: Optional[str] = None): depth: Union[int, Tuple[int, int]], *, label: Optional[str] = None):
super().__init__() super().__init__()
self._label = None # by default, no label
if isinstance(depth, ValueChoiceX): if isinstance(depth, ValueChoiceX):
if label is not None: if label is not None:
warnings.warn( warnings.warn(
...@@ -103,10 +105,16 @@ class Repeat(Mutable): ...@@ -103,10 +105,16 @@ class Repeat(Mutable):
all_values = list(self.depth_choice.all_options()) all_values = list(self.depth_choice.all_options())
self.min_depth = min(all_values) self.min_depth = min(all_values)
self.max_depth = max(all_values) self.max_depth = max(all_values)
if isinstance(depth, ValueChoice):
self._label = depth.label # if a leaf node
elif isinstance(depth, tuple): elif isinstance(depth, tuple):
self.min_depth = depth if isinstance(depth, int) else depth[0] self.min_depth = depth if isinstance(depth, int) else depth[0]
self.max_depth = depth if isinstance(depth, int) else depth[1] self.max_depth = depth if isinstance(depth, int) else depth[1]
self.depth_choice = ValueChoice(list(range(self.min_depth, self.max_depth + 1)), label=label) self.depth_choice = ValueChoice(list(range(self.min_depth, self.max_depth + 1)), label=label)
self._label = self.depth_choice.label
elif isinstance(depth, int): elif isinstance(depth, int):
self.min_depth = self.max_depth = depth self.min_depth = self.max_depth = depth
self.depth_choice = depth self.depth_choice = depth
...@@ -116,8 +124,8 @@ class Repeat(Mutable): ...@@ -116,8 +124,8 @@ class Repeat(Mutable):
self.blocks = nn.ModuleList(self._replicate_and_instantiate(blocks, self.max_depth)) self.blocks = nn.ModuleList(self._replicate_and_instantiate(blocks, self.max_depth))
@property @property
def label(self): def label(self) -> Optional[str]:
return self.depth_choice.label return self._label
def forward(self, x): def forward(self, x):
for block in self.blocks: for block in self.blocks:
...@@ -142,6 +150,9 @@ class Repeat(Mutable): ...@@ -142,6 +150,9 @@ class Repeat(Mutable):
# shortcut for blocks[index] # shortcut for blocks[index]
return self.blocks[index] return self.blocks[index]
def __len__(self):
return self.max_depth
class NasBench201Cell(nn.Module): class NasBench201Cell(nn.Module):
""" """
......
...@@ -311,9 +311,10 @@ def extract_mutation_from_pt_module(pytorch_model: nn.Module) -> Tuple[Model, Op ...@@ -311,9 +311,10 @@ def extract_mutation_from_pt_module(pytorch_model: nn.Module) -> Tuple[Model, Op
node = graph.add_node(name, 'InputChoice', node = graph.add_node(name, 'InputChoice',
{'n_candidates': module.n_candidates, 'n_chosen': module.n_chosen}) {'n_candidates': module.n_candidates, 'n_chosen': module.n_chosen})
node.label = module.label node.label = module.label
if isinstance(module, ValueChoice): if isinstance(module, ValueChoiceX):
node = graph.add_node(name, 'ValueChoice', {'candidates': module.candidates}) for i, choice in enumerate(module.inner_choices()):
node.label = module.label node = graph.add_node(f'{name}.{i}', 'ValueChoice', {'candidates': choice.candidates})
node.label = choice.label
if isinstance(module, NasBench101Cell): if isinstance(module, NasBench101Cell):
node = graph.add_node(name, 'NasBench101Cell', { node = graph.add_node(name, 'NasBench101Cell', {
'max_num_edges': module.max_num_edges 'max_num_edges': module.max_num_edges
......
...@@ -683,6 +683,27 @@ class GraphIR(unittest.TestCase): ...@@ -683,6 +683,27 @@ class GraphIR(unittest.TestCase):
new_model = _apply_all_mutators(model, mutators, samplers) new_model = _apply_all_mutators(model, mutators, samplers)
self.assertTrue((self._get_converted_pytorch_model(new_model)(torch.zeros(1, 16)) == target).all()) self.assertTrue((self._get_converted_pytorch_model(new_model)(torch.zeros(1, 16)) == target).all())
def test_repeat_valuechoicex(self):
class AddOne(nn.Module):
def forward(self, x):
return x + 1
@model_wrapper
class Net(nn.Module):
def __init__(self):
super().__init__()
self.block = nn.Repeat(AddOne(), nn.ValueChoice([0, 2, 4]) + 1)
def forward(self, x):
return self.block(x)
model, mutators = self._get_model_with_mutators(Net())
self.assertEqual(len(mutators), 1 + self.repeat_incr + self.value_choice_incr)
samplers = [EnumerateSampler() for _ in range(len(mutators))]
for target in [1, 3, 5]:
new_model = _apply_all_mutators(model, mutators, samplers)
self.assertTrue((self._get_converted_pytorch_model(new_model)(torch.zeros(1, 16)) == target).all())
def test_repeat_weight_inheritance(self): def test_repeat_weight_inheritance(self):
@model_wrapper @model_wrapper
class Net(nn.Module): class Net(nn.Module):
......
"""Currently, this is only a sanity-check (runnable) of spaces provided in hub."""
import random
from torchvision import transforms
from torchvision.datasets import FakeData
import pytest
import pytorch_lightning
import nni
import nni.runtime.platform.test
import nni.retiarii.evaluator.pytorch.lightning as pl
import nni.retiarii.hub.pytorch as searchspace
from nni.retiarii.utils import ContextStack
from nni.retiarii.execution.utils import _unpack_if_only_one
from nni.retiarii.mutator import InvalidMutation, Sampler
from nni.retiarii.nn.pytorch.mutator import extract_mutation_from_pt_module
pytestmark = pytest.mark.skipif(pytorch_lightning.__version__ < '1.0', reason='Incompatible APIs.')
def _reset():
    """Restore NNI trial-platform globals so other SDK tests see a clean slate."""
    # this is to not affect other tests in sdk
    nni.runtime.platform.test._last_metric = None
    nni.trial._params = {'foo': 'bar', 'parameter_id': 0}
    nni.trial._intermediate_seq = 0
class RandomSampler(Sampler):
    """Sampler that picks uniformly at random, counting how many choices were made."""

    def __init__(self):
        self.counter = 0

    def choice(self, candidates, *args, **kwargs):
        picked = random.choice(candidates)
        self.counter += 1
        return picked
def try_mutation_until_success(base_model, mutators, retry):
    """Sample a mutated model, retrying up to ``retry`` times on InvalidMutation.

    Raises ValueError once all attempts are exhausted.
    """
    attempts_left = retry
    while True:
        if not attempts_left:
            raise ValueError('Retry exhausted.')
        try:
            candidate = base_model
            for mutator in mutators:
                candidate = mutator.bind_sampler(RandomSampler()).apply(candidate)
            return candidate
        except InvalidMutation:
            attempts_left -= 1
def _test_searchspace_on_dataset(searchspace, dataset='cifar10', arch=None):
    """Instantiate one architecture from ``searchspace`` and fit it briefly on fake data.

    Parameters
    ----------
    searchspace
        A Retiarii search-space module to extract mutations from.
    dataset : str
        Either ``'cifar10'`` or ``'imagenet'`` (controls fake-data shape).
    arch : dict, optional
        A fixed architecture dict; when ``None`` one is sampled randomly.
    """
    _reset()

    # dataset supports cifar10 and imagenet
    model, mutators = extract_mutation_from_pt_module(searchspace)
    if arch is None:
        model = try_mutation_until_success(model, mutators, 10)
        # Collect the sampled decisions into a label -> choice dict.
        arch = {mut.mutator.label: _unpack_if_only_one(mut.samples) for mut in model.history}

    print('Selected model:', arch)
    # Instantiate the concrete PyTorch model under a fixed-arch context.
    with ContextStack('fixed', arch):
        model = model.python_class(**model.python_init_params)

    if dataset == 'cifar10':
        train_data = FakeData(size=200, image_size=(3, 32, 32), num_classes=10, transform=transforms.ToTensor())
        valid_data = FakeData(size=200, image_size=(3, 32, 32), num_classes=10, transform=transforms.ToTensor())
    elif dataset == 'imagenet':
        train_data = FakeData(size=200, image_size=(3, 224, 224), num_classes=1000, transform=transforms.ToTensor())
        valid_data = FakeData(size=200, image_size=(3, 224, 224), num_classes=1000, transform=transforms.ToTensor())
    else:
        # FIX: previously an unknown dataset fell through to a confusing
        # NameError on `train_data`; fail fast with a clear message instead.
        raise ValueError(f'Unsupported dataset: {dataset!r} (expected "cifar10" or "imagenet")')

    train_dataloader = pl.DataLoader(train_data, batch_size=4, shuffle=True)
    valid_dataloader = pl.DataLoader(valid_data, batch_size=6)

    # A single, heavily-truncated epoch — this is only a smoke test.
    evaluator = pl.Classification(
        train_dataloader=train_dataloader,
        val_dataloaders=valid_dataloader,
        export_onnx=False,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=3,
    )
    evaluator.fit(model)

    # cleanup to avoid affecting later test cases
    _reset()
def test_nasbench101():
    """Smoke-test the NAS-Bench-101 space on fake CIFAR-10 data."""
    _test_searchspace_on_dataset(searchspace.NasBench101())
def test_nasbench201():
    """Smoke-test the NAS-Bench-201 space on fake CIFAR-10 data."""
    # FIX: this test previously instantiated ``searchspace.NasBench101()``
    # (copy-paste from the test above), so NasBench201 was never exercised.
    ss = searchspace.NasBench201()
    _test_searchspace_on_dataset(ss)
def test_nasnet():
    """Smoke-test each NDS-family space on fake CIFAR-10 data."""
    # Instantiate lazily inside the loop so per-test state stays interleaved
    # exactly as in sequential construct-then-run calls.
    for space_cls in (searchspace.NASNet, searchspace.ENAS, searchspace.AmoebaNet,
                      searchspace.PNAS, searchspace.DARTS):
        _test_searchspace_on_dataset(space_cls())
def test_nasnet_corner_case():
    # The case is that output channel of reduce cell and normal cell are different
    # CellPreprocessor needs to know whether its predecessors are normal cell / reduction cell
    # The fixed sample below pins every cell-level choice:
    #   "<cell>/op_<node>_<slot>"    — operator applied on each of the two input slots of a node
    #   "<cell>/input_<node>_<slot>" — index of the predecessor node feeding that slot
    # It is fed to NASNet via ``arch=`` so no random sampling is involved.
    arch = {
        "width": 32,
        "depth": 8,
        "normal/op_2_0": "max_pool_7x7",
        "normal/op_2_1": "conv_1x1",
        "normal/op_3_0": "sep_conv_5x5",
        "normal/op_3_1": "max_pool_7x7",
        "normal/op_4_0": "sep_conv_5x5",
        "normal/op_4_1": "conv_1x1",
        "normal/op_5_0": "max_pool_3x3",
        "normal/op_5_1": "sep_conv_5x5",
        "normal/op_6_0": "max_pool_7x7",
        "normal/op_6_1": "sep_conv_5x5",
        "normal/input_2_0": 0,
        "normal/input_2_1": 0,
        "normal/input_3_0": 0,
        "normal/input_3_1": 1,
        "normal/input_4_0": 1,
        "normal/input_4_1": 2,
        "normal/input_5_0": 0,
        "normal/input_5_1": 1,
        "normal/input_6_0": 0,
        "normal/input_6_1": 2,
        "reduce/op_2_0": "dil_conv_3x3",
        "reduce/op_2_1": "max_pool_7x7",
        "reduce/op_3_0": "dil_conv_3x3",
        "reduce/op_3_1": "dil_conv_3x3",
        "reduce/op_4_0": "conv_7x1_1x7",
        "reduce/op_4_1": "conv_7x1_1x7",
        "reduce/op_5_0": "max_pool_3x3",
        "reduce/op_5_1": "conv_1x1",
        "reduce/op_6_0": "sep_conv_7x7",
        "reduce/op_6_1": "sep_conv_3x3",
        "reduce/input_2_0": 1,
        "reduce/input_2_1": 1,
        "reduce/input_3_0": 0,
        "reduce/input_3_1": 1,
        "reduce/input_4_0": 2,
        "reduce/input_4_1": 1,
        "reduce/input_5_0": 0,
        "reduce/input_5_1": 4,
        "reduce/input_6_0": 3,
        "reduce/input_6_1": 3,
    }
    _test_searchspace_on_dataset(searchspace.NASNet(), arch=arch)
def test_nasnet_fixwd():
    """NASNet-family spaces with fixed (non-searched) width and depth."""
    _test_searchspace_on_dataset(searchspace.DARTS(width=16, num_cells=4))    # minimum
    _test_searchspace_on_dataset(searchspace.NASNet(width=16, num_cells=12))  # medium
def test_nasnet_imagenet():
    """ENAS and PNAS configured for ImageNet, run on fake ImageNet-shaped data."""
    for space_cls in (searchspace.ENAS, searchspace.PNAS):
        _test_searchspace_on_dataset(space_cls(dataset='imagenet'), dataset='imagenet')
def test_proxylessnas():
    """Smoke-test the ProxylessNAS space on fake ImageNet-shaped data."""
    _test_searchspace_on_dataset(searchspace.ProxylessNAS(), dataset='imagenet')
def test_mobilenetv3():
    """Smoke-test the MobileNetV3 space on fake ImageNet-shaped data."""
    _test_searchspace_on_dataset(searchspace.MobileNetV3Space(), dataset='imagenet')
def test_shufflenet():
    """ShuffleNet space, with and without channel search, on fake ImageNet data."""
    _test_searchspace_on_dataset(searchspace.ShuffleNetSpace(), dataset='imagenet')
    _test_searchspace_on_dataset(searchspace.ShuffleNetSpace(channel_search=True), dataset='imagenet')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment