Merge pull request #4760 from microsoft/dev-oneshot

[DO NOT SQUASH] One-shot as strategy

Merge pull request #4760 from microsoft/dev-oneshot
[DO NOT SQUASH] One-shot as strategy
8547b21c · Yuge Zhang · GitHub · 58d205d3 · 2355bacb · 8547b21c
Unverified Commit 8547b21c authored Apr 21, 2022 by Yuge Zhang Committed by GitHub Apr 21, 2022
20 changed files
--- a/nni/common/hpo_utils/formatting.py
+++ b/nni/common/hpo_utils/formatting.py
@@ -55,6 +55,8 @@ class ParameterSpec(NamedTuple):

    categorical: bool               # Whether this paramter is categorical (unordered) or numerical (ordered)
    size: int = cast(int, None)     # If it's categorical, how many candidates it has
+    chosen_size: int | None = 1     # If it's categorical, how many candidates should be chosen.
+                                    # Defaults to 1. If None, an arbitrary number of candidates can be chosen.

    # uniform distributed
    low: float = cast(float, None)  # Lower bound of uniform parameter

--- a/nni/retiarii/experiment/pytorch.py
+++ b/nni/retiarii/experiment/pytorch.py
@@ -34,7 +34,9 @@ from ..execution.utils import get_mutation_dict
 from ..graph import Evaluator
 from ..integration import RetiariiAdvisor
 from ..mutator import Mutator
-from ..nn.pytorch.mutator import extract_mutation_from_pt_module, process_inline_mutation, process_evaluator_mutations
+from ..nn.pytorch.mutator import (
+    extract_mutation_from_pt_module, process_inline_mutation, process_evaluator_mutations, process_oneshot_mutations
+)
 from ..oneshot.interface import BaseOneShotTrainer
 from ..serializer import is_model_wrapped
 from ..strategy import BaseStrategy
@@ -89,7 +91,7 @@ class RetiariiExeConfig(ConfigBase):
        if key == 'trial_code_directory' and not (str(value) == '.' or os.path.isabs(value)):
            raise AttributeError(f'{key} is not supposed to be set in Retiarii mode by users!')
        if key == 'execution_engine':
-            assert value in ['base', 'py', 'cgo', 'benchmark'], f'The specified execution engine "{value}" is not supported.'
+            assert value in ['base', 'py', 'cgo', 'benchmark', 'oneshot'], f'The specified execution engine "{value}" is not supported.'
            self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry ' + value
        self.__dict__[key] = value

@@ -118,9 +120,11 @@ _validation_rules = {
 }


-def preprocess_model(base_model, trainer, applied_mutators, full_ir=True, dummy_input=None):
+def preprocess_model(base_model, evaluator, applied_mutators, full_ir=True, dummy_input=None, oneshot=False):
    # TODO: this logic might need to be refactored into execution engine
-    if full_ir:
+    if oneshot:
+        base_model_ir, mutators = process_oneshot_mutations(base_model, evaluator)
+    elif full_ir:
        try:
            script_module = torch.jit.script(base_model)
        except Exception as e:
@@ -137,7 +141,7 @@ def preprocess_model(base_model, trainer, applied_mutators, full_ir=True, dummy_
        mutators = process_inline_mutation(base_model_ir)
    else:
        base_model_ir, mutators = extract_mutation_from_pt_module(base_model)
-    base_model_ir.evaluator = trainer
+    base_model_ir.evaluator = evaluator

    if mutators is not None and applied_mutators:
        raise RuntimeError('Have not supported mixed usage of LayerChoice/InputChoice and mutators, '
@@ -146,12 +150,12 @@ def preprocess_model(base_model, trainer, applied_mutators, full_ir=True, dummy_
        applied_mutators = mutators

    # Add mutations on evaluators
-    applied_mutators += process_evaluator_mutations(trainer, applied_mutators)
+    applied_mutators += process_evaluator_mutations(evaluator, applied_mutators)

    return base_model_ir, applied_mutators


-def debug_mutated_model(base_model, trainer, applied_mutators):
+def debug_mutated_model(base_model, evaluator, applied_mutators):
    """
    Locally run only one trial without launching an experiment for debug purpose, then exit.
    For example, it can be used to quickly check shape mismatch.
@@ -159,16 +163,18 @@ def debug_mutated_model(base_model, trainer, applied_mutators):
    Specifically, it applies mutators (default to choose the first candidate for the choices)
    to generate a new model, then run this model locally.

+    The model will be parsed with graph execution engine.
+
    Parameters
    ----------
    base_model : nni.retiarii.nn.pytorch.nn.Module
        the base model
-    trainer : nni.retiarii.evaluator
+    evaluator : nni.retiarii.graph.Evaluator
        the training class of the generated models
    applied_mutators : list
        a list of mutators that will be applied on the base model for generating a new model
    """
-    base_model_ir, applied_mutators = preprocess_model(base_model, trainer, applied_mutators)
+    base_model_ir, applied_mutators = preprocess_model(base_model, evaluator, applied_mutators)
    from ..strategy import _LocalDebugStrategy
    strategy = _LocalDebugStrategy()
    strategy.run(base_model_ir, applied_mutators)
@@ -176,17 +182,95 @@ def debug_mutated_model(base_model, trainer, applied_mutators):


 class RetiariiExperiment(Experiment):
-    def __init__(self, base_model: nn.Module, trainer: Union[Evaluator, BaseOneShotTrainer],
-                 applied_mutators: List[Mutator] = None, strategy: BaseStrategy = None):
+    """
+    The entry for a NAS experiment.
+    Users can use this class to start/stop or inspect an experiment, like exporting the results.
+
+    RetiariiExperiment is a sub-class of :class:`nni.experiment.Experiment`, and they share many similarities, such as
+    a configurable training service to run the experiment in a distributed manner on remote servers.
+    But unlike :class:`nni.experiment.Experiment`, RetiariiExperiment doesn't support configuring:
+
+    - ``trial_code_directory``, which can only be current working directory.
+    - ``search_space``, which is auto-generated in NAS.
+    - ``trial_command``, which must be ``python3 -m nni.retiarii.trial_entry <execution_engine>`` to launch the modularized trial code.
+
+    RetiariiExperiment also doesn't have tuner/assessor/advisor, because they are also implemented in strategy.
+
+    Also, unlike :class:`nni.experiment.Experiment` which is bounded to a node server,
+    RetiariiExperiment optionally starts a node server to schedule the trials, when the strategy is a multi-trial strategy.
+    When the strategy is one-shot, the step of launching node server is omitted, and the experiment is run locally by default.
+
+    Configurations of experiments, such as execution engine, number of GPUs allocated,
+    should be put into a :class:`RetiariiExeConfig` and used as an argument of :meth:`RetiariiExperiment.run`.
+
+    Parameters
+    ----------
+    base_model : nn.Module
+        The model defining the search space / base skeleton without mutation.
+        It should be wrapped by decorator ``nni.retiarii.model_wrapper``.
+    evaluator : nni.retiarii.Evaluator, default = None
+        Evaluator for the experiment.
+        If you are using a one-shot trainer, it should be placed here, although this usage is deprecated.
+    applied_mutators : list of nni.retiarii.Mutator, default = None
+        Mutators to mutate the base model. If none, mutators are skipped.
+        Note that when ``base_model`` uses inline mutations (e.g., LayerChoice), ``applied_mutators`` must be empty / none.
+    strategy : nni.retiarii.strategy.BaseStrategy, default = None
+        Exploration strategy. Can be multi-trial or one-shot.
+    trainer : BaseOneShotTrainer
+        Kept for compatibility purposes.
+
+    Examples
+    --------
+    Multi-trial NAS:
+    >>> base_model = Net()
+    >>> search_strategy = strategy.Random()
+    >>> model_evaluator = FunctionalEvaluator(evaluate_model)
+    >>> exp = RetiariiExperiment(base_model, model_evaluator, [], search_strategy)
+    >>> exp_config = RetiariiExeConfig('local')
+    >>> exp_config.trial_concurrency = 2
+    >>> exp_config.max_trial_number = 20
+    >>> exp_config.training_service.use_active_gpu = False
+    >>> exp.run(exp_config, 8081)
+
+    One-shot NAS:
+    >>> base_model = Net()
+    >>> search_strategy = strategy.DARTS()
+    >>> evaluator = pl.Classification(train_dataloader=train_loader, val_dataloaders=valid_loader)
+    >>> exp = RetiariiExperiment(base_model, evaluator, [], search_strategy)
+    >>> exp_config = RetiariiExeConfig()
+    >>> exp_config.execution_engine = 'oneshot'  # must be set for one-shot strategy
+    >>> exp.run(exp_config)
+
+    Export top models:
+    >>> for model_dict in exp.export_top_models(formatter='dict'):
+    ...     print(model_dict)
+    >>> with nni.retiarii.fixed_arch(model_dict):
+    ...     final_model = Net()
+    """
+
+    def __init__(self, base_model: nn.Module, evaluator: Union[BaseOneShotTrainer, Evaluator] = None,
+                 applied_mutators: List[Mutator] = None, strategy: BaseStrategy = None,
+                 trainer: BaseOneShotTrainer = None):
+        if trainer is not None:
+            warnings.warn('Usage of `trainer` in RetiariiExperiment is deprecated and will be removed soon. '
+                          'Please consider specifying it as a positional argument, or use `evaluator`.', DeprecationWarning)
+            evaluator = trainer
+
+        if evaluator is None:
+            raise ValueError('Evaluator should not be none.')
+
        # TODO: The current design of init interface of Retiarii experiment needs to be reviewed.
        self.config: RetiariiExeConfig = None
        self.port: Optional[int] = None

        self.base_model = base_model
-        self.trainer = trainer
+        self.evaluator: Evaluator = evaluator
        self.applied_mutators = applied_mutators
        self.strategy = strategy

+        # FIXME: this is only a workaround
+        from nni.retiarii.oneshot.pytorch.strategy import OneShotStrategy
+        if not isinstance(strategy, OneShotStrategy):
            self._dispatcher = RetiariiAdvisor()
            self._dispatcher_thread: Optional[Thread] = None
            self._proc: Optional[Popen] = None
@@ -203,7 +287,7 @@ class RetiariiExperiment(Experiment):

    def _start_strategy(self):
        base_model_ir, self.applied_mutators = preprocess_model(
-            self.base_model, self.trainer, self.applied_mutators,
+            self.base_model, self.evaluator, self.applied_mutators,
            full_ir=self.config.execution_engine not in ['py', 'benchmark'],
            dummy_input=self.config.dummy_input
        )
@@ -308,8 +392,23 @@ class RetiariiExperiment(Experiment):
        Run the experiment.
        This function will block until experiment finish or error.
        """
-        if isinstance(self.trainer, BaseOneShotTrainer):
-            self.trainer.fit()
+        if isinstance(self.evaluator, BaseOneShotTrainer):
+            # TODO: will throw a deprecation warning soon
+            # warnings.warn('You are using the old implementation of one-shot algos based on One-shot trainer. '
+            #               'We will try to convert this trainer to our new implementation to run the algorithm. '
+            #               'In case you want to stick to the old implementation, '
+            #               'please consider using ``trainer.fit()`` instead of experiment.', DeprecationWarning)
+            self.evaluator.fit()
+
+        if config is None:
+            warnings.warn('config = None is deprecate in future. If you are running a one-shot experiment, '
+                          'please consider creating a config and set execution engine to `oneshot`.', DeprecationWarning)
+            config = RetiariiExeConfig()
+            config.execution_engine = 'oneshot'
+
+        if config.execution_engine == 'oneshot':
+            base_model_ir, self.applied_mutators = preprocess_model(self.base_model, self.evaluator, self.applied_mutators, oneshot=True)
+            self.strategy.run(base_model_ir, self.applied_mutators)
        else:
            assert config is not None, 'You are using classic search mode, config cannot be None!'
            self.config = config
@@ -396,10 +495,14 @@ class RetiariiExperiment(Experiment):
        """
        if formatter == 'code':
            assert self.config.execution_engine != 'py', 'You should use `dict` formatter when using Python execution engine.'
-        if isinstance(self.trainer, BaseOneShotTrainer):
+        if isinstance(self.evaluator, BaseOneShotTrainer):
            assert top_k == 1, 'Only support top_k is 1 for now.'
-            return self.trainer.export()
-        else:
+            return self.evaluator.export()
+        try:
+            # this currently works for one-shot algorithms
+            return self.strategy.export_top_models(top_k=top_k)
+        except NotImplementedError:
+            # when strategy hasn't implemented its own export logic
            all_models = filter(lambda m: m.metric is not None, list_models())
            assert optimize_mode in ['maximize', 'minimize']
            all_models = sorted(all_models, key=lambda m: m.metric, reverse=optimize_mode == 'maximize')

--- a/nni/retiarii/graph.py
+++ b/nni/retiarii/graph.py
@@ -84,6 +84,8 @@ class Model:

    Attributes
    ----------
+    python_object
+        Python object of base model. It will be none when the base model is not available.
    python_class
        Python class that base model is converted from.
    python_init_params
@@ -110,6 +112,7 @@ class Model:
    def __init__(self, _internal=False):
        assert _internal, '`Model()` is private, use `model.fork()` instead'
        self.model_id: int = uid('model')
+        self.python_object: Optional[Any] = None  # type is uncertain because it could differ between DL frameworks
        self.python_class: Optional[Type] = None
        self.python_init_params: Optional[Dict[str, Any]] = None


--- a/nni/retiarii/nn/pytorch/mutator.py
+++ b/nni/retiarii/nn/pytorch/mutator.py
@@ -428,6 +428,20 @@ def process_evaluator_mutations(evaluator: Evaluator, existing_mutators: List[Mu
    return mutators


+# the following are written for one-shot mode
+# they shouldn't technically belong here, but all other engines are written here
+# let's refactor later
+
+def process_oneshot_mutations(base_model: nn.Module, evaluator: Evaluator):
+    """Wrap ``base_model`` into an otherwise empty :class:`Model` for the one-shot execution path.
+
+    No graph conversion or mutation extraction happens here; the module is only attached
+    to the returned model as ``python_object``.
+
+    Parameters
+    ----------
+    base_model : nn.Module
+        The user-defined model space.
+    evaluator : Evaluator
+        Not used inside this function; the caller assigns it onto the returned model afterwards.
+
+    Returns
+    -------
+    tuple
+        ``(model, [])``: the wrapping :class:`Model` and an empty list of mutators.
+    """
+    # It's not intuitive, at all, (actually very hacky) to wrap a `base_model` and `evaluator` into a graph.Model.
+    # But unfortunately, this is the required interface of strategy.
+    model = Model(_internal=True)
+    model.python_object = base_model
+    # no need to set evaluator here because it will be set after this method is called
+
+    # NOTE(review): the returned list is empty but not None, so ``preprocess_model`` raises
+    # its "mixed usage" RuntimeError when the caller also supplies non-empty ``applied_mutators``.
+    return model, []
+
+
 # utility functions



--- a/nni/retiarii/oneshot/pytorch/__init__.py
+++ b/nni/retiarii/oneshot/pytorch/__init__.py
@@ -5,6 +5,6 @@ from .darts import DartsTrainer
 from .enas import EnasTrainer
 from .proxyless import ProxylessTrainer
 from .random import SinglePathTrainer, RandomTrainer
-from .differentiable import DartsModule, ProxylessModule, SNASModule
-from .sampling import EnasModule, RandomSampleModule
+from .differentiable import DartsLightningModule, ProxylessLightningModule, GumbelDartsLightningModule
+from .sampling import EnasLightningModule, RandomSamplingLightningModule
 from .utils import InterleavedTrainValDataLoader, ConcatenateTrainValDataLoader
--- a/nni/retiarii/oneshot/pytorch/base_lightning.py
+++ b/nni/retiarii/oneshot/pytorch/base_lightning.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

+import warnings
+from itertools import chain
+from typing import Dict, Callable, List, Union, Any, Tuple
+
 import pytorch_lightning as pl
 import torch.optim as optim
 import torch.nn as nn

 from torch.optim.lr_scheduler import _LRScheduler

+import nni.retiarii.nn.pytorch as nas_nn
+from nni.common.hpo_utils import ParameterSpec
+from nni.common.serializer import is_traceable
+from nni.retiarii.nn.pytorch.api import ValueChoiceX
+from .supermodule.base import BaseSuperNetModule
+
+__all__ = ['MutationHook', 'BaseSuperNetModule', 'BaseOneShotLightningModule', 'traverse_and_mutate_submodules']
+
+
+# Signature of a mutation hook: ``hook(module, name_in_parent, memo, mutate_kwargs)``.
+# A hook may return a replacement module, a ``suppress`` boolean, ``None`` (keep the module unchanged),
+# or a ``(module_or_None, suppress)`` tuple.
+MutationHook = Callable[
+    [nn.Module, str, Dict[str, Any], Dict[str, Any]],
+    Union[nn.Module, bool, None, Tuple[Union[nn.Module, None], bool]]
+]
+

-def _replace_module_with_type(root_module, replace_dict, modules):
+def traverse_and_mutate_submodules(
+    root_module: nn.Module, hooks: List[MutationHook], mutate_kwargs: Dict[str, Any], topdown: bool = True
+) -> List[BaseSuperNetModule]:
    """
-    Replace xxxChoice in user's model with NAS modules.
+    Traverse the module-tree of ``root_module``, and call ``hooks`` on every tree node.

    Parameters
    ----------
    root_module : nn.Module
-        User-defined module with xxxChoice in it. In fact, since this method is called in the ``__init__`` of
-        ``BaseOneShotLightningModule``, this will be a pl.LightningModule.
-    replace_dict : Dict[Type[nn.Module], Callable[[nn.Module], nn.Module]]
-        Functions to replace xxxChoice modules. Keys should be xxxChoice type and values should be a
-        function that return an nn.module.
-    modules : List[nn.Module]
-        The replace result. This is also the return value of this function.
+        User-defined model space.
+        Since this method is called in the ``__init__`` of :class:`BaseOneShotLightningModule`,
+        it's usually a ``pytorch_lightning.LightningModule``.
+        The mutation will be in-place on ``root_module``.
+    hooks : List[MutationHook]
+        List of mutation hooks. See :class:`BaseOneShotLightningModule` for how to write hooks.
+        When a hook returns a module, the original module will be replaced (mutated) by the new module.
+    mutate_kwargs : dict
+        Extra keyword arguments passed to hooks.
+    topdown : bool, default = True
+        If topdown is true, hooks are first called, before traversing its sub-module (i.e., pre-order DFS).
+        Otherwise, sub-modules are first traversed, before calling hooks on this node (i.e., post-order DFS).

    Returns
    ----------
-    modules : List[nn.Module]
+    modules : List[BaseSuperNetModule]
        The replace result.
    """
-    if modules is None:
-        modules = []
+    memo = {}
+
+    module_list = []

    def apply(m):
        for name, child in m.named_children():
-            child_type = type(child)
-            if child_type in replace_dict.keys():
-                setattr(m, name, replace_dict[child_type](child))
-                modules.append((child.key, getattr(m, name)))
+            # post-order DFS
+            if not topdown:
+                apply(child)
+
+            mutate_result = None
+
+            for hook in hooks:
+                hook_suggest = hook(child, name, memo, mutate_kwargs)
+
+                # parse the mutate result
+                if isinstance(hook_suggest, tuple):
+                    hook_suggest, suppress = hook_suggest
+                elif hook_suggest is True:
+                    hook_suggest, suppress = None, True
+                elif not hook_suggest:  # none / false
+                    hook_suggest, suppress = None, False
+                elif isinstance(hook_suggest, nn.Module):
+                    suppress = True
                else:
+                    raise TypeError(f'Mutation hook returned {hook_suggest} of unsupported type: {type(hook_suggest)}.')
+
+                if hook_suggest is not None:
+                    if not isinstance(hook_suggest, BaseSuperNetModule):
+                        warnings.warn("Mutation hook didn't return a BaseSuperNetModule. It will be ignored in hooked module list.",
+                                      RuntimeWarning)
+                    setattr(m, name, hook_suggest)
+
+                    mutate_result = hook_suggest
+
+                # if suppress, no further mutation hooks are called
+                if suppress:
+                    break
+
+            if isinstance(mutate_result, BaseSuperNetModule):
+                module_list.append(mutate_result)
+
+            # pre-order DFS
+            if topdown:
                apply(child)

    apply(root_module)
-    return modules
+
+    return module_list
+
+
+def no_default_hook(module: nn.Module, name: str, memo: Dict[str, Any], mutate_kwargs: Dict[str, Any]) -> bool:
+    """Add this hook at the end of your hook list to raise error for unsupported mutation primitives.
+
+    Parameters
+    ----------
+    module : nn.Module
+        The submodule currently being visited.
+    name : str
+        Name of ``module`` in its parent. Not used by this hook.
+    memo : dict
+        Shared memo dict. Not used by this hook.
+    mutate_kwargs : dict
+        Extra arguments for hooks. Not used by this hook.
+
+    Returns
+    -------
+    bool
+        Always ``True`` when no error is raised, which suppresses any hooks placed after this one.
+
+    Raises
+    ------
+    TypeError
+        If ``module`` is one of the listed mutation primitives, a ``Cell`` whose ``merge_op`` is not ``'all'``,
+        or a traceable module with a value choice among its traced arguments.
+    """
+
+    # Forward IS NOT supernet
+    primitive_list = (
+        nas_nn.LayerChoice,
+        nas_nn.InputChoice,
+        nas_nn.ValueChoice,
+        nas_nn.Repeat,
+        nas_nn.NasBench101Cell,
+        # nas_nn.Cell,              # later
+        # nas_nn.NasBench201Cell,   # forward = supernet
+    )
+
+    # reaching this hook with a primitive means no earlier hook replaced it (or suppressed further hooks)
+    if isinstance(module, primitive_list):
+        raise TypeError(f'{type(module).__name__} is not supported')
+
+    if isinstance(module, nas_nn.Cell) and module.merge_op != 'all':
+        # need output_node_indices, which depends on super-net
+        raise TypeError(f'Cell with merge_op `{module.merge_op}` is not supported')
+
+    if is_traceable(module):
+        # check whether there is a value-choice in its arguments
+        # (both positional ``trace_args`` and the values of ``trace_kwargs`` are inspected, top level only)
+        has_valuechoice = False
+        for arg in chain(module.trace_args, module.trace_kwargs.values()):
+            if isinstance(arg, ValueChoiceX):
+                has_valuechoice = True
+                break
+
+        if has_valuechoice:
+            raise TypeError(f'`basic_unit` {type(module).__name__} with value choice in its arguments is not supported. '
+                            'Please try to remove `basic_unit` to see if that works, or support this type with value choice manually.')
+
+    return True  # suppress all other hooks


 class BaseOneShotLightningModule(pl.LightningModule):
+
+    _mutation_hooks_note = """mutation_hooks : List[MutationHook]
+        Mutation hooks are callable that inputs an Module and returns a :class:`BaseSuperNetModule`.
+        They are invoked in :meth:`traverse_and_mutate_submodules`, on each submodules.
+        For each submodule, the hook list are invoked subsequently,
+        the later hooks can see the result from previous hooks.
+        The modules that are processed by ``mutation_hooks`` will be replaced by the returned module,
+        stored in ``nas_modules``, and be the focus of the NAS algorithm.
+
+        The hook list will be appended by ``default_mutation_hooks`` in each one-shot module.
+
+        To be more specific, the input arguments are four arguments:
+
+        #. a module that might be processed,
+        #. name of the module in its parent module,
+        #. a memo dict whose usage depends on the particular algorithm,
+        #. a dict of extra arguments (``mutate_kwargs``), which is usually algo-specific.
+
+        Note that the memo should be read/written by hooks.
+        There won't be any hooks called on root module.
+        The returned arguments can be also one of the three kinds:
+
+        #. tuple of: :class:`BaseSuperNetModule` or None, and boolean,
+        #. boolean,
+        #. :class:`BaseSuperNetModule` or None.
+
+        The boolean value is ``suppress``, which indicates whether the following hooks should be skipped.
+        When it's true, it suppresses the subsequent hooks, and they will never be invoked.
+        Without a boolean value specified, it's assumed to be true if a module is returned, and false otherwise.
+        If a none value appears on the place of :class:`BaseSuperNetModule`, it means the hook suggests to
+        keep the module unchanged, and nothing will happen.
    """
-    The base class for all one-shot NAS modules. Essential function such as preprocessing user's model, redirecting lightning
-    hooks for user's model, configuring optimizers and exporting NAS result are implemented in this class.
+
+    _inner_module_note = """inner_module : pytorch_lightning.LightningModule
+        It's a `LightningModule <https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html>`__
+        that defines computations, train/val loops, optimizers in a single class.
+        When used in NNI, the ``inner_module`` is the combination of instances of evaluator + base model
+        (to be precise, a base model wrapped with LightningModule in evaluator).
+    """
+
+    __doc__ = """
+    The base class for all one-shot NAS modules.
+
+    In NNI, we try to separate the "search" part and "training" part in one-shot NAS.
+    The "training" part is defined with evaluator interface (has to be lightning evaluator interface to work with oneshot).
+    Since the lightning evaluator has already broken down the training into minimal building blocks,
+    we can re-assemble them after combining them with the "search" part of a particular algorithm.
+
+    After the re-assembling, this module has defined all the search + training. The experiment can use a lightning trainer
+    (which is another part in the evaluator) to train this module, so as to complete the search process.
+
+    Essential function such as preprocessing user's model, redirecting lightning hooks for user's model,
+    configuring optimizers and exporting NAS result are implemented in this class.

    Attributes
    ----------
-    nas_modules : List[nn.Module]
-        The replace result of a specific NAS method. xxxChoice will be replaced with some other modules with respect to the
-        NAS method.
+    nas_modules : List[BaseSuperNetModule]
+        Modules that have been mutated, which the search algorithms should care about.

    Parameters
    ----------
-    base_model : pl.LightningModule
-        The evaluator in ``nni.retiarii.evaluator.lightning``. User defined model is wrapped by base_model, and base_model will
-        be wrapped by this model.
-    custom_replace_dict : Dict[Type[nn.Module], Callable[[nn.Module], nn.Module]], default = None
-        The custom xxxChoice replace method. Keys should be xxxChoice type and values should return an ``nn.module``. This custom
-        replace dict will override the default replace dict of each NAS method.
-    """
+    """ + _inner_module_note + _mutation_hooks_note
+
    automatic_optimization = False

-    def __init__(self, base_model, custom_replace_dict=None):
+    def default_mutation_hooks(self) -> List[MutationHook]:
+        """Override this to define class-default mutation hooks.
+
+        Returns
+        -------
+        List[MutationHook]
+            Hooks that ``__init__`` appends after the user-supplied ``mutation_hooks``.
+        """
+        # no_default_hook raises TypeError on unsupported primitives and otherwise suppresses later hooks
+        return [no_default_hook]
+
+    def mutate_kwargs(self) -> Dict[str, Any]:
+        """Extra keyword arguments passed to mutation hooks. Usually algo-specific.
+
+        Returns
+        -------
+        dict
+            Handed to :func:`traverse_and_mutate_submodules` by ``__init__``, which forwards it
+            to every hook as a single dict argument (it is not unpacked as ``**kwargs``). Empty by default.
+        """
+        return {}
+
+    def __init__(self, base_model: pl.LightningModule, mutation_hooks: List[MutationHook] = None):
        super().__init__()
        assert isinstance(base_model, pl.LightningModule)
        self.model = base_model

-        # replace xxxChoice with respect to NAS alg
-        # replaced modules are stored in self.nas_modules
-        self.nas_modules = []
-        choice_replace_dict = self.default_replace_dict
-        if custom_replace_dict is not None:
-            for k, v in custom_replace_dict.items():
-                assert isinstance(v, nn.Module)
-                choice_replace_dict[k] = v
-        _replace_module_with_type(self.model, choice_replace_dict, self.nas_modules)
+        # append the default hooks
+        mutation_hooks = (mutation_hooks or []) + self.default_mutation_hooks()
+
+        # traverse the model, calling hooks on every submodule
+        self.nas_modules: List[BaseSuperNetModule] = traverse_and_mutate_submodules(
+            self.model, mutation_hooks, self.mutate_kwargs(), topdown=True)
+
+    def search_space_spec(self) -> Dict[str, ParameterSpec]:
+        """Get the search space specification from ``nas_modules``.
+
+        The per-module specifications are merged into one dict, following the order of ``self.nas_modules``.
+
+        Returns
+        -------
+        dict
+            Key is the name of the choice, value is the corresponding :class:`ParameterSpec`.
+        """
+        result = {}
+        for module in self.nas_modules:
+            # dict.update: if two modules report the same key, the spec from the later module wins
+            result.update(module.search_space_spec())
+        return result
+
+    def resample(self) -> Dict[str, Any]:
+        """Trigger the resample for each module in ``nas_modules``.
+        Sometimes (e.g., in differentiable cases), it does nothing.
+
+        Returns
+        -------
+        dict
+            Sampled architecture, merged over all modules in the order of ``self.nas_modules``.
+        """
+        result = {}
+        for module in self.nas_modules:
+            # the decisions accumulated so far are handed to each module as ``memo``
+            # (presumably so that choices sharing a label stay consistent -- confirm in BaseSuperNetModule)
+            result.update(module.resample(memo=result))
+        return result
+
+    def export(self) -> Dict[str, Any]:
+        """
+        Export the NAS result, ideally the best choice of each module in ``nas_modules``.
+        You may implement an ``export`` method for your customized ``nas_module``.
+
+        Returns
+        --------
+        dict
+            Keys are names of ``nas_modules``, and values are the choice indices of them.
+        """
+        result = {}
+        for module in self.nas_modules:
+            # the entries exported so far are handed to each module as ``memo``
+            # (presumably to let modules reuse decisions already exported -- confirm in BaseSuperNetModule)
+            result.update(module.export(memo=result))
+        return result

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
-        # You can use self.architecture_optimizers or self.user_optimizers to get optimizers in
-        # your own training step.
+        """This is the implementation of what happens in training loops of one-shot algos.
+        It usually calls ``self.model.training_step`` which implements the real training recipe of the users' model.
+        """
        return self.model.training_step(batch, batch_idx)

    def configure_optimizers(self):
        """
        Combine architecture optimizers and user's model optimizers.
        You can overwrite configure_architecture_optimizers if architecture optimizers are needed in your NAS algorithm.
-        By now ``self.model`` is currently a :class:`nni.retiarii.evaluator.pytorch.lightning._SupervisedLearningModule`
-        and it only returns 1 optimizer. But for extendibility, codes for other return value types are also implemented.
+        For now ``self.model`` is tested against :class:`nni.retiarii.evaluator.pytorch.lightning._SupervisedLearningModule`
+        and it only returns 1 optimizer.
+        But for extendibility, codes for other return value types are also implemented.
        """
        # pylint: disable=assignment-from-none
        arc_optimizers = self.configure_architecture_optimizers()
@@ -123,6 +308,9 @@ class BaseOneShotLightningModule(pl.LightningModule):
        return arc_optimizers + w_optimizers, lr_schedulers

    def on_train_start(self):
+        # redirect the access to trainer/log to this module
+        # but note that we might be missing other attributes,
+        # which could potentially be a problem
        self.model.trainer = self.trainer
        self.model.log = self.log
        return self.model.on_train_start()
@@ -136,10 +324,10 @@ class BaseOneShotLightningModule(pl.LightningModule):
    def on_fit_end(self):
        return self.model.on_train_end()

-    def on_train_batch_start(self, batch, batch_idx, unused = 0):
+    def on_train_batch_start(self, batch, batch_idx, unused=0):
        return self.model.on_train_batch_start(batch, batch_idx, unused)

-    def on_train_batch_end(self, outputs, batch, batch_idx, unused = 0):
+    def on_train_batch_end(self, outputs, batch, batch_idx, unused=0):
        return self.model.on_train_batch_end(outputs, batch, batch_idx, unused)

    def on_epoch_start(self):
@@ -160,7 +348,7 @@ class BaseOneShotLightningModule(pl.LightningModule):
    def on_after_backward(self):
        return self.model.on_after_backward()

-    def configure_gradient_clipping(self, optimizer, optimizer_idx, gradient_clip_val = None, gradient_clip_algorithm = None):
+    def configure_gradient_clipping(self, optimizer, optimizer_idx, gradient_clip_val=None, gradient_clip_algorithm=None):
        return self.model.configure_gradient_clipping(optimizer, optimizer_idx, gradient_clip_val, gradient_clip_algorithm)

    def configure_architecture_optimizers(self):
@@ -175,20 +363,6 @@ class BaseOneShotLightningModule(pl.LightningModule):
        """
        return None

-    @property
-    def default_replace_dict(self):
-        """
-        Default xxxChoice replace dict. This is called in ``__init__`` to get the default replace functions for your NAS algorithm.
-        Note that your default replace functions may be overridden by user-defined custom_replace_dict.
-
-        Returns
-        ----------
-        replace_dict : Dict[Type, Callable[nn.Module, nn.Module]]
-            Same as ``custom_replace_dict`` in ``__init__``, but this will be overridden if users define their own replace functions.
-        """
-        replace_dict = {}
-        return replace_dict
-
    def call_lr_schedulers(self, batch_index):
        """
        Function that imitates lightning trainer's behaviour of calling user's lr schedulers. Since auto_optimization is turned off
@@ -229,13 +403,13 @@ class BaseOneShotLightningModule(pl.LightningModule):

    def call_user_optimizers(self, method):
        """
-        Function that imitates lightning trainer's behaviour of calling user's optimizers. Since auto_optimization is turned off by this
+        Function that imitates lightning trainer's behavior of calling user's optimizers. Since auto_optimization is turned off by this
        class, you can use this function to make user optimizers behave as they were automatically handled by the lightning trainer.

        Parameters
        ----------
        method : str
-            Method to call. Only 'step' and 'zero_grad' are supported now.
+            Method to call. Only ``step`` and ``zero_grad`` are supported now.
        """
        def apply_method(optimizer, method):
            if method == 'step':
@@ -271,7 +445,7 @@ class BaseOneShotLightningModule(pl.LightningModule):
            architecture optimizers.
        """
        opts = self.optimizers()
-        if isinstance(opts,list):
+        if isinstance(opts, list):
            # pylint: disable=unsubscriptable-object
            arc_opts = opts[:self.arc_optim_count]
            if len(arc_opts) == 1:
@@ -285,7 +459,7 @@ class BaseOneShotLightningModule(pl.LightningModule):
    @property
    def user_optimizers(self):
        """
-        Get user optimizers from all optimizers. Use this to get user optimizers in ``training step``.
+        Get user optimizers from all optimizers. Use this to get user optimizers in ``training_step``.

        Returns
        ----------
@@ -293,26 +467,10 @@ class BaseOneShotLightningModule(pl.LightningModule):
            Optimizers defined by user's model. This will be None if there is no user optimizers.
        """
        opts = self.optimizers()
-        if isinstance(opts,list):
+        if isinstance(opts, list):
            # pylint: disable=unsubscriptable-object
            return opts[self.arc_optim_count:]
        # If there is only 1 optimizer and no architecture optimizer
        if self.arc_optim_count == 0:
            return opts
        return None
-
-    def export(self):
-        """
-        Export the NAS result, ideally the best choice of each nas_modules.
-        You may implement an ``export`` method for your customized nas_module.
-
-        Returns
-        --------
-        result : Dict[str, int]
-            Keys are names of nas_modules, and values are the choice indices of them.
-        """
-        result = {}
-        for name, module in self.nas_modules:
-            if name not in result:
-                result[name] = module.export()
-        return result
--- a/nni/retiarii/oneshot/pytorch/differentiable.py
+++ b/nni/retiarii/oneshot/pytorch/differentiable.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-from collections import OrderedDict
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from nni.retiarii.nn.pytorch import LayerChoice, InputChoice
-from .base_lightning import BaseOneShotLightningModule
-
-
-class DartsLayerChoice(nn.Module):
-    def __init__(self, layer_choice):
-        super(DartsLayerChoice, self).__init__()
-        self.name = layer_choice.label
-        self.op_choices = nn.ModuleDict(OrderedDict([(name, layer_choice[name]) for name in layer_choice.names]))
-        self.alpha = nn.Parameter(torch.randn(len(self.op_choices)) * 1e-3)
-
-    def forward(self, *args, **kwargs):
-        op_results = torch.stack([op(*args, **kwargs) for op in self.op_choices.values()])
-        alpha_shape = [-1] + [1] * (len(op_results.size()) - 1)
-        return torch.sum(op_results * F.softmax(self.alpha, -1).view(*alpha_shape), 0)
-
-    def parameters(self):
-        for _, p in self.named_parameters():
-            yield p
+"""Experimental version of differentiable one-shot implementation."""

-    def named_parameters(self, recurse=False):
-        for name, p in super(DartsLayerChoice, self).named_parameters():
-            if name == 'alpha':
-                continue
-            yield name, p
+from typing import List
+import pytorch_lightning as pl
+import torch

-    def export(self):
-        return list(self.op_choices.keys())[torch.argmax(self.alpha).item()]
+from .base_lightning import BaseOneShotLightningModule, MutationHook, no_default_hook
+from .supermodule.differentiable import (
+    DifferentiableMixedLayer, DifferentiableMixedInput,
+    MixedOpDifferentiablePolicy, GumbelSoftmax
+)
+from .supermodule.proxyless import ProxylessMixedInput, ProxylessMixedLayer
+from .supermodule.operation import NATIVE_MIXED_OPERATIONS


-class DartsInputChoice(nn.Module):
-    def __init__(self, input_choice):
-        super(DartsInputChoice, self).__init__()
-        self.name = input_choice.label
-        self.alpha = nn.Parameter(torch.randn(input_choice.n_candidates) * 1e-3)
-        self.n_chosen = input_choice.n_chosen or 1
+class DartsLightningModule(BaseOneShotLightningModule):
+    _darts_note = """
+    The DARTS :cite:p:`liu2018darts` algorithm is one of the most fundamental one-shot algorithms.

-    def forward(self, inputs):
-        inputs = torch.stack(inputs)
-        alpha_shape = [-1] + [1] * (len(inputs.size()) - 1)
-        return torch.sum(inputs * F.softmax(self.alpha, -1).view(*alpha_shape), 0)
+    DARTS repeats iterations, where each iteration consists of 2 training phases.
+    Phase 1 is the architecture step, in which model parameters are frozen and the architecture parameters are trained.
+    Phase 2 is the model step, in which architecture parameters are frozen and model parameters are trained.

-    def parameters(self):
-        for _, p in self.named_parameters():
-            yield p
+    The current implementation is for DARTS in first order. Second order (unrolled) is not supported yet.

-    def named_parameters(self, recurse=False):
-        for name, p in super(DartsInputChoice, self).named_parameters():
-            if name == 'alpha':
-                continue
-            yield name, p
+    *New in v2.8*: Supports searching for ValueChoices on operations, with the technique described in
+    `FBNetV2: Differentiable Neural Architecture Search for Spatial and Channel Dimensions <https://arxiv.org/abs/2004.05565>`__.
+    One difference is that, in DARTS, we are using Softmax instead of GumbelSoftmax.

-    def export(self):
-        return torch.argsort(-self.alpha).cpu().numpy().tolist()[:self.n_chosen]
+    {{module_notes}}

+    Parameters
+    ----------
+    {{module_params}}
+    {base_params}
+    arc_learning_rate : float
+        Learning rate for architecture optimizer. Default: 3.0e-4
+    """.format(base_params=BaseOneShotLightningModule._mutation_hooks_note)
+
+    __doc__ = _darts_note.format(
+        module_notes='The DARTS Module should be trained with :class:`nni.retiarii.oneshot.utils.InterleavedTrainValDataLoader`.',
+        module_params=BaseOneShotLightningModule._inner_module_note,
+    )

-class DartsModule(BaseOneShotLightningModule):
-    """
-    The DARTS module. Each iteration consists of 2 training phases. The phase 1 is architecture step, in which model parameters are
-    frozen and the architecture parameters are trained. The phase 2 is model step, in which architecture parameters are frozen and
-    model parameters are trained. See [darts] for details.
-    The DARTS Module should be trained with :class:`nni.retiarii.oneshot.utils.InterleavedTrainValDataLoader`.
+    def default_mutation_hooks(self) -> List[MutationHook]:
+        """Replace modules with differentiable versions"""
+        hooks = [
+            DifferentiableMixedLayer.mutate,
+            DifferentiableMixedInput.mutate,
+        ]
+        hooks += [operation.mutate for operation in NATIVE_MIXED_OPERATIONS]
+        hooks.append(no_default_hook)
+        return hooks
+
+    def mutate_kwargs(self):
+        """Use differentiable strategy for mixed operations."""
+        return {
+            'mixed_op_sampling': MixedOpDifferentiablePolicy
+        }

-    Reference
-    ----------
-    .. [darts] H. Liu, K. Simonyan, and Y. Yang, “DARTS: Differentiable Architecture Search,” presented at the
-        International Conference on Learning Representations, Sep. 2018. Available: https://openreview.net/forum?id=S1eYHoC5FX
-    """
+    def __init__(self, inner_module: pl.LightningModule,
+                 mutation_hooks: List[MutationHook] = None,
+                 arc_learning_rate: float = 3.0E-4):
+        self.arc_learning_rate = arc_learning_rate
+        super().__init__(inner_module, mutation_hooks=mutation_hooks)

    def training_step(self, batch, batch_idx):
        # grad manually
@@ -85,7 +77,7 @@ class DartsModule(BaseOneShotLightningModule):
        # phase 1: architecture step
        # The _resample hook is kept for some darts-based NAS methods like proxyless.
        # See code of those methods for details.
-        self._resample()
+        self.resample()
        arc_optim.zero_grad()
        arc_step_loss = self.model.training_step(val_batch, 2 * batch_idx)
        if isinstance(arc_step_loss, dict):
@@ -95,7 +87,7 @@ class DartsModule(BaseOneShotLightningModule):
        arc_optim.step()

        # phase 2: model step
-        self._resample()
+        self.resample()
        self.call_user_optimizers('zero_grad')
        loss_and_metrics = self.model.training_step(trn_batch, 2 * batch_idx + 1)
        w_step_loss = loss_and_metrics['loss'] \
@@ -107,257 +99,110 @@ class DartsModule(BaseOneShotLightningModule):

        return loss_and_metrics

-    def _resample(self):
-        # Note: This hook is kept for following darts-based NAS algs.
-        pass
-
    def finalize_grad(self):
        # Note: This hook is currently kept for Proxyless NAS.
        pass

-    @property
-    def default_replace_dict(self):
-        return {
-            LayerChoice : DartsLayerChoice,
-            InputChoice : DartsInputChoice
-        }
-
    def configure_architecture_optimizers(self):
-        # The alpha in DartsXXXChoices is the architecture parameter of DARTS. All alphas share one optimizer.
-        ctrl_params = {}
-        for _, m in self.nas_modules:
-            if m.name in ctrl_params:
-                assert m.alpha.size() == ctrl_params[m.name].size(), 'Size of parameters with the same label should be same.'
-                m.alpha = ctrl_params[m.name]
-            else:
-                ctrl_params[m.name] = m.alpha
-        ctrl_optim = torch.optim.Adam(list(ctrl_params.values()), 3.e-4, betas=(0.5, 0.999),
+        # The alpha in DartsXXXChoices are the architecture parameters of DARTS. They share one optimizer.
+        ctrl_params = []
+        for m in self.nas_modules:
+            ctrl_params += list(m.parameters(arch=True))
+        ctrl_optim = torch.optim.Adam(list(set(ctrl_params)), 3.e-4, betas=(0.5, 0.999),
                                      weight_decay=1.0E-3)
        return ctrl_optim


-class _ArchGradientFunction(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, x, binary_gates, run_func, backward_func):
-        ctx.run_func = run_func
-        ctx.backward_func = backward_func
-
-        detached_x = x.detach()
-        detached_x.requires_grad = x.requires_grad
-        with torch.enable_grad():
-            output = run_func(detached_x)
-        ctx.save_for_backward(detached_x, output)
-        return output.data
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        detached_x, output = ctx.saved_tensors
-
-        grad_x = torch.autograd.grad(output, detached_x, grad_output, only_inputs=True)
-        # compute gradients w.r.t. binary_gates
-        binary_grads = ctx.backward_func(detached_x.data, output.data, grad_output.data)
-
-        return grad_x[0], binary_grads, None, None
-
-
-class ProxylessLayerChoice(nn.Module):
-    def __init__(self, ops):
-        super(ProxylessLayerChoice, self).__init__()
-        self.ops = nn.ModuleList(ops)
-        self.alpha = nn.Parameter(torch.randn(len(self.ops)) * 1E-3)
-        self._binary_gates = nn.Parameter(torch.randn(len(self.ops)) * 1E-3)
-        self.sampled = None
-
-    def forward(self, *args, **kwargs):
-        if self.training:
-            def run_function(ops, active_id, **kwargs):
-                def forward(_x):
-                    return ops[active_id](_x, **kwargs)
-                return forward
-
-            def backward_function(ops, active_id, binary_gates, **kwargs):
-                def backward(_x, _output, grad_output):
-                    binary_grads = torch.zeros_like(binary_gates.data)
-                    with torch.no_grad():
-                        for k in range(len(ops)):
-                            if k != active_id:
-                                out_k = ops[k](_x.data, **kwargs)
-                            else:
-                                out_k = _output.data
-                            grad_k = torch.sum(out_k * grad_output)
-                            binary_grads[k] = grad_k
-                    return binary_grads
-                return backward
-
-            assert len(args) == 1
-            x = args[0]
-            return _ArchGradientFunction.apply(
-                x, self._binary_gates, run_function(self.ops, self.sampled, **kwargs),
-                backward_function(self.ops, self.sampled, self._binary_gates, **kwargs)
-            )
-
-        return super().forward(*args, **kwargs)
-
-    def resample(self):
-        probs = F.softmax(self.alpha, dim=-1)
-        sample = torch.multinomial(probs, 1)[0].item()
-        self.sampled = sample
-        with torch.no_grad():
-            self._binary_gates.zero_()
-            self._binary_gates.grad = torch.zeros_like(self._binary_gates.data)
-            self._binary_gates.data[sample] = 1.0
+class ProxylessLightningModule(DartsLightningModule):
+    _proxyless_note = """
+    Implementation of ProxylessNAS :cite:p:`cai2018proxylessnas`.
+    It's a DARTS-based method that resamples the architecture to reduce memory consumption.
+    Essentially, it samples one path on forward,
+    and implements its own backward to update the architecture parameters based on only one path.

-    def finalize_grad(self):
-        binary_grads = self._binary_gates.grad
-        with torch.no_grad():
-            if self.alpha.grad is None:
-                self.alpha.grad = torch.zeros_like(self.alpha.data)
-            probs = F.softmax(self.alpha, dim=-1)
-            for i in range(len(self.ops)):
-                for j in range(len(self.ops)):
-                    self.alpha.grad[i] += binary_grads[j] * probs[j] * (int(i == j) - probs[i])
-
-    def export(self):
-        return torch.argmax(self.alpha).item()
-
-    def export_prob(self):
-        return F.softmax(self.alpha, dim=-1)
-
-
-class ProxylessInputChoice(nn.Module):
-    def __init__(self, input_choice):
-        super().__init__()
-        self.num_input_candidates = input_choice.n_candidates
-        self.alpha = nn.Parameter(torch.randn(input_choice.n_candidates) * 1E-3)
-        self._binary_gates = nn.Parameter(torch.randn(input_choice.n_candidates) * 1E-3)
-        self.sampled = None
-
-    def forward(self, inputs):
-        if self.training:
-            def run_function(active_sample):
-                return lambda x: x[active_sample]
-
-            def backward_function(binary_gates):
-                def backward(_x, _output, grad_output):
-                    binary_grads = torch.zeros_like(binary_gates.data)
-                    with torch.no_grad():
-                        for k in range(self.num_input_candidates):
-                            out_k = _x[k].data
-                            grad_k = torch.sum(out_k * grad_output)
-                            binary_grads[k] = grad_k
-                    return binary_grads
-                return backward
-
-            inputs = torch.stack(inputs, 0)
-            return _ArchGradientFunction.apply(
-                inputs, self._binary_gates, run_function(self.sampled),
-                backward_function(self._binary_gates)
-            )
-
-        return super().forward(inputs)
+    {{module_notes}}

-    def resample(self, sample=None):
-        if sample is None:
-            probs = F.softmax(self.alpha, dim=-1)
-            sample = torch.multinomial(probs, 1)[0].item()
-        self.sampled = sample
-        with torch.no_grad():
-            self._binary_gates.zero_()
-            self._binary_gates.grad = torch.zeros_like(self._binary_gates.data)
-            self._binary_gates.data[sample] = 1.0
-        return self.sampled
-
-    def finalize_grad(self):
-        binary_grads = self._binary_gates.grad
-        with torch.no_grad():
-            if self.alpha.grad is None:
-                self.alpha.grad = torch.zeros_like(self.alpha.data)
-            probs = F.softmax(self.alpha, dim=-1)
-            for i in range(self.num_input_candidates):
-                for j in range(self.num_input_candidates):
-                    self.alpha.grad[i] += binary_grads[j] * probs[j] * (int(i == j) - probs[i])
-
-
-class ProxylessModule(DartsModule):
-    """
-    The Proxyless Module. This is a darts-based method that resamples the architecture to reduce memory consumption.
-    The Proxyless Module should be trained with :class:`nni.retiarii.oneshot.pytorch.utils.InterleavedTrainValDataLoader`.
-
-    Reference
+    Parameters
    ----------
-    .. [proxyless] H. Cai, L. Zhu, and S. Han, “ProxylessNAS: Direct Neural Architecture Search on Target Task and Hardware,” presented
-        at the International Conference on Learning Representations, Sep. 2018. Available: https://openreview.net/forum?id=HylVB3AqYm
-    """
-
-    @property
-    def default_replace_dict(self):
-        return {
-            LayerChoice : ProxylessLayerChoice,
-            InputChoice : ProxylessInputChoice
-        }
-
-    def configure_architecture_optimizers(self):
-        ctrl_optim = torch.optim.Adam([m.alpha for _, m in self.nas_modules], 3.e-4,
-                                           weight_decay=0, betas=(0, 0.999), eps=1e-8)
-        return ctrl_optim
+    {{module_params}}
+    {base_params}
+    arc_learning_rate : float
+        Learning rate for architecture optimizer. Default: 3.0e-4
+    """.format(base_params=BaseOneShotLightningModule._mutation_hooks_note)
+
+    __doc__ = _proxyless_note.format(
+        module_notes='This module should be trained with :class:`nni.retiarii.oneshot.pytorch.utils.InterleavedTrainValDataLoader`.',
+        module_params=BaseOneShotLightningModule._inner_module_note,
+    )

-    def _resample(self):
-        for _, m in self.nas_modules:
-            m.resample()
+    def default_mutation_hooks(self) -> List[MutationHook]:
+        """Replace modules with proxyless versions"""
+        hooks = [
+            ProxylessMixedLayer.mutate,
+            ProxylessMixedInput.mutate,
+            no_default_hook,
+        ]
+        # FIXME: no support for mixed operation currently
+        return hooks

    def finalize_grad(self):
-        for _, m in self.nas_modules:
+        for m in self.nas_modules:
            m.finalize_grad()


-class SNASLayerChoice(DartsLayerChoice):
-    def forward(self, *args, **kwargs):
-        self.one_hot = F.gumbel_softmax(self.alpha, self.temp)
-        op_results = torch.stack([op(*args, **kwargs) for op in self.op_choices.values()])
-        alpha_shape = [-1] + [1] * (len(op_results.size()) - 1)
-        yhat = torch.sum(op_results * self.one_hot.view(*alpha_shape), 0)
-        return yhat
-
+class GumbelDartsLightningModule(DartsLightningModule):
+    _gumbel_darts_note = """
+    Implementation of SNAS :cite:p:`xie2018snas`.
+    It's a DARTS-based method that uses gumbel-softmax to simulate one-hot distribution.
+    It uses the same differentiable supermodules as DARTS,
+    with the softmax replaced by gumbel-softmax.

-class SNASInputChoice(DartsInputChoice):
-    def forward(self, inputs):
-        self.one_hot = F.gumbel_softmax(self.alpha, self.temp)
-        inputs = torch.stack(inputs)
-        alpha_shape = [-1] + [1] * (len(inputs.size()) - 1)
-        yhat = torch.sum(inputs * self.one_hot.view(*alpha_shape), 0)
-        return yhat
+    *New in v2.8*: Supports searching for ValueChoices on operations, with the technique described in
+    `FBNetV2: Differentiable Neural Architecture Search for Spatial and Channel Dimensions <https://arxiv.org/abs/2004.05565>`__.

-
-class SNASModule(DartsModule):
-    """
-    The SNAS Module. This is a darts-based method that uses gumble-softmax to simulate one-hot distribution.
-    The SNAS Module should be trained with :class:`nni.retiarii.oneshot.utils.InterleavedTrainValDataLoader`.
+    {{module_notes}}

    Parameters
    ----------
-    base_model : pl.LightningModule
-        The evaluator in ``nni.retiarii.evaluator.lightning``. User defined model is wrapped by base_model, and base_model will
-        be wrapped by this model.
-    gumble_temperature : float
-        The initial temperature used in gumble-softmax.
+    {{module_params}}
+    {base_params}
+    gumbel_temperature : float
+        The initial temperature used in gumbel-softmax.
    use_temp_anneal : bool
-        True: a linear annealing will be applied to gumble_temperature. False: run at a fixed temperature. See [snas] for details.
+        If true, a linear annealing will be applied to ``gumbel_temperature``.
+        Otherwise, run at a fixed temperature. See :cite:t:`xie2018snas` for details.
    min_temp : float
        The minimal temperature for annealing. No need to set this if you set ``use_temp_anneal`` False.
-    custom_replace_dict : Dict[Type[nn.Module], Callable[[nn.Module], nn.Module]], default = None
-        The custom xxxChoice replace method. Keys should be xxxChoice type and values should return an ``nn.module``. This custom
-        replace dict will override the default replace dict of each NAS method.
+    arc_learning_rate : float
+        Learning rate for architecture optimizer. Default: 3.0e-4
+    """.format(base_params=BaseOneShotLightningModule._mutation_hooks_note)
+
+    def default_mutation_hooks(self) -> List[MutationHook]:
+        """Replace modules with gumbel-differentiable versions"""
+        hooks = [
+            DifferentiableMixedLayer.mutate,
+            DifferentiableMixedInput.mutate,
+        ]
+        hooks += [operation.mutate for operation in NATIVE_MIXED_OPERATIONS]
+        hooks.append(no_default_hook)
+        return hooks
+
+    def mutate_kwargs(self):
+        """Use gumbel softmax."""
+        return {
+            'mixed_op_sampling': MixedOpDifferentiablePolicy,
+            'softmax': GumbelSoftmax(),
+        }

-    Reference
-    ----------
-    .. [snas] S. Xie, H. Zheng, C. Liu, and L. Lin, “SNAS: stochastic neural architecture search,” presented at the
-        International Conference on Learning Representations, Sep. 2018. Available: https://openreview.net/forum?id=rylqooRqK7
-    """
-    def __init__(self, base_model, gumble_temperature = 1., use_temp_anneal = False,
-                 min_temp = .33, custom_replace_dict=None):
-        super().__init__(base_model, custom_replace_dict)
-        self.temp = gumble_temperature
-        self.init_temp = gumble_temperature
+    def __init__(self, inner_module,
+                 mutation_hooks: List[MutationHook] = None,
+                 arc_learning_rate: float = 3.0e-4,
+                 gumbel_temperature: float = 1.,
+                 use_temp_anneal: bool = False,
+                 min_temp: float = .33):
+        super().__init__(inner_module, mutation_hooks, arc_learning_rate=arc_learning_rate)
+        self.temp = gumbel_temperature
+        self.init_temp = gumbel_temperature
        self.use_temp_anneal = use_temp_anneal
        self.min_temp = min_temp

@@ -366,14 +211,7 @@ class SNASModule(DartsModule):
            self.temp = (1 - self.trainer.current_epoch / self.trainer.max_epochs) * (self.init_temp - self.min_temp) + self.min_temp
            self.temp = max(self.temp, self.min_temp)

-            for _, nas_module in self.nas_modules:
-                nas_module.temp = self.temp
+        for module in self.nas_modules:
+            module._softmax.temp = self.temp

        return self.model.on_epoch_start()
-
-    @property
-    def default_replace_dict(self):
-        return {
-            LayerChoice : SNASLayerChoice,
-            InputChoice : SNASInputChoice
-        }
--- a/nni/retiarii/oneshot/pytorch/sampling.py
+++ b/nni/retiarii/oneshot/pytorch/sampling.py
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-import random
+"""Experimental version of sampling-based one-shot implementation."""
+
+from typing import Dict, Any, List
+
+import pytorch_lightning as pl
 import torch
 import torch.nn as nn
 import torch.optim as optim

-from nni.retiarii.nn.pytorch.api import LayerChoice, InputChoice
-from .random import PathSamplingLayerChoice, PathSamplingInputChoice
-from .base_lightning import BaseOneShotLightningModule
+from .base_lightning import BaseOneShotLightningModule, MutationHook, no_default_hook
+from .supermodule.sampling import PathSamplingInput, PathSamplingLayer, MixedOpPathSamplingPolicy
+from .supermodule.operation import NATIVE_MIXED_OPERATIONS
 from .enas import ReinforceController, ReinforceField


-class EnasModule(BaseOneShotLightningModule):
-    """
-    The ENAS module. There are 2 steps in an epoch. 1: training model parameters. 2: training ENAS RL agent. The agent will produce
-    a sample of model architecture to get the best reward.
-    The ENASModule should be trained with :class:`nni.retiarii.oneshot.utils.ConcatenateTrainValDataloader`.
+class RandomSamplingLightningModule(BaseOneShotLightningModule):
+    _random_note = """
+    Random Sampling NAS Algorithm.
+    In each epoch, model parameters are trained after a uniformly random sampling of each choice.
+    Notably, the exported result is **also a random sample** of the search space.
+
+    Parameters
+    ----------
+    {{module_params}}
+    {base_params}
+    """.format(base_params=BaseOneShotLightningModule._mutation_hooks_note)
+
+    __doc__ = _random_note.format(
+        module_params=BaseOneShotLightningModule._inner_module_note,
+    )
+
+    # turn on automatic optimization because nothing interesting is going on here.
+    automatic_optimization = True
+
+    def default_mutation_hooks(self) -> List[MutationHook]:
+        """Replace modules with path-sampling versions"""
+        hooks = [
+            PathSamplingLayer.mutate,
+            PathSamplingInput.mutate,
+        ]
+        hooks += [operation.mutate for operation in NATIVE_MIXED_OPERATIONS]
+        hooks.append(no_default_hook)
+        return hooks
+
+    def mutate_kwargs(self):
+        """Use path sampling strategy for mixed-operations."""
+        return {
+            'mixed_op_sampling': MixedOpPathSamplingPolicy
+        }
+
+    def training_step(self, batch, batch_idx):
+        self.resample()
+        return self.model.training_step(batch, batch_idx)
+
+
+class EnasLightningModule(RandomSamplingLightningModule):
+    _enas_note = """
+    The implementation of ENAS :cite:p:`pham2018efficient`. There are 2 steps in an epoch.
+    Firstly, training model parameters.
+    Secondly, training ENAS RL agent. The agent will produce a sample of model architecture to get the best reward.
+
+    {{module_notes}}

    Parameters
    ----------
-    base_model : pl.LightningModule
-        he evaluator in ``nni.retiarii.evaluator.lightning``. User defined model is wrapped by base_model, and base_model will
-        be wrapped by this model.
+    {{module_params}}
+    {base_params}
    ctrl_kwargs : dict
        Optional kwargs that will be passed to :class:`ReinforceController`.
    entropy_weight : float
@@ -33,26 +78,36 @@ class EnasModule(BaseOneShotLightningModule):
        Decay factor of baseline. New baseline will be equal to ``baseline_decay * baseline_old + reward * (1 - baseline_decay)``.
    ctrl_steps_aggregate : int
        Number of steps that will be aggregated into one mini-batch for RL controller.
-    grad_clip : float
-        Gradient clipping value.
-    custom_replace_dict : Dict[Type[nn.Module], Callable[[nn.Module], nn.Module]], default = None
-        The custom xxxChoice replace method. Keys should be xxxChoice type and values should return an ``nn.module``. This custom
-        replace dict will override the default replace dict of each NAS method.
-
-    Reference
-    ----------
-    .. [enas] H. Pham, M. Guan, B. Zoph, Q. Le, and J. Dean, “Efficient Neural Architecture Search via Parameters Sharing,”
-        in Proceedings of the 35th International Conference on Machine Learning, Jul. 2018, pp. 4095-4104.
-        Available: https://proceedings.mlr.press/v80/pham18a.html
-    """
-    def __init__(self, base_model, ctrl_kwargs = None,
-                 entropy_weight = 1e-4, skip_weight = .8, baseline_decay = .999,
-                 ctrl_steps_aggregate = 20, grad_clip = 0, custom_replace_dict = None):
-        super().__init__(base_model, custom_replace_dict)
-
-        self.nas_fields = [ReinforceField(name, len(module),
-                                          isinstance(module, PathSamplingLayerChoice) or module.n_chosen == 1)
-                           for name, module in self.nas_modules]
+    ctrl_grad_clip : float
+        Gradient clipping value of controller.
+    """.format(base_params=BaseOneShotLightningModule._mutation_hooks_note)
+
+    __doc__ = _enas_note.format(
+        module_notes='``EnasLightningModule`` should be trained with :class:`nni.retiarii.oneshot.utils.ConcatenateTrainValDataloader`.',
+        module_params=BaseOneShotLightningModule._inner_module_note,
+    )
+
+    automatic_optimization = False
+
+    def __init__(self,
+                 inner_module: pl.LightningModule,
+                 *,
+                 ctrl_kwargs: Dict[str, Any] = None,
+                 entropy_weight: float = 1e-4,
+                 skip_weight: float = .8,
+                 baseline_decay: float = .999,
+                 ctrl_steps_aggregate: float = 20,
+                 ctrl_grad_clip: float = 0,
+                 mutation_hooks: List[MutationHook] = None):
+        super().__init__(inner_module, mutation_hooks)
+
+        # convert parameter spec to legacy ReinforceField
+        # this part will be refactored
+        self.nas_fields: List[ReinforceField] = []
+        for name, param_spec in self.search_space_spec().items():
+            if param_spec.chosen_size not in (1, None):
+                raise ValueError('ENAS does not support n_chosen to be values other than 1 or None.')
+            self.nas_fields.append(ReinforceField(name, param_spec.size, param_spec.chosen_size == 1))
        self.controller = ReinforceController(self.nas_fields, **(ctrl_kwargs or {}))

        self.entropy_weight = entropy_weight
@@ -60,25 +115,18 @@ class EnasModule(BaseOneShotLightningModule):
        self.baseline_decay = baseline_decay
        self.baseline = 0.
        self.ctrl_steps_aggregate = ctrl_steps_aggregate
-        self.grad_clip = grad_clip
+        self.ctrl_grad_clip = ctrl_grad_clip

    def configure_architecture_optimizers(self):
        return optim.Adam(self.controller.parameters(), lr=3.5e-4)

-    @property
-    def default_replace_dict(self):
-        return {
-            LayerChoice : PathSamplingLayerChoice,
-            InputChoice : PathSamplingInputChoice
-        }
-
    def training_step(self, batch, batch_idx):
        # The ConcatenateTrainValDataloader yields both data and which dataloader it comes from.
        batch, source = batch

        if source == 'train':
            # step 1: train model params
-            self._resample()
+            self.resample()
            self.call_user_optimizers('zero_grad')
            loss_and_metrics = self.model.training_step(batch, batch_idx)
            w_step_loss = loss_and_metrics['loss'] \
@@ -92,7 +140,7 @@ class EnasModule(BaseOneShotLightningModule):
            x, y = batch
            arc_opt = self.architecture_optimizers
            arc_opt.zero_grad()
-            self._resample()
+            self.resample()
            with torch.no_grad():
                logits = self.model(x)
            # use the default metric of self.model as reward function
@@ -100,7 +148,7 @@ class EnasModule(BaseOneShotLightningModule):
                _, metric = next(iter(self.model.metrics.items()))
            else:
                if 'default' not in self.model.metrics.keys():
-                    raise KeyError('model.metrics should contain a ``default`` key when' \
+                    raise KeyError('model.metrics should contain a ``default`` key when'
                                   'there are multiple metrics')
                metric = self.model.metrics['default']

@@ -116,63 +164,28 @@ class EnasModule(BaseOneShotLightningModule):
            self.manual_backward(rnn_step_loss)

            if (batch_idx + 1) % self.ctrl_steps_aggregate == 0:
-                if self.grad_clip > 0:
-                    nn.utils.clip_grad_norm_(self.controller.parameters(), self.grad_clip)
+                if self.ctrl_grad_clip > 0:
+                    nn.utils.clip_grad_norm_(self.controller.parameters(), self.ctrl_grad_clip)
                arc_opt.step()
                arc_opt.zero_grad()

-    def _resample(self):
-        """
-        Resample the architecture as ENAS result. This doesn't require an ``export`` method in nas_modules to work.
-        """
-        result = self.controller.resample()
-        for name, module in self.nas_modules:
-            module.sampled = result[name]
+    def resample(self):
+        """Draw a sample from the ENAS controller and hand it to every module in ``self.nas_modules``."""
+        decoded = self._interpret_controller_sampling_result(self.controller.resample())
+        # every module receives the very same ``memo`` dict
+        for nas_module in self.nas_modules:
+            nas_module.resample(memo=decoded)
+        return decoded

    def export(self):
+        """Run one more inference of ENAS controller."""
        self.controller.eval()
        with torch.no_grad():
-            return self.controller.resample()
-
-
-class RandomSampleModule(BaseOneShotLightningModule):
-    """
-    Random Sampling NAS Algorithm. In each epoch, model parameters are trained after a uniformly random sampling of each choice.
-    The training result is also a random sample of the search space.
-
-    Parameters
-    ----------
-    base_model : pl.LightningModule
-        he evaluator in ``nni.retiarii.evaluator.lightning``. User defined model is wrapped by base_model, and base_model will
-        be wrapped by this model.
-    custom_replace_dict : Dict[Type[nn.Module], Callable[[nn.Module], nn.Module]], default = None
-        The custom xxxChoice replace method. Keys should be xxxChoice type and values should return an ``nn.module``. This custom
-        replace dict will override the default replace dict of each NAS method.
-    """
-    automatic_optimization = True
-
-    def training_step(self, batch, batch_idx):
-        self._resample()
-        return self.model.training_step(batch, batch_idx)
-
-    @property
-    def default_replace_dict(self):
-        return {
-            LayerChoice : PathSamplingLayerChoice,
-            InputChoice : PathSamplingInputChoice
-        }
-
-    def _resample(self):
-        """
-        Resample the architecture as RandomSample result. This is simply a uniformly sampling that doesn't require an ``export``
-        method in nas_modules to work.
-        """
-        result = {}
-        for name, module in self.nas_modules:
-            if name not in result:
-                result[name] = random.randint(0, len(module) - 1)
-            module.sampled = result[name]
-        return result
-
-    def export(self):
-        return self._resample()
+            return self._interpret_controller_sampling_result(self.controller.resample())
+
+    def _interpret_controller_sampling_result(self, sample: Dict[str, int]) -> Dict[str, Any]:
+        """Replace, in place, every sampled index in ``sample`` with the candidate value it points at."""
+        specs = self.search_space_spec()
+        for label, index in list(sample.items()):
+            sample[label] = specs[label].values[index]
+        return sample
--- a/nni/retiarii/oneshot/pytorch/strategy.py
+++ b/nni/retiarii/oneshot/pytorch/strategy.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Strategy integration of one-shot.
+
+This file is put here simply because it relies on "pytorch".
+For consistency, please consider importing strategies from ``nni.retiarii.strategy``.
+For example, ``nni.retiarii.strategy.DartsStrategy`` (this requires pytorch to be installed of course).
+
+When adding/modifying a new strategy in this file, don't forget to link it in strategy/oneshot.py.
+"""
+
+import warnings
+from typing import Any, List, Optional, Type, Union, Tuple
+
+import torch.nn as nn
+from torch.utils.data import DataLoader
+
+from nni.retiarii.graph import Model
+from nni.retiarii.strategy.base import BaseStrategy
+from nni.retiarii.evaluator.pytorch.lightning import Lightning, LightningModule
+
+from .base_lightning import BaseOneShotLightningModule
+from .differentiable import DartsLightningModule, ProxylessLightningModule, GumbelDartsLightningModule
+from .sampling import EnasLightningModule, RandomSamplingLightningModule
+from .utils import InterleavedTrainValDataLoader, ConcatenateTrainValDataLoader
+
+
+class OneShotStrategy(BaseStrategy):
+    """Wrap a one-shot lightning module so that it can be used as a strategy.
+
+    Parameters
+    ----------
+    oneshot_module : Type[BaseOneShotLightningModule]
+        Class of the one-shot lightning module. It is instantiated inside :meth:`run`.
+    **kwargs
+        Keyword arguments passed to ``oneshot_module`` when it is instantiated.
+    """
+
+    def __init__(self, oneshot_module: Type[BaseOneShotLightningModule], **kwargs):
+        self.oneshot_module = oneshot_module
+        self.oneshot_kwargs = kwargs
+
+        # Filled in by ``run``; ``export_top_models`` refuses to work while it is still None.
+        self.model: Optional[BaseOneShotLightningModule] = None
+
+    def _get_dataloader(self, train_dataloader: DataLoader, val_dataloaders: DataLoader) \
+        -> Union[DataLoader, Tuple[DataLoader, DataLoader]]:
+        """
+        Turn the evaluator's dataloaders into what the one-shot module should be fitted with.
+
+        Subclasses must override this method. They return either a single dataloader,
+        or a ``(train dataloader, validation dataloader)`` tuple.
+        """
+        raise NotImplementedError()
+
+    def run(self, base_model: Model, applied_mutators):
+        """Build the one-shot module around ``base_model`` and fit it with the evaluator's trainer.
+
+        ``applied_mutators`` must be empty: one-shot strategies do not use them,
+        they collect the "mutators" on their own.
+
+        Raises
+        ------
+        TypeError
+            If the model's python object is not a ``nn.Module``, or the evaluator is not a ``Lightning`` evaluator.
+        ValueError
+            If ``applied_mutators`` is not empty.
+        """
+        hint = 'The reason might be that you have used the wrong execution engine. Try to set engine to `oneshot` and try again.'
+
+        user_model: nn.Module = base_model.python_object
+        if not isinstance(user_model, nn.Module):
+            raise TypeError('Model is not a nn.Module. ' + hint)
+        if applied_mutators:
+            raise ValueError('Mutator is not empty. ' + hint)
+
+        evaluator = base_model.evaluator
+        if not isinstance(evaluator, Lightning):
+            raise TypeError('Evaluator needs to be a lightning evaluator to make one-shot strategy work.')
+
+        # attach the user model to the evaluator's lightning module, then wrap that module
+        inner_module: LightningModule = evaluator.module
+        inner_module.set_model(user_model)
+        self.model = self.oneshot_module(inner_module, **self.oneshot_kwargs)
+
+        loaders = self._get_dataloader(evaluator.train_dataloader, evaluator.val_dataloaders)
+        if not isinstance(loaders, tuple):
+            evaluator.trainer.fit(self.model, loaders)
+            return
+        train_loader, val_loader = loaders
+        evaluator.trainer.fit(self.model, train_loader, val_loader)
+
+    def export_top_models(self, top_k: int = 1) -> List[Any]:
+        """Return a one-element list holding the export of the trained one-shot module.
+
+        A ``RuntimeWarning`` is issued when ``top_k`` is not 1; the result still has exactly one element.
+
+        Raises
+        ------
+        RuntimeError
+            If :meth:`run` has not been called yet.
+        """
+        if self.model is None:
+            raise RuntimeError('One-shot strategy needs to be run before export.')
+        if top_k != 1:
+            warnings.warn('One-shot strategy currently only supports exporting top-1 model.', RuntimeWarning)
+        exported = self.model.export()
+        return [exported]
+
+
+class DARTS(OneShotStrategy):
+    # The docstring is rendered from the note template kept on the lightning module,
+    # with both placeholders left empty.
+    __doc__ = DartsLightningModule._darts_note.format(module_params='', module_notes='')
+
+    def __init__(self, **kwargs):
+        # ``kwargs`` are stored by the base class together with the module class
+        super().__init__(DartsLightningModule, **kwargs)
+
+    def _get_dataloader(self, train_dataloader, val_dataloaders):
+        # a single dataloader wrapping both the train and the validation dataloaders
+        interleaved = InterleavedTrainValDataLoader(train_dataloader, val_dataloaders)
+        return interleaved
+
+
+class Proxyless(OneShotStrategy):
+    # The docstring is rendered from the note template kept on the lightning module,
+    # with both placeholders left empty.
+    __doc__ = ProxylessLightningModule._proxyless_note.format(module_params='', module_notes='')
+
+    def __init__(self, **kwargs):
+        # ``kwargs`` are stored by the base class together with the module class
+        super().__init__(ProxylessLightningModule, **kwargs)
+
+    def _get_dataloader(self, train_dataloader, val_dataloaders):
+        # a single dataloader wrapping both the train and the validation dataloaders
+        interleaved = InterleavedTrainValDataLoader(train_dataloader, val_dataloaders)
+        return interleaved
+
+
+class GumbelDARTS(OneShotStrategy):
+    # The docstring is rendered from the note template kept on the lightning module,
+    # with both placeholders left empty.
+    __doc__ = GumbelDartsLightningModule._gumbel_darts_note.format(module_params='', module_notes='')
+
+    def __init__(self, **kwargs):
+        # ``kwargs`` are stored by the base class together with the module class
+        super().__init__(GumbelDartsLightningModule, **kwargs)
+
+    def _get_dataloader(self, train_dataloader, val_dataloaders):
+        # a single dataloader wrapping both the train and the validation dataloaders
+        interleaved = InterleavedTrainValDataLoader(train_dataloader, val_dataloaders)
+        return interleaved
+
+
+class ENAS(OneShotStrategy):
+    # The docstring is rendered from the note template kept on the lightning module,
+    # with both placeholders left empty.
+    __doc__ = EnasLightningModule._enas_note.format(module_params='', module_notes='')
+
+    def __init__(self, **kwargs):
+        # ``kwargs`` are stored by the base class together with the module class
+        super().__init__(EnasLightningModule, **kwargs)
+
+    def _get_dataloader(self, train_dataloader, val_dataloaders):
+        # a single dataloader wrapping both the train and the validation dataloaders
+        concatenated = ConcatenateTrainValDataLoader(train_dataloader, val_dataloaders)
+        return concatenated
+
+
+class RandomOneShot(OneShotStrategy):
+    # The docstring is rendered from the note template kept on the lightning module,
+    # with both placeholders left empty.
+    __doc__ = RandomSamplingLightningModule._random_note.format(module_params='', module_notes='')
+
+    def __init__(self, **kwargs):
+        # ``kwargs`` are stored by the base class together with the module class
+        super().__init__(RandomSamplingLightningModule, **kwargs)
+
+    def _get_dataloader(self, train_dataloader, val_dataloaders):
+        # no wrapping here: both dataloaders are handed back untouched, as a tuple
+        return (train_dataloader, val_dataloaders)
--- a/nni/retiarii/oneshot/pytorch/supermodule/__init__.py
+++ b/nni/retiarii/oneshot/pytorch/supermodule/__init__.py
--- a/nni/retiarii/oneshot/pytorch/supermodule/_operation_utils.py
+++ b/nni/retiarii/oneshot/pytorch/supermodule/_operation_utils.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""This file handles "slice" commonly used in mixed-operation.
+
+The ``slice_type`` we support here, is "slice" or "list of slice".
+The reason is that sometimes (e.g., in multi-head attention),
+the tensor slice could be from multiple parts. This type is extensible.
+We can support arbitrary masks in future if we need them.
+
+To slice a tensor, we need ``multidim_slice``,
+which is simply a tuple consisting of ``slice_type``.
+
+Usually in python programs, the variable put into slice's start, stop and step
+should be integers (or NoneType).
+But in our case, it could also be a dict from integer to float,
+representing a distribution of integer. When that happens,
+we convert a "slice with some weighted values", to a "weighted slice".
+To this end, we track the computation with ``MaybeWeighted``,
+and replay the computation with each possible value.
+Meanwhile, we record their weights.
+Note that ``MaybeWeighted`` is also extensible.
+We can support more types of objects on slice in future.
+
+The fixed/weighted slice is fed into ``_slice_weight``,
+which interprets the slice and apply it on a tensor.
+"""
+
+import operator
+from typing import Tuple, Union, List, Dict, Callable, Optional, Iterator, TypeVar, Any, Generic
+
+import numpy as np
+import torch
+
+T = TypeVar('T')
+
+slice_type = Union[slice, List[slice]]
+multidim_slice = Tuple[slice_type, ...]
+
+scalar_or_scalar_dict = Union[T, Dict[T, float]]
+int_or_int_dict = scalar_or_scalar_dict[int]
+
+_value_fn_type = Optional[Callable[[int_or_int_dict], int]]
+
+
+def zeros_like(arr: T) -> T:
+    """Return ``np.zeros_like(arr)`` for a numpy array, or ``torch.zeros_like(arr)`` for a torch tensor.
+
+    Raises
+    ------
+    TypeError
+        If ``arr`` is neither a ``np.ndarray`` nor a ``torch.Tensor``.
+    """
+    # checked in this order: numpy first, then torch
+    for array_type, make_zeros in ((np.ndarray, np.zeros_like), (torch.Tensor, torch.zeros_like)):
+        if isinstance(arr, array_type):
+            return make_zeros(arr)
+    raise TypeError(f'Unsupported type for {arr}: {type(arr)}')
+
+
+def _eliminate_list_slice(shape: tuple, slice_: multidim_slice) -> multidim_slice:
+    """Get rid of every "list of slices" in ``slice_`` by converting it to a boolean mask.
+
+    Parameters
+    ----------
+    shape : tuple
+        Shape of the array that is going to be indexed. ``shape[i]`` gives the length of the
+        mask created for the ``i``-th entry of ``slice_``.
+    slice_ : multidim_slice
+        One entry per dimension. List entries are converted; all other entries are kept as they are.
+
+    Returns
+    -------
+    tuple
+        Same length as ``slice_``. A list entry becomes a 1-d boolean ``np.ndarray``
+        that is true on the union of the slices in the list.
+    """
+    result = []
+    for dim, item in enumerate(slice_):
+        if isinstance(item, list):
+            # Use the builtin ``bool``: the ``np.bool`` alias is deprecated since NumPy 1.20
+            # and no longer exists in NumPy >= 1.24.
+            mask = np.zeros(shape[dim], dtype=bool)
+            for sl in item:
+                mask[sl] = True
+            result.append(mask)
+        else:
+            result.append(item)
+    return tuple(result)
+
+
+def _slice_weight(weight: T, slice_: Union[multidim_slice, List[Tuple[multidim_slice, float]]]) -> T:
+    """Interpret a fixed or weighted slice and apply it on ``weight``.
+
+    Parameters
+    ----------
+    weight
+        A numpy array or a torch tensor.
+    slice_
+        Either a fixed ``multidim_slice``, e.g., ``(slice(3, 6), slice(2, 4))``,
+        or a list of ``(multidim_slice, weight)`` pairs, e.g., ``[((slice(3, 6),), 0.6), ((slice(2, 4),), 0.3)]``.
+        The weighted case is a list of pairs rather than a dict, because a slice can't be a key of dict.
+
+    Returns
+    -------
+    For the fixed case, the sliced ``weight`` (``weight`` itself if the slice selects everything).
+    For the weighted case, ``weight`` multiplied by the weighted sum of masks, in the same shape as ``weight``.
+    """
+
+    if isinstance(slice_, list):
+        # for weighted case, we get the corresponding masks. e.g.,
+        # [(([3:6],), 0.6), (([2:4],), 0.3)] => [0, 0, 0.3, 0.9, 0.6, 0.6] (if the whole length is 6)
+        # this mask is multiplied onto the weight
+
+        masks = []
+
+        for sl, wt in slice_:
+            # create a 0/1 mask for this slice; building it needs no gradients
+            with torch.no_grad():
+                mask = zeros_like(weight)
+                mask[_eliminate_list_slice(weight.shape, sl)] = 1
+
+            # track gradients here
+            masks.append((mask * wt))
+
+        masks = sum(masks)
+
+        return masks * weight
+
+    else:
+        # for unweighted case, we slice it directly.
+
+        def _do_slice(arr, slice_):
+            return arr[_eliminate_list_slice(arr.shape, slice_)]
+
+        # sometimes, we don't need slice.
+        # this saves an op on computational graph, which will hopefully make training faster
+
+        # Use a dummy array to check this. Otherwise it would be too complex.
+        # The dtype is the builtin ``bool``: the ``np.bool`` alias is deprecated since NumPy 1.20
+        # and no longer exists in NumPy >= 1.24.
+        dummy_arr = np.zeros(weight.shape, dtype=bool)
+        no_effect = _do_slice(dummy_arr, slice_).shape == dummy_arr.shape
+
+        if no_effect:
+            return weight
+
+        return _do_slice(weight, slice_)
+
+
+class Slicable(Generic[T]):
+    """Wraps the weight so that it can be sliced with a ``multidim_slice``.
+    The value within the slice can be instances of :class:`MaybeWeighted`.
+
+    Parameters
+    ----------
+    weight
+        A numpy array or a torch tensor. Anything else is rejected with ``TypeError``.
+
+    Examples
+    --------
+    >>> weight = conv2d.weight
+    >>> Slicable(weight)[:MaybeWeighted({32: 0.4, 64: 0.6})]
+    Tensor of shape (64, 64, 3, 3)
+    """
+
+    def __init__(self, weight: T):
+        if not isinstance(weight, np.ndarray) and not torch.is_tensor(weight):
+            raise TypeError(f'Unsupported weight type: {type(weight)}')
+        self.weight = weight
+
+    def __getitem__(self, index: multidim_slice) -> T:
+        """Slice the wrapped weight with ``index``.
+
+        If no dict is found among the leaf values of ``index``, it is evaluated once and the weight is sliced directly.
+        Otherwise ``index`` is evaluated once per ``(value, weight)`` item of the dict,
+        and the weight is sliced in the weighted way.
+
+        Raises
+        ------
+        ValueError
+            If more than one distinct dict object appears in the leaf values.
+        """
+        if not isinstance(index, tuple):
+            index = (index, )
+
+        # Get the dict value in index's leafs
+        # There can be at most one dict (compared by identity, so the same dict may appear many times)
+        leaf_dict: Optional[Dict[int, float]] = None
+        for maybe_weighted in _iterate_over_multidim_slice(index):
+            for d in maybe_weighted.leaf_values():
+                if isinstance(d, dict):
+                    if leaf_dict is None:
+                        leaf_dict = d
+                    elif leaf_dict is not d:
+                        raise ValueError('There can be at most one distinct dict in leaf values.')
+
+        if leaf_dict is None:
+            # in case of simple types with no dict
+            res_index = _evaluate_multidim_slice(index)
+        else:
+            # there is a dict, iterate over dict
+            res_index = []
+            for val, wt in leaf_dict.items():
+                # every leaf is replaced with ``val``; the lambda is consumed right away,
+                # so it always sees the ``val`` of the current iteration
+                res_index_item = _evaluate_multidim_slice(index, lambda _: val)
+                res_index.append((res_index_item, wt))
+
+        return _slice_weight(self.weight, res_index)
+
+
+class MaybeWeighted:
+    """Wrap a value (int or dict with int keys), and record the arithmetic applied to it for later replay.
+
+    Instances form a binary tree. A node whose ``value`` is not None is a leaf.
+    Any other node carries an ``operation`` together with its ``lhs`` and ``rhs`` operands,
+    each of which is either another :class:`MaybeWeighted` or a plain value.
+
+    Only support basic arithmetic operations: ``+``, ``-``, ``*``, ``//``.
+    """
+
+    def __init__(self,
+                 value: Optional[int_or_int_dict] = None, *,
+                 lhs: Optional[Union['MaybeWeighted', int]] = None,
+                 rhs: Optional[Union['MaybeWeighted', int]] = None,
+                 operation: Optional[Callable[[int, int], int]] = None):
+        # without an operation, this node must be a leaf that carries an int or a dict
+        if operation is None and not isinstance(value, (int, dict)):
+            raise TypeError(f'Unsupported value type: {type(value)}')
+        self.value = value
+        self.lhs = lhs
+        self.rhs = rhs
+        self.operation = operation
+
+    def leaf_values(self) -> Iterator[Dict[int, float]]:
+        """Iterate over values on leaf nodes, left sub-tree first."""
+        if self.value is not None:
+            yield self.value
+            return
+        for child in (self.lhs, self.rhs):
+            if isinstance(child, MaybeWeighted):
+                yield from child.leaf_values()
+
+    def evaluate(self, value_fn: _value_fn_type = None) -> int:
+        """Replay the recorded computation and return the value on root node.
+
+        Every leaf value is passed through ``value_fn`` first.
+        If ``value_fn`` is none, the raw leaf values are used.
+        """
+        if self.value is not None:
+            return self.value if value_fn is None else value_fn(self.value)
+        # plain (non-MaybeWeighted) operands are used as they are
+        operands = [
+            child.evaluate(value_fn) if isinstance(child, MaybeWeighted) else child
+            for child in (self.lhs, self.rhs)
+        ]
+        return self.operation(*operands)
+
+    def __repr__(self):
+        cls_name = self.__class__.__name__
+        if self.value is None:
+            return f'{cls_name}(lhs={self.lhs}, rhs={self.rhs}, op={self.operation})'
+        return f'{cls_name}({self.value})'
+
+    # Each operator below creates a new inner node. Nothing is computed until ``evaluate`` is called.
+
+    def __add__(self, other: Any) -> 'MaybeWeighted':
+        return MaybeWeighted(operation=operator.add, lhs=self, rhs=other)
+
+    def __radd__(self, other: Any) -> 'MaybeWeighted':
+        return MaybeWeighted(operation=operator.add, lhs=other, rhs=self)
+
+    def __sub__(self, other: Any) -> 'MaybeWeighted':
+        return MaybeWeighted(operation=operator.sub, lhs=self, rhs=other)
+
+    def __rsub__(self, other: Any) -> 'MaybeWeighted':
+        return MaybeWeighted(operation=operator.sub, lhs=other, rhs=self)
+
+    def __mul__(self, other: Any) -> 'MaybeWeighted':
+        return MaybeWeighted(operation=operator.mul, lhs=self, rhs=other)
+
+    def __rmul__(self, other: Any) -> 'MaybeWeighted':
+        return MaybeWeighted(operation=operator.mul, lhs=other, rhs=self)
+
+    def __floordiv__(self, other: Any) -> 'MaybeWeighted':
+        return MaybeWeighted(operation=operator.floordiv, lhs=self, rhs=other)
+
+    def __rfloordiv__(self, other: Any) -> 'MaybeWeighted':
+        return MaybeWeighted(operation=operator.floordiv, lhs=other, rhs=self)
+
+
+def _iterate_over_slice_type(s: slice_type):
+    """Get :class:`MaybeWeighted` instances found in start, stop and step of ``s``.
+
+    ``s`` is a slice, or a (possibly nested) list of slices.
+    """
+    if isinstance(s, list):
+        for element in s:
+            yield from _iterate_over_slice_type(element)
+        return
+
+    # s must be a "slice" now
+    for bound in (s.start, s.stop, s.step):
+        if isinstance(bound, MaybeWeighted):
+            yield bound
+
+
+def _iterate_over_multidim_slice(ms: multidim_slice):
+    """Get :class:`MaybeWeighted` instances in ``ms``. ``None`` entries are skipped."""
+    for entry in ms:
+        if s_is_none := entry is None:
+            continue
+        yield from _iterate_over_slice_type(entry)
+
+
+def _evaluate_slice_type(s: slice_type, value_fn: _value_fn_type = None):
+    """Evaluate every :class:`MaybeWeighted` in ``s``, and return a slice (or list of slices) of plain values."""
+    if isinstance(s, list):
+        return [_evaluate_slice_type(element, value_fn) for element in s]
+
+    def _resolve(bound):
+        # anything that is not a MaybeWeighted is kept as it is
+        if isinstance(bound, MaybeWeighted):
+            return bound.evaluate(value_fn)
+        return bound
+
+    return slice(_resolve(s.start), _resolve(s.stop), _resolve(s.step))
+
+
+def _evaluate_multidim_slice(ms: multidim_slice, value_fn: _value_fn_type = None):
+    """Wraps :meth:`MaybeWeighted.evaluate` to evaluate the whole ``multidim_slice``.
+
+    ``None`` entries are kept as ``None``. The result is a tuple of the same length as ``ms``.
+    """
+    return tuple(
+        None if entry is None else _evaluate_slice_type(entry, value_fn)
+        for entry in ms
+    )
--- a/nni/retiarii/oneshot/pytorch/supermodule/_singlepathnas.py
+++ b/nni/retiarii/oneshot/pytorch/supermodule/_singlepathnas.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+# pylint: skip-file
+
+"""This file is an incomplete implementation of `Single-path NAS <https://arxiv.org/abs/1904.02877>`__.
+These are merely some components of the algorithm. Complete support is still a work in progress.
+
+Keep this file here so that it can be "blamed".
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from nni.retiarii.nn.pytorch import ValueChoice
+
+
+class DifferentiableSuperConv2d(nn.Conv2d):
+    """
+    Only ``kernel_size``, ``in_channels`` and ``out_channels`` are supported. Kernel size candidates should be larger or smaller
+    than each other in both dimensions. See examples below:
+    the following example is not allowed:
+        >>> ValueChoice(candidates = [(5, 3), (3, 5)])
+            □ ■ ■ ■ □   □ □ □ □ □
+            □ ■ ■ ■ □   ■ ■ ■ ■ ■    # candidates are not bigger or smaller on both dimension
+            □ ■ ■ ■ □   ■ ■ ■ ■ ■
+            □ ■ ■ ■ □   ■ ■ ■ ■ ■
+            □ ■ ■ ■ □   □ □ □ □ □
+    the following 3 examples are valid:
+        >>> ValueChoice(candidates = [5, 3, 1])
+            ■ ■ ■ ■ ■   □ □ □ □ □   □ □ □ □ □
+            ■ ■ ■ ■ ■   □ ■ ■ ■ □   □ □ □ □ □
+            ■ ■ ■ ■ ■   □ ■ ■ ■ □   □ □ ■ □ □
+            ■ ■ ■ ■ ■   □ ■ ■ ■ □   □ □ □ □ □
+            ■ ■ ■ ■ ■   □ □ □ □ □   □ □ □ □ □
+        >>> ValueChoice(candidates = [(5, 7), (3, 5), (1, 3)])
+            ■ ■ ■ ■ ■ ■ ■  □ □ □ □ □ □ □   □ □ □ □ □ □ □
+            ■ ■ ■ ■ ■ ■ ■  □ ■ ■ ■ ■ ■ □   □ □ □ □ □ □ □
+            ■ ■ ■ ■ ■ ■ ■  □ ■ ■ ■ ■ ■ □   □ □ ■ ■ ■ □ □
+            ■ ■ ■ ■ ■ ■ ■  □ ■ ■ ■ ■ ■ □   □ □ □ □ □ □ □
+            ■ ■ ■ ■ ■ ■ ■  □ □ □ □ □ □ □   □ □ □ □ □ □ □
+        >>> # when the difference between any two candidates is not even, the left upper will be picked:
+        >>> ValueChoice(candidates = [(5, 5), (4, 4), (3, 3)])
+            ■ ■ ■ ■ ■   ■ ■ ■ ■ □   □ □ □ □ □
+            ■ ■ ■ ■ ■   ■ ■ ■ ■ □   □ ■ ■ ■ □
+            ■ ■ ■ ■ ■   ■ ■ ■ ■ □   □ ■ ■ ■ □
+            ■ ■ ■ ■ ■   ■ ■ ■ ■ □   □ ■ ■ ■ □
+            ■ ■ ■ ■ ■   □ □ □ □ □   □ □ □ □ □
+
+    Parameters
+    ----------
+    module
+        The module to be replaced. Its ``trace_kwargs`` are used as the keyword arguments of ``nn.Conv2d``,
+        after every supported ``ValueChoice`` is replaced with its largest candidate.
+    name
+        Stored as ``self.label``.
+    """
+
+    def __init__(self, module, name):
+        self.label = name
+        # NOTE(review): this is the very dict held by ``module``; the assignments below modify it in place.
+        args = module.trace_kwargs
+
+        # compulsory params
+        if isinstance(args['in_channels'], ValueChoice):
+            args['in_channels'] = max(args['in_channels'].candidates)
+
+        self.out_channel_candidates = None
+        if isinstance(args['out_channels'], ValueChoice):
+            self.out_channel_candidates = sorted(args['out_channels'].candidates, reverse=True)
+            args['out_channels'] = self.out_channel_candidates[0]
+
+        # kernel_size may be an int or tuple, we turn it into a tuple for simplicity
+        self.kernel_size_candidates = None
+        if isinstance(args['kernel_size'], ValueChoice):
+            # unify kernel size as tuple
+            candidates = args['kernel_size'].candidates
+            if not isinstance(candidates[0], tuple):
+                candidates = [(k, k) for k in candidates]
+
+            # sort kernel size in descending order (of the first dimension)
+            self.kernel_size_candidates = sorted(candidates, key=lambda t: t[0], reverse=True)
+            for i in range(0, len(self.kernel_size_candidates) - 1):
+                bigger = self.kernel_size_candidates[i]
+                smaller = self.kernel_size_candidates[i + 1]
+                assert bigger[1] > smaller[1] or (bigger[1] == smaller[1] and bigger[0] > smaller[0]), f'Kernel_size candidates ' \
+                    f'should be larger or smaller than each other on both dimensions, but found {bigger} and {smaller}.'
+            args['kernel_size'] = self.kernel_size_candidates[0]
+
+        super().__init__(**args)
+        self.generate_architecture_params()
+
+    def forward(self, input):
+        # Note that there is no need to handle ``in_channels`` here since it is already handle by the ``out_channels`` in the
+        # previous module. If we multiply alpha with refer to ``in_channels`` here again, the alpha will indeed be considered
+        # twice, which is not what we expect.
+        weight = self.weight
+
+        def sum_weight(input_weight, masks, thresholds, indicator):
+            """
+            This is to get the weighted sum of weight.
+
+            Parameters
+            ----------
+            input_weight : Tensor
+                the weight to be weighted summed
+            masks : List[Tensor]
+                weight masks.
+            thresholds : List[float]
+                thresholds, should have a length of ``len(masks) - 1``
+            indicator : Callable[[Tensor, float], float]
+                take a tensor and a threshold as input, and output the weight
+
+            Returns
+            ----------
+            weight : Tensor
+                weighted sum of ``input_weight``. this is of the same shape as ``input_weight``
+            """
+            # Note that ``masks`` and ``thresholds`` have different lengths. Their alignment is shown below:
+            # self.xxx_candidates = [   c_0  ,   c_1  , ... ,  c_n-2  ,   c_n-1 ] # descending order
+            # self.xxx_mask       = [ mask_0 , mask_1 , ... , mask_n-2, mask_n-1]
+            # self.t_xxx          = [   t_0  ,   t_1  , ... ,  t_n-2 ]
+            # So we zip the first n-1 items, and multiply masks[-1] in the end.
+            weight = torch.zeros_like(input_weight)
+            for mask, t in zip(masks[:-1], thresholds):
+                cur_part = input_weight * mask
+                alpha = indicator(cur_part, t)
+                # the part of mask_i ends up multiplied by alpha_i * alpha_i+1 * ... * alpha_n-2
+                weight = (weight + cur_part) * alpha
+            # we do not consider skip-op here for out_channel/expansion candidates, which means at least the smallest channel
+            # candidate is included
+            weight += input_weight * masks[-1]
+
+            return weight
+
+        if self.kernel_size_candidates is not None:
+            weight = sum_weight(weight, self.kernel_masks, self.t_kernel, self.Lasso_sigmoid)
+
+        if self.out_channel_candidates is not None:
+            weight = sum_weight(weight, self.channel_masks, self.t_expansion, self.Lasso_sigmoid)
+
+        output = self._conv_forward(input, weight, self.bias)
+        return output
+
+    def parameters(self):
+        for _, p in self.named_parameters():
+            yield p
+
+    def named_parameters(self):
+        # NOTE(review): ``self.alpha`` is a plain dict in this class, and the thresholds are assigned as
+        # ``t_kernel`` / ``t_expansion``. It looks like no parameter is ever named 'alpha' here,
+        # so this filter presumably never matches -- confirm the intent.
+        for name, p in super().named_parameters():
+            if name == 'alpha':
+                continue
+            yield name, p
+
+    def export(self):
+        """
+        result = {
+            'kernel_size': c_i,
+            'out_channels': c_j
+        }
+        where each value is the chosen candidate itself (not its index).
+        A key is present only when the corresponding argument has candidates.
+
+        Candidates are visited from the smallest one to the largest one. The search stops at the first threshold
+        whose indicator is (almost) zero, and takes the candidate right below it.
+        If no indicator is zero, the largest candidate is taken.
+        """
+        result = {}
+        eps = 1e-5
+        with torch.no_grad():
+            if self.kernel_size_candidates is not None:
+                weight = torch.zeros_like(self.weight)
+                # ascending order
+                for i in range(len(self.kernel_size_candidates) - 2, -1, -1):
+                    mask = self.kernel_masks[i]
+                    t = self.t_kernel[i]
+                    cur_part = self.weight * mask
+                    alpha = self.Lasso_sigmoid(cur_part, t)
+                    if alpha <= eps:  # takes the smaller one
+                        result['kernel_size'] = self.kernel_size_candidates[i + 1]
+                        break
+                    weight = (weight + cur_part) * alpha
+
+                if 'kernel_size' not in result:
+                    result['kernel_size'] = self.kernel_size_candidates[0]
+            else:
+                weight = self.weight
+
+            if self.out_channel_candidates is not None:
+                # ascending order, same as kernel size
+                for i in range(len(self.out_channel_candidates) - 2, -1, -1):
+                    mask = self.channel_masks[i]
+                    t = self.t_expansion[i]
+                    alpha = self.Lasso_sigmoid(weight * mask, t)
+                    if alpha <= eps:
+                        result['out_channels'] = self.out_channel_candidates[i + 1]
+                        # Stop here. In ``forward``, a zero indicator wipes out all the larger candidates,
+                        # so going on would overwrite the result with a candidate that is too large.
+                        break
+
+                if 'out_channels' not in result:
+                    result['out_channels'] = self.out_channel_candidates[0]
+
+        return result
+
+    @staticmethod
+    def Lasso_sigmoid(matrix, t):
+        """
+        A trick that can make use of both the value of bool(lasso > t) and the gradient of sigmoid(lasso - t)
+
+        Parameters
+        ----------
+        matrix : Tensor
+            the matrix to calculate lasso norm
+        t : float
+            the threshold
+
+        Returns
+        -------
+        Tensor
+            Scalar tensor that is (up to floating point error) 1 if ``torch.norm(matrix) > t`` else 0.
+        """
+        lasso = torch.norm(matrix) - t
+        indicator = (lasso > 0).float()  # torch.sign(lasso)
+        # ``torch.sigmoid`` is used because ``F.sigmoid`` is deprecated in its favor.
+        # The sigmoid is subtracted without gradient and added back with gradient:
+        # the value stays the hard indicator, while the gradient is the one of the sigmoid.
+        with torch.no_grad():
+            indicator -= torch.sigmoid(lasso)
+        indicator += torch.sigmoid(lasso)
+        return indicator
+
+    def generate_architecture_params(self):
+        """Create the thresholds (architecture parameters) and the weight masks, for each argument that has candidates.
+
+        For ``n`` candidates, there are ``n - 1`` thresholds and ``n`` masks.
+        """
+        self.alpha = {}
+        if self.kernel_size_candidates is not None:
+            # kernel size arch params
+            self.t_kernel = nn.Parameter(torch.rand(len(self.kernel_size_candidates) - 1))
+            self.alpha['kernel_size'] = self.t_kernel
+            # kernel size mask
+            # mask_i is 1 inside the top-left block of ``big_size``, except the top-left block of ``small_size``.
+            # e.g., if self.weight.shape = (out, in, 7, 7), big_size = (5, 5) and small_size = (3, 3),
+            # every (7, 7) slice of the mask looks like:
+            #   0 0 0 1 1 0 0
+            #   0 0 0 1 1 0 0
+            #   0 0 0 1 1 0 0
+            #   1 1 1 1 1 0 0
+            #   1 1 1 1 1 0 0
+            #   0 0 0 0 0 0 0
+            #   0 0 0 0 0 0 0
+            # NOTE(review): the figures in the class docstring show centered kernels,
+            # while the indexing below is anchored at the top-left corner. Confirm which one is intended.
+            self.kernel_masks = []
+            for i in range(0, len(self.kernel_size_candidates) - 1):
+                big_size = self.kernel_size_candidates[i]
+                small_size = self.kernel_size_candidates[i + 1]
+                mask = torch.zeros_like(self.weight)
+                mask[:, :, :big_size[0], :big_size[1]] = 1
+                mask[:, :, :small_size[0], :small_size[1]] = 0
+                self.kernel_masks.append(mask)
+            # the last mask covers the whole smallest candidate
+            mask = torch.zeros_like(self.weight)
+            mask[:, :, :self.kernel_size_candidates[-1][0], :self.kernel_size_candidates[-1][1]] = 1
+            self.kernel_masks.append(mask)
+
+        if self.out_channel_candidates is not None:
+            # out_channel (or expansion) arch params. we do not consider skip-op here, so we
+            # only generate ``len(self.out_channel_candidates) - 1`` thresholds
+            self.t_expansion = nn.Parameter(torch.rand(len(self.out_channel_candidates) - 1))
+            self.alpha['out_channels'] = self.t_expansion
+            self.channel_masks = []
+            for i in range(0, len(self.out_channel_candidates) - 1):
+                big_channel, small_channel = self.out_channel_candidates[i], self.out_channel_candidates[i + 1]
+                mask = torch.zeros_like(self.weight)
+                mask[:big_channel] = 1
+                mask[:small_channel] = 0
+                # if self.weight.shape = (32, in, W, H), big_channel = 16 and small_channel = 8, mask will look like:
+                # 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+                self.channel_masks.append(mask)
+            # the last mask covers the whole smallest candidate
+            mask = torch.zeros_like(self.weight)
+            mask[:self.out_channel_candidates[-1]] = 1
+            self.channel_masks.append(mask)
+
+
+class DifferentiableBatchNorm2d(nn.BatchNorm2d):
+    """
+    ``BatchNorm2d`` built from a traced module whose ``num_features`` may be a ``ValueChoice``.
+
+    When ``num_features`` is a ``ValueChoice``, the layer is created with the largest candidate.
+
+    Parameters
+    ----------
+    module : nn.Module
+        Traced module; its ``trace_kwargs`` are used as the init arguments of ``nn.BatchNorm2d``.
+    name : str
+        Stored as ``self.label``.
+    """
+
+    def __init__(self, module, name):
+        self.label = name
+        # Work on a shallow copy: overwriting ``num_features`` below must not leak into the
+        # kwargs recorded on the original module (in case ``trace_kwargs`` is the live dict).
+        args = dict(module.trace_kwargs)
+        if isinstance(args['num_features'], ValueChoice):
+            args['num_features'] = max(args['num_features'].candidates)
+        super().__init__(**args)
+
+        # no architecture parameter is needed for BatchNorm2d Layers
+        self.alpha = nn.Parameter(torch.tensor([]))
+
+    def export(self):
+        """
+        No need to export ``BatchNorm2d``. Refer to the ``Conv2d`` layer that has the ``ValueChoice`` as ``out_channels``.
+
+        Returns
+        -------
+        int
+            Always ``-1``.
+        """
+        return -1
--- a/nni/retiarii/oneshot/pytorch/supermodule/_valuechoice_utils.py
+++ b/nni/retiarii/oneshot/pytorch/supermodule/_valuechoice_utils.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Utilities to process the value choice compositions,
+in the way that is most convenient to one-shot algorithms."""
+
+import itertools
+from typing import List, Any, Dict, Tuple, Optional, Union
+
+from nni.common.hpo_utils import ParameterSpec
+from nni.retiarii.nn.pytorch.api import ValueChoiceX
+
+
+Choice = Any
+
+__all__ = ['dedup_inner_choices', 'evaluate_value_choice_with_dict', 'traverse_all_options']
+
+
+def dedup_inner_choices(value_choices: List[ValueChoiceX]) -> Dict[str, ParameterSpec]:
+    """Collect the leaf value choices of every item in ``value_choices``.
+
+    Each leaf becomes a categorical ``ParameterSpec`` stored under its label,
+    so the result has the form ``{label: parameter_spec}``.
+
+    Raises
+    ------
+    ValueError
+        If a label appears more than once with specs that do not compare equal.
+    """
+    specs_by_label = {}
+    for composite in value_choices:
+        for leaf in composite.inner_choices():
+            label = leaf.label
+            candidate_spec = ParameterSpec(label, 'choice', leaf.candidates, (label, ), True, size=len(leaf.candidates))
+            if label not in specs_by_label:
+                # first time this label is seen
+                specs_by_label[label] = candidate_spec
+                continue
+            known_spec = specs_by_label[label]
+            if candidate_spec != known_spec:
+                raise ValueError('Value choice conflict: same label with different candidates: '
+                                 f'{candidate_spec} vs. {known_spec}')
+    return specs_by_label
+
+
+def evaluate_value_choice_with_dict(value_choice: ValueChoiceX, chosen: Dict[str, Choice]) -> Any:
+    """Evaluate a value-choice composition given the chosen value of each leaf.
+
+    ``chosen`` has the format ``{label: chosen_value}``. The values are first gathered
+    in the order of ``value_choice.inner_choices()``, and the resulting list is then
+    passed to ``value_choice.evaluate``. This two-pass approach leaves room for
+    speed optimization.
+
+    Raises
+    ------
+    KeyError
+        If the label of some leaf is missing from ``chosen``.
+
+    Examples
+    --------
+    >>> chosen = {"exp_ratio": 3}
+    >>> evaluate_value_choice_with_dict(value_choice_in, chosen)
+    48
+    >>> evaluate_value_choice_with_dict(value_choice_out, chosen)
+    96
+    """
+    leaf_values = []
+    for leaf in value_choice.inner_choices():
+        if leaf.label in chosen:
+            leaf_values.append(chosen[leaf.label])
+            continue
+        raise KeyError(f'{value_choice} depends on a value with key {leaf.label}, but not found in {chosen}')
+    return value_choice.evaluate(leaf_values)
+
+
+def traverse_all_options(value_choice: ValueChoiceX,
+                         weights: Optional[Dict[str, List[float]]] = None) -> List[Union[Tuple[Any, float], Any]]:
+    """Enumerate every outcome a value choice can evaluate to.
+
+    When ``weights`` is given, the weight of each outcome is computed as well:
+    the product of the leaf weights of a combination, summed over all combinations
+    that evaluate to the same outcome.
+
+    Parameters
+    ----------
+    value_choice : ValueChoiceX
+        The value choice to traverse.
+    weights : Optional[Dict[str, List[float]]], default = None
+        Prior on the leaf nodes, keyed by label; each value is a list with one weight
+        per candidate. They normally sum up to 1, but this is not checked here.
+
+    Returns
+    -------
+    List[Union[Tuple[Any, float], Any]]
+        Sorted, without duplicates.
+        A list of ``(option, weight)`` tuples if ``weights`` is provided,
+        otherwise a list of options.
+
+    Raises
+    ------
+    KeyError
+        If ``weights`` lacks a label, or a weight list has the wrong length.
+    ValueError
+        If ``value_choice`` has no leaf value choice.
+    """
+    has_prior = weights is not None
+
+    # {label: [(candidate, weight), ...]}
+    leaf_options: Dict[str, List[Tuple[Choice, float]]] = {}
+    for label, spec in dedup_inner_choices([value_choice]).items():
+        if has_prior:
+            if label not in weights:
+                raise KeyError(f'{value_choice} depends on a weight with key {label}, but not found in {weights}')
+            if len(weights[label]) != spec.size:
+                raise KeyError(f'Expect weights with {label} to be of length {spec.size}, but {len(weights[label])} found')
+            leaf_weights = weights[label]
+        else:
+            # placeholder weights of zero; they are never read when no prior is given
+            leaf_weights = itertools.repeat(0., spec.size)
+        leaf_options[label] = list(zip(spec.values, leaf_weights))
+
+    labels = list(leaf_options.keys())
+    if not labels:
+        raise ValueError(f'There expects at least one leaf value choice in {value_choice}, but nothing found')
+
+    # outcome -> accumulated weight (None when no prior is given)
+    outcomes = {}
+    for combination in itertools.product(*leaf_options.values()):
+        # combination looks like ((3, 0.1), ("cat", 0.3), ({"in": 5}, 0.5)):
+        # one (candidate, weight) pair per label, in the order of ``labels``
+        chosen = {label: pair[0] for label, pair in zip(labels, combination)}
+        outcome = evaluate_value_choice_with_dict(value_choice, chosen)
+
+        if not has_prior:
+            outcomes[outcome] = None
+            continue
+
+        # explicit out-of-place products: weights may be tensors,
+        # so neither reduce nor in-place multiplication is used
+        joint = combination[0][1]
+        for pair in combination[1:]:
+            joint = pair[1] if joint is None else joint * pair[1]
+
+        if outcome in outcomes:
+            outcomes[outcome] = outcomes[outcome] + joint
+        else:
+            outcomes[outcome] = joint
+
+    return sorted(outcomes.items()) if has_prior else sorted(outcomes.keys())
--- a/nni/retiarii/oneshot/pytorch/supermodule/base.py
+++ b/nni/retiarii/oneshot/pytorch/supermodule/base.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from typing import Any, Dict, Tuple, Union
+
+import torch.nn as nn
+
+from nni.common.hpo_utils import ParameterSpec
+
+
+class BaseSuperNetModule(nn.Module):
+    """
+    Base class of the modules that replace mutable modules in a super-net.
+
+    The forward pass of such a module is usually undefined until a specific path
+    has been selected with ``resample()``. (Some algorithms, e.g. differentiable
+    super-nets, do not need this step.)
+
+    Normally one super-net module corresponds to one sample, with two exceptions:
+
+    * One module may own several parameter specs. For example, a convolution-2d can sample kernel size, channels at the same time.
+    * Several modules may share a single parameter spec. For example, multiple layer choices with the same label.
+
+    For compositions of value choices, the parameter specs are bound to the underlying
+    (original) value choices, not to the compositions.
+    """
+
+    def resample(self, memo: Dict[str, Any] = None) -> Dict[str, Any]:
+        """
+        Draw a new sample for this module.
+
+        Parameters
+        ----------
+        memo : Dict[str, Any]
+            Samples drawn so far, used to keep samples with the same label consistent.
+
+        Returns
+        -------
+        dict
+            The newly sampled values; an empty dict if nothing new was sampled.
+        """
+        raise NotImplementedError()
+
+    def export(self, memo: Dict[str, Any] = None) -> Dict[str, Any]:
+        """
+        Export the architecture finally selected inside this module.
+        The keys should be the same as those of ``search_space_spec()``.
+
+        Parameters
+        ----------
+        memo : Dict[str, Any]
+            Already exported values, used so that one label is not exported multiple times.
+        """
+        raise NotImplementedError()
+
+    def search_space_spec(self) -> Dict[str, ParameterSpec]:
+        """
+        Describe the space (sample points) of this module,
+        as a mapping from spec name to ParameterSpec. Names follow the same format as in export.
+
+        For example: ::
+
+            {"layer1": ParameterSpec(values=["conv", "pool"])}
+        """
+        raise NotImplementedError()
+
+    @classmethod
+    def mutate(
+        cls, module: nn.Module, name: str, memo: Dict[str, Any], mutate_kwargs: Dict[str, Any]
+    ) -> Union['BaseSuperNetModule', bool, Tuple['BaseSuperNetModule', bool]]:
+        """Mutation hook that builds a :class:`BaseSuperNetModule` out of ``module``.
+        Every concrete super-net module implements its own version,
+        since each has its own rules about which modules it operates on.
+
+        Parameters
+        ----------
+        module : nn.Module
+            The module to be mutated (replaced).
+        name : str
+            Fully prefixed name of the module, e.g. ``module1.block1.conv``.
+        memo : dict
+            Memo that enables sharing parameters among mutated modules.
+            The mutate functions themselves are expected to read from and write to it.
+        mutate_kwargs : dict
+            Algo-related hyper-parameters, and some auxiliary information.
+
+        Returns
+        -------
+        Union[BaseSuperNetModule, bool, Tuple[BaseSuperNetModule, bool]]
+            The mutation result, optionally with a boolean flag telling whether follow-up mutation hooks are suppressed.
+            See :class:`nni.retiarii.oneshot.pytorch.base.BaseOneShotLightningModule` for details.
+        """
+        raise NotImplementedError()
--- a/nni/retiarii/oneshot/pytorch/supermodule/differentiable.py
+++ b/nni/retiarii/oneshot/pytorch/supermodule/differentiable.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import functools
+import warnings
+
+from typing import List, Tuple, Optional, Dict, Any, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from nni.common.hpo_utils import ParameterSpec
+from nni.retiarii.nn.pytorch import LayerChoice, InputChoice
+
+from .base import BaseSuperNetModule
+from .operation import MixedOperation, MixedOperationSamplingPolicy
+from ._valuechoice_utils import traverse_all_options
+
+
+class GumbelSoftmax(nn.Softmax):
+    """``nn.Softmax``-compatible module that applies ``F.gumbel_softmax``. dim = -1 by default.
+
+    ``tau`` and ``hard`` are plain attributes (initially ``1`` and ``False``)
+    that are read on every forward call.
+    """
+
+    def __init__(self, dim: Optional[int] = -1) -> None:
+        super().__init__(dim)
+        self.tau, self.hard = 1, False
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        options = {'tau': self.tau, 'hard': self.hard, 'dim': self.dim}
+        return F.gumbel_softmax(inputs, **options)
+
+
+class DifferentiableMixedLayer(BaseSuperNetModule):
+    """
+    Mixed layer, in which fprop is decided by a weighted sum of several layers.
+    Proposed in `DARTS: Differentiable Architecture Search <https://arxiv.org/abs/1806.09055>`__.
+
+    The weight ``alpha`` is usually learnable, and optimized on validation dataset.
+
+    Differentiable sampling layer requires all operators returning the same shape for one input,
+    as all outputs will be weighted summed to get the final output.
+
+    Parameters
+    ----------
+    paths : List[Tuple[str, nn.Module]]
+        Layers to choose from. Each is a tuple of name, and its module.
+    alpha : Tensor
+        Tensor that stores the "learnable" weights.
+    softmax : nn.Module
+        Customizable softmax function. Usually ``nn.Softmax(-1)``.
+    label : str
+        Name of the choice.
+
+    Attributes
+    ----------
+    op_names : List[str]
+        Operator names.
+    label : str
+        Name of the choice.
+    """
+
+    _arch_parameter_names: List[str] = ['_arch_alpha']
+
+    def __init__(self, paths: List[Tuple[str, nn.Module]], alpha: torch.Tensor, softmax: nn.Module, label: str):
+        super().__init__()
+        self.op_names = []
+        if len(alpha) != len(paths):
+            raise ValueError(f'The size of alpha ({len(alpha)}) must match number of candidates ({len(paths)}).')
+        for name, module in paths:
+            self.add_module(name, module)
+            self.op_names.append(name)
+        assert self.op_names, 'There has to be at least one op to choose from.'
+        self.label = label
+        self._arch_alpha = alpha
+        self._softmax = softmax
+
+    def resample(self, memo):
+        """Do nothing. Differentiable layer doesn't need resample."""
+        return {}
+
+    def export(self, memo):
+        """Choose the operator with the maximum logit.
+
+        Returns an empty dict when ``self.label`` is already present in ``memo``.
+        """
+        if self.label in memo:
+            return {}  # nothing new to export
+        return {self.label: self.op_names[torch.argmax(self._arch_alpha).item()]}
+
+    def search_space_spec(self):
+        """Return a single categorical spec, keyed by ``self.label``, over the operator names."""
+        return {self.label: ParameterSpec(self.label, 'choice', self.op_names, (self.label, ),
+                                          True, size=len(self.op_names))}
+
+    @classmethod
+    def mutate(cls, module, name, memo, mutate_kwargs):
+        """Replace a ``LayerChoice`` with a mixed layer; return None for any other module.
+
+        The architecture parameter is looked up in ``memo`` by label. If it is not there yet,
+        a new one is created and written back to ``memo``, so that later modules with the
+        same label share the same parameter.
+
+        Raises
+        ------
+        ValueError
+            If the parameter found in ``memo`` does not match the number of candidates.
+        """
+        if isinstance(module, LayerChoice):
+            size = len(module)
+            if module.label in memo:
+                alpha = memo[module.label]
+                if len(alpha) != size:
+                    raise ValueError(f'Architecture parameter size of same label {module.label} conflict: {len(alpha)} vs. {size}')
+            else:
+                alpha = nn.Parameter(torch.randn(size) * 1E-3)  # this can be reinitialized later
+                # Register the new parameter; without this write, the lookup above never hits
+                # and modules with the same label would each get their own alpha.
+                memo[module.label] = alpha
+
+            softmax = mutate_kwargs.get('softmax', nn.Softmax(-1))
+            return cls(list(module.named_children()), alpha, softmax, module.label)
+
+    def forward(self, *args, **kwargs):
+        """The forward of mixed layer accepts same arguments as its sub-layer."""
+        op_results = torch.stack([getattr(self, op)(*args, **kwargs) for op in self.op_names])
+        # reshape the weights to (n_ops, 1, 1, ...) so they broadcast over each op's output
+        alpha_shape = [-1] + [1] * (len(op_results.size()) - 1)
+        return torch.sum(op_results * self._softmax(self._arch_alpha).view(*alpha_shape), 0)
+
+    def parameters(self, *args, **kwargs):
+        """Parameters excluding architecture parameters."""
+        for _, p in self.named_parameters(*args, **kwargs):
+            yield p
+
+    def named_parameters(self, *args, **kwargs):
+        """Named parameters excluding architecture parameters.
+
+        Pass ``arch=True`` to get only the architecture parameters instead.
+        """
+        arch = kwargs.pop('arch', False)
+        for name, p in super().named_parameters(*args, **kwargs):
+            is_arch = name in self._arch_parameter_names
+            if is_arch == bool(arch):
+                yield name, p
+
+
+class DifferentiableMixedInput(BaseSuperNetModule):
+    """
+    Mixed input. Forward returns a weighted sum of candidates.
+    Implementation is very similar to :class:`DifferentiableMixedLayer`.
+
+    Parameters
+    ----------
+    n_candidates : int
+        Expect number of input candidates.
+    n_chosen : Optional[int]
+        Expect number of inputs finally chosen. ``None`` is treated as 1, with a warning.
+    alpha : Tensor
+        Tensor that stores the "learnable" weights.
+    softmax : nn.Module
+        Customizable softmax function. Usually ``nn.Softmax(-1)``.
+    label : str
+        Name of the choice.
+
+    Attributes
+    ----------
+    label : str
+        Name of the choice.
+    """
+
+    _arch_parameter_names: List[str] = ['_arch_alpha']
+
+    def __init__(self, n_candidates: int, n_chosen: Optional[int], alpha: torch.Tensor, softmax: nn.Module, label: str):
+        super().__init__()
+        self.n_candidates = n_candidates
+        if len(alpha) != n_candidates:
+            raise ValueError(f'The size of alpha ({len(alpha)}) must match number of candidates ({n_candidates}).')
+        if n_chosen is None:
+            warnings.warn('Differentiable architecture search does not support choosing multiple inputs. Assuming one.',
+                          RuntimeWarning)
+            # Override the local value, so that the assignment below stores 1 rather than None.
+            n_chosen = 1
+        self.n_chosen = n_chosen
+        self.label = label
+        self._softmax = softmax
+
+        self._arch_alpha = alpha
+
+    def resample(self, memo):
+        """Do nothing. Differentiable layer doesn't need resample."""
+        return {}
+
+    def export(self, memo):
+        """Choose the inputs with the top ``n_chosen`` logits.
+
+        Returns an empty dict when ``self.label`` is already present in ``memo``.
+        Otherwise the value is a sorted list of indices, or a bare index when exactly one is chosen.
+        """
+        if self.label in memo:
+            return {}  # nothing new to export
+        # indices ordered by descending logit; keep the first ``n_chosen`` and sort them ascending
+        chosen = sorted(torch.argsort(-self._arch_alpha).tolist()[:self.n_chosen])
+        if len(chosen) == 1:
+            chosen = chosen[0]
+        return {self.label: chosen}
+
+    def search_space_spec(self):
+        """Return a single categorical spec, keyed by ``self.label``, over the candidate indices."""
+        return {
+            self.label: ParameterSpec(self.label, 'choice', list(range(self.n_candidates)),
+                                      (self.label, ), True, size=self.n_candidates, chosen_size=self.n_chosen)
+        }
+
+    @classmethod
+    def mutate(cls, module, name, memo, mutate_kwargs):
+        """Replace an ``InputChoice`` with a mixed input; return None for any other module.
+
+        The architecture parameter is looked up in ``memo`` by label. If it is not there yet,
+        a new one is created and written back to ``memo``, so that later modules with the
+        same label share the same parameter.
+
+        Raises
+        ------
+        ValueError
+            If the reduction is neither ``sum`` nor ``mean``, or the parameter found in
+            ``memo`` does not match the number of candidates.
+        """
+        if isinstance(module, InputChoice):
+            if module.reduction not in ['sum', 'mean']:
+                raise ValueError('Only input choice of sum/mean reduction is supported.')
+            size = module.n_candidates
+            if module.label in memo:
+                alpha = memo[module.label]
+                if len(alpha) != size:
+                    raise ValueError(f'Architecture parameter size of same label {module.label} conflict: {len(alpha)} vs. {size}')
+            else:
+                alpha = nn.Parameter(torch.randn(size) * 1E-3)  # this can be reinitialized later
+                # Register the new parameter; without this write, the lookup above never hits
+                # and modules with the same label would each get their own alpha.
+                memo[module.label] = alpha
+
+            softmax = mutate_kwargs.get('softmax', nn.Softmax(-1))
+            return cls(module.n_candidates, module.n_chosen, alpha, softmax, module.label)
+
+    def forward(self, inputs):
+        """Forward takes a list of input candidates."""
+        inputs = torch.stack(inputs)
+        # reshape the weights to (n_candidates, 1, 1, ...) so they broadcast over each input
+        alpha_shape = [-1] + [1] * (len(inputs.size()) - 1)
+        return torch.sum(inputs * self._softmax(self._arch_alpha).view(*alpha_shape), 0)
+
+    def parameters(self, *args, **kwargs):
+        """Parameters excluding architecture parameters."""
+        for _, p in self.named_parameters(*args, **kwargs):
+            yield p
+
+    def named_parameters(self, *args, **kwargs):
+        """Named parameters excluding architecture parameters.
+
+        Pass ``arch=True`` to get only the architecture parameters instead.
+        """
+        arch = kwargs.pop('arch', False)
+        for name, p in super().named_parameters(*args, **kwargs):
+            is_arch = name in self._arch_parameter_names
+            if is_arch == bool(arch):
+                yield name, p
+
+
+class MixedOpDifferentiablePolicy(MixedOperationSamplingPolicy):
+    """Implements the differentiable sampling in mixed operation.
+
+    One mixed operation can have multiple value choices in its arguments.
+    Thus the ``_arch_alpha`` here is a parameter dict, and ``named_parameters``
+    filters out multiple parameters with ``_arch_alpha`` as its prefix.
+
+    When this class is asked for ``forward_argument``, it returns a distribution,
+    i.e., a dict from each possible argument value to its weight.
+
+    All the parameters (``_arch_alpha``, ``parameters()``, ``_softmax``) are
+    saved as attributes of ``operation``, rather than ``self``,
+    because this class itself is not a ``nn.Module``, and saved parameters here
+    won't be optimized.
+    """
+
+    _arch_parameter_names: List[str] = ['_arch_alpha']
+
+    def __init__(self, operation: MixedOperation, memo: Dict[str, Any], mutate_kwargs: Dict[str, Any]) -> None:
+        # Sampling arguments. This should have the same keys with `operation.mutable_arguments`
+        operation._arch_alpha = nn.ParameterDict()
+        for name, spec in operation.search_space_spec().items():
+            if name in memo:
+                alpha = memo[name]
+                if len(alpha) != spec.size:
+                    raise ValueError(f'Architecture parameter size of same label {name} conflict: {len(alpha)} vs. {spec.size}')
+            else:
+                alpha = nn.Parameter(torch.randn(spec.size) * 1E-3)
+                # Register the new parameter; without this write, the lookup above never hits
+                # and value choices with the same label would each get their own alpha.
+                memo[name] = alpha
+            operation._arch_alpha[name] = alpha
+
+        # Replace the parameter accessors of ``operation`` with the filtered versions below.
+        operation.parameters = functools.partial(self.parameters, self=operation)                # bind self
+        operation.named_parameters = functools.partial(self.named_parameters, self=operation)
+
+        operation._softmax = mutate_kwargs.get('softmax', nn.Softmax(-1))
+
+    @staticmethod
+    def parameters(self, *args, **kwargs):
+        """Parameters of the operation (bound as ``self``), excluding architecture parameters."""
+        for _, p in self.named_parameters(*args, **kwargs):
+            yield p
+
+    @staticmethod
+    def named_parameters(self, *args, **kwargs):
+        """Named parameters of the operation (bound as ``self``), excluding architecture parameters.
+
+        Pass ``arch=True`` to get only the architecture parameters instead.
+        """
+        arch = kwargs.pop('arch', False)
+        for name, p in super(self.__class__, self).named_parameters(*args, **kwargs):  # pylint: disable=bad-super-call
+            is_arch = any(name.startswith(par_name) for par_name in MixedOpDifferentiablePolicy._arch_parameter_names)
+            if is_arch == bool(arch):
+                yield name, p
+
+    def resample(self, operation: MixedOperation, memo: Dict[str, Any] = None) -> Dict[str, Any]:
+        """Differentiable. Do nothing in resample."""
+        return {}
+
+    def export(self, operation: MixedOperation, memo: Dict[str, Any] = None) -> Dict[str, Any]:
+        """Export, for each leaf value choice, the candidate with the maximum logit.
+
+        ``memo`` is accepted for interface compatibility and is not consulted.
+        """
+        result = {}
+        for name, spec in operation.search_space_spec().items():
+            chosen_index = torch.argmax(operation._arch_alpha[name]).item()
+            result[name] = spec.values[chosen_index]
+        return result
+
+    def forward_argument(self, operation: MixedOperation, name: str) -> Union[Dict[Any, float], Any]:
+        """Return the argument ``name`` to be used in the forward of ``operation``.
+
+        For a mutable argument, this is a dict from each possible value to its weight,
+        computed from the softmax of the architecture parameters.
+        Otherwise the value recorded in ``operation.init_arguments`` is returned.
+        """
+        if name in operation.mutable_arguments:
+            weights = {label: operation._softmax(alpha) for label, alpha in operation._arch_alpha.items()}
+            return dict(traverse_all_options(operation.mutable_arguments[name], weights=weights))
+        return operation.init_arguments[name]
--- a/nni/retiarii/oneshot/pytorch/supermodule/operation.py
+++ b/nni/retiarii/oneshot/pytorch/supermodule/operation.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""
+Operations that support weight sharing at a fine-grained level,
+which is commonly known as super-kernel (as in channel search), or weight entanglement.
+"""
+
+import inspect
+import itertools
+from typing import Union, Tuple, Dict, List, Any, Type, Optional, TypeVar
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import nni.retiarii.nn.pytorch as retiarii_nn
+from nni.common.hpo_utils import ParameterSpec
+from nni.common.serializer import is_traceable
+from nni.retiarii.nn.pytorch.api import ValueChoiceX
+
+from .base import BaseSuperNetModule
+from ._valuechoice_utils import traverse_all_options, dedup_inner_choices
+from ._operation_utils import Slicable as _S, MaybeWeighted as _W, int_or_int_dict, scalar_or_scalar_dict
+
+T = TypeVar('T')
+
+
+class MixedOperationSamplingPolicy:
+    """
+    The algorithm-specific half of a mixed operation.
+
+    :class:`MixedOperation` forwards its resample and export calls to an instance of this
+    policy (or of a subclass), which lets a single operation be paired with
+    different kinds of sampling.
+
+    Each policy instance serves exactly one mixed operation.
+    """
+
+    def __init__(self, operation: 'MixedOperation', memo: Dict[str, Any], mutate_kwargs: Dict[str, Any]) -> None:
+        """Hook for preparing basic parameters; anything that needs back propagation
+        should be stored in ``operation``.
+
+        It is invoked from :meth:`BaseSuperNetModule.mutate`, once the mixed operation exists.
+        Like in :meth:`BaseSuperNetModule.mutate`,
+        the policy itself is responsible for reading and writing ``memo``.
+        """
+        return None
+
+    def resample(self, operation: 'MixedOperation', memo: Dict[str, Any] = None) -> Dict[str, Any]:
+        """Handles :meth:`MixedOperation.resample`."""
+        raise NotImplementedError()
+
+    def export(self, operation: 'MixedOperation', memo: Dict[str, Any] = None) -> Dict[str, Any]:
+        """Handles :meth:`MixedOperation.export`."""
+        raise NotImplementedError()
+
+    def forward_argument(self, operation: 'MixedOperation', name: str) -> Any:
+        """Compute the argument called ``name`` for the forward of ``operation``.
+        The result is usually a value, or a distribution of value.
+        """
+        raise NotImplementedError()
+
+
+class MixedOperation(BaseSuperNetModule):
+    """This is the base class for all mixed operations.
+
+    It contains commonly used utilities that will ease the effort to write customized mixed oeprations,
+    i.e., operations with ValueChoice in its arguments.
+
+    By design, for a mixed operation to work in a specific algorithm,
+    at least two classes are needed.
+
+    1. One class needs to inherit this class, to control operation-related behavior,
+       such as how to initialize the operation such that the sampled operation can be its sub-operation.
+    2. The other one needs to inherit :class:`MixedOperationSamplingPolicy`,
+       which controls algo-related behavior, such as sampling.
+
+    The two classes are linked with ``sampling_policy`` attribute in :class:`MixedOperation`,
+    whose type is set via ``mixed_op_sampling`` in ``mutate_kwargs`` when
+    :meth:`MixedOperation.mutate` is called.
+
+    With this design, one mixed-operation (e.g., MixedConv2d) can work in multiple algorithms
+    (e.g., both DARTS and ENAS), saving the engineering effort to rewrite all operations for
+    each specific algo.
+
+    This class should also define a ``bound_type``, to control the matching type in mutate,
+    an ``argument_list``, to control which arguments can be dynamically used in ``forward``.
+    This list will also be used in mutate for sanity check.
+    """
+
+    bound_type: Type[nn.Module]                 # defined in subclass
+    argument_list: List[str]                    # defined in subclass
+
+    sampling_policy: MixedOperationSamplingPolicy
+
+    def super_init_argument(self, name: str, value_choice: ValueChoiceX) -> Any:
+        """Get the initialization argument when constructing super-kernel, i.e., calling ``super().__init__()``.
+        This is often related to specific operator, rather than algo.
+
+        For example::
+
+            def super_init_argument(self, name, value_choice):
+                return max(value_choice.candidates)
+        """
+        raise NotImplementedError()
+
+    def __post_init__(self) -> None:
+        """Can be used to validate, or to do extra processing after calling ``__init__``."""
+        pass
+
+    def forward_with_args(self, *args, **kwargs):
+        """To control real fprop. The accepted arguments are ``argument_list``,
+        appended by forward arguments in the ``bound_type``."""
+        raise NotImplementedError()
+
+    def __init__(self, module_kwargs: Dict[str, Any]) -> None:
+        # Concerned arguments
+        self.mutable_arguments: Dict[str, ValueChoiceX] = {}
+        # Useful when retrieving arguments without ValueChoice
+        self.init_arguments: Dict[str, Any] = {**module_kwargs}
+        self._fill_missing_init_arguments()
+
+        # get init default
+        super_init_kwargs = {}
+
+        for key, value in module_kwargs.items():
+            if isinstance(value, ValueChoiceX):
+                if key not in self.argument_list:
+                    raise TypeError(f'Unsupported value choice on argument of {self.bound_type}: {key}')
+                super_init_kwargs[key] = self.super_init_argument(key, value)
+                self.mutable_arguments[key] = value
+            else:
+                super_init_kwargs[key] = value
+
+        # get all inner leaf value choices
+        self._space_spec: Dict[str, ParameterSpec] = dedup_inner_choices(self.mutable_arguments.values())
+
+        super().__init__(**super_init_kwargs)
+
+        self.__post_init__()
+
+    def resample(self, memo):
+        """Delegates to :meth:`MixedOperationSamplingPolicy.resample`."""
+        return self.sampling_policy.resample(self, memo)
+
+    def export(self, memo):
+        """Delegates to :meth:`MixedOperationSamplingPolicy.export`."""
+        return self.sampling_policy.export(self, memo)
+
+    def search_space_spec(self):
+        return self._space_spec
+
+    @classmethod
+    def mutate(cls, module, name, memo, mutate_kwargs):
+        """Find value choice in module's arguments and replace the whole module"""
+        has_valuechoice = False
+        if isinstance(module, cls.bound_type) and is_traceable(module):
+            for arg in itertools.chain(module.trace_args, module.trace_kwargs.values()):
+                if isinstance(arg, ValueChoiceX):
+                    has_valuechoice = True
+
+        if has_valuechoice:
+            if module.trace_args:
+                raise ValueError('ValueChoice on class arguments cannot appear together with ``trace_args``. '
+                                    'Please enable ``kw_only`` on nni.trace.')
+
+            # save type and kwargs
+            mixed_op = cls(module.trace_kwargs)
+
+            if 'mixed_op_sampling' not in mutate_kwargs:
+                raise ValueError('Need to sampling policy of mixed op, but not found in `mutate_kwargs`.')
+            policy_cls: Type[MixedOperationSamplingPolicy] = mutate_kwargs['mixed_op_sampling']
+            # initialize policy class
+            # this is put in mutate because we need to access memo
+            mixed_op.sampling_policy = policy_cls(mixed_op, memo, mutate_kwargs)
+
+            return mixed_op
+
+    def forward_argument(self, name: str) -> Any:
+        """Get the argument used in forward.
+        This is often related to the algorithm, so it is redirected to the sampling policy.
+        """
+        return self.sampling_policy.forward_argument(self, name)
+
+    def forward(self, *args, **kwargs):
+        """First get sampled arguments, then forward with the sampled arguments (by calling ``forward_with_args``)."""
+        sampled_args = [self.forward_argument(name) for name in self.argument_list]
+        return self.forward_with_args(*sampled_args, *args, **kwargs)
+
+    def _fill_missing_init_arguments(self) -> None:
+        """Complete ``self.init_arguments`` with the defaults of arguments the user left out.
+
+        Take Conv2d as an example: if ``stride`` is not given by the user,
+        ``stride = 1`` is inserted into ``self.init_arguments`` by this method.
+
+        The defaults are read from the ``__init__`` signature of ``bound_type``
+        (after following its ``__wrapped__`` chain, if any).
+        Complex cases, such as arguments in ``__new__`` or in super-class, are not supported.
+        """
+        # follow the ``__wrapped__`` chain until the innermost class is reached
+        target = self.bound_type
+        while hasattr(target, '__wrapped__'):
+            target = target.__wrapped__
+
+        init_parameters = inspect.signature(target.__init__).parameters
+        for arg_name, param in init_parameters.items():
+            # skip the arguments without a default, and those already specified
+            if param.default is param.empty or arg_name in self.init_arguments:
+                continue
+            self.init_arguments[arg_name] = param.default
+
+
+class MixedLinear(MixedOperation, nn.Linear):
+    """Mixed linear operation.
+
+    Supported arguments are:
+
+    - ``in_features``
+    - ``out_features``
+
+    Prefix of weight and bias will be sliced.
+    """
+
+    bound_type = retiarii_nn.Linear
+    argument_list = ['in_features', 'out_features']
+
+    def super_init_argument(self, name: str, value_choice: ValueChoiceX):
+        """Return the maximum over all options of ``value_choice``, regardless of ``name``."""
+        return max(traverse_all_options(value_choice))
+
+    def forward_with_args(self,
+                          in_features: int_or_int_dict,
+                          out_features: int_or_int_dict,
+                          inputs: torch.Tensor) -> torch.Tensor:
+        """Run ``F.linear`` on ``inputs`` with the sliced prefix of ``self.weight`` and ``self.bias``."""
+
+        # NOTE(review): ``_W`` and ``_S`` are helpers defined elsewhere; presumably they wrap
+        # the (possibly weighted) size and the tensor to be sliced -- confirm in their definition.
+        in_features = _W(in_features)
+        out_features = _W(out_features)
+
+        # rows are sliced by out_features first, then columns by in_features
+        weight = _S(self.weight)[:out_features]
+        weight = _S(weight)[:, :in_features]
+        if self.bias is None:
+            bias = self.bias
+        else:
+            bias = _S(self.bias)[:out_features]
+
+        return F.linear(inputs, weight, bias)
+
+
+_int_or_tuple = Union[int, Tuple[int, int]]
+
+
+class MixedConv2d(MixedOperation, nn.Conv2d):
+    """Mixed conv2d op.
+
+    Supported arguments are:
+
+    - ``in_channels``
+    - ``out_channels``
+    - ``groups`` (only supported in path sampling)
+    - ``stride`` (only supported in path sampling)
+    - ``kernel_size``
+    - ``padding`` (only supported in path sampling)
+    - ``dilation`` (only supported in path sampling)
+
+    ``padding`` will be the "max" padding in differentiable mode.
+
+    For channels, prefix will be sliced.
+    For kernels, we take the small kernel from the center and round it to floor (left top). For example ::
+
+        max_kernel = 5*5, sampled_kernel = 3*3, then we take [1: 4]
+        max_kernel = 5*5, sampled_kernel = 2*2, then we take [1: 3]
+        □ □ □ □ □   □ □ □ □ □
+        □ ■ ■ ■ □   □ ■ ■ □ □
+        □ ■ ■ ■ □   □ ■ ■ □ □
+        □ ■ ■ ■ □   □ □ □ □ □
+        □ □ □ □ □   □ □ □ □ □
+    """
+
+    bound_type = retiarii_nn.Conv2d
+    argument_list = [
+        'in_channels', 'out_channels', 'kernel_size', 'stride', 'padding', 'dilation', 'groups'
+    ]
+
+    @staticmethod
+    def _to_tuple(value: scalar_or_scalar_dict[T]) -> Tuple[T, T]:
+        if not isinstance(value, tuple):
+            return (value, value)
+        return value
+
+    def super_init_argument(self, name: str, value_choice: ValueChoiceX):
+        """Pick the value, among all options of ``value_choice``, to initialize the super-class with.
+
+        - ``kernel_size`` and ``padding``: the maximum. If any option is a tuple,
+          the maximum is taken on every dimension separately and a tuple is returned.
+        - ``groups``: the minimum.
+        - Other supported arguments: the maximum.
+
+        Raises ``NotImplementedError`` if ``name`` is not a supported argument.
+        """
+        if name not in ['in_channels', 'out_channels', 'groups', 'stride', 'kernel_size', 'padding', 'dilation']:
+            raise NotImplementedError(f'Unsupported value choice on argument: {name}')
+
+        # ``name`` is a string, thus a membership test is needed here.
+        # Comparing it to the list with ``==`` is always false and makes this branch unreachable.
+        if name in ['kernel_size', 'padding']:
+            all_sizes = set(traverse_all_options(value_choice))
+            if any(isinstance(sz, tuple) for sz in all_sizes):
+                # maximum kernel should be calculated on every dimension
+                return (
+                    max(self._to_tuple(sz)[0] for sz in all_sizes),
+                    max(self._to_tuple(sz)[1] for sz in all_sizes)
+                )
+            else:
+                return max(all_sizes)
+
+        elif name == 'groups':
+            # minimum groups, maximum kernel
+            return min(traverse_all_options(value_choice))
+
+        else:
+            return max(traverse_all_options(value_choice))
+
+    def forward_with_args(self,
+                          in_channels: int_or_int_dict,
+                          out_channels: int_or_int_dict,
+                          kernel_size: scalar_or_scalar_dict[_int_or_tuple],
+                          stride: _int_or_tuple,
+                          padding: scalar_or_scalar_dict[_int_or_tuple],
+                          dilation: int,
+                          groups: int,
+                          inputs: torch.Tensor) -> torch.Tensor:
+
+        if any(isinstance(arg, dict) for arg in [stride, dilation, groups]):
+            raise ValueError('stride, dilation, groups does not support weighted sampling.')
+
+        in_channels = _W(in_channels)
+        out_channels = _W(out_channels)
+
+        # slice prefix
+        # For groups > 1, we use groups to slice input weights
+        weight = _S(self.weight)[:out_channels]
+        weight = _S(weight)[:, :in_channels // groups]
+
+        # slice center
+        if isinstance(kernel_size, dict):
+            padding = self.padding  # max padding, must be a tuple
+        kernel_a, kernel_b = self._to_tuple(kernel_size)
+        kernel_a, kernel_b = _W(kernel_a), _W(kernel_b)
+        max_kernel_a, max_kernel_b = self.kernel_size  # self.kernel_size must be a tuple
+        kernel_a_left, kernel_b_top = (max_kernel_a - kernel_a) // 2, (max_kernel_b - kernel_b) // 2
+        weight = _S(weight)[:, :, kernel_a_left:kernel_a_left + kernel_a, kernel_b_top:kernel_b_top + kernel_b]
+
+        bias = _S(self.bias)[:out_channels] if self.bias is not None else None
+
+        # The rest parameters only need to be converted to tuple
+        stride = self._to_tuple(stride)
+        dilation = self._to_tuple(dilation)
+
+        if self.padding_mode != 'zeros':
+            return F.conv2d(F.pad(inputs, self._reversed_padding_repeated_twice, mode=self.padding_mode),
+                            weight, bias, stride, (0, 0), dilation, groups)
+        return F.conv2d(inputs, weight, bias, stride, padding, dilation, groups)
+
+
+class MixedBatchNorm2d(MixedOperation, nn.BatchNorm2d):
+    """
+    Mixed BatchNorm2d operation.
+
+    Supported arguments are:
+
+    - ``num_features``
+    - ``eps`` (only supported in path sampling)
+    - ``momentum`` (only supported in path sampling)
+
+    For path-sampling, prefix of ``weight``, ``bias``, ``running_mean`` and ``running_var``
+    are sliced. For weighted cases, the maximum ``num_features`` is used directly.
+
+    Momentum is required to be float.
+    PyTorch BatchNorm supports a case where momentum can be none, which is not supported here.
+    """
+
+    bound_type = retiarii_nn.BatchNorm2d
+    argument_list = ['num_features', 'eps', 'momentum']
+
+    def super_init_argument(self, name: str, value_choice: ValueChoiceX):
+        return max(traverse_all_options(value_choice))
+
+    def forward_with_args(self,
+                          num_features: int_or_int_dict,
+                          eps: float,
+                          momentum: float,
+                          inputs: torch.Tensor) -> torch.Tensor:
+        """Run ``F.batch_norm`` with the sampled ``num_features``, ``eps`` and ``momentum``.
+
+        If ``num_features`` is a dict (weighted case), the maximum ``self.num_features`` is used.
+        If it is smaller than ``self.num_features``, the prefix of ``weight``, ``bias``,
+        ``running_mean`` and ``running_var`` is sliced. Tensors that are ``None`` are kept as is.
+
+        Raises ``ValueError`` if ``eps`` or ``momentum`` is a dict.
+        """
+
+        if any(isinstance(arg, dict) for arg in [eps, momentum]):
+            raise ValueError('eps, momentum do not support weighted sampling')
+
+        if isinstance(num_features, dict):
+            num_features = self.num_features
+
+        weight, bias = self.weight, self.bias
+        running_mean, running_var = self.running_mean, self.running_var
+
+        if num_features < self.num_features:
+            # In PyTorch, weight and bias are None when ``affine=False``, and the running statistics
+            # are None when ``track_running_stats=False``. Only the existing tensors can be sliced.
+            if weight is not None:
+                weight = weight[:num_features]
+            if bias is not None:
+                bias = bias[:num_features]
+            if running_mean is not None:
+                running_mean = running_mean[:num_features]
+            if running_var is not None:
+                running_var = running_var[:num_features]
+
+        if self.training:
+            bn_training = True
+        else:
+            # in evaluation, batch statistics are used only if no running statistics exist
+            bn_training = (self.running_mean is None) and (self.running_var is None)
+
+        return F.batch_norm(
+            inputs,
+            # If buffers are not to be tracked, ensure that they won't be updated
+            running_mean if not self.training or self.track_running_stats else None,
+            running_var if not self.training or self.track_running_stats else None,
+            weight,
+            bias,
+            bn_training,
+            momentum,  # originally exponential_average_factor in pytorch code
+            eps,
+        )
+
+
+class MixedMultiHeadAttention(MixedOperation, nn.MultiheadAttention):
+    """
+    Mixed multi-head attention.
+
+    Supported arguments are:
+
+    - ``embed_dim``
+    - ``num_heads`` (only supported in path sampling)
+    - ``kdim``
+    - ``vdim``
+    - ``dropout`` (only supported in path sampling)
+
+    At init, it constructs the largest possible Q, K, V dimension.
+    At forward, it slices the prefix to weight matrices according to the sampled value.
+    For ``in_proj_bias`` and ``in_proj_weight``, three parts will be sliced and concatenated together:
+    ``[0, embed_dim)``, ``[max_embed_dim, max_embed_dim + embed_dim)``,
+    ``[max_embed_dim * 2, max_embed_dim * 2 + embed_dim)``.
+
+    Warnings
+    ----------
+    All candidates of ``embed_dim`` should be divisible by all candidates of ``num_heads``.
+    """
+
+    bound_type = retiarii_nn.MultiheadAttention
+    argument_list = ['embed_dim', 'num_heads', 'kdim', 'vdim', 'dropout']
+
+    def __post_init__(self):
+        # sometimes super-class believes qkv have the same embed_dim.
+        # but actually they do not, because we can have dynamic (mutable) kdim/vdim.
+
+        _qkv_same_embed_dim = True
+
+        for dimension in ['kdim', 'vdim']:
+            if self.init_arguments[dimension] is None:
+                # must follow embed_dim is this case
+                continue
+
+            if getattr(self, dimension) == self.embed_dim and \
+                    (dimension in self.mutable_arguments or 'embed_dim' in self.mutable_arguments):
+                _qkv_same_embed_dim = False
+
+        if self._qkv_same_embed_dim and not _qkv_same_embed_dim:
+            self._qkv_same_embed_dim = _qkv_same_embed_dim
+
+            # adding back missing parameters
+            # factory_kwargs could be empty for legacy pytorch versions
+            factory_kwargs = {}
+            if 'device' in self.init_arguments:
+                factory_kwargs['device'] = self.init_arguments['device']
+            if 'dtype' in self.init_arguments:
+                factory_kwargs['dtype'] = self.init_arguments['dtype']
+            self.q_proj_weight = nn.Parameter(torch.empty((self.embed_dim, self.embed_dim), **factory_kwargs))
+            self.k_proj_weight = nn.Parameter(torch.empty((self.embed_dim, self.kdim), **factory_kwargs))
+            self.v_proj_weight = nn.Parameter(torch.empty((self.embed_dim, self.vdim), **factory_kwargs))
+            self.register_parameter('in_proj_weight', None)
+
+            # reset parameters
+            nn.init.xavier_uniform_(self.q_proj_weight)
+            nn.init.xavier_uniform_(self.k_proj_weight)
+            nn.init.xavier_uniform_(self.v_proj_weight)
+
+    def super_init_argument(self, name: str, value_choice: ValueChoiceX):
+        return max(traverse_all_options(value_choice))
+
+    def _to_proj_slice(self, embed_dim: _W) -> List[slice]:
+        # slice three parts, corresponding to q, k, v respectively
+        return [
+            slice(embed_dim),
+            slice(self.embed_dim, self.embed_dim + embed_dim),
+            slice(self.embed_dim * 2, self.embed_dim * 2 + embed_dim)
+        ]
+
+    def forward_with_args(
+        self,
+        embed_dim: int_or_int_dict, num_heads: int,
+        kdim: Optional[int_or_int_dict], vdim: Optional[int_or_int_dict],
+        dropout: float,
+        query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
+        key_padding_mask: Optional[torch.Tensor] = None,
+        need_weights: bool = True, attn_mask: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+
+        if any(isinstance(arg, dict) for arg in [num_heads, dropout]):
+            raise ValueError('num_heads, dropout do not support weighted sampling.')
+
+        # by default, kdim, vdim can be none
+        if kdim is None:
+            kdim = embed_dim
+        if vdim is None:
+            vdim = embed_dim
+
+        qkv_same_embed_dim = kdim == embed_dim and vdim == embed_dim
+
+        if getattr(self, 'batch_first', False):
+            # for backward compatibility: v1.7 doesn't have batch_first
+            query, key, value = [x.transpose(1, 0) for x in (query, key, value)]
+
+        if isinstance(embed_dim, dict):
+            used_embed_dim = self.embed_dim
+        else:
+            used_embed_dim = embed_dim
+
+        embed_dim = _W(embed_dim)
+
+        # in projection weights & biases has q, k, v weights concatenated together
+        in_proj_bias = in_proj_weight = None
+        if self.in_proj_bias is not None:
+            in_proj_bias = _S(self.in_proj_bias)[self._to_proj_slice(embed_dim)]
+        if self.in_proj_weight is not None:
+            in_proj_weight = _S(self.in_proj_weight)[self._to_proj_slice(embed_dim), :embed_dim]
+
+        bias_k = _S(self.bias_k)[:, :, :embed_dim] if self.bias_k is not None else None
+        bias_v = _S(self.bias_v)[:, :, :embed_dim] if self.bias_v is not None else None
+        out_proj_weight = _S(self.out_proj.weight)[:embed_dim, :embed_dim]
+        out_proj_bias = _S(self.out_proj.bias)[:embed_dim] if self.out_proj.bias is not None else None
+
+        if not qkv_same_embed_dim:
+            kdim = _W(kdim)
+            vdim = _W(vdim)
+
+            q_proj = _S(self.q_proj_weight)[:embed_dim, :embed_dim]
+            k_proj = _S(self.k_proj_weight)[:embed_dim]
+            k_proj = _S(k_proj)[:, :kdim]
+            v_proj = _S(self.v_proj_weight)[:embed_dim]
+            v_proj = _S(v_proj)[:, :vdim]
+
+            # The rest part is basically same as pytorch
+            attn_output, attn_output_weights = F.multi_head_attention_forward(
+                query, key, value, used_embed_dim, num_heads,
+                in_proj_weight, in_proj_bias,
+                bias_k, bias_v, self.add_zero_attn,
+                dropout, out_proj_weight, out_proj_bias,
+                training=self.training,
+                key_padding_mask=key_padding_mask, need_weights=need_weights,
+                attn_mask=attn_mask, use_separate_proj_weight=True,
+                q_proj_weight=q_proj, k_proj_weight=k_proj, v_proj_weight=v_proj)
+        else:
+            attn_output, attn_output_weights = F.multi_head_attention_forward(
+                query, key, value, used_embed_dim, num_heads,
+                in_proj_weight, in_proj_bias,
+                bias_k, bias_v, self.add_zero_attn,
+                dropout, out_proj_weight, out_proj_bias,
+                training=self.training,
+                key_padding_mask=key_padding_mask, need_weights=need_weights,
+                attn_mask=attn_mask)
+
+        if getattr(self, 'batch_first', False):  # backward compatibility
+            return attn_output.transpose(1, 0), attn_output_weights
+        else:
+            return attn_output, attn_output_weights
+
+
+NATIVE_MIXED_OPERATIONS: List[Type[MixedOperation]] = [
+    MixedLinear,
+    MixedConv2d,
+    MixedBatchNorm2d,
+    MixedMultiHeadAttention,
+]
--- a/nni/retiarii/oneshot/pytorch/supermodule/proxyless.py
+++ b/nni/retiarii/oneshot/pytorch/supermodule/proxyless.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Implementation of ProxylessNAS: a hybrid approach between differentiable and sampling.
+The support remains limited. Known limitations include:
+
+- No support for multiple arguments in forward.
+- No support for mixed-operation (value choice).
+- The code contains duplicates. Needs refactor.
+"""
+
+from typing import List, Tuple, Optional
+
+import torch
+import torch.nn as nn
+
+from .differentiable import DifferentiableMixedLayer, DifferentiableMixedInput
+
+
+class _ArchGradientFunction(torch.autograd.Function):
+    """Autograd function that runs ``run_func`` on the input in the forward pass,
+    and in the backward pass additionally asks ``backward_func`` for the gradients of ``binary_gates``.
+    """
+
+    @staticmethod
+    def forward(ctx, x, binary_gates, run_func, backward_func):
+        """Run ``run_func`` on a detached copy of ``x`` and return ``output.data``.
+
+        ``binary_gates`` is not used in the computation here;
+        it only receives the gradient returned for it in ``backward``.
+        """
+        ctx.run_func = run_func
+        ctx.backward_func = backward_func
+
+        detached_x = x.detach()
+        detached_x.requires_grad = x.requires_grad
+        # a graph from detached_x to output is built here, to be used later in backward
+        with torch.enable_grad():
+            output = run_func(detached_x)
+        ctx.save_for_backward(detached_x, output)
+        return output.data
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return the gradients w.r.t. ``x`` and ``binary_gates``; the two callables get ``None``."""
+        detached_x, output = ctx.saved_tensors
+
+        grad_x = torch.autograd.grad(output, detached_x, grad_output, only_inputs=True)
+        # compute gradients w.r.t. binary_gates
+        binary_grads = ctx.backward_func(detached_x.data, output.data, grad_output.data)
+
+        return grad_x[0], binary_grads, None, None
+
+
+class ProxylessMixedLayer(DifferentiableMixedLayer):
+    """Proxyless version of differentiable mixed layer.
+    It resamples a single-path every time, rather than go through the softmax.
+    """
+
+    _arch_parameter_names = ['_arch_alpha', '_binary_gates']
+
+    def __init__(self, paths: List[Tuple[str, nn.Module]], alpha: torch.Tensor, softmax: nn.Module, label: str):
+        super().__init__(paths, alpha, softmax, label)
+        self._binary_gates = nn.Parameter(torch.randn(len(paths)) * 1E-3)
+
+        # like sampling-based methods, it has a ``_sampled``.
+        self._sampled: Optional[str] = None
+        self._sample_idx: Optional[int] = None
+
+    def forward(self, *args, **kwargs):
+        def run_function(ops, active_id, **kwargs):
+            def forward(_x):
+                return ops[active_id](_x, **kwargs)
+            return forward
+
+        def backward_function(ops, active_id, binary_gates, **kwargs):
+            def backward(_x, _output, grad_output):
+                binary_grads = torch.zeros_like(binary_gates.data)
+                with torch.no_grad():
+                    for k in range(len(ops)):
+                        if k != active_id:
+                            out_k = ops[k](_x.data, **kwargs)
+                        else:
+                            out_k = _output.data
+                        grad_k = torch.sum(out_k * grad_output)
+                        binary_grads[k] = grad_k
+                return binary_grads
+            return backward
+
+        assert len(args) == 1, 'ProxylessMixedLayer only supports exactly one input argument.'
+        x = args[0]
+
+        assert self._sampled is not None, 'Need to call resample() before running fprop.'
+        list_ops = [getattr(self, op) for op in self.op_names]
+
+        return _ArchGradientFunction.apply(
+            x, self._binary_gates, run_function(list_ops, self._sample_idx, **kwargs),
+            backward_function(list_ops, self._sample_idx, self._binary_gates, **kwargs)
+        )
+
+    def resample(self, memo):
+        """Sample one path based on alpha if label is not found in memo."""
+        if self.label in memo:
+            self._sampled = memo[self.label]
+            self._sample_idx = self.op_names.index(self._sampled)
+        else:
+            probs = self._softmax(self._arch_alpha)
+            self._sample_idx = torch.multinomial(probs, 1)[0].item()
+            self._sampled = self.op_names[self._sample_idx]
+
+        # set binary gates
+        with torch.no_grad():
+            self._binary_gates.zero_()
+            self._binary_gates.grad = torch.zeros_like(self._binary_gates.data)
+            self._binary_gates.data[self._sample_idx] = 1.0
+
+        return {self.label: self._sampled}
+
+    def export(self, memo):
+        """Chose the argmax if label isn't found in memo."""
+        if self.label in memo:
+            return {}  # nothing new to export
+        return {self.label: self.op_names[torch.argmax(self._arch_alpha).item()]}
+
+    def finalize_grad(self):
+        """Accumulate the gradient of ``_arch_alpha`` from the gradient stored on ``_binary_gates``.
+
+        With ``probs = self._softmax(self._arch_alpha)``, entry ``i`` of ``_arch_alpha.grad``
+        is increased by the sum over ``j`` of ``binary_grads[j] * probs[j] * (int(i == j) - probs[i])``.
+        ``_arch_alpha.grad`` is initialized to zeros first if it is ``None``.
+        """
+        binary_grads = self._binary_gates.grad
+        with torch.no_grad():
+            if self._arch_alpha.grad is None:
+                self._arch_alpha.grad = torch.zeros_like(self._arch_alpha.data)
+            probs = self._softmax(self._arch_alpha)
+            for i in range(len(self._arch_alpha)):
+                for j in range(len(self._arch_alpha)):
+                    self._arch_alpha.grad[i] += binary_grads[j] * probs[j] * (int(i == j) - probs[i])
+
+
+class ProxylessMixedInput(DifferentiableMixedInput):
+    """Proxyless version of differentiable input choice.
+    See :class:`ProxylessMixedLayer` for implementation details.
+    """
+
+    _arch_parameter_names = ['_arch_alpha', '_binary_gates']
+
+    def __init__(self, n_candidates: int, n_chosen: Optional[int], alpha: torch.Tensor, softmax: nn.Module, label: str):
+        super().__init__(n_candidates, n_chosen, alpha, softmax, label)
+        self._binary_gates = nn.Parameter(torch.randn(n_candidates) * 1E-3)
+        self._sampled: Optional[int] = None
+
+    def forward(self, inputs):
+        def run_function(active_sample):
+            return lambda x: x[active_sample]
+
+        def backward_function(binary_gates):
+            def backward(_x, _output, grad_output):
+                binary_grads = torch.zeros_like(binary_gates.data)
+                with torch.no_grad():
+                    for k in range(self.n_candidates):
+                        out_k = _x[k].data
+                        grad_k = torch.sum(out_k * grad_output)
+                        binary_grads[k] = grad_k
+                return binary_grads
+            return backward
+
+        inputs = torch.stack(inputs, 0)
+        assert self._sampled is not None, 'Need to call resample() before running fprop.'
+
+        return _ArchGradientFunction.apply(
+            inputs, self._binary_gates, run_function(self._sampled),
+            backward_function(self._binary_gates)
+        )
+
+    def resample(self, memo):
+        """Sample one path based on alpha if label is not found in memo.
+
+        The sampled index is stored in ``self._sampled``. The binary gates are then reset:
+        all of them (and their gradients) become zero, except the gate of the sampled index, which is set to one.
+
+        Returns a dict that maps ``self.label`` to the sampled index.
+        """
+        if self.label in memo:
+            self._sampled = memo[self.label]
+        else:
+            probs = self._softmax(self._arch_alpha)
+            self._sampled = torch.multinomial(probs, 1)[0].item()
+
+        # set binary gates
+        with torch.no_grad():
+            self._binary_gates.zero_()
+            self._binary_gates.grad = torch.zeros_like(self._binary_gates.data)
+            # ``self._sampled`` is used as the index, because it is set in both branches above.
+            # A local variable assigned only in the sampling branch is unbound when label is in memo.
+            self._binary_gates.data[self._sampled] = 1.0
+
+        return {self.label: self._sampled}
+
+    def export(self, memo):
+        """Chose the argmax if label isn't found in memo."""
+        if self.label in memo:
+            return {}  # nothing new to export
+        return {self.label: torch.argmax(self._arch_alpha).item()}
+
+    def finalize_grad(self):
+        """Accumulate the gradient of ``_arch_alpha`` from the gradient stored on ``_binary_gates``.
+
+        With ``probs = self._softmax(self._arch_alpha)``, entry ``i`` of ``_arch_alpha.grad``
+        is increased by the sum over ``j`` of ``binary_grads[j] * probs[j] * (int(i == j) - probs[i])``.
+        ``_arch_alpha.grad`` is initialized to zeros first if it is ``None``.
+        """
+        binary_grads = self._binary_gates.grad
+        with torch.no_grad():
+            if self._arch_alpha.grad is None:
+                self._arch_alpha.grad = torch.zeros_like(self._arch_alpha.data)
+            probs = self._softmax(self._arch_alpha)
+            for i in range(self.n_candidates):
+                for j in range(self.n_candidates):
+                    self._arch_alpha.grad[i] += binary_grads[j] * probs[j] * (int(i == j) - probs[i])
--- a/nni/retiarii/oneshot/pytorch/supermodule/sampling.py
+++ b/nni/retiarii/oneshot/pytorch/supermodule/sampling.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import random
+from typing import Optional, List, Tuple, Union, Dict, Any
+
+import torch
+import torch.nn as nn
+
+from nni.common.hpo_utils import ParameterSpec
+from nni.retiarii.nn.pytorch import LayerChoice, InputChoice
+
+from .base import BaseSuperNetModule
+from ._valuechoice_utils import evaluate_value_choice_with_dict
+from .operation import MixedOperationSamplingPolicy, MixedOperation
+
+
+class PathSamplingLayer(BaseSuperNetModule):
+    """
+    Mixed layer, in which fprop is decided by exactly one inner layer or sum of multiple (sampled) layers.
+    If multiple modules are selected, the result will be summed and returned.
+
+    Attributes
+    ----------
+    _sampled : str or list of str
+        Name(s) of the sampled module(s). ``None`` until :meth:`resample` is called.
+        Values taken from ``memo`` are converted with ``str()`` in :meth:`forward`.
+    label : str
+        Name of the choice.
+    """
+
+    def __init__(self, paths: List[Tuple[str, nn.Module]], label: str):
+        super().__init__()
+        # every candidate is registered as a sub-module under its own name
+        self.op_names = []
+        for name, module in paths:
+            self.add_module(name, module)
+            self.op_names.append(name)
+        assert self.op_names, 'There has to be at least one op to choose from.'
+        self._sampled: Optional[Union[List[str], str]] = None  # sampled can be either a list of names or a name
+        self.label = label
+
+    def resample(self, memo):
+        """Random choose one path if label is not found in memo."""
+        if self.label in memo:
+            self._sampled = memo[self.label]
+        else:
+            self._sampled = random.choice(self.op_names)
+        return {self.label: self._sampled}
+
+    def export(self, memo):
+        """Random choose one name if label isn't found in memo."""
+        if self.label in memo:
+            return {}  # nothing new to export
+        return {self.label: random.choice(self.op_names)}
+
+    def search_space_spec(self):
+        """Return a single categorical ``ParameterSpec`` with the op names as its values, keyed by label."""
+        return {self.label: ParameterSpec(self.label, 'choice', self.op_names, (self.label, ),
+                                          True, size=len(self.op_names))}
+
+    @classmethod
+    def mutate(cls, module, name, memo, mutate_kwargs):
+        """Create a path-sampling layer from the named children of ``module`` if it is a ``LayerChoice``.
+        For other modules, nothing is returned.
+        """
+        if isinstance(module, LayerChoice):
+            return cls(list(module.named_children()), module.label)
+
+    def forward(self, *args, **kwargs):
+        """Call the sampled module(s) with the given arguments. Results of multiple modules are summed."""
+        if self._sampled is None:
+            raise RuntimeError('At least one path needs to be sampled before fprop.')
+        sampled = [self._sampled] if not isinstance(self._sampled, list) else self._sampled
+
+        # str(samp) is needed here because samp can sometimes be integers, but attr are always str
+        res = [getattr(self, str(samp))(*args, **kwargs) for samp in sampled]
+        if len(res) == 1:
+            return res[0]
+        else:
+            return sum(res)
+
+
+class PathSamplingInput(BaseSuperNetModule):
+    """
+    Mixed input. Take a list of tensor as input, select some of them and return the sum.
+
+    Attributes
+    ----------
+    _sampled : int or list of int
+        Sampled input indices.
+    """
+
+    def __init__(self, n_candidates: int, n_chosen: int, reduction: str, label: str):
+        super().__init__()
+        self.n_candidates = n_candidates
+        self.n_chosen = n_chosen
+        self.reduction = reduction
+        self._sampled: Optional[Union[List[int], int]] = None
+        self.label = label
+
+    def _random_choose_n(self):
+        sampling = list(range(self.n_candidates))
+        random.shuffle(sampling)
+        sampling = sorted(sampling[:self.n_chosen])
+        if len(sampling) == 1:
+            return sampling[0]
+        else:
+            return sampling
+
+    def resample(self, memo):
+        """Random choose one path / multiple paths if label is not found in memo.
+        If one path is selected, only one integer will be in ``self._sampled``.
+        If multiple paths are selected, a list will be in ``self._sampled``.
+        """
+        if self.label in memo:
+            self._sampled = memo[self.label]
+        else:
+            self._sampled = self._random_choose_n()
+        return {self.label: self._sampled}
+
+    def export(self, memo):
+        """Random choose one name if label isn't found in memo."""
+        if self.label in memo:
+            return {}  # nothing new to export
+        return {self.label: self._random_choose_n()}
+
+    def search_space_spec(self):
+        return {
+            self.label: ParameterSpec(self.label, 'choice', list(range(self.n_candidates)),
+                                      (self.label, ), True, size=self.n_candidates, chosen_size=self.n_chosen)
+        }
+
+    @classmethod
+    def mutate(cls, module, name, memo, mutate_kwargs):
+        if isinstance(module, InputChoice):
+            if module.reduction not in ['sum', 'mean', 'concat']:
+                raise ValueError('Only input choice of sum/mean/concat reduction is supported.')
+            return cls(module.n_candidates, module.n_chosen, module.reduction, module.label)
+
+    def forward(self, input_tensors):
+        """Select the sampled input tensor(s), and reduce them if more than one is selected.
+
+        A single selected tensor is returned as is. Multiple tensors are combined according to
+        ``self.reduction``: summed, averaged, or concatenated along dimension 1.
+
+        Raises ``RuntimeError`` if nothing has been sampled yet,
+        and ``ValueError`` if the number of inputs is not ``self.n_candidates``.
+        """
+        if self._sampled is None:
+            raise RuntimeError('At least one path needs to be sampled before fprop.')
+        n_inputs = len(input_tensors)
+        if n_inputs != self.n_candidates:
+            raise ValueError(f'Expect {self.n_candidates} input tensors, found {n_inputs}.')
+
+        # normalize a single index into a list of one index
+        indices = self._sampled if isinstance(self._sampled, list) else [self._sampled]
+        chosen = [input_tensors[index] for index in indices]
+        if len(chosen) == 1:
+            return chosen[0]
+
+        if self.reduction == 'sum':
+            return sum(chosen)
+        if self.reduction == 'mean':
+            return sum(chosen) / len(chosen)
+        if self.reduction == 'concat':
+            return torch.cat(chosen, 1)
+
+
+class MixedOpPathSamplingPolicy(MixedOperationSamplingPolicy):
+    """Implements the path sampling in mixed operation.
+
+    One mixed operation can have multiple value choices in its arguments.
+    Each value choice can be further decomposed into "leaf value choices".
+    We sample the leaf nodes, and compose them into the values on arguments.
+    """
+
+    def __init__(self, operation: MixedOperation, memo: Dict[str, Any], mutate_kwargs: Dict[str, Any]) -> None:
+        # Sampling arguments. This should have the same keys with `operation.mutable_arguments`
+        self._sampled: Optional[Dict[str, Any]] = None
+
+    def resample(self, operation: MixedOperation, memo: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        """Random sample for each leaf value choice.
+
+        Labels found in ``memo`` reuse the value in ``memo``. ``memo`` being ``None`` is treated as empty.
+        The sampled leaf values are then evaluated into the value of every mutable argument,
+        which are stored in ``self._sampled``.
+
+        Returns a dict from the label of every leaf value choice to its value.
+        """
+        if memo is None:
+            # the default value of memo is None, which doesn't support the ``in`` operator
+            memo = {}
+
+        result = {}
+        space_spec = operation.search_space_spec()
+        for label in space_spec:
+            if label in memo:
+                result[label] = memo[label]
+            else:
+                result[label] = random.choice(space_spec[label].values)
+
+        # compose into kwargs
+        # example: result = {"exp_ratio": 3}, self._sampled = {"in_channels": 48, "out_channels": 96}
+        self._sampled = {}
+        for key, value in operation.mutable_arguments.items():
+            self._sampled[key] = evaluate_value_choice_with_dict(value, result)
+
+        return result
+
+    def export(self, operation: MixedOperation, memo: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        """Export is also random for each leaf value choice.
+
+        Only the labels not found in ``memo`` are exported. ``memo`` being ``None`` is treated as empty.
+        """
+        if memo is None:
+            memo = {}
+
+        result = {}
+        space_spec = operation.search_space_spec()
+        for label in space_spec:
+            if label not in memo:
+                result[label] = random.choice(space_spec[label].values)
+        return result
+
+    def forward_argument(self, operation: MixedOperation, name: str) -> Any:
+        """Return the sampled value if ``name`` is a mutable argument, otherwise the init argument.
+
+        Raises ``ValueError`` if :meth:`resample` has never been called.
+        """
+        if self._sampled is None:
+            raise ValueError('Need to call resample() before running forward')
+        if name in operation.mutable_arguments:
+            return self._sampled[name]
+        return operation.init_arguments[name]
--- a/nni/retiarii/strategy/__init__.py
+++ b/nni/retiarii/strategy/__init__.py
@@ -7,3 +7,4 @@ from .evolution import RegularizedEvolution
 from .tpe_strategy import TPEStrategy, TPE
 from .local_debug_strategy import _LocalDebugStrategy
 from .rl import PolicyBasedRL
+from .oneshot import DARTS, Proxyless, GumbelDARTS, ENAS, RandomOneShot
--- a/nni/retiarii/strategy/base.py
+++ b/nni/retiarii/strategy/base.py
@@ -2,7 +2,7 @@
 # Licensed under the MIT license.

 import abc
-from typing import List
+from typing import List, Any

 from ..graph import Model
 from ..mutator import Mutator
@@ -13,3 +13,6 @@ class BaseStrategy(abc.ABC):
    @abc.abstractmethod
    def run(self, base_model: Model, applied_mutators: List[Mutator]) -> None:
        pass
+
+    def export_top_models(self) -> List[Any]:
+        raise NotImplementedError('"export_top_models" is not implemented.')