Unverified Commit 51d261e7 authored by J-shang's avatar J-shang Committed by GitHub

Merge pull request #4668 from microsoft/doc-refactor

parents d63a2ea3 b469e1c1
......@@ -631,6 +631,7 @@ class Quantizer(Compressor):
"""
Subclasses should overload this method to quantize weight.
This method is effectively hooked to :meth:`forward` of the model.
Parameters
----------
wrapper : QuantizerModuleWrapper
......@@ -642,6 +643,7 @@ class Quantizer(Compressor):
"""
Subclasses should overload this method to quantize output.
This method is effectively hooked to :meth:`forward` of the model.
Parameters
----------
output : Tensor
......@@ -655,6 +657,7 @@ class Quantizer(Compressor):
"""
Subclasses should overload this method to quantize input.
This method is effectively hooked to :meth:`forward` of the model.
Parameters
----------
inputs : Tensor
......@@ -908,6 +911,7 @@ class QuantGrad(torch.autograd.Function):
def _quantize(cls, x, scale, zero_point):
"""
Reference function for quantizing x -- non-clamped.
Parameters
----------
x : Tensor
......@@ -916,6 +920,7 @@ class QuantGrad(torch.autograd.Function):
scale for quantizing x
zero_point : Tensor
zero_point for quantizing x
Returns
-------
tensor
......@@ -927,12 +932,14 @@ class QuantGrad(torch.autograd.Function):
def get_bits_length(cls, config, quant_type):
"""
Get the bit width for quantization from the config
Parameters
----------
config : Dict
the configuration for quantization
quant_type : str
the quantization type
Returns
-------
int
......@@ -948,6 +955,7 @@ class QuantGrad(torch.autograd.Function):
"""
This method should be overridden by subclasses to provide a customized backward function;
the default implementation is the Straight-Through Estimator
Parameters
----------
tensor : Tensor
......@@ -963,6 +971,7 @@ class QuantGrad(torch.autograd.Function):
quant_min for quantizing tensor
qmax : Tensor
quant_max for quantizing tensor
Returns
-------
tensor
......
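To make the extension point above concrete, here is a minimal sketch of a custom ``QuantGrad`` subclass. It assumes the unclamped reference formula implied by ``_quantize`` (``x / scale + zero_point``) and the parameter list documented for ``quant_backward``; the class name and import path are illustrative assumptions. ::

from nni.compression.pytorch.compressor import QuantGrad  # import path assumed from this diff

class ClippedQuantGrad(QuantGrad):
    @staticmethod
    def quant_backward(tensor, grad_output, quant_type, scale, zero_point, qmin, qmax):
        # Straight-Through Estimator, except the gradient is zeroed where the
        # (unclamped) quantized value would fall outside [qmin, qmax].
        quantized = tensor / scale + zero_point   # mirrors QuantGrad._quantize
        mask = (quantized >= qmin) & (quantized <= qmax)
        return grad_output * mask.to(grad_output.dtype)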
......@@ -228,40 +228,41 @@ def build_engine(model_file, config=None, extra_layer_bits=32, strict_datatype=F
return engine
class ModelSpeedupTensorRT(BaseModelSpeedup):
r"""
Parameters
----------
model : pytorch model
The model to speed up by quantization.
input_shape : tuple
The input shape of the model; it will be passed to torch.onnx.export.
config : dict
Config recording bits number and names of layers.
onnx_path : str
The path where the user wants to store the ONNX model converted from the PyTorch model.
extra_layer_bits : int
Layers not listed in config will be quantized to this bits number.
strict_datatype : bool
Whether to constrain layer bits to the numbers given in config. If true, all the layers
will be set to the given bits strictly. Otherwise, these layers will be set automatically by
TensorRT.
calibrate_type : tensorrt.tensorrt.CalibrationAlgoType
The calibration algorithm. Please refer to https://docs.nvidia.com/deeplearning/
tensorrt/api/python_api/infer/Int8/Calibrator.html for details.
calib_data_loader : numpy array
The data used to calibrate the quantization model.
calibration_cache : str
The path where the user wants to store the calibration cache file.
batchsize : int
The batch size of calibration and inference.
input_names : list
Input names of the ONNX model, passed to torch.onnx.export to generate the ONNX model.
output_names : list
Output names of the ONNX model, passed to torch.onnx.export to generate the ONNX model.
"""
def __init__(self, model, input_shape, config=None, onnx_path="default_model.onnx", extra_layer_bits=32, strict_datatype=True,
calibrate_type=CalibrateType.ENTROPY2, calib_data_loader=None, calibration_cache = "calibration.cache", batchsize=1,
input_names=["actual_input_1"], output_names=["output1"]):
"""
Parameters
----------
model : pytorch model
The model to speed up by quantization.
input_shape : tuple
The input shape of the model; it will be passed to torch.onnx.export.
config : dict
Config recording bits number and names of layers.
onnx_path : str
The path where the user wants to store the ONNX model converted from the PyTorch model.
extra_layer_bits : int
Layers not listed in config will be quantized to this bits number.
strict_datatype : bool
Whether to constrain layer bits to the numbers given in config. If true, all the layers
will be set to the given bits strictly. Otherwise, these layers will be set automatically by
TensorRT.
calibrate_type : tensorrt.tensorrt.CalibrationAlgoType
The calibration algorithm. Please refer to https://docs.nvidia.com/deeplearning/
tensorrt/api/python_api/infer/Int8/Calibrator.html for details.
calib_data_loader : numpy array
The data used to calibrate the quantization model.
calibration_cache : str
The path where the user wants to store the calibration cache file.
batchsize : int
The batch size of calibration and inference.
input_names : list
Input names of the ONNX model, passed to torch.onnx.export to generate the ONNX model.
output_names : list
Output names of the ONNX model, passed to torch.onnx.export to generate the ONNX model.
"""
super().__init__(model, config)
self.model = model
self.onnx_path = onnx_path
......
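A hedged usage sketch for the class above; the config schema, the ``compress``/``inference`` entry points, and the model itself are assumptions inferred from the constructor documented here, not verified API. ::

import numpy as np
import torch
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT  # path assumed

model = MyQuantModel().eval()                   # hypothetical quantization-aware-trained model
calib_data = np.random.rand(64, 3, 32, 32).astype(np.float32)  # stand-in calibration data

engine = ModelSpeedupTensorRT(model, (32, 3, 32, 32),
                              config={'conv1': 8},             # hypothetical layer-to-bits config
                              calib_data_loader=calib_data, batchsize=32)
engine.compress()                               # build the TensorRT engine (method name assumed)
out, latency = engine.inference(torch.randn(32, 3, 32, 32))    # return convention assumed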
......@@ -388,6 +388,9 @@ class ModelSpeedup:
def replace_submodule(self, unique_name, reindex_dim=None, reindex=None):
"""
Replace the submodule according to the inferred sparsity.
Parameters
----------
unique_name: str
The unique_name of the submodule to replace.
reindex_dim: int
......
......@@ -10,16 +10,31 @@ import torch
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# to exclude partial
__all__ = [
'adaptive_avgpool_python', 'add_python', 'avgpool2d_python', 'cat_python', 'contiguous_python',
'div_python', 'dropout_python', 'exp_python', 'flatten_python', 'floor_div_python', 'gelu_python',
'getattr_python', 'jit_to_python_function', 'matmul_python', 'mean_python',
'mul_python', 'num2tensor_python', 'parse_constant', 'permute_python', 'relu_inplace_python',
'relu_python', 'reshape_python', 'select_python', 'sigmoid_python', 'size_python', 'slice_python',
'softmax_python', 'squeeze_python', 'to_python', 'toint_python', 'torch', 'trans_from_jit_to_python',
'translate_list', 'transpose2_python', 'transpose_python', 'tupleunpack_python', 'typeas_python',
'unsqueeze_python', 'upsample_bilinear2d_python', 'view_python'
]
def translate_list(list_node, speedup=None):
"""
Get the list of values from the list construct node.
Parameters
---------
----------
list_node: Torch.C.Value
The cpp node of the target list.
speedup: ModelSpeedup
The ModelSpeedup module.
Returns
-------
values: list
......@@ -45,12 +60,14 @@ def translate_list(list_node, speedup=None):
def parse_constant(cvalue, speedup):
"""
Parse the constant values from this Node
Parameters
----------
cvalue: Torch.C.Value
The cpp node of the target constant value.
speedup: ModelSpeedup
The ModelSpeedup module.
Returns
-------
value: int/float/tensor
......
......@@ -81,23 +81,23 @@ class MaskFix:
class GroupMaskConflict(MaskFix):
"""
GroupMaskConflict fixes the mask conflicts between layers that
have group dependencies on each other.
Parameters
----------
masks : dict
a dict object that stores the masks
model : torch.nn.Module
model to fix the mask conflict
dummy_input : torch.Tensor
input example to trace the model
traced : torch._C.torch.jit.TopLevelTracedModule
the traced model of the target model; if this parameter is not None,
we do not use the model and dummy_input to get the trace graph.
"""
def __init__(self, masks, model, dummy_input, traced=None):
"""
GroupMaskConflict fixes the mask conflicts between layers that
have group dependencies on each other.
Parameters
----------
masks : dict
a dict object that stores the masks
model : torch.nn.Module
model to fix the mask conflict
dummy_input : torch.Tensor
input example to trace the model
traced : torch._C.torch.jit.TopLevelTracedModule
the traced model of the target model; if this parameter is not None,
we do not use the model and dummy_input to get the trace graph.
"""
super(GroupMaskConflict, self).__init__(
masks, model, dummy_input, traced)
......@@ -168,23 +168,24 @@ class GroupMaskConflict(MaskFix):
class ChannelMaskConflict(MaskFix):
"""
ChannelMaskConflict fixes the mask conflicts between layers that
have channel dependencies on each other.
Parameters
----------
masks : dict
a dict object that stores the masks
model : torch.nn.Module
model to fix the mask conflict
dummy_input : torch.Tensor
input example to trace the model
graph : torch._C.torch.jit.TopLevelTracedModule
the traced graph of the target model; if this parameter is not None,
we do not use the model and dummy_input to get the trace graph.
"""
def __init__(self, masks, model, dummy_input, traced=None):
"""
ChannelMaskConflict fixes the mask conflicts between layers that
have channel dependencies on each other.
Parameters
----------
masks : dict
a dict object that stores the masks
model : torch.nn.Module
model to fix the mask conflict
dummy_input : torch.Tensor
input example to trace the model
graph : torch._C.torch.jit.TopLevelTracedModule
the traced graph of the target model; if this parameter is not None,
we do not use the model and dummy_input to get the trace graph.
"""
super(ChannelMaskConflict, self).__init__(
masks, model, dummy_input, traced)
self.conv_prune_dim = detect_mask_prune_dim(masks, model)
......
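For orientation, a minimal sketch of how these fixers are typically driven; the import path and the ``fix_mask`` entry point (presumably inherited from ``MaskFix``) are assumptions. ::

import torch
from nni.compression.pytorch.utils.mask_conflict import ChannelMaskConflict  # path assumed

# `masks` maps layer names to mask dicts produced by a pruner (format assumed).
fixer = ChannelMaskConflict(masks, model, dummy_input=torch.randn(1, 3, 224, 224))
fixed_masks = fixer.fix_mask()   # entry point assumed from the MaskFix base class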
......@@ -18,51 +18,52 @@ logger.setLevel(logging.INFO)
class SensitivityAnalysis:
def __init__(self, model, val_func, sparsities=None, prune_type='l1', early_stop_mode=None, early_stop_value=None):
"""
Perform sensitivity analysis for this model.
Parameters
----------
model : torch.nn.Module
the model to perform sensitivity analysis
val_func : function
validation function for the model. Since
different models may need different datasets/criteria,
the user needs to cover this part themselves.
In the val_func, the model should be tested on the validation dataset,
and the validation accuracy/loss should be returned as the output of val_func.
There are no restrictions on the input parameters of val_func.
Users can use the val_args, val_kwargs parameters in analysis
to pass all the parameters that val_func needs.
sparsities : list
The sparsity list provided by users. This parameter is set when the user
only wants to test some specific sparsities. In the sparsity list, each element
is a sparsity value which means how much weight the pruner should prune. Take
[0.25, 0.5, 0.75] as an example: the SensitivityAnalysis will gradually prune 25%, 50%,
and 75% of the weights for each layer.
prune_type : str
The pruner type used to prune the conv layers. The default is 'l1';
'l2' and 'fine-grained' are also supported.
early_stop_mode : str
If this flag is set, the sensitivity analysis
for a conv layer will stop early once the validation metric
(for example, accuracy/loss) has met the threshold. We
support four different early stop modes: minimize, maximize, dropped,
raised. The default value is None, which means the analysis won't stop
until all given sparsities are tested. This option should be used together
with early_stop_value.
minimize: The analysis stops when the validation metric returned by val_func is
lower than early_stop_value.
maximize: The analysis stops when the validation metric returned by val_func is
larger than early_stop_value.
dropped: The analysis stops when the validation metric has dropped by early_stop_value.
raised: The analysis stops when the validation metric has risen by early_stop_value.
early_stop_value : float
This value is used as the threshold for the different early stop modes.
This value is effective only when early_stop_mode is set.
"""
Perform sensitivity analysis for this model.
Parameters
----------
model : torch.nn.Module
the model to perform sensitivity analysis
val_func : function
validation function for the model. Since
different models may need different datasets/criteria,
the user needs to cover this part themselves.
In the val_func, the model should be tested on the validation dataset,
and the validation accuracy/loss should be returned as the output of val_func.
There are no restrictions on the input parameters of val_func.
Users can use the val_args, val_kwargs parameters in analysis
to pass all the parameters that val_func needs.
sparsities : list
The sparsity list provided by users. This parameter is set when the user
only wants to test some specific sparsities. In the sparsity list, each element
is a sparsity value which means how much weight the pruner should prune. Take
[0.25, 0.5, 0.75] as an example: the SensitivityAnalysis will gradually prune 25%, 50%,
and 75% of the weights for each layer.
prune_type : str
The pruner type used to prune the conv layers. The default is 'l1';
'l2' and 'fine-grained' are also supported.
early_stop_mode : str
If this flag is set, the sensitivity analysis
for a conv layer will stop early once the validation metric
(for example, accuracy/loss) has met the threshold. We
support four different early stop modes: minimize, maximize, dropped,
raised. The default value is None, which means the analysis won't stop
until all given sparsities are tested. This option should be used together
with early_stop_value.
minimize: The analysis stops when the validation metric returned by val_func is
lower than early_stop_value.
maximize: The analysis stops when the validation metric returned by val_func is
larger than early_stop_value.
dropped: The analysis stops when the validation metric has dropped by early_stop_value.
raised: The analysis stops when the validation metric has risen by early_stop_value.
early_stop_value : float
This value is used as the threshold for the different early stop modes.
This value is effective only when early_stop_mode is set.
"""
"""
def __init__(self, model, val_func, sparsities=None, prune_type='l1', early_stop_mode=None, early_stop_value=None):
from nni.algorithms.compression.pytorch.pruning.constants_pruner import PRUNER_DICT
self.model = model
......
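A short usage sketch under the signature above; the import path, the ``analysis()`` entry point, and its return shape are assumptions. ::

import torch
from nni.compression.pytorch.utils.sensitivity_analysis import SensitivityAnalysis  # path assumed

def val_func(model):
    # Return a scalar validation metric; here, accuracy on a hypothetical val_loader.
    correct = total = 0
    with torch.no_grad():
        for data, target in val_loader:
            correct += (model(data).argmax(dim=1) == target).sum().item()
            total += target.numel()
    return correct / total

analyzer = SensitivityAnalysis(model, val_func, sparsities=[0.25, 0.5, 0.75],
                               early_stop_mode='dropped', early_stop_value=0.05)
sensitivity = analyzer.analysis()   # entry point assumed; per-layer {sparsity: metric} expected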
......@@ -91,24 +91,26 @@ def reshape_break_channel_dependency(op_node):
class ChannelDependency(Dependency):
"""
This class analyzes the channel dependencies between the conv
layers in a model.
Parameters
----------
model : torch.nn.Module
The model to be analyzed.
data : torch.Tensor
The example input data to trace the network architecture.
traced_model : torch._C.Graph
if we already have the traced graph of the target model, we do not
need to trace the model again.
prune_type: str
This parameter indicates the channel pruning type: 1) `Filter`
prune the filter of the convolution layer to prune the corresponding
channels 2) `Batchnorm`: prune the channel in the batchnorm layer
"""
def __init__(self, model, dummy_input, traced_model=None, prune_type='Filter'):
"""
This class analyzes the channel dependencies between the conv
layers in a model.
Parameters
----------
model : torch.nn.Module
The model to be analyzed.
data : torch.Tensor
The example input data to trace the network architecture.
traced_model : torch._C.Graph
if we already have the traced graph of the target model, we do not
need to trace the model again.
prune_type: str
This parameter indicates the channel pruning type: 1) `Filter`
prune the filter of the convolution layer to prune the corresponding
channels 2) `Batchnorm`: prune the channel in the batchnorm layer
"""
self.prune_type = prune_type
self.target_types = []
if self.prune_type == 'Filter':
......@@ -277,6 +279,7 @@ class InputChannelDependency(ChannelDependency):
"""
This class analyzes the input channel dependencies between the conv
layers in a model.
Parameters
----------
model : torch.nn.Module
......@@ -335,20 +338,22 @@ class InputChannelDependency(ChannelDependency):
class GroupDependency(Dependency):
"""
This class analyzes the group dependencies between the conv
layers in a model.
Parameters
----------
model : torch.nn.Module
The model to be analyzed.
data : torch.Tensor
The example input data to trace the network architecture.
traced_model : torch._C.Graph
if we already have the traced graph of the target model, we do not
need to trace the model again.
"""
def __init__(self, model, dummy_input, traced_model=None):
"""
This class analyzes the group dependencies between the conv
layers in a model.
Parameters
----------
model : torch.nn.Module
The model to be analyzed.
data : torch.Tensor
The example input data to trace the network architecture.
traced_model : torch._C.Graph
if we already have the traced graph of the target model, we do not
need to trace the model again.
"""
self.min_groups = {}
super(GroupDependency, self).__init__(model, dummy_input, traced_model)
......
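To illustrate how the analyzers above are queried, a hedged sketch; the import path and the ``dependency_sets`` result attribute are assumptions. ::

import torch
from nni.compression.pytorch.utils.shape_dependency import ChannelDependency  # path assumed

dep = ChannelDependency(model, dummy_input=torch.randn(1, 3, 224, 224))
for dep_set in dep.dependency_sets:   # attribute name assumed
    # Layers in one set must keep consistent channel masks when pruned.
    print(dep_set)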
......@@ -20,6 +20,15 @@ import nni.runtime.config
from .public import is_missing
__all__ = [
'get_base_path', 'set_base_path', 'unset_base_path', 'resolve_path',
'case_insensitive', 'camel_case',
'is_instance', 'validate_type', 'is_path_like',
'guess_config_type', 'guess_list_config_type',
'training_service_config_factory', 'load_training_service_config',
'get_ipv4_address'
]
## handle relative path ##
_current_base_path = None
......
......@@ -10,6 +10,12 @@ import math
from pathlib import Path
from typing import Union
__all__ = [
'PathLike', 'is_missing',
'canonical_gpu_indices', 'validate_gpu_indices',
'parse_time', 'parse_memory_size'
]
PathLike = Union[Path, str]
def is_missing(value):
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from __future__ import annotations
import atexit
from enum import Enum
import logging
......@@ -5,7 +10,7 @@ from pathlib import Path
import socket
from subprocess import Popen
import time
from typing import Optional, Union, List, overload, Any
from typing import Optional, Any
import colorama
import psutil
......@@ -25,66 +30,49 @@ class RunMode(Enum):
"""
Config lifecycle and output redirection of the NNI manager process.
- Background: stop NNI manager when Python script exits; do not print NNI manager log. (default)
- Foreground: stop NNI manager when Python script exits; print NNI manager log to stdout.
- Detach: do not stop NNI manager when Python script exits.
- Background: stop NNI manager when Python script exits; do not print NNI manager log. (default)
- Foreground: stop NNI manager when Python script exits; print NNI manager log to stdout.
- Detach: do not stop NNI manager when Python script exits.
NOTE:
This API is non-stable and is likely to get refactored in the next release.
NNI manager should treat log level more seriously so we can default to "foreground" without being too verbose.
"""
# TODO:
# NNI manager should treat log level more seriously so we can default to "foreground" without being too verbose.
Background = 'background'
Foreground = 'foreground'
Detach = 'detach'
class Experiment:
"""
Create and stop an NNI experiment.
Manage an NNI experiment.
You can either specify an :class:`ExperimentConfig` object, or a training service name.
If a platform name is used, a blank config template for that training service will be generated.
When configuration is completed, use :meth:`Experiment.run` to launch the experiment.
Example
-------
.. code-block::
experiment = Experiment('remote')
experiment.config.trial_command = 'python3 trial.py'
experiment.config.machines.append(RemoteMachineConfig(ip=..., user_name=...))
...
experiment.run(8080)
Attributes
----------
config
Experiment configuration.
id
Experiment ID.
port
Web UI port of the experiment, or `None` if it is not running.
Web portal port, or ``None`` if the experiment is not running.
"""
@overload
def __init__(self, config: ExperimentConfig) -> None:
"""
Prepare an experiment.
Use `Experiment.run()` to launch it.
Parameters
----------
config
Experiment configuration.
"""
...
@overload
def __init__(self, training_service: Union[str, List[str]]) -> None:
"""
Prepare an experiment, leaving configuration fields to be set later.
Example usage::
experiment = Experiment('remote')
experiment.config.trial_command = 'python3 trial.py'
experiment.config.machines.append(RemoteMachineConfig(ip=..., user_name=...))
...
experiment.run(8080)
Parameters
----------
training_service
Name of training service.
Supported values: "local", "remote", "openpai", "aml", "kubeflow", "frameworkcontroller", "adl" and hybrid training service.
"""
...
def __init__(self, config=None, training_service=None):
def __init__(self, config_or_platform: ExperimentConfig | str | list[str] | None) -> None:
nni.runtime.log.init_logger_for_command_line()
self.config: Optional[ExperimentConfig] = None
......@@ -94,11 +82,10 @@ class Experiment:
self.action = 'create'
self.url_prefix: Optional[str] = None
args = [config, training_service] # deal with overloading
if isinstance(args[0], (str, list)):
self.config = ExperimentConfig(args[0])
if isinstance(config_or_platform, (str, list)):
self.config = ExperimentConfig(config_or_platform)
else:
self.config = args[0]
self.config = config_or_platform
def start(self, port: int = 8080, debug: bool = False, run_mode: RunMode = RunMode.Background) -> None:
"""
......@@ -138,12 +125,12 @@ class Experiment:
if interface.family == socket.AF_INET:
ips.append(interface.address)
ips = [f'http://{ip}:{port}' for ip in ips if ip]
msg = 'Web UI URLs: ' + colorama.Fore.CYAN + ' '.join(ips) + colorama.Style.RESET_ALL
msg = 'Web portal URLs: ' + colorama.Fore.CYAN + ' '.join(ips) + colorama.Style.RESET_ALL
_logger.info(msg)
def stop(self) -> None:
"""
Stop background experiment.
Stop the experiment.
"""
_logger.info('Stopping experiment, please wait...')
atexit.unregister(self.stop)
......@@ -166,11 +153,11 @@ class Experiment:
"""
Run the experiment.
If wait_completion is True, this function will block until experiment finish or error.
If ``wait_completion`` is ``True``, this function will block until the experiment finishes or fails.
Return `True` when experiment done; or return `False` when experiment failed.
Return ``True`` when the experiment is done, or ``False`` when it has failed.
Else if wait_completion is False, this function will non-block and return None immediately.
Else if ``wait_completion`` is ``False``, this function returns ``None`` immediately without blocking.
"""
self.start(port, debug)
if wait_completion:
......@@ -184,7 +171,6 @@ class Experiment:
return False
except KeyboardInterrupt:
_logger.warning('KeyboardInterrupt detected')
finally:
self.stop()
@classmethod
......@@ -197,7 +183,7 @@ class Experiment:
port
The port of web UI.
"""
experiment = Experiment()
experiment = Experiment(None)
experiment.port = port
experiment.id = experiment.get_experiment_profile().get('id')
status = experiment.get_status()
......@@ -259,7 +245,7 @@ class Experiment:
@staticmethod
def _resume(exp_id, exp_dir=None):
exp = Experiment()
exp = Experiment(None)
exp.id = exp_id
exp.action = 'resume'
exp.config = launcher.get_stopped_experiment_config(exp_id, exp_dir)
......@@ -267,7 +253,7 @@ class Experiment:
@staticmethod
def _view(exp_id, exp_dir=None):
exp = Experiment()
exp = Experiment(None)
exp.id = exp_id
exp.action = 'view'
exp.config = launcher.get_stopped_experiment_config(exp_id, exp_dir)
......
......@@ -695,15 +695,17 @@ class GraphConverter:
class GraphConverterWithShape(GraphConverter):
"""
Convert a pytorch model to nni ir along with input/output shape info.
The base IR is acquired through `torch.jit.script`
and shape info is acquired through `torch.jit.trace`.
Known issues
------------
1. `InputChoice` and `ValueChoice` not supported yet.
2. Currently random inputs are fed while tracing layerchoice.
If the forward path of candidates depends on input data, then the wrong path will be traced.
This will result in incomplete shape info.
The base IR is acquired through ``torch.jit.script``
and shape info is acquired through ``torch.jit.trace``.
.. warning::
Known issues:
1. ``InputChoice`` and ``ValueChoice`` not supported yet.
2. Currently random inputs are fed while tracing layerchoice.
If the forward path of candidates depends on input data, then the wrong path will be traced.
This will result in incomplete shape info.
"""
def convert_module(self, script_module, module, module_name, ir_model, dummy_input):
module.eval()
......
......@@ -4,7 +4,7 @@
import os
import warnings
from pathlib import Path
from typing import Dict, Union, Optional, List, Type
from typing import Dict, Union, Optional, List, Callable
import pytorch_lightning as pl
import torch.nn as nn
......@@ -29,11 +29,20 @@ __all__ = ['LightningModule', 'Trainer', 'DataLoader', 'Lightning', 'Classificat
class LightningModule(pl.LightningModule):
"""
Basic wrapper of generated model.
Lightning modules used in NNI should inherit this class.
It's a subclass of ``pytorch_lightning.LightningModule``.
See https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html
"""
def set_model(self, model: Union[Type[nn.Module], nn.Module]) -> None:
def set_model(self, model: Union[Callable[[], nn.Module], nn.Module]) -> None:
"""Set the inner model (architecture) to train / evaluate.
Parameters
----------
model : callable or nn.Module
Either an ``nn.Module`` instance, or a callable that returns an ``nn.Module``.
"""
if isinstance(model, nn.Module):
self.model = model
else:
......@@ -41,7 +50,13 @@ class LightningModule(pl.LightningModule):
Trainer = nni.trace(pl.Trainer)
Trainer.__doc__ = """
Traced version of ``pytorch_lightning.Trainer``. See https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html
"""
DataLoader = nni.trace(torch_data.DataLoader)
DataLoader.__doc__ = """
Traced version of ``torch.utils.data.DataLoader``. See https://pytorch.org/docs/stable/data.html
"""
@nni.trace
class Lightning(Evaluator):
......@@ -236,7 +251,7 @@ class _ClassificationModule(_SupervisedLearningModule):
class Classification(Lightning):
"""
Trainer that is used for classification.
Evaluator that is used for classification.
Parameters
----------
......@@ -289,7 +304,7 @@ class _RegressionModule(_SupervisedLearningModule):
class Regression(Lightning):
"""
Trainer that is used for regression.
Evaluator that is used for regression.
Parameters
----------
......
......@@ -17,6 +17,8 @@ _logger = logging.getLogger(__name__)
class BaseGraphData:
"""
Data sent between strategy and trial, in the graph-based execution engine.
Attributes
----------
model_script
......
......@@ -43,6 +43,9 @@ from ..strategy.utils import dry_run_for_formatted_search_space
_logger = logging.getLogger(__name__)
__all__ = ['RetiariiExeConfig', 'RetiariiExperiment']
@dataclass(init=False)
class RetiariiExeConfig(ConfigBase):
experiment_name: Optional[str] = None
......@@ -376,6 +379,8 @@ class RetiariiExperiment(Experiment):
For one-shot algorithms, only top-1 is supported. For others, ``optimize_mode`` and ``formatter`` are
available for customization.
Parameters
----------
top_k : int
How many models are intended to be exported.
optimize_mode : str
......
......@@ -32,8 +32,8 @@ class Mutator:
Mutates graphs in a model to generate new models.
`Mutator` class will be used in two places:
1. Inherit `Mutator` to implement graph mutation logic.
2. Use `Mutator` subclass to implement NAS strategy.
1. Inherit `Mutator` to implement graph mutation logic.
2. Use `Mutator` subclass to implement NAS strategy.
In scenario 1, the subclass should implement the `Mutator.mutate()` interface with `Mutator.choice()`.
In scenario 2, the strategy should use the constructor or `Mutator.bind_sampler()` to initialize the subclass,
......
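For scenario 1, a schematic subclass; ``mutate`` and ``choice`` come from the text above, while the node lookup and the operation update are illustrative assumptions about the graph IR. ::

from nni.retiarii import Mutator  # import path assumed

class BlockMutator(Mutator):
    def mutate(self, model):
        # `choice()` registers a decision point; the sampler bound via
        # `bind_sampler()` (scenario 2) decides which candidate is taken.
        op_type = self.choice(['conv3x3', 'conv5x5', 'maxpool'])
        node = model.get_nodes_by_label('block1')[0]  # lookup helper assumed
        node.update_operation(op_type, {})            # schematic IR update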
......@@ -22,7 +22,9 @@ class LayerChoice(Mutable):
"""
Layer choice selects one of the ``candidates``, then apply it on inputs and return results.
Layer choice does not allow itself to be nested.
It allows users to put several candidate operations (e.g., PyTorch modules); one of them is chosen in each explored model.
*New in v2.2:* Layer choice can be nested.
Parameters
----------
......@@ -43,6 +45,21 @@ class LayerChoice(Mutable):
Deprecated. A list of all candidate modules in the layer choice module.
``list(layer_choice)`` is recommended, which will serve the same purpose.
Examples
--------
::
# import nni.retiarii.nn.pytorch as nn
# declared in `__init__` method
self.layer = nn.LayerChoice([
ops.PoolBN('max', channels, 3, stride, 1),
ops.SepConv(channels, channels, 3, stride, 1),
nn.Identity()
])
# invoked in `forward` method
out = self.layer(x)
Notes
-----
``candidates`` can be a list of modules or an ordered dict of named modules, for example,
......@@ -164,6 +181,10 @@ class LayerChoice(Mutable):
return list(self)
def forward(self, x):
"""
The forward of layer choice is simply running the first candidate module.
It shouldn't be called directly by users in most cases.
"""
warnings.warn('You should not run forward of this module directly.')
return self._first_module(x)
......@@ -182,6 +203,10 @@ ReductionType = Literal['mean', 'concat', 'sum', 'none']
class InputChoice(Mutable):
"""
Input choice selects ``n_chosen`` inputs from ``choose_from`` (contains ``n_candidates`` keys).
It is mainly for choosing (or trying) different connections. It takes several tensors and chooses ``n_chosen`` tensors from them.
When specific inputs are chosen, ``InputChoice`` will become :class:`ChosenInputs`.
Use ``reduction`` to specify how chosen inputs are reduced into one output. A few options are:
* ``none``: do nothing and return the list directly.
......@@ -203,6 +228,16 @@ class InputChoice(Mutable):
Prior distribution used in random sampling.
label : str
Identifier of the input choice.
Examples
--------
::
# import nni.retiarii.nn.pytorch as nn
# declared in `__init__` method
self.input_switch = nn.InputChoice(n_chosen=1)
# invoked in `forward` method, choose one from the three
out = self.input_switch([tensor1, tensor2, tensor3])
"""
@classmethod
......@@ -244,6 +279,10 @@ class InputChoice(Mutable):
return self._label
def forward(self, candidate_inputs: List[torch.Tensor]) -> torch.Tensor:
"""
The forward of input choice simply returns the first item of ``candidate_inputs``.
It shouldn't be called directly by users in most cases.
"""
warnings.warn('You should not run forward of this module directly.')
return candidate_inputs[0]
......@@ -274,6 +313,9 @@ class ChosenInputs(nn.Module):
self.reduction = reduction
def forward(self, candidate_inputs):
"""
Compute the reduced input based on ``chosen`` and ``reduction``.
"""
return self._tensor_reduction(self.reduction, [candidate_inputs[i] for i in self.chosen])
def _tensor_reduction(self, reduction_type, tensor_list):
......@@ -539,7 +581,8 @@ class ValueChoiceX(Translatable):
def __index__(self) -> NoReturn:
# https://docs.python.org/3/reference/datamodel.html#object.__index__
raise RuntimeError("`__index__` is not allowed on ValueChoice, which means you can't "
"use int(), float(), complex(), range() on a ValueChoice.")
"use int(), float(), complex(), range() on a ValueChoice. "
"To cast the type of ValueChoice, please try `ValueChoice.to_int()` or `ValueChoice.to_float()`.")
def __bool__(self) -> NoReturn:
raise RuntimeError('Cannot use bool() on ValueChoice. That means, using ValueChoice in a if-clause is illegal. '
......@@ -675,11 +718,13 @@ ValueChoiceOrAny = TypeVar('ValueChoiceOrAny', ValueChoiceX, Any)
class ValueChoice(ValueChoiceX, Mutable):
"""
ValueChoice is to choose one from ``candidates``.
ValueChoice is to choose one from ``candidates``. The most common use cases are:
In most use scenarios, ValueChoice should be passed to the init parameters of a serializable module. For example,
* Used as input arguments of :class:`~nni.retiarii.basic_unit`
(i.e., modules in ``nni.retiarii.nn.pytorch`` and user-defined modules decorated with ``@basic_unit``).
* Used as input arguments of evaluator (*new in v2.7*).
.. code-block:: python
It can be used in parameters of operators: ::
class Net(nn.Module):
def __init__(self):
......@@ -689,37 +734,83 @@ class ValueChoice(ValueChoiceX, Mutable):
def forward(self, x):
return self.conv(x)
In case you want to search a parameter that is used repeatedly, this is also possible by sharing the same value choice instance.
(Sharing the label should have the same effect.) For example,
Or evaluator: ::
.. code-block:: python
def train_and_evaluate(model_cls, learning_rate):
...
class Net(nn.Module):
def __init__(self):
super().__init__()
hidden_dim = nn.ValueChoice([128, 512])
self.fc = nn.Sequential(
nn.Linear(64, hidden_dim),
nn.Linear(hidden_dim, 10)
)
# the following code has the same effect.
# self.fc = nn.Sequential(
# nn.Linear(64, nn.ValueChoice([128, 512], label='dim')),
# nn.Linear(nn.ValueChoice([128, 512], label='dim'), 10)
# )
self.evaluator = FunctionalEvaluator(train_and_evaluate, learning_rate=nn.ValueChoice([1e-3, 1e-2, 1e-1]))
def forward(self, x):
return self.fc(x)
Value choices support arithmetic operators, which is particularly useful when searching for a network width multiplier: ::
Note that ValueChoice should be used directly. Transformations like ``nn.Linear(32, nn.ValueChoice([64, 128]) * 2)``
are not supported.
# init
scale = nn.ValueChoice([1.0, 1.5, 2.0])
self.conv1 = nn.Conv2d(3, round(scale * 16))
self.conv2 = nn.Conv2d(round(scale * 16), round(scale * 64))
self.conv3 = nn.Conv2d(round(scale * 64), round(scale * 256))
Another common use case is to initialize the values to choose from in init and call the module in forward to get the chosen value.
Usually, this is used to pass a mutable value to a functional API like ``torch.xxx`` or ``nn.functional.xxx``.
For example,
# forward
return self.conv3(self.conv2(self.conv1(x)))
.. code-block:: python
Or when kernel size and padding are coupled so as to keep the output size constant: ::
# init
ks = nn.ValueChoice([3, 5, 7])
self.conv = nn.Conv2d(3, 16, kernel_size=ks, padding=(ks - 1) // 2)
# forward
return self.conv(x)
Or when several layers are concatenated for a final layer. ::
# init
self.linear1 = nn.Linear(3, nn.ValueChoice([1, 2, 3], label='a'))
self.linear2 = nn.Linear(3, nn.ValueChoice([4, 5, 6], label='b'))
self.final = nn.Linear(nn.ValueChoice([1, 2, 3], label='a') + nn.ValueChoice([4, 5, 6], label='b'), 2)
# forward
return self.final(torch.cat([self.linear1(x), self.linear2(x)], 1))
Some advanced operators are also provided, such as :meth:`ValueChoice.max` and :meth:`ValueChoice.cond`.
.. tip::
All the APIs have an optional argument called ``label``,
mutations with the same label will share the same choice. A typical example is, ::
self.net = nn.Sequential(
nn.Linear(10, nn.ValueChoice([32, 64, 128], label='hidden_dim')),
nn.Linear(nn.ValueChoice([32, 64, 128], label='hidden_dim'), 3)
)
Sharing the same value choice instance has a similar effect. ::
class Net(nn.Module):
def __init__(self):
super().__init__()
hidden_dim = nn.ValueChoice([128, 512])
self.fc = nn.Sequential(
nn.Linear(64, hidden_dim),
nn.Linear(hidden_dim, 10)
)
.. warning::
It may look as if a specific candidate has been chosen (e.g., when you put ``ValueChoice``
as a parameter of ``nn.Conv2d``), but in fact it's syntactic sugar, because the basic units and evaluators
do all the underlying work. That means you cannot assume that ``ValueChoice`` can be used in the same way
as its candidates. For example, the following usage will NOT work: ::
self.blocks = []
for i in range(nn.ValueChoice([1, 2, 3])):
self.blocks.append(Block())
# NOTE: instead you should probably write
# self.blocks = nn.Repeat(Block(), (1, 3))
Another use case is to initialize the values to choose from in init and call the module in forward to get the chosen value.
Usually, this is used to pass a mutable value to a functional API like ``torch.xxx`` or ``nn.functional.xxx``.
For example, ::
class Net(nn.Module):
def __init__(self):
......@@ -761,6 +852,10 @@ class ValueChoice(ValueChoiceX, Mutable):
return self._label
def forward(self):
"""
The forward of value choice simply returns the first value of ``candidates``.
It shouldn't be called directly by users in most cases.
"""
warnings.warn('You should not run forward of this module directly.')
return self.candidates[0]
......@@ -799,4 +894,8 @@ class Placeholder(nn.Module):
super().__init__()
def forward(self, x):
"""
Forward of placeholder is not meaningful;
it returns the input directly.
"""
return x
......@@ -32,13 +32,41 @@ _cell_op_factory_type = Callable[[int, int, Optional[int]], nn.Module]
class Cell(nn.Module):
"""
Cell structure [zophnas]_ [zophnasnet]_ that is popularly used in NAS literature.
[nds]_ is a good summary of how this structure works in practice.
Cell structure that is popularly used in NAS literature.
Refer to :footcite:t:`zoph2017neural,zoph2018learning,liu2018darts` for details.
:footcite:t:`radosavovic2019network` is a good summary of how this structure works in practice.
A cell consists of multiple "nodes". Each node is a sum of multiple operators. Each operator is chosen from
``op_candidates``, and takes one input from previous nodes or predecessors. A predecessor is an input of the cell.
The output of the cell is the concatenation of some of the nodes in the cell (currently all the nodes).
Here is a glossary table, which could help better understand the terms used above:
.. list-table::
:widths: 25 75
* - Cell
- A cell consists of several nodes.
* - Node
- A node is the **sum** of several operators.
* - Operator
- Each operator is independently chosen from a list of user-specified candidate operators.
* - Operator's input
- Each operator has one input, chosen from previous nodes as well as predecessors.
* - Predecessors
- Input of cell. A cell can have multiple predecessors. Predecessors are sent to *preprocessor* for preprocessing.
* - Cell's output
- Output of cell. Usually concatenation of several nodes (possibly all nodes) in the cell. Cell's output,
along with predecessors, are sent to *postprocessor* for postprocessing.
* - Preprocessor
- Extra preprocessing to predecessors. Usually used in shape alignment (e.g., predecessors have different shapes).
By default, do nothing.
* - Postprocessor
- Extra postprocessing for cell's output. Usually used to chain cells with multiple Predecessors
(e.g., the next cell wants to have the outputs of both this cell and previous cell as its input).
By default, directly use this cell's output.
Parameters
----------
op_candidates : list of module or function, or dict
......@@ -81,16 +109,33 @@ class Cell(nn.Module):
Examples
--------
Choose between conv2d and maxpool2d.
The cell has 4 nodes, 1 op per node, and 2 predecessors.
>>> cell = nn.Cell([nn.Conv2d(32, 32, 3), nn.MaxPool2d(3)], 4, 1, 2)
>>> output = cell([input1, input2])
In forward:
>>> cell([input1, input2])
References
----------
.. [zophnas] Barret Zoph, Quoc V. Le, "Neural Architecture Search with Reinforcement Learning". https://arxiv.org/abs/1611.01578
.. [zophnasnet] Barret Zoph, Vijay Vasudevan, Jonathon Shlens, Quoc V. Le,
"Learning Transferable Architectures for Scalable Image Recognition". https://arxiv.org/abs/1707.07012
.. [nds] Radosavovic, Ilija and Johnson, Justin and Xie, Saining and Lo, Wan-Yen and Dollar, Piotr,
"On Network Design Spaces for Visual Recognition". https://arxiv.org/abs/1905.13214
Use ``merge_op`` to specify how to construct the output.
The output will then have dynamic shape, depending on which input has been used in the cell.
>>> cell = nn.Cell([nn.Conv2d(32, 32, 3), nn.MaxPool2d(3)], 4, 1, 2, merge_op='loose_end')
The op candidates can be callables that accept the node index in the cell, the op index in the node, and the input index.
>>> cell = nn.Cell([
... lambda node_index, op_index, input_index: nn.Conv2d(32, 32, 3, stride=2 if input_index < 1 else 1),
... ], 4, 1, 2)
Predecessor example: ::
class Preprocessor:
def __init__(self):
self.conv1 = nn.Conv2d(16, 32, 1)
self.conv2 = nn.Conv2d(64, 32, 1)
def forward(self, x):
return [self.conv1(x[0]), self.conv2(x[1])]
cell = nn.Cell([nn.Conv2d(32, 32, 3), nn.MaxPool2d(3)], 4, 1, 2, preprocessor=Preprocessor())
cell([torch.randn(1, 16, 48, 48), torch.randn(1, 64, 48, 48)]) # the two inputs will be sent to conv1 and conv2 respectively
"""
def __init__(self,
......
......@@ -23,13 +23,34 @@ class Repeat(Mutable):
Parameters
----------
blocks : function, list of function, module or list of module
The block to be repeated. If not a list, it will be replicated into a list.
The block to be repeated. If not a list, it will be replicated (**deep-copied**) into a list.
If a list, it should be of length ``max_depth``, the modules will be instantiated in order and a prefix will be taken.
If a function, it will be called (the argument is the index) to instantiate a module.
Otherwise the module will be deep-copied.
depth : int or tuple of int
If one number, the block will be repeated by a fixed number of times. If a tuple, it should be (min, max),
meaning that the block will be repeated at least `min` times and at most `max` times.
meaning that the block will be repeated at least ``min`` times and at most ``max`` times.
Examples
--------
Block() will be deep copied and repeated 3 times. ::
self.blocks = nn.Repeat(Block(), 3)
Block() will be repeated 1, 2, or 3 times. ::
self.blocks = nn.Repeat(Block(), (1, 3))
Can be used together with layer choice.
With deep copy, the 3 layers will have the same label, thus share the choice. ::
self.blocks = nn.Repeat(nn.LayerChoice([...]), (1, 3))
To make the three layer choices independent,
we need a factory function that accepts index (0, 1, 2, ...) and returns the module of the ``index``-th layer. ::
self.blocks = nn.Repeat(lambda index: nn.LayerChoice([...], label=f'layer{index}'), (1, 3))
"""
@classmethod
......@@ -89,7 +110,9 @@ class Repeat(Mutable):
class NasBench201Cell(nn.Module):
"""
Cell structure that is proposed in NAS-Bench-201 [nasbench201]_ .
Cell structure that is proposed in NAS-Bench-201.
Refer to :footcite:t:`dong2019bench` for details.
This cell is a densely connected DAG with ``num_tensors`` nodes, where each node is a tensor.
For every i < j, there is an edge from the i-th node to the j-th node.
......@@ -115,11 +138,6 @@ class NasBench201Cell(nn.Module):
Number of tensors in the cell (input included). Default: 4
label : str
Identifier of the cell. Cell sharing the same label will semantically share the same choice.
References
----------
.. [nasbench201] Dong, X. and Yang, Y., 2020. Nas-bench-201: Extending the scope of reproducible neural architecture search.
arXiv preprint arXiv:2001.00326.
"""
@staticmethod
......@@ -151,6 +169,10 @@ class NasBench201Cell(nn.Module):
self.layers.append(node_ops)
def forward(self, inputs):
"""
The forward of this cell simply selects the first op on all choices.
It shouldn't be called directly by users in most cases.
"""
tensors = [inputs]
for layer in self.layers:
current_tensor = []
......
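A hedged construction sketch for ``NasBench201Cell``; the candidate-factory convention (callables taking input and output feature counts) and the keyword names are assumptions. ::

import torch
import torch.nn as nn

cell = NasBench201Cell(
    [lambda fin, fout: nn.Linear(fin, fout),   # candidate factories; convention assumed
     lambda fin, fout: nn.Identity()],
    in_features=16, out_features=16, num_tensors=4)
out = cell(torch.randn(2, 16))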
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from packaging.version import Version
import torch
import torch.nn as nn
......@@ -8,7 +9,6 @@ from nni.retiarii.serializer import basic_unit
from .api import LayerChoice
from .utils import generate_new_label
from ...utils import version_larger_equal
__all__ = ['AutoActivation']
......@@ -99,7 +99,7 @@ class UnaryTanh(nn.Module):
def forward(self, x):
return torch.tanh(x)
if not version_larger_equal(torch.__version__, TorchVersion):
if not Version(torch.__version__) >= Version(TorchVersion):
@basic_unit
class UnaryAsinh(nn.Module):
def forward(self, x):
......@@ -110,7 +110,7 @@ class UnaryAtan(nn.Module):
def forward(self, x):
return torch.atan(x)
if not version_larger_equal(torch.__version__, TorchVersion):
if not Version(torch.__version__) >= Version(TorchVersion):
@basic_unit
class UnarySinc(nn.Module):
def forward(self, x):
......@@ -151,7 +151,7 @@ unary_modules = ['UnaryIdentity', 'UnaryNegative', 'UnaryAbs', 'UnarySquare', 'U
'UnarySinh', 'UnaryCosh', 'UnaryTanh', 'UnaryAtan', 'UnaryMax',
'UnaryMin', 'UnarySigmoid', 'UnaryLogExp', 'UnaryExpSquare', 'UnaryErf']
if not version_larger_equal(torch.__version__, TorchVersion):
if not Version(torch.__version__) >= Version(TorchVersion):
unary_modules.append('UnaryAsinh')
unary_modules.append('UnarySinc')
......
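The switch above from the homegrown ``version_larger_equal`` helper to ``packaging.version.Version`` avoids string-comparison pitfalls; a quick self-contained illustration: ::

from packaging.version import Version

assert '1.10.0' < '1.9.0'                            # plain strings compare lexicographically
assert Version('1.10.0') > Version('1.9.0')          # Version compares release segments numerically
assert Version('1.10.0+cu113') >= Version('1.10.0')  # local build tags (as in torch.__version__) parse fine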
......@@ -219,11 +219,13 @@ class _NasBench101CellFixed(nn.Module):
class NasBench101Cell(Mutable):
"""
Cell structure that is proposed in NAS-Bench-101 [nasbench101]_ .
Cell structure that is proposed in NAS-Bench-101.
This cell is usually used in evaluation of NAS algorithms because there is a ``comprehensive analysis'' of this search space
available, which includes a full architecture-dataset that ``maps 423k unique architectures to metrics
including run time and accuracy''. You can also use the space in your own space design, in which scenario it should be possible
Refer to :footcite:t:`ying2019bench` for details.
This cell is usually used in evaluation of NAS algorithms because there is a "comprehensive analysis" of this search space
available, which includes a full architecture-dataset that "maps 423k unique architectures to metrics
including run time and accuracy". You can also use the space in your own space design, in which scenario it should be possible
to leverage results in the benchmark to narrow the huge space down to a few efficient architectures.
The space of this cell architecture consists of all possible directed acyclic graphs on no more than ``max_num_nodes`` nodes,
......@@ -232,7 +234,7 @@ class NasBench101Cell(Mutable):
To align with the paper settings, two vertices specially labeled as operations IN and OUT are also counted into
``max_num_nodes`` in our implementation; the default value of ``max_num_nodes`` is 7 and ``max_num_edges`` is 9.
Input of this cell should be of shape :math:`[N, C_{in}, *]`, while output should be `[N, C_{out}, *]`. The shape
Input of this cell should be of shape :math:`[N, C_{in}, *]`, while output should be :math:`[N, C_{out}, *]`. The shape
of each hidden nodes will be first automatically computed, depending on the cell structure. Each of the ``op_candidates``
should be a callable that accepts computed ``num_features`` and returns a ``Module``. For example,
......@@ -275,11 +277,6 @@ class NasBench101Cell(Mutable):
Maximum number of edges in the cell. Default: 9.
label : str
Identifier of the cell. Cell sharing the same label will semantically share the same choice.
References
----------
.. [nasbench101] Ying, Chris, et al. "Nas-bench-101: Towards reproducible neural architecture search."
International Conference on Machine Learning. PMLR, 2019.
"""
@staticmethod
......@@ -341,7 +338,10 @@ class NasBench101Cell(Mutable):
return self._label
def forward(self, x):
# This is a dummy forward and actually not used
"""
This is a dummy forward that simply selects the first candidate on all choices.
It shouldn't be called directly by users in most cases.
"""
tensors = [x]
for i in range(1, self.max_num_nodes):
node_input = self.inputs[i]([self.projections[i](tensors[0])] + [t for t in tensors[1:]])
......